1 /*
2 * Copyright © 2012 Rob Clark <[email protected]>
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include "ir3.h"
7
8 #include <assert.h>
9 #include <errno.h>
10 #include <stdbool.h>
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14
15 #include "util/bitscan.h"
16 #include "util/half_float.h"
17 #include "util/ralloc.h"
18 #include "util/u_math.h"
19
20 #include "instr-a3xx.h"
21 #include "ir3_shader.h"
22
23 /* simple allocator to carve allocations out of an up-front allocated heap,
24 * so that we can free everything easily in one shot.
25 */
26 void *
27 ir3_alloc(struct ir3 *shader, int sz)
28 {
29 return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
30 }
31
32 struct ir3 *
33 ir3_create(struct ir3_compiler *compiler, struct ir3_shader_variant *v)
34 {
35 struct ir3 *shader = rzalloc(v, struct ir3);
36
37 shader->compiler = compiler;
38 shader->type = v->type;
39
40 list_inithead(&shader->block_list);
41 list_inithead(&shader->array_list);
42
43 return shader;
44 }
45
46 void
47 ir3_destroy(struct ir3 *shader)
48 {
49 ralloc_free(shader);
50 }
51
52 static bool
53 is_shared_consts(struct ir3_compiler *compiler,
54 const struct ir3_const_state *const_state,
55 struct ir3_register *reg)
56 {
57 if (const_state->push_consts_type == IR3_PUSH_CONSTS_SHARED &&
58 reg->flags & IR3_REG_CONST) {
59 uint32_t min_const_reg = regid(compiler->shared_consts_base_offset, 0);
60 uint32_t max_const_reg =
61 regid(compiler->shared_consts_base_offset +
62 compiler->shared_consts_size, 0);
63 return reg->num >= min_const_reg && reg->num < max_const_reg;
64 }
65
66 return false;
67 }
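/* For illustration (hypothetical values): with shared_consts_base_offset = 504
 * and shared_consts_size = 8 vec4s, the shared-const range covers regid(504, 0)
 * up to (but not including) regid(512, 0), i.e. scalar const offsets
 * 2016..2047, and const accesses in that range are excluded from the constlen
 * accounting below.
 */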
68
69 static void
70 collect_reg_info(struct ir3_instruction *instr, struct ir3_register *reg,
71 struct ir3_info *info)
72 {
73 struct ir3_shader_variant *v = info->data;
74
75 if (reg->flags & IR3_REG_IMMED) {
76 /* nothing to do */
77 return;
78 }
79
80 /* Shared consts don't need to be included in constlen. */
81 if (is_shared_consts(v->compiler, ir3_const_state(v), reg))
82 return;
83
84 unsigned components;
85 int16_t max;
86
87 if (reg->flags & IR3_REG_RELATIV) {
88 components = reg->size;
89 max = (reg->array.base + components - 1);
90 } else {
91 components = util_last_bit(reg->wrmask);
92 max = (reg->num + components - 1);
93 }
94
95 if (reg->flags & IR3_REG_CONST) {
96 info->max_const = MAX2(info->max_const, max >> 2);
97 } else if (max < regid(48, 0)) {
98 if (reg->flags & IR3_REG_HALF) {
99 if (v->mergedregs) {
100 /* starting w/ a6xx, half regs conflict with full regs: */
101 info->max_reg = MAX2(info->max_reg, max >> 3);
102 } else {
103 info->max_half_reg = MAX2(info->max_half_reg, max >> 2);
104 }
105 } else {
106 info->max_reg = MAX2(info->max_reg, max >> 2);
107 }
108 }
109 }
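/* Note: "max" above is a scalar component index (regid(N, c) == N * 4 + c), so
 * "max >> 2" converts back to a vec4 register index.  With merged registers
 * (a6xx+), two half vec4 registers alias one full vec4 register, hence the
 * ">> 3" for half regs in that case.
 */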
110
111 bool
112 ir3_should_double_threadsize(struct ir3_shader_variant *v, unsigned regs_count)
113 {
114 const struct ir3_compiler *compiler = v->compiler;
115
116 /* If the user forced a particular wavesize respect that. */
117 if (v->shader_options.real_wavesize == IR3_SINGLE_ONLY)
118 return false;
119 if (v->shader_options.real_wavesize == IR3_DOUBLE_ONLY)
120 return true;
121
122 /* We can't support more than compiler->branchstack_size diverging threads
123 * in a wave. Thus, doubling the threadsize is only possible if we don't
124 * exceed the branchstack size limit.
125 */
126 if (MIN2(v->branchstack, compiler->threadsize_base * 2) >
127 compiler->branchstack_size) {
128 return false;
129 }
130
131 switch (v->type) {
132 case MESA_SHADER_KERNEL:
133 case MESA_SHADER_COMPUTE: {
134 unsigned threads_per_wg =
135 v->local_size[0] * v->local_size[1] * v->local_size[2];
136
137 /* For a5xx, if the workgroup size is greater than the maximum number
138 * of threads per core with 32 threads per wave (512) then we have to
139 * use the doubled threadsize because otherwise the workgroup wouldn't
140 * fit. For smaller workgroup sizes, we follow the blob and use the
141 * smaller threadsize.
142 */
143 if (compiler->gen < 6) {
144 return v->local_size_variable ||
145 threads_per_wg >
146 compiler->threadsize_base * compiler->max_waves;
147 }
148
149 /* On a6xx, we prefer the larger threadsize unless the workgroup is
150 * small enough that it would be useless. Note that because
151 * threadsize_base is bumped to 64, we don't have to worry about the
152 * workgroup fitting, unlike the a5xx case.
153 */
154 if (!v->local_size_variable) {
155 if (threads_per_wg <= compiler->threadsize_base)
156 return false;
157 }
158 }
159 FALLTHROUGH;
160 case MESA_SHADER_FRAGMENT: {
161 /* Check that doubling the threadsize wouldn't exceed the regfile size */
162 return regs_count * 2 <= compiler->reg_size_vec4;
163 }
164
165 default:
166 /* On a6xx+, it's impossible to use a doubled wavesize in the geometry
167 * stages - the bit doesn't exist. The blob never used it for the VS
168 * on earlier gen's anyway.
169 */
170 return false;
171 }
172 }
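/* Rough example of the compute heuristic above (assuming the a6xx value of
 * threadsize_base = 64): a 64-thread workgroup keeps the single threadsize,
 * while a 256-thread workgroup uses the doubled (128-wide) threadsize as long
 * as regs_count * 2 still fits in reg_size_vec4.
 */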
173
174 /* Get the maximum number of waves that could be used even if this shader
175 * didn't use any registers.
176 */
177 unsigned
178 ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
179 bool double_threadsize)
180 {
181 const struct ir3_compiler *compiler = v->compiler;
182 unsigned max_waves = compiler->max_waves;
183
184 /* Compute the limit based on branchstack */
185 if (v->branchstack > 0) {
186 unsigned branchstack_max_waves = compiler->branchstack_size /
187 v->branchstack *
188 compiler->wave_granularity;
189 max_waves = MIN2(max_waves, branchstack_max_waves);
190 }
191
192 /* If this is a compute shader, compute the limit based on shared size */
193 if ((v->type == MESA_SHADER_COMPUTE) ||
194 (v->type == MESA_SHADER_KERNEL)) {
195 unsigned threads_per_wg =
196 v->local_size[0] * v->local_size[1] * v->local_size[2];
197 unsigned waves_per_wg =
198 DIV_ROUND_UP(threads_per_wg, compiler->threadsize_base *
199 (double_threadsize ? 2 : 1) *
200 compiler->wave_granularity);
201
202 /* Shared is allocated in chunks of 1k */
203 unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024);
204 if (shared_per_wg > 0 && !v->local_size_variable) {
205 unsigned wgs_per_core = compiler->local_mem_size / shared_per_wg;
206
207 max_waves = MIN2(max_waves, waves_per_wg * wgs_per_core *
208 compiler->wave_granularity);
209 }
210
211 /* If we have a compute shader with a big workgroup, a barrier, and
212 * a branchstack that limits max_waves, we may end up unable to run
213 * all waves of the workgroup concurrently, which would lead to a
214 * hang.
215 *
216 * TODO: Could we spill the branchstack, or is there another way
217 * around this? The blob just explodes in such a case.
218 */
219 if (v->has_barrier && (max_waves < waves_per_wg)) {
220 mesa_loge(
221 "Compute shader (%s) which has workgroup barrier cannot be used "
222 "because it's impossible to have enough concurrent waves.",
223 v->name);
224 exit(1);
225 }
226 }
227
228 return max_waves;
229 }
230
231 /* Get the maximum number of waves that could be launched limited by reg size.
232 */
233 unsigned
234 ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
235 unsigned reg_count, bool double_threadsize)
236 {
237 return reg_count ? (compiler->reg_size_vec4 /
238 (reg_count * (double_threadsize ? 2 : 1)) *
239 compiler->wave_granularity)
240 : compiler->max_waves;
241 }
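/* E.g. (hypothetical numbers): with reg_size_vec4 = 96, reg_count = 16 and a
 * doubled threadsize, this gives (96 / (16 * 2)) * wave_granularity =
 * 3 * wave_granularity waves.
 */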
242
243 void
244 ir3_collect_info(struct ir3_shader_variant *v)
245 {
246 struct ir3_info *info = &v->info;
247 struct ir3 *shader = v->ir;
248 const struct ir3_compiler *compiler = v->compiler;
249
250 memset(info, 0, sizeof(*info));
251 info->data = v;
252 info->max_reg = -1;
253 info->max_half_reg = -1;
254 info->max_const = -1;
255 info->multi_dword_ldp_stp = false;
256
257 uint32_t instr_count = 0;
258 foreach_block (block, &shader->block_list) {
259 foreach_instr (instr, &block->instr_list) {
260 instr_count++;
261 }
262 }
263
264 v->instrlen = DIV_ROUND_UP(instr_count, compiler->instr_align);
265
266 /* Pad out with NOPs to instrlen, including at least 4 so that cffdump
267 * doesn't try to decode the following data as instructions (such as the
268 * next stage's shader in turnip)
269 */
270 info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) * 8;
271 info->sizedwords = info->size / 4;
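/* Note: each ir3 instruction encodes to 64 bits, hence the "* 8" above to get
 * a size in bytes; sizedwords is the same size expressed in 32-bit dwords.
 */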
272
273 info->early_preamble = v->early_preamble;
274
275 bool in_preamble = false;
276 bool has_eq = false;
277
278 foreach_block (block, &shader->block_list) {
279 int sfu_delay = 0, mem_delay = 0;
280
281 foreach_instr (instr, &block->instr_list) {
282
283 foreach_src (reg, instr) {
284 collect_reg_info(instr, reg, info);
285 }
286
287 foreach_dst (reg, instr) {
288 if (is_dest_gpr(reg)) {
289 collect_reg_info(instr, reg, info);
290 }
291 }
292
293 if ((instr->opc == OPC_STP || instr->opc == OPC_LDP)) {
294 unsigned components = instr->srcs[2]->uim_val;
295
296 /* This covers any multi-component access that could straddle
297 * across multiple double-words.
298 */
299 if (components > 1)
300 info->multi_dword_ldp_stp = true;
301
302 if (instr->opc == OPC_STP)
303 info->stp_count += components;
304 else
305 info->ldp_count += components;
306 }
307
308 if ((instr->opc == OPC_BARY_F || instr->opc == OPC_FLAT_B) &&
309 (instr->dsts[0]->flags & IR3_REG_EI))
310 info->last_baryf = info->instrs_count;
311
312 if ((instr->opc == OPC_NOP) && (instr->flags & IR3_INSTR_EQ)) {
313 info->last_helper = info->instrs_count;
314 has_eq = true;
315 }
316
317 if (v->type == MESA_SHADER_FRAGMENT && v->need_pixlod &&
318 instr->opc == OPC_END && !v->prefetch_end_of_quad && !has_eq)
319 info->last_helper = info->instrs_count;
320
321 if (instr->opc == OPC_SHPS)
322 in_preamble = true;
323
324 /* Don't count instructions in the preamble for instruction-count type
325 * stats, because their effect should be much smaller.
326 * TODO: we should probably have separate stats for preamble
327 * instructions, but that would blow up the amount of stats...
328 */
329 if (!in_preamble) {
330 unsigned instrs_count = 1 + instr->repeat + instr->nop;
331 unsigned nops_count = instr->nop;
332
333 if (instr->opc == OPC_NOP) {
334 nops_count = 1 + instr->repeat;
335 info->instrs_per_cat[0] += nops_count;
336 } else if (!is_meta(instr)) {
337 info->instrs_per_cat[opc_cat(instr->opc)] += 1 + instr->repeat;
338 info->instrs_per_cat[0] += nops_count;
339 }
340
341 if (instr->opc == OPC_MOV) {
342 if (instr->cat1.src_type == instr->cat1.dst_type) {
343 info->mov_count += 1 + instr->repeat;
344 } else {
345 info->cov_count += 1 + instr->repeat;
346 }
347 }
348
349 info->instrs_count += instrs_count;
350 info->nops_count += nops_count;
351
352 if (instr->flags & IR3_INSTR_SS) {
353 info->ss++;
354 info->sstall += sfu_delay;
355 sfu_delay = 0;
356 }
357
358 if (instr->flags & IR3_INSTR_SY) {
359 info->sy++;
360 info->systall += mem_delay;
361 mem_delay = 0;
362 }
363
364 if (is_ss_producer(instr)) {
365 sfu_delay = soft_ss_delay(instr);
366 } else {
367 int n = MIN2(sfu_delay, 1 + instr->repeat + instr->nop);
368 sfu_delay -= n;
369 }
370
371 if (is_sy_producer(instr)) {
372 mem_delay = soft_sy_delay(instr, shader);
373 } else {
374 int n = MIN2(mem_delay, 1 + instr->repeat + instr->nop);
375 mem_delay -= n;
376 }
377 }
378
379 if (instr->opc == OPC_SHPE)
380 in_preamble = false;
381 }
382 }
383
384 /* For vertex shaders, the inputs are loaded into registers before the shader
385 * is executed, so max_regs from the shader instructions might not properly
386 * reflect the # of registers actually used, especially in the case of
387 * passthrough varyings.
388 *
389 * Likewise, for fragment shaders, we can have some regs which are passed
390 * input values but never touched by the resulting shader (ie. as a result
391 * of dead code elimination, or simply because we don't know how to turn
392 * the reg off).
393 */
394 for (unsigned i = 0; i < v->inputs_count; i++) {
395 /* skip frag inputs fetched via bary.f since their regs are
396 * not written by the gpu before the shader starts (and in fact
397 * the regids might not even be valid)
398 */
399 if (v->inputs[i].bary)
400 continue;
401
402 /* ignore high regs that are global to all threads in a warp
403 * (they exist by default) (a5xx+)
404 */
405 if (v->inputs[i].regid >= regid(48, 0))
406 continue;
407
408 if (v->inputs[i].compmask) {
409 unsigned n = util_last_bit(v->inputs[i].compmask) - 1;
410 int32_t regid = v->inputs[i].regid + n;
411 if (v->inputs[i].half) {
412 if (!v->mergedregs) {
413 v->info.max_half_reg = MAX2(v->info.max_half_reg, regid >> 2);
414 } else {
415 v->info.max_reg = MAX2(v->info.max_reg, regid >> 3);
416 }
417 } else {
418 v->info.max_reg = MAX2(v->info.max_reg, regid >> 2);
419 }
420 }
421 }
422
423 for (unsigned i = 0; i < v->num_sampler_prefetch; i++) {
424 unsigned n = util_last_bit(v->sampler_prefetch[i].wrmask) - 1;
425 int32_t regid = v->sampler_prefetch[i].dst + n;
426 if (v->sampler_prefetch[i].half_precision) {
427 if (!v->mergedregs) {
428 v->info.max_half_reg = MAX2(v->info.max_half_reg, regid >> 2);
429 } else {
430 v->info.max_reg = MAX2(v->info.max_reg, regid >> 3);
431 }
432 } else {
433 v->info.max_reg = MAX2(v->info.max_reg, regid >> 2);
434 }
435 }
436
437 /* TODO: for a5xx and below, is there a separate regfile for
438 * half-registers?
439 */
440 unsigned regs_count =
441 info->max_reg + 1 +
442 (compiler->gen >= 6 ? ((info->max_half_reg + 2) / 2) : 0);
443
444 info->double_threadsize = ir3_should_double_threadsize(v, regs_count);
445
446 /* TODO this is different for earlier gens, but earlier gens don't use this */
447 info->subgroup_size = v->info.double_threadsize ? 128 : 64;
448
449 unsigned reg_independent_max_waves =
450 ir3_get_reg_independent_max_waves(v, info->double_threadsize);
451 unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
452 compiler, regs_count, info->double_threadsize);
453 info->max_waves = MIN2(reg_independent_max_waves, reg_dependent_max_waves);
454 assert(info->max_waves <= v->compiler->max_waves);
455 }
456
457 static struct ir3_register *
458 reg_create(struct ir3 *shader, int num, int flags)
459 {
460 struct ir3_register *reg = ir3_alloc(shader, sizeof(struct ir3_register));
461 reg->wrmask = 1;
462 reg->flags = flags;
463 reg->num = num;
464 return reg;
465 }
466
467 static void
468 insert_instr(struct ir3_cursor cursor, struct ir3_instruction *instr)
469 {
470 struct ir3 *shader = instr->block->shader;
471
472 instr->serialno = ++shader->instr_count;
473
474 switch (cursor.option) {
475 case IR3_CURSOR_BEFORE_BLOCK:
476 list_add(&instr->node, &cursor.block->instr_list);
477 break;
478 case IR3_CURSOR_AFTER_BLOCK:
479 list_addtail(&instr->node, &cursor.block->instr_list);
480 break;
481 case IR3_CURSOR_BEFORE_INSTR:
482 list_addtail(&instr->node, &cursor.instr->node);
483 break;
484 case IR3_CURSOR_AFTER_INSTR:
485 list_add(&instr->node, &cursor.instr->node);
486 break;
487 }
488
489 if (is_input(instr))
490 array_insert(shader, shader->baryfs, instr);
491 }
492
493 struct ir3_block *
494 ir3_block_create(struct ir3 *shader)
495 {
496 struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
497 #if MESA_DEBUG
498 block->serialno = ++shader->block_count;
499 #endif
500 block->shader = shader;
501 list_inithead(&block->node);
502 list_inithead(&block->instr_list);
503 return block;
504 }
505
506 static struct ir3_instruction *
507 block_get_last_instruction(struct ir3_block *block)
508 {
509 if (list_is_empty(&block->instr_list))
510 return NULL;
511 return list_last_entry(&block->instr_list, struct ir3_instruction, node);
512 }
513
514 struct ir3_instruction *
515 ir3_block_get_terminator(struct ir3_block *block)
516 {
517 struct ir3_instruction *last = block_get_last_instruction(block);
518
519 if (last && is_terminator(last))
520 return last;
521
522 return NULL;
523 }
524
525 struct ir3_instruction *
526 ir3_block_take_terminator(struct ir3_block *block)
527 {
528 struct ir3_instruction *terminator = ir3_block_get_terminator(block);
529
530 if (terminator)
531 list_delinit(&terminator->node);
532
533 return terminator;
534 }
535
536 struct ir3_instruction *
537 ir3_block_get_last_non_terminator(struct ir3_block *block)
538 {
539 struct ir3_instruction *last = block_get_last_instruction(block);
540
541 if (!last)
542 return NULL;
543
544 if (!is_terminator(last))
545 return last;
546
547 if (last->node.prev != &block->instr_list)
548 return list_entry(last->node.prev, struct ir3_instruction, node);
549
550 return NULL;
551 }
552
553 struct ir3_instruction *
554 ir3_block_get_last_phi(struct ir3_block *block)
555 {
556 struct ir3_instruction *last_phi = NULL;
557
558 foreach_instr (instr, &block->instr_list) {
559 if (instr->opc != OPC_META_PHI)
560 break;
561
562 last_phi = instr;
563 }
564
565 return last_phi;
566 }
567
568 void
569 ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred)
570 {
571 array_insert(block, block->predecessors, pred);
572 }
573
574 void
575 ir3_block_link_physical(struct ir3_block *pred,
576 struct ir3_block *succ)
577 {
578 array_insert(pred, pred->physical_successors, succ);
579 array_insert(succ, succ->physical_predecessors, pred);
580 }
581
582 void
583 ir3_block_remove_predecessor(struct ir3_block *block, struct ir3_block *pred)
584 {
585 for (unsigned i = 0; i < block->predecessors_count; i++) {
586 if (block->predecessors[i] == pred) {
587 if (i < block->predecessors_count - 1) {
588 block->predecessors[i] =
589 block->predecessors[block->predecessors_count - 1];
590 }
591
592 block->predecessors_count--;
593 return;
594 }
595 }
596 }
597
598 unsigned
599 ir3_block_get_pred_index(struct ir3_block *block, struct ir3_block *pred)
600 {
601 for (unsigned i = 0; i < block->predecessors_count; i++) {
602 if (block->predecessors[i] == pred) {
603 return i;
604 }
605 }
606
607 unreachable("ir3_block_get_pred_index() invalid predecessor");
608 }
609
610 static struct ir3_instruction *
611 instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
612 {
613 /* Add extra sources for array destinations and the address reg */
614 if (1 <= opc_cat(opc))
615 nsrc += 2;
616 struct ir3_instruction *instr;
617 unsigned sz = sizeof(*instr) + (ndst * sizeof(instr->dsts[0])) +
618 (nsrc * sizeof(instr->srcs[0]));
619 char *ptr = ir3_alloc(block->shader, sz);
620
621 instr = (struct ir3_instruction *)ptr;
622 ptr += sizeof(*instr);
623 instr->dsts = (struct ir3_register **)ptr;
624 instr->srcs = instr->dsts + ndst;
625
626 #if MESA_DEBUG
627 instr->dsts_max = ndst;
628 instr->srcs_max = nsrc;
629 #endif
630
631 list_inithead(&instr->rpt_node);
632 return instr;
633 }
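/* Note on the allocation above: the instruction, its array of dst register
 * pointers and its array of src register pointers all live in a single
 * ir3_alloc() block, with dsts[] placed immediately after the instruction and
 * srcs[] immediately after dsts[].  The registers themselves are allocated
 * later by ir3_dst_create()/ir3_src_create().
 */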
634
635 static void
636 add_to_address_users(struct ir3_instruction *instr)
637 {
638 assert(instr->address != NULL);
639
640 struct ir3 *ir = instr->block->shader;
641 struct ir3_register *addr_reg = instr->address->def;
642 assert(reg_num(addr_reg) == REG_A0);
643 unsigned comp = reg_comp(addr_reg);
644 if (comp == 0) {
645 array_insert(ir, ir->a0_users, instr);
646 } else {
647 assert(comp == 1);
648 array_insert(ir, ir->a1_users, instr);
649 }
650 }
651
652 static struct ir3_block *
653 get_block(struct ir3_cursor cursor)
654 {
655 switch (cursor.option) {
656 case IR3_CURSOR_BEFORE_BLOCK:
657 case IR3_CURSOR_AFTER_BLOCK:
658 return cursor.block;
659 case IR3_CURSOR_BEFORE_INSTR:
660 case IR3_CURSOR_AFTER_INSTR:
661 return cursor.instr->block;
662 }
663
664 unreachable("illegal cursor option");
665 }
666
667 struct ir3_instruction *
668 ir3_instr_create_at(struct ir3_cursor cursor, opc_t opc, int ndst, int nsrc)
669 {
670 struct ir3_block *block = get_block(cursor);
671 struct ir3_instruction *instr = instr_create(block, opc, ndst, nsrc);
672 instr->block = block;
673 instr->opc = opc;
674 insert_instr(cursor, instr);
675 return instr;
676 }
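/* Usage sketch (hypothetical register numbers): create a mul.f right before an
 * existing instruction using a cursor:
 *
 *    struct ir3_instruction *mul =
 *       ir3_instr_create_at(ir3_before_instr(instr), OPC_MUL_F, 1, 2);
 *    ir3_dst_create(mul, dst_num, 0);
 *    ir3_src_create(mul, src0_num, 0);
 *    ir3_src_create(mul, src1_num, 0);
 *
 * (ir3_before_instr() is assumed here to be one of the cursor helpers, like
 * the ir3_after_instr()/ir3_before_terminator()/ir3_after_block() helpers used
 * below; dst_num/src0_num/src1_num are placeholders.)
 */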
677
678 struct ir3_instruction *
679 ir3_build_instr(struct ir3_builder *builder, opc_t opc, int ndst, int nsrc)
680 {
681 struct ir3_instruction *instr =
682 ir3_instr_create_at(builder->cursor, opc, ndst, nsrc);
683 builder->cursor = ir3_after_instr(instr);
684 return instr;
685 }
686
687 struct ir3_instruction *
688 ir3_instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
689 {
690 return ir3_instr_create_at(ir3_before_terminator(block), opc, ndst, nsrc);
691 }
692
693 struct ir3_instruction *
694 ir3_instr_create_at_end(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
695 {
696 return ir3_instr_create_at(ir3_after_block(block), opc, ndst, nsrc);
697 }
698
699 struct ir3_instruction *
700 ir3_instr_clone(struct ir3_instruction *instr)
701 {
702 struct ir3_instruction *new_instr = instr_create(
703 instr->block, instr->opc, instr->dsts_count, instr->srcs_count);
704 struct ir3_register **dsts, **srcs;
705
706 dsts = new_instr->dsts;
707 srcs = new_instr->srcs;
708 *new_instr = *instr;
709 new_instr->dsts = dsts;
710 new_instr->srcs = srcs;
711 list_inithead(&new_instr->rpt_node);
712
713 insert_instr(ir3_before_terminator(instr->block), new_instr);
714
715 /* clone registers: */
716 new_instr->dsts_count = 0;
717 new_instr->srcs_count = 0;
718 foreach_dst (reg, instr) {
719 struct ir3_register *new_reg =
720 ir3_dst_create(new_instr, reg->num, reg->flags);
721 *new_reg = *reg;
722 if (new_reg->instr)
723 new_reg->instr = new_instr;
724 }
725 foreach_src (reg, instr) {
726 struct ir3_register *new_reg =
727 ir3_src_create(new_instr, reg->num, reg->flags);
728 *new_reg = *reg;
729 }
730
731 if (instr->address) {
732 assert(instr->srcs_count > 0);
733 new_instr->address = new_instr->srcs[instr->srcs_count - 1];
734 add_to_address_users(new_instr);
735 }
736
737 return new_instr;
738 }
739
740 /* Add a false dependency to instruction, to ensure it is scheduled first: */
741 void
742 ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
743 {
744 for (unsigned i = 0; i < instr->deps_count; i++) {
745 if (instr->deps[i] == dep)
746 return;
747 }
748
749 array_insert(instr, instr->deps, dep);
750 }
751
752 void
753 ir3_instr_remove(struct ir3_instruction *instr)
754 {
755 list_delinit(&instr->node);
756 list_delinit(&instr->rpt_node);
757 }
758
759 void
760 ir3_instr_create_rpt(struct ir3_instruction **instrs, unsigned n)
761 {
762 assert(n > 0 && !ir3_instr_is_rpt(instrs[0]));
763
764 for (unsigned i = 1; i < n; ++i) {
765 assert(!ir3_instr_is_rpt(instrs[i]));
766 assert(instrs[i]->serialno > instrs[i - 1]->serialno);
767
768 list_addtail(&instrs[i]->rpt_node, &instrs[0]->rpt_node);
769 }
770 }
771
772 bool
773 ir3_instr_is_rpt(const struct ir3_instruction *instr)
774 {
775 return !list_is_empty(&instr->rpt_node);
776 }
777
778 bool
779 ir3_instr_is_first_rpt(const struct ir3_instruction *instr)
780 {
781 if (!ir3_instr_is_rpt(instr))
782 return false;
783
784 struct ir3_instruction *prev_rpt =
785 list_entry(instr->rpt_node.prev, struct ir3_instruction, rpt_node);
786 return prev_rpt->serialno > instr->serialno;
787 }
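/* The rpt_node lists above are circular and kept in increasing serialno order
 * (enforced by ir3_instr_create_rpt()), so the "first" repeat is detected as
 * the wrap-around point: the only entry whose list predecessor has a larger
 * serialno than its own.
 */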
788
789 struct ir3_instruction *
790 ir3_instr_prev_rpt(const struct ir3_instruction *instr)
791 {
792 assert(ir3_instr_is_rpt(instr));
793
794 if (ir3_instr_is_first_rpt(instr))
795 return NULL;
796 return list_entry(instr->rpt_node.prev, struct ir3_instruction, rpt_node);
797 }
798
799 struct ir3_instruction *
800 ir3_instr_first_rpt(struct ir3_instruction *instr)
801 {
802 assert(ir3_instr_is_rpt(instr));
803
804 while (!ir3_instr_is_first_rpt(instr)) {
805 instr = ir3_instr_prev_rpt(instr);
806 assert(instr);
807 }
808
809 return instr;
810 }
811
812 unsigned
813 ir3_instr_rpt_length(const struct ir3_instruction *instr)
814 {
815 assert(ir3_instr_is_first_rpt(instr));
816
817 return list_length(&instr->rpt_node) + 1;
818 }
819
820 struct ir3_register *
821 ir3_src_create(struct ir3_instruction *instr, int num, int flags)
822 {
823 struct ir3 *shader = instr->block->shader;
824 #if MESA_DEBUG
825 assert(instr->srcs_count < instr->srcs_max);
826 #endif
827 struct ir3_register *reg = reg_create(shader, num, flags);
828 instr->srcs[instr->srcs_count++] = reg;
829 return reg;
830 }
831
832 struct ir3_register *
833 ir3_dst_create(struct ir3_instruction *instr, int num, int flags)
834 {
835 struct ir3 *shader = instr->block->shader;
836 #if MESA_DEBUG
837 assert(instr->dsts_count < instr->dsts_max);
838 #endif
839 struct ir3_register *reg = reg_create(shader, num, flags);
840 instr->dsts[instr->dsts_count++] = reg;
841 return reg;
842 }
843
844 struct ir3_register *
845 ir3_reg_clone(struct ir3 *shader, struct ir3_register *reg)
846 {
847 struct ir3_register *new_reg = reg_create(shader, 0, 0);
848 *new_reg = *reg;
849 return new_reg;
850 }
851
852 void
853 ir3_reg_set_last_array(struct ir3_instruction *instr, struct ir3_register *reg,
854 struct ir3_register *last_write)
855 {
856 assert(reg->flags & IR3_REG_ARRAY);
857 struct ir3_register *new_reg = ir3_src_create(instr, 0, 0);
858 *new_reg = *reg;
859 new_reg->def = last_write;
860 ir3_reg_tie(reg, new_reg);
861 }
862
863 void
864 ir3_instr_set_address(struct ir3_instruction *instr,
865 struct ir3_instruction *addr)
866 {
867 if (!instr->address) {
868 assert(instr->block == addr->block);
869
870 instr->address =
871 ir3_src_create(instr, addr->dsts[0]->num, addr->dsts[0]->flags);
872 instr->address->def = addr->dsts[0];
873 add_to_address_users(instr);
874 } else {
875 assert(instr->address->def->instr == addr);
876 }
877 }
878
879 /* Does this instruction use the scalar ALU?
880 */
881 bool
882 is_scalar_alu(struct ir3_instruction *instr,
883 const struct ir3_compiler *compiler)
884 {
885 /* MOVMSK seems to always need (ss) even with other scalar ALU instructions
886 */
887 return instr->opc != OPC_MOVMSK &&
888 instr->opc != OPC_SCAN_CLUSTERS_MACRO &&
889 instr->opc != OPC_SCAN_MACRO &&
890 is_alu(instr) && (instr->dsts[0]->flags & IR3_REG_SHARED) &&
891 /* scalar->scalar mov instructions (but NOT cov) were supported before the
892 * scalar ALU was supported, but they still required (ss) whereas on GPUs
893 * that have a scalar ALU they are executed on it and do not require (ss).
894 * We have to be careful to return false for these if scalar ALU isn't
895 * supported, so that we treat them like vector->scalar mov instructions
896 * (such as requiring (ss)).
897 */
898 compiler->has_scalar_alu &&
899 /* moves from normal to shared seem to use a separate ALU as before and
900 * require a (ss) on dependent instructions.
901 */
902 ((instr->opc != OPC_MOV && !is_subgroup_cond_mov_macro(instr)) ||
903 (instr->srcs[0]->flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)));
904 }
905
906 void
907 ir3_block_clear_mark(struct ir3_block *block)
908 {
909 foreach_instr (instr, &block->instr_list)
910 instr->flags &= ~IR3_INSTR_MARK;
911 }
912
913 void
914 ir3_clear_mark(struct ir3 *ir)
915 {
916 foreach_block (block, &ir->block_list) {
917 ir3_block_clear_mark(block);
918 }
919 }
920
921 unsigned
922 ir3_count_instructions(struct ir3 *ir)
923 {
924 unsigned cnt = 1;
925 foreach_block (block, &ir->block_list) {
926 block->start_ip = cnt;
927 foreach_instr (instr, &block->instr_list) {
928 instr->ip = cnt++;
929 }
930 block->end_ip = cnt;
931 }
932 return cnt;
933 }
934
935 unsigned
936 ir3_count_instructions_sched(struct ir3 *ir)
937 {
938 unsigned cnt = 1;
939 foreach_block (block, &ir->block_list) {
940 block->start_ip = cnt;
941 foreach_instr (instr, &block->instr_list) {
942 if (!is_terminator(instr))
943 instr->ip = cnt++;
944 }
945 block->end_ip = cnt;
946 }
947 return cnt;
948 }
949
950 /* When counting instructions for RA, we insert extra fake instructions at the
951 * beginning of each block, where values become live, and at the end where
952 * values die. This prevents problems where values live-in at the beginning or
953 * live-out at the end of a block from being treated as if they were
954 * live-in/live-out at the first/last instruction, which would be incorrect.
955 * In ir3_legalize these ip's are assumed to be actual ip's of the final
956 * program, so it would be incorrect to use this everywhere.
957 */
958
959 unsigned
960 ir3_count_instructions_ra(struct ir3 *ir)
961 {
962 unsigned cnt = 1;
963 foreach_block (block, &ir->block_list) {
964 block->start_ip = cnt++;
965 foreach_instr (instr, &block->instr_list) {
966 instr->ip = cnt++;
967 }
968 block->end_ip = cnt++;
969 }
970 return cnt;
971 }
972
973 struct ir3_array *
974 ir3_lookup_array(struct ir3 *ir, unsigned id)
975 {
976 foreach_array (arr, &ir->array_list)
977 if (arr->id == id)
978 return arr;
979 return NULL;
980 }
981
982 void ir3_find_ssa_uses_for(struct ir3 *ir, void *mem_ctx, use_filter_cb filter)
983 {
984 /* We could do this in a single pass if we could assume instructions
985 * are always sorted, which currently might not always be true.
986 * (In particular after the ir3_group pass, but maybe other places.)
987 */
988 foreach_block (block, &ir->block_list)
989 foreach_instr (instr, &block->instr_list)
990 instr->uses = NULL;
991
992 foreach_block (block, &ir->block_list) {
993 foreach_instr (instr, &block->instr_list) {
994 foreach_ssa_src_n (src, n, instr) {
995 if (!filter(instr, n))
996 continue;
997 if (!src->uses)
998 src->uses = _mesa_pointer_set_create(mem_ctx);
999 _mesa_set_add(src->uses, instr);
1000 }
1001 }
1002 }
1003 }
1004
1005 static bool
1006 no_false_deps(struct ir3_instruction *instr, unsigned src_n)
1007 {
1008 return !__is_false_dep(instr, src_n);
1009 }
1010
1011 static bool
1012 any_src(struct ir3_instruction *instr, unsigned src_n)
1013 {
1014 return true;
1015 }
1016
1017 void
1018 ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps)
1019 {
1020 if (falsedeps)
1021 return ir3_find_ssa_uses_for(ir, mem_ctx, any_src);
1022 return ir3_find_ssa_uses_for(ir, mem_ctx, no_false_deps);
1023 }
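/* Usage sketch: collect the use sets into a throwaway ralloc context and then
 * walk the users of a given instruction:
 *
 *    void *mem_ctx = ralloc_context(NULL);
 *    ir3_find_ssa_uses(ir, mem_ctx, false);
 *    set_foreach (some_instr->uses, entry) {
 *       struct ir3_instruction *use = (void *)entry->key;
 *       ...
 *    }
 *    ralloc_free(mem_ctx);
 *
 * (the caller should check instr->uses for NULL, since instructions with no
 * uses never get a set allocated)
 */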
1024
1025 /**
1026 * Set the destination type of an instruction, for example if a
1027 * conversion is folded in, handling the special cases where the
1028 * instruction's dest type or opcode needs to be fixed up.
1029 */
1030 void
1031 ir3_set_dst_type(struct ir3_instruction *instr, bool half)
1032 {
1033 if (half) {
1034 instr->dsts[0]->flags |= IR3_REG_HALF;
1035 } else {
1036 instr->dsts[0]->flags &= ~IR3_REG_HALF;
1037 }
1038
1039 switch (opc_cat(instr->opc)) {
1040 case 1: /* move instructions */
1041 if (half) {
1042 instr->cat1.dst_type = half_type(instr->cat1.dst_type);
1043 } else {
1044 instr->cat1.dst_type = full_type(instr->cat1.dst_type);
1045 }
1046 break;
1047 case 4:
1048 if (half) {
1049 instr->opc = cat4_half_opc(instr->opc);
1050 } else {
1051 instr->opc = cat4_full_opc(instr->opc);
1052 }
1053 break;
1054 case 5:
1055 if (half) {
1056 instr->cat5.type = half_type(instr->cat5.type);
1057 } else {
1058 instr->cat5.type = full_type(instr->cat5.type);
1059 }
1060 break;
1061 }
1062 }
1063
1064 /**
1065 * One-time fixup for instruction src-types. Other than cov's that
1066 * are folded, an instruction's src type does not change.
1067 */
1068 void
1069 ir3_fixup_src_type(struct ir3_instruction *instr)
1070 {
1071 if (instr->srcs_count == 0)
1072 return;
1073
1074 switch (opc_cat(instr->opc)) {
1075 case 1: /* move instructions */
1076 if (instr->srcs[0]->flags & IR3_REG_HALF) {
1077 instr->cat1.src_type = half_type(instr->cat1.src_type);
1078 } else {
1079 instr->cat1.src_type = full_type(instr->cat1.src_type);
1080 }
1081 break;
1082 case 3:
1083 if (instr->srcs[0]->flags & IR3_REG_HALF) {
1084 instr->opc = cat3_half_opc(instr->opc);
1085 } else {
1086 instr->opc = cat3_full_opc(instr->opc);
1087 }
1088 break;
1089 }
1090 }
1091
1092 /**
1093 * Map a floating point immed to FLUT (float lookup table) value,
1094 * returns negative for immediates that cannot be mapped.
1095 */
1096 int
1097 ir3_flut(struct ir3_register *src_reg)
1098 {
1099 static const struct {
1100 uint32_t f32;
1101 uint16_t f16;
1102 } flut[] = {
1103 { .f32 = 0x00000000, .f16 = 0x0000 }, /* 0.0 */
1104 { .f32 = 0x3f000000, .f16 = 0x3800 }, /* 0.5 */
1105 { .f32 = 0x3f800000, .f16 = 0x3c00 }, /* 1.0 */
1106 { .f32 = 0x40000000, .f16 = 0x4000 }, /* 2.0 */
1107 { .f32 = 0x402df854, .f16 = 0x4170 }, /* e */
1108 { .f32 = 0x40490fdb, .f16 = 0x4248 }, /* pi */
1109 { .f32 = 0x3ea2f983, .f16 = 0x3518 }, /* 1/pi */
1110 { .f32 = 0x3f317218, .f16 = 0x398c }, /* 1/log2(e) */
1111 { .f32 = 0x3fb8aa3b, .f16 = 0x3dc5 }, /* log2(e) */
1112 { .f32 = 0x3e9a209b, .f16 = 0x34d1 }, /* 1/log2(10) */
1113 { .f32 = 0x40549a78, .f16 = 0x42a5 }, /* log2(10) */
1114 { .f32 = 0x40800000, .f16 = 0x4400 }, /* 4.0 */
1115 };
1116
1117 if (src_reg->flags & IR3_REG_HALF) {
1118 /* Note that half-float immeds are already lowered to 16b in nir: */
1119 uint32_t imm = src_reg->uim_val;
1120 for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
1121 if (flut[i].f16 == imm) {
1122 return i;
1123 }
1124 }
1125 } else {
1126 uint32_t imm = src_reg->uim_val;
1127 for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
1128 if (flut[i].f32 == imm) {
1129 return i;
1130 }
1131 }
1132 }
1133
1134 return -1;
1135 }
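/* Example (a sketch, not the actual copy-propagation logic): a pass that wants
 * to know whether a float immediate could be encoded via the lookup table
 * instead of a full immediate would do something like:
 *
 *    if (ir3_flut(src_reg) >= 0) {
 *       // immediate matches a FLUT entry
 *    }
 */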
1136
1137 static unsigned
1138 cp_flags(unsigned flags)
1139 {
1140 /* only considering these flags (at least for now): */
1141 flags &= (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS |
1142 IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT | IR3_REG_RELATIV |
1143 IR3_REG_SHARED);
1144 return flags;
1145 }
1146
1147 bool
1148 ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
1149 {
1150 struct ir3_compiler *compiler = instr->block->shader->compiler;
1151 unsigned valid_flags;
1152
1153 flags = cp_flags(flags);
1154
1155 /* If destination is indirect, then source cannot be.. at least
1156 * I don't think so..
1157 */
1158 if (instr->dsts_count > 0 && (instr->dsts[0]->flags & IR3_REG_RELATIV) &&
1159 (flags & IR3_REG_RELATIV))
1160 return false;
1161
1162 if (flags & IR3_REG_RELATIV) {
1163 /* TODO need to test on earlier gens.. pretty sure the earlier
1164 * problem was just that we didn't check that the src was from
1165 * same block (since we can't propagate address register values
1166 * across blocks currently)
1167 */
1168 if (compiler->gen < 6)
1169 return false;
1170
1171 /* NOTE in the special try_swap_mad_two_srcs() case we can be
1172 * called on a src that has already had an indirect load folded
1173 * in, in which case ssa() returns NULL
1174 */
1175 if (instr->srcs[n]->flags & IR3_REG_SSA) {
1176 struct ir3_instruction *src = ssa(instr->srcs[n]);
1177 if (src->address->def->instr->block != instr->block)
1178 return false;
1179 }
1180 }
1181
1182 if (is_meta(instr)) {
1183 /* collect and phi nodes support const/immed sources, which will be
1184 * turned into move instructions, but not anything else.
1185 */
1186 if (flags & ~(IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_SHARED))
1187 return false;
1188
1189 /* Except for immed/const sources, source and dest shared-ness must match.
1190 */
1191 if (!(flags & (IR3_REG_IMMED | IR3_REG_CONST)) &&
1192 (flags & IR3_REG_SHARED) != (instr->dsts[0]->flags & IR3_REG_SHARED))
1193 return false;
1194
1195 return true;
1196 }
1197
1198 switch (opc_cat(instr->opc)) {
1199 case 0: /* end, chmask */
1200 return flags == 0;
1201 case 1:
1202 switch (instr->opc) {
1203 case OPC_MOVMSK:
1204 case OPC_SWZ:
1205 case OPC_SCT:
1206 case OPC_GAT:
1207 valid_flags = IR3_REG_SHARED;
1208 break;
1209 case OPC_SCAN_MACRO:
1210 if (n == 0)
1211 return flags == 0;
1212 else
1213 return flags == IR3_REG_SHARED;
1214 break;
1215 case OPC_SCAN_CLUSTERS_MACRO:
1216 if (n == 0)
1217 return flags == IR3_REG_SHARED;
1218 else
1219 return flags == 0;
1220 break;
1221 default: {
1222 valid_flags =
1223 IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV | IR3_REG_SHARED;
1224
1225 /* floating-point conversions when moving from non-shared to shared
1226 * seem not to work. We only use floating-point types in ir3 for
1227 * conversions, so don't bother specially handling the case where the
1228 * types are equal. Same goes for 8-bit sign extension.
1229 */
1230 if ((instr->dsts[0]->flags & IR3_REG_SHARED) &&
1231 !(flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)) &&
1232 ((full_type(instr->cat1.src_type) == TYPE_F32 ||
1233 full_type(instr->cat1.dst_type) == TYPE_F32) ||
1234 (instr->cat1.src_type == TYPE_U8 &&
1235 full_type(instr->cat1.dst_type) == TYPE_S32)))
1236 return false;
1237
1238 /* Conversions seem not to work in shared->shared copies before scalar
1239 * ALU is supported.
1240 */
1241 if (!compiler->has_scalar_alu &&
1242 (flags & IR3_REG_SHARED) &&
1243 (instr->dsts[0]->flags & IR3_REG_SHARED) &&
1244 instr->cat1.src_type != instr->cat1.dst_type)
1245 return false;
1246 }
1247 }
1248 if (flags & ~valid_flags)
1249 return false;
1250 break;
1251 case 2:
1252 valid_flags = ir3_cat2_absneg(instr->opc) | IR3_REG_CONST |
1253 IR3_REG_RELATIV | IR3_REG_IMMED | IR3_REG_SHARED;
1254
1255 if (flags & ~valid_flags)
1256 return false;
1257
1258 /* Allow an immediate src1 for flat.b, since it's ignored */
1259 if (instr->opc == OPC_FLAT_B &&
1260 n == 1 && flags == IR3_REG_IMMED)
1261 return true;
1262
1263 /* cat2/cat3 scalar ALU instructions must not have regular sources. */
1264 if (instr->dsts[0]->flags & IR3_REG_SHARED) {
1265 if (!(flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)))
1266 return false;
1267 }
1268
1269 if (flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED)) {
1270 unsigned m = n ^ 1;
1271 /* cannot deal w/ const or shared in both srcs:
1272 * (note that some cat2 actually only have a single src)
1273 */
1274 if (m < instr->srcs_count) {
1275 struct ir3_register *reg = instr->srcs[m];
1276 if (instr->dsts[0]->flags & IR3_REG_SHARED) {
1277 if ((flags & IR3_REG_CONST) && (reg->flags & IR3_REG_CONST))
1278 return false;
1279 } else {
1280 if ((flags & (IR3_REG_CONST | IR3_REG_SHARED)) &&
1281 (reg->flags & (IR3_REG_CONST | IR3_REG_SHARED)))
1282 return false;
1283 }
1284 if ((flags & IR3_REG_IMMED) && reg->flags & (IR3_REG_IMMED))
1285 return false;
1286 }
1287 }
1288 break;
1289 case 3:
1290 valid_flags =
1291 ir3_cat3_absneg(instr->opc) | IR3_REG_RELATIV | IR3_REG_SHARED;
1292
1293 switch (instr->opc) {
1294 case OPC_SHRM:
1295 case OPC_SHLM:
1296 case OPC_SHRG:
1297 case OPC_SHLG:
1298 case OPC_ANDG: {
1299 valid_flags |= IR3_REG_IMMED;
1300 /* Can be RELATIV+CONST but not CONST: */
1301 if (flags & IR3_REG_RELATIV)
1302 valid_flags |= IR3_REG_CONST;
1303 break;
1304 }
1305 case OPC_WMM:
1306 case OPC_WMM_ACCU: {
1307 valid_flags = IR3_REG_SHARED;
1308 if (n == 2)
1309 valid_flags = IR3_REG_CONST;
1310 break;
1311 }
1312 case OPC_DP2ACC:
1313 case OPC_DP4ACC:
1314 break;
1315 default:
1316 valid_flags |= IR3_REG_CONST;
1317 }
1318
1319 if (flags & ~valid_flags)
1320 return false;
1321
1322 if (flags & (IR3_REG_CONST | IR3_REG_RELATIV) ||
1323 (!(instr->dsts[0]->flags & IR3_REG_SHARED) &&
1324 (flags & IR3_REG_SHARED))) {
1325 /* cannot deal w/ const/shared/relativ in 2nd src: */
1326 if (n == 1)
1327 return false;
1328 }
1329
1330 if (instr->dsts[0]->flags & IR3_REG_SHARED) {
1331 if (!(flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)))
1332 return false;
1333 }
1334
1335 break;
1336 case 4:
1337 if ((instr->dsts[0]->flags & IR3_REG_SHARED) != (flags & IR3_REG_SHARED))
1338 return false;
1339 /* seems like blob compiler avoids const as src.. */
1340 /* TODO double check if this is still the case on a4xx */
1341 if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
1342 return false;
1343 if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
1344 return false;
1345 break;
1346 case 5:
1347 if (instr->opc == OPC_ISAM && (instr->flags & IR3_INSTR_V)) {
1348 if (((instr->flags & IR3_INSTR_S2EN) && n == 2) ||
1349 (!(instr->flags & IR3_INSTR_S2EN) && n == 1)) {
1350 return flags == IR3_REG_IMMED;
1351 }
1352 }
1353 /* no flags allowed */
1354 if (flags)
1355 return false;
1356 break;
1357 case 6:
1358 valid_flags = IR3_REG_IMMED;
1359
1360 if (instr->opc == OPC_STC && n == 1)
1361 valid_flags |= IR3_REG_SHARED;
1362
1363 if (flags & ~valid_flags)
1364 return false;
1365
1366 if (flags & IR3_REG_IMMED) {
1367 /* doesn't seem like we can have immediate src for store
1368 * instructions:
1369 *
1370 * TODO this restriction could also apply to load instructions,
1371 * but for load instructions this arg is the address (and not
1372 * really sure any good way to test a hard-coded immed addr src)
1373 */
1374 if (is_store(instr) && (instr->opc != OPC_STG) && (n == 1))
1375 return false;
1376
1377 if ((instr->opc == OPC_LDL) && (n == 0))
1378 return false;
1379
1380 if ((instr->opc == OPC_STL) && (n != 2))
1381 return false;
1382
1383 if ((instr->opc == OPC_LDP) && (n == 0))
1384 return false;
1385
1386 if ((instr->opc == OPC_STP) && (n != 2))
1387 return false;
1388
1389 if (instr->opc == OPC_STLW && n == 0)
1390 return false;
1391
1392 if (instr->opc == OPC_LDLW && n == 0)
1393 return false;
1394
1395 /* disallow immediates in anything but the SSBO slot argument for
1396 * cat6 instructions:
1397 */
1398 if (is_global_a3xx_atomic(instr->opc) && (n != 0))
1399 return false;
1400
1401 if (is_local_atomic(instr->opc) || is_global_a6xx_atomic(instr->opc) ||
1402 is_bindless_atomic(instr->opc))
1403 return false;
1404
1405 if (instr->opc == OPC_STG && (n == 2))
1406 return false;
1407
1408 if (instr->opc == OPC_STG_A && (n == 4))
1409 return false;
1410
1411 if (instr->opc == OPC_LDG && (n == 0))
1412 return false;
1413
1414 if (instr->opc == OPC_LDG_A && (n < 2))
1415 return false;
1416
1417 if (instr->opc == OPC_STC && n != 0)
1418 return false;
1419
1420 /* as with atomics, these cat6 instrs can only have an immediate
1421 * for SSBO/IBO slot argument
1422 */
1423 switch (instr->opc) {
1424 case OPC_LDIB:
1425 case OPC_STIB:
1426 if (n != 0 && n != 2)
1427 return false;
1428 break;
1429 case OPC_RESINFO:
1430 if (n != 0)
1431 return false;
1432 break;
1433 default:
1434 break;
1435 }
1436 }
1437
1438 break;
1439 }
1440
1441 return true;
1442 }
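/* Usage sketch: passes that fold sources (e.g. copy-propagation) are expected
 * to check the candidate flags for a given src slot first:
 *
 *    if (ir3_valid_flags(instr, n, new_flags)) {
 *       // safe to rewrite instr->srcs[n] with the folded source
 *    }
 */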
1443
1444 bool
1445 ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed)
1446 {
1447 if (instr->opc == OPC_MOV || is_meta(instr))
1448 return true;
1449
1450 if (is_mem(instr)) {
1451 switch (instr->opc) {
1452 /* Some load/store instructions have a 13-bit offset and size which must
1453 * always be an immediate and the rest of the sources cannot be
1454 * immediates, so the frontend is responsible for checking the size:
1455 */
1456 case OPC_LDL:
1457 case OPC_STL:
1458 case OPC_LDP:
1459 case OPC_STP:
1460 case OPC_LDG:
1461 case OPC_STG:
1462 case OPC_SPILL_MACRO:
1463 case OPC_RELOAD_MACRO:
1464 case OPC_LDG_A:
1465 case OPC_STG_A:
1466 case OPC_LDLW:
1467 case OPC_STLW:
1468 case OPC_LDLV:
1469 return true;
1470 default:
1471 /* most cat6 src immediates can only encode 8 bits: */
1472 return !(immed & ~0xff);
1473 }
1474 }
1475
1476 /* Other than cat1 (mov) we can only encode up to 10 bits, sign-extended: */
1477 return !(immed & ~0x1ff) || !(-immed & ~0x1ff);
1478 }
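/* E.g. for a typical cat2 ALU instruction this accepts immediates in the range
 * [-511, 511] (10 bits, sign-extended) and rejects something like 600, while
 * cat1 movs and the listed cat6 offset/size operands accept any 32-bit value.
 */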
1479
1480 struct ir3_instruction *
1481 ir3_get_cond_for_nonzero_compare(struct ir3_instruction *instr)
1482 {
1483 /* If instr is a negation (likely as a result of an nir_b2n), we can ignore
1484 * that and use its source, since the nonzero-ness stays the same.
1485 */
1486 if (instr->opc == OPC_ABSNEG_S && instr->flags == 0 &&
1487 (instr->srcs[0]->flags & (IR3_REG_SNEG | IR3_REG_SABS)) ==
1488 IR3_REG_SNEG) {
1489 return instr->srcs[0]->def->instr;
1490 }
1491
1492 return instr;
1493 }
1494
1495 bool
1496 ir3_supports_rpt(struct ir3_compiler *compiler, unsigned opc)
1497 {
1498 switch (opc_cat(opc)) {
1499 case 0:
1500 return opc == OPC_NOP;
1501 case 1:
1502 return opc == OPC_MOV || opc == OPC_SWZ || opc == OPC_MOVMSK;
1503 case 2:
1504 if (opc == OPC_BARY_F && !compiler->has_rpt_bary_f)
1505 return false;
1506 return true;
1507 case 3:
1508 return opc != OPC_DP2ACC && opc != OPC_DP4ACC;
1509 case 4:
1510 return opc != OPC_RCP;
1511 default:
1512 return false;
1513 }
1514 }
1515