/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file
 *
 * This file contains the opt_combine_constants() pass that runs after the
 * regular optimization loop. It passes over the instruction list and promotes
 * immediate values to registers by emitting a mov(1) instruction.
 */
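
/* As a hypothetical illustration of the transformation (the register numbers
 * and regions below are made up for the example, not taken from real compiler
 * output):
 *
 *    before:  mul(8)  g10<1>F  g4<8,8,1>F  1.5F
 *             add(8)  g12<1>F  g6<8,8,1>F  -1.5F
 *
 *    after:   mov(1)  g8<1>F   1.5F
 *             mul(8)  g10<1>F  g4<8,8,1>F  g8<0,1,0>F
 *             add(8)  g12<1>F  g6<8,8,1>F  -g8<0,1,0>F
 *
 * Both immediates can be served by a single mov(1) because -1.5F is
 * recoverable from 1.5F with a source negation.
 */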

#include "brw_fs.h"
#include "brw_fs_builder.h"
#include "brw_cfg.h"
#include "util/half_float.h"

using namespace brw;

static const bool debug = false;

enum PACKED interpreted_type {
   float_only = 0,
   integer_only,
   either_type
};

struct value {
   /** Raw bit pattern of the value. */
   nir_const_value value;

   /** Instruction that uses this instance of the value. */
   unsigned instr_index;

   /** Size, in bits, of the value. */
   uint8_t bit_size;

   /**
    * Which source of instr is this value?
    *
    * \note This field is not actually used by \c brw_combine_constants, but
    * it is generally very useful to callers.
    */
   uint8_t src;

   /**
    * In what ways can instr interpret this value?
    *
    * Choices are floating-point only, integer only, or either type.
    */
   enum interpreted_type type;

   /**
    * Only try to make a single source non-constant.
    *
    * On some architectures, some instructions require that all sources be
    * non-constant. For example, the multiply-accumulate instruction on Intel
    * GPUs up to Gen11 requires that all sources be non-constant. Other
    * instructions, like the selection instruction, allow one constant source.
    *
    * If a single constant source is allowed, set this flag to true.
    *
    * If an instruction allows a single constant and it has only a single
    * constant to begin with, it should still be included. Various places in
    * \c combine_constants will assume that there are multiple constants if
    * \c ::allow_one_constant is set. This may even be enforced by in-code
    * assertions.
    */
   bool allow_one_constant;

   /**
    * Restrict values that can reach this value to not include negations.
    *
    * This is useful for instructions that cannot have source modifiers. For
    * example, on Intel GPUs the integer source of a shift instruction (e.g.,
    * SHL) can have a source modifier, but the integer source of the bitfield
    * insertion instruction (i.e., BFI2) cannot. A pair of these instructions
    * might have sources that are negations of each other. Using this flag
    * will ensure that the BFI2 does not have a negated source, but the SHL
    * might.
    */
   bool no_negations;

   /**
    * \name UtilCombineConstantsPrivate
    * Private data used only by brw_combine_constants
    *
    * Any data stored in these fields will be overwritten by the call to
    * \c brw_combine_constants. No assumptions should be made about the
    * state of these fields after that function returns.
    */
   /**@{*/
   /** Mask of negations that can be generated from this value. */
   uint8_t reachable_mask;

   /** Mask of negations that can generate this value. */
   uint8_t reaching_mask;

   /**
    * Value with the next source from the same instruction.
    *
    * This pointer may be \c NULL. If it is not \c NULL, it will form a
    * singly-linked circular list of values. The list is unordered. That is,
    * as the list is iterated, the \c ::src values will be in arbitrary order.
    *
    * \todo Is it even possible for there to be more than two elements in this
    * list? This pass does not operate on vecN instructions or intrinsics, so
    * the theoretical limit should be three. However, instructions with all
    * constant sources should have been folded away.
    */
   struct value *next_src;
   /**@}*/
};

struct combine_constants_value {
   /** Raw bit pattern of the constant loaded. */
   nir_const_value value;

   /**
    * Index of the first user.
    *
    * This is the offset into \c combine_constants_result::user_map of the
    * first user of this value.
    */
   unsigned first_user;

   /** Number of users of this value. */
   unsigned num_users;

   /** Size, in bits, of the value. */
   uint8_t bit_size;
};

struct combine_constants_user {
   /** Index into the array of values passed to brw_combine_constants. */
   unsigned index;

   /**
    * Manner in which the value should be interpreted in the instruction.
    *
    * This is only useful when ::negate is set. Unless the corresponding
    * value::type is \c either_type, this field must have the same value as
    * value::type.
    */
   enum interpreted_type type;

   /** Should this value be negated to generate the original value? */
   bool negate;
};

class combine_constants_result {
public:
   combine_constants_result(unsigned num_candidates, bool &success)
      : num_values_to_emit(0), user_map(NULL)
   {
      user_map = (struct combine_constants_user *) calloc(num_candidates,
                                                          sizeof(user_map[0]));

      /* In the worst case, the number of output values will be equal to the
       * number of input values. Allocate a buffer that is known to be large
       * enough now, and it can be reduced later.
       */
      values_to_emit =
         (struct combine_constants_value *) calloc(num_candidates,
                                                   sizeof(values_to_emit[0]));

      success = (user_map != NULL && values_to_emit != NULL);
   }

   ~combine_constants_result()
   {
      free(values_to_emit);
      free(user_map);
   }

   void append_value(const nir_const_value &value, unsigned bit_size)
   {
      values_to_emit[num_values_to_emit].value = value;
      values_to_emit[num_values_to_emit].first_user = 0;
      values_to_emit[num_values_to_emit].num_users = 0;
      values_to_emit[num_values_to_emit].bit_size = bit_size;
      num_values_to_emit++;
   }

   unsigned num_values_to_emit;
   struct combine_constants_value *values_to_emit;

   struct combine_constants_user *user_map;
};

#define VALUE_INDEX 0
#define FLOAT_NEG_INDEX 1
#define INT_NEG_INDEX 2
#define MAX_NUM_REACHABLE 3

#define VALUE_EXISTS (1 << VALUE_INDEX)
#define FLOAT_NEG_EXISTS (1 << FLOAT_NEG_INDEX)
#define INT_NEG_EXISTS (1 << INT_NEG_INDEX)

static bool
negation_exists(nir_const_value v, unsigned bit_size,
                enum interpreted_type base_type)
{
   /* either_type does not make sense in this context. */
   assert(base_type == float_only || base_type == integer_only);

   switch (bit_size) {
   case 8:
      if (base_type == float_only)
         return false;
      else
         return v.i8 != 0 && v.i8 != INT8_MIN;

   case 16:
      if (base_type == float_only)
         return !util_is_half_nan(v.i16);
      else
         return v.i16 != 0 && v.i16 != INT16_MIN;

   case 32:
      if (base_type == float_only)
         return !isnan(v.f32);
      else
         return v.i32 != 0 && v.i32 != INT32_MIN;

   case 64:
      if (base_type == float_only)
         return !isnan(v.f64);
      else
         return v.i64 != 0 && v.i64 != INT64_MIN;

   default:
      unreachable("unsupported bit-size should have already been filtered.");
   }
}
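
/* A sketch of the boundary cases handled above: in 8-bit integer arithmetic,
 * INT8_MIN (-128) has no negation because +128 is not representable in two's
 * complement, and negating 0 produces no new value. On the float side, NaN
 * is rejected because flipping the sign of a NaN does not yield a usefully
 * distinct constant.
 */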

static nir_const_value
negate(nir_const_value v, unsigned bit_size, enum interpreted_type base_type)
{
   /* either_type does not make sense in this context. */
   assert(base_type == float_only || base_type == integer_only);

   nir_const_value ret = { 0, };

   switch (bit_size) {
   case 8:
      assert(base_type == integer_only);
      ret.i8 = -v.i8;
      break;

   case 16:
      if (base_type == float_only)
         ret.u16 = v.u16 ^ INT16_MIN;
      else
         ret.i16 = -v.i16;
      break;

   case 32:
      if (base_type == float_only)
         ret.u32 = v.u32 ^ INT32_MIN;
      else
         ret.i32 = -v.i32;
      break;

   case 64:
      if (base_type == float_only)
         ret.u64 = v.u64 ^ INT64_MIN;
      else
         ret.i64 = -v.i64;
      break;

   default:
      unreachable("unsupported bit-size should have already been filtered.");
   }

   return ret;
}
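
/* Illustrative 32-bit bit patterns for the cases above: 1.0f is 0x3f800000,
 * and XOR-ing with INT32_MIN (0x80000000) flips only the sign bit, giving
 * 0xbf800000, i.e. -1.0f. The integer path instead uses ordinary
 * two's-complement negation, e.g. negate(5) == 0xfffffffb == -5.
 */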

static nir_const_value
absolute(nir_const_value v, unsigned bit_size, enum interpreted_type base_type)
{
   /* either_type does not make sense in this context. */
   assert(base_type == float_only || base_type == integer_only);

   nir_const_value ret = { 0, };

   switch (bit_size) {
   case 8:
      assert(base_type == integer_only);
      ret.i8 = abs(v.i8);
      break;

   case 16:
      if (base_type == float_only)
         ret.u16 = v.u16 & 0x7fff;
      else
         ret.i16 = abs(v.i16);
      break;

   case 32:
      if (base_type == float_only)
         ret.f32 = fabs(v.f32);
      else
         ret.i32 = abs(v.i32);
      break;

   case 64:
      if (base_type == float_only)
         ret.f64 = fabs(v.f64);
      else {
         if (sizeof(v.i64) == sizeof(long int)) {
            ret.i64 = labs((long int) v.i64);
         } else {
            assert(sizeof(v.i64) == sizeof(long long int));
            ret.i64 = llabs((long long int) v.i64);
         }
      }
      break;

   default:
      unreachable("unsupported bit-size should have already been filtered.");
   }

   return ret;
}

static void
calculate_masks(nir_const_value v, enum interpreted_type type,
                unsigned bit_size, uint8_t *reachable_mask,
                uint8_t *reaching_mask)
{
   *reachable_mask = 0;
   *reaching_mask = 0;

   /* Calculate the extended reachable mask. */
   if (type == float_only || type == either_type) {
      if (negation_exists(v, bit_size, float_only))
         *reachable_mask |= FLOAT_NEG_EXISTS;
   }

   if (type == integer_only || type == either_type) {
      if (negation_exists(v, bit_size, integer_only))
         *reachable_mask |= INT_NEG_EXISTS;
   }

   /* Calculate the extended reaching mask. All of the "is this negation
    * possible" was already determined for the reachable_mask, so reuse that
    * data.
    */
   if (type == float_only || type == either_type) {
      if (*reachable_mask & FLOAT_NEG_EXISTS)
         *reaching_mask |= FLOAT_NEG_EXISTS;
   }

   if (type == integer_only || type == either_type) {
      if (*reachable_mask & INT_NEG_EXISTS)
         *reaching_mask |= INT_NEG_EXISTS;
   }
}
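
/* A worked example with values chosen for illustration: for v == 2.0f
 * (0x40000000) with type == either_type and bit_size == 32, both the float
 * negation (-2.0f) and the integer negation of the raw bits exist, so
 * reachable_mask and reaching_mask both become
 * FLOAT_NEG_EXISTS | INT_NEG_EXISTS. For v == INT32_MIN with
 * type == integer_only, negation_exists() is false, so both masks stay 0.
 */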

static void
calculate_reachable_values(nir_const_value v,
                           unsigned bit_size,
                           unsigned reachable_mask,
                           nir_const_value *reachable_values)
{
   memset(reachable_values, 0, MAX_NUM_REACHABLE * sizeof(reachable_values[0]));

   reachable_values[VALUE_INDEX] = v;

   if (reachable_mask & INT_NEG_EXISTS) {
      const nir_const_value neg = negate(v, bit_size, integer_only);

      reachable_values[INT_NEG_INDEX] = neg;
   }

   if (reachable_mask & FLOAT_NEG_EXISTS) {
      const nir_const_value neg = negate(v, bit_size, float_only);

      reachable_values[FLOAT_NEG_INDEX] = neg;
   }
}

static bool
value_equal(nir_const_value a, nir_const_value b, unsigned bit_size)
{
   switch (bit_size) {
   case 8:
      return a.u8 == b.u8;
   case 16:
      return a.u16 == b.u16;
   case 32:
      return a.u32 == b.u32;
   case 64:
      return a.u64 == b.u64;
   default:
      unreachable("unsupported bit-size should have already been filtered.");
   }
}

/** Can these values be the same with one level of negation? */
static bool
value_can_equal(const nir_const_value *from, uint8_t reachable_mask,
                nir_const_value to, uint8_t reaching_mask,
                unsigned bit_size)
{
   const uint8_t combined_mask = reachable_mask & reaching_mask;

   return value_equal(from[VALUE_INDEX], to, bit_size) ||
          ((combined_mask & INT_NEG_EXISTS) &&
           value_equal(from[INT_NEG_INDEX], to, bit_size)) ||
          ((combined_mask & FLOAT_NEG_EXISTS) &&
           value_equal(from[FLOAT_NEG_INDEX], to, bit_size));
}
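
/* For example, if from[VALUE_INDEX] holds 2.0f and FLOAT_NEG_EXISTS is set in
 * both masks, calculate_reachable_values() will have stored -2.0f in
 * from[FLOAT_NEG_INDEX], so value_can_equal(from, ..., to = -2.0f, ...)
 * returns true even though the raw bit patterns of 2.0f and -2.0f differ.
 */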

static void
preprocess_candidates(struct value *candidates, unsigned num_candidates)
{
   /* Calculate the reaching_mask and reachable_mask for each candidate. */
   for (unsigned i = 0; i < num_candidates; i++) {
      calculate_masks(candidates[i].value,
                      candidates[i].type,
                      candidates[i].bit_size,
                      &candidates[i].reachable_mask,
                      &candidates[i].reaching_mask);

      /* If negations are not allowed, then only the original value is
       * reaching.
       */
      if (candidates[i].no_negations)
         candidates[i].reaching_mask = 0;
   }

   for (unsigned i = 0; i < num_candidates; i++)
      candidates[i].next_src = NULL;

   for (unsigned i = 0; i < num_candidates - 1; i++) {
      if (candidates[i].next_src != NULL)
         continue;

      struct value *prev = &candidates[i];

      for (unsigned j = i + 1; j < num_candidates; j++) {
         if (candidates[i].instr_index == candidates[j].instr_index) {
            prev->next_src = &candidates[j];
            prev = prev->next_src;
         }
      }

      /* Close the cycle. */
      if (prev != &candidates[i])
         prev->next_src = &candidates[i];
   }
}
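
/* A sketch of the links this builds: if candidates 3 and 7 are the two
 * constant sources of the same instruction, the loop above yields the
 * circular list candidates[3].next_src == &candidates[7] and
 * candidates[7].next_src == &candidates[3]. A candidate whose instruction
 * contributes only one constant keeps next_src == NULL.
 */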

static bool
reaching_value_exists(const struct value *c,
                      const struct combine_constants_value *values,
                      unsigned num_values)
{
   nir_const_value reachable_values[MAX_NUM_REACHABLE];

   calculate_reachable_values(c->value, c->bit_size, c->reaching_mask,
                              reachable_values);

   /* Check to see if the value is already in the result set. */
   for (unsigned j = 0; j < num_values; j++) {
      if (c->bit_size == values[j].bit_size &&
          value_can_equal(reachable_values, c->reaching_mask,
                          values[j].value, c->reaching_mask,
                          c->bit_size)) {
         return true;
      }
   }

   return false;
}

static combine_constants_result *
combine_constants_greedy(struct value *candidates, unsigned num_candidates)
{
   bool success;
   combine_constants_result *result =
      new combine_constants_result(num_candidates, success);
   if (result == NULL || !success) {
      delete result;
      return NULL;
   }

   BITSET_WORD *remain =
      (BITSET_WORD *) calloc(BITSET_WORDS(num_candidates), sizeof(remain[0]));

   if (remain == NULL) {
      delete result;
      return NULL;
   }

   memset(remain, 0xff, BITSET_WORDS(num_candidates) * sizeof(remain[0]));

   /* Operate in three passes. The first pass handles all values that must be
    * emitted and for which a negation cannot exist.
    */
   unsigned i;
   for (i = 0; i < num_candidates; i++) {
      if (candidates[i].allow_one_constant ||
          (candidates[i].reaching_mask & (FLOAT_NEG_EXISTS | INT_NEG_EXISTS))) {
         continue;
      }

      /* Check to see if the value is already in the result set. */
      bool found = false;
      const unsigned num_values = result->num_values_to_emit;
      for (unsigned j = 0; j < num_values; j++) {
         if (candidates[i].bit_size == result->values_to_emit[j].bit_size &&
             value_equal(candidates[i].value,
                         result->values_to_emit[j].value,
                         candidates[i].bit_size)) {
            found = true;
            break;
         }
      }

      if (!found)
         result->append_value(candidates[i].value, candidates[i].bit_size);

      BITSET_CLEAR(remain, i);
   }

   /* The second pass handles all values that must be emitted and for which a
    * negation can exist.
    */
   BITSET_FOREACH_SET(i, remain, num_candidates) {
      if (candidates[i].allow_one_constant)
         continue;

      assert(candidates[i].reaching_mask & (FLOAT_NEG_EXISTS | INT_NEG_EXISTS));

      if (!reaching_value_exists(&candidates[i], result->values_to_emit,
                                 result->num_values_to_emit)) {
         result->append_value(absolute(candidates[i].value,
                                       candidates[i].bit_size,
                                       candidates[i].type),
                              candidates[i].bit_size);
      }

      BITSET_CLEAR(remain, i);
   }

   /* The third pass handles all of the values that may not have to be
    * emitted. These are the values where allow_one_constant is set.
    */
   BITSET_FOREACH_SET(i, remain, num_candidates) {
      assert(candidates[i].allow_one_constant);

      /* The BITSET_FOREACH_SET macro does not detect changes to the bitset
       * that occur within the current word. Since code in this loop may
       * clear bits from the set, re-test here.
       */
      if (!BITSET_TEST(remain, i))
         continue;

      assert(candidates[i].next_src != NULL);

      const struct value *const other_candidate = candidates[i].next_src;
      const unsigned j = other_candidate - candidates;

      if (!reaching_value_exists(&candidates[i], result->values_to_emit,
                                 result->num_values_to_emit)) {
         /* Before emitting a value, see if a match for the other source of
          * the instruction exists.
          */
         if (!reaching_value_exists(&candidates[j], result->values_to_emit,
                                    result->num_values_to_emit)) {
            result->append_value(candidates[i].value, candidates[i].bit_size);
         }
      }

      /* Mark both sources as handled. */
      BITSET_CLEAR(remain, i);
      BITSET_CLEAR(remain, j);
   }

   /* As noted above, there will never be more values in the output than in
    * the input. If there are fewer values, reduce the size of the
    * allocation.
    */
   if (result->num_values_to_emit < num_candidates) {
      result->values_to_emit = (struct combine_constants_value *)
         realloc(result->values_to_emit, sizeof(result->values_to_emit[0]) *
                 result->num_values_to_emit);

      /* Is it even possible for a reducing realloc to fail? */
      assert(result->values_to_emit != NULL);
   }

   /* Create the mapping from "combined" constants to list of candidates
    * passed in by the caller.
    */
   memset(remain, 0xff, BITSET_WORDS(num_candidates) * sizeof(remain[0]));

   unsigned total_users = 0;

   const unsigned num_values = result->num_values_to_emit;
   for (unsigned value_idx = 0; value_idx < num_values; value_idx++) {
      result->values_to_emit[value_idx].first_user = total_users;

      uint8_t reachable_mask;
      uint8_t unused_mask;

      calculate_masks(result->values_to_emit[value_idx].value, either_type,
                      result->values_to_emit[value_idx].bit_size,
                      &reachable_mask, &unused_mask);

      nir_const_value reachable_values[MAX_NUM_REACHABLE];

      calculate_reachable_values(result->values_to_emit[value_idx].value,
                                 result->values_to_emit[value_idx].bit_size,
                                 reachable_mask, reachable_values);

      for (unsigned i = 0; i < num_candidates; i++) {
         bool matched = false;

         if (!BITSET_TEST(remain, i))
            continue;

         if (candidates[i].bit_size != result->values_to_emit[value_idx].bit_size)
            continue;

         if (value_equal(candidates[i].value, result->values_to_emit[value_idx].value,
                         result->values_to_emit[value_idx].bit_size)) {
            result->user_map[total_users].index = i;
            result->user_map[total_users].type = candidates[i].type;
            result->user_map[total_users].negate = false;
            total_users++;

            matched = true;
            BITSET_CLEAR(remain, i);
         } else {
            const uint8_t combined_mask = reachable_mask &
                                          candidates[i].reaching_mask;

            enum interpreted_type type = either_type;

            if ((combined_mask & INT_NEG_EXISTS) &&
                value_equal(candidates[i].value,
                            reachable_values[INT_NEG_INDEX],
                            candidates[i].bit_size)) {
               type = integer_only;
            }

            if (type == either_type &&
                (combined_mask & FLOAT_NEG_EXISTS) &&
                value_equal(candidates[i].value,
                            reachable_values[FLOAT_NEG_INDEX],
                            candidates[i].bit_size)) {
               type = float_only;
            }

            if (type != either_type) {
               /* Finding a match on this path implies that the user must
                * allow source negations.
                */
               assert(!candidates[i].no_negations);

               result->user_map[total_users].index = i;
               result->user_map[total_users].type = type;
               result->user_map[total_users].negate = true;
               total_users++;

               matched = true;
               BITSET_CLEAR(remain, i);
            }
         }

         /* Mark the other source of instructions that can have a constant
          * source. Selection is the prime example of this, and we want to
          * avoid generating sequences like bcsel(a, fneg(b), ineg(c)).
          *
          * This also makes sure that the assertion (below) that *all* values
          * were processed holds even when some values may be allowed to
          * remain as constants.
          *
          * FINISHME: There may be value in only doing this when type ==
          * either_type. If both sources are loaded, a register allocator may
          * be able to make a better choice about which value to "spill"
          * (i.e., replace with an immediate) under heavy register pressure.
          */
         if (matched && candidates[i].allow_one_constant) {
            const struct value *const other_src = candidates[i].next_src;
            const unsigned idx = other_src - candidates;

            assert(idx < num_candidates);
            BITSET_CLEAR(remain, idx);
         }
      }

      assert(total_users > result->values_to_emit[value_idx].first_user);
      result->values_to_emit[value_idx].num_users =
         total_users - result->values_to_emit[value_idx].first_user;
   }

   /* Verify that all of the values were emitted by the loop above. If any
    * bits are still set in remain, then some value was not emitted. The use
    * of memset to populate remain prevents the use of a more performant loop.
    */
#ifndef NDEBUG
   bool pass = true;

   BITSET_FOREACH_SET(i, remain, num_candidates) {
      fprintf(stderr, "candidate %d was not processed: { "
              ".b = %s, "
              ".f32 = %f, .f64 = %g, "
              ".i8 = %d, .u8 = 0x%02x, "
              ".i16 = %d, .u16 = 0x%04x, "
              ".i32 = %d, .u32 = 0x%08x, "
              ".i64 = %" PRId64 ", .u64 = 0x%016" PRIx64 " }\n",
              i,
              candidates[i].value.b ? "true" : "false",
              candidates[i].value.f32, candidates[i].value.f64,
              candidates[i].value.i8, candidates[i].value.u8,
              candidates[i].value.i16, candidates[i].value.u16,
              candidates[i].value.i32, candidates[i].value.u32,
              candidates[i].value.i64, candidates[i].value.u64);
      pass = false;
   }

   assert(pass && "All values should have been processed.");
#endif

   free(remain);

   return result;
}
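
/* A worked example of the greedy passes (the candidate values are
 * illustrative): given 32-bit float candidates { NaN, 1.5f, -1.5f }, none
 * marked allow_one_constant:
 *
 *  - Pass 1 emits NaN, since no float negation of NaN exists.
 *  - Pass 2 reaches 1.5f and emits absolute(1.5f) == 1.5f; it then finds
 *    that -1.5f is already reachable from the emitted 1.5f, so nothing new
 *    is emitted for it.
 *  - The user-map loop records 1.5f with negate = false and -1.5f with
 *    negate = true and type = float_only, both pointing at the single
 *    emitted 1.5f.
 *
 * The net effect is two mov(1) loads instead of three.
 */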

static combine_constants_result *
brw_combine_constants(struct value *candidates, unsigned num_candidates)
{
   preprocess_candidates(candidates, num_candidates);

   return combine_constants_greedy(candidates, num_candidates);
}

/**
 * Box for storing fs_inst and some other necessary data
 *
 * \sa box_instruction
 */
struct fs_inst_box {
   fs_inst *inst;
   unsigned ip;
   bblock_t *block;
};

/** A box for putting fs_regs in a linked list. */
struct reg_link {
   DECLARE_RALLOC_CXX_OPERATORS(reg_link)

   reg_link(fs_inst *inst, unsigned src, bool negate, enum interpreted_type type)
      : inst(inst), src(src), negate(negate), type(type) {}

   struct exec_node link;
   fs_inst *inst;
   uint8_t src;
   bool negate;
   enum interpreted_type type;
};

static struct exec_node *
link(void *mem_ctx, fs_inst *inst, unsigned src, bool negate,
     enum interpreted_type type)
{
   reg_link *l = new(mem_ctx) reg_link(inst, src, negate, type);
   return &l->link;
}

/**
 * Information about an immediate value.
 */
struct imm {
   /** The common ancestor of all blocks using this immediate value. */
   bblock_t *block;

   /**
    * The instruction generating the immediate value, if all uses are contained
    * within a single basic block. Otherwise, NULL.
    */
   fs_inst *inst;

   /**
    * A list of fs_regs that refer to this immediate. If we promote it, we'll
    * have to patch these up to refer to the new GRF.
    */
   exec_list *uses;

   /** The immediate value */
   union {
      char bytes[8];
      double df;
      int64_t d64;
      float f;
      int32_t d;
      int16_t w;
   };
   uint8_t size;

   /** When promoting half-float we need to account for certain restrictions */
   bool is_half_float;

   /**
    * The GRF register and subregister number where we've decided to store the
    * constant value.
    */
   uint8_t subreg_offset;
   uint16_t nr;

   /** Is the value used only in a single basic block? */
   bool used_in_single_block;

   uint16_t first_use_ip;
   uint16_t last_use_ip;
};

/** The working set of information about immediates. */
struct table {
   struct value *values;
   int size;
   int num_values;

   struct imm *imm;
   int len;

   struct fs_inst_box *boxes;
   unsigned num_boxes;
   unsigned size_boxes;
};

static struct value *
new_value(struct table *table, void *mem_ctx)
{
   if (table->num_values == table->size) {
      table->size *= 2;
      table->values = reralloc(mem_ctx, table->values, struct value, table->size);
   }
   return &table->values[table->num_values++];
}

/**
 * Store an instruction with some other data in a table.
 *
 * \returns the index into the dynamic array of boxes for the instruction.
 */
static unsigned
box_instruction(struct table *table, void *mem_ctx, fs_inst *inst,
                unsigned ip, bblock_t *block)
{
   /* It is common for box_instruction to be called consecutively for each
    * source of an instruction. As a result, the most common case for finding
    * an instruction in the table is when that instruction was the last one
    * added. Search the list back to front.
    */
   for (unsigned i = table->num_boxes; i > 0; /* empty */) {
      i--;

      if (table->boxes[i].inst == inst)
         return i;
   }

   if (table->num_boxes == table->size_boxes) {
      table->size_boxes *= 2;
      table->boxes = reralloc(mem_ctx, table->boxes, fs_inst_box,
                              table->size_boxes);
   }

   assert(table->num_boxes < table->size_boxes);

   const unsigned idx = table->num_boxes++;
   fs_inst_box *ib = &table->boxes[idx];

   ib->inst = inst;
   ib->block = block;
   ib->ip = ip;

   return idx;
}

/**
 * Comparator used for sorting an array of imm structures.
 *
 * We sort by basic block number, then last use IP, then first use IP (least
 * to greatest). This sorting causes immediates live in the same area to be
 * allocated to the same register in the hopes that all values will be dead
 * about the same time and the register can be reused.
 */
static int
compare(const void *_a, const void *_b)
{
   const struct imm *a = (const struct imm *)_a,
                    *b = (const struct imm *)_b;

   int block_diff = a->block->num - b->block->num;
   if (block_diff)
      return block_diff;

   int end_diff = a->last_use_ip - b->last_use_ip;
   if (end_diff)
      return end_diff;

   return a->first_use_ip - b->first_use_ip;
}
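
/* For example, imms with (block, first_use_ip, last_use_ip) of (2, 10, 40),
 * (2, 5, 30), and (1, 50, 60) sort as (1, 50, 60), (2, 5, 30), (2, 10, 40):
 * block number first, then last-use IP, then first-use IP.
 */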

static struct brw_reg
build_imm_reg_for_copy(struct imm *imm)
{
   switch (imm->size) {
   case 8:
      return brw_imm_d(imm->d64);
   case 4:
      return brw_imm_d(imm->d);
   case 2:
      return brw_imm_w(imm->w);
   default:
      unreachable("not implemented");
   }
}

static inline uint32_t
get_alignment_for_imm(const struct imm *imm)
{
   if (imm->is_half_float)
      return 4; /* At least MAD seems to require this */
   else
      return imm->size;
}

static bool
representable_as_hf(float f, uint16_t *hf)
{
   union fi u;
   uint16_t h = _mesa_float_to_half(f);
   u.f = _mesa_half_to_float(h);

   if (u.f == f) {
      *hf = h;
      return true;
   }

   return false;
}
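
/* For example, 0.5f survives the round trip through _mesa_float_to_half()
 * exactly, so it can be promoted to an HF immediate, while 0.1f (0x3dcccccd)
 * does not round-trip and is rejected.
 */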

static bool
representable_as_w(int d, int16_t *w)
{
   int res = ((d & 0xffff8000) + 0x8000) & 0xffff7fff;
   if (!res) {
      *w = d;
      return true;
   }

   return false;
}
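
/* The expression above is a branch-free sign-extension test: res is zero
 * exactly when bits [31:15] of d are all zeros or all ones, i.e. when d
 * already fits in a signed 16-bit word. For example, d = -5 (0xfffffffb)
 * passes, while d = 40000 (0x00009c40) does not.
 */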

static bool
representable_as_uw(unsigned ud, uint16_t *uw)
{
   if (!(ud & 0xffff0000)) {
      *uw = ud;
      return true;
   }

   return false;
}

static bool
supports_src_as_imm(const struct intel_device_info *devinfo, const fs_inst *inst)
{
   if (devinfo->ver < 12)
      return false;

   switch (inst->opcode) {
   case BRW_OPCODE_ADD3:
      /* ADD3 only exists on Gfx12.5+. */
      return true;

   case BRW_OPCODE_CSEL:
      /* While MAD can mix F and HF sources on some platforms, CSEL cannot. */
      return inst->src[0].type != BRW_TYPE_F;

   case BRW_OPCODE_MAD:
      /* Integer types can always mix sizes. Floating point types can mix
       * sizes on Gfx12. On Gfx12.5, floating point sources must all be HF or
       * all be F.
       */
      return devinfo->verx10 < 125 || inst->src[0].type != BRW_TYPE_F;

   default:
      return false;
   }
}

static bool
can_promote_src_as_imm(const struct intel_device_info *devinfo, fs_inst *inst,
                       unsigned src_idx)
{
   bool can_promote = false;

   /* Experiment shows that we can only support src0 as immediate for MAD on
    * Gfx12. ADD3 can use src0 or src2 in Gfx12.5, but constant propagation
    * only propagates into src0. It's possible that src2 works for W or UW MAD
    * on Gfx12.5.
    */
   if (src_idx != 0)
      return false;

   if (!supports_src_as_imm(devinfo, inst))
      return false;

   /* TODO - Fix the codepath below to use a bfloat16 immediate on XeHP,
    * since HF/F mixed mode has been removed from the hardware.
    */
   switch (inst->src[src_idx].type) {
   case BRW_TYPE_F: {
      uint16_t hf;
      if (representable_as_hf(inst->src[src_idx].f, &hf)) {
         inst->src[src_idx] = retype(brw_imm_uw(hf), BRW_TYPE_HF);
         can_promote = true;
      }
      break;
   }
   case BRW_TYPE_D: {
      int16_t w;
      if (representable_as_w(inst->src[src_idx].d, &w)) {
         inst->src[src_idx] = brw_imm_w(w);
         can_promote = true;
      }
      break;
   }
   case BRW_TYPE_UD: {
      uint16_t uw;
      if (representable_as_uw(inst->src[src_idx].ud, &uw)) {
         inst->src[src_idx] = brw_imm_uw(uw);
         can_promote = true;
      }
      break;
   }
   case BRW_TYPE_W:
   case BRW_TYPE_UW:
   case BRW_TYPE_HF:
      can_promote = true;
      break;
   default:
      break;
   }

   return can_promote;
}

static void
add_candidate_immediate(struct table *table, fs_inst *inst, unsigned ip,
                        unsigned i,
                        bool allow_one_constant,
                        bblock_t *block,
                        const struct intel_device_info *devinfo,
                        void *const_ctx)
{
   struct value *v = new_value(table, const_ctx);

   unsigned box_idx = box_instruction(table, const_ctx, inst, ip, block);

   v->value.u64 = inst->src[i].d64;
   v->bit_size = brw_type_size_bits(inst->src[i].type);
   v->instr_index = box_idx;
   v->src = i;
   v->allow_one_constant = allow_one_constant;

   /* Right-shift instructions are special. They can have source modifiers,
    * but changing the type can change the semantic of the instruction. Only
    * allow negations on a right shift if the source type is already signed.
    */
   v->no_negations = !inst->can_do_source_mods(devinfo) ||
                     ((inst->opcode == BRW_OPCODE_SHR ||
                       inst->opcode == BRW_OPCODE_ASR) &&
                      brw_type_is_uint(inst->src[i].type));

   switch (inst->src[i].type) {
   case BRW_TYPE_DF:
   case BRW_TYPE_F:
   case BRW_TYPE_HF:
      v->type = float_only;
      break;

   case BRW_TYPE_UQ:
   case BRW_TYPE_Q:
   case BRW_TYPE_UD:
   case BRW_TYPE_D:
   case BRW_TYPE_UW:
   case BRW_TYPE_W:
      v->type = integer_only;
      break;

   case BRW_TYPE_VF:
   case BRW_TYPE_UV:
   case BRW_TYPE_V:
   case BRW_TYPE_UB:
   case BRW_TYPE_B:
   default:
      unreachable("not reached");
   }

   /* It is safe to change the type of the operands of a select instruction
    * that has no conditional modifier, no source modifiers, and no saturate
    * modifier.
    */
   if (inst->opcode == BRW_OPCODE_SEL &&
       inst->conditional_mod == BRW_CONDITIONAL_NONE &&
       !inst->src[0].negate && !inst->src[0].abs &&
       !inst->src[1].negate && !inst->src[1].abs &&
       !inst->saturate) {
      v->type = either_type;
   }
}

struct register_allocation {
   /** VGRF for storing values. */
   unsigned nr;

   /**
    * Mask of currently available slots in this register.
    *
    * Each register has sixteen 16-bit slots. Allocations require 1, 2, or 4
    * slots for word, double-word, or quad-word values, respectively.
    */
   uint16_t avail;
};

static brw_reg
allocate_slots(struct register_allocation *regs, unsigned num_regs,
               unsigned bytes, unsigned align_bytes,
               brw::simple_allocator &alloc)
{
   assert(bytes == 2 || bytes == 4 || bytes == 8);
   assert(align_bytes == 2 || align_bytes == 4 || align_bytes == 8);

   const unsigned words = bytes / 2;
   const unsigned align_words = align_bytes / 2;
   const uint16_t mask = (1U << words) - 1;

   for (unsigned i = 0; i < num_regs; i++) {
      for (unsigned j = 0; j <= (16 - words); j += align_words) {
         const uint16_t x = regs[i].avail >> j;

         if ((x & mask) == mask) {
            if (regs[i].nr == UINT_MAX)
               regs[i].nr = alloc.allocate(1);

            regs[i].avail &= ~(mask << j);

            brw_reg reg = brw_vgrf(regs[i].nr, BRW_TYPE_F);
            reg.offset = j * 2;

            return reg;
         }
      }
   }

   unreachable("No free slots found.");
}
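
/* An allocation sketch: avail starts at 0xffff (sixteen free 16-bit slots).
 * A 4-byte value needs two adjacent slots (mask 0x3) at an even slot offset.
 * If slots 0-1 are already taken (avail == 0xfffc), the scan above finds
 * j == 2, clears bits 2-3 so avail becomes 0xfff0, and returns a register
 * with byte offset j * 2 == 4.
 */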

static void
deallocate_slots(struct register_allocation *regs, unsigned num_regs,
                 unsigned reg_nr, unsigned subreg_offset, unsigned bytes)
{
   assert(bytes == 2 || bytes == 4 || bytes == 8);
   assert(subreg_offset % 2 == 0);
   assert(subreg_offset + bytes <= 32);

   const unsigned words = bytes / 2;
   const unsigned offset = subreg_offset / 2;
   const uint16_t mask = ((1U << words) - 1) << offset;

   for (unsigned i = 0; i < num_regs; i++) {
      if (regs[i].nr == reg_nr) {
         regs[i].avail |= mask;
         return;
      }
   }

   unreachable("No such register found.");
}

static void
parcel_out_registers(struct imm *imm, unsigned len, const bblock_t *cur_block,
                     struct register_allocation *regs, unsigned num_regs,
                     brw::simple_allocator &alloc, unsigned ver)
{
   /* Each basic block has two distinct sets of constants. There is the set of
    * constants that only have uses in that block, and there is the set of
    * constants that have uses after that block.
    *
    * Allocation proceeds in three passes.
    *
    * 1. Allocate space for the values that are used outside this block.
    *
    * 2. Allocate space for the values that are used only in this block.
    *
    * 3. Deallocate the space for the values that are used only in this block.
    */

   for (unsigned pass = 0; pass < 2; pass++) {
      const bool used_in_single_block = pass != 0;

      for (unsigned i = 0; i < len; i++) {
         if (imm[i].block == cur_block &&
             imm[i].used_in_single_block == used_in_single_block) {
            /* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions:
             *
             *   "In Align16 mode, the channel selects and channel enables apply
             *    to a pair of half-floats, because these parameters are defined
             *    for DWord elements ONLY. This is applicable when both source
             *    and destination are half-floats."
             *
             * This means that Align16 instructions that use promoted HF
             * immediates and use a <0,1,0>:HF region would read 2 HF slots
             * instead of replicating the single one we want. To avoid this, we
             * always populate both HF slots within a DWord with the constant.
             */
            const unsigned width = ver == 8 && imm[i].is_half_float ? 2 : 1;

            const brw_reg reg = allocate_slots(regs, num_regs,
                                               imm[i].size * width,
                                               get_alignment_for_imm(&imm[i]),
                                               alloc);

            imm[i].nr = reg.nr;
            imm[i].subreg_offset = reg.offset;
         }
      }
   }

   for (unsigned i = 0; i < len; i++) {
      if (imm[i].block == cur_block && imm[i].used_in_single_block) {
         const unsigned width = ver == 8 && imm[i].is_half_float ? 2 : 1;

         deallocate_slots(regs, num_regs, imm[i].nr, imm[i].subreg_offset,
                          imm[i].size * width);
      }
   }
}

bool
brw_fs_opt_combine_constants(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   void *const_ctx = ralloc_context(NULL);

   struct table table;

   /* For each of the dynamic arrays in the table, allocate about a page of
    * memory. On LP64 systems, this gives 126 value objects and 169 fs_inst_box
    * objects. Even larger shaders that have been observed rarely need more
    * than 20 or 30 values. Most smaller shaders, which is most shaders, need
    * at most a couple dozen fs_inst_box.
    */
   table.size = (4096 - (5 * sizeof(void *))) / sizeof(struct value);
   table.num_values = 0;
   table.values = ralloc_array(const_ctx, struct value, table.size);

   table.size_boxes = (4096 - (5 * sizeof(void *))) / sizeof(struct fs_inst_box);
   table.num_boxes = 0;
   table.boxes = ralloc_array(const_ctx, fs_inst_box, table.size_boxes);

   const brw::idom_tree &idom = s.idom_analysis.require();
   unsigned ip = -1;

   /* Make a pass through all instructions and mark each constant that is used
    * in instruction sources that cannot legally be immediate values.
    */
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      ip++;

      switch (inst->opcode) {
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
      case SHADER_OPCODE_POW:
         if (inst->src[0].file == IMM) {
            add_candidate_immediate(&table, inst, ip, 0, false, block,
                                    devinfo, const_ctx);
         }
         break;

      /* FINISHME: CSEL handling could be better. For some cases, src[0] and
       * src[1] can be commutative (e.g., any integer comparison). In those
       * cases when src[1] is IMM, the sources could be exchanged. In
       * addition, when both sources are IMM that could be represented as
       * 16-bits, it would be better to add both sources with
       * allow_one_constant=true as is done for SEL.
       */
      case BRW_OPCODE_ADD3:
      case BRW_OPCODE_CSEL:
      case BRW_OPCODE_MAD: {
         for (int i = 0; i < inst->sources; i++) {
            if (inst->src[i].file != IMM)
               continue;

            if (can_promote_src_as_imm(devinfo, inst, i))
               continue;

            add_candidate_immediate(&table, inst, ip, i, false, block,
                                    devinfo, const_ctx);
         }

         break;
      }

      case BRW_OPCODE_BFE:
      case BRW_OPCODE_BFI2:
      case BRW_OPCODE_LRP:
         for (int i = 0; i < inst->sources; i++) {
            if (inst->src[i].file != IMM)
               continue;

            add_candidate_immediate(&table, inst, ip, i, false, block,
                                    devinfo, const_ctx);
         }

         break;

      case BRW_OPCODE_SEL:
         if (inst->src[0].file == IMM) {
            /* It is possible to have src0 be immediate but src1 not be
             * immediate for the non-commutative conditional modifiers (e.g.,
             * G).
             */
            if (inst->conditional_mod == BRW_CONDITIONAL_NONE ||
                /* Only GE and L are commutative. */
                inst->conditional_mod == BRW_CONDITIONAL_GE ||
                inst->conditional_mod == BRW_CONDITIONAL_L) {
               assert(inst->src[1].file == IMM);

               add_candidate_immediate(&table, inst, ip, 0, true, block,
                                       devinfo, const_ctx);
               add_candidate_immediate(&table, inst, ip, 1, true, block,
                                       devinfo, const_ctx);
            } else {
               add_candidate_immediate(&table, inst, ip, 0, false, block,
                                       devinfo, const_ctx);
            }
         }
         break;

      case BRW_OPCODE_ASR:
      case BRW_OPCODE_BFI1:
      case BRW_OPCODE_MUL:
      case BRW_OPCODE_ROL:
      case BRW_OPCODE_ROR:
      case BRW_OPCODE_SHL:
      case BRW_OPCODE_SHR:
         if (inst->src[0].file == IMM) {
            add_candidate_immediate(&table, inst, ip, 0, false, block,
                                    devinfo, const_ctx);
         }
         break;

      default:
         break;
      }
   }

   if (table.num_values == 0) {
      ralloc_free(const_ctx);
      return false;
   }

   combine_constants_result *result =
      brw_combine_constants(table.values, table.num_values);

   table.imm = ralloc_array(const_ctx, struct imm, result->num_values_to_emit);
   table.len = 0;

   for (unsigned i = 0; i < result->num_values_to_emit; i++) {
      struct imm *imm = &table.imm[table.len];

      imm->block = NULL;
      imm->inst = NULL;
      imm->d64 = result->values_to_emit[i].value.u64;
      imm->size = result->values_to_emit[i].bit_size / 8;

      imm->is_half_float = false;

      imm->first_use_ip = UINT16_MAX;
      imm->last_use_ip = 0;

      imm->uses = new(const_ctx) exec_list;

      const unsigned first_user = result->values_to_emit[i].first_user;
      const unsigned last_user = first_user +
         result->values_to_emit[i].num_users;

      for (unsigned j = first_user; j < last_user; j++) {
         const unsigned idx = table.values[result->user_map[j].index].instr_index;
         fs_inst_box *const ib = &table.boxes[idx];

         const unsigned src = table.values[result->user_map[j].index].src;

         imm->uses->push_tail(link(const_ctx, ib->inst, src,
                                   result->user_map[j].negate,
                                   result->user_map[j].type));

         if (imm->block == NULL) {
            /* Block should only be NULL on the first pass. On the first
             * pass, inst should also be NULL.
             */
            assert(imm->inst == NULL);

            imm->inst = ib->inst;
            imm->block = ib->block;
            imm->first_use_ip = ib->ip;
            imm->last_use_ip = ib->ip;
            imm->used_in_single_block = true;
         } else {
            bblock_t *intersection = idom.intersect(ib->block,
                                                    imm->block);

            if (ib->block != imm->block)
               imm->used_in_single_block = false;

            if (imm->first_use_ip > ib->ip) {
               imm->first_use_ip = ib->ip;

               /* If the first-use instruction is to be tracked, block must be
                * the block that contains it. The old block was read in the
                * idom.intersect call above, so it is safe to overwrite it
                * here.
                */
               imm->inst = ib->inst;
               imm->block = ib->block;
            }

            if (imm->last_use_ip < ib->ip)
               imm->last_use_ip = ib->ip;

            /* The common dominator is not the block that contains the
             * first-use instruction, so don't track that instruction. The
             * load instruction will be added in the common dominator block
             * instead of before the first-use instruction.
             */
            if (intersection != imm->block)
               imm->inst = NULL;

            imm->block = intersection;
         }

         if (ib->inst->src[src].type == BRW_TYPE_HF)
            imm->is_half_float = true;
      }

      table.len++;
   }

   delete result;

   if (table.len == 0) {
      ralloc_free(const_ctx);
      return false;
   }

   if (s.cfg->num_blocks != 1)
      qsort(table.imm, table.len, sizeof(struct imm), compare);

   struct register_allocation *regs =
      (struct register_allocation *) calloc(table.len, sizeof(regs[0]));

   for (int i = 0; i < table.len; i++) {
      regs[i].nr = UINT_MAX;
      regs[i].avail = 0xffff;
   }

   foreach_block(block, s.cfg) {
      parcel_out_registers(table.imm, table.len, block, regs, table.len,
                           s.alloc, devinfo->ver);
   }

   free(regs);

   bool rebuild_cfg = false;

   /* Insert MOVs to load the constant values into GRFs. */
   for (int i = 0; i < table.len; i++) {
      struct imm *imm = &table.imm[i];

      /* Insert it either before the instruction that generated the immediate
       * or after the last non-control flow instruction of the common ancestor.
       */
      exec_node *n;
      bblock_t *insert_block;
      if (imm->inst != nullptr) {
         n = imm->inst;
         insert_block = imm->block;
      } else {
         if (imm->block->start()->opcode == BRW_OPCODE_DO) {
            /* DO blocks are weird. They can contain only the single DO
             * instruction. As a result, MOV instructions cannot be added to
             * the DO block.
             */
            bblock_t *next_block = imm->block->next();
            if (next_block->starts_with_control_flow()) {
               /* This is the difficult case. This occurs for code like
                *
                *    do {
                *       do {
                *          ...
                *       } while (...);
                *    } while (...);
                *
                * when the MOV instructions need to be inserted between the
                * two DO instructions.
                *
                * To properly handle this scenario, a new block would need to
                * be inserted. Doing so would require modifying arbitrarily many
                * CONTINUE, BREAK, and WHILE instructions to point to the new
                * block.
                *
                * It is unlikely that this would ever be correct. Instead,
                * insert the MOV instructions in the known wrong place and
                * rebuild the CFG at the end of the pass.
                */
               insert_block = imm->block;
               n = insert_block->last_non_control_flow_inst()->next;

               rebuild_cfg = true;
            } else {
               insert_block = next_block;
               n = insert_block->start();
            }
         } else {
            insert_block = imm->block;
            n = insert_block->last_non_control_flow_inst()->next;
         }
      }

      /* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions:
       *
       *   "In Align16 mode, the channel selects and channel enables apply to a
       *    pair of half-floats, because these parameters are defined for DWord
       *    elements ONLY. This is applicable when both source and destination
       *    are half-floats."
       *
       * This means that Align16 instructions that use promoted HF immediates
       * and use a <0,1,0>:HF region would read 2 HF slots instead of
       * replicating the single one we want. To avoid this, we always populate
       * both HF slots within a DWord with the constant.
       */
      const uint32_t width = 1;
      const fs_builder ibld = fs_builder(&s, width).at(insert_block, n).exec_all();

      brw_reg reg = brw_vgrf(imm->nr, BRW_TYPE_F);
      reg.offset = imm->subreg_offset;
      reg.stride = 0;

      /* Put the immediate in an offset aligned to its size. Some instructions
       * seem to have additional alignment requirements, so account for that
       * too.
       */
      assert(reg.offset == ALIGN(reg.offset, get_alignment_for_imm(imm)));

      struct brw_reg imm_reg = build_imm_reg_for_copy(imm);

      /* Ensure we have enough space in the register to copy the immediate */
      assert(reg.offset + brw_type_size_bytes(imm_reg.type) * width <= REG_SIZE);

      ibld.MOV(retype(reg, imm_reg.type), imm_reg);
   }

   s.shader_stats.promoted_constants = table.len;

   /* Rewrite the immediate sources to refer to the new GRFs. */
   for (int i = 0; i < table.len; i++) {
      foreach_list_typed(reg_link, link, link, table.imm[i].uses) {
         brw_reg *reg = &link->inst->src[link->src];

         if (link->inst->opcode == BRW_OPCODE_SEL) {
            if (link->type == either_type) {
               /* Do not change the register type. */
            } else if (link->type == integer_only) {
               reg->type = brw_int_type(brw_type_size_bytes(reg->type), true);
            } else {
               assert(link->type == float_only);

               switch (brw_type_size_bytes(reg->type)) {
               case 2:
                  reg->type = BRW_TYPE_HF;
                  break;
               case 4:
                  reg->type = BRW_TYPE_F;
                  break;
               case 8:
                  reg->type = BRW_TYPE_DF;
                  break;
               default:
                  unreachable("Bad type size");
               }
            }
         } else if ((link->inst->opcode == BRW_OPCODE_SHL ||
                     link->inst->opcode == BRW_OPCODE_ASR) &&
                    link->negate) {
            reg->type = brw_int_type(brw_type_size_bytes(reg->type), true);
         }

#if MESA_DEBUG
         switch (reg->type) {
         case BRW_TYPE_DF:
            assert((isnan(reg->df) && isnan(table.imm[i].df)) ||
                   (fabs(reg->df) == fabs(table.imm[i].df)));
            break;
         case BRW_TYPE_F:
            assert((isnan(reg->f) && isnan(table.imm[i].f)) ||
                   (fabsf(reg->f) == fabsf(table.imm[i].f)));
            break;
         case BRW_TYPE_HF:
            assert((isnan(_mesa_half_to_float(reg->d & 0xffffu)) &&
                    isnan(_mesa_half_to_float(table.imm[i].w))) ||
                   (fabsf(_mesa_half_to_float(reg->d & 0xffffu)) ==
                    fabsf(_mesa_half_to_float(table.imm[i].w))));
            break;
         case BRW_TYPE_Q:
            assert(abs(reg->d64) == abs(table.imm[i].d64));
            break;
         case BRW_TYPE_UQ:
            assert(!link->negate);
            assert(reg->d64 == table.imm[i].d64);
            break;
         case BRW_TYPE_D:
            assert(abs(reg->d) == abs(table.imm[i].d));
            break;
         case BRW_TYPE_UD:
            assert(!link->negate);
            assert(reg->d == table.imm[i].d);
            break;
         case BRW_TYPE_W:
            assert(abs((int16_t) (reg->d & 0xffff)) == table.imm[i].w);
            break;
         case BRW_TYPE_UW:
            assert(!link->negate);
            assert((reg->ud & 0xffffu) == (uint16_t) table.imm[i].w);
            break;
         default:
            break;
         }
#endif

         assert(link->inst->can_do_source_mods(devinfo) || !link->negate);

         reg->file = VGRF;
         reg->offset = table.imm[i].subreg_offset;
         reg->stride = 0;
         reg->negate = link->negate;
         reg->nr = table.imm[i].nr;
      }
   }

   /* Fix up any SEL instructions that still have src0 as an immediate. Also
    * fix up the types of any SEL instruction that has a negation on one of
    * its sources. Adding the negation may have changed the type of that
    * source, so the other source (and destination) must be changed to match.
    */
   for (unsigned i = 0; i < table.num_boxes; i++) {
      fs_inst *inst = table.boxes[i].inst;

      if (inst->opcode != BRW_OPCODE_SEL)
         continue;

      /* If both sources have negation, the types had better be the same! */
      assert(!inst->src[0].negate || !inst->src[1].negate ||
             inst->src[0].type == inst->src[1].type);

      /* If either source has a negation, force the type of the other source
       * and the type of the result to be the same.
       */
      if (inst->src[0].negate) {
         inst->src[1].type = inst->src[0].type;
         inst->dst.type = inst->src[0].type;
      }

      if (inst->src[1].negate) {
         inst->src[0].type = inst->src[1].type;
         inst->dst.type = inst->src[1].type;
      }

      if (inst->src[0].file != IMM)
         continue;

      assert(inst->src[1].file != IMM);
      assert(inst->conditional_mod == BRW_CONDITIONAL_NONE ||
             inst->conditional_mod == BRW_CONDITIONAL_GE ||
             inst->conditional_mod == BRW_CONDITIONAL_L);

      brw_reg temp = inst->src[0];
      inst->src[0] = inst->src[1];
      inst->src[1] = temp;

      /* If this was predicated, flipping operands means we also need to flip
       * the predicate.
       */
      if (inst->conditional_mod == BRW_CONDITIONAL_NONE)
         inst->predicate_inverse = !inst->predicate_inverse;
   }

   if (debug) {
      for (int i = 0; i < table.len; i++) {
         struct imm *imm = &table.imm[i];

         fprintf(stderr,
                 "0x%016" PRIx64 " - block %3d, reg %3d sub %2d, "
                 "IP: %4d to %4d, length %4d\n",
                 (uint64_t)(imm->d & BITFIELD64_MASK(imm->size * 8)),
                 imm->block->num,
                 imm->nr,
                 imm->subreg_offset,
                 imm->first_use_ip,
                 imm->last_use_ip,
                 imm->last_use_ip - imm->first_use_ip);
      }
   }

   if (rebuild_cfg) {
      /* When the CFG is initially built, the instructions are removed from
       * the list of instructions stored in fs_visitor -- the same exec_node
       * is used for membership in that list and in a block list. So we need
       * to pull them back before rebuilding the CFG.
       */
      assert(exec_list_length(&s.instructions) == 0);
      foreach_block(block, s.cfg) {
         exec_list_append(&s.instructions, &block->instructions);
      }

      delete s.cfg;
      s.cfg = NULL;
      brw_calculate_cfg(s);
   }

   ralloc_free(const_ctx);

   s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES |
                         (rebuild_cfg ? DEPENDENCY_BLOCKS : DEPENDENCY_NOTHING));

   return true;
}