1 /*
2 * Copyright (C) 2021 Collabora, Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #include "bi_builder.h"
25 #include "va_compiler.h"
26 #include "valhall.h"
27 #include "valhall_enums.h"
28
29 /* This file contains the final passes of the compiler. It runs after
30 * scheduling and RA, when the IR is finalized, so we emit the instructions as
31 * actual bits on the wire (and fix up branch offsets)
32 */
33
34 /*
35 * Unreachable for encoding failures, when hitting an invalid instruction.
36 * Prints the (first) failing instruction to aid debugging.
37 */
38 NORETURN static void PRINTFLIKE(2, 3)
39 invalid_instruction(const bi_instr *I, const char *cause, ...)
40 {
41 fputs("\nInvalid ", stderr);
42
43 va_list ap;
44 va_start(ap, cause);
45 vfprintf(stderr, cause, ap);
46 va_end(ap);
47
48 fputs(":\n\t", stderr);
49 bi_print_instr(I, stderr);
50 fprintf(stderr, "\n");
51
52 unreachable("Invalid instruction");
53 }
54
55 /*
56 * Like assert, but prints the offending instruction if the assertion fails,
57 * to aid debugging of invalid inputs to the packing module.
58 */
59 #define pack_assert(I, cond) \
60 if (!(cond)) \
61 invalid_instruction(I, "invariant " #cond);
62
63 /*
64 * Validate that two adjacent 32-bit sources form an aligned 64-bit register
65 * pair. This is a compiler invariant, required on Valhall but not on Bifrost.
66 */
67 static void
68 va_validate_register_pair(const bi_instr *I, unsigned s)
69 {
70 ASSERTED bi_index lo = I->src[s], hi = I->src[s + 1];
71
72 pack_assert(I, lo.type == hi.type);
73
74 if (lo.type == BI_INDEX_REGISTER) {
75 pack_assert(I, hi.value & 1);
76 pack_assert(I, hi.value == lo.value + 1);
77 } else if (lo.type == BI_INDEX_FAU && lo.value & BIR_FAU_IMMEDIATE) {
78 /* Small constants are zero-extended, so the top word encodes zero */
79 pack_assert(I, hi.value == (BIR_FAU_IMMEDIATE | 0));
80 } else {
81 pack_assert(I, hi.offset & 1);
82 pack_assert(I, hi.offset == lo.offset + 1);
83 }
84 }
85
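/* Registers are encoded directly by index; Valhall exposes 64 registers, so
 * the index must fit in 6 bits.
 */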
86 static unsigned
87 va_pack_reg(const bi_instr *I, bi_index idx)
88 {
89 pack_assert(I, idx.type == BI_INDEX_REGISTER);
90 pack_assert(I, idx.value < 64);
91
92 return idx.value;
93 }
94
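/* Map special FAU slots to their Valhall page/value encodings. The blend
 * descriptors occupy eight consecutive slots on page 0.
 */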
95 static unsigned
96 va_pack_fau_special(const bi_instr *I, enum bir_fau fau)
97 {
98 switch (fau) {
99 case BIR_FAU_ATEST_PARAM:
100 return VA_FAU_SPECIAL_PAGE_0_ATEST_DATUM;
101 case BIR_FAU_TLS_PTR:
102 return VA_FAU_SPECIAL_PAGE_1_THREAD_LOCAL_POINTER;
103 case BIR_FAU_WLS_PTR:
104 return VA_FAU_SPECIAL_PAGE_1_WORKGROUP_LOCAL_POINTER;
105 case BIR_FAU_LANE_ID:
106 return VA_FAU_SPECIAL_PAGE_3_LANE_ID;
107 case BIR_FAU_PROGRAM_COUNTER:
108 return VA_FAU_SPECIAL_PAGE_3_PROGRAM_COUNTER;
109 case BIR_FAU_SAMPLE_POS_ARRAY:
110 return VA_FAU_SPECIAL_PAGE_0_SAMPLE;
111
112 case BIR_FAU_BLEND_0 ... (BIR_FAU_BLEND_0 + 7):
113 return VA_FAU_SPECIAL_PAGE_0_BLEND_DESCRIPTOR_0 + (fau - BIR_FAU_BLEND_0);
114
115 default:
116 invalid_instruction(I, "FAU");
117 }
118 }
119
120 /*
121 * Encode a 64-bit FAU source. The offset is ignored, so this function can be
122 * used to encode a 32-bit FAU source by or'ing in the appropriate offset.
123 */
124 static unsigned
125 va_pack_fau_64(const bi_instr *I, bi_index idx)
126 {
127 pack_assert(I, idx.type == BI_INDEX_FAU);
128
129 unsigned val = (idx.value & BITFIELD_MASK(5));
130
131 if (idx.value & BIR_FAU_IMMEDIATE)
132 return (0x3 << 6) | (val << 1);
133 else if (idx.value & BIR_FAU_UNIFORM)
134 return (0x2 << 6) | (val << 1);
135 else
136 return (0x7 << 5) | (va_pack_fau_special(I, idx.value) << 1);
137 }
138
139 static unsigned
140 va_pack_src(const bi_instr *I, unsigned s)
141 {
142 bi_index idx = I->src[s];
143
144 if (idx.type == BI_INDEX_REGISTER) {
145 unsigned value = va_pack_reg(I, idx);
146 if (idx.discard)
147 value |= (1 << 6);
148 return value;
149 } else if (idx.type == BI_INDEX_FAU) {
150 pack_assert(I, idx.offset <= 1);
151 return va_pack_fau_64(I, idx) | idx.offset;
152 }
153
154 invalid_instruction(I, "type of source %u", s);
155 }
156
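/* Derive the 2-bit write mask from the destination's halfword swizzle: H00
 * enables the low half, H11 the high half, and H01 the full 32-bit word.
 */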
157 static unsigned
158 va_pack_wrmask(const bi_instr *I)
159 {
160 switch (I->dest[0].swizzle) {
161 case BI_SWIZZLE_H00:
162 return 0x1;
163 case BI_SWIZZLE_H11:
164 return 0x2;
165 case BI_SWIZZLE_H01:
166 return 0x3;
167 default:
168 invalid_instruction(I, "write mask");
169 }
170 }
171
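/* Translate IR atomic opcodes to Valhall atomic operations. AXCHG doubles as
 * ACMPXCHG; the compare behaviour is selected later via a separate .compare
 * bit.
 */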
172 static enum va_atomic_operation
173 va_pack_atom_opc(const bi_instr *I)
174 {
175 switch (I->atom_opc) {
176 case BI_ATOM_OPC_AADD:
177 return VA_ATOMIC_OPERATION_AADD;
178 case BI_ATOM_OPC_ASMIN:
179 return VA_ATOMIC_OPERATION_ASMIN;
180 case BI_ATOM_OPC_ASMAX:
181 return VA_ATOMIC_OPERATION_ASMAX;
182 case BI_ATOM_OPC_AUMIN:
183 return VA_ATOMIC_OPERATION_AUMIN;
184 case BI_ATOM_OPC_AUMAX:
185 return VA_ATOMIC_OPERATION_AUMAX;
186 case BI_ATOM_OPC_AAND:
187 return VA_ATOMIC_OPERATION_AAND;
188 case BI_ATOM_OPC_AOR:
189 return VA_ATOMIC_OPERATION_AOR;
190 case BI_ATOM_OPC_AXOR:
191 return VA_ATOMIC_OPERATION_AXOR;
192 case BI_ATOM_OPC_ACMPXCHG:
193 case BI_ATOM_OPC_AXCHG:
194 return VA_ATOMIC_OPERATION_AXCHG;
195 default:
196 invalid_instruction(I, "atomic opcode");
197 }
198 }
199
200 static enum va_atomic_operation_with_1
201 va_pack_atom_opc_1(const bi_instr *I)
202 {
203 switch (I->atom_opc) {
204 case BI_ATOM_OPC_AINC:
205 return VA_ATOMIC_OPERATION_WITH_1_AINC;
206 case BI_ATOM_OPC_ADEC:
207 return VA_ATOMIC_OPERATION_WITH_1_ADEC;
208 case BI_ATOM_OPC_AUMAX1:
209 return VA_ATOMIC_OPERATION_WITH_1_AUMAX1;
210 case BI_ATOM_OPC_ASMAX1:
211 return VA_ATOMIC_OPERATION_WITH_1_ASMAX1;
212 case BI_ATOM_OPC_AOR1:
213 return VA_ATOMIC_OPERATION_WITH_1_AOR1;
214 default:
215 invalid_instruction(I, "atomic opcode with implicit 1");
216 }
217 }
218
219 static unsigned
220 va_pack_dest(const bi_instr *I)
221 {
222 assert(I->nr_dests);
223 return va_pack_reg(I, I->dest[0]) | (va_pack_wrmask(I) << 6);
224 }
225
226 static enum va_widen
227 va_pack_widen_f32(const bi_instr *I, enum bi_swizzle swz)
228 {
229 switch (swz) {
230 case BI_SWIZZLE_H01:
231 return VA_WIDEN_NONE;
232 case BI_SWIZZLE_H00:
233 return VA_WIDEN_H0;
234 case BI_SWIZZLE_H11:
235 return VA_WIDEN_H1;
236 default:
237 invalid_instruction(I, "widen");
238 }
239 }
240
241 static enum va_swizzles_16_bit
242 va_pack_swizzle_f16(const bi_instr *I, enum bi_swizzle swz)
243 {
244 switch (swz) {
245 case BI_SWIZZLE_H00:
246 return VA_SWIZZLES_16_BIT_H00;
247 case BI_SWIZZLE_H10:
248 return VA_SWIZZLES_16_BIT_H10;
249 case BI_SWIZZLE_H01:
250 return VA_SWIZZLES_16_BIT_H01;
251 case BI_SWIZZLE_H11:
252 return VA_SWIZZLES_16_BIT_H11;
253 default:
254 invalid_instruction(I, "16-bit swizzle");
255 }
256 }
257
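/* Widen/swizzle encodings depend on the operand size (8-, 16- or 32-bit). */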
258 static unsigned
259 va_pack_widen(const bi_instr *I, enum bi_swizzle swz, enum va_size size)
260 {
261 if (size == VA_SIZE_8) {
262 switch (swz) {
263 case BI_SWIZZLE_H01:
264 return VA_SWIZZLES_8_BIT_B0123;
265 case BI_SWIZZLE_H00:
266 return VA_SWIZZLES_8_BIT_B0101;
267 case BI_SWIZZLE_H11:
268 return VA_SWIZZLES_8_BIT_B2323;
269 case BI_SWIZZLE_B0000:
270 return VA_SWIZZLES_8_BIT_B0000;
271 case BI_SWIZZLE_B1111:
272 return VA_SWIZZLES_8_BIT_B1111;
273 case BI_SWIZZLE_B2222:
274 return VA_SWIZZLES_8_BIT_B2222;
275 case BI_SWIZZLE_B3333:
276 return VA_SWIZZLES_8_BIT_B3333;
277 default:
278 invalid_instruction(I, "8-bit widen");
279 }
280 } else if (size == VA_SIZE_16) {
281 switch (swz) {
282 case BI_SWIZZLE_H00:
283 return VA_SWIZZLES_16_BIT_H00;
284 case BI_SWIZZLE_H10:
285 return VA_SWIZZLES_16_BIT_H10;
286 case BI_SWIZZLE_H01:
287 return VA_SWIZZLES_16_BIT_H01;
288 case BI_SWIZZLE_H11:
289 return VA_SWIZZLES_16_BIT_H11;
290 case BI_SWIZZLE_B0000:
291 return VA_SWIZZLES_16_BIT_B00;
292 case BI_SWIZZLE_B1111:
293 return VA_SWIZZLES_16_BIT_B11;
294 case BI_SWIZZLE_B2222:
295 return VA_SWIZZLES_16_BIT_B22;
296 case BI_SWIZZLE_B3333:
297 return VA_SWIZZLES_16_BIT_B33;
298 default:
299 invalid_instruction(I, "16-bit widen");
300 }
301 } else if (size == VA_SIZE_32) {
302 switch (swz) {
303 case BI_SWIZZLE_H01:
304 return VA_SWIZZLES_32_BIT_NONE;
305 case BI_SWIZZLE_H00:
306 return VA_SWIZZLES_32_BIT_H0;
307 case BI_SWIZZLE_H11:
308 return VA_SWIZZLES_32_BIT_H1;
309 case BI_SWIZZLE_B0000:
310 return VA_SWIZZLES_32_BIT_B0;
311 case BI_SWIZZLE_B1111:
312 return VA_SWIZZLES_32_BIT_B1;
313 case BI_SWIZZLE_B2222:
314 return VA_SWIZZLES_32_BIT_B2;
315 case BI_SWIZZLE_B3333:
316 return VA_SWIZZLES_32_BIT_B3;
317 default:
318 invalid_instruction(I, "32-bit widen");
319 }
320 } else {
321 invalid_instruction(I, "type size for widen");
322 }
323 }
324
325 static enum va_half_swizzles_8_bit
326 va_pack_halfswizzle(const bi_instr *I, enum bi_swizzle swz)
327 {
328 switch (swz) {
329 case BI_SWIZZLE_B0000:
330 return VA_HALF_SWIZZLES_8_BIT_B00;
331 case BI_SWIZZLE_B1111:
332 return VA_HALF_SWIZZLES_8_BIT_B11;
333 case BI_SWIZZLE_B2222:
334 return VA_HALF_SWIZZLES_8_BIT_B22;
335 case BI_SWIZZLE_B3333:
336 return VA_HALF_SWIZZLES_8_BIT_B33;
337 case BI_SWIZZLE_B0011:
338 return VA_HALF_SWIZZLES_8_BIT_B01;
339 case BI_SWIZZLE_B2233:
340 return VA_HALF_SWIZZLES_8_BIT_B23;
341 case BI_SWIZZLE_B0022:
342 return VA_HALF_SWIZZLES_8_BIT_B02;
343 default:
344 invalid_instruction(I, "v2u8 swizzle");
345 }
346 }
347
348 static enum va_lanes_8_bit
349 va_pack_shift_lanes(const bi_instr *I, enum bi_swizzle swz)
350 {
351 switch (swz) {
352 case BI_SWIZZLE_H01:
353 return VA_LANES_8_BIT_B02;
354 case BI_SWIZZLE_B0000:
355 return VA_LANES_8_BIT_B00;
356 case BI_SWIZZLE_B1111:
357 return VA_LANES_8_BIT_B11;
358 case BI_SWIZZLE_B2222:
359 return VA_LANES_8_BIT_B22;
360 case BI_SWIZZLE_B3333:
361 return VA_LANES_8_BIT_B33;
362 default:
363 invalid_instruction(I, "lane shift");
364 }
365 }
366
367 static enum va_combine
368 va_pack_combine(const bi_instr *I, enum bi_swizzle swz)
369 {
370 switch (swz) {
371 case BI_SWIZZLE_H01:
372 return VA_COMBINE_NONE;
373 case BI_SWIZZLE_H00:
374 return VA_COMBINE_H0;
375 case BI_SWIZZLE_H11:
376 return VA_COMBINE_H1;
377 default:
378 invalid_instruction(I, "branch lane");
379 }
380 }
381
382 static enum va_source_format
383 va_pack_source_format(const bi_instr *I)
384 {
385 switch (I->source_format) {
386 case BI_SOURCE_FORMAT_FLAT32:
387 return VA_SOURCE_FORMAT_SRC_FLAT32;
388 case BI_SOURCE_FORMAT_FLAT16:
389 return VA_SOURCE_FORMAT_SRC_FLAT16;
390 case BI_SOURCE_FORMAT_F32:
391 return VA_SOURCE_FORMAT_SRC_F32;
392 case BI_SOURCE_FORMAT_F16:
393 return VA_SOURCE_FORMAT_SRC_F16;
394 }
395
396 invalid_instruction(I, "source format");
397 }
398
399 static uint64_t
400 va_pack_rhadd(const bi_instr *I)
401 {
402 switch (I->round) {
403 case BI_ROUND_RTN:
404 return 0; /* hadd */
405 case BI_ROUND_RTP:
406 return BITFIELD_BIT(30); /* rhadd */
407 default:
408 unreachable("Invalid round for HADD");
409 }
410 }
411
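/*
 * Pack the ALU-style portion of an instruction: opcode-specific modifier bits,
 * the (non-staging) destination, and each non-staging source along with its
 * swizzle/widen and abs/neg modifiers.
 */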
412 static uint64_t
413 va_pack_alu(const bi_instr *I)
414 {
415 struct va_opcode_info info = valhall_opcodes[I->op];
416 uint64_t hex = 0;
417
418 switch (I->op) {
419 /* Add FREXP flags */
420 case BI_OPCODE_FREXPE_F32:
421 case BI_OPCODE_FREXPE_V2F16:
422 case BI_OPCODE_FREXPM_F32:
423 case BI_OPCODE_FREXPM_V2F16:
424 if (I->sqrt)
425 hex |= 1ull << 24;
426 if (I->log)
427 hex |= 1ull << 25;
428 break;
429
430 /* Add mux type */
431 case BI_OPCODE_MUX_I32:
432 case BI_OPCODE_MUX_V2I16:
433 case BI_OPCODE_MUX_V4I8:
434 hex |= (uint64_t)I->mux << 32;
435 break;
436
437 /* Add .eq flag */
438 case BI_OPCODE_BRANCHZ_I16:
439 case BI_OPCODE_BRANCHZI:
440 pack_assert(I, I->cmpf == BI_CMPF_EQ || I->cmpf == BI_CMPF_NE);
441
442 if (I->cmpf == BI_CMPF_EQ)
443 hex |= (1ull << 36);
444
445 if (I->op == BI_OPCODE_BRANCHZI)
446 hex |= (0x1ull << 40); /* Absolute */
447 else
448 hex |= ((uint64_t)I->branch_offset & BITFIELD_MASK(27)) << 8;
449
450 break;
451
452 /* Add arithmetic flag */
453 case BI_OPCODE_RSHIFT_AND_I32:
454 case BI_OPCODE_RSHIFT_AND_V2I16:
455 case BI_OPCODE_RSHIFT_AND_V4I8:
456 case BI_OPCODE_RSHIFT_OR_I32:
457 case BI_OPCODE_RSHIFT_OR_V2I16:
458 case BI_OPCODE_RSHIFT_OR_V4I8:
459 case BI_OPCODE_RSHIFT_XOR_I32:
460 case BI_OPCODE_RSHIFT_XOR_V2I16:
461 case BI_OPCODE_RSHIFT_XOR_V4I8:
462 hex |= (uint64_t)I->arithmetic << 34;
463 break;
464
465 case BI_OPCODE_LEA_BUF_IMM:
466 /* Buffer table index */
467 hex |= 0xD << 8;
468 break;
469
470 case BI_OPCODE_LEA_ATTR_IMM:
471 hex |= ((uint64_t)I->table) << 16;
472 hex |= ((uint64_t)I->attribute_index) << 20;
473 break;
474
475 case BI_OPCODE_IADD_IMM_I32:
476 case BI_OPCODE_IADD_IMM_V2I16:
477 case BI_OPCODE_IADD_IMM_V4I8:
478 case BI_OPCODE_FADD_IMM_F32:
479 case BI_OPCODE_FADD_IMM_V2F16:
480 hex |= ((uint64_t)I->index) << 8;
481 break;
482
483 case BI_OPCODE_CLPER_I32:
484 hex |= ((uint64_t)I->inactive_result) << 22;
485 hex |= ((uint64_t)I->lane_op) << 32;
486 hex |= ((uint64_t)I->subgroup) << 36;
487 break;
488
489 case BI_OPCODE_LD_VAR:
490 case BI_OPCODE_LD_VAR_FLAT:
491 case BI_OPCODE_LD_VAR_IMM:
492 case BI_OPCODE_LD_VAR_FLAT_IMM:
493 case BI_OPCODE_LD_VAR_BUF_F16:
494 case BI_OPCODE_LD_VAR_BUF_F32:
495 case BI_OPCODE_LD_VAR_BUF_IMM_F16:
496 case BI_OPCODE_LD_VAR_BUF_IMM_F32:
497 case BI_OPCODE_LD_VAR_SPECIAL:
498 if (I->op == BI_OPCODE_LD_VAR_SPECIAL)
499 hex |= ((uint64_t)I->varying_name) << 12; /* instead of index */
500 else if (I->op == BI_OPCODE_LD_VAR_BUF_IMM_F16 ||
501 I->op == BI_OPCODE_LD_VAR_BUF_IMM_F32) {
502 hex |= ((uint64_t)I->index) << 16;
503 } else if (I->op == BI_OPCODE_LD_VAR_IMM ||
504 I->op == BI_OPCODE_LD_VAR_FLAT_IMM) {
505 hex |= ((uint64_t)I->table) << 8;
506 hex |= ((uint64_t)I->index) << 12;
507 }
508
509 hex |= ((uint64_t)va_pack_source_format(I)) << 24;
510 hex |= ((uint64_t)I->update) << 36;
511 hex |= ((uint64_t)I->sample) << 38;
512 break;
513
514 case BI_OPCODE_LD_ATTR_IMM:
515 hex |= ((uint64_t)I->table) << 16;
516 hex |= ((uint64_t)I->attribute_index) << 20;
517 break;
518
519 case BI_OPCODE_LD_TEX_IMM:
520 case BI_OPCODE_LEA_TEX_IMM:
521 hex |= ((uint64_t)I->table) << 16;
522 hex |= ((uint64_t)I->texture_index) << 20;
523 break;
524
525 case BI_OPCODE_ZS_EMIT:
526 if (I->stencil)
527 hex |= (1 << 24);
528 if (I->z)
529 hex |= (1 << 25);
530 break;
531
532 default:
533 break;
534 }
535
536 /* FMA_RSCALE.f32 special modes treated as extra opcodes */
537 if (I->op == BI_OPCODE_FMA_RSCALE_F32) {
538 pack_assert(I, I->special < 4);
539 hex |= ((uint64_t)I->special) << 48;
540 }
541
542 /* Add the normal destination or a placeholder. Staging destinations are
543 * added elsewhere, as they require special handling for control fields.
544 */
545 if (info.has_dest && info.nr_staging_dests == 0) {
546 hex |= (uint64_t)va_pack_dest(I) << 40;
547 } else if (info.nr_staging_dests == 0 && info.nr_staging_srcs == 0) {
548 pack_assert(I, I->nr_dests == 0);
549 hex |= 0xC0ull << 40; /* Placeholder */
550 }
551
552 bool swap12 = va_swap_12(I->op);
553
554 /* First src is staging if we read, skip it when packing sources */
555 unsigned src_offset = bi_opcode_props[I->op].sr_read ? 1 : 0;
556
557 for (unsigned i = 0; i < info.nr_srcs; ++i) {
558 unsigned logical_i = (swap12 && i == 1) ? 2 : (swap12 && i == 2) ? 1 : i;
559
560 struct va_src_info src_info = info.srcs[i];
561 enum va_size size = src_info.size;
562
563 bi_index src = I->src[logical_i + src_offset];
564 hex |= (uint64_t)va_pack_src(I, logical_i + src_offset) << (8 * i);
565
566 if (src_info.notted) {
567 if (src.neg)
568 hex |= (1ull << 35);
569 } else if (src_info.absneg) {
570 unsigned neg_offs = 32 + 2 + ((2 - i) * 2);
571 unsigned abs_offs = 33 + 2 + ((2 - i) * 2);
572
573 if (src.neg)
574 hex |= 1ull << neg_offs;
575 if (src.abs)
576 hex |= 1ull << abs_offs;
577 } else {
578 if (src.neg)
579 invalid_instruction(I, "negate");
580 if (src.abs)
581 invalid_instruction(I, "absolute value");
582 }
583
584 if (src_info.swizzle) {
585 unsigned offs = 24 + ((2 - i) * 2);
586 unsigned S = src.swizzle;
587 pack_assert(I, size == VA_SIZE_16 || size == VA_SIZE_32);
588
589 uint64_t v = (size == VA_SIZE_32 ? va_pack_widen_f32(I, S)
590 : va_pack_swizzle_f16(I, S));
591 hex |= v << offs;
592 } else if (src_info.widen) {
593 unsigned offs = (i == 1) ? 26 : 36;
594 hex |= (uint64_t)va_pack_widen(I, src.swizzle, src_info.size) << offs;
595 } else if (src_info.lane) {
596 unsigned offs =
597 (I->op == BI_OPCODE_MKVEC_V2I8) ? ((i == 0) ? 38 : 36) : 28;
598
599 if (src_info.size == VA_SIZE_16) {
600 hex |= (src.swizzle == BI_SWIZZLE_H11 ? 1 : 0) << offs;
601 } else if (I->op == BI_OPCODE_BRANCHZ_I16) {
602 hex |= ((uint64_t)va_pack_combine(I, src.swizzle) << 37);
603 } else {
604 pack_assert(I, src_info.size == VA_SIZE_8);
605 unsigned comp = src.swizzle - BI_SWIZZLE_B0000;
606 pack_assert(I, comp < 4);
607 hex |= (uint64_t)comp << offs;
608 }
609 } else if (src_info.lanes) {
610 pack_assert(I, src_info.size == VA_SIZE_8);
611 pack_assert(I, i == 1);
612 hex |= (uint64_t)va_pack_shift_lanes(I, src.swizzle) << 26;
613 } else if (src_info.combine) {
614 /* Treat as swizzle, subgroup ops not yet supported */
615 pack_assert(I, src_info.size == VA_SIZE_32);
616 pack_assert(I, i == 0);
617 hex |= (uint64_t)va_pack_widen_f32(I, src.swizzle) << 37;
618 } else if (src_info.halfswizzle) {
619 pack_assert(I, src_info.size == VA_SIZE_8);
620 pack_assert(I, i == 0);
621 hex |= (uint64_t)va_pack_halfswizzle(I, src.swizzle) << 36;
622 } else if (src.swizzle != BI_SWIZZLE_H01) {
623 invalid_instruction(I, "swizzle");
624 }
625 }
626
627 if (info.saturate)
628 hex |= (uint64_t)I->saturate << 30;
629 if (info.rhadd)
630 hex |= va_pack_rhadd(I);
631 if (info.clamp)
632 hex |= (uint64_t)I->clamp << 32;
633 if (info.round_mode)
634 hex |= (uint64_t)I->round << 30;
635 if (info.condition)
636 hex |= (uint64_t)I->cmpf << 32;
637 if (info.result_type)
638 hex |= (uint64_t)I->result_type << 30;
639
640 return hex;
641 }
642
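/* Byte offsets are packed as 16-bit immediates at bit 8; offsets that do not
 * survive truncation are rejected.
 */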
643 static uint64_t
644 va_pack_byte_offset(const bi_instr *I)
645 {
646 int16_t offset = I->byte_offset;
647 if (offset != I->byte_offset)
648 invalid_instruction(I, "byte offset");
649
650 uint16_t offset_as_u16 = offset;
651 return ((uint64_t)offset_as_u16) << 8;
652 }
653
654 static uint64_t
655 va_pack_byte_offset_8(const bi_instr *I)
656 {
657 uint8_t offset = I->byte_offset;
658 if (offset != I->byte_offset)
659 invalid_instruction(I, "byte offset");
660
661 return ((uint64_t)offset) << 8;
662 }
663
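/* Pack the common fields of a memory load: a size-dependent identity lane
 * extension, the unsigned-extension bit, and the address sources (with a byte
 * offset when no buffer descriptor is used).
 */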
664 static uint64_t
665 va_pack_load(const bi_instr *I, bool buffer_descriptor)
666 {
667 const uint8_t load_lane_identity[8] = {
668 VA_LOAD_LANE_8_BIT_B0, VA_LOAD_LANE_16_BIT_H0,
669 VA_LOAD_LANE_24_BIT_IDENTITY, VA_LOAD_LANE_32_BIT_W0,
670 VA_LOAD_LANE_48_BIT_IDENTITY, VA_LOAD_LANE_64_BIT_IDENTITY,
671 VA_LOAD_LANE_96_BIT_IDENTITY, VA_LOAD_LANE_128_BIT_IDENTITY,
672 };
673
674 unsigned memory_size = (valhall_opcodes[I->op].exact >> 27) & 0x7;
675 uint64_t hex = (uint64_t)load_lane_identity[memory_size] << 36;
676
677 // unsigned
678 hex |= (1ull << 39);
679
680 if (!buffer_descriptor)
681 hex |= va_pack_byte_offset(I);
682
683 hex |= (uint64_t)va_pack_src(I, 0) << 0;
684
685 if (buffer_descriptor)
686 hex |= (uint64_t)va_pack_src(I, 1) << 8;
687
688 return hex;
689 }
690
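/* Memory access hints are derived from the IR segment. */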
691 static uint64_t
692 va_pack_memory_access(const bi_instr *I)
693 {
694 switch (I->seg) {
695 case BI_SEG_TL:
696 return VA_MEMORY_ACCESS_FORCE;
697 case BI_SEG_POS:
698 return VA_MEMORY_ACCESS_ISTREAM;
699 case BI_SEG_VARY:
700 return VA_MEMORY_ACCESS_ESTREAM;
701 default:
702 return VA_MEMORY_ACCESS_NONE;
703 }
704 }
705
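/* Stores address memory with a 64-bit register pair plus a byte offset; the
 * staging registers hold the data to be stored.
 */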
706 static uint64_t
707 va_pack_store(const bi_instr *I)
708 {
709 uint64_t hex = va_pack_memory_access(I) << 24;
710
711 va_validate_register_pair(I, 1);
712 hex |= (uint64_t)va_pack_src(I, 1) << 0;
713
714 hex |= va_pack_byte_offset(I);
715
716 return hex;
717 }
718
719 static enum va_lod_mode
720 va_pack_lod_mode(const bi_instr *I)
721 {
722 switch (I->va_lod_mode) {
723 case BI_VA_LOD_MODE_ZERO_LOD:
724 return VA_LOD_MODE_ZERO;
725 case BI_VA_LOD_MODE_COMPUTED_LOD:
726 return VA_LOD_MODE_COMPUTED;
727 case BI_VA_LOD_MODE_EXPLICIT:
728 return VA_LOD_MODE_EXPLICIT;
729 case BI_VA_LOD_MODE_COMPUTED_BIAS:
730 return VA_LOD_MODE_COMPUTED_BIAS;
731 case BI_VA_LOD_MODE_GRDESC:
732 return VA_LOD_MODE_GRDESC;
733 }
734
735 invalid_instruction(I, "LOD mode");
736 }
737
738 static enum va_register_type
739 va_pack_register_type(const bi_instr *I)
740 {
741 switch (I->register_format) {
742 case BI_REGISTER_FORMAT_F16:
743 case BI_REGISTER_FORMAT_F32:
744 return VA_REGISTER_TYPE_F;
745
746 case BI_REGISTER_FORMAT_U16:
747 case BI_REGISTER_FORMAT_U32:
748 return VA_REGISTER_TYPE_U;
749
750 case BI_REGISTER_FORMAT_S16:
751 case BI_REGISTER_FORMAT_S32:
752 return VA_REGISTER_TYPE_S;
753
754 default:
755 invalid_instruction(I, "register type");
756 }
757 }
758
759 static enum va_register_format
760 va_pack_register_format(const bi_instr *I)
761 {
762 switch (I->register_format) {
763 case BI_REGISTER_FORMAT_AUTO:
764 return VA_REGISTER_FORMAT_AUTO;
765 case BI_REGISTER_FORMAT_F32:
766 return VA_REGISTER_FORMAT_F32;
767 case BI_REGISTER_FORMAT_F16:
768 return VA_REGISTER_FORMAT_F16;
769 case BI_REGISTER_FORMAT_S32:
770 return VA_REGISTER_FORMAT_S32;
771 case BI_REGISTER_FORMAT_S16:
772 return VA_REGISTER_FORMAT_S16;
773 case BI_REGISTER_FORMAT_U32:
774 return VA_REGISTER_FORMAT_U32;
775 case BI_REGISTER_FORMAT_U16:
776 return VA_REGISTER_FORMAT_U16;
777 default:
778 invalid_instruction(I, "register format");
779 }
780 }
781
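/*
 * Pack a single Valhall instruction to its 64-bit encoding. Common fields
 * (flow control, FAU page, scheduling slot, staging register counts, vector
 * size, register format) are handled here; the remaining bits are packed per
 * opcode class.
 */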
782 uint64_t
783 va_pack_instr(const bi_instr *I)
784 {
785 struct va_opcode_info info = valhall_opcodes[I->op];
786
787 uint64_t hex = info.exact | (((uint64_t)I->flow) << 59);
788 hex |= ((uint64_t)va_select_fau_page(I)) << 57;
789
790 if (info.slot)
791 hex |= ((uint64_t)I->slot << 30);
792
793 if (info.sr_count) {
794 bool read = bi_opcode_props[I->op].sr_read;
795 bi_index sr = read ? I->src[0] : I->dest[0];
796
797 unsigned count =
798 read ? bi_count_read_registers(I, 0) : bi_count_write_registers(I, 0);
799
800 hex |= ((uint64_t)count << 33);
801 hex |= (uint64_t)va_pack_reg(I, sr) << 40;
802 hex |= ((uint64_t)info.sr_control << 46);
803 }
804
805 if (info.sr_write_count) {
806 hex |= ((uint64_t)bi_count_write_registers(I, 0) - 1) << 36;
807 hex |= ((uint64_t)va_pack_reg(I, I->dest[0])) << 16;
808 }
809
810 if (info.vecsize)
811 hex |= ((uint64_t)I->vecsize << 28);
812
813 if (info.register_format)
814 hex |= ((uint64_t)va_pack_register_format(I)) << 24;
815
816 switch (I->op) {
817 case BI_OPCODE_LOAD_I8:
818 case BI_OPCODE_LOAD_I16:
819 case BI_OPCODE_LOAD_I24:
820 case BI_OPCODE_LOAD_I32:
821 case BI_OPCODE_LOAD_I48:
822 case BI_OPCODE_LOAD_I64:
823 case BI_OPCODE_LOAD_I96:
824 case BI_OPCODE_LOAD_I128:
825 hex |= va_pack_load(I, false);
826 break;
827
828 case BI_OPCODE_LD_BUFFER_I8:
829 case BI_OPCODE_LD_BUFFER_I16:
830 case BI_OPCODE_LD_BUFFER_I24:
831 case BI_OPCODE_LD_BUFFER_I32:
832 case BI_OPCODE_LD_BUFFER_I48:
833 case BI_OPCODE_LD_BUFFER_I64:
834 case BI_OPCODE_LD_BUFFER_I96:
835 case BI_OPCODE_LD_BUFFER_I128:
836 hex |= va_pack_load(I, true);
837 break;
838
839 case BI_OPCODE_STORE_I8:
840 case BI_OPCODE_STORE_I16:
841 case BI_OPCODE_STORE_I24:
842 case BI_OPCODE_STORE_I32:
843 case BI_OPCODE_STORE_I48:
844 case BI_OPCODE_STORE_I64:
845 case BI_OPCODE_STORE_I96:
846 case BI_OPCODE_STORE_I128:
847 hex |= va_pack_store(I);
848 break;
849
850 case BI_OPCODE_ATOM1_RETURN_I32:
851 /* Permit omitting the destination for plain ATOM1 */
852 if (!bi_count_write_registers(I, 0)) {
853 hex |= (0x40ull << 40); // fake read
854 }
855
856 /* 64-bit source */
857 va_validate_register_pair(I, 0);
858 hex |= (uint64_t)va_pack_src(I, 0) << 0;
859 hex |= va_pack_byte_offset_8(I);
860 hex |= ((uint64_t)va_pack_atom_opc_1(I)) << 22;
861 break;
862
863 case BI_OPCODE_ATOM_I32:
864 case BI_OPCODE_ATOM_RETURN_I32:
865 /* 64-bit source */
866 va_validate_register_pair(I, 1);
867 hex |= (uint64_t)va_pack_src(I, 1) << 0;
868 hex |= va_pack_byte_offset_8(I);
869 hex |= ((uint64_t)va_pack_atom_opc(I)) << 22;
870
871 if (I->op == BI_OPCODE_ATOM_RETURN_I32)
872 hex |= (0xc0ull << 40); // flags
873
874 if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG)
875 hex |= (1 << 26); /* .compare */
876
877 break;
878
879 case BI_OPCODE_ST_CVT:
880 /* Staging read */
881 hex |= va_pack_store(I);
882
883 /* Conversion descriptor */
884 hex |= (uint64_t)va_pack_src(I, 3) << 16;
885 break;
886
887 case BI_OPCODE_BLEND: {
888 /* Source 0 - Blend descriptor (64-bit) */
889 hex |= ((uint64_t)va_pack_src(I, 2)) << 0;
890 va_validate_register_pair(I, 2);
891
892 /* Target */
893 if (I->branch_offset & 0x7)
894 invalid_instruction(I, "unaligned branch");
895 hex |= ((I->branch_offset >> 3) << 8);
896
897 /* Source 2 - coverage mask */
898 hex |= ((uint64_t)va_pack_reg(I, I->src[1])) << 16;
899
900 /* Vector size */
901 unsigned vecsize = 4;
902 hex |= ((uint64_t)(vecsize - 1) << 28);
903
904 break;
905 }
906
907 case BI_OPCODE_TEX_SINGLE:
908 case BI_OPCODE_TEX_FETCH:
909 case BI_OPCODE_TEX_GATHER: {
910 /* Image to read from */
911 hex |= ((uint64_t)va_pack_src(I, 1)) << 0;
912
913 if (I->op == BI_OPCODE_TEX_FETCH && I->shadow)
914 invalid_instruction(I, "TEX_FETCH does not support .shadow");
915
916 if (I->wide_indices)
917 hex |= (1ull << 8);
918 if (I->array_enable)
919 hex |= (1ull << 10);
920 if (I->texel_offset)
921 hex |= (1ull << 11);
922 if (I->shadow)
923 hex |= (1ull << 12);
924 if (I->skip)
925 hex |= (1ull << 39);
926 if (!bi_is_regfmt_16(I->register_format))
927 hex |= (1ull << 46);
928
929 if (I->op == BI_OPCODE_TEX_SINGLE)
930 hex |= ((uint64_t)va_pack_lod_mode(I)) << 13;
931
932 if (I->op == BI_OPCODE_TEX_GATHER) {
933 if (I->integer_coordinates)
934 hex |= (1 << 13);
935 hex |= ((uint64_t)I->fetch_component) << 14;
936 }
937
938 hex |= (I->write_mask << 22);
939 hex |= ((uint64_t)va_pack_register_type(I)) << 26;
940 hex |= ((uint64_t)I->dimension) << 28;
941
942 break;
943 }
944
945 default:
946 if (!info.exact && I->op != BI_OPCODE_NOP)
947 invalid_instruction(I, "opcode");
948
949 hex |= va_pack_alu(I);
950 break;
951 }
952
953 return hex;
954 }
955
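/* Valhall instructions are fixed-size, so branch offsets can be computed by
 * counting the instructions between a branch and its target.
 */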
956 static unsigned
957 va_instructions_in_block(bi_block *block)
958 {
959 unsigned offset = 0;
960
961 bi_foreach_instr_in_block(block, _) {
962 offset++;
963 }
964
965 return offset;
966 }
967
968 /* Calculate branch_offset from a branch_target for a direct relative branch */
969
970 static void
971 va_lower_branch_target(bi_context *ctx, bi_block *start, bi_instr *I)
972 {
973 /* Precondition: unlowered relative branch */
974 bi_block *target = I->branch_target;
975 assert(target != NULL);
976
977 /* Signed since we might jump backwards */
978 signed offset = 0;
979
980 /* Determine if the target block is strictly greater in source order */
981 bool forwards = target->index > start->index;
982
983 if (forwards) {
984 /* We have to jump through this block */
985 bi_foreach_instr_in_block_from(start, _, I) {
986 offset++;
987 }
988
989 /* We then need to jump over every following block until the target */
990 bi_foreach_block_from(ctx, start, blk) {
991 /* End just before the target */
992 if (blk == target)
993 break;
994
995 /* Count other blocks */
996 if (blk != start)
997 offset += va_instructions_in_block(blk);
998 }
999 } else {
1000 /* Jump through the beginning of this block */
1001 bi_foreach_instr_in_block_from_rev(start, ins, I) {
1002 if (ins != I)
1003 offset--;
1004 }
1005
1006 /* Jump over preceding blocks up to and including the target to get to
1007 * the beginning of the target */
1008 bi_foreach_block_from_rev(ctx, start, blk) {
1009 if (blk == start)
1010 continue;
1011
1012 offset -= va_instructions_in_block(blk);
1013
1014 /* End just after the target */
1015 if (blk == target)
1016 break;
1017 }
1018 }
1019
1020 /* Offset is relative to the next instruction, so bias */
1021 offset--;
1022
1023 /* Update the instruction */
1024 I->branch_offset = offset;
1025 }
1026
1027 /*
1028 * Late lowering to insert blend shader calls after BLEND instructions. Required
1029 * to support blend shaders, so this pass may be omitted if it is known that
1030 * blend shaders are never used.
1031 *
1032 * This lowering runs late because it introduces control flow changes without
1033 * modifying the control flow graph. It hardcodes registers, meaning running
1034 * after RA makes sense. Finally, it hardcodes a manually sized instruction
1035 * sequence, requiring it to run after scheduling.
1036 *
1037 * As it is Valhall specific, running it as a pre-pack lowering is sensible.
1038 */
1039 static void
1040 va_lower_blend(bi_context *ctx)
1041 {
1042 /* Program counter for *next* instruction */
1043 bi_index pc = bi_fau(BIR_FAU_PROGRAM_COUNTER, false);
1044
1045 bi_foreach_instr_global_safe(ctx, I) {
1046 if (I->op != BI_OPCODE_BLEND)
1047 continue;
1048
1049 bi_builder b = bi_init_builder(ctx, bi_after_instr(I));
1050
1051 unsigned prolog_length = 2 * 8;
1052
1053 /* By ABI, r48 is the link register shared with blend shaders */
1054 assert(bi_is_equiv(I->dest[0], bi_register(48)));
1055
1056 if (I->flow == VA_FLOW_END)
1057 bi_iadd_imm_i32_to(&b, I->dest[0], va_zero_lut(), 0);
1058 else
1059 bi_iadd_imm_i32_to(&b, I->dest[0], pc, prolog_length - 8);
1060
1061 bi_branchzi(&b, va_zero_lut(), I->src[3], BI_CMPF_EQ);
1062
1063 /* For fixed function: skip the prologue, or return */
1064 if (I->flow != VA_FLOW_END)
1065 I->branch_offset = prolog_length;
1066 }
1067 }
1068
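/*
 * Packing entrypoint: validate the IR, run late lowerings, emit each
 * instruction as 64 bits, and pad the result.
 */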
1069 void
1070 bi_pack_valhall(bi_context *ctx, struct util_dynarray *emission)
1071 {
1072 unsigned orig_size = emission->size;
1073
1074 va_validate(stderr, ctx);
1075
1076 /* Late lowering */
1077 if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend)
1078 va_lower_blend(ctx);
1079
1080 bi_foreach_block(ctx, block) {
1081 bi_foreach_instr_in_block(block, I) {
1082 if (I->op == BI_OPCODE_BRANCHZ_I16)
1083 va_lower_branch_target(ctx, block, I);
1084
1085 uint64_t hex = va_pack_instr(I);
1086 util_dynarray_append(emission, uint64_t, hex);
1087 }
1088 }
1089
1090 /* Pad with zeroes, but keep empty programs empty so they may be omitted
1091 * altogether. Failing to do this would result in a program containing only
1092 * zeroes, which is invalid and will raise an encoding fault.
1093 *
1094 * Pad an extra 16 bytes (one instruction) to separate primary and secondary
1095 * shader disassembles. This is not strictly necessary, but it's a good
1096 * practice. 128 bytes is the optimal program alignment on Trym, so pad
1097 * secondary shaders up to 128 bytes. This may help the instruction cache.
1098 */
1099 if (orig_size != emission->size) {
1100 unsigned aligned = ALIGN_POT(emission->size + 16, 128);
1101 unsigned count = aligned - emission->size;
1102
1103 memset(util_dynarray_grow(emission, uint8_t, count), 0, count);
1104 }
1105 }
1106