/*
 * Copyright (C) 2022 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#pragma once

#if !defined(PAN_ARCH) || PAN_ARCH < 10
#error "cs_builder.h requires PAN_ARCH >= 10"
#endif

#include "gen_macros.h"

#include "util/bitset.h"
#include "util/list.h"
#include "util/u_dynarray.h"

/*
 * cs_builder implements a builder for CSF command streams. It manages the
 * allocation and overflow behaviour of queues and provides helpers for emitting
 * commands to run on the CSF pipe.
 *
 * Users are responsible for the CS buffer allocation and must initialize the
 * command stream with an initial buffer using cs_builder_init(). The CS can
 * be extended with new buffers allocated with cs_builder_conf::alloc_buffer()
 * if the builder runs out of memory.
 */
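
/*
 * Minimal usage sketch (not lifted from a real driver): the caller provides
 * the allocation callback, initializes the builder with a root buffer, emits
 * instructions through the helpers below, and calls cs_finish() before
 * submission. my_alloc_cs_buffer(), my_pool, submit() and the register counts
 * are hypothetical.
 *
 *    struct cs_builder b;
 *    struct cs_builder_conf conf = {
 *       .nr_registers = 96,
 *       .nr_kernel_registers = 4,
 *       .alloc_buffer = my_alloc_cs_buffer,
 *       .cookie = my_pool,
 *    };
 *
 *    cs_builder_init(&b, &conf, my_alloc_cs_buffer(my_pool));
 *    cs_move32_to(&b, cs_reg32(&b, 0), 42);
 *    cs_finish(&b);
 *
 *    if (cs_is_valid(&b))
 *       submit(my_pool, cs_root_chunk_gpu_addr(&b), cs_root_chunk_size(&b));
 */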

struct cs_buffer {
   /* CPU pointer */
   uint64_t *cpu;

   /* GPU pointer */
   uint64_t gpu;

   /* Capacity in number of 64-bit instructions */
   uint32_t capacity;
};

/**
 * This is used to check that:
 * 1. registers are not used as a source after being loaded without a
 *    WAIT(<ls_scoreboard>) in the middle
 * 2. registers are not reused (used as a destination) after they served as a
 *    STORE() source without a WAIT(<ls_scoreboard>) in the middle
 */
struct cs_load_store_tracker {
   BITSET_DECLARE(pending_loads, 256);
   BITSET_DECLARE(pending_stores, 256);
   uint8_t sb_slot;
};
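
/*
 * Sketch of the pattern the tracker enforces (the register number, addr and
 * the ls_sb_slot value are made up for the example):
 *
 *    struct cs_index counter = cs_reg32(&b, 10);
 *
 *    cs_load32_to(&b, counter, addr, 0);   // r10 now has a pending load
 *    cs_wait_slot(&b, ls_sb_slot, false);  // flush pending loads/stores
 *    cs_add32(&b, counter, counter, 1);    // r10 can safely be read again
 */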

enum cs_reg_perm {
   CS_REG_NO_ACCESS = 0,
   CS_REG_RD = BITFIELD_BIT(1),
   CS_REG_WR = BITFIELD_BIT(2),
   CS_REG_RW = CS_REG_RD | CS_REG_WR,
};

struct cs_builder;

typedef enum cs_reg_perm (*reg_perm_cb_t)(struct cs_builder *b, unsigned reg);

struct cs_builder_conf {
   /* Number of 32-bit registers in the hardware register file */
   uint8_t nr_registers;

   /* Number of 32-bit registers used by the kernel at submission time */
   uint8_t nr_kernel_registers;

   /* CS buffer allocator */
   struct cs_buffer (*alloc_buffer)(void *cookie);

   /* Optional load/store tracker. */
   struct cs_load_store_tracker *ls_tracker;

   /* Optional register access checker. */
   reg_perm_cb_t reg_perm;

   /* Cookie passed back to alloc_buffer() */
   void *cookie;
};

/* The CS is formed of one or more CS chunks linked with JUMP instructions.
 * The builder keeps track of the current chunk and the position inside this
 * chunk, so it can emit new instructions, and decide when a new chunk needs
 * to be allocated.
 */
struct cs_chunk {
   /* CS buffer object backing this chunk */
   struct cs_buffer buffer;

   union {
      /* Current position in the buffer object when the chunk is active. */
      uint32_t pos;

      /* Chunk size when the chunk was wrapped. */
      uint32_t size;
   };
};

/* Monolithic sequence of instructions. Must live in a virtually contiguous
 * portion of code.
 */
struct cs_block {
   /* Used to insert the block in the block stack. */
   struct list_head node;
};

#define CS_LABEL_INVALID_POS ~0u

/* Labels can only be used inside a cs_block. They can be defined and
 * referenced before they are set to point to a specific position
 * in the block. */
struct cs_label {
   /* The last reference we have seen pointing to this label before
    * it was set. If set to CS_LABEL_INVALID_POS, no forward reference
    * pointing to this label exists.
    */
   uint32_t last_forward_ref;

   /* The label target. If set to CS_LABEL_INVALID_POS, the label has
    * not been set yet.
    */
   uint32_t target;
};
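
/*
 * Labels support both backward and forward references. A hedged sketch of
 * how they are typically used inside a block (the condition and registers
 * are arbitrary):
 *
 *    struct cs_block block;
 *    struct cs_label skip;
 *
 *    cs_block_start(&b, &block);
 *    cs_label_init(&skip);
 *    cs_branch_label(&b, &skip, MALI_CS_CONDITION_EQUAL, val);
 *    ... instructions skipped when val == 0 ...
 *    cs_set_label(&b, &skip);
 *    cs_block_end(&b);
 */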

struct cs_builder {
   /* CS builder configuration */
   struct cs_builder_conf conf;

   /* True if an allocation failed, making the whole CS invalid. */
   bool invalid;

   /* Initial (root) CS chunk. */
   struct cs_chunk root_chunk;

   /* Current CS chunk. */
   struct cs_chunk cur_chunk;

   /* Temporary storage for inner blocks that need to be built
    * and copied in one monolithic sequence of instructions with no
    * jump in the middle.
    */
   struct {
      struct list_head stack;
      struct cs_block *cur;
      struct util_dynarray instrs;
   } blocks;

   /* Move immediate instruction at the end of the last CS chunk that needs to
    * be patched with the final length of the current CS chunk in order to
    * facilitate correct overflow behaviour.
    */
   uint32_t *length_patch;

   /* Used as temporary storage when the allocator couldn't allocate a new
    * CS chunk.
    */
   uint64_t discard_instr_slot;
};

static inline void
cs_builder_init(struct cs_builder *b, const struct cs_builder_conf *conf,
                struct cs_buffer root_buffer)
{
   *b = (struct cs_builder){
      .conf = *conf,
      .root_chunk.buffer = root_buffer,
      .cur_chunk.buffer = root_buffer,
   };

   /* We need at least 3 registers for CS chunk linking. Assume the kernel needs
    * at least that too.
    */
   b->conf.nr_kernel_registers = MAX2(b->conf.nr_kernel_registers, 3);

   list_inithead(&b->blocks.stack);
   util_dynarray_init(&b->blocks.instrs, NULL);
}

static inline bool
cs_is_valid(struct cs_builder *b)
{
   return !b->invalid;
}

static inline bool
cs_is_empty(struct cs_builder *b)
{
   return b->cur_chunk.pos == 0 &&
          b->root_chunk.buffer.gpu == b->cur_chunk.buffer.gpu;
}

static inline uint64_t
cs_root_chunk_gpu_addr(struct cs_builder *b)
{
   return b->root_chunk.buffer.gpu;
}

static inline uint32_t
cs_root_chunk_size(struct cs_builder *b)
{
   /* Make sure cs_finish() was called. */
   assert(!memcmp(&b->cur_chunk, &(struct cs_chunk){0}, sizeof(b->cur_chunk)));

   return b->root_chunk.size * sizeof(uint64_t);
}

/*
 * Wrap the current queue. External users shouldn't call this function
 * directly, they should call cs_finish() when they are done building
 * the command stream, which will in turn call cs_wrap_chunk().
 *
 * Internally, this is also used to finalize internal CS chunks when
 * allocating new sub-chunks. See cs_alloc_ins() for details.
 *
 * This notably requires patching the previous chunk with the length
 * we ended up emitting for this chunk.
 */
static inline void
cs_wrap_chunk(struct cs_builder *b)
{
   if (!cs_is_valid(b))
      return;

   if (b->length_patch) {
      *b->length_patch = (b->cur_chunk.pos * 8);
      b->length_patch = NULL;
   }

   if (b->root_chunk.buffer.gpu == b->cur_chunk.buffer.gpu)
      b->root_chunk.size = b->cur_chunk.size;
}

/* Call this when you are done building a command stream and want to prepare
 * it for submission.
 */
static inline void
cs_finish(struct cs_builder *b)
{
   if (!cs_is_valid(b))
      return;

   cs_wrap_chunk(b);

   /* This prevents adding instructions after that point. */
   memset(&b->cur_chunk, 0, sizeof(b->cur_chunk));

   util_dynarray_fini(&b->blocks.instrs);
}

enum cs_index_type {
   CS_INDEX_REGISTER = 0,
   CS_INDEX_UNDEF,
};

struct cs_index {
   enum cs_index_type type;

   /* Number of 32-bit words in the index, must be nonzero */
   uint8_t size;

   union {
      uint64_t imm;
      uint8_t reg;
   };
};

static inline struct cs_index
cs_undef(void)
{
   return (struct cs_index){
      .type = CS_INDEX_UNDEF,
   };
}

static inline uint8_t
cs_to_reg_tuple(struct cs_index idx, ASSERTED unsigned expected_size)
{
   assert(idx.type == CS_INDEX_REGISTER);
   assert(idx.size == expected_size);

   return idx.reg;
}

static inline unsigned
cs_src_tuple(struct cs_builder *b, struct cs_index src, ASSERTED unsigned count)
{
   unsigned reg = cs_to_reg_tuple(src, count);

   if (unlikely(b->conf.reg_perm)) {
      for (unsigned i = reg; i < reg + count; i++) {
         assert((b->conf.reg_perm(b, i) & CS_REG_RD) ||
                !"Trying to read a restricted register");
      }
   }

   struct cs_load_store_tracker *ls_tracker = b->conf.ls_tracker;

   if (unlikely(ls_tracker)) {
      for (unsigned i = reg; i < reg + count; i++) {
         if (BITSET_TEST(ls_tracker->pending_loads, i))
            assert(!"register used as a source before flushing loads\n");
      }
   }

   return reg;
}

static inline unsigned
cs_src32(struct cs_builder *b, struct cs_index src)
{
   return cs_src_tuple(b, src, 1);
}

static inline unsigned
cs_src64(struct cs_builder *b, struct cs_index src)
{
   return cs_src_tuple(b, src, 2);
}

static inline unsigned
cs_dst_tuple(struct cs_builder *b, struct cs_index dst, ASSERTED unsigned count)
{
   unsigned reg = cs_to_reg_tuple(dst, count);

   if (unlikely(b->conf.reg_perm)) {
      for (unsigned i = reg; i < reg + count; i++) {
         assert((b->conf.reg_perm(b, i) & CS_REG_WR) ||
                !"Trying to write a restricted register");
      }
   }

   struct cs_load_store_tracker *ls_tracker = b->conf.ls_tracker;

   if (unlikely(ls_tracker)) {
      for (unsigned i = reg; i < reg + count; i++) {
         if (BITSET_TEST(ls_tracker->pending_stores, i))
            assert(
               !"register reused as a destination before flushing stores\n");
      }
   }

   return reg;
}

static inline unsigned
cs_dst32(struct cs_builder *b, struct cs_index dst)
{
   return cs_dst_tuple(b, dst, 1);
}

static inline unsigned
cs_dst64(struct cs_builder *b, struct cs_index dst)
{
   return cs_dst_tuple(b, dst, 2);
}

static inline struct cs_index
cs_reg_tuple(ASSERTED struct cs_builder *b, unsigned reg, unsigned size)
{
   assert(reg + size <= b->conf.nr_registers - b->conf.nr_kernel_registers &&
          "overflowed register file");
   assert(size <= 16 && "unsupported");

   return (struct cs_index){
      .type = CS_INDEX_REGISTER,
      .size = size,
      .reg = reg,
   };
}

static inline struct cs_index
cs_reg32(struct cs_builder *b, unsigned reg)
{
   return cs_reg_tuple(b, reg, 1);
}

static inline struct cs_index
cs_reg64(struct cs_builder *b, unsigned reg)
{
   assert((reg % 2) == 0 && "unaligned 64-bit reg");
   return cs_reg_tuple(b, reg, 2);
}

/*
 * The top of the register file is reserved for cs_builder internal use. We
 * need 3 spare registers for handling command queue overflow. These are
 * available here.
 */
static inline uint8_t
cs_overflow_address_reg(struct cs_builder *b)
{
   return b->conf.nr_registers - 2;
}

static inline uint8_t
cs_overflow_length_reg(struct cs_builder *b)
{
   return b->conf.nr_registers - 3;
}

static inline struct cs_index
cs_extract32(struct cs_builder *b, struct cs_index idx, unsigned word)
{
   assert(idx.type == CS_INDEX_REGISTER && "unsupported");
   assert(word < idx.size && "overrun");

   return cs_reg32(b, idx.reg + word);
}

#define JUMP_SEQ_INSTR_COUNT 4

static inline void *
cs_alloc_ins(struct cs_builder *b, uint32_t num_instrs)
{
   /* If an allocation failure happened before, we just discard all following
    * instructions.
    */
   if (unlikely(!cs_is_valid(b)))
      return &b->discard_instr_slot;

   if (b->blocks.cur)
      return util_dynarray_grow(&b->blocks.instrs, uint64_t, num_instrs);

   /* Lazy root chunk allocation. */
   if (unlikely(!b->root_chunk.buffer.cpu)) {
      b->root_chunk.buffer = b->conf.alloc_buffer(b->conf.cookie);
      b->cur_chunk.buffer = b->root_chunk.buffer;
      if (!b->cur_chunk.buffer.cpu) {
         b->invalid = true;
         return &b->discard_instr_slot;
      }
   }

   /* If the current chunk runs out of space, allocate a new one and jump to it.
    * We actually do this a few instructions before running out, because the
    * sequence to jump to a new queue takes multiple instructions.
    */
   if (unlikely((b->cur_chunk.size + num_instrs + JUMP_SEQ_INSTR_COUNT) >
                b->cur_chunk.buffer.capacity)) {
      /* Now, allocate a new chunk */
      struct cs_buffer newbuf = b->conf.alloc_buffer(b->conf.cookie);

      /* Allocation failure, from now on, all new instructions will be
       * discarded.
       */
      if (unlikely(!newbuf.cpu)) {
         b->invalid = true;
         return &b->discard_instr_slot;
      }

      uint64_t *ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++);

      pan_pack(ptr, CS_MOVE, I) {
         I.destination = cs_overflow_address_reg(b);
         I.immediate = newbuf.gpu;
      }

      ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++);

      pan_pack(ptr, CS_MOVE32, I) {
         I.destination = cs_overflow_length_reg(b);
      }

      /* The length will be patched in later */
      uint32_t *length_patch = (uint32_t *)ptr;

      ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++);

      pan_pack(ptr, CS_JUMP, I) {
         I.length = cs_overflow_length_reg(b);
         I.address = cs_overflow_address_reg(b);
      }

      /* Now that we've emitted everything, finish up the previous queue */
      cs_wrap_chunk(b);

      /* And make this one current */
      b->length_patch = length_patch;
      b->cur_chunk.buffer = newbuf;
      b->cur_chunk.pos = 0;
   }

   assert(b->cur_chunk.size + num_instrs - 1 < b->cur_chunk.buffer.capacity);
   uint32_t pos = b->cur_chunk.pos;
   b->cur_chunk.pos += num_instrs;
   return b->cur_chunk.buffer.cpu + pos;
}

/*
 * Helper to emit a new instruction into the command queue. The allocation needs
 * to be separated out because pan_pack can evaluate its argument multiple
 * times, yet cs_alloc_ins() has side effects.
 */
#define cs_emit(b, T, cfg) pan_pack(cs_alloc_ins(b, 1), CS_##T, cfg)

/* Asynchronous operations take a mask of scoreboard slots to wait on
 * before executing the instruction, and signal a scoreboard slot when
 * the operation is complete.
 * A wait_mask of zero means the operation is synchronous, and signal_slot
 * is ignored in that case.
 */
struct cs_async_op {
   uint16_t wait_mask;
   uint8_t signal_slot;
};

static inline struct cs_async_op
cs_defer(unsigned wait_mask, unsigned signal_slot)
{
   /* The scoreboard slot to signal is incremented before the wait operation,
    * so waiting on it would cause an infinite wait.
    */
   assert(!(wait_mask & BITFIELD_BIT(signal_slot)));

   return (struct cs_async_op){
      .wait_mask = wait_mask,
      .signal_slot = signal_slot,
   };
}

static inline struct cs_async_op
cs_now(void)
{
   return (struct cs_async_op){
      .wait_mask = 0,
      .signal_slot = ~0,
   };
}
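
/*
 * Hedged usage sketch: cs_now() makes an operation synchronous, while
 * cs_defer() makes it wait on, and signal, specific scoreboard slots.
 * The slot numbers are arbitrary; scope, val and addr are assumed to be
 * defined elsewhere:
 *
 *    // Run right away, don't signal anything.
 *    cs_sync32_set(&b, true, scope, val, addr, cs_now());
 *
 *    // Wait on slots 0 and 1, signal slot 2 on completion.
 *    cs_sync32_set(&b, true, scope, val, addr,
 *                  cs_defer(BITFIELD_BIT(0) | BITFIELD_BIT(1), 2));
 */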

static inline bool
cs_instr_is_asynchronous(enum mali_cs_opcode opcode, uint16_t wait_mask)
{
   switch (opcode) {
   case MALI_CS_OPCODE_FLUSH_CACHE2:
   case MALI_CS_OPCODE_FINISH_TILING:
   case MALI_CS_OPCODE_LOAD_MULTIPLE:
   case MALI_CS_OPCODE_STORE_MULTIPLE:
   case MALI_CS_OPCODE_RUN_COMPUTE:
   case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT:
   case MALI_CS_OPCODE_RUN_FRAGMENT:
   case MALI_CS_OPCODE_RUN_FULLSCREEN:
   case MALI_CS_OPCODE_RUN_IDVS:
   case MALI_CS_OPCODE_RUN_TILING:
      /* Always asynchronous. */
      return true;

   case MALI_CS_OPCODE_FINISH_FRAGMENT:
   case MALI_CS_OPCODE_SYNC_ADD32:
   case MALI_CS_OPCODE_SYNC_SET32:
   case MALI_CS_OPCODE_SYNC_ADD64:
   case MALI_CS_OPCODE_SYNC_SET64:
   case MALI_CS_OPCODE_STORE_STATE:
   case MALI_CS_OPCODE_TRACE_POINT:
   case MALI_CS_OPCODE_HEAP_OPERATION:
      /* Asynchronous only if wait_mask != 0. */
      return wait_mask != 0;

   default:
      return false;
   }
}

#define cs_apply_async(I, async)                                               \
   do {                                                                        \
      I.wait_mask = async.wait_mask;                                           \
      I.signal_slot = cs_instr_is_asynchronous(I.opcode, I.wait_mask)          \
                         ? async.signal_slot                                   \
                         : 0;                                                  \
      assert(I.signal_slot != ~0 ||                                            \
             !"Can't use cs_now() on pure async instructions");                \
   } while (0)

static inline void
cs_move32_to(struct cs_builder *b, struct cs_index dest, unsigned imm)
{
   cs_emit(b, MOVE32, I) {
      I.destination = cs_dst32(b, dest);
      I.immediate = imm;
   }
}

static inline void
cs_move48_to(struct cs_builder *b, struct cs_index dest, uint64_t imm)
{
   cs_emit(b, MOVE, I) {
      I.destination = cs_dst64(b, dest);
      I.immediate = imm;
   }
}

static inline void
cs_block_start(struct cs_builder *b, struct cs_block *block)
{
   list_addtail(&block->node, &b->blocks.stack);
   b->blocks.cur = block;
}

static inline void
cs_block_end(struct cs_builder *b)
{
   assert(b->blocks.cur);

   list_del(&b->blocks.cur->node);

   if (!list_is_empty(&b->blocks.stack)) {
      b->blocks.cur = list_last_entry(&b->blocks.stack, struct cs_block, node);
      return;
   }

   b->blocks.cur = NULL;

   uint32_t num_instrs =
      util_dynarray_num_elements(&b->blocks.instrs, uint64_t);
   void *buffer = cs_alloc_ins(b, num_instrs);

   memcpy(buffer, b->blocks.instrs.data, b->blocks.instrs.size);
   util_dynarray_clear(&b->blocks.instrs);
}

static inline uint32_t
cs_block_next_pos(struct cs_builder *b)
{
   assert(b->blocks.cur);

   return util_dynarray_num_elements(&b->blocks.instrs, uint64_t);
}

static inline void
cs_branch(struct cs_builder *b, int offset, enum mali_cs_condition cond,
          struct cs_index val)
{
   cs_emit(b, BRANCH, I) {
      I.offset = offset;
      I.condition = cond;
      I.value = cs_src32(b, val);
   }
}

static inline void
cs_branch_label(struct cs_builder *b, struct cs_label *label,
                enum mali_cs_condition cond, struct cs_index val)
{
   assert(b->blocks.cur);

   if (label->target == CS_LABEL_INVALID_POS) {
      uint32_t branch_ins_pos = cs_block_next_pos(b);

      /* Instead of emitting a BRANCH with the final offset, we record the
       * diff between the current branch, and the previous branch that was
       * referencing this unset label. This way we build a single linked list
       * that can be walked when the label is set with cs_set_label().
       * We use -1 as the end-of-list marker.
       */
      int16_t offset = -1;
      if (label->last_forward_ref != CS_LABEL_INVALID_POS) {
         assert(label->last_forward_ref < branch_ins_pos);
         assert(branch_ins_pos - label->last_forward_ref <= INT16_MAX);
         offset = branch_ins_pos - label->last_forward_ref;
      }

      cs_emit(b, BRANCH, I) {
         I.offset = offset;
         I.condition = cond;
         I.value = cond != MALI_CS_CONDITION_ALWAYS ? cs_src32(b, val) : 0;
      }

      label->last_forward_ref = branch_ins_pos;
   } else {
      int32_t offset = label->target - cs_block_next_pos(b) - 1;

      /* The branch target is encoded in a 16-bit signed integer, so make sure
       * we don't underflow.
       */
      assert(offset >= INT16_MIN);

      /* Backward references are easy, we can emit them immediately. */
      cs_emit(b, BRANCH, I) {
         I.offset = offset;
         I.condition = cond;
         I.value = cs_src32(b, val);
      }
   }
}

static inline void
cs_label_init(struct cs_label *label)
{
   label->last_forward_ref = CS_LABEL_INVALID_POS;
   label->target = CS_LABEL_INVALID_POS;
}

static inline void
cs_set_label(struct cs_builder *b, struct cs_label *label)
{
   assert(label->target == CS_LABEL_INVALID_POS);
   label->target = cs_block_next_pos(b);

   for (uint32_t next_forward_ref, forward_ref = label->last_forward_ref;
        forward_ref != CS_LABEL_INVALID_POS; forward_ref = next_forward_ref) {
      uint64_t *ins =
         util_dynarray_element(&b->blocks.instrs, uint64_t, forward_ref);

      assert(forward_ref < label->target);
      assert(label->target - forward_ref <= INT16_MAX);

      /* Save the next forward reference to this target before overwriting
       * it with the final offset.
       */
      int16_t offset = *ins & BITFIELD64_MASK(16);

      next_forward_ref =
         offset > 0 ? forward_ref - offset : CS_LABEL_INVALID_POS;

      assert(next_forward_ref == CS_LABEL_INVALID_POS ||
             next_forward_ref < forward_ref);

      *ins &= ~BITFIELD64_MASK(16);
      *ins |= label->target - forward_ref - 1;
   }
}

struct cs_loop {
   struct cs_label start, end;
   struct cs_block block;
   enum mali_cs_condition cond;
   struct cs_index val;
   struct cs_load_store_tracker *orig_ls_state;
   struct cs_load_store_tracker ls_state;
};

static inline enum mali_cs_condition
cs_invert_cond(enum mali_cs_condition cond)
{
   switch (cond) {
   case MALI_CS_CONDITION_LEQUAL:
      return MALI_CS_CONDITION_GREATER;
   case MALI_CS_CONDITION_EQUAL:
      return MALI_CS_CONDITION_NEQUAL;
   case MALI_CS_CONDITION_LESS:
      return MALI_CS_CONDITION_GEQUAL;
   case MALI_CS_CONDITION_GREATER:
      return MALI_CS_CONDITION_LEQUAL;
   case MALI_CS_CONDITION_NEQUAL:
      return MALI_CS_CONDITION_EQUAL;
   case MALI_CS_CONDITION_GEQUAL:
      return MALI_CS_CONDITION_LESS;
   case MALI_CS_CONDITION_ALWAYS:
      unreachable("cannot invert ALWAYS");
   default:
      unreachable("invalid cond");
   }
}

static inline void
cs_loop_diverge_ls_update(struct cs_builder *b, struct cs_loop *loop)
{
   if (likely(!b->conf.ls_tracker))
      return;

   if (!loop->orig_ls_state) {
      loop->orig_ls_state = b->conf.ls_tracker;
      loop->ls_state = *loop->orig_ls_state;
      b->conf.ls_tracker = &loop->ls_state;
   } else {
      BITSET_OR(loop->orig_ls_state->pending_loads,
                loop->orig_ls_state->pending_loads,
                loop->ls_state.pending_loads);
      BITSET_OR(loop->orig_ls_state->pending_stores,
                loop->orig_ls_state->pending_stores,
                loop->ls_state.pending_stores);
   }
}

static inline struct cs_loop *
cs_do_while_start(struct cs_builder *b, struct cs_loop *loop,
                  enum mali_cs_condition cond, struct cs_index val)
{
   *loop = (struct cs_loop){
      .cond = cond,
      .val = val,
   };

   cs_block_start(b, &loop->block);
   cs_label_init(&loop->start);
   cs_label_init(&loop->end);
   cs_set_label(b, &loop->start);
   return loop;
}

static inline struct cs_loop *
cs_while_start(struct cs_builder *b, struct cs_loop *loop,
               enum mali_cs_condition cond, struct cs_index val)
{
   cs_do_while_start(b, loop, cond, val);

   /* Do an initial check on the condition, and if it's false, jump to
    * the end of the loop block. For 'while(true)' loops, skip the
    * conditional branch.
    */
   if (cond != MALI_CS_CONDITION_ALWAYS) {
      cs_branch_label(b, &loop->end, cs_invert_cond(cond), val);
      cs_loop_diverge_ls_update(b, loop);
   }

   return loop;
}

static inline void
cs_loop_continue(struct cs_builder *b, enum mali_cs_condition cond,
                 struct cs_index val)
{
   assert(b->blocks.cur);

   struct cs_loop *loop = container_of(b->blocks.cur, struct cs_loop, block);

   cs_branch_label(b, &loop->start, cond, val);
   cs_loop_diverge_ls_update(b, loop);
}

static inline void
cs_loop_break(struct cs_builder *b, enum mali_cs_condition cond,
              struct cs_index val)
{
   assert(b->blocks.cur);

   struct cs_loop *loop = container_of(b->blocks.cur, struct cs_loop, block);

   cs_branch_label(b, &loop->end, cond, val);
   cs_loop_diverge_ls_update(b, loop);
}

static inline void
cs_while_end(struct cs_builder *b)
{
   assert(b->blocks.cur);

   struct cs_loop *loop = container_of(b->blocks.cur, struct cs_loop, block);

   cs_branch_label(b, &loop->start, loop->cond, loop->val);
   cs_set_label(b, &loop->end);
   cs_block_end(b);

   if (unlikely(loop->orig_ls_state)) {
      BITSET_OR(loop->orig_ls_state->pending_loads,
                loop->orig_ls_state->pending_loads,
                loop->ls_state.pending_loads);
      BITSET_OR(loop->orig_ls_state->pending_stores,
                loop->orig_ls_state->pending_stores,
                loop->ls_state.pending_stores);
      b->conf.ls_tracker = loop->orig_ls_state;
   }
}

#define cs_while(__b, cond, val)                                               \
   for (struct cs_loop __loop_storage,                                         \
        *__loop = cs_while_start(__b, &__loop_storage, cond, val);             \
        __loop != NULL; cs_while_end(__b), __loop = NULL)

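/*
 * Sketch of how the loop helpers are meant to be used (register number,
 * iteration count and condition are arbitrary): the loop below runs while
 * the counter register compares GREATER against zero.
 *
 *    struct cs_index counter = cs_reg32(&b, 10);
 *
 *    cs_move32_to(&b, counter, 16);
 *    cs_while(&b, MALI_CS_CONDITION_GREATER, counter) {
 *       ... loop body emitted here ...
 *       cs_add32(&b, counter, counter, -1);
 *    }
 */
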
/* Pseudoinstructions follow */

static inline void
cs_move64_to(struct cs_builder *b, struct cs_index dest, uint64_t imm)
{
   if (imm < (1ull << 48)) {
      /* Zero extends */
      cs_move48_to(b, dest, imm);
   } else {
      cs_move32_to(b, cs_extract32(b, dest, 0), imm);
      cs_move32_to(b, cs_extract32(b, dest, 1), imm >> 32);
   }
}

static inline void
cs_wait_slots(struct cs_builder *b, unsigned wait_mask, bool progress_inc)
{
   struct cs_load_store_tracker *ls_tracker = b->conf.ls_tracker;

   cs_emit(b, WAIT, I) {
      I.wait_mask = wait_mask;
      I.progress_increment = progress_inc;
   }

   /* We don't do advanced tracking of cs_defer(), and assume that loads/stores
    * will be flushed with an explicit wait on the load/store scoreboard.
    */
   if (unlikely(ls_tracker) &&
       (wait_mask & BITFIELD_BIT(ls_tracker->sb_slot))) {
      BITSET_CLEAR_RANGE(ls_tracker->pending_loads, 0, 255);
      BITSET_CLEAR_RANGE(ls_tracker->pending_stores, 0, 255);
   }
}

static inline void
cs_wait_slot(struct cs_builder *b, unsigned slot, bool progress_inc)
{
   assert(slot < 8 && "invalid slot");

   cs_wait_slots(b, BITFIELD_BIT(slot), progress_inc);
}

struct cs_shader_res_sel {
   uint8_t srt, fau, spd, tsd;
};

static inline struct cs_shader_res_sel
cs_shader_res_sel(unsigned srt, unsigned fau, unsigned spd, unsigned tsd)
{
   return (struct cs_shader_res_sel){
      .srt = srt,
      .fau = fau,
      .spd = spd,
      .tsd = tsd,
   };
}

static inline void
cs_run_compute(struct cs_builder *b, unsigned task_increment,
               enum mali_task_axis task_axis, bool progress_inc,
               struct cs_shader_res_sel res_sel)
{
   cs_emit(b, RUN_COMPUTE, I) {
      I.task_increment = task_increment;
      I.task_axis = task_axis;
      I.progress_increment = progress_inc;
      I.srt_select = res_sel.srt;
      I.spd_select = res_sel.spd;
      I.tsd_select = res_sel.tsd;
      I.fau_select = res_sel.fau;
   }
}

static inline void
cs_run_tiling(struct cs_builder *b, uint32_t flags_override, bool progress_inc,
              struct cs_shader_res_sel res_sel)
{
   cs_emit(b, RUN_TILING, I) {
      I.flags_override = flags_override;
      I.progress_increment = progress_inc;
      I.srt_select = res_sel.srt;
      I.spd_select = res_sel.spd;
      I.tsd_select = res_sel.tsd;
      I.fau_select = res_sel.fau;
   }
}

static inline void
cs_run_idvs(struct cs_builder *b, uint32_t flags_override, bool progress_inc,
            bool malloc_enable, struct cs_shader_res_sel varying_sel,
            struct cs_shader_res_sel frag_sel, struct cs_index draw_id)
{
   cs_emit(b, RUN_IDVS, I) {
      I.flags_override = flags_override;
      I.progress_increment = progress_inc;
      I.malloc_enable = malloc_enable;

      if (draw_id.type == CS_INDEX_UNDEF) {
         I.draw_id_register_enable = false;
      } else {
         I.draw_id_register_enable = true;
         I.draw_id = cs_src32(b, draw_id);
      }

      assert(varying_sel.spd == 1);
      assert(varying_sel.fau == 0 || varying_sel.fau == 1);
      assert(varying_sel.srt == 0 || varying_sel.srt == 1);
      assert(varying_sel.tsd == 0 || varying_sel.tsd == 1);
      I.varying_fau_select = varying_sel.fau == 1;
      I.varying_srt_select = varying_sel.srt == 1;
      I.varying_tsd_select = varying_sel.tsd == 1;

      assert(frag_sel.spd == 2);
      assert(frag_sel.fau == 2);
      assert(frag_sel.srt == 2 || frag_sel.srt == 0);
      assert(frag_sel.tsd == 2 || frag_sel.tsd == 0);
      I.fragment_srt_select = frag_sel.srt == 2;
      I.fragment_tsd_select = frag_sel.tsd == 2;
   }
}

static inline void
cs_run_fragment(struct cs_builder *b, bool enable_tem,
                enum mali_tile_render_order tile_order, bool progress_inc)
{
   cs_emit(b, RUN_FRAGMENT, I) {
      I.enable_tem = enable_tem;
      I.tile_order = tile_order;
      I.progress_increment = progress_inc;
   }
}

static inline void
cs_run_fullscreen(struct cs_builder *b, uint32_t flags_override,
                  bool progress_inc, struct cs_index dcd)
{
   cs_emit(b, RUN_FULLSCREEN, I) {
      I.flags_override = flags_override;
      I.progress_increment = progress_inc;
      I.dcd = cs_src64(b, dcd);
   }
}

static inline void
cs_finish_tiling(struct cs_builder *b, bool progress_inc)
{
   cs_emit(b, FINISH_TILING, I)
      I.progress_increment = progress_inc;
}

static inline void
cs_finish_fragment(struct cs_builder *b, bool increment_frag_completed,
                   struct cs_index first_free_heap_chunk,
                   struct cs_index last_free_heap_chunk,
                   struct cs_async_op async)
{
   cs_emit(b, FINISH_FRAGMENT, I) {
      I.increment_fragment_completed = increment_frag_completed;
      cs_apply_async(I, async);
      I.first_heap_chunk = cs_src64(b, first_free_heap_chunk);
      I.last_heap_chunk = cs_src64(b, last_free_heap_chunk);
   }
}

static inline void
cs_add32(struct cs_builder *b, struct cs_index dest, struct cs_index src,
         unsigned imm)
{
   cs_emit(b, ADD_IMMEDIATE32, I) {
      I.destination = cs_dst32(b, dest);
      I.source = cs_src32(b, src);
      I.immediate = imm;
   }
}

static inline void
cs_add64(struct cs_builder *b, struct cs_index dest, struct cs_index src,
         unsigned imm)
{
   cs_emit(b, ADD_IMMEDIATE64, I) {
      I.destination = cs_dst64(b, dest);
      I.source = cs_src64(b, src);
      I.immediate = imm;
   }
}

static inline void
cs_umin32(struct cs_builder *b, struct cs_index dest, struct cs_index src1,
          struct cs_index src2)
{
   cs_emit(b, UMIN32, I) {
      I.destination = cs_dst32(b, dest);
      I.source_1 = cs_src32(b, src1);
      I.source_2 = cs_src32(b, src2);
   }
}

static inline void
cs_load_to(struct cs_builder *b, struct cs_index dest, struct cs_index address,
           unsigned mask, int offset)
{
   unsigned count = util_last_bit(mask);
   unsigned base_reg = cs_dst_tuple(b, dest, count);

   cs_emit(b, LOAD_MULTIPLE, I) {
      I.base_register = base_reg;
      I.address = cs_src64(b, address);
      I.mask = mask;
      I.offset = offset;
   }

   if (unlikely(b->conf.ls_tracker)) {
      for (unsigned i = 0; i < count; i++) {
         if (mask & BITFIELD_BIT(i))
            BITSET_SET(b->conf.ls_tracker->pending_loads, base_reg + i);
      }
   }
}

static inline void
cs_load32_to(struct cs_builder *b, struct cs_index dest,
             struct cs_index address, int offset)
{
   cs_load_to(b, dest, address, BITFIELD_MASK(1), offset);
}

static inline void
cs_load64_to(struct cs_builder *b, struct cs_index dest,
             struct cs_index address, int offset)
{
   cs_load_to(b, dest, address, BITFIELD_MASK(2), offset);
}

static inline void
cs_store(struct cs_builder *b, struct cs_index data, struct cs_index address,
         unsigned mask, int offset)
{
   unsigned count = util_last_bit(mask);
   unsigned base_reg = cs_src_tuple(b, data, count);

   cs_emit(b, STORE_MULTIPLE, I) {
      I.base_register = base_reg;
      I.address = cs_src64(b, address);
      I.mask = mask;
      I.offset = offset;
   }

   if (unlikely(b->conf.ls_tracker)) {
      for (unsigned i = 0; i < count; i++) {
         if (mask & BITFIELD_BIT(i))
            BITSET_SET(b->conf.ls_tracker->pending_stores, base_reg + i);
      }
   }
}

static inline void
cs_store32(struct cs_builder *b, struct cs_index data, struct cs_index address,
           int offset)
{
   cs_store(b, data, address, BITFIELD_MASK(1), offset);
}

static inline void
cs_store64(struct cs_builder *b, struct cs_index data, struct cs_index address,
           int offset)
{
   cs_store(b, data, address, BITFIELD_MASK(2), offset);
}

/*
 * Select which scoreboard entry will track endpoint tasks and other tasks
 * respectively. Pass the slot to cs_wait_slot()/cs_wait_slots() to wait on
 * it later.
 */
static inline void
cs_set_scoreboard_entry(struct cs_builder *b, unsigned ep, unsigned other)
{
   assert(ep < 8 && "invalid slot");
   assert(other < 8 && "invalid slot");

   cs_emit(b, SET_SB_ENTRY, I) {
      I.endpoint_entry = ep;
      I.other_entry = other;
   }

   /* We assume the load/store scoreboard entry is static to keep things
    * simple. */
   if (unlikely(b->conf.ls_tracker))
      assert(b->conf.ls_tracker->sb_slot == other);
}
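
/*
 * Rough illustration (slot numbers chosen arbitrarily): endpoint tasks are
 * tracked on slot 2 and everything else on slot 0, so a later wait on slot 2
 * blocks until the previously issued endpoint work has completed.
 *
 *    cs_set_scoreboard_entry(&b, 2, 0);
 *    cs_run_compute(&b, ...);
 *    cs_wait_slot(&b, 2, false);
 */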

static inline void
cs_progress_wait(struct cs_builder *b, unsigned queue, struct cs_index ref)
{
   cs_emit(b, PROGRESS_WAIT, I) {
      I.source = cs_src64(b, ref);
      I.queue = queue;
   }
}

static inline void
cs_set_exception_handler(struct cs_builder *b,
                         enum mali_cs_exception_type exception_type,
                         struct cs_index address, struct cs_index length)
{
   cs_emit(b, SET_EXCEPTION_HANDLER, I) {
      I.exception_type = exception_type;
      I.address = cs_src64(b, address);
      I.length = cs_src32(b, length);
   }
}

static inline void
cs_call(struct cs_builder *b, struct cs_index address, struct cs_index length)
{
   cs_emit(b, CALL, I) {
      I.address = cs_src64(b, address);
      I.length = cs_src32(b, length);
   }
}

static inline void
cs_jump(struct cs_builder *b, struct cs_index address, struct cs_index length)
{
   cs_emit(b, JUMP, I) {
      I.address = cs_src64(b, address);
      I.length = cs_src32(b, length);
   }
}

enum cs_res_id {
   CS_COMPUTE_RES = BITFIELD_BIT(0),
   CS_FRAG_RES = BITFIELD_BIT(1),
   CS_TILER_RES = BITFIELD_BIT(2),
   CS_IDVS_RES = BITFIELD_BIT(3),
};

static inline void
cs_req_res(struct cs_builder *b, u32 res_mask)
{
   cs_emit(b, REQ_RESOURCE, I) {
      I.compute = res_mask & CS_COMPUTE_RES;
      I.tiler = res_mask & CS_TILER_RES;
      I.idvs = res_mask & CS_IDVS_RES;
      I.fragment = res_mask & CS_FRAG_RES;
   }
}

static inline void
cs_flush_caches(struct cs_builder *b, enum mali_cs_flush_mode l2,
                enum mali_cs_flush_mode lsc, bool other_inv,
                struct cs_index flush_id, struct cs_async_op async)
{
   cs_emit(b, FLUSH_CACHE2, I) {
      I.l2_flush_mode = l2;
      I.lsc_flush_mode = lsc;
      I.other_invalidate = other_inv;
      I.latest_flush_id = cs_src32(b, flush_id);
      cs_apply_async(I, async);
   }
}

#define CS_SYNC_OPS(__cnt_width)                                               \
   static inline void cs_sync##__cnt_width##_set(                             \
      struct cs_builder *b, bool propagate_error,                             \
      enum mali_cs_sync_scope scope, struct cs_index val,                     \
      struct cs_index addr, struct cs_async_op async)                         \
   {                                                                          \
      cs_emit(b, SYNC_SET##__cnt_width, I) {                                  \
         I.error_propagate = propagate_error;                                 \
         I.scope = scope;                                                     \
         I.data = cs_src##__cnt_width(b, val);                                \
         I.address = cs_src64(b, addr);                                       \
         cs_apply_async(I, async);                                            \
      }                                                                       \
   }                                                                          \
                                                                              \
   static inline void cs_sync##__cnt_width##_add(                             \
      struct cs_builder *b, bool propagate_error,                             \
      enum mali_cs_sync_scope scope, struct cs_index val,                     \
      struct cs_index addr, struct cs_async_op async)                         \
   {                                                                          \
      cs_emit(b, SYNC_ADD##__cnt_width, I) {                                  \
         I.error_propagate = propagate_error;                                 \
         I.scope = scope;                                                     \
         I.data = cs_src##__cnt_width(b, val);                                \
         I.address = cs_src64(b, addr);                                       \
         cs_apply_async(I, async);                                            \
      }                                                                       \
   }                                                                          \
                                                                              \
   static inline void cs_sync##__cnt_width##_wait(                            \
      struct cs_builder *b, bool reject_error, enum mali_cs_condition cond,   \
      struct cs_index ref, struct cs_index addr)                              \
   {                                                                          \
      assert(cond == MALI_CS_CONDITION_LEQUAL ||                              \
             cond == MALI_CS_CONDITION_GREATER);                              \
      cs_emit(b, SYNC_WAIT##__cnt_width, I) {                                 \
         I.error_reject = reject_error;                                       \
         I.condition = cond;                                                  \
         I.data = cs_src##__cnt_width(b, ref);                                \
         I.address = cs_src64(b, addr);                                       \
      }                                                                       \
   }

CS_SYNC_OPS(32)
CS_SYNC_OPS(64)

static inline void
cs_store_state(struct cs_builder *b, struct cs_index address, int offset,
               enum mali_cs_state state, struct cs_async_op async)
{
   cs_emit(b, STORE_STATE, I) {
      I.offset = offset;
      I.state = state;
      I.address = cs_src64(b, address);
      cs_apply_async(I, async);
   }
}

static inline void
cs_prot_region(struct cs_builder *b, unsigned size)
{
   cs_emit(b, PROT_REGION, I) {
      I.size = size;
   }
}

static inline void
cs_progress_store(struct cs_builder *b, struct cs_index src)
{
   cs_emit(b, PROGRESS_STORE, I)
      I.source = cs_src64(b, src);
}

static inline void
cs_progress_load(struct cs_builder *b, struct cs_index dst)
{
   cs_emit(b, PROGRESS_LOAD, I)
      I.destination = cs_dst64(b, dst);
}

static inline void
cs_run_compute_indirect(struct cs_builder *b, unsigned wg_per_task,
                        bool progress_inc, struct cs_shader_res_sel res_sel)
{
   cs_emit(b, RUN_COMPUTE_INDIRECT, I) {
      I.workgroups_per_task = wg_per_task;
      I.progress_increment = progress_inc;
      I.srt_select = res_sel.srt;
      I.spd_select = res_sel.spd;
      I.tsd_select = res_sel.tsd;
      I.fau_select = res_sel.fau;
   }
}

static inline void
cs_error_barrier(struct cs_builder *b)
{
   cs_emit(b, ERROR_BARRIER, _)
      ;
}

static inline void
cs_heap_set(struct cs_builder *b, struct cs_index address)
{
   cs_emit(b, HEAP_SET, I) {
      I.address = cs_src64(b, address);
   }
}

static inline void
cs_heap_operation(struct cs_builder *b, enum mali_cs_heap_operation operation,
                  struct cs_async_op async)
{
   cs_emit(b, HEAP_OPERATION, I) {
      I.operation = operation;
      cs_apply_async(I, async);
   }
}

static inline void
cs_vt_start(struct cs_builder *b, struct cs_async_op async)
{
   cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_STARTED, async);
}

static inline void
cs_vt_end(struct cs_builder *b, struct cs_async_op async)
{
   cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_COMPLETED, async);
}

static inline void
cs_frag_end(struct cs_builder *b, struct cs_async_op async)
{
   cs_heap_operation(b, MALI_CS_HEAP_OPERATION_FRAGMENT_COMPLETED, async);
}

static inline void
cs_trace_point(struct cs_builder *b, struct cs_index regs,
               struct cs_async_op async)
{
   cs_emit(b, TRACE_POINT, I) {
      I.base_register = cs_src_tuple(b, regs, regs.size);
      I.register_count = regs.size;
      cs_apply_async(I, async);
   }
}

struct cs_match {
   struct cs_block block;
   struct cs_label break_label;
   struct cs_block case_block;
   struct cs_label next_case_label;
   struct cs_index val;
   struct cs_index scratch_reg;
   struct cs_load_store_tracker case_ls_state;
   struct cs_load_store_tracker ls_state;
   struct cs_load_store_tracker *orig_ls_state;
   bool default_emitted;
};

static inline struct cs_match *
cs_match_start(struct cs_builder *b, struct cs_match *match,
               struct cs_index val, struct cs_index scratch_reg)
{
   *match = (struct cs_match){
      .val = val,
      .scratch_reg = scratch_reg,
      .orig_ls_state = b->conf.ls_tracker,
   };

   cs_block_start(b, &match->block);
   cs_label_init(&match->break_label);
   cs_label_init(&match->next_case_label);

   return match;
}

static inline void
cs_match_case_ls_set(struct cs_builder *b, struct cs_match *match)
{
   if (unlikely(match->orig_ls_state)) {
      match->case_ls_state = *match->orig_ls_state;
      b->conf.ls_tracker = &match->case_ls_state;
   }
}

static inline void
cs_match_case_ls_get(struct cs_match *match)
{
   if (unlikely(match->orig_ls_state)) {
      BITSET_OR(match->ls_state.pending_loads,
                match->case_ls_state.pending_loads,
                match->ls_state.pending_loads);
      BITSET_OR(match->ls_state.pending_stores,
                match->case_ls_state.pending_stores,
                match->ls_state.pending_stores);
   }
}

static inline void
cs_match_case(struct cs_builder *b, struct cs_match *match, uint32_t id)
{
   assert(b->blocks.cur && (b->blocks.cur == &match->block ||
                            b->blocks.cur == &match->case_block));
   assert(!match->default_emitted || !"default case must be last");
   if (match->next_case_label.last_forward_ref != CS_LABEL_INVALID_POS) {
      cs_branch_label(b, &match->break_label, MALI_CS_CONDITION_ALWAYS,
                      cs_undef());
      assert(b->blocks.cur == &match->case_block);
      cs_block_end(b);
      cs_match_case_ls_get(match);
      cs_set_label(b, &match->next_case_label);
      cs_label_init(&match->next_case_label);
   }

   if (id)
      cs_add32(b, match->scratch_reg, match->val, -id);

   cs_branch_label(b, &match->next_case_label, MALI_CS_CONDITION_NEQUAL,
                   id ? match->scratch_reg : match->val);

   cs_match_case_ls_set(b, match);
   cs_block_start(b, &match->case_block);
}

static inline void
cs_match_default(struct cs_builder *b, struct cs_match *match)
{
   assert(b->blocks.cur && (b->blocks.cur == &match->block ||
                            b->blocks.cur == &match->case_block));
   assert(match->next_case_label.last_forward_ref != CS_LABEL_INVALID_POS ||
          !"default case requires at least one other case");
   cs_branch_label(b, &match->break_label, MALI_CS_CONDITION_ALWAYS,
                   cs_undef());

   if (b->blocks.cur == &match->case_block) {
      cs_block_end(b);
      cs_match_case_ls_get(match);
   }

   cs_set_label(b, &match->next_case_label);
   cs_label_init(&match->next_case_label);
   cs_match_case_ls_set(b, match);
   cs_block_start(b, &match->case_block);
   match->default_emitted = true;
}

static inline void
cs_match_end(struct cs_builder *b, struct cs_match *match)
{
   assert(b->blocks.cur && (b->blocks.cur == &match->block ||
                            b->blocks.cur == &match->case_block));

   if (b->blocks.cur == &match->case_block) {
      cs_match_case_ls_get(match);
      cs_block_end(b);
   }

   if (unlikely(match->orig_ls_state)) {
      if (!match->default_emitted) {
         /* If we don't have a default, assume we don't handle all possible
          * cases and merge the match load/store state with the original
          * load/store state.
          */
         BITSET_OR(match->orig_ls_state->pending_loads,
                   match->ls_state.pending_loads,
                   match->orig_ls_state->pending_loads);
         BITSET_OR(match->orig_ls_state->pending_stores,
                   match->ls_state.pending_stores,
                   match->orig_ls_state->pending_stores);
      } else {
         *match->orig_ls_state = match->ls_state;
      }

      b->conf.ls_tracker = match->orig_ls_state;
   }

   cs_set_label(b, &match->next_case_label);
   cs_set_label(b, &match->break_label);

   cs_block_end(b);
}

#define cs_match(__b, __val, __scratch)                                        \
   for (struct cs_match __match_storage,                                       \
        *__match = cs_match_start(__b, &__match_storage, __val, __scratch);    \
        __match != NULL; cs_match_end(__b, &__match_storage), __match = NULL)

#define cs_case(__b, __ref)                                                    \
   for (bool __case_defined = ({                                               \
           cs_match_case(__b, __match, __ref);                                 \
           false;                                                              \
        });                                                                    \
        !__case_defined; __case_defined = true)

#define cs_default(__b)                                                        \
   for (bool __default_defined = ({                                            \
           cs_match_default(__b, __match);                                     \
           false;                                                              \
        });                                                                    \
        !__default_defined; __default_defined = true)
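
/*
 * Sketch of the intended cs_match()/cs_case()/cs_default() usage (the case
 * values are arbitrary; scratch must be a free 32-bit register):
 *
 *    cs_match(&b, val, scratch) {
 *       cs_case(&b, 0) {
 *          ... emitted when val == 0 ...
 *       }
 *       cs_case(&b, 1) {
 *          ... emitted when val == 1 ...
 *       }
 *       cs_default(&b) {
 *          ... emitted for any other value ...
 *       }
 *    }
 */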