/*
 * Copyright (C) 2022 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#pragma once

#if !defined(PAN_ARCH) || PAN_ARCH < 10
#error "cs_builder.h requires PAN_ARCH >= 10"
#endif

#include "gen_macros.h"

#include "util/bitset.h"
#include "util/list.h"
#include "util/u_dynarray.h"

/*
 * cs_builder implements a builder for CSF command streams. It manages the
 * allocation and overflow behaviour of queues and provides helpers for
 * emitting commands to run on the CSF pipe.
 *
 * Users are responsible for the CS buffer allocation and must initialize the
 * command stream with an initial buffer using cs_builder_init(). The CS can
 * be extended with new buffers allocated with cs_builder_conf::alloc_buffer()
 * if the builder runs out of memory.
 */

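/*
 * Typical usage, as a minimal sketch: the buffer allocator, the device
 * cookie, the submit() call and all sizes/register numbers below are
 * hypothetical placeholders, not part of this API.
 *
 *    static struct cs_buffer
 *    my_alloc_buffer(void *cookie)
 *    {
 *       // Return CPU/GPU pointers and a capacity (in 64-bit instruction
 *       // slots) for a freshly allocated, GPU-visible buffer. Returning a
 *       // zeroed cs_buffer makes the builder mark the CS as invalid.
 *       return my_device_alloc_cs_chunk(cookie, 4096);
 *    }
 *
 *    struct cs_builder_conf conf = {
 *       .nr_registers = 96,
 *       .nr_kernel_registers = 4,
 *       .alloc_buffer = my_alloc_buffer,
 *       .cookie = my_device,
 *    };
 *    struct cs_builder b;
 *
 *    cs_builder_init(&b, &conf, my_alloc_buffer(my_device));
 *    cs_move32_to(&b, cs_reg32(&b, 0), 42);
 *    cs_finish(&b);
 *
 *    if (cs_is_valid(&b))
 *       submit(cs_root_chunk_gpu_addr(&b), cs_root_chunk_size(&b));
 */
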
struct cs_buffer {
   /* CPU pointer */
   uint64_t *cpu;

   /* GPU pointer */
   uint64_t gpu;

   /* Capacity in number of 64-bit instructions */
   uint32_t capacity;
};

/**
 * This is used to check that:
 * 1. registers are not used as a source after being loaded without a
 *    WAIT(<ls_scoreboard>) in the middle
 * 2. registers are not reused (used as a destination) after they served as a
 *    STORE() source without a WAIT(<ls_scoreboard>) in the middle
 */
struct cs_load_store_tracker {
   BITSET_DECLARE(pending_loads, 256);
   BITSET_DECLARE(pending_stores, 256);
   uint8_t sb_slot;
};
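
/*
 * Example of the hazard this catches, as a sketch (register number and
 * address register are arbitrary):
 *
 *    cs_load32_to(b, cs_reg32(b, 10), addr, 0);
 *    // Reading r10 here (e.g. cs_src32(b, cs_reg32(b, 10))) would assert,
 *    // because the load result has not been waited on yet.
 *    cs_wait_slot(b, ls_tracker->sb_slot, false);
 *    // r10 can now safely be used as a source.
 */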

enum cs_reg_perm {
   CS_REG_NO_ACCESS = 0,
   CS_REG_RD = BITFIELD_BIT(1),
   CS_REG_WR = BITFIELD_BIT(2),
   CS_REG_RW = CS_REG_RD | CS_REG_WR,
};

struct cs_builder;

typedef enum cs_reg_perm (*reg_perm_cb_t)(struct cs_builder *b, unsigned reg);

struct cs_builder_conf {
   /* Number of 32-bit registers in the hardware register file */
   uint8_t nr_registers;

   /* Number of 32-bit registers used by the kernel at submission time */
   uint8_t nr_kernel_registers;

   /* CS buffer allocator */
   struct cs_buffer (*alloc_buffer)(void *cookie);

   /* Optional load/store tracker. */
   struct cs_load_store_tracker *ls_tracker;

   /* Optional register access checker. */
   reg_perm_cb_t reg_perm;

   /* Cookie passed back to alloc_buffer() */
   void *cookie;
};

/* The CS is formed of one or more CS chunks linked with JUMP instructions.
 * The builder keeps track of the current chunk and the position inside this
 * chunk, so it can emit new instructions, and decide when a new chunk needs
 * to be allocated.
 */
struct cs_chunk {
   /* CS buffer object backing this chunk */
   struct cs_buffer buffer;

   union {
      /* Current position in the buffer object when the chunk is active. */
      uint32_t pos;

      /* Chunk size when the chunk was wrapped. */
      uint32_t size;
   };
};

/* Monolithic sequence of instructions. Must live in a virtually contiguous
 * portion of code.
 */
struct cs_block {
   /* Used to insert the block in the block stack. */
   struct list_head node;
};

#define CS_LABEL_INVALID_POS ~0u

/* Labels can only be used inside a cs_block. They can be defined and
 * referenced before they are set to point to a specific position
 * in the block. */
struct cs_label {
   /* The last reference we have seen pointing to this label before
    * it was set. If set to CS_LABEL_INVALID_POS, no forward reference
    * pointing to this label exists.
    */
   uint32_t last_forward_ref;

   /* The label target. If set to CS_LABEL_INVALID_POS, the label has
    * not been set yet.
    */
   uint32_t target;
};
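
/*
 * Example of a forward label reference inside a block, as a sketch (the
 * condition register is arbitrary):
 *
 *    struct cs_block block;
 *    struct cs_label skip;
 *
 *    cs_block_start(b, &block);
 *    cs_label_init(&skip);
 *    cs_branch_label(b, &skip, MALI_CS_CONDITION_EQUAL, cond_reg);
 *    ... instructions skipped when cond_reg == 0 ...
 *    cs_set_label(b, &skip);
 *    cs_block_end(b);
 */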

struct cs_builder {
   /* CS builder configuration */
   struct cs_builder_conf conf;

   /* True if an allocation failed, making the whole CS invalid. */
   bool invalid;

   /* Initial (root) CS chunk. */
   struct cs_chunk root_chunk;

   /* Current CS chunk. */
   struct cs_chunk cur_chunk;

   /* Temporary storage for inner blocks that need to be built
    * and copied in one monolithic sequence of instructions with no
    * jump in the middle.
    */
   struct {
      struct list_head stack;
      struct cs_block *cur;
      struct util_dynarray instrs;
   } blocks;

   /* Move-immediate instruction at the end of the previous CS chunk that
    * needs to be patched with the final length of the current CS chunk in
    * order to get correct overflow behaviour.
    */
   uint32_t *length_patch;

   /* Used as temporary storage when the allocator couldn't allocate a new
    * CS chunk.
    */
   uint64_t discard_instr_slot;
};
static inline void
cs_builder_init(struct cs_builder *b, const struct cs_builder_conf *conf,
                struct cs_buffer root_buffer)
{
   *b = (struct cs_builder){
      .conf = *conf,
      .root_chunk.buffer = root_buffer,
      .cur_chunk.buffer = root_buffer,
   };

   /* We need at least 3 registers for CS chunk linking. Assume the kernel needs
    * at least that too.
    */
   b->conf.nr_kernel_registers = MAX2(b->conf.nr_kernel_registers, 3);

   list_inithead(&b->blocks.stack);
   util_dynarray_init(&b->blocks.instrs, NULL);
}

static inline bool
cs_is_valid(struct cs_builder *b)
{
   return !b->invalid;
}

static inline bool
cs_is_empty(struct cs_builder *b)
{
   return b->cur_chunk.pos == 0 &&
          b->root_chunk.buffer.gpu == b->cur_chunk.buffer.gpu;
}

static inline uint64_t
cs_root_chunk_gpu_addr(struct cs_builder *b)
{
   return b->root_chunk.buffer.gpu;
}

static inline uint32_t
cs_root_chunk_size(struct cs_builder *b)
{
   /* Make sure cs_finish() was called. */
   assert(!memcmp(&b->cur_chunk, &(struct cs_chunk){0}, sizeof(b->cur_chunk)));

   return b->root_chunk.size * sizeof(uint64_t);
}

/*
 * Wrap the current queue. External users shouldn't call this function
 * directly, they should call cs_finish() when they are done building
 * the command stream, which will in turn call cs_wrap_chunk().
 *
 * Internally, this is also used to finalize internal CS chunks when
 * allocating new sub-chunks. See cs_alloc_ins() for details.
 *
 * This notably requires patching the previous chunk with the length
 * we ended up emitting for this chunk.
 */
static inline void
cs_wrap_chunk(struct cs_builder *b)
{
   if (!cs_is_valid(b))
      return;

   if (b->length_patch) {
      *b->length_patch = (b->cur_chunk.pos * 8);
      b->length_patch = NULL;
   }

   if (b->root_chunk.buffer.gpu == b->cur_chunk.buffer.gpu)
      b->root_chunk.size = b->cur_chunk.size;
}

/* Call this when you are done building a command stream and want to prepare
 * it for submission.
 */
static inline void
cs_finish(struct cs_builder *b)
{
   if (!cs_is_valid(b))
      return;

   cs_wrap_chunk(b);

   /* This prevents adding instructions after that point. */
   memset(&b->cur_chunk, 0, sizeof(b->cur_chunk));

   util_dynarray_fini(&b->blocks.instrs);
}

enum cs_index_type {
   CS_INDEX_REGISTER = 0,
   CS_INDEX_UNDEF,
};

struct cs_index {
   enum cs_index_type type;

   /* Number of 32-bit words in the index, must be nonzero */
   uint8_t size;

   union {
      uint64_t imm;
      uint8_t reg;
   };
};

static inline struct cs_index
cs_undef(void)
{
   return (struct cs_index){
      .type = CS_INDEX_UNDEF,
   };
}

static inline uint8_t
cs_to_reg_tuple(struct cs_index idx, ASSERTED unsigned expected_size)
{
   assert(idx.type == CS_INDEX_REGISTER);
   assert(idx.size == expected_size);

   return idx.reg;
}

static inline unsigned
cs_src_tuple(struct cs_builder *b, struct cs_index src, ASSERTED unsigned count)
{
   unsigned reg = cs_to_reg_tuple(src, count);

   if (unlikely(b->conf.reg_perm)) {
      for (unsigned i = reg; i < reg + count; i++) {
         assert((b->conf.reg_perm(b, i) & CS_REG_RD) ||
                !"Trying to read a restricted register");
      }
   }

   struct cs_load_store_tracker *ls_tracker = b->conf.ls_tracker;

   if (unlikely(ls_tracker)) {
      for (unsigned i = reg; i < reg + count; i++) {
         if (BITSET_TEST(ls_tracker->pending_loads, i))
            assert(!"register used as a source before flushing loads\n");
      }
   }

   return reg;
}

static inline unsigned
cs_src32(struct cs_builder *b, struct cs_index src)
{
   return cs_src_tuple(b, src, 1);
}

static inline unsigned
cs_src64(struct cs_builder *b, struct cs_index src)
{
   return cs_src_tuple(b, src, 2);
}

static inline unsigned
cs_dst_tuple(struct cs_builder *b, struct cs_index dst, ASSERTED unsigned count)
{
   unsigned reg = cs_to_reg_tuple(dst, count);

   if (unlikely(b->conf.reg_perm)) {
      for (unsigned i = reg; i < reg + count; i++) {
         assert((b->conf.reg_perm(b, i) & CS_REG_WR) ||
                !"Trying to write a restricted register");
      }
   }

   struct cs_load_store_tracker *ls_tracker = b->conf.ls_tracker;

   if (unlikely(ls_tracker)) {
      for (unsigned i = reg; i < reg + count; i++) {
         if (BITSET_TEST(ls_tracker->pending_stores, i))
            assert(
               !"register reused as a destination before flushing stores\n");
      }
   }

   return reg;
}

static inline unsigned
cs_dst32(struct cs_builder *b, struct cs_index dst)
{
   return cs_dst_tuple(b, dst, 1);
}

static inline unsigned
cs_dst64(struct cs_builder *b, struct cs_index dst)
{
   return cs_dst_tuple(b, dst, 2);
}

static inline struct cs_index
cs_reg_tuple(ASSERTED struct cs_builder *b, unsigned reg, unsigned size)
{
   assert(reg + size <= b->conf.nr_registers - b->conf.nr_kernel_registers &&
          "overflowed register file");
   assert(size <= 16 && "unsupported");

   return (struct cs_index){
      .type = CS_INDEX_REGISTER,
      .size = size,
      .reg = reg,
   };
}

static inline struct cs_index
cs_reg32(struct cs_builder *b, unsigned reg)
{
   return cs_reg_tuple(b, reg, 1);
}

static inline struct cs_index
cs_reg64(struct cs_builder *b, unsigned reg)
{
   assert((reg % 2) == 0 && "unaligned 64-bit reg");
   return cs_reg_tuple(b, reg, 2);
}

/*
 * The top of the register file is reserved for cs_builder internal use. We
 * need 3 spare registers for handling command queue overflow. These are
 * available here.
 */
static inline uint8_t
cs_overflow_address_reg(struct cs_builder *b)
{
   return b->conf.nr_registers - 2;
}

static inline uint8_t
cs_overflow_length_reg(struct cs_builder *b)
{
   return b->conf.nr_registers - 3;
}

static inline struct cs_index
cs_extract32(struct cs_builder *b, struct cs_index idx, unsigned word)
{
   assert(idx.type == CS_INDEX_REGISTER && "unsupported");
   assert(word < idx.size && "overrun");

   return cs_reg32(b, idx.reg + word);
}

#define JUMP_SEQ_INSTR_COUNT 4

static inline void *
cs_alloc_ins(struct cs_builder *b, uint32_t num_instrs)
{
   /* If an allocation failure happened before, we just discard all following
    * instructions.
    */
   if (unlikely(!cs_is_valid(b)))
      return &b->discard_instr_slot;

   if (b->blocks.cur)
      return util_dynarray_grow(&b->blocks.instrs, uint64_t, num_instrs);

   /* Lazy root chunk allocation. */
   if (unlikely(!b->root_chunk.buffer.cpu)) {
      b->root_chunk.buffer = b->conf.alloc_buffer(b->conf.cookie);
      b->cur_chunk.buffer = b->root_chunk.buffer;
      if (!b->cur_chunk.buffer.cpu) {
         b->invalid = true;
         return &b->discard_instr_slot;
      }
   }

   /* If the current chunk runs out of space, allocate a new one and jump to it.
    * We actually do this a few instructions before running out, because the
    * sequence to jump to a new queue takes multiple instructions.
    */
   if (unlikely((b->cur_chunk.size + num_instrs + JUMP_SEQ_INSTR_COUNT) >
                b->cur_chunk.buffer.capacity)) {
      /* Now, allocate a new chunk */
      struct cs_buffer newbuf = b->conf.alloc_buffer(b->conf.cookie);

      /* Allocation failure: mark the CS invalid so that all new instructions
       * will be discarded from now on.
       */
      if (unlikely(!newbuf.cpu)) {
         b->invalid = true;
         return &b->discard_instr_slot;
      }

      uint64_t *ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++);

      pan_pack(ptr, CS_MOVE, I) {
         I.destination = cs_overflow_address_reg(b);
         I.immediate = newbuf.gpu;
      }

      ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++);

      pan_pack(ptr, CS_MOVE32, I) {
         I.destination = cs_overflow_length_reg(b);
      }

      /* The length will be patched in later */
      uint32_t *length_patch = (uint32_t *)ptr;

      ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++);

      pan_pack(ptr, CS_JUMP, I) {
         I.length = cs_overflow_length_reg(b);
         I.address = cs_overflow_address_reg(b);
      }

      /* Now that we've emitted everything, finish up the previous queue */
      cs_wrap_chunk(b);

      /* And make this one current */
      b->length_patch = length_patch;
      b->cur_chunk.buffer = newbuf;
      b->cur_chunk.pos = 0;
   }

   assert(b->cur_chunk.size + num_instrs - 1 < b->cur_chunk.buffer.capacity);
   uint32_t pos = b->cur_chunk.pos;
   b->cur_chunk.pos += num_instrs;
   return b->cur_chunk.buffer.cpu + pos;
}

/*
 * Helper to emit a new instruction into the command queue. The allocation
 * needs to be separated out because pan_pack() can evaluate its argument
 * multiple times, yet cs_alloc_ins() has side effects.
 */
#define cs_emit(b, T, cfg) pan_pack(cs_alloc_ins(b, 1), CS_##T, cfg)

/* Asynchronous operations take a mask of scoreboard slots to wait on
 * before executing the instruction, and signal a scoreboard slot when
 * the operation is complete.
 * A wait_mask of zero means the operation is synchronous, and signal_slot
 * is ignored in that case.
 */
struct cs_async_op {
   uint16_t wait_mask;
   uint8_t signal_slot;
};

static inline struct cs_async_op
cs_defer(unsigned wait_mask, unsigned signal_slot)
{
   /* The scoreboard slot to signal is incremented before the wait operation,
    * so waiting on it would cause an infinite wait.
    */
   assert(!(wait_mask & BITFIELD_BIT(signal_slot)));

   return (struct cs_async_op){
      .wait_mask = wait_mask,
      .signal_slot = signal_slot,
   };
}

static inline struct cs_async_op
cs_now(void)
{
   return (struct cs_async_op){
      .wait_mask = 0,
      .signal_slot = ~0,
   };
}
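
/*
 * Example, as a sketch (the scoreboard slot numbers, scope, registers and
 * address are placeholders): defer a sync-object update until the work
 * tracked on scoreboard slot 2 has finished, signaling slot 3 when the
 * update itself lands, instead of doing it synchronously with cs_now():
 *
 *    cs_sync32_add(b, true, scope, one_reg, syncobj_addr,
 *                  cs_defer(BITFIELD_BIT(2), 3));
 */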

static inline bool
cs_instr_is_asynchronous(enum mali_cs_opcode opcode, uint16_t wait_mask)
{
   switch (opcode) {
   case MALI_CS_OPCODE_FLUSH_CACHE2:
   case MALI_CS_OPCODE_FINISH_TILING:
   case MALI_CS_OPCODE_LOAD_MULTIPLE:
   case MALI_CS_OPCODE_STORE_MULTIPLE:
   case MALI_CS_OPCODE_RUN_COMPUTE:
   case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT:
   case MALI_CS_OPCODE_RUN_FRAGMENT:
   case MALI_CS_OPCODE_RUN_FULLSCREEN:
   case MALI_CS_OPCODE_RUN_IDVS:
   case MALI_CS_OPCODE_RUN_TILING:
      /* Always asynchronous. */
      return true;

   case MALI_CS_OPCODE_FINISH_FRAGMENT:
   case MALI_CS_OPCODE_SYNC_ADD32:
   case MALI_CS_OPCODE_SYNC_SET32:
   case MALI_CS_OPCODE_SYNC_ADD64:
   case MALI_CS_OPCODE_SYNC_SET64:
   case MALI_CS_OPCODE_STORE_STATE:
   case MALI_CS_OPCODE_TRACE_POINT:
   case MALI_CS_OPCODE_HEAP_OPERATION:
      /* Asynchronous only if wait_mask != 0. */
      return wait_mask != 0;

   default:
      return false;
   }
}

#define cs_apply_async(I, async)                                               \
   do {                                                                        \
      I.wait_mask = async.wait_mask;                                           \
      I.signal_slot = cs_instr_is_asynchronous(I.opcode, I.wait_mask)          \
                         ? async.signal_slot                                   \
                         : 0;                                                  \
      assert(I.signal_slot != ~0 ||                                            \
             !"Can't use cs_now() on pure async instructions");                \
   } while (0)

static inline void
cs_move32_to(struct cs_builder *b, struct cs_index dest, unsigned imm)
{
   cs_emit(b, MOVE32, I) {
      I.destination = cs_dst32(b, dest);
      I.immediate = imm;
   }
}

static inline void
cs_move48_to(struct cs_builder *b, struct cs_index dest, uint64_t imm)
{
   cs_emit(b, MOVE, I) {
      I.destination = cs_dst64(b, dest);
      I.immediate = imm;
   }
}

static inline void
cs_block_start(struct cs_builder *b, struct cs_block *block)
{
   list_addtail(&block->node, &b->blocks.stack);
   b->blocks.cur = block;
}

static inline void
cs_block_end(struct cs_builder *b)
{
   assert(b->blocks.cur);

   list_del(&b->blocks.cur->node);

   if (!list_is_empty(&b->blocks.stack)) {
      b->blocks.cur = list_last_entry(&b->blocks.stack, struct cs_block, node);
      return;
   }

   b->blocks.cur = NULL;

   uint32_t num_instrs =
      util_dynarray_num_elements(&b->blocks.instrs, uint64_t);
   void *buffer = cs_alloc_ins(b, num_instrs);

   memcpy(buffer, b->blocks.instrs.data, b->blocks.instrs.size);
   util_dynarray_clear(&b->blocks.instrs);
}

static inline uint32_t
cs_block_next_pos(struct cs_builder *b)
{
   assert(b->blocks.cur);

   return util_dynarray_num_elements(&b->blocks.instrs, uint64_t);
}

static inline void
cs_branch(struct cs_builder *b, int offset, enum mali_cs_condition cond,
          struct cs_index val)
{
   cs_emit(b, BRANCH, I) {
      I.offset = offset;
      I.condition = cond;
      I.value = cs_src32(b, val);
   }
}

static inline void
cs_branch_label(struct cs_builder *b, struct cs_label *label,
                enum mali_cs_condition cond, struct cs_index val)
{
   assert(b->blocks.cur);

   if (label->target == CS_LABEL_INVALID_POS) {
      uint32_t branch_ins_pos = cs_block_next_pos(b);

      /* Instead of emitting a BRANCH with the final offset, we record the
       * diff between the current branch and the previous branch that was
       * referencing this unset label. This way we build a singly-linked list
       * that can be walked when the label is set with cs_set_label().
       * We use -1 as the end-of-list marker.
       */
      int16_t offset = -1;
      if (label->last_forward_ref != CS_LABEL_INVALID_POS) {
         assert(label->last_forward_ref < branch_ins_pos);
         assert(branch_ins_pos - label->last_forward_ref <= INT16_MAX);
         offset = branch_ins_pos - label->last_forward_ref;
      }

      cs_emit(b, BRANCH, I) {
         I.offset = offset;
         I.condition = cond;
         I.value = cond != MALI_CS_CONDITION_ALWAYS ? cs_src32(b, val) : 0;
      }

      label->last_forward_ref = branch_ins_pos;
   } else {
      int32_t offset = label->target - cs_block_next_pos(b) - 1;

      /* The branch target is encoded as a 16-bit signed integer, make sure we
       * don't underflow.
       */
      assert(offset >= INT16_MIN);

      /* Backward references are easy, we can emit them immediately. */
      cs_emit(b, BRANCH, I) {
         I.offset = offset;
         I.condition = cond;
         I.value = cs_src32(b, val);
      }
   }
}

static inline void
cs_label_init(struct cs_label *label)
{
   label->last_forward_ref = CS_LABEL_INVALID_POS;
   label->target = CS_LABEL_INVALID_POS;
}

static inline void
cs_set_label(struct cs_builder *b, struct cs_label *label)
{
   assert(label->target == CS_LABEL_INVALID_POS);
   label->target = cs_block_next_pos(b);

   for (uint32_t next_forward_ref, forward_ref = label->last_forward_ref;
        forward_ref != CS_LABEL_INVALID_POS; forward_ref = next_forward_ref) {
      uint64_t *ins =
         util_dynarray_element(&b->blocks.instrs, uint64_t, forward_ref);

      assert(forward_ref < label->target);
      assert(label->target - forward_ref <= INT16_MAX);

      /* Save the next forward reference to this target before overwriting
       * it with the final offset.
       */
      int16_t offset = *ins & BITFIELD64_MASK(16);

      next_forward_ref =
         offset > 0 ? forward_ref - offset : CS_LABEL_INVALID_POS;

      assert(next_forward_ref == CS_LABEL_INVALID_POS ||
             next_forward_ref < forward_ref);

      *ins &= ~BITFIELD64_MASK(16);
      *ins |= label->target - forward_ref - 1;
   }
}

struct cs_loop {
   struct cs_label start, end;
   struct cs_block block;
   enum mali_cs_condition cond;
   struct cs_index val;
   struct cs_load_store_tracker *orig_ls_state;
   struct cs_load_store_tracker ls_state;
};

static inline enum mali_cs_condition
cs_invert_cond(enum mali_cs_condition cond)
{
   switch (cond) {
   case MALI_CS_CONDITION_LEQUAL:
      return MALI_CS_CONDITION_GREATER;
   case MALI_CS_CONDITION_EQUAL:
      return MALI_CS_CONDITION_NEQUAL;
   case MALI_CS_CONDITION_LESS:
      return MALI_CS_CONDITION_GEQUAL;
   case MALI_CS_CONDITION_GREATER:
      return MALI_CS_CONDITION_LEQUAL;
   case MALI_CS_CONDITION_NEQUAL:
      return MALI_CS_CONDITION_EQUAL;
   case MALI_CS_CONDITION_GEQUAL:
      return MALI_CS_CONDITION_LESS;
   case MALI_CS_CONDITION_ALWAYS:
      unreachable("cannot invert ALWAYS");
   default:
      unreachable("invalid cond");
   }
}

static inline void
cs_loop_diverge_ls_update(struct cs_builder *b, struct cs_loop *loop)
{
   if (likely(!b->conf.ls_tracker))
      return;

   if (!loop->orig_ls_state) {
      loop->orig_ls_state = b->conf.ls_tracker;
      loop->ls_state = *loop->orig_ls_state;
      b->conf.ls_tracker = &loop->ls_state;
   } else {
      BITSET_OR(loop->orig_ls_state->pending_loads,
                loop->orig_ls_state->pending_loads,
                loop->ls_state.pending_loads);
      BITSET_OR(loop->orig_ls_state->pending_stores,
                loop->orig_ls_state->pending_stores,
                loop->ls_state.pending_stores);
   }
}

static inline struct cs_loop *
cs_do_while_start(struct cs_builder *b, struct cs_loop *loop,
                  enum mali_cs_condition cond, struct cs_index val)
{
   *loop = (struct cs_loop){
      .cond = cond,
      .val = val,
   };

   cs_block_start(b, &loop->block);
   cs_label_init(&loop->start);
   cs_label_init(&loop->end);
   cs_set_label(b, &loop->start);
   return loop;
}

static inline struct cs_loop *
cs_while_start(struct cs_builder *b, struct cs_loop *loop,
               enum mali_cs_condition cond, struct cs_index val)
{
   cs_do_while_start(b, loop, cond, val);

   /* Do an initial check on the condition, and if it's false, jump to
    * the end of the loop block. For 'while(true)' loops, skip the
    * conditional branch.
    */
   if (cond != MALI_CS_CONDITION_ALWAYS) {
      cs_branch_label(b, &loop->end, cs_invert_cond(cond), val);
      cs_loop_diverge_ls_update(b, loop);
   }

   return loop;
}

static inline void
cs_loop_continue(struct cs_builder *b, enum mali_cs_condition cond,
                 struct cs_index val)
{
   assert(b->blocks.cur);

   struct cs_loop *loop = container_of(b->blocks.cur, struct cs_loop, block);

   cs_branch_label(b, &loop->start, cond, val);
   cs_loop_diverge_ls_update(b, loop);
}

static inline void
cs_loop_break(struct cs_builder *b, enum mali_cs_condition cond,
              struct cs_index val)
{
   assert(b->blocks.cur);

   struct cs_loop *loop = container_of(b->blocks.cur, struct cs_loop, block);

   cs_branch_label(b, &loop->end, cond, val);
   cs_loop_diverge_ls_update(b, loop);
}

static inline void
cs_while_end(struct cs_builder *b)
{
   assert(b->blocks.cur);

   struct cs_loop *loop = container_of(b->blocks.cur, struct cs_loop, block);

   cs_branch_label(b, &loop->start, loop->cond, loop->val);
   cs_set_label(b, &loop->end);
   cs_block_end(b);

   if (unlikely(loop->orig_ls_state)) {
      BITSET_OR(loop->orig_ls_state->pending_loads,
                loop->orig_ls_state->pending_loads,
                loop->ls_state.pending_loads);
      BITSET_OR(loop->orig_ls_state->pending_stores,
                loop->orig_ls_state->pending_stores,
                loop->ls_state.pending_stores);
      b->conf.ls_tracker = loop->orig_ls_state;
   }
}

#define cs_while(__b, cond, val)                                               \
   for (struct cs_loop __loop_storage,                                         \
        *__loop = cs_while_start(__b, &__loop_storage, cond, val);             \
        __loop != NULL; cs_while_end(__b), __loop = NULL)
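
/*
 * Example loop decrementing a counter register to zero, as a sketch (the
 * register number and iteration count are arbitrary):
 *
 *    struct cs_index counter = cs_reg32(b, 4);
 *
 *    cs_move32_to(b, counter, 16);
 *    cs_while(b, MALI_CS_CONDITION_GREATER, counter) {
 *       ... loop body ...
 *       cs_add32(b, counter, counter, -1);
 *    }
 */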

/* Pseudoinstructions follow */

static inline void
cs_move64_to(struct cs_builder *b, struct cs_index dest, uint64_t imm)
{
   if (imm < (1ull << 48)) {
      /* Zero extends */
      cs_move48_to(b, dest, imm);
   } else {
      cs_move32_to(b, cs_extract32(b, dest, 0), imm);
      cs_move32_to(b, cs_extract32(b, dest, 1), imm >> 32);
   }
}

static inline void
cs_wait_slots(struct cs_builder *b, unsigned wait_mask, bool progress_inc)
{
   struct cs_load_store_tracker *ls_tracker = b->conf.ls_tracker;

   cs_emit(b, WAIT, I) {
      I.wait_mask = wait_mask;
      I.progress_increment = progress_inc;
   }

   /* We don't do advanced tracking of cs_defer(), and assume that
    * load/store will be flushed with an explicit wait on the load/store
    * scoreboard. */
   if (unlikely(ls_tracker) &&
       (wait_mask & BITFIELD_BIT(ls_tracker->sb_slot))) {
      BITSET_CLEAR_RANGE(ls_tracker->pending_loads, 0, 255);
      BITSET_CLEAR_RANGE(ls_tracker->pending_stores, 0, 255);
   }
}

static inline void
cs_wait_slot(struct cs_builder *b, unsigned slot, bool progress_inc)
{
   assert(slot < 8 && "invalid slot");

   cs_wait_slots(b, BITFIELD_BIT(slot), progress_inc);
}

struct cs_shader_res_sel {
   uint8_t srt, fau, spd, tsd;
};

static inline struct cs_shader_res_sel
cs_shader_res_sel(unsigned srt, unsigned fau, unsigned spd, unsigned tsd)
{
   return (struct cs_shader_res_sel){
      .srt = srt,
      .fau = fau,
      .spd = spd,
      .tsd = tsd,
   };
}

static inline void
cs_run_compute(struct cs_builder *b, unsigned task_increment,
               enum mali_task_axis task_axis, bool progress_inc,
               struct cs_shader_res_sel res_sel)
{
   cs_emit(b, RUN_COMPUTE, I) {
      I.task_increment = task_increment;
      I.task_axis = task_axis;
      I.progress_increment = progress_inc;
      I.srt_select = res_sel.srt;
      I.spd_select = res_sel.spd;
      I.tsd_select = res_sel.tsd;
      I.fau_select = res_sel.fau;
   }
}

static inline void
cs_run_tiling(struct cs_builder *b, uint32_t flags_override, bool progress_inc,
              struct cs_shader_res_sel res_sel)
{
   cs_emit(b, RUN_TILING, I) {
      I.flags_override = flags_override;
      I.progress_increment = progress_inc;
      I.srt_select = res_sel.srt;
      I.spd_select = res_sel.spd;
      I.tsd_select = res_sel.tsd;
      I.fau_select = res_sel.fau;
   }
}

static inline void
cs_run_idvs(struct cs_builder *b, uint32_t flags_override, bool progress_inc,
            bool malloc_enable, struct cs_shader_res_sel varying_sel,
            struct cs_shader_res_sel frag_sel, struct cs_index draw_id)
{
   cs_emit(b, RUN_IDVS, I) {
      I.flags_override = flags_override;
      I.progress_increment = progress_inc;
      I.malloc_enable = malloc_enable;

      if (draw_id.type == CS_INDEX_UNDEF) {
         I.draw_id_register_enable = false;
      } else {
         I.draw_id_register_enable = true;
         I.draw_id = cs_src32(b, draw_id);
      }

      assert(varying_sel.spd == 1);
      assert(varying_sel.fau == 0 || varying_sel.fau == 1);
      assert(varying_sel.srt == 0 || varying_sel.srt == 1);
      assert(varying_sel.tsd == 0 || varying_sel.tsd == 1);
      I.varying_fau_select = varying_sel.fau == 1;
      I.varying_srt_select = varying_sel.srt == 1;
      I.varying_tsd_select = varying_sel.tsd == 1;

      assert(frag_sel.spd == 2);
      assert(frag_sel.fau == 2);
      assert(frag_sel.srt == 2 || frag_sel.srt == 0);
      assert(frag_sel.tsd == 2 || frag_sel.tsd == 0);
      I.fragment_srt_select = frag_sel.srt == 2;
      I.fragment_tsd_select = frag_sel.tsd == 2;
   }
}

static inline void
cs_run_fragment(struct cs_builder *b, bool enable_tem,
                enum mali_tile_render_order tile_order, bool progress_inc)
{
   cs_emit(b, RUN_FRAGMENT, I) {
      I.enable_tem = enable_tem;
      I.tile_order = tile_order;
      I.progress_increment = progress_inc;
   }
}

static inline void
cs_run_fullscreen(struct cs_builder *b, uint32_t flags_override,
                  bool progress_inc, struct cs_index dcd)
{
   cs_emit(b, RUN_FULLSCREEN, I) {
      I.flags_override = flags_override;
      I.progress_increment = progress_inc;
      I.dcd = cs_src64(b, dcd);
   }
}

static inline void
cs_finish_tiling(struct cs_builder *b, bool progress_inc)
{
   cs_emit(b, FINISH_TILING, I)
      I.progress_increment = progress_inc;
}

static inline void
cs_finish_fragment(struct cs_builder *b, bool increment_frag_completed,
                   struct cs_index first_free_heap_chunk,
                   struct cs_index last_free_heap_chunk,
                   struct cs_async_op async)
{
   cs_emit(b, FINISH_FRAGMENT, I) {
      I.increment_fragment_completed = increment_frag_completed;
      cs_apply_async(I, async);
      I.first_heap_chunk = cs_src64(b, first_free_heap_chunk);
      I.last_heap_chunk = cs_src64(b, last_free_heap_chunk);
   }
}

static inline void
cs_add32(struct cs_builder *b, struct cs_index dest, struct cs_index src,
         unsigned imm)
{
   cs_emit(b, ADD_IMMEDIATE32, I) {
      I.destination = cs_dst32(b, dest);
      I.source = cs_src32(b, src);
      I.immediate = imm;
   }
}

static inline void
cs_add64(struct cs_builder *b, struct cs_index dest, struct cs_index src,
         unsigned imm)
{
   cs_emit(b, ADD_IMMEDIATE64, I) {
      I.destination = cs_dst64(b, dest);
      I.source = cs_src64(b, src);
      I.immediate = imm;
   }
}

static inline void
cs_umin32(struct cs_builder *b, struct cs_index dest, struct cs_index src1,
          struct cs_index src2)
{
   cs_emit(b, UMIN32, I) {
      I.destination = cs_dst32(b, dest);
      I.source_1 = cs_src32(b, src1);
      I.source_2 = cs_src32(b, src2);
   }
}

static inline void
cs_load_to(struct cs_builder *b, struct cs_index dest, struct cs_index address,
           unsigned mask, int offset)
{
   unsigned count = util_last_bit(mask);
   unsigned base_reg = cs_dst_tuple(b, dest, count);

   cs_emit(b, LOAD_MULTIPLE, I) {
      I.base_register = base_reg;
      I.address = cs_src64(b, address);
      I.mask = mask;
      I.offset = offset;
   }

   if (unlikely(b->conf.ls_tracker)) {
      for (unsigned i = 0; i < count; i++) {
         if (mask & BITFIELD_BIT(i))
            BITSET_SET(b->conf.ls_tracker->pending_loads, base_reg + i);
      }
   }
}

static inline void
cs_load32_to(struct cs_builder *b, struct cs_index dest,
             struct cs_index address, int offset)
{
   cs_load_to(b, dest, address, BITFIELD_MASK(1), offset);
}

static inline void
cs_load64_to(struct cs_builder *b, struct cs_index dest,
             struct cs_index address, int offset)
{
   cs_load_to(b, dest, address, BITFIELD_MASK(2), offset);
}
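
/*
 * Loads complete asynchronously: wait on the load/store scoreboard slot
 * before consuming the destination registers. A minimal sketch, assuming
 * slot 0 is the load/store scoreboard entry and the registers/addresses
 * are placeholders:
 *
 *    cs_load64_to(b, cs_reg64(b, 10), desc_addr, 0);
 *    cs_wait_slot(b, 0, false);
 *    cs_call(b, cs_reg64(b, 10), length_reg);
 */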

static inline void
cs_store(struct cs_builder *b, struct cs_index data, struct cs_index address,
         unsigned mask, int offset)
{
   unsigned count = util_last_bit(mask);
   unsigned base_reg = cs_src_tuple(b, data, count);

   cs_emit(b, STORE_MULTIPLE, I) {
      I.base_register = base_reg;
      I.address = cs_src64(b, address);
      I.mask = mask;
      I.offset = offset;
   }

   if (unlikely(b->conf.ls_tracker)) {
      for (unsigned i = 0; i < count; i++) {
         if (mask & BITFIELD_BIT(i))
            BITSET_SET(b->conf.ls_tracker->pending_stores, base_reg + i);
      }
   }
}

static inline void
cs_store32(struct cs_builder *b, struct cs_index data, struct cs_index address,
           int offset)
{
   cs_store(b, data, address, BITFIELD_MASK(1), offset);
}

static inline void
cs_store64(struct cs_builder *b, struct cs_index data, struct cs_index address,
           int offset)
{
   cs_store(b, data, address, BITFIELD_MASK(2), offset);
}

/*
 * Select which scoreboard entry will track endpoint tasks and other tasks
 * respectively. Pass the slots to cs_wait_slot()/cs_wait_slots() to wait on
 * them later.
 */
static inline void
cs_set_scoreboard_entry(struct cs_builder *b, unsigned ep, unsigned other)
{
   assert(ep < 8 && "invalid slot");
   assert(other < 8 && "invalid slot");

   cs_emit(b, SET_SB_ENTRY, I) {
      I.endpoint_entry = ep;
      I.other_entry = other;
   }

   /* We assume the load/store scoreboard entry is static to keep things
    * simple. */
   if (unlikely(b->conf.ls_tracker))
      assert(b->conf.ls_tracker->sb_slot == other);
}
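
/*
 * Example, as a sketch (slot numbers are arbitrary): track endpoint work on
 * slot 0 and other operations (loads/stores, ...) on slot 1, then wait for
 * the latter:
 *
 *    cs_set_scoreboard_entry(b, 0, 1);
 *    ...
 *    cs_wait_slot(b, 1, false);
 */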

static inline void
cs_progress_wait(struct cs_builder *b, unsigned queue, struct cs_index ref)
{
   cs_emit(b, PROGRESS_WAIT, I) {
      I.source = cs_src64(b, ref);
      I.queue = queue;
   }
}

static inline void
cs_set_exception_handler(struct cs_builder *b,
                         enum mali_cs_exception_type exception_type,
                         struct cs_index address, struct cs_index length)
{
   cs_emit(b, SET_EXCEPTION_HANDLER, I) {
      I.exception_type = exception_type;
      I.address = cs_src64(b, address);
      I.length = cs_src32(b, length);
   }
}

static inline void
cs_call(struct cs_builder *b, struct cs_index address, struct cs_index length)
{
   cs_emit(b, CALL, I) {
      I.address = cs_src64(b, address);
      I.length = cs_src32(b, length);
   }
}

static inline void
cs_jump(struct cs_builder *b, struct cs_index address, struct cs_index length)
{
   cs_emit(b, JUMP, I) {
      I.address = cs_src64(b, address);
      I.length = cs_src32(b, length);
   }
}

enum cs_res_id {
   CS_COMPUTE_RES = BITFIELD_BIT(0),
   CS_FRAG_RES = BITFIELD_BIT(1),
   CS_TILER_RES = BITFIELD_BIT(2),
   CS_IDVS_RES = BITFIELD_BIT(3),
};

static inline void
cs_req_res(struct cs_builder *b, u32 res_mask)
{
   cs_emit(b, REQ_RESOURCE, I) {
      I.compute = res_mask & CS_COMPUTE_RES;
      I.tiler = res_mask & CS_TILER_RES;
      I.idvs = res_mask & CS_IDVS_RES;
      I.fragment = res_mask & CS_FRAG_RES;
   }
}

static inline void
cs_flush_caches(struct cs_builder *b, enum mali_cs_flush_mode l2,
                enum mali_cs_flush_mode lsc, bool other_inv,
                struct cs_index flush_id, struct cs_async_op async)
{
   cs_emit(b, FLUSH_CACHE2, I) {
      I.l2_flush_mode = l2;
      I.lsc_flush_mode = lsc;
      I.other_invalidate = other_inv;
      I.latest_flush_id = cs_src32(b, flush_id);
      cs_apply_async(I, async);
   }
}

#define CS_SYNC_OPS(__cnt_width)                                               \
   static inline void cs_sync##__cnt_width##_set(                              \
      struct cs_builder *b, bool propagate_error,                              \
      enum mali_cs_sync_scope scope, struct cs_index val,                      \
      struct cs_index addr, struct cs_async_op async)                          \
   {                                                                           \
      cs_emit(b, SYNC_SET##__cnt_width, I) {                                   \
         I.error_propagate = propagate_error;                                  \
         I.scope = scope;                                                      \
         I.data = cs_src##__cnt_width(b, val);                                 \
         I.address = cs_src64(b, addr);                                        \
         cs_apply_async(I, async);                                             \
      }                                                                        \
   }                                                                           \
                                                                               \
   static inline void cs_sync##__cnt_width##_add(                              \
      struct cs_builder *b, bool propagate_error,                              \
      enum mali_cs_sync_scope scope, struct cs_index val,                      \
      struct cs_index addr, struct cs_async_op async)                          \
   {                                                                           \
      cs_emit(b, SYNC_ADD##__cnt_width, I) {                                   \
         I.error_propagate = propagate_error;                                  \
         I.scope = scope;                                                      \
         I.data = cs_src##__cnt_width(b, val);                                 \
         I.address = cs_src64(b, addr);                                        \
         cs_apply_async(I, async);                                             \
      }                                                                        \
   }                                                                           \
                                                                               \
   static inline void cs_sync##__cnt_width##_wait(                             \
      struct cs_builder *b, bool reject_error, enum mali_cs_condition cond,    \
      struct cs_index ref, struct cs_index addr)                               \
   {                                                                           \
      assert(cond == MALI_CS_CONDITION_LEQUAL ||                               \
             cond == MALI_CS_CONDITION_GREATER);                               \
      cs_emit(b, SYNC_WAIT##__cnt_width, I) {                                  \
         I.error_reject = reject_error;                                        \
         I.condition = cond;                                                   \
         I.data = cs_src##__cnt_width(b, ref);                                 \
         I.address = cs_src64(b, addr);                                        \
      }                                                                        \
   }

CS_SYNC_OPS(32)
CS_SYNC_OPS(64)
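
/*
 * The macro above expands to cs_sync{32,64}_set/add/wait helpers. A minimal
 * sketch (scope, registers and addresses are placeholders):
 *
 *    cs_sync64_add(b, true, scope, one_reg, syncobj_addr, cs_now());
 *    cs_sync64_wait(b, false, MALI_CS_CONDITION_GREATER, ref_reg,
 *                   syncobj_addr);
 */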

static inline void
cs_store_state(struct cs_builder *b, struct cs_index address, int offset,
               enum mali_cs_state state, struct cs_async_op async)
{
   cs_emit(b, STORE_STATE, I) {
      I.offset = offset;
      I.state = state;
      I.address = cs_src64(b, address);
      cs_apply_async(I, async);
   }
}

static inline void
cs_prot_region(struct cs_builder *b, unsigned size)
{
   cs_emit(b, PROT_REGION, I) {
      I.size = size;
   }
}

static inline void
cs_progress_store(struct cs_builder *b, struct cs_index src)
{
   cs_emit(b, PROGRESS_STORE, I)
      I.source = cs_src64(b, src);
}

static inline void
cs_progress_load(struct cs_builder *b, struct cs_index dst)
{
   cs_emit(b, PROGRESS_LOAD, I)
      I.destination = cs_dst64(b, dst);
}

static inline void
cs_run_compute_indirect(struct cs_builder *b, unsigned wg_per_task,
                        bool progress_inc, struct cs_shader_res_sel res_sel)
{
   cs_emit(b, RUN_COMPUTE_INDIRECT, I) {
      I.workgroups_per_task = wg_per_task;
      I.progress_increment = progress_inc;
      I.srt_select = res_sel.srt;
      I.spd_select = res_sel.spd;
      I.tsd_select = res_sel.tsd;
      I.fau_select = res_sel.fau;
   }
}

static inline void
cs_error_barrier(struct cs_builder *b)
{
   cs_emit(b, ERROR_BARRIER, _)
      ;
}

static inline void
cs_heap_set(struct cs_builder *b, struct cs_index address)
{
   cs_emit(b, HEAP_SET, I) {
      I.address = cs_src64(b, address);
   }
}

static inline void
cs_heap_operation(struct cs_builder *b, enum mali_cs_heap_operation operation,
                  struct cs_async_op async)
{
   cs_emit(b, HEAP_OPERATION, I) {
      I.operation = operation;
      cs_apply_async(I, async);
   }
}

static inline void
cs_vt_start(struct cs_builder *b, struct cs_async_op async)
{
   cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_STARTED, async);
}

static inline void
cs_vt_end(struct cs_builder *b, struct cs_async_op async)
{
   cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_COMPLETED, async);
}

static inline void
cs_frag_end(struct cs_builder *b, struct cs_async_op async)
{
   cs_heap_operation(b, MALI_CS_HEAP_OPERATION_FRAGMENT_COMPLETED, async);
}

static inline void
cs_trace_point(struct cs_builder *b, struct cs_index regs,
               struct cs_async_op async)
{
   cs_emit(b, TRACE_POINT, I) {
      I.base_register = cs_src_tuple(b, regs, regs.size);
      I.register_count = regs.size;
      cs_apply_async(I, async);
   }
}

struct cs_match {
   struct cs_block block;
   struct cs_label break_label;
   struct cs_block case_block;
   struct cs_label next_case_label;
   struct cs_index val;
   struct cs_index scratch_reg;
   struct cs_load_store_tracker case_ls_state;
   struct cs_load_store_tracker ls_state;
   struct cs_load_store_tracker *orig_ls_state;
   bool default_emitted;
};

static inline struct cs_match *
cs_match_start(struct cs_builder *b, struct cs_match *match,
               struct cs_index val, struct cs_index scratch_reg)
{
   *match = (struct cs_match){
      .val = val,
      .scratch_reg = scratch_reg,
      .orig_ls_state = b->conf.ls_tracker,
   };

   cs_block_start(b, &match->block);
   cs_label_init(&match->break_label);
   cs_label_init(&match->next_case_label);

   return match;
}

static inline void
cs_match_case_ls_set(struct cs_builder *b, struct cs_match *match)
{
   if (unlikely(match->orig_ls_state)) {
      match->case_ls_state = *match->orig_ls_state;
      b->conf.ls_tracker = &match->case_ls_state;
   }
}

static inline void
cs_match_case_ls_get(struct cs_match *match)
{
   if (unlikely(match->orig_ls_state)) {
      BITSET_OR(match->ls_state.pending_loads,
                match->case_ls_state.pending_loads,
                match->ls_state.pending_loads);
      BITSET_OR(match->ls_state.pending_stores,
                match->case_ls_state.pending_stores,
                match->ls_state.pending_stores);
   }
}

static inline void
cs_match_case(struct cs_builder *b, struct cs_match *match, uint32_t id)
{
   assert(b->blocks.cur && (b->blocks.cur == &match->block ||
                            b->blocks.cur == &match->case_block));
   assert(!match->default_emitted || !"default case must be last");
   if (match->next_case_label.last_forward_ref != CS_LABEL_INVALID_POS) {
      cs_branch_label(b, &match->break_label, MALI_CS_CONDITION_ALWAYS,
                      cs_undef());
      assert(b->blocks.cur == &match->case_block);
      cs_block_end(b);
      cs_match_case_ls_get(match);
      cs_set_label(b, &match->next_case_label);
      cs_label_init(&match->next_case_label);
   }

   if (id)
      cs_add32(b, match->scratch_reg, match->val, -id);

   cs_branch_label(b, &match->next_case_label, MALI_CS_CONDITION_NEQUAL,
                   id ? match->scratch_reg : match->val);

   cs_match_case_ls_set(b, match);
   cs_block_start(b, &match->case_block);
}

static inline void
cs_match_default(struct cs_builder *b, struct cs_match *match)
{
   assert(b->blocks.cur && (b->blocks.cur == &match->block ||
                            b->blocks.cur == &match->case_block));
   assert(match->next_case_label.last_forward_ref != CS_LABEL_INVALID_POS ||
          !"default case requires at least one other case");
   cs_branch_label(b, &match->break_label, MALI_CS_CONDITION_ALWAYS,
                   cs_undef());

   if (b->blocks.cur == &match->case_block) {
      cs_block_end(b);
      cs_match_case_ls_get(match);
   }

   cs_set_label(b, &match->next_case_label);
   cs_label_init(&match->next_case_label);
   cs_match_case_ls_set(b, match);
   cs_block_start(b, &match->case_block);
   match->default_emitted = true;
}

static inline void
cs_match_end(struct cs_builder *b, struct cs_match *match)
{
   assert(b->blocks.cur && (b->blocks.cur == &match->block ||
                            b->blocks.cur == &match->case_block));

   if (b->blocks.cur == &match->case_block) {
      cs_match_case_ls_get(match);
      cs_block_end(b);
   }

   if (unlikely(match->orig_ls_state)) {
      if (!match->default_emitted) {
         /* If we don't have a default, assume we don't handle all possible
          * cases, and merge the match load/store state with the original
          * load/store state.
          */
         BITSET_OR(match->orig_ls_state->pending_loads,
                   match->ls_state.pending_loads,
                   match->orig_ls_state->pending_loads);
         BITSET_OR(match->orig_ls_state->pending_stores,
                   match->ls_state.pending_stores,
                   match->orig_ls_state->pending_stores);
      } else {
         *match->orig_ls_state = match->ls_state;
      }

      b->conf.ls_tracker = match->orig_ls_state;
   }

   cs_set_label(b, &match->next_case_label);
   cs_set_label(b, &match->break_label);

   cs_block_end(b);
}

#define cs_match(__b, __val, __scratch)                                        \
   for (struct cs_match __match_storage,                                       \
        *__match = cs_match_start(__b, &__match_storage, __val, __scratch);    \
        __match != NULL; cs_match_end(__b, &__match_storage), __match = NULL)

#define cs_case(__b, __ref)                                                    \
   for (bool __case_defined = ({                                               \
           cs_match_case(__b, __match, __ref);                                 \
           false;                                                              \
        });                                                                    \
        !__case_defined; __case_defined = true)

#define cs_default(__b)                                                        \
   for (bool __default_defined = ({                                            \
           cs_match_default(__b, __match);                                     \
           false;                                                              \
        });                                                                    \
        !__default_defined; __default_defined = true)
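
/*
 * Example dispatching on a register value, as a sketch (registers and case
 * values are arbitrary):
 *
 *    cs_match(b, val_reg, scratch_reg) {
 *       cs_case(b, 1) {
 *          ... emitted when val_reg == 1 ...
 *       }
 *       cs_case(b, 2) {
 *          ... emitted when val_reg == 2 ...
 *       }
 *       cs_default(b) {
 *          ... any other value ...
 *       }
 *    }
 */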
1540