xref: /aosp_15_r20/external/mesa3d/src/intel/common/mi_builder.h (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2019 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #ifndef MI_BUILDER_H
25 #define MI_BUILDER_H
26 
27 #include "dev/intel_device_info.h"
28 #include "genxml/genX_bits.h"
29 #include "util/bitscan.h"
30 #include "util/fast_idiv_by_const.h"
31 #include "util/u_math.h"
32 
33 #ifndef MI_BUILDER_NUM_ALLOC_GPRS
34 /** The number of GPRs the MI builder is allowed to allocate
35  *
36  * This may be set by a user of this API so that it can reserve some GPRs at
37  * the top end for its own use.
38  */
39 #define MI_BUILDER_NUM_ALLOC_GPRS 16
40 #endif
41 
42 #ifndef MI_BUILDER_DEFAULT_WRITE_CHECK
43 #define MI_BUILDER_DEFAULT_WRITE_CHECK true
44 #endif
45 
46 #ifndef MI_BUILDER_RAW_MEM_FENCING
47 #define MI_BUILDER_RAW_MEM_FENCING GFX_VER >= 20
48 #endif
49 
50 /** These must be defined by the user of the builder
51  *
52  * void *__gen_get_batch_dwords(__gen_user_data *user_data,
53  *                              unsigned num_dwords);
54  *
55  * __gen_address_type
56  * __gen_address_offset(__gen_address_type addr, uint64_t offset);
57  *
58  *
59  * If self-modifying batches are supported, we must be able to pass batch
60  * addresses around as void*s so pinning as well as batch chaining or some
61  * other mechanism for ensuring batch pointers remain valid during building is
62  * required. The following function must also be defined, it returns an
63  * address in canonical form:
64  *
65  * __gen_address_type
66  * __gen_get_batch_address(__gen_user_data *user_data, void *location);
67  *
68  * Also, __gen_combine_address must accept a location value of NULL and return
69  * a fully valid 64-bit address.
70  */
71 
72 /**
73  * On Gfx20+ this must also be defined by the user of the builder
74  *
75  * bool *
76  * __gen_get_write_fencing_status(__gen_user_data *user_data);
77  *
78  * Returns a pointer to a boolean tracking the status of fencing for MI
79  * commands writing to memory.
80  */
81 
82 /*
83  * Start of the actual MI builder
84  */
85 
/* Token-pasting helpers to derive genxml symbol names from a command name */
#define __genxml_cmd_length(cmd) cmd ## _length
#define __genxml_cmd_header(cmd) cmd ## _header
#define __genxml_cmd_pack(cmd) cmd ## _pack

/* Pack command `cmd` into `dst`.  The for-statement trick gives the caller
 * a block in which to set fields on `name`; the pack function runs exactly
 * once when the block exits.
 */
#define mi_builder_pack(b, cmd, dst, name)                          \
   for (struct cmd name = { __genxml_cmd_header(cmd) },                 \
        *_dst = (struct cmd *)(dst); __builtin_expect(_dst != NULL, 1); \
        __genxml_cmd_pack(cmd)((b)->user_data, (void *)_dst, &name),    \
        _dst = NULL)

/* Get the instruction pointer inside a mi_builder_pack() block */
#define mi_builder_get_inst_ptr(b) \
   ((uint8_t *)_dst)

/* Allocate batch space for `cmd` and pack into it (see mi_builder_pack) */
#define mi_builder_emit(b, cmd, name)                               \
   mi_builder_pack((b), cmd, __gen_get_batch_dwords((b)->user_data, __genxml_cmd_length(cmd)), name)
102 
/** Kinds of operands the MI builder can manipulate */
enum mi_value_type {
   MI_VALUE_TYPE_IMM,    /* 64-bit immediate constant */
   MI_VALUE_TYPE_MEM32,  /* 32-bit value in memory */
   MI_VALUE_TYPE_MEM64,  /* 64-bit value in memory */
   MI_VALUE_TYPE_REG32,  /* 32-bit MMIO register */
   MI_VALUE_TYPE_REG64,  /* 64-bit MMIO register (pair of 32-bit regs) */
};

/** An operand: immediate, memory location, or register */
struct mi_value {
   enum mi_value_type type;

   union {
      uint64_t imm;            /* valid when type == MI_VALUE_TYPE_IMM */
      __gen_address_type addr; /* valid for the MEM32/MEM64 types */
      uint32_t reg;            /* valid for the REG32/REG64 types */
   };

#if GFX_VERx10 >= 75
   /* If set, the value is bitwise-inverted when consumed by MI_MATH
    * (see mi_inot / mi_resolve_invert)
    */
   bool invert;
#endif
};

/** A register number adjusted for the CS MMIO window (see mi_adjust_reg_num) */
struct mi_reg_num {
   uint32_t num;
#if GFX_VER >= 11
   /* True if the register lives in the 0x2000-0x3fff CS MMIO range */
   bool cs;
#endif
};
131 
132 static inline struct mi_reg_num
mi_adjust_reg_num(uint32_t reg)133 mi_adjust_reg_num(uint32_t reg)
134 {
135 #if GFX_VER >= 11
136    bool cs = reg >= 0x2000 && reg < 0x4000;
137    return (struct mi_reg_num) {
138       .num = reg - (cs ? 0x2000 : 0),
139       .cs = cs,
140    };
141 #else
142    return (struct mi_reg_num) { .num = reg, };
143 #endif
144 }
145 
/* Maximum number of MI_MATH ALU dwords batched before a forced flush */
#if GFX_VER >= 9
#define MI_BUILDER_MAX_MATH_DWORDS 256
#else
#define MI_BUILDER_MAX_MATH_DWORDS 64
#endif

/** Builder state; initialize with mi_builder_init() */
struct mi_builder {
   const struct intel_device_info *devinfo;
   __gen_user_data *user_data;

   /* When true, mi_ensure_write_fence() is a no-op (used by mi_memcpy to
    * fence once up front instead of per-dword)
    */
   bool no_read_write_fencing;

#if GFX_VERx10 >= 75
   /* Bitmask of allocated GPRs and their per-GPR reference counts */
   uint32_t gprs;
   uint8_t gpr_refs[MI_BUILDER_NUM_ALLOC_GPRS];

   /* ALU dwords staged for a single MI_MATH, emitted by
    * mi_builder_flush_math()
    */
   unsigned num_math_dwords;
   uint32_t math_dwords[MI_BUILDER_MAX_MATH_DWORDS];
#endif

#if GFX_VERx10 >= 125
   /* MOCS index applied to MI_MATH on Gfx12.5+ (see mi_builder_set_mocs) */
   uint32_t mocs;
#endif

#if GFX_VER >= 12
   /* Force write-completion checks on immediate memory writes */
   bool write_check;
#endif
};
174 
175 static inline void
mi_builder_init(struct mi_builder * b,const struct intel_device_info * devinfo,__gen_user_data * user_data)176 mi_builder_init(struct mi_builder *b,
177                 const struct intel_device_info *devinfo,
178                 __gen_user_data *user_data)
179 {
180    memset(b, 0, sizeof(*b));
181    b->devinfo = devinfo;
182    b->user_data = user_data;
183 
184 #if GFX_VER >= 12
185    b->write_check = MI_BUILDER_DEFAULT_WRITE_CHECK;
186 #endif
187    b->no_read_write_fencing = false;
188 #if GFX_VERx10 >= 75
189    b->gprs = 0;
190    b->num_math_dwords = 0;
191 #endif
192 }
193 
/* Emit any staged ALU dwords as a single MI_MATH instruction.
 *
 * Must run before any non-MI_MATH command that depends on GPR contents,
 * since the math dwords only live in the builder until this point.
 */
static inline void
mi_builder_flush_math(struct mi_builder *b)
{
#if GFX_VERx10 >= 75
   if (b->num_math_dwords == 0)
      return;

   /* One header dword plus the accumulated ALU dwords */
   uint32_t *dw = (uint32_t *)__gen_get_batch_dwords(b->user_data,
                                                     1 + b->num_math_dwords);
   mi_builder_pack(b, GENX(MI_MATH), dw, math) {
#if GFX_VERx10 >= 125
      math.MOCS = b->mocs;
#endif
      math.DWordLength = 1 + b->num_math_dwords - GENX(MI_MATH_length_bias);
   }
   memcpy(dw + 1, b->math_dwords, b->num_math_dwords * sizeof(uint32_t));
   b->num_math_dwords = 0;
#endif
}
213 
/**
 * Set mocs index to mi_build
 *
 * This is required when a MI_MATH instruction will be emitted and
 * the code is used in GFX 12.5 or newer.
 */
static inline void
mi_builder_set_mocs(UNUSED struct mi_builder *b, UNUSED uint32_t mocs)
{
#if GFX_VERx10 >= 125
   /* Pending math was staged under the old MOCS value; flush it first so
    * it is emitted with the MOCS it was built for.
    */
   if (b->mocs != 0 && b->mocs != mocs)
      mi_builder_flush_math(b);
   b->mocs = mocs;
#endif
}
229 
/**
 * Set write checks on immediate writes
 *
 * This ensures that the next memory write will complete only when all
 * previously emitted memory writes have completed.
 */
static inline void
mi_builder_set_write_check(UNUSED struct mi_builder *b, UNUSED bool check)
{
#if GFX_VER >= 12
   b->write_check = check;
#endif
}
243 
/* Whether immediate memory writes currently force a completion check.
 * Always false before Gfx12, which has no such mechanism.
 */
static inline bool
mi_builder_write_checked(UNUSED struct mi_builder *b)
{
#if GFX_VER >= 12
   return b->write_check;
#else
   return false;
#endif
}

/* MMIO offset of the first command-streamer general purpose register */
#define _MI_BUILDER_GPR_BASE 0x2600
/* The actual hardware limit on GPRs */
#define _MI_BUILDER_NUM_HW_GPRS 16
257 
258 #if GFX_VERx10 >= 75
259 
260 static inline bool
mi_value_is_reg(struct mi_value val)261 mi_value_is_reg(struct mi_value val)
262 {
263    return val.type == MI_VALUE_TYPE_REG32 ||
264           val.type == MI_VALUE_TYPE_REG64;
265 }
266 
267 static inline bool
mi_value_is_gpr(struct mi_value val)268 mi_value_is_gpr(struct mi_value val)
269 {
270    return mi_value_is_reg(val) &&
271           val.reg >= _MI_BUILDER_GPR_BASE &&
272           val.reg < _MI_BUILDER_GPR_BASE +
273                     _MI_BUILDER_NUM_HW_GPRS * 8;
274 }
275 
276 static inline bool
_mi_value_is_allocated_gpr(struct mi_value val)277 _mi_value_is_allocated_gpr(struct mi_value val)
278 {
279    return mi_value_is_reg(val) &&
280           val.reg >= _MI_BUILDER_GPR_BASE &&
281           val.reg < _MI_BUILDER_GPR_BASE +
282                     MI_BUILDER_NUM_ALLOC_GPRS * 8;
283 }
284 
285 static inline uint32_t
_mi_value_as_gpr(struct mi_value val)286 _mi_value_as_gpr(struct mi_value val)
287 {
288    assert(mi_value_is_gpr(val));
289    /* Some of the GRL metakernels will generate 64bit value in a GP register,
290     * then use only half of that as the last operation on that value. So allow
291     * unref on part of a GP register.
292     */
293    assert(val.reg % 4 == 0);
294    return (val.reg - _MI_BUILDER_GPR_BASE) / 8;
295 }
296 
297 static inline struct mi_value
mi_new_gpr(struct mi_builder * b)298 mi_new_gpr(struct mi_builder *b)
299 {
300    unsigned gpr = ffs(~b->gprs) - 1;
301    assert(gpr < MI_BUILDER_NUM_ALLOC_GPRS);
302    assert(b->gpr_refs[gpr] == 0);
303    b->gprs |= (1u << gpr);
304    b->gpr_refs[gpr] = 1;
305 
306    return (struct mi_value) {
307       .type = MI_VALUE_TYPE_REG64,
308       .reg = _MI_BUILDER_GPR_BASE + gpr * 8,
309    };
310 }
311 
312 static inline struct mi_value
mi_reserve_gpr(struct mi_builder * b,unsigned gpr)313 mi_reserve_gpr(struct mi_builder *b, unsigned gpr)
314 {
315    assert(gpr < MI_BUILDER_NUM_ALLOC_GPRS);
316    assert(!(b->gprs & (1 << gpr)));
317    assert(b->gpr_refs[gpr] == 0);
318    b->gprs |= (1u << gpr);
319    b->gpr_refs[gpr] = 128; /* Enough that we won't unref it */
320 
321    return (struct mi_value) {
322       .type = MI_VALUE_TYPE_REG64,
323       .reg = _MI_BUILDER_GPR_BASE + gpr * 8,
324    };
325 }
326 #endif /* GFX_VERx10 >= 75 */
327 
/** Take a reference to a mi_value
 *
 * The MI builder uses reference counting to automatically free ALU GPRs for
 * re-use in calculations.  All mi_* math functions consume the reference
 * they are handed for each source and return a reference to a value which the
 * caller must consume.  In particular, if you pass the same value into a
 * single mi_* math function twice (say to add a number to itself), you
 * are responsible for calling mi_value_ref() to get a second reference
 * because the mi_* math function will consume it twice.
 */
static inline void
mi_value_add_refs(struct mi_builder *b, struct mi_value val, unsigned num_refs)
{
#if GFX_VERx10 >= 75
   /* Only builder-allocated GPRs are reference counted; immediates, memory
    * values, and fixed registers are no-ops here.
    */
   if (_mi_value_is_allocated_gpr(val)) {
      unsigned gpr = _mi_value_as_gpr(val);
      assert(gpr < MI_BUILDER_NUM_ALLOC_GPRS);
      assert(b->gprs & (1u << gpr));
      assert(b->gpr_refs[gpr] < UINT8_MAX);
      b->gpr_refs[gpr] += num_refs;
   }
#endif /* GFX_VERx10 >= 75 */
}

/* Take a single reference; see mi_value_add_refs() above */
static inline struct mi_value
mi_value_ref(struct mi_builder *b, struct mi_value val)
{
   mi_value_add_refs(b, val, 1);
   return val;
}
358 
359 
360 /** Drop a reference to a mi_value
361  *
362  * See also mi_value_ref.
363  */
364 static inline void
mi_value_unref(struct mi_builder * b,struct mi_value val)365 mi_value_unref(struct mi_builder *b, struct mi_value val)
366 {
367 #if GFX_VERx10 >= 75
368    if (_mi_value_is_allocated_gpr(val)) {
369       unsigned gpr = _mi_value_as_gpr(val);
370       assert(gpr < MI_BUILDER_NUM_ALLOC_GPRS);
371       assert(b->gprs & (1u << gpr));
372       assert(b->gpr_refs[gpr] > 0);
373       if (--b->gpr_refs[gpr] == 0)
374          b->gprs &= ~(1u << gpr);
375    }
376 #endif /* GFX_VERx10 >= 75 */
377 }
378 
/* On Gfx20+ memory read/write can be process unordered, so we need to track
 * the writes to memory to make sure any memory read will see the effect of a
 * previous write.
 */
static inline void
mi_builder_set_write(struct mi_builder *b)
{
#if MI_BUILDER_RAW_MEM_FENCING
   /* Record that an unfenced MI memory write is now outstanding */
   *__gen_get_write_fencing_status(b->user_data) = true;
#endif
}

/* Emit an MI_MEM_FENCE if an unfenced MI write is outstanding so a following
 * MI read observes it.  No-op while b->no_read_write_fencing is set (see
 * mi_memcpy, which fences once up front).
 */
static inline void
mi_ensure_write_fence(struct mi_builder *b)
{
#if MI_BUILDER_RAW_MEM_FENCING
   if (!b->no_read_write_fencing &&
       *__gen_get_write_fencing_status(b->user_data)) {
      mi_builder_emit(b, GENX(MI_MEM_FENCE), fence)
         fence.FenceType = FENCE_TYPE_MI_WRITE;
      *__gen_get_write_fencing_status(b->user_data) = false;
   }
#endif
}
403 
404 static inline struct mi_value
mi_imm(uint64_t imm)405 mi_imm(uint64_t imm)
406 {
407    return (struct mi_value) {
408       .type = MI_VALUE_TYPE_IMM,
409       .imm = imm,
410    };
411 }
412 
413 static inline struct mi_value
mi_reg32(uint32_t reg)414 mi_reg32(uint32_t reg)
415 {
416    struct mi_value val = {
417       .type = MI_VALUE_TYPE_REG32,
418       .reg = reg,
419    };
420 #if GFX_VERx10 >= 75
421    assert(!_mi_value_is_allocated_gpr(val));
422 #endif
423    return val;
424 }
425 
426 static inline struct mi_value
mi_reg64(uint32_t reg)427 mi_reg64(uint32_t reg)
428 {
429    struct mi_value val = {
430       .type = MI_VALUE_TYPE_REG64,
431       .reg = reg,
432    };
433 #if GFX_VERx10 >= 75
434    assert(!_mi_value_is_allocated_gpr(val));
435 #endif
436    return val;
437 }
438 
439 static inline struct mi_value
mi_mem32(__gen_address_type addr)440 mi_mem32(__gen_address_type addr)
441 {
442    return (struct mi_value) {
443       .type = MI_VALUE_TYPE_MEM32,
444       .addr = addr,
445    };
446 }
447 
448 static inline struct mi_value
mi_mem64(__gen_address_type addr)449 mi_mem64(__gen_address_type addr)
450 {
451    return (struct mi_value) {
452       .type = MI_VALUE_TYPE_MEM64,
453       .addr = addr,
454    };
455 }
456 
457 static inline struct mi_value
mi_value_half(struct mi_value value,bool top_32_bits)458 mi_value_half(struct mi_value value, bool top_32_bits)
459 {
460    switch (value.type) {
461    case MI_VALUE_TYPE_IMM:
462       if (top_32_bits)
463          value.imm >>= 32;
464       else
465          value.imm &= 0xffffffffu;
466       return value;
467 
468    case MI_VALUE_TYPE_MEM32:
469       assert(!top_32_bits);
470       return value;
471 
472    case MI_VALUE_TYPE_MEM64:
473       if (top_32_bits)
474          value.addr = __gen_address_offset(value.addr, 4);
475       value.type = MI_VALUE_TYPE_MEM32;
476       return value;
477 
478    case MI_VALUE_TYPE_REG32:
479       assert(!top_32_bits);
480       return value;
481 
482    case MI_VALUE_TYPE_REG64:
483       if (top_32_bits)
484          value.reg += 4;
485       value.type = MI_VALUE_TYPE_REG32;
486       return value;
487    }
488 
489    unreachable("Invalid mi_value type");
490 }
491 
/* Raw copy of src into dst without consuming references.
 *
 * Handles every dst/src type combination the current hardware generation
 * supports: 64-bit destinations are split into two 32-bit copies where no
 * single instruction exists, and immediates go through
 * MI_LOAD_REGISTER_IMM / MI_STORE_DATA_IMM.
 */
static inline void
_mi_copy_no_unref(struct mi_builder *b,
                  struct mi_value dst, struct mi_value src)
{
#if GFX_VERx10 >= 75
   /* TODO: We could handle src.invert by emitting a bit of math if we really
    * wanted to.
    */
   assert(!dst.invert && !src.invert);
#endif
   mi_builder_flush_math(b);

   /* A memory read must observe any earlier unfenced MI writes */
   if (src.type == MI_VALUE_TYPE_MEM64 ||
       src.type == MI_VALUE_TYPE_MEM32)
      mi_ensure_write_fence(b);

   switch (dst.type) {
   case MI_VALUE_TYPE_IMM:
      unreachable("Cannot copy to an immediate");

   case MI_VALUE_TYPE_MEM64:
   case MI_VALUE_TYPE_REG64:
      switch (src.type) {
      case MI_VALUE_TYPE_IMM:
         if (dst.type == MI_VALUE_TYPE_REG64) {
            /* One LRI with two register/data pairs loads both halves */
            uint32_t *dw = (uint32_t *)__gen_get_batch_dwords(b->user_data,
                                                              GENX(MI_LOAD_REGISTER_IMM_length) + 2);
            struct mi_reg_num reg = mi_adjust_reg_num(dst.reg);
            mi_builder_pack(b, GENX(MI_LOAD_REGISTER_IMM), dw, lri) {
               lri.DWordLength = GENX(MI_LOAD_REGISTER_IMM_length) + 2 -
                                 GENX(MI_LOAD_REGISTER_IMM_length_bias);
#if GFX_VER >= 11
               lri.AddCSMMIOStartOffset = reg.cs;
#endif
            }
            dw[1] = reg.num;
            dw[2] = src.imm;
            dw[3] = reg.num + 4;
            dw[4] = src.imm >> 32;
         } else {
#if GFX_VER >= 8
            /* Gfx8+ can store a full qword immediate with one SDI */
            assert(dst.type == MI_VALUE_TYPE_MEM64);
            uint32_t *dw = (uint32_t *)__gen_get_batch_dwords(b->user_data,
                                                              GENX(MI_STORE_DATA_IMM_length) + 1);
            mi_builder_pack(b, GENX(MI_STORE_DATA_IMM), dw, sdi) {
               sdi.DWordLength = GENX(MI_STORE_DATA_IMM_length) + 1 -
                                 GENX(MI_STORE_DATA_IMM_length_bias);
               sdi.StoreQword = true;
               sdi.Address = dst.addr;
#if GFX_VER >= 12
               sdi.ForceWriteCompletionCheck = b->write_check;
#endif
            }
            dw[3] = src.imm;
            dw[4] = src.imm >> 32;
#else
         /* Older parts: copy the two 32-bit halves separately */
         _mi_copy_no_unref(b, mi_value_half(dst, false),
                              mi_value_half(src, false));
         _mi_copy_no_unref(b, mi_value_half(dst, true),
                              mi_value_half(src, true));
#endif
         }
         break;
      case MI_VALUE_TYPE_REG32:
      case MI_VALUE_TYPE_MEM32:
         /* Zero-extend a 32-bit source into the 64-bit destination */
         _mi_copy_no_unref(b, mi_value_half(dst, false),
                              mi_value_half(src, false));
         _mi_copy_no_unref(b, mi_value_half(dst, true),
                              mi_imm(0));
         break;
      case MI_VALUE_TYPE_REG64:
      case MI_VALUE_TYPE_MEM64:
         /* 64 <- 64: copy both halves */
         _mi_copy_no_unref(b, mi_value_half(dst, false),
                              mi_value_half(src, false));
         _mi_copy_no_unref(b, mi_value_half(dst, true),
                              mi_value_half(src, true));
         break;
      default:
         unreachable("Invalid mi_value type");
      }
      break;

   case MI_VALUE_TYPE_MEM32:
      switch (src.type) {
      case MI_VALUE_TYPE_IMM:
         mi_builder_emit(b, GENX(MI_STORE_DATA_IMM), sdi) {
            sdi.Address = dst.addr;
#if GFX_VER >= 12
            sdi.ForceWriteCompletionCheck = b->write_check;
#endif
            sdi.ImmediateData = src.imm;
         }
         break;

      case MI_VALUE_TYPE_MEM32:
      case MI_VALUE_TYPE_MEM64:
#if GFX_VER >= 8
         mi_builder_emit(b, GENX(MI_COPY_MEM_MEM), cmm) {
            cmm.DestinationMemoryAddress = dst.addr;
            cmm.SourceMemoryAddress = src.addr;
         }
#elif GFX_VERx10 == 75
         /* HSW has no MI_COPY_MEM_MEM; bounce through a GPR */
         {
            struct mi_value tmp = mi_new_gpr(b);
            _mi_copy_no_unref(b, tmp, src);
            _mi_copy_no_unref(b, dst, tmp);
            mi_value_unref(b, tmp);
         }
#else
         unreachable("Cannot do mem <-> mem copy on IVB and earlier");
#endif
         break;

      case MI_VALUE_TYPE_REG32:
      case MI_VALUE_TYPE_REG64:
         mi_builder_emit(b, GENX(MI_STORE_REGISTER_MEM), srm) {
            struct mi_reg_num reg = mi_adjust_reg_num(src.reg);
            srm.RegisterAddress = reg.num;
#if GFX_VER >= 11
            srm.AddCSMMIOStartOffset = reg.cs;
#endif
            srm.MemoryAddress = dst.addr;
         }
         break;

      default:
         unreachable("Invalid mi_value type");
      }
      break;

   case MI_VALUE_TYPE_REG32:
      switch (src.type) {
      case MI_VALUE_TYPE_IMM:
         mi_builder_emit(b, GENX(MI_LOAD_REGISTER_IMM), lri) {
            struct mi_reg_num reg = mi_adjust_reg_num(dst.reg);
            lri.RegisterOffset = reg.num;
#if GFX_VER >= 11
            lri.AddCSMMIOStartOffset = reg.cs;
#endif
            lri.DataDWord = src.imm;
         }
         break;

      case MI_VALUE_TYPE_MEM32:
      case MI_VALUE_TYPE_MEM64:
#if GFX_VER >= 7
         mi_builder_emit(b, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            struct mi_reg_num reg = mi_adjust_reg_num(dst.reg);
            lrm.RegisterAddress = reg.num;
#if GFX_VER >= 11
            lrm.AddCSMMIOStartOffset = reg.cs;
#endif
            lrm.MemoryAddress = src.addr;
         }
#else
         unreachable("Cannot load do mem -> reg copy on SNB and earlier");
#endif
         break;

      case MI_VALUE_TYPE_REG32:
      case MI_VALUE_TYPE_REG64:
#if GFX_VERx10 >= 75
         /* Self-copy is a no-op; skip the LRR entirely */
         if (src.reg != dst.reg) {
            mi_builder_emit(b, GENX(MI_LOAD_REGISTER_REG), lrr) {
               struct mi_reg_num reg = mi_adjust_reg_num(src.reg);
               lrr.SourceRegisterAddress = reg.num;
#if GFX_VER >= 11
               lrr.AddCSMMIOStartOffsetSource = reg.cs;
#endif
               reg = mi_adjust_reg_num(dst.reg);
               lrr.DestinationRegisterAddress = reg.num;
#if GFX_VER >= 11
               lrr.AddCSMMIOStartOffsetDestination = reg.cs;
#endif
            }
         }
#else
         unreachable("Cannot do reg <-> reg copy on IVB and earlier");
#endif
         break;

      default:
         unreachable("Invalid mi_value type");
      }
      break;

   default:
      unreachable("Invalid mi_value type");
   }


   if (dst.type == MI_VALUE_TYPE_MEM64 ||
       dst.type == MI_VALUE_TYPE_MEM32) {
      /* Immediate writes can already wait for writes, so no need to do
       * additional fencing later.
       */
      if (src.type != MI_VALUE_TYPE_IMM || !mi_builder_write_checked(b))
         mi_builder_set_write(b);
   }
}
692 
#if GFX_VERx10 >= 75
/* Forward declaration; defined in the MI_MATH section below */
static inline struct mi_value
mi_resolve_invert(struct mi_builder *b, struct mi_value src);
#endif

/** Store the value in src to the value represented by dst
 *
 * If the bit size of src and dst mismatch, this function does an unsigned
 * integer cast.  If src has more bits than dst, it takes the bottom bits.  If
 * src has fewer bits then dst, it fills the top bits with zeros.
 *
 * This function consumes one reference for each of src and dst.
 */
static inline void
mi_store(struct mi_builder *b, struct mi_value dst, struct mi_value src)
{
#if GFX_VERx10 >= 75
   /* The raw copy asserts !src.invert, so materialize any pending invert
    * through MI_MATH first.
    */
   src = mi_resolve_invert(b, src);
#endif
   _mi_copy_no_unref(b, dst, src);
   mi_value_unref(b, src);
   mi_value_unref(b, dst);
}
716 
717 static inline void
mi_memset(struct mi_builder * b,__gen_address_type dst,uint32_t value,uint32_t size)718 mi_memset(struct mi_builder *b, __gen_address_type dst,
719           uint32_t value, uint32_t size)
720 {
721 #if GFX_VERx10 >= 75
722    assert(b->num_math_dwords == 0);
723 #endif
724 
725    /* This memset operates in units of dwords. */
726    assert(size % 4 == 0);
727 
728    for (uint32_t i = 0; i < size; i += 4) {
729       mi_store(b, mi_mem32(__gen_address_offset(dst, i)),
730                       mi_imm(value));
731    }
732 }
733 
/* NOTE: On IVB, this function stomps GFX7_3DPRIM_BASE_VERTEX */
static inline void
mi_memcpy(struct mi_builder *b, __gen_address_type dst,
          __gen_address_type src, uint32_t size)
{
#if GFX_VERx10 >= 75
   assert(b->num_math_dwords == 0);
#endif

   /* Flush once only */
   mi_ensure_write_fence(b);
   b->no_read_write_fencing = true;

   /* Hold off write checks until the last write. */
   bool write_check = mi_builder_write_checked(b);
   mi_builder_set_write_check(b, false);

   /* This memcpy operates in units of dwords. */
   assert(size % 4 == 0);

   for (uint32_t i = 0; i < size; i += 4) {
      /* Restore the caller's write-check setting for the final dword so the
       * last store can act as the completion barrier for the whole copy.
       */
      if (i == size - 4)
         mi_builder_set_write_check(b, write_check);

      struct mi_value dst_val = mi_mem32(__gen_address_offset(dst, i));
      struct mi_value src_val = mi_mem32(__gen_address_offset(src, i));
#if GFX_VERx10 >= 75
      mi_store(b, dst_val, src_val);
#else
      /* IVB does not have a general purpose register for command streamer
       * commands. Therefore, we use an alternate temporary register.
       */
      struct mi_value tmp_reg = mi_reg32(0x2440); /* GFX7_3DPRIM_BASE_VERTEX */
      mi_store(b, tmp_reg, src_val);
      mi_store(b, dst_val, tmp_reg);
#endif
   }

   /* Re-enable per-operation fencing for subsequent builder use */
   b->no_read_write_fencing = false;
}
774 
/*
 * MI_MATH Section.  Only available on Haswell+
 */

#if GFX_VERx10 >= 75

/**
 * Perform a predicated store (assuming the condition is already loaded
 * in the MI_PREDICATE_RESULT register) of the value in src to the memory
 * location specified by dst.  Non-memory destinations are not supported.
 *
 * This function consumes one reference for each of src and dst.
 */
static inline void
mi_store_if(struct mi_builder *b, struct mi_value dst, struct mi_value src)
{
   assert(!dst.invert && !src.invert);

   mi_builder_flush_math(b);

   /* We can only predicate MI_STORE_REGISTER_MEM, so restrict the
    * destination to be memory, and resolve the source to a temporary
    * register if it isn't in one already.
    */
   assert(dst.type == MI_VALUE_TYPE_MEM64 ||
          dst.type == MI_VALUE_TYPE_MEM32);

   if (src.type != MI_VALUE_TYPE_REG32 &&
       src.type != MI_VALUE_TYPE_REG64) {
      struct mi_value tmp = mi_new_gpr(b);
      _mi_copy_no_unref(b, tmp, src);
      src = tmp;
   }

   if (dst.type == MI_VALUE_TYPE_MEM64) {
      /* 64-bit destination: two predicated SRMs, one per 32-bit half */
      mi_builder_emit(b, GENX(MI_STORE_REGISTER_MEM), srm) {
         struct mi_reg_num reg = mi_adjust_reg_num(src.reg);
         srm.RegisterAddress = reg.num;
#if GFX_VER >= 11
         srm.AddCSMMIOStartOffset = reg.cs;
#endif
         srm.MemoryAddress = dst.addr;
         srm.PredicateEnable = true;
      }
      mi_builder_emit(b, GENX(MI_STORE_REGISTER_MEM), srm) {
         struct mi_reg_num reg = mi_adjust_reg_num(src.reg + 4);
         srm.RegisterAddress = reg.num;
#if GFX_VER >= 11
         srm.AddCSMMIOStartOffset = reg.cs;
#endif
         srm.MemoryAddress = __gen_address_offset(dst.addr, 4);
         srm.PredicateEnable = true;
      }
   } else {
      mi_builder_emit(b, GENX(MI_STORE_REGISTER_MEM), srm) {
         struct mi_reg_num reg = mi_adjust_reg_num(src.reg);
         srm.RegisterAddress = reg.num;
#if GFX_VER >= 11
         srm.AddCSMMIOStartOffset = reg.cs;
#endif
         srm.MemoryAddress = dst.addr;
         srm.PredicateEnable = true;
      }
   }

   /* The store (whether predicated-out or not) counts as an MI write */
   mi_builder_set_write(b);

   mi_value_unref(b, src);
   mi_value_unref(b, dst);
}
845 
/* Append ALU dwords to the staged MI_MATH batch, flushing first if they
 * would overflow the buffer.
 */
static inline void
_mi_builder_push_math(struct mi_builder *b,
                      const uint32_t *dwords,
                      unsigned num_dwords)
{
   assert(num_dwords < MI_BUILDER_MAX_MATH_DWORDS);
   /* After a flush, num_math_dwords is zero so the new dwords always fit */
   if (b->num_math_dwords + num_dwords > MI_BUILDER_MAX_MATH_DWORDS)
      mi_builder_flush_math(b);

   memcpy(&b->math_dwords[b->num_math_dwords],
          dwords, num_dwords * sizeof(*dwords));
   b->num_math_dwords += num_dwords;
}
859 
860 static inline uint32_t
_mi_pack_alu(uint32_t opcode,uint32_t operand1,uint32_t operand2)861 _mi_pack_alu(uint32_t opcode, uint32_t operand1, uint32_t operand2)
862 {
863    struct GENX(MI_MATH_ALU_INSTRUCTION) instr = {
864       .Operand2 = operand2,
865       .Operand1 = operand1,
866       .ALUOpcode = opcode,
867    };
868 
869    uint32_t dw;
870    GENX(MI_MATH_ALU_INSTRUCTION_pack)(NULL, &dw, &instr);
871 
872    return dw;
873 }
874 
/* Ensure `val` lives in a GPR, copying it into a freshly allocated one if
 * needed.  The invert flag is carried over to the returned value.
 */
static inline struct mi_value
mi_value_to_gpr(struct mi_builder *b, struct mi_value val)
{
   if (mi_value_is_gpr(val))
      return val;

   /* Save off the invert flag because it makes copy() grumpy */
   bool invert = val.invert;
   val.invert = false;

   struct mi_value tmp = mi_new_gpr(b);
   _mi_copy_no_unref(b, tmp, val);
   tmp.invert = invert;

   return tmp;
}

/* Fold an immediate to its effective 64-bit value, applying any invert */
static inline uint64_t
mi_value_to_u64(struct mi_value val)
{
   assert(val.type == MI_VALUE_TYPE_IMM);
   return val.invert ? ~val.imm : val.imm;
}
898 
/* Emit the ALU LOAD for one MI_MATH source operand.
 *
 * All-zeros / all-ones immediates use the dedicated LOAD0/LOAD1 encodings;
 * everything else is first materialized in a GPR (updating *val so the
 * caller can unref it) and loaded with LOAD or LOADINV per the invert flag.
 */
static inline uint32_t
_mi_math_load_src(struct mi_builder *b, unsigned src, struct mi_value *val)
{
   if (val->type == MI_VALUE_TYPE_IMM &&
       (val->imm == 0 || val->imm == UINT64_MAX)) {
      uint64_t imm = val->invert ? ~val->imm : val->imm;
      return _mi_pack_alu(imm ? MI_ALU_LOAD1 : MI_ALU_LOAD0, src, 0);
   } else {
      *val = mi_value_to_gpr(b, *val);
      return _mi_pack_alu(val->invert ? MI_ALU_LOADINV : MI_ALU_LOAD,
                          src, _mi_value_as_gpr(*val));
   }
}
912 
/* Emit a generic MI_MATH binary op: load both sources, execute `opcode`,
 * then store `store_src` (e.g. MI_ALU_ACCU or MI_ALU_ZF) into a freshly
 * allocated GPR via `store_op`.
 *
 * Consumes one reference for each source; returns a referenced result.
 */
static inline struct mi_value
mi_math_binop(struct mi_builder *b, uint32_t opcode,
              struct mi_value src0, struct mi_value src1,
              uint32_t store_op, uint32_t store_src)
{
   struct mi_value dst = mi_new_gpr(b);

   uint32_t dw[4];
   dw[0] = _mi_math_load_src(b, MI_ALU_SRCA, &src0);
   dw[1] = _mi_math_load_src(b, MI_ALU_SRCB, &src1);
   dw[2] = _mi_pack_alu(opcode, 0, 0);
   dw[3] = _mi_pack_alu(store_op, _mi_value_as_gpr(dst), store_src);
   _mi_builder_push_math(b, dw, 4);

   mi_value_unref(b, src0);
   mi_value_unref(b, src1);

   return dst;
}
932 
/* Bitwise NOT.  Immediates are folded at build time; for everything else
 * the invert flag is toggled so the NOT happens for free in the consuming
 * MI_MATH load (LOADINV).
 */
static inline struct mi_value
mi_inot(struct mi_builder *b, struct mi_value val)
{
   if (val.type == MI_VALUE_TYPE_IMM)
      return mi_imm(~mi_value_to_u64(val));

   val.invert = !val.invert;
   return val;
}

/* Materialize a pending invert as a real value: ~src + 0 through MI_MATH */
static inline struct mi_value
mi_resolve_invert(struct mi_builder *b, struct mi_value src)
{
   if (!src.invert)
      return src;

   assert(src.type != MI_VALUE_TYPE_IMM);
   return mi_math_binop(b, MI_ALU_ADD, src, mi_imm(0),
                           MI_ALU_STORE, MI_ALU_ACCU);
}
953 
954 static inline struct mi_value
mi_iadd(struct mi_builder * b,struct mi_value src0,struct mi_value src1)955 mi_iadd(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
956 {
957    if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
958       return mi_imm(mi_value_to_u64(src0) + mi_value_to_u64(src1));
959 
960    return mi_math_binop(b, MI_ALU_ADD, src0, src1,
961                            MI_ALU_STORE, MI_ALU_ACCU);
962 }
963 
964 static inline struct mi_value
mi_iadd_imm(struct mi_builder * b,struct mi_value src,uint64_t N)965 mi_iadd_imm(struct mi_builder *b,
966                 struct mi_value src, uint64_t N)
967 {
968    if (N == 0)
969       return src;
970 
971    return mi_iadd(b, src, mi_imm(N));
972 }
973 
974 static inline struct mi_value
mi_isub(struct mi_builder * b,struct mi_value src0,struct mi_value src1)975 mi_isub(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
976 {
977    if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
978       return mi_imm(mi_value_to_u64(src0) - mi_value_to_u64(src1));
979 
980    return mi_math_binop(b, MI_ALU_SUB, src0, src1,
981                            MI_ALU_STORE, MI_ALU_ACCU);
982 }
983 
/* Equality test: returns ~0ull when src0 == src1, 0 otherwise. */
static inline struct mi_value
mi_ieq(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src0) == mi_value_to_u64(src1) ? ~0ull : 0);

   /* Compute "equal" by subtracting and storing the zero bit */
   return mi_math_binop(b, MI_ALU_SUB, src0, src1,
                            MI_ALU_STORE, MI_ALU_ZF);
}
994 
/* Inequality test: returns ~0ull when src0 != src1, 0 otherwise. */
static inline struct mi_value
mi_ine(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src0) != mi_value_to_u64(src1) ? ~0ull : 0);

   /* Compute "not equal" by subtracting and storing the inverse zero bit */
   return mi_math_binop(b, MI_ALU_SUB, src0, src1,
                            MI_ALU_STOREINV, MI_ALU_ZF);
}
1005 
/* Unsigned less-than: returns ~0ull when src0 < src1, 0 otherwise. */
static inline struct mi_value
mi_ult(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src0) < mi_value_to_u64(src1) ? ~0ull : 0);

   /* Compute "less than" by subtracting and storing the carry bit */
   return mi_math_binop(b, MI_ALU_SUB, src0, src1,
                           MI_ALU_STORE, MI_ALU_CF);
}
1016 
/* Unsigned greater-than-or-equal: returns ~0ull when src0 >= src1, 0
 * otherwise.
 */
static inline struct mi_value
mi_uge(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src0) >= mi_value_to_u64(src1) ? ~0ull : 0);

   /* Compute "greater than or equal" by subtracting and storing the
    * inverted carry bit (carry set means src0 < src1).
    */
   return mi_math_binop(b, MI_ALU_SUB, src0, src1,
                           MI_ALU_STOREINV, MI_ALU_CF);
}
1027 
1028 static inline struct mi_value
mi_iand(struct mi_builder * b,struct mi_value src0,struct mi_value src1)1029 mi_iand(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
1030 {
1031    if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
1032       return mi_imm(mi_value_to_u64(src0) & mi_value_to_u64(src1));
1033 
1034    return mi_math_binop(b, MI_ALU_AND, src0, src1,
1035                            MI_ALU_STORE, MI_ALU_ACCU);
1036 }
1037 
/* Non-zero test: returns ~0ull when src != 0, 0 otherwise.
 *
 * Implemented as src + 0 with the inverted zero flag stored.
 */
static inline struct mi_value
mi_nz(struct mi_builder *b, struct mi_value src)
{
   if (src.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src) != 0 ? ~0ull : 0);

   return mi_math_binop(b, MI_ALU_ADD, src, mi_imm(0),
                           MI_ALU_STOREINV, MI_ALU_ZF);
}
1047 
/* Zero test: returns ~0ull when src == 0, 0 otherwise.
 *
 * Implemented as src + 0 with the zero flag stored.
 */
static inline struct mi_value
mi_z(struct mi_builder *b, struct mi_value src)
{
   if (src.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src) == 0 ? ~0ull : 0);

   return mi_math_binop(b, MI_ALU_ADD, src, mi_imm(0),
                           MI_ALU_STORE, MI_ALU_ZF);
}
1057 
1058 static inline struct mi_value
mi_ior(struct mi_builder * b,struct mi_value src0,struct mi_value src1)1059 mi_ior(struct mi_builder *b,
1060        struct mi_value src0, struct mi_value src1)
1061 {
1062    if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
1063       return mi_imm(mi_value_to_u64(src0) | mi_value_to_u64(src1));
1064 
1065    return mi_math_binop(b, MI_ALU_OR, src0, src1,
1066                            MI_ALU_STORE, MI_ALU_ACCU);
1067 }
1068 
1069 #if GFX_VERx10 >= 125
/* Left shift (Gfx12.5+ only).
 *
 * Immediate shift amounts must be a power of two no larger than 32 (the
 * asserts enforce this); arbitrary constant shifts are decomposed by
 * mi_ishl_imm().  Two immediates are folded at build time.
 */
static inline struct mi_value
mi_ishl(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   if (src1.type == MI_VALUE_TYPE_IMM) {
      assert(util_is_power_of_two_or_zero(mi_value_to_u64(src1)));
      assert(mi_value_to_u64(src1) <= 32);
   }

   if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src0) << mi_value_to_u64(src1));

   return mi_math_binop(b, MI_ALU_SHL, src0, src1,
                           MI_ALU_STORE, MI_ALU_ACCU);
}
1084 
/* Logical (unsigned) right shift (Gfx12.5+ only).
 *
 * Immediate shift amounts must be a power of two no larger than 32 (the
 * asserts enforce this); arbitrary constant shifts are decomposed by
 * mi_ushr_imm().  Two immediates are folded at build time.
 */
static inline struct mi_value
mi_ushr(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   if (src1.type == MI_VALUE_TYPE_IMM) {
      assert(util_is_power_of_two_or_zero(mi_value_to_u64(src1)));
      assert(mi_value_to_u64(src1) <= 32);
   }

   if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src0) >> mi_value_to_u64(src1));

   return mi_math_binop(b, MI_ALU_SHR, src0, src1,
                           MI_ALU_STORE, MI_ALU_ACCU);
}
1099 
/* Logical right shift by an arbitrary constant (Gfx12.5+ only).
 *
 * Shift 0 is a no-op; shifts >= 64 clear the value; immediate sources fold
 * at build time.  Since mi_ushr() only accepts power-of-two amounts, the
 * shift is decomposed into one mi_ushr() per set bit of @shift.
 */
static inline struct mi_value
mi_ushr_imm(struct mi_builder *b, struct mi_value src, uint32_t shift)
{
   if (shift == 0)
      return src;

   if (shift >= 64)
      return mi_imm(0);

   if (src.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src) >> shift);

   struct mi_value res = mi_value_to_gpr(b, src);

   /* Annoyingly, we only have power-of-two shifts */
   while (shift) {
      int bit = u_bit_scan(&shift);
      /* shift < 64, so only bits 0..5 can be set (max single shift of 32) */
      assert(bit <= 5);
      res = mi_ushr(b, res, mi_imm(1ULL << bit));
   }

   return res;
}
1123 
/* Arithmetic (sign-preserving) right shift (Gfx12.5+ only).
 *
 * Immediate shift amounts must be a power of two no larger than 32 (the
 * asserts enforce this); arbitrary constant shifts are decomposed by
 * mi_ishr_imm().  Two immediates are folded at build time.
 */
static inline struct mi_value
mi_ishr(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   if (src1.type == MI_VALUE_TYPE_IMM) {
      assert(util_is_power_of_two_or_zero(mi_value_to_u64(src1)));
      assert(mi_value_to_u64(src1) <= 32);
   }

   if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
      return mi_imm((int64_t)mi_value_to_u64(src0) >> mi_value_to_u64(src1));

   return mi_math_binop(b, MI_ALU_SAR, src0, src1,
                            MI_ALU_STORE, MI_ALU_ACCU);
}
1138 
1139 static inline struct mi_value
mi_ishr_imm(struct mi_builder * b,struct mi_value src,uint32_t shift)1140 mi_ishr_imm(struct mi_builder *b, struct mi_value src, uint32_t shift)
1141 {
1142    if (shift == 0)
1143       return src;
1144 
1145    if (shift >= 64)
1146       return mi_imm(0);
1147 
1148    if (src.type == MI_VALUE_TYPE_IMM)
1149       return mi_imm((int64_t)mi_value_to_u64(src) >> shift);
1150 
1151    struct mi_value res = mi_value_to_gpr(b, src);
1152 
1153    /* Annoyingly, we only have power-of-two shifts */
1154    while (shift) {
1155       int bit = u_bit_scan(&shift);
1156       assert(bit <= 5);
1157       res = mi_ishr(b, res, mi_imm(1 << bit));
1158    }
1159 
1160    return res;
1161 }
1162 #endif /* if GFX_VERx10 >= 125 */
1163 
/* Multiply @src by the constant @N using shift-and-add (MI_MATH has no
 * general multiply on all generations this builder supports).
 *
 * Immediates fold at build time; N == 0 releases @src and returns 0;
 * N == 1 returns @src unchanged.  Otherwise the result is accumulated
 * MSB-first: for each bit of @N below the top set bit, double the
 * accumulator, and add @src back in when that bit is set.
 */
static inline struct mi_value
mi_imul_imm(struct mi_builder *b, struct mi_value src, uint32_t N)
{
   if (src.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src) * N);

   if (N == 0) {
      mi_value_unref(b, src);
      return mi_imm(0);
   }

   if (N == 1)
      return src;

   src = mi_value_to_gpr(b, src);

   struct mi_value res = mi_value_ref(b, src);

   /* Index of the highest set bit of N (N >= 2 here, so N != 0) */
   unsigned top_bit = 31 - __builtin_clz(N);
   for (int i = top_bit - 1; i >= 0; i--) {
      res = mi_iadd(b, res, mi_value_ref(b, res));
      if (N & (1 << i))
         res = mi_iadd(b, res, mi_value_ref(b, src));
   }

   mi_value_unref(b, src);

   return res;
}
1193 
/* Left shift by an arbitrary constant.
 *
 * Shift 0 is a no-op; shifts >= 64 yield 0; immediate sources fold at build
 * time.  On Gfx12.5+ the shift is decomposed into power-of-two mi_ishl()
 * operations; earlier hardware has no shifter, so the value is doubled
 * (added to itself) @shift times instead.
 */
static inline struct mi_value
mi_ishl_imm(struct mi_builder *b, struct mi_value src, uint32_t shift)
{
   if (shift == 0)
      return src;

   if (shift >= 64)
      return mi_imm(0);

   if (src.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src) << shift);

   struct mi_value res = mi_value_to_gpr(b, src);

#if GFX_VERx10 >= 125
   /* Annoyingly, we only have power-of-two shifts */
   while (shift) {
      int bit = u_bit_scan(&shift);
      /* shift < 64, so only bits 0..5 can be set (max single shift of 32) */
      assert(bit <= 5);
      res = mi_ishl(b, res, mi_imm(1 << bit));
   }
#else
   /* No shifter: shifting left by one is doubling */
   for (unsigned i = 0; i < shift; i++)
      res = mi_iadd(b, res, mi_value_ref(b, res));
#endif

   return res;
}
1222 
/* Logical right shift of the low 32 bits of @src by @shift, returning a
 * zero-extended 64-bit value.  Works on all generations (no hardware
 * shifter required).
 */
static inline struct mi_value
mi_ushr32_imm(struct mi_builder *b, struct mi_value src, uint32_t shift)
{
   if (shift == 0)
      return src;

   if (shift >= 64)
      return mi_imm(0);

   /* We right-shift by left-shifting by 32 - shift and taking the top 32 bits
    * of the result.
    */
   if (src.type == MI_VALUE_TYPE_IMM)
      return mi_imm((mi_value_to_u64(src) >> shift) & UINT32_MAX);

   if (shift > 32) {
      /* First move the high half into the low half (a shift by 32) with the
       * upper half zeroed, then handle the remaining shift below.
       */
      struct mi_value tmp = mi_new_gpr(b);
      _mi_copy_no_unref(b, mi_value_half(tmp, false),
                               mi_value_half(src, true));
      _mi_copy_no_unref(b, mi_value_half(tmp, true), mi_imm(0));
      mi_value_unref(b, src);
      src = tmp;
      shift -= 32;
   }
   assert(shift <= 32);
   /* Left-shift by (32 - shift), then take the high 32 bits */
   struct mi_value tmp = mi_ishl_imm(b, src, 32 - shift);
   struct mi_value dst = mi_new_gpr(b);
   _mi_copy_no_unref(b, mi_value_half(dst, false),
                            mi_value_half(tmp, true));
   _mi_copy_no_unref(b, mi_value_half(dst, true), mi_imm(0));
   mi_value_unref(b, tmp);
   return dst;
}
1256 
/* Unsigned 32-bit division of @N by the constant @D.
 *
 * @N is implicitly treated as a 32-bit value.  Powers of two become shifts;
 * other divisors use the multiply-by-reciprocal sequence produced by
 * util_compute_fast_udiv_info(): optional pre-shift, 32x32 multiply,
 * optional increment (added as one extra multiplier, i.e. (N+1)*m), take
 * the high 32 bits, then optional post-shift.  D == 0 is invalid; we return
 * 0 rather than emitting anything.
 */
static inline struct mi_value
mi_udiv32_imm(struct mi_builder *b, struct mi_value N, uint32_t D)
{
   if (N.type == MI_VALUE_TYPE_IMM) {
      assert(mi_value_to_u64(N) <= UINT32_MAX);
      return mi_imm(mi_value_to_u64(N) / D);
   }

   /* We implicitly assume that N is only a 32-bit value */
   if (D == 0) {
      /* This is invalid but we should do something */
      return mi_imm(0);
   } else if (util_is_power_of_two_or_zero(D)) {
      return mi_ushr32_imm(b, N, util_logbase2(D));
   } else {
      struct util_fast_udiv_info m = util_compute_fast_udiv_info(D, 32, 32);
      assert(m.multiplier <= UINT32_MAX);

      if (m.pre_shift)
         N = mi_ushr32_imm(b, N, m.pre_shift);

      /* Do the 32x32 multiply  into gpr0 */
      N = mi_imul_imm(b, N, m.multiplier);

      if (m.increment)
         N = mi_iadd(b, N, mi_imm(m.multiplier));

      N = mi_ushr32_imm(b, N, 32);

      if (m.post_shift)
         N = mi_ushr32_imm(b, N, m.post_shift);

      return N;
   }
}
1292 
1293 #endif /* MI_MATH section */
1294 
1295 /* This assumes addresses of strictly more than 32bits (aka. Gfx8+). */
1296 #if MI_BUILDER_CAN_WRITE_BATCH
1297 
/* Token returned by mi_store_relocated_imm(); it records where in the batch
 * the immediate dword(s) live so mi_relocate_store_imm() can patch the value
 * later.
 */
struct mi_reloc_imm_token {
   enum mi_value_type dst_type;
   /* ptr[0] points at the low immediate dword; ptr[1] at the high dword for
    * 64-bit destination types (unused for 32-bit ones).
    */
   uint32_t *ptr[2];
};
1302 
/* Emits an immediate write to an address/register where the immediate value
 * can be updated later via mi_relocate_store_imm().
 *
 * Depending on @dst this packs MI_STORE_DATA_IMM (memory destinations) or
 * MI_LOAD_REGISTER_IMM (register destinations) with the immediate left as
 * zero, and records pointers to the immediate dword(s) in the returned
 * token.  Pending math is flushed first, and memory destinations mark the
 * builder as having an outstanding write.
 */
static inline struct mi_reloc_imm_token
mi_store_relocated_imm(struct mi_builder *b, struct mi_value dst)
{
   mi_builder_flush_math(b);

   struct mi_reloc_imm_token token = {
      .dst_type = dst.type,
   };

   uint32_t *dw;
   switch (dst.type) {
   case MI_VALUE_TYPE_MEM32:
      dw = (uint32_t *)__gen_get_batch_dwords(b->user_data,
                                              GENX(MI_STORE_DATA_IMM_length));
      mi_builder_pack(b, GENX(MI_STORE_DATA_IMM), dw, sdm) {
         sdm.DWordLength = GENX(MI_STORE_DATA_IMM_length) -
                           GENX(MI_STORE_DATA_IMM_length_bias);
         sdm.Address = dst.addr;
      }
      token.ptr[0] = dw + GENX(MI_STORE_DATA_IMM_ImmediateData_start) / 32;
      mi_builder_set_write(b);
      break;

   case MI_VALUE_TYPE_MEM64:
      /* One extra dword for the high half of the 64-bit immediate */
      dw = (uint32_t *)__gen_get_batch_dwords(b->user_data,
                                              GENX(MI_STORE_DATA_IMM_length) + 1);
      mi_builder_pack(b, GENX(MI_STORE_DATA_IMM), dw, sdm) {
         sdm.DWordLength = GENX(MI_STORE_DATA_IMM_length) + 1 -
                           GENX(MI_STORE_DATA_IMM_length_bias);
         sdm.Address = dst.addr;
      }
      token.ptr[0] = &dw[GENX(MI_STORE_DATA_IMM_ImmediateData_start) / 32];
      token.ptr[1] = &dw[GENX(MI_STORE_DATA_IMM_ImmediateData_start) / 32 + 1];
      mi_builder_set_write(b);
      break;

   case MI_VALUE_TYPE_REG32:
      dw = (uint32_t *)__gen_get_batch_dwords(b->user_data,
                                              GENX(MI_LOAD_REGISTER_IMM_length));
      mi_builder_pack(b, GENX(MI_LOAD_REGISTER_IMM), dw, lri) {
         lri.DWordLength = GENX(MI_LOAD_REGISTER_IMM_length) -
                           GENX(MI_LOAD_REGISTER_IMM_length_bias);
         struct mi_reg_num reg = mi_adjust_reg_num(dst.reg);
#if GFX_VER >= 11
         lri.AddCSMMIOStartOffset = reg.cs;
#endif
         lri.RegisterOffset = reg.num;
      }
      token.ptr[0] = &dw[2];
      break;

   case MI_VALUE_TYPE_REG64: {
      /* Two extra dwords: a 64-bit register is loaded as two offset/value
       * pairs within a single MI_LOAD_REGISTER_IMM.
       */
      dw = (uint32_t *)__gen_get_batch_dwords(b->user_data,
                                              GENX(MI_LOAD_REGISTER_IMM_length) + 2);
      struct mi_reg_num reg = mi_adjust_reg_num(dst.reg);
      mi_builder_pack(b, GENX(MI_LOAD_REGISTER_IMM), dw, lri) {
         lri.DWordLength = GENX(MI_LOAD_REGISTER_IMM_length) + 2 -
                           GENX(MI_LOAD_REGISTER_IMM_length_bias);
#if GFX_VER >= 11
         lri.AddCSMMIOStartOffset = reg.cs;
#endif
      }
      dw[1] = reg.num;
      dw[3] = reg.num + 4;
      token.ptr[0] = &dw[2];
      token.ptr[1] = &dw[4];
      break;
   }

   default:
      unreachable("Invalid value type");
   }

   mi_value_unref(b, dst);
   return token;
}
1382 
1383 static inline void
mi_relocate_store_imm(struct mi_reloc_imm_token token,uint64_t value)1384 mi_relocate_store_imm(struct mi_reloc_imm_token token, uint64_t value)
1385 {
1386    switch (token.dst_type) {
1387    case MI_VALUE_TYPE_MEM64:
1388    case MI_VALUE_TYPE_REG64:
1389       *token.ptr[1] = value >> 32;
1390       FALLTHROUGH;
1391    case MI_VALUE_TYPE_MEM32:
1392    case MI_VALUE_TYPE_REG32:
1393       *token.ptr[0] = value & 0xffffffff;
1394       break;
1395    default:
1396       unreachable("Invalid value type");
1397    }
1398 }
1399 
/* Token for a register-to-memory write whose destination address is not
 * known at emit time; mi_resolve_relocated_address_token() fills it in.
 */
struct mi_address_token {
   /* Pointers to address memory fields in the batch. */
   uint64_t *ptrs[2];
};
1404 
/* Emits a 64bit memory write to a yet unknown address using a value from a
 * register.
 *
 * Two MI_STORE_REGISTER_MEM commands are emitted, one per 32-bit half of
 * @addr_reg.  The batch locations of their memory-address fields are
 * recorded in the returned token for later patching by
 * mi_resolve_relocated_address_token().
 */
static inline struct mi_address_token
mi_store_relocated_address_reg64(struct mi_builder *b, struct mi_value addr_reg)
{
   mi_builder_flush_math(b);

   assert(addr_reg.type == MI_VALUE_TYPE_REG64);

   struct mi_address_token token = {};

   for (unsigned i = 0; i < 2; i++) {
      mi_builder_emit(b, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = addr_reg.reg + (i * 4);

         const unsigned addr_dw =
            GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8;
         token.ptrs[i] = (uint64_t *)(mi_builder_get_inst_ptr(_dst) + addr_dw);
      }
   }

   mi_builder_set_write(b);
   mi_value_unref(b, addr_reg);
   return token;
}
1431 
/* Barrier between batch self-modification and the command streamer executing
 * the modified commands.
 *
 * @cs_prefetch_size is the CS prefetch window in bytes; enough MI_NOOPs are
 * emitted to cover it so the patched instructions are not already sitting
 * in the prefetch queue when they are reached.
 */
static inline void
mi_self_mod_barrier(struct mi_builder *b, unsigned cs_prefetch_size)
{
   /* First make sure all the memory writes from previous modifying commands
    * have landed. We want to do this before going through the CS cache,
    * otherwise we could be fetching memory that hasn't been written to yet.
    */
   mi_builder_emit(b, GENX(PIPE_CONTROL), pc) {
      pc.CommandStreamerStallEnable = true;
   }
   /* Documentation says Gfx11+ should be able to invalidate the command cache
    * but experiment show it doesn't work properly, so for now just get over
    * the CS prefetch.
    */
   for (uint32_t i = 0; i < (cs_prefetch_size / 4); i++)
      mi_builder_emit(b, GENX(MI_NOOP), noop);
}
1449 
/* Patch the two MI_STORE_REGISTER_MEM address fields recorded in @token so
 * they point at @batch_location (low 32-bit half) and @batch_location + 4
 * (high 32-bit half).
 */
static inline void
mi_resolve_relocated_address_token(struct mi_builder *b,
                                   struct mi_address_token token,
                                   void *batch_location)
{
   __gen_address_type addr = __gen_get_batch_address(b->user_data,
                                                    batch_location);
   uint64_t addr_addr_u64 = __gen_combine_address(b->user_data, batch_location,
                                                  addr, 0);
   *(token.ptrs[0]) = addr_addr_u64;
   *(token.ptrs[1]) = addr_addr_u64 + 4;
}
1462 
1463 #endif /* MI_BUILDER_CAN_WRITE_BATCH */
1464 
1465 #if GFX_VERx10 >= 125
1466 
1467 /*
1468  * Indirect load/store.  Only available on XE_HP+
1469  */
1470 
/* Load the 64-bit value at @addr + @offset into a newly allocated GPR
 * (Gfx12.5+ only).
 *
 * The address is computed in the MI_MATH accumulator (immediate base plus
 * @offset) and dereferenced with LOADIND; the trailing FENCE_RD orders the
 * read.  The caller owns the returned GPR value.
 */
MUST_CHECK static inline struct mi_value
mi_load_mem64_offset(struct mi_builder *b,
                     __gen_address_type addr, struct mi_value offset)
{
   mi_ensure_write_fence(b);

   uint64_t addr_u64 = __gen_combine_address(b->user_data, NULL, addr, 0);
   struct mi_value addr_val = mi_imm(addr_u64);

   struct mi_value dst = mi_new_gpr(b);

   uint32_t dw[5];
   dw[0] = _mi_math_load_src(b, MI_ALU_SRCA, &addr_val);
   dw[1] = _mi_math_load_src(b, MI_ALU_SRCB, &offset);
   dw[2] = _mi_pack_alu(MI_ALU_ADD, 0, 0);
   dw[3] = _mi_pack_alu(MI_ALU_LOADIND, _mi_value_as_gpr(dst), MI_ALU_ACCU);
   dw[4] = _mi_pack_alu(MI_ALU_FENCE_RD, 0, 0);
   _mi_builder_push_math(b, dw, 5);

   mi_value_unref(b, addr_val);
   mi_value_unref(b, offset);

   return dst;
}
1495 
/* Store the 64-bit value @data to @addr + @offset (Gfx12.5+ only).
 *
 * Any pending invert on @data is resolved and the value moved to a GPR.
 * The target address is computed in the MI_MATH accumulator and written
 * through with STOREIND, ordered by the trailing FENCE_WR.
 */
static inline void
mi_store_mem64_offset(struct mi_builder *b,
                          __gen_address_type addr, struct mi_value offset,
                          struct mi_value data)
{
   uint64_t addr_u64 = __gen_combine_address(b->user_data, NULL, addr, 0);
   struct mi_value addr_val = mi_imm(addr_u64);

   data = mi_value_to_gpr(b, mi_resolve_invert(b, data));

   uint32_t dw[5];
   dw[0] = _mi_math_load_src(b, MI_ALU_SRCA, &addr_val);
   dw[1] = _mi_math_load_src(b, MI_ALU_SRCB, &offset);
   dw[2] = _mi_pack_alu(MI_ALU_ADD, 0, 0);
   dw[3] = _mi_pack_alu(MI_ALU_STOREIND, MI_ALU_ACCU, _mi_value_as_gpr(data));
   dw[4] = _mi_pack_alu(MI_ALU_FENCE_WR, 0, 0);
   _mi_builder_push_math(b, dw, 5);

   mi_value_unref(b, addr_val);
   mi_value_unref(b, offset);
   mi_value_unref(b, data);

   /* This is the only math case which has side-effects outside of regular
    * registers to flush math afterwards so we don't confuse anyone.
    */
   mi_builder_flush_math(b);
   /* mi_builder_set_write() is not required here because we have a FENCE_WR
    * in the ALU instruction.
    */
}
1526 
1527 #endif /* GFX_VERx10 >= 125 */
1528 
1529 #if GFX_VER >= 9
1530 
1531 /*
1532  * Control-flow Section.  Only available on Gfx9+
1533  */
1534 
/* One forward jump recorded by mi_goto_if() before its target was placed. */
struct _mi_goto {
   bool predicated;  /* whether the jump is predicated */
   void *mi_bbs;     /* reserved MI_BATCH_BUFFER_START dwords to back-patch */
};

/* A jump target plus the forward gotos waiting for it to be placed. */
struct mi_goto_target {
   bool placed;      /* set once mi_goto_target() has recorded the address */
   unsigned num_gotos;
   struct _mi_goto gotos[8];
   __gen_address_type addr;  /* batch address of the target, once placed */
};
1546 
1547 #define MI_GOTO_TARGET_INIT ((struct mi_goto_target) {})
1548 
1549 /* On >= Gfx12.5, the predication of MI_BATCH_BUFFER_START is driven by the
1550  * bit0 of the MI_SET_PREDICATE_RESULT register.
1551  *
1552  * ACM PRMs, Vol 2a: Command Reference: Instructions, MI_BATCH_BUFFER_START,
1553  * Predication Enable:
1554  *
1555  *   "This bit is used to enable predication of this command. If this bit is
1556  *    set and Bit 0 of the MI_SET_PREDICATE_RESULT register is set, this
1557  *    command is ignored. Otherwise the command is performed normally."
1558  *
1559  * The register offset is not listed in the PRMs, but BSpec places it a
1560  * 0x2418.
1561  *
1562  * On < Gfx12.5, the predication of MI_BATCH_BUFFER_START is driven by the
1563  * bit0 of MI_PREDICATE_RESULT_1.
1564  *
1565  * SKL PRMs, Vol 2a: Command Reference: Instructions, MI_BATCH_BUFFER_START,
1566  * Predication Enable:
1567  *
1568  *    "This bit is used to enable predication of this command. If this bit is
1569  *     set and Bit 0 of the MI_PREDICATE_RESULT_1 register is clear, this
1570  *     command is ignored. Otherwise the command is performed normally.
1571  *     Specific to the Render command stream only."
1572  *
1573  * The register offset is listed in the SKL PRMs, Vol 2c: Command Reference:
1574  * Registers, MI_PREDICATE_RESULT_1, at 0x241C.
1575  */
1576 #if GFX_VERx10 >= 125
1577 #define MI_BUILDER_MI_PREDICATE_RESULT_num  0x2418
1578 #else
1579 #define MI_BUILDER_MI_PREDICATE_RESULT_num  0x241C
1580 #endif
1581 
/* Emit a conditional jump to @t, taken when @cond is non-zero.
 *
 * Immediate conditions fold: 0 emits nothing, ~0 emits an unpredicated
 * jump (any other immediate value is a caller bug and asserts).  Other
 * conditions are copied into the per-gen MI_PREDICATE_RESULT register
 * (unless @cond already is that register, in which case the client's
 * predicate is used as-is) and the MI_BATCH_BUFFER_START is predicated.
 * If @t has not been placed yet, batch dwords are reserved and recorded so
 * mi_goto_target() can pack the jump later.  Predicate state is reset
 * afterwards so following commands execute normally.
 */
static inline void
mi_goto_if(struct mi_builder *b, struct mi_value cond,
           struct mi_goto_target *t)
{
   /* First, set up the predicate, if any */
   bool predicated;
   if (cond.type == MI_VALUE_TYPE_IMM) {
      /* If it's an immediate, the goto either doesn't happen or happens
       * unconditionally.
       */
      if (mi_value_to_u64(cond) == 0)
         return;

      assert(mi_value_to_u64(cond) == ~0ull);
      predicated = false;
   } else if (mi_value_is_reg(cond) &&
              cond.reg == MI_BUILDER_MI_PREDICATE_RESULT_num) {
      /* If it's MI_PREDICATE_RESULT, we use whatever predicate the client
       * provided us with
       */
      assert(cond.type == MI_VALUE_TYPE_REG32);
      predicated = true;
   } else {
      mi_store(b, mi_reg32(MI_BUILDER_MI_PREDICATE_RESULT_num), cond);
      predicated = true;
   }

#if GFX_VERx10 >= 125
   if (predicated) {
      mi_builder_emit(b, GENX(MI_SET_PREDICATE), sp) {
         sp.PredicateEnable = NOOPOnResultClear;
      }
   }
#endif
   if (t->placed) {
      /* Target address known: emit the jump directly */
      mi_builder_emit(b, GENX(MI_BATCH_BUFFER_START), bbs) {
         bbs.PredicationEnable         = predicated;
         bbs.AddressSpaceIndicator     = ASI_PPGTT;
         bbs.BatchBufferStartAddress   = t->addr;
      }
   } else {
      /* Forward jump: reserve space now, pack in mi_goto_target() */
      assert(t->num_gotos < ARRAY_SIZE(t->gotos));
      struct _mi_goto g = {
         .predicated = predicated,
         .mi_bbs = __gen_get_batch_dwords(b->user_data,
                                          GENX(MI_BATCH_BUFFER_START_length)),
      };
      memset(g.mi_bbs, 0, 4 * GENX(MI_BATCH_BUFFER_START_length));
      t->gotos[t->num_gotos++] = g;
   }
   if (predicated) {
      /* Reset predication so subsequent commands are unaffected */
#if GFX_VERx10 >= 125
      mi_builder_emit(b, GENX(MI_SET_PREDICATE), sp) {
         sp.PredicateEnable = NOOPNever;
      }
#else
      mi_store(b, mi_reg32(MI_BUILDER_MI_PREDICATE_RESULT_num), mi_imm(0));
#endif
   }
}
1642 
/* Unconditional jump to @t (an all-ones immediate condition always jumps). */
static inline void
mi_goto(struct mi_builder *b, struct mi_goto_target *t)
{
   mi_goto_if(b, mi_imm(-1), t);
}
1648 
/* Place the target of mi_goto()/mi_goto_if() jumps at the current batch
 * location.
 *
 * The predicate is reset first, so a jump landing here never executes in a
 * predicated state.  The batch address is recorded and every forward jump
 * reserved by earlier mi_goto_if() calls is back-patched with a packed
 * MI_BATCH_BUFFER_START pointing at it.
 */
static inline void
mi_goto_target(struct mi_builder *b, struct mi_goto_target *t)
{
#if GFX_VERx10 >= 125
   mi_builder_emit(b, GENX(MI_SET_PREDICATE), sp) {
      sp.PredicateEnable = NOOPNever;
      t->addr = __gen_get_batch_address(b->user_data,
                                        mi_builder_get_inst_ptr(b));
   }
#else
   mi_builder_emit(b, GENX(MI_NOOP), sp) {
      t->addr = __gen_get_batch_address(b->user_data,
                                        mi_builder_get_inst_ptr(b));
   }
   mi_store(b, mi_reg32(MI_BUILDER_MI_PREDICATE_RESULT_num), mi_imm(0));
#endif
   t->placed = true;

   /* Back-patch all pending forward jumps with the now-known address */
   struct GENX(MI_BATCH_BUFFER_START) bbs = { GENX(MI_BATCH_BUFFER_START_header) };
   bbs.AddressSpaceIndicator     = ASI_PPGTT;
   bbs.BatchBufferStartAddress   = t->addr;

   for (unsigned i = 0; i < t->num_gotos; i++) {
      bbs.PredicationEnable = t->gotos[i].predicated;
      GENX(MI_BATCH_BUFFER_START_pack)(b->user_data, t->gotos[i].mi_bbs, &bbs);
   }
}
1676 
/* Create a goto target and immediately place it at the current batch
 * location (used by mi_loop() for the continue target).
 */
static inline struct mi_goto_target
mi_goto_target_init_and_place(struct mi_builder *b)
{
   struct mi_goto_target t = MI_GOTO_TARGET_INIT;
   mi_goto_target(b, &t);
   return t;
}
1684 
/* Structured loop helpers built on mi_goto()/mi_goto_target().
 *
 * mi_loop(b) { ... } places the continue target at the top of the body;
 * when the body falls through, it jumps back to the top and then places the
 * break target after the loop.  mi_break()/mi_continue() and their _if
 * variants may only be used inside a mi_loop() body, where the __break and
 * __continue targets are in scope.
 */
#define mi_loop(b) \
   for (struct mi_goto_target __break = MI_GOTO_TARGET_INIT, \
        __continue = mi_goto_target_init_and_place(b); !__break.placed; \
        mi_goto(b, &__continue), mi_goto_target(b, &__break))

#define mi_break(b) mi_goto(b, &__break)
#define mi_break_if(b, cond) mi_goto_if(b, cond, &__break)
#define mi_continue(b) mi_goto(b, &__continue)
#define mi_continue_if(b, cond) mi_goto_if(b, cond, &__continue)
1694 
1695 #endif /* GFX_VER >= 9 */
1696 
1697 #endif /* MI_BUILDER_H */
1698