/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef MI_BUILDER_H
#define MI_BUILDER_H

#include "dev/intel_device_info.h"
#include "genxml/genX_bits.h"
#include "util/bitscan.h"
#include "util/fast_idiv_by_const.h"
#include "util/u_math.h"

#ifndef MI_BUILDER_NUM_ALLOC_GPRS
/** The number of GPRs the MI builder is allowed to allocate
 *
 * This may be set by a user of this API so that it can reserve some GPRs at
 * the top end for its own use.
 */
#define MI_BUILDER_NUM_ALLOC_GPRS 16
#endif

#ifndef MI_BUILDER_DEFAULT_WRITE_CHECK
#define MI_BUILDER_DEFAULT_WRITE_CHECK true
#endif

#ifndef MI_BUILDER_RAW_MEM_FENCING
#define MI_BUILDER_RAW_MEM_FENCING GFX_VER >= 20
#endif
/** These must be defined by the user of the builder
 *
 * void *__gen_get_batch_dwords(__gen_user_data *user_data,
 *                              unsigned num_dwords);
 *
 * __gen_address_type
 * __gen_address_offset(__gen_address_type addr, uint64_t offset);
 *
 *
 * If self-modifying batches are supported, we must be able to pass batch
 * addresses around as void*s, so pinning, batch chaining, or some other
 * mechanism for ensuring batch pointers remain valid during building is
 * required. The following function, which returns an address in canonical
 * form, must also be defined:
 *
 * __gen_address_type
 * __gen_get_batch_address(__gen_user_data *user_data, void *location);
 *
 * Also, __gen_combine_address must accept a location value of NULL and return
 * a fully valid 64-bit address.
 */
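
/* Illustrative sketch (not part of this header): a driver embedding the
 * builder typically implements the callbacks above over its own batch
 * structure. The struct and field names below are hypothetical placeholders,
 * assuming a pinned batch buffer and an address type with a byte offset:
 *
 *    struct example_batch {
 *       uint32_t *next;   // write pointer into a pinned batch BO
 *       uint32_t *end;    // end of the current batch buffer
 *    };
 *
 *    static void *
 *    __gen_get_batch_dwords(__gen_user_data *user_data, unsigned num_dwords)
 *    {
 *       struct example_batch *batch = (struct example_batch *)user_data;
 *       assert(batch->next + num_dwords <= batch->end);
 *       void *ptr = batch->next;
 *       batch->next += num_dwords;
 *       return ptr;
 *    }
 *
 *    static __gen_address_type
 *    __gen_address_offset(__gen_address_type addr, uint64_t offset)
 *    {
 *       addr.offset += offset;   // assuming a {bo, offset} address type
 *       return addr;
 *    }
 */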

/**
 * On Gfx20+ this must also be defined by the user of the builder
 *
 * bool *
 * __gen_get_write_fencing_status(__gen_user_data *user_data);
 *
 * Returns a pointer to a boolean tracking the status of fencing for MI
 * commands writing to memory.
 */

/*
 * Start of the actual MI builder
 */

#define __genxml_cmd_length(cmd) cmd ## _length
#define __genxml_cmd_header(cmd) cmd ## _header
#define __genxml_cmd_pack(cmd) cmd ## _pack

#define mi_builder_pack(b, cmd, dst, name)                              \
   for (struct cmd name = { __genxml_cmd_header(cmd) },                 \
        *_dst = (struct cmd *)(dst); __builtin_expect(_dst != NULL, 1); \
        __genxml_cmd_pack(cmd)((b)->user_data, (void *)_dst, &name),    \
        _dst = NULL)

/* Get the instruction pointer inside a mi_builder_pack() block */
#define mi_builder_get_inst_ptr(b) \
   ((uint8_t *)_dst)

#define mi_builder_emit(b, cmd, name) \
   mi_builder_pack((b), cmd, __gen_get_batch_dwords((b)->user_data, __genxml_cmd_length(cmd)), name)
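
/* Example (illustrative): emitting a command via the pack helpers. The body
 * of the block sets genxml fields on a stack copy which is packed into the
 * batch when the block exits; any GENX() instruction works the same way.
 *
 *    mi_builder_emit(b, GENX(PIPE_CONTROL), pc) {
 *       pc.CommandStreamerStallEnable = true;
 *    }
 */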

enum mi_value_type {
   MI_VALUE_TYPE_IMM,
   MI_VALUE_TYPE_MEM32,
   MI_VALUE_TYPE_MEM64,
   MI_VALUE_TYPE_REG32,
   MI_VALUE_TYPE_REG64,
};

struct mi_value {
   enum mi_value_type type;

   union {
      uint64_t imm;
      __gen_address_type addr;
      uint32_t reg;
   };

#if GFX_VERx10 >= 75
   bool invert;
#endif
};

struct mi_reg_num {
   uint32_t num;
#if GFX_VER >= 11
   bool cs;
#endif
};

static inline struct mi_reg_num
mi_adjust_reg_num(uint32_t reg)
{
#if GFX_VER >= 11
   bool cs = reg >= 0x2000 && reg < 0x4000;
   return (struct mi_reg_num) {
      .num = reg - (cs ? 0x2000 : 0),
      .cs = cs,
   };
#else
   return (struct mi_reg_num) { .num = reg, };
#endif
}
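
/* For example, on Gfx11+ the GPR at MMIO offset 0x2600 becomes
 * { .num = 0x600, .cs = true }, i.e. an offset relative to the command
 * streamer's MMIO base, while a register outside [0x2000, 0x4000) is passed
 * through unchanged.
 */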

#if GFX_VER >= 9
#define MI_BUILDER_MAX_MATH_DWORDS 256
#else
#define MI_BUILDER_MAX_MATH_DWORDS 64
#endif

struct mi_builder {
   const struct intel_device_info *devinfo;
   __gen_user_data *user_data;

   bool no_read_write_fencing;

#if GFX_VERx10 >= 75
   uint32_t gprs;
   uint8_t gpr_refs[MI_BUILDER_NUM_ALLOC_GPRS];

   unsigned num_math_dwords;
   uint32_t math_dwords[MI_BUILDER_MAX_MATH_DWORDS];
#endif

#if GFX_VERx10 >= 125
   uint32_t mocs;
#endif

#if GFX_VER >= 12
   bool write_check;
#endif
};

static inline void
mi_builder_init(struct mi_builder *b,
                const struct intel_device_info *devinfo,
                __gen_user_data *user_data)
{
   memset(b, 0, sizeof(*b));
   b->devinfo = devinfo;
   b->user_data = user_data;

#if GFX_VER >= 12
   b->write_check = MI_BUILDER_DEFAULT_WRITE_CHECK;
#endif
   b->no_read_write_fencing = false;
#if GFX_VERx10 >= 75
   b->gprs = 0;
   b->num_math_dwords = 0;
#endif
}
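
/* Typical usage (illustrative): initialize a builder once per batch, then
 * build values and store them with the helpers defined below. `addr` is a
 * hypothetical __gen_address_type.
 *
 *    struct mi_builder b;
 *    mi_builder_init(&b, devinfo, user_data);
 *    mi_store(&b, mi_mem32(addr), mi_imm(42));
 */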

static inline void
mi_builder_flush_math(struct mi_builder *b)
{
#if GFX_VERx10 >= 75
   if (b->num_math_dwords == 0)
      return;

   uint32_t *dw = (uint32_t *)__gen_get_batch_dwords(b->user_data,
                                                     1 + b->num_math_dwords);
   mi_builder_pack(b, GENX(MI_MATH), dw, math) {
#if GFX_VERx10 >= 125
      math.MOCS = b->mocs;
#endif
      math.DWordLength = 1 + b->num_math_dwords - GENX(MI_MATH_length_bias);
   }
   memcpy(dw + 1, b->math_dwords, b->num_math_dwords * sizeof(uint32_t));
   b->num_math_dwords = 0;
#endif
}

/**
 * Set the MOCS index on the mi_builder
 *
 * This is required when an MI_MATH instruction will be emitted and the code
 * is used on GFX 12.5 or newer.
 */
static inline void
mi_builder_set_mocs(UNUSED struct mi_builder *b, UNUSED uint32_t mocs)
{
#if GFX_VERx10 >= 125
   if (b->mocs != 0 && b->mocs != mocs)
      mi_builder_flush_math(b);
   b->mocs = mocs;
#endif
}

/**
 * Set write checks on immediate writes
 *
 * This ensures that the next memory write will complete only after all
 * previously emitted memory writes have completed.
 */
static inline void
mi_builder_set_write_check(UNUSED struct mi_builder *b, UNUSED bool check)
{
#if GFX_VER >= 12
   b->write_check = check;
#endif
}

static inline bool
mi_builder_write_checked(UNUSED struct mi_builder *b)
{
#if GFX_VER >= 12
   return b->write_check;
#else
   return false;
#endif
}

#define _MI_BUILDER_GPR_BASE 0x2600
/* The actual hardware limit on GPRs */
#define _MI_BUILDER_NUM_HW_GPRS 16

#if GFX_VERx10 >= 75

static inline bool
mi_value_is_reg(struct mi_value val)
{
   return val.type == MI_VALUE_TYPE_REG32 ||
          val.type == MI_VALUE_TYPE_REG64;
}

static inline bool
mi_value_is_gpr(struct mi_value val)
{
   return mi_value_is_reg(val) &&
          val.reg >= _MI_BUILDER_GPR_BASE &&
          val.reg < _MI_BUILDER_GPR_BASE +
                    _MI_BUILDER_NUM_HW_GPRS * 8;
}

static inline bool
_mi_value_is_allocated_gpr(struct mi_value val)
{
   return mi_value_is_reg(val) &&
          val.reg >= _MI_BUILDER_GPR_BASE &&
          val.reg < _MI_BUILDER_GPR_BASE +
                    MI_BUILDER_NUM_ALLOC_GPRS * 8;
}

static inline uint32_t
_mi_value_as_gpr(struct mi_value val)
{
   assert(mi_value_is_gpr(val));
   /* Some of the GRL metakernels will generate a 64-bit value in a GP
    * register, then use only half of that as the last operation on that
    * value. So allow unref on part of a GP register.
    */
   assert(val.reg % 4 == 0);
   return (val.reg - _MI_BUILDER_GPR_BASE) / 8;
}

static inline struct mi_value
mi_new_gpr(struct mi_builder *b)
{
   unsigned gpr = ffs(~b->gprs) - 1;
   assert(gpr < MI_BUILDER_NUM_ALLOC_GPRS);
   assert(b->gpr_refs[gpr] == 0);
   b->gprs |= (1u << gpr);
   b->gpr_refs[gpr] = 1;

   return (struct mi_value) {
      .type = MI_VALUE_TYPE_REG64,
      .reg = _MI_BUILDER_GPR_BASE + gpr * 8,
   };
}

static inline struct mi_value
mi_reserve_gpr(struct mi_builder *b, unsigned gpr)
{
   assert(gpr < MI_BUILDER_NUM_ALLOC_GPRS);
   assert(!(b->gprs & (1 << gpr)));
   assert(b->gpr_refs[gpr] == 0);
   b->gprs |= (1u << gpr);
   b->gpr_refs[gpr] = 128; /* Enough that we won't unref it */

   return (struct mi_value) {
      .type = MI_VALUE_TYPE_REG64,
      .reg = _MI_BUILDER_GPR_BASE + gpr * 8,
   };
}
#endif /* GFX_VERx10 >= 75 */

/** Take a reference to a mi_value
 *
 * The MI builder uses reference counting to automatically free ALU GPRs for
 * re-use in calculations. All mi_* math functions consume the reference
 * they are handed for each source and return a reference to a value which the
 * caller must consume. In particular, if you pass the same value into a
 * single mi_* math function twice (say to add a number to itself), you
 * are responsible for calling mi_value_ref() to get a second reference
 * because the mi_* math function will consume it twice.
 */
static inline void
mi_value_add_refs(struct mi_builder *b, struct mi_value val, unsigned num_refs)
{
#if GFX_VERx10 >= 75
   if (_mi_value_is_allocated_gpr(val)) {
      unsigned gpr = _mi_value_as_gpr(val);
      assert(gpr < MI_BUILDER_NUM_ALLOC_GPRS);
      assert(b->gprs & (1u << gpr));
      assert(b->gpr_refs[gpr] < UINT8_MAX);
      b->gpr_refs[gpr] += num_refs;
   }
#endif /* GFX_VERx10 >= 75 */
}

static inline struct mi_value
mi_value_ref(struct mi_builder *b, struct mi_value val)
{
   mi_value_add_refs(b, val, 1);
   return val;
}
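
/* Illustrative example: adding a value to itself. mi_iadd() (defined below)
 * consumes one reference per source, so the same value must be referenced
 * twice; dst_addr is a hypothetical __gen_address_type.
 *
 *    struct mi_value v = mi_mem32(src_addr);
 *    struct mi_value doubled =
 *       mi_iadd(b, v, mi_value_ref(b, v));        // consumes both refs
 *    mi_store(b, mi_mem32(dst_addr), doubled);    // consumes doubled
 */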

/** Drop a reference to a mi_value
 *
 * See also mi_value_ref.
 */
static inline void
mi_value_unref(struct mi_builder *b, struct mi_value val)
{
#if GFX_VERx10 >= 75
   if (_mi_value_is_allocated_gpr(val)) {
      unsigned gpr = _mi_value_as_gpr(val);
      assert(gpr < MI_BUILDER_NUM_ALLOC_GPRS);
      assert(b->gprs & (1u << gpr));
      assert(b->gpr_refs[gpr] > 0);
      if (--b->gpr_refs[gpr] == 0)
         b->gprs &= ~(1u << gpr);
   }
#endif /* GFX_VERx10 >= 75 */
}

/* On Gfx20+, memory reads/writes can be processed out of order, so we need
 * to track the writes to memory to make sure any memory read will see the
 * effect of a previous write.
 */
static inline void
mi_builder_set_write(struct mi_builder *b)
{
#if MI_BUILDER_RAW_MEM_FENCING
   *__gen_get_write_fencing_status(b->user_data) = true;
#endif
}

static inline void
mi_ensure_write_fence(struct mi_builder *b)
{
#if MI_BUILDER_RAW_MEM_FENCING
   if (!b->no_read_write_fencing &&
       *__gen_get_write_fencing_status(b->user_data)) {
      mi_builder_emit(b, GENX(MI_MEM_FENCE), fence)
         fence.FenceType = FENCE_TYPE_MI_WRITE;
      *__gen_get_write_fencing_status(b->user_data) = false;
   }
#endif
}

static inline struct mi_value
mi_imm(uint64_t imm)
{
   return (struct mi_value) {
      .type = MI_VALUE_TYPE_IMM,
      .imm = imm,
   };
}

static inline struct mi_value
mi_reg32(uint32_t reg)
{
   struct mi_value val = {
      .type = MI_VALUE_TYPE_REG32,
      .reg = reg,
   };
#if GFX_VERx10 >= 75
   assert(!_mi_value_is_allocated_gpr(val));
#endif
   return val;
}

static inline struct mi_value
mi_reg64(uint32_t reg)
{
   struct mi_value val = {
      .type = MI_VALUE_TYPE_REG64,
      .reg = reg,
   };
#if GFX_VERx10 >= 75
   assert(!_mi_value_is_allocated_gpr(val));
#endif
   return val;
}

static inline struct mi_value
mi_mem32(__gen_address_type addr)
{
   return (struct mi_value) {
      .type = MI_VALUE_TYPE_MEM32,
      .addr = addr,
   };
}

static inline struct mi_value
mi_mem64(__gen_address_type addr)
{
   return (struct mi_value) {
      .type = MI_VALUE_TYPE_MEM64,
      .addr = addr,
   };
}

static inline struct mi_value
mi_value_half(struct mi_value value, bool top_32_bits)
{
   switch (value.type) {
   case MI_VALUE_TYPE_IMM:
      if (top_32_bits)
         value.imm >>= 32;
      else
         value.imm &= 0xffffffffu;
      return value;

   case MI_VALUE_TYPE_MEM32:
      assert(!top_32_bits);
      return value;

   case MI_VALUE_TYPE_MEM64:
      if (top_32_bits)
         value.addr = __gen_address_offset(value.addr, 4);
      value.type = MI_VALUE_TYPE_MEM32;
      return value;

   case MI_VALUE_TYPE_REG32:
      assert(!top_32_bits);
      return value;

   case MI_VALUE_TYPE_REG64:
      if (top_32_bits)
         value.reg += 4;
      value.type = MI_VALUE_TYPE_REG32;
      return value;
   }

   unreachable("Invalid mi_value type");
}
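
/* For example, given a 64-bit register value r at MMIO offset 0x2600,
 * mi_value_half(r, false) is a REG32 at 0x2600 (low dword) and
 * mi_value_half(r, true) is a REG32 at 0x2604 (high dword).
 */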

static inline void
_mi_copy_no_unref(struct mi_builder *b,
                  struct mi_value dst, struct mi_value src)
{
#if GFX_VERx10 >= 75
   /* TODO: We could handle src.invert by emitting a bit of math if we really
    * wanted to.
    */
   assert(!dst.invert && !src.invert);
#endif
   mi_builder_flush_math(b);

   if (src.type == MI_VALUE_TYPE_MEM64 ||
       src.type == MI_VALUE_TYPE_MEM32)
      mi_ensure_write_fence(b);

   switch (dst.type) {
   case MI_VALUE_TYPE_IMM:
      unreachable("Cannot copy to an immediate");

   case MI_VALUE_TYPE_MEM64:
   case MI_VALUE_TYPE_REG64:
      switch (src.type) {
      case MI_VALUE_TYPE_IMM:
         if (dst.type == MI_VALUE_TYPE_REG64) {
            uint32_t *dw = (uint32_t *)__gen_get_batch_dwords(b->user_data,
                                                              GENX(MI_LOAD_REGISTER_IMM_length) + 2);
            struct mi_reg_num reg = mi_adjust_reg_num(dst.reg);
            mi_builder_pack(b, GENX(MI_LOAD_REGISTER_IMM), dw, lri) {
               lri.DWordLength = GENX(MI_LOAD_REGISTER_IMM_length) + 2 -
                                 GENX(MI_LOAD_REGISTER_IMM_length_bias);
#if GFX_VER >= 11
               lri.AddCSMMIOStartOffset = reg.cs;
#endif
            }
            dw[1] = reg.num;
            dw[2] = src.imm;
            dw[3] = reg.num + 4;
            dw[4] = src.imm >> 32;
         } else {
#if GFX_VER >= 8
            assert(dst.type == MI_VALUE_TYPE_MEM64);
            uint32_t *dw = (uint32_t *)__gen_get_batch_dwords(b->user_data,
                                                              GENX(MI_STORE_DATA_IMM_length) + 1);
            mi_builder_pack(b, GENX(MI_STORE_DATA_IMM), dw, sdi) {
               sdi.DWordLength = GENX(MI_STORE_DATA_IMM_length) + 1 -
                                 GENX(MI_STORE_DATA_IMM_length_bias);
               sdi.StoreQword = true;
               sdi.Address = dst.addr;
#if GFX_VER >= 12
               sdi.ForceWriteCompletionCheck = b->write_check;
#endif
            }
            dw[3] = src.imm;
            dw[4] = src.imm >> 32;
#else
            _mi_copy_no_unref(b, mi_value_half(dst, false),
                                 mi_value_half(src, false));
            _mi_copy_no_unref(b, mi_value_half(dst, true),
                                 mi_value_half(src, true));
#endif
         }
         break;
      case MI_VALUE_TYPE_REG32:
      case MI_VALUE_TYPE_MEM32:
         _mi_copy_no_unref(b, mi_value_half(dst, false),
                              mi_value_half(src, false));
         _mi_copy_no_unref(b, mi_value_half(dst, true),
                              mi_imm(0));
         break;
      case MI_VALUE_TYPE_REG64:
      case MI_VALUE_TYPE_MEM64:
         _mi_copy_no_unref(b, mi_value_half(dst, false),
                              mi_value_half(src, false));
         _mi_copy_no_unref(b, mi_value_half(dst, true),
                              mi_value_half(src, true));
         break;
      default:
         unreachable("Invalid mi_value type");
      }
      break;

   case MI_VALUE_TYPE_MEM32:
      switch (src.type) {
      case MI_VALUE_TYPE_IMM:
         mi_builder_emit(b, GENX(MI_STORE_DATA_IMM), sdi) {
            sdi.Address = dst.addr;
#if GFX_VER >= 12
            sdi.ForceWriteCompletionCheck = b->write_check;
#endif
            sdi.ImmediateData = src.imm;
         }
         break;

      case MI_VALUE_TYPE_MEM32:
      case MI_VALUE_TYPE_MEM64:
#if GFX_VER >= 8
         mi_builder_emit(b, GENX(MI_COPY_MEM_MEM), cmm) {
            cmm.DestinationMemoryAddress = dst.addr;
            cmm.SourceMemoryAddress = src.addr;
         }
#elif GFX_VERx10 == 75
         {
            struct mi_value tmp = mi_new_gpr(b);
            _mi_copy_no_unref(b, tmp, src);
            _mi_copy_no_unref(b, dst, tmp);
            mi_value_unref(b, tmp);
         }
#else
         unreachable("Cannot do mem <-> mem copy on IVB and earlier");
#endif
         break;

      case MI_VALUE_TYPE_REG32:
      case MI_VALUE_TYPE_REG64:
         mi_builder_emit(b, GENX(MI_STORE_REGISTER_MEM), srm) {
            struct mi_reg_num reg = mi_adjust_reg_num(src.reg);
            srm.RegisterAddress = reg.num;
#if GFX_VER >= 11
            srm.AddCSMMIOStartOffset = reg.cs;
#endif
            srm.MemoryAddress = dst.addr;
         }
         break;

      default:
         unreachable("Invalid mi_value type");
      }
      break;

   case MI_VALUE_TYPE_REG32:
      switch (src.type) {
      case MI_VALUE_TYPE_IMM:
         mi_builder_emit(b, GENX(MI_LOAD_REGISTER_IMM), lri) {
            struct mi_reg_num reg = mi_adjust_reg_num(dst.reg);
            lri.RegisterOffset = reg.num;
#if GFX_VER >= 11
            lri.AddCSMMIOStartOffset = reg.cs;
#endif
            lri.DataDWord = src.imm;
         }
         break;

      case MI_VALUE_TYPE_MEM32:
      case MI_VALUE_TYPE_MEM64:
#if GFX_VER >= 7
         mi_builder_emit(b, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            struct mi_reg_num reg = mi_adjust_reg_num(dst.reg);
            lrm.RegisterAddress = reg.num;
#if GFX_VER >= 11
            lrm.AddCSMMIOStartOffset = reg.cs;
#endif
            lrm.MemoryAddress = src.addr;
         }
#else
         unreachable("Cannot do mem -> reg copy on SNB and earlier");
#endif
         break;

      case MI_VALUE_TYPE_REG32:
      case MI_VALUE_TYPE_REG64:
#if GFX_VERx10 >= 75
         if (src.reg != dst.reg) {
            mi_builder_emit(b, GENX(MI_LOAD_REGISTER_REG), lrr) {
               struct mi_reg_num reg = mi_adjust_reg_num(src.reg);
               lrr.SourceRegisterAddress = reg.num;
#if GFX_VER >= 11
               lrr.AddCSMMIOStartOffsetSource = reg.cs;
#endif
               reg = mi_adjust_reg_num(dst.reg);
               lrr.DestinationRegisterAddress = reg.num;
#if GFX_VER >= 11
               lrr.AddCSMMIOStartOffsetDestination = reg.cs;
#endif
            }
         }
#else
         unreachable("Cannot do reg <-> reg copy on IVB and earlier");
#endif
         break;

      default:
         unreachable("Invalid mi_value type");
      }
      break;

   default:
      unreachable("Invalid mi_value type");
   }

   if (dst.type == MI_VALUE_TYPE_MEM64 ||
       dst.type == MI_VALUE_TYPE_MEM32) {
      /* Write-checked immediate writes already wait for prior writes, so no
       * need to do additional fencing later.
       */
      if (src.type != MI_VALUE_TYPE_IMM || !mi_builder_write_checked(b))
         mi_builder_set_write(b);
   }
}

#if GFX_VERx10 >= 75
static inline struct mi_value
mi_resolve_invert(struct mi_builder *b, struct mi_value src);
#endif

/** Store the value in src to the value represented by dst
 *
 * If the bit sizes of src and dst mismatch, this function does an unsigned
 * integer cast. If src has more bits than dst, it takes the bottom bits. If
 * src has fewer bits than dst, it fills the top bits with zeros.
 *
 * This function consumes one reference for each of src and dst.
 */
static inline void
mi_store(struct mi_builder *b, struct mi_value dst, struct mi_value src)
{
#if GFX_VERx10 >= 75
   src = mi_resolve_invert(b, src);
#endif
   _mi_copy_no_unref(b, dst, src);
   mi_value_unref(b, src);
   mi_value_unref(b, dst);
}
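
/* Usage sketch: since all operands are mi_values, the same call covers
 * reg/mem/imm combinations; src_addr and dst_addr are hypothetical.
 *
 *    mi_store(b, mi_mem64(dst_addr), mi_mem64(src_addr));  // mem -> mem
 *    mi_store(b, mi_mem32(dst_addr), mi_imm(0xdeadbeef));  // imm -> mem
 */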

static inline void
mi_memset(struct mi_builder *b, __gen_address_type dst,
          uint32_t value, uint32_t size)
{
#if GFX_VERx10 >= 75
   assert(b->num_math_dwords == 0);
#endif

   /* This memset operates in units of dwords. */
   assert(size % 4 == 0);

   for (uint32_t i = 0; i < size; i += 4) {
      mi_store(b, mi_mem32(__gen_address_offset(dst, i)),
                  mi_imm(value));
   }
}

/* NOTE: On IVB, this function stomps GFX7_3DPRIM_BASE_VERTEX */
static inline void
mi_memcpy(struct mi_builder *b, __gen_address_type dst,
          __gen_address_type src, uint32_t size)
{
#if GFX_VERx10 >= 75
   assert(b->num_math_dwords == 0);
#endif

   /* Flush once only */
   mi_ensure_write_fence(b);
   b->no_read_write_fencing = true;

   /* Hold off write checks until the last write. */
   bool write_check = mi_builder_write_checked(b);
   mi_builder_set_write_check(b, false);

   /* This memcpy operates in units of dwords. */
   assert(size % 4 == 0);

   for (uint32_t i = 0; i < size; i += 4) {
      if (i == size - 4)
         mi_builder_set_write_check(b, write_check);

      struct mi_value dst_val = mi_mem32(__gen_address_offset(dst, i));
      struct mi_value src_val = mi_mem32(__gen_address_offset(src, i));
#if GFX_VERx10 >= 75
      mi_store(b, dst_val, src_val);
#else
      /* IVB does not have a general purpose register for command streamer
       * commands. Therefore, we use an alternate temporary register.
       */
      struct mi_value tmp_reg = mi_reg32(0x2440); /* GFX7_3DPRIM_BASE_VERTEX */
      mi_store(b, tmp_reg, src_val);
      mi_store(b, dst_val, tmp_reg);
#endif
   }

   b->no_read_write_fencing = false;
}

/*
 * MI_MATH Section. Only available on Haswell+
 */

#if GFX_VERx10 >= 75

/**
 * Perform a predicated store (assuming the condition is already loaded
 * in the MI_PREDICATE_RESULT register) of the value in src to the memory
 * location specified by dst. Non-memory destinations are not supported.
 *
 * This function consumes one reference for each of src and dst.
 */
static inline void
mi_store_if(struct mi_builder *b, struct mi_value dst, struct mi_value src)
{
   assert(!dst.invert && !src.invert);

   mi_builder_flush_math(b);

   /* We can only predicate MI_STORE_REGISTER_MEM, so restrict the
    * destination to be memory, and resolve the source to a temporary
    * register if it isn't in one already.
    */
   assert(dst.type == MI_VALUE_TYPE_MEM64 ||
          dst.type == MI_VALUE_TYPE_MEM32);

   if (src.type != MI_VALUE_TYPE_REG32 &&
       src.type != MI_VALUE_TYPE_REG64) {
      struct mi_value tmp = mi_new_gpr(b);
      _mi_copy_no_unref(b, tmp, src);
      src = tmp;
   }

   if (dst.type == MI_VALUE_TYPE_MEM64) {
      mi_builder_emit(b, GENX(MI_STORE_REGISTER_MEM), srm) {
         struct mi_reg_num reg = mi_adjust_reg_num(src.reg);
         srm.RegisterAddress = reg.num;
#if GFX_VER >= 11
         srm.AddCSMMIOStartOffset = reg.cs;
#endif
         srm.MemoryAddress = dst.addr;
         srm.PredicateEnable = true;
      }
      mi_builder_emit(b, GENX(MI_STORE_REGISTER_MEM), srm) {
         struct mi_reg_num reg = mi_adjust_reg_num(src.reg + 4);
         srm.RegisterAddress = reg.num;
#if GFX_VER >= 11
         srm.AddCSMMIOStartOffset = reg.cs;
#endif
         srm.MemoryAddress = __gen_address_offset(dst.addr, 4);
         srm.PredicateEnable = true;
      }
   } else {
      mi_builder_emit(b, GENX(MI_STORE_REGISTER_MEM), srm) {
         struct mi_reg_num reg = mi_adjust_reg_num(src.reg);
         srm.RegisterAddress = reg.num;
#if GFX_VER >= 11
         srm.AddCSMMIOStartOffset = reg.cs;
#endif
         srm.MemoryAddress = dst.addr;
         srm.PredicateEnable = true;
      }
   }

   mi_builder_set_write(b);

   mi_value_unref(b, src);
   mi_value_unref(b, dst);
}

static inline void
_mi_builder_push_math(struct mi_builder *b,
                      const uint32_t *dwords,
                      unsigned num_dwords)
{
   assert(num_dwords < MI_BUILDER_MAX_MATH_DWORDS);
   if (b->num_math_dwords + num_dwords > MI_BUILDER_MAX_MATH_DWORDS)
      mi_builder_flush_math(b);

   memcpy(&b->math_dwords[b->num_math_dwords],
          dwords, num_dwords * sizeof(*dwords));
   b->num_math_dwords += num_dwords;
}

static inline uint32_t
_mi_pack_alu(uint32_t opcode, uint32_t operand1, uint32_t operand2)
{
   struct GENX(MI_MATH_ALU_INSTRUCTION) instr = {
      .Operand2 = operand2,
      .Operand1 = operand1,
      .ALUOpcode = opcode,
   };

   uint32_t dw;
   GENX(MI_MATH_ALU_INSTRUCTION_pack)(NULL, &dw, &instr);

   return dw;
}

static inline struct mi_value
mi_value_to_gpr(struct mi_builder *b, struct mi_value val)
{
   if (mi_value_is_gpr(val))
      return val;

   /* Save off the invert flag because it makes copy() grumpy */
   bool invert = val.invert;
   val.invert = false;

   struct mi_value tmp = mi_new_gpr(b);
   _mi_copy_no_unref(b, tmp, val);
   tmp.invert = invert;

   return tmp;
}

static inline uint64_t
mi_value_to_u64(struct mi_value val)
{
   assert(val.type == MI_VALUE_TYPE_IMM);
   return val.invert ? ~val.imm : val.imm;
}

static inline uint32_t
_mi_math_load_src(struct mi_builder *b, unsigned src, struct mi_value *val)
{
   if (val->type == MI_VALUE_TYPE_IMM &&
       (val->imm == 0 || val->imm == UINT64_MAX)) {
      uint64_t imm = val->invert ? ~val->imm : val->imm;
      return _mi_pack_alu(imm ? MI_ALU_LOAD1 : MI_ALU_LOAD0, src, 0);
   } else {
      *val = mi_value_to_gpr(b, *val);
      return _mi_pack_alu(val->invert ? MI_ALU_LOADINV : MI_ALU_LOAD,
                          src, _mi_value_as_gpr(*val));
   }
}

static inline struct mi_value
mi_math_binop(struct mi_builder *b, uint32_t opcode,
              struct mi_value src0, struct mi_value src1,
              uint32_t store_op, uint32_t store_src)
{
   struct mi_value dst = mi_new_gpr(b);

   uint32_t dw[4];
   dw[0] = _mi_math_load_src(b, MI_ALU_SRCA, &src0);
   dw[1] = _mi_math_load_src(b, MI_ALU_SRCB, &src1);
   dw[2] = _mi_pack_alu(opcode, 0, 0);
   dw[3] = _mi_pack_alu(store_op, _mi_value_as_gpr(dst), store_src);
   _mi_builder_push_math(b, dw, 4);

   mi_value_unref(b, src0);
   mi_value_unref(b, src1);

   return dst;
}

static inline struct mi_value
mi_inot(struct mi_builder *b, struct mi_value val)
{
   if (val.type == MI_VALUE_TYPE_IMM)
      return mi_imm(~mi_value_to_u64(val));

   val.invert = !val.invert;
   return val;
}

static inline struct mi_value
mi_resolve_invert(struct mi_builder *b, struct mi_value src)
{
   if (!src.invert)
      return src;

   assert(src.type != MI_VALUE_TYPE_IMM);
   return mi_math_binop(b, MI_ALU_ADD, src, mi_imm(0),
                        MI_ALU_STORE, MI_ALU_ACCU);
}

static inline struct mi_value
mi_iadd(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src0) + mi_value_to_u64(src1));

   return mi_math_binop(b, MI_ALU_ADD, src0, src1,
                        MI_ALU_STORE, MI_ALU_ACCU);
}
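
/* For reference, a single mi_iadd() of two GPR values appends four ALU
 * dwords to the pending MI_MATH batch, following the hardware's
 * load/load/op/store pattern:
 *
 *    LOAD  SRCA, gpr(src0)
 *    LOAD  SRCB, gpr(src1)
 *    ADD
 *    STORE gpr(dst), ACCU
 *
 * The dwords accumulate in b->math_dwords and are emitted as a single
 * MI_MATH command by mi_builder_flush_math().
 */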

static inline struct mi_value
mi_iadd_imm(struct mi_builder *b,
            struct mi_value src, uint64_t N)
{
   if (N == 0)
      return src;

   return mi_iadd(b, src, mi_imm(N));
}

static inline struct mi_value
mi_isub(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src0) - mi_value_to_u64(src1));

   return mi_math_binop(b, MI_ALU_SUB, src0, src1,
                        MI_ALU_STORE, MI_ALU_ACCU);
}

static inline struct mi_value
mi_ieq(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src0) == mi_value_to_u64(src1) ? ~0ull : 0);

   /* Compute "equal" by subtracting and storing the zero bit */
   return mi_math_binop(b, MI_ALU_SUB, src0, src1,
                        MI_ALU_STORE, MI_ALU_ZF);
}

static inline struct mi_value
mi_ine(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src0) != mi_value_to_u64(src1) ? ~0ull : 0);

   /* Compute "not equal" by subtracting and storing the inverse zero bit */
   return mi_math_binop(b, MI_ALU_SUB, src0, src1,
                        MI_ALU_STOREINV, MI_ALU_ZF);
}

static inline struct mi_value
mi_ult(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src0) < mi_value_to_u64(src1) ? ~0ull : 0);

   /* Compute "less than" by subtracting and storing the carry bit */
   return mi_math_binop(b, MI_ALU_SUB, src0, src1,
                        MI_ALU_STORE, MI_ALU_CF);
}

static inline struct mi_value
mi_uge(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src0) >= mi_value_to_u64(src1) ? ~0ull : 0);

   /* Compute "greater than or equal" by subtracting and storing the inverse
    * carry bit
    */
   return mi_math_binop(b, MI_ALU_SUB, src0, src1,
                        MI_ALU_STOREINV, MI_ALU_CF);
}

static inline struct mi_value
mi_iand(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src0) & mi_value_to_u64(src1));

   return mi_math_binop(b, MI_ALU_AND, src0, src1,
                        MI_ALU_STORE, MI_ALU_ACCU);
}

static inline struct mi_value
mi_nz(struct mi_builder *b, struct mi_value src)
{
   if (src.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src) != 0 ? ~0ull : 0);

   return mi_math_binop(b, MI_ALU_ADD, src, mi_imm(0),
                        MI_ALU_STOREINV, MI_ALU_ZF);
}

static inline struct mi_value
mi_z(struct mi_builder *b, struct mi_value src)
{
   if (src.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src) == 0 ? ~0ull : 0);

   return mi_math_binop(b, MI_ALU_ADD, src, mi_imm(0),
                        MI_ALU_STORE, MI_ALU_ZF);
}

static inline struct mi_value
mi_ior(struct mi_builder *b,
       struct mi_value src0, struct mi_value src1)
{
   if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src0) | mi_value_to_u64(src1));

   return mi_math_binop(b, MI_ALU_OR, src0, src1,
                        MI_ALU_STORE, MI_ALU_ACCU);
}

#if GFX_VERx10 >= 125
static inline struct mi_value
mi_ishl(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   if (src1.type == MI_VALUE_TYPE_IMM) {
      assert(util_is_power_of_two_or_zero(mi_value_to_u64(src1)));
      assert(mi_value_to_u64(src1) <= 32);
   }

   if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src0) << mi_value_to_u64(src1));

   return mi_math_binop(b, MI_ALU_SHL, src0, src1,
                        MI_ALU_STORE, MI_ALU_ACCU);
}

static inline struct mi_value
mi_ushr(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   if (src1.type == MI_VALUE_TYPE_IMM) {
      assert(util_is_power_of_two_or_zero(mi_value_to_u64(src1)));
      assert(mi_value_to_u64(src1) <= 32);
   }

   if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src0) >> mi_value_to_u64(src1));

   return mi_math_binop(b, MI_ALU_SHR, src0, src1,
                        MI_ALU_STORE, MI_ALU_ACCU);
}

static inline struct mi_value
mi_ushr_imm(struct mi_builder *b, struct mi_value src, uint32_t shift)
{
   if (shift == 0)
      return src;

   if (shift >= 64)
      return mi_imm(0);

   if (src.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src) >> shift);

   struct mi_value res = mi_value_to_gpr(b, src);

   /* Annoyingly, we only have power-of-two shifts */
   while (shift) {
      int bit = u_bit_scan(&shift);
      assert(bit <= 5);
      res = mi_ushr(b, res, mi_imm(1ULL << bit));
   }

   return res;
}

static inline struct mi_value
mi_ishr(struct mi_builder *b, struct mi_value src0, struct mi_value src1)
{
   if (src1.type == MI_VALUE_TYPE_IMM) {
      assert(util_is_power_of_two_or_zero(mi_value_to_u64(src1)));
      assert(mi_value_to_u64(src1) <= 32);
   }

   if (src0.type == MI_VALUE_TYPE_IMM && src1.type == MI_VALUE_TYPE_IMM)
      return mi_imm((int64_t)mi_value_to_u64(src0) >> mi_value_to_u64(src1));

   return mi_math_binop(b, MI_ALU_SAR, src0, src1,
                        MI_ALU_STORE, MI_ALU_ACCU);
}

static inline struct mi_value
mi_ishr_imm(struct mi_builder *b, struct mi_value src, uint32_t shift)
{
   if (shift == 0)
      return src;

   if (shift >= 64)
      return mi_imm(0);

   if (src.type == MI_VALUE_TYPE_IMM)
      return mi_imm((int64_t)mi_value_to_u64(src) >> shift);

   struct mi_value res = mi_value_to_gpr(b, src);

   /* Annoyingly, we only have power-of-two shifts */
   while (shift) {
      int bit = u_bit_scan(&shift);
      assert(bit <= 5);
      res = mi_ishr(b, res, mi_imm(1 << bit));
   }

   return res;
}
#endif /* if GFX_VERx10 >= 125 */
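
/* The power-of-two decomposition above turns an arbitrary shift amount into
 * at most six hardware shifts. For example, shift == 12 (0b1100) becomes a
 * shift by 4 (bit 2) followed by a shift by 8 (bit 3).
 */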

static inline struct mi_value
mi_imul_imm(struct mi_builder *b, struct mi_value src, uint32_t N)
{
   if (src.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src) * N);

   if (N == 0) {
      mi_value_unref(b, src);
      return mi_imm(0);
   }

   if (N == 1)
      return src;

   src = mi_value_to_gpr(b, src);

   struct mi_value res = mi_value_ref(b, src);

   unsigned top_bit = 31 - __builtin_clz(N);
   for (int i = top_bit - 1; i >= 0; i--) {
      res = mi_iadd(b, res, mi_value_ref(b, res));
      if (N & (1 << i))
         res = mi_iadd(b, res, mi_value_ref(b, src));
   }

   mi_value_unref(b, src);

   return res;
}
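
/* mi_imul_imm() uses a classic double-and-add decomposition, walking the
 * bits of N from the top. For example, N == 5 (0b101, top_bit == 2) expands
 * as:
 *
 *    res = src                      // initial reference
 *    i = 1: res = res + res         // res == 2 * src
 *    i = 0: res = res + res         // res == 4 * src
 *           res = res + src         // bit 0 set: res == 5 * src
 */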

static inline struct mi_value
mi_ishl_imm(struct mi_builder *b, struct mi_value src, uint32_t shift)
{
   if (shift == 0)
      return src;

   if (shift >= 64)
      return mi_imm(0);

   if (src.type == MI_VALUE_TYPE_IMM)
      return mi_imm(mi_value_to_u64(src) << shift);

   struct mi_value res = mi_value_to_gpr(b, src);

#if GFX_VERx10 >= 125
   /* Annoyingly, we only have power-of-two shifts */
   while (shift) {
      int bit = u_bit_scan(&shift);
      assert(bit <= 5);
      res = mi_ishl(b, res, mi_imm(1 << bit));
   }
#else
   for (unsigned i = 0; i < shift; i++)
      res = mi_iadd(b, res, mi_value_ref(b, res));
#endif

   return res;
}

static inline struct mi_value
mi_ushr32_imm(struct mi_builder *b, struct mi_value src, uint32_t shift)
{
   if (shift == 0)
      return src;

   if (shift >= 64)
      return mi_imm(0);

   /* We right-shift by left-shifting by 32 - shift and taking the top 32
    * bits of the result.
    */
   if (src.type == MI_VALUE_TYPE_IMM)
      return mi_imm((mi_value_to_u64(src) >> shift) & UINT32_MAX);

   if (shift > 32) {
      struct mi_value tmp = mi_new_gpr(b);
      _mi_copy_no_unref(b, mi_value_half(tmp, false),
                           mi_value_half(src, true));
      _mi_copy_no_unref(b, mi_value_half(tmp, true), mi_imm(0));
      mi_value_unref(b, src);
      src = tmp;
      shift -= 32;
   }
   assert(shift <= 32);
   struct mi_value tmp = mi_ishl_imm(b, src, 32 - shift);
   struct mi_value dst = mi_new_gpr(b);
   _mi_copy_no_unref(b, mi_value_half(dst, false),
                        mi_value_half(tmp, true));
   _mi_copy_no_unref(b, mi_value_half(dst, true), mi_imm(0));
   mi_value_unref(b, tmp);
   return dst;
}

static inline struct mi_value
mi_udiv32_imm(struct mi_builder *b, struct mi_value N, uint32_t D)
{
   if (N.type == MI_VALUE_TYPE_IMM) {
      assert(mi_value_to_u64(N) <= UINT32_MAX);
      return mi_imm(mi_value_to_u64(N) / D);
   }

   /* We implicitly assume that N is only a 32-bit value */
   if (D == 0) {
      /* This is invalid but we should do something */
      return mi_imm(0);
   } else if (util_is_power_of_two_or_zero(D)) {
      return mi_ushr32_imm(b, N, util_logbase2(D));
   } else {
      struct util_fast_udiv_info m = util_compute_fast_udiv_info(D, 32, 32);
      assert(m.multiplier <= UINT32_MAX);

      if (m.pre_shift)
         N = mi_ushr32_imm(b, N, m.pre_shift);

      /* Do the 32x32 multiply into gpr0 */
      N = mi_imul_imm(b, N, m.multiplier);

      if (m.increment)
         N = mi_iadd(b, N, mi_imm(m.multiplier));

      N = mi_ushr32_imm(b, N, 32);

      if (m.post_shift)
         N = mi_ushr32_imm(b, N, m.post_shift);

      return N;
   }
}
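
/* As a concrete illustration: for D == 3 the classic 32-bit reciprocal is
 * 0xAAAAAAAB with a post-shift of 1 (and no pre-shift or increment), so the
 * emitted sequence computes, for any 32-bit N:
 *
 *    N / 3 == ((N * 0xAAAAAAAB) >> 32) >> 1
 *
 * The exact parameters come from util_compute_fast_udiv_info(); the values
 * above are the standard ones for this divisor.
 */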

#endif /* MI_MATH section */

/* This assumes addresses of strictly more than 32 bits (aka Gfx8+). */
#if MI_BUILDER_CAN_WRITE_BATCH

struct mi_reloc_imm_token {
   enum mi_value_type dst_type;
   uint32_t *ptr[2];
};

/* Emits an immediate write to an address/register where the immediate value
 * can be updated later.
 */
static inline struct mi_reloc_imm_token
mi_store_relocated_imm(struct mi_builder *b, struct mi_value dst)
{
   mi_builder_flush_math(b);

   struct mi_reloc_imm_token token = {
      .dst_type = dst.type,
   };

   uint32_t *dw;
   switch (dst.type) {
   case MI_VALUE_TYPE_MEM32:
      dw = (uint32_t *)__gen_get_batch_dwords(b->user_data,
                                              GENX(MI_STORE_DATA_IMM_length));
      mi_builder_pack(b, GENX(MI_STORE_DATA_IMM), dw, sdm) {
         sdm.DWordLength = GENX(MI_STORE_DATA_IMM_length) -
                           GENX(MI_STORE_DATA_IMM_length_bias);
         sdm.Address = dst.addr;
      }
      token.ptr[0] = dw + GENX(MI_STORE_DATA_IMM_ImmediateData_start) / 32;
      mi_builder_set_write(b);
      break;

   case MI_VALUE_TYPE_MEM64:
      dw = (uint32_t *)__gen_get_batch_dwords(b->user_data,
                                              GENX(MI_STORE_DATA_IMM_length) + 1);
      mi_builder_pack(b, GENX(MI_STORE_DATA_IMM), dw, sdm) {
         sdm.DWordLength = GENX(MI_STORE_DATA_IMM_length) + 1 -
                           GENX(MI_STORE_DATA_IMM_length_bias);
         sdm.Address = dst.addr;
      }
      token.ptr[0] = &dw[GENX(MI_STORE_DATA_IMM_ImmediateData_start) / 32];
      token.ptr[1] = &dw[GENX(MI_STORE_DATA_IMM_ImmediateData_start) / 32 + 1];
      mi_builder_set_write(b);
      break;

   case MI_VALUE_TYPE_REG32:
      dw = (uint32_t *)__gen_get_batch_dwords(b->user_data,
                                              GENX(MI_LOAD_REGISTER_IMM_length));
      mi_builder_pack(b, GENX(MI_LOAD_REGISTER_IMM), dw, lri) {
         lri.DWordLength = GENX(MI_LOAD_REGISTER_IMM_length) -
                           GENX(MI_LOAD_REGISTER_IMM_length_bias);
         struct mi_reg_num reg = mi_adjust_reg_num(dst.reg);
#if GFX_VER >= 11
         lri.AddCSMMIOStartOffset = reg.cs;
#endif
         lri.RegisterOffset = reg.num;
      }
      token.ptr[0] = &dw[2];
      break;

   case MI_VALUE_TYPE_REG64: {
      dw = (uint32_t *)__gen_get_batch_dwords(b->user_data,
                                              GENX(MI_LOAD_REGISTER_IMM_length) + 2);
      struct mi_reg_num reg = mi_adjust_reg_num(dst.reg);
      mi_builder_pack(b, GENX(MI_LOAD_REGISTER_IMM), dw, lri) {
         lri.DWordLength = GENX(MI_LOAD_REGISTER_IMM_length) + 2 -
                           GENX(MI_LOAD_REGISTER_IMM_length_bias);
#if GFX_VER >= 11
         lri.AddCSMMIOStartOffset = reg.cs;
#endif
      }
      dw[1] = reg.num;
      dw[3] = reg.num + 4;
      token.ptr[0] = &dw[2];
      token.ptr[1] = &dw[4];
      break;
   }

   default:
      unreachable("Invalid value type");
   }

   mi_value_unref(b, dst);
   return token;
}

static inline void
mi_relocate_store_imm(struct mi_reloc_imm_token token, uint64_t value)
{
   switch (token.dst_type) {
   case MI_VALUE_TYPE_MEM64:
   case MI_VALUE_TYPE_REG64:
      *token.ptr[1] = value >> 32;
      FALLTHROUGH;
   case MI_VALUE_TYPE_MEM32:
   case MI_VALUE_TYPE_REG32:
      *token.ptr[0] = value & 0xffffffff;
      break;
   default:
      unreachable("Invalid value type");
   }
}
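
/* Usage sketch: record the token while building the batch, then patch the
 * immediate once the final value is known (before the batch executes);
 * `addr` and `final_value` are hypothetical.
 *
 *    struct mi_reloc_imm_token token =
 *       mi_store_relocated_imm(b, mi_mem64(addr));
 *    ...
 *    mi_relocate_store_imm(token, final_value);   // patches the batch
 */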

struct mi_address_token {
   /* Pointers to address memory fields in the batch. */
   uint64_t *ptrs[2];
};

/* Emits a 64-bit memory write to a yet-unknown address using a value from a
 * register
 */
static inline struct mi_address_token
mi_store_relocated_address_reg64(struct mi_builder *b, struct mi_value addr_reg)
{
   mi_builder_flush_math(b);

   assert(addr_reg.type == MI_VALUE_TYPE_REG64);

   struct mi_address_token token = {};

   for (unsigned i = 0; i < 2; i++) {
      mi_builder_emit(b, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = addr_reg.reg + (i * 4);

         const unsigned addr_dw =
            GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8;
         token.ptrs[i] = (uint64_t *)(mi_builder_get_inst_ptr(b) + addr_dw);
      }
   }

   mi_builder_set_write(b);
   mi_value_unref(b, addr_reg);
   return token;
}

static inline void
mi_self_mod_barrier(struct mi_builder *b, unsigned cs_prefetch_size)
{
   /* First make sure all the memory writes from previous modifying commands
    * have landed. We want to do this before going through the CS cache,
    * otherwise we could be fetching memory that hasn't been written to yet.
    */
   mi_builder_emit(b, GENX(PIPE_CONTROL), pc) {
      pc.CommandStreamerStallEnable = true;
   }
   /* Documentation says Gfx11+ should be able to invalidate the command
    * cache but experiments show it doesn't work properly, so for now just
    * pad past the CS prefetch.
    */
   for (uint32_t i = 0; i < (cs_prefetch_size / 4); i++)
      mi_builder_emit(b, GENX(MI_NOOP), noop);
}

static inline void
mi_resolve_relocated_address_token(struct mi_builder *b,
                                   struct mi_address_token token,
                                   void *batch_location)
{
   __gen_address_type addr = __gen_get_batch_address(b->user_data,
                                                     batch_location);
   uint64_t addr_addr_u64 = __gen_combine_address(b->user_data, batch_location,
                                                  addr, 0);
   *(token.ptrs[0]) = addr_addr_u64;
   *(token.ptrs[1]) = addr_addr_u64 + 4;
}

#endif /* MI_BUILDER_CAN_WRITE_BATCH */

#if GFX_VERx10 >= 125

/*
 * Indirect load/store. Only available on XE_HP+
 */

MUST_CHECK static inline struct mi_value
mi_load_mem64_offset(struct mi_builder *b,
                     __gen_address_type addr, struct mi_value offset)
{
   mi_ensure_write_fence(b);

   uint64_t addr_u64 = __gen_combine_address(b->user_data, NULL, addr, 0);
   struct mi_value addr_val = mi_imm(addr_u64);

   struct mi_value dst = mi_new_gpr(b);

   uint32_t dw[5];
   dw[0] = _mi_math_load_src(b, MI_ALU_SRCA, &addr_val);
   dw[1] = _mi_math_load_src(b, MI_ALU_SRCB, &offset);
   dw[2] = _mi_pack_alu(MI_ALU_ADD, 0, 0);
   dw[3] = _mi_pack_alu(MI_ALU_LOADIND, _mi_value_as_gpr(dst), MI_ALU_ACCU);
   dw[4] = _mi_pack_alu(MI_ALU_FENCE_RD, 0, 0);
   _mi_builder_push_math(b, dw, 5);

   mi_value_unref(b, addr_val);
   mi_value_unref(b, offset);

   return dst;
}

static inline void
mi_store_mem64_offset(struct mi_builder *b,
                      __gen_address_type addr, struct mi_value offset,
                      struct mi_value data)
{
   uint64_t addr_u64 = __gen_combine_address(b->user_data, NULL, addr, 0);
   struct mi_value addr_val = mi_imm(addr_u64);

   data = mi_value_to_gpr(b, mi_resolve_invert(b, data));

   uint32_t dw[5];
   dw[0] = _mi_math_load_src(b, MI_ALU_SRCA, &addr_val);
   dw[1] = _mi_math_load_src(b, MI_ALU_SRCB, &offset);
   dw[2] = _mi_pack_alu(MI_ALU_ADD, 0, 0);
   dw[3] = _mi_pack_alu(MI_ALU_STOREIND, MI_ALU_ACCU, _mi_value_as_gpr(data));
   dw[4] = _mi_pack_alu(MI_ALU_FENCE_WR, 0, 0);
   _mi_builder_push_math(b, dw, 5);

   mi_value_unref(b, addr_val);
   mi_value_unref(b, offset);
   mi_value_unref(b, data);

   /* This is the only math case which has side-effects outside of regular
    * registers, so flush the math afterwards so we don't confuse anyone.
    */
   mi_builder_flush_math(b);
   /* mi_builder_set_write() is not required here because we have a FENCE_WR
    * in the ALU instruction.
    */
}

#endif /* GFX_VERx10 >= 125 */
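
/* Illustrative use of the indirect forms (a sketch, assuming hypothetical
 * `array_addr`, `out_addr`, and an index value `idx` already in a GPR):
 * copy entry `idx` of a uint64_t array, scaling the index to bytes first.
 *
 *    struct mi_value byte_off = mi_imul_imm(b, idx, sizeof(uint64_t));
 *    struct mi_value elem =
 *       mi_load_mem64_offset(b, array_addr, mi_value_ref(b, byte_off));
 *    mi_store_mem64_offset(b, out_addr, byte_off, elem);
 */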

#if GFX_VER >= 9

/*
 * Control-flow Section. Only available on Gfx9+
 */

struct _mi_goto {
   bool predicated;
   void *mi_bbs;
};

struct mi_goto_target {
   bool placed;
   unsigned num_gotos;
   struct _mi_goto gotos[8];
   __gen_address_type addr;
};

#define MI_GOTO_TARGET_INIT ((struct mi_goto_target) {})

/* On >= Gfx12.5, the predication of MI_BATCH_BUFFER_START is driven by bit 0
 * of the MI_SET_PREDICATE_RESULT register.
 *
 * ACM PRMs, Vol 2a: Command Reference: Instructions, MI_BATCH_BUFFER_START,
 * Predication Enable:
 *
 *    "This bit is used to enable predication of this command. If this bit is
 *     set and Bit 0 of the MI_SET_PREDICATE_RESULT register is set, this
 *     command is ignored. Otherwise the command is performed normally."
 *
 * The register offset is not listed in the PRMs, but BSpec places it at
 * 0x2418.
 *
 * On < Gfx12.5, the predication of MI_BATCH_BUFFER_START is driven by bit 0
 * of MI_PREDICATE_RESULT_1.
 *
 * SKL PRMs, Vol 2a: Command Reference: Instructions, MI_BATCH_BUFFER_START,
 * Predication Enable:
 *
 *    "This bit is used to enable predication of this command. If this bit is
 *     set and Bit 0 of the MI_PREDICATE_RESULT_1 register is clear, this
 *     command is ignored. Otherwise the command is performed normally.
 *     Specific to the Render command stream only."
 *
 * The register offset is listed in the SKL PRMs, Vol 2c: Command Reference:
 * Registers, MI_PREDICATE_RESULT_1, at 0x241C.
 */
#if GFX_VERx10 >= 125
#define MI_BUILDER_MI_PREDICATE_RESULT_num 0x2418
#else
#define MI_BUILDER_MI_PREDICATE_RESULT_num 0x241C
#endif

static inline void
mi_goto_if(struct mi_builder *b, struct mi_value cond,
           struct mi_goto_target *t)
{
   /* First, set up the predicate, if any */
   bool predicated;
   if (cond.type == MI_VALUE_TYPE_IMM) {
      /* If it's an immediate, the goto either doesn't happen or happens
       * unconditionally.
       */
      if (mi_value_to_u64(cond) == 0)
         return;

      assert(mi_value_to_u64(cond) == ~0ull);
      predicated = false;
   } else if (mi_value_is_reg(cond) &&
              cond.reg == MI_BUILDER_MI_PREDICATE_RESULT_num) {
      /* If it's MI_PREDICATE_RESULT, we use whatever predicate the client
       * provided us with
       */
      assert(cond.type == MI_VALUE_TYPE_REG32);
      predicated = true;
   } else {
      mi_store(b, mi_reg32(MI_BUILDER_MI_PREDICATE_RESULT_num), cond);
      predicated = true;
   }

#if GFX_VERx10 >= 125
   if (predicated) {
      mi_builder_emit(b, GENX(MI_SET_PREDICATE), sp) {
         sp.PredicateEnable = NOOPOnResultClear;
      }
   }
#endif
   if (t->placed) {
      mi_builder_emit(b, GENX(MI_BATCH_BUFFER_START), bbs) {
         bbs.PredicationEnable = predicated;
         bbs.AddressSpaceIndicator = ASI_PPGTT;
         bbs.BatchBufferStartAddress = t->addr;
      }
   } else {
      assert(t->num_gotos < ARRAY_SIZE(t->gotos));
      struct _mi_goto g = {
         .predicated = predicated,
         .mi_bbs = __gen_get_batch_dwords(b->user_data,
                                          GENX(MI_BATCH_BUFFER_START_length)),
      };
      memset(g.mi_bbs, 0, 4 * GENX(MI_BATCH_BUFFER_START_length));
      t->gotos[t->num_gotos++] = g;
   }
   if (predicated) {
#if GFX_VERx10 >= 125
      mi_builder_emit(b, GENX(MI_SET_PREDICATE), sp) {
         sp.PredicateEnable = NOOPNever;
      }
#else
      mi_store(b, mi_reg32(MI_BUILDER_MI_PREDICATE_RESULT_num), mi_imm(0));
#endif
   }
}

static inline void
mi_goto(struct mi_builder *b, struct mi_goto_target *t)
{
   mi_goto_if(b, mi_imm(-1), t);
}

static inline void
mi_goto_target(struct mi_builder *b, struct mi_goto_target *t)
{
#if GFX_VERx10 >= 125
   mi_builder_emit(b, GENX(MI_SET_PREDICATE), sp) {
      sp.PredicateEnable = NOOPNever;
      t->addr = __gen_get_batch_address(b->user_data,
                                        mi_builder_get_inst_ptr(b));
   }
#else
   mi_builder_emit(b, GENX(MI_NOOP), sp) {
      t->addr = __gen_get_batch_address(b->user_data,
                                        mi_builder_get_inst_ptr(b));
   }
   mi_store(b, mi_reg32(MI_BUILDER_MI_PREDICATE_RESULT_num), mi_imm(0));
#endif
   t->placed = true;

   struct GENX(MI_BATCH_BUFFER_START) bbs = { GENX(MI_BATCH_BUFFER_START_header) };
   bbs.AddressSpaceIndicator = ASI_PPGTT;
   bbs.BatchBufferStartAddress = t->addr;

   for (unsigned i = 0; i < t->num_gotos; i++) {
      bbs.PredicationEnable = t->gotos[i].predicated;
      GENX(MI_BATCH_BUFFER_START_pack)(b->user_data, t->gotos[i].mi_bbs, &bbs);
   }
}

static inline struct mi_goto_target
mi_goto_target_init_and_place(struct mi_builder *b)
{
   struct mi_goto_target t = MI_GOTO_TARGET_INIT;
   mi_goto_target(b, &t);
   return t;
}

#define mi_loop(b) \
   for (struct mi_goto_target __break = MI_GOTO_TARGET_INIT, \
        __continue = mi_goto_target_init_and_place(b); !__break.placed; \
        mi_goto(b, &__continue), mi_goto_target(b, &__break))

#define mi_break(b) mi_goto(b, &__break)
#define mi_break_if(b, cond) mi_goto_if(b, cond, &__break)
#define mi_continue(b) mi_goto(b, &__continue)
#define mi_continue_if(b, cond) mi_goto_if(b, cond, &__continue)
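
/* Illustrative GPU-side loop (a sketch, assuming GPR index 0 is free to
 * reserve): count a reserved-GPR counter down to zero. The batch is built
 * once; the command streamer re-executes the body, so the updated counter
 * must be stored back into the same GPR each iteration.
 *
 *    struct mi_value counter = mi_reserve_gpr(b, 0);
 *    mi_store(b, mi_value_ref(b, counter), mi_imm(16));
 *    mi_loop(b) {
 *       mi_break_if(b, mi_z(b, mi_value_ref(b, counter)));
 *       mi_store(b, mi_value_ref(b, counter),
 *                mi_isub(b, mi_value_ref(b, counter), mi_imm(1)));
 *    }
 */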

#endif /* GFX_VER >= 9 */

#endif /* MI_BUILDER_H */