1 /*
2 * Copyright © 2013 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <unistd.h>
30
31 #include "util/libdrm.h"
32
33 #include "intel_device_info.h"
34 #include "intel_wa.h"
35 #include "i915/intel_device_info.h"
36 #include "xe/intel_device_info.h"
37
38 #include "common/intel_gem.h"
39 #include "util/u_debug.h"
40 #include "util/log.h"
41 #include "util/macros.h"
42
/* Short platform names accepted by intel_device_name_to_pci_device_id(),
 * each paired with the PCI device ID that is returned for it.
 */
static const struct {
   const char *name;   /* short platform name, matched exactly */
   int pci_id;         /* PCI device ID reported for that name */
} name_map[] = {
   { "lpt", 0x27a2 },
   { "brw", 0x2a02 },
   { "g4x", 0x2a42 },
   { "ilk", 0x0042 },
   { "snb", 0x0126 },
   { "ivb", 0x016a },
   { "hsw", 0x0d2e },
   { "byt", 0x0f33 },
   { "bdw", 0x162e },
   { "chv", 0x22B3 },
   { "skl", 0x1912 },
   { "bxt", 0x5A85 },
   { "kbl", 0x5912 },
   { "aml", 0x591C },
   { "glk", 0x3185 },
   { "cfl", 0x3E9B },
   { "whl", 0x3EA1 },
   { "cml", 0x9b41 },
   { "icl", 0x8a52 },
   { "ehl", 0x4571 },
   { "jsl", 0x4E71 },
   { "tgl", 0x9a49 },
   { "rkl", 0x4c8a },
   { "dg1", 0x4905 },
   { "adl", 0x4680 },
   { "sg1", 0x4907 },
   { "rpl", 0xa780 },
   { "dg2", 0x5690 },
   { "mtl", 0x7d60 },
   { "arl", 0x7d67 },
   { "lnl", 0x64a0 },
   { "bmg", 0xe202 },
};
80
81 /**
82 * Get the PCI ID for the device name.
83 *
84 * Returns -1 if the device is not known.
85 */
86 int
intel_device_name_to_pci_device_id(const char * name)87 intel_device_name_to_pci_device_id(const char *name)
88 {
89 for (unsigned i = 0; i < ARRAY_SIZE(name_map); i++) {
90 if (!strcmp(name_map[i].name, name))
91 return name_map[i].pci_id;
92 }
93
94 return -1;
95 }
96
97 static const struct intel_device_info intel_device_info_gfx3 = {
98 .ver = 3,
99 .platform = INTEL_PLATFORM_GFX3,
100 .simulator_id = -1,
101 .num_slices = 1,
102 .num_subslices = { 1, },
103 .max_eus_per_subslice = 8,
104 .num_thread_per_eu = 4,
105 .grf_size = 32,
106 .timestamp_frequency = 12500000,
107 };
108
109 static const struct intel_device_info intel_device_info_i965 = {
110 .ver = 4,
111 .platform = INTEL_PLATFORM_I965,
112 .has_negative_rhw_bug = true,
113 .num_slices = 1,
114 .num_subslices = { 1, },
115 .max_eus_per_subslice = 8,
116 .num_thread_per_eu = 4,
117 .grf_size = 32,
118 .max_vs_threads = 16,
119 .max_gs_threads = 2,
120 .max_wm_threads = 8 * 4,
121 .urb = {
122 .size = 256,
123 },
124 .timestamp_frequency = 12500000,
125 .simulator_id = -1,
126 };
127
128 static const struct intel_device_info intel_device_info_g4x = {
129 .ver = 4,
130 .verx10 = 45,
131 .has_pln = true,
132 .has_compr4 = true,
133 .has_surface_tile_offset = true,
134 .platform = INTEL_PLATFORM_G4X,
135 .num_slices = 1,
136 .num_subslices = { 1, },
137 .max_eus_per_subslice = 10,
138 .num_thread_per_eu = 5,
139 .grf_size = 32,
140 .max_vs_threads = 32,
141 .max_gs_threads = 2,
142 .max_wm_threads = 10 * 5,
143 .urb = {
144 .size = 384,
145 },
146 .timestamp_frequency = 12500000,
147 .simulator_id = -1,
148 };
149
150 static const struct intel_device_info intel_device_info_ilk = {
151 .ver = 5,
152 .platform = INTEL_PLATFORM_ILK,
153 .has_pln = true,
154 .has_compr4 = true,
155 .has_surface_tile_offset = true,
156 .num_slices = 1,
157 .num_subslices = { 1, },
158 .max_eus_per_subslice = 12,
159 .num_thread_per_eu = 6,
160 .grf_size = 32,
161 .max_vs_threads = 72,
162 .max_gs_threads = 32,
163 .max_wm_threads = 12 * 6,
164 .urb = {
165 .size = 1024,
166 },
167 .timestamp_frequency = 12500000,
168 .simulator_id = -1,
169 };
170
171 static const struct intel_device_info intel_device_info_snb_gt1 = {
172 .ver = 6,
173 .gt = 1,
174 .platform = INTEL_PLATFORM_SNB,
175 .has_hiz_and_separate_stencil = true,
176 .has_llc = true,
177 .has_pln = true,
178 .has_surface_tile_offset = true,
179 .needs_unlit_centroid_workaround = true,
180 .num_slices = 1,
181 .num_subslices = { 1, },
182 .max_eus_per_subslice = 6,
183 .num_thread_per_eu = 6, /* Not confirmed */
184 .grf_size = 32,
185 .max_vs_threads = 24,
186 .max_gs_threads = 21, /* conservative; 24 if rendering disabled. */
187 .max_wm_threads = 40,
188 .urb = {
189 .size = 32,
190 .min_entries = {
191 [MESA_SHADER_VERTEX] = 24,
192 },
193 .max_entries = {
194 [MESA_SHADER_VERTEX] = 256,
195 [MESA_SHADER_GEOMETRY] = 256,
196 },
197 },
198 .timestamp_frequency = 12500000,
199 .simulator_id = -1,
200 };
201
202 static const struct intel_device_info intel_device_info_snb_gt2 = {
203 .ver = 6,
204 .gt = 2,
205 .platform = INTEL_PLATFORM_SNB,
206 .has_hiz_and_separate_stencil = true,
207 .has_llc = true,
208 .has_pln = true,
209 .has_surface_tile_offset = true,
210 .needs_unlit_centroid_workaround = true,
211 .num_slices = 1,
212 .num_subslices = { 1, },
213 .max_eus_per_subslice = 12,
214 .num_thread_per_eu = 6, /* Not confirmed */
215 .grf_size = 32,
216 .max_vs_threads = 60,
217 .max_gs_threads = 60,
218 .max_wm_threads = 80,
219 .urb = {
220 .size = 64,
221 .min_entries = {
222 [MESA_SHADER_VERTEX] = 24,
223 },
224 .max_entries = {
225 [MESA_SHADER_VERTEX] = 256,
226 [MESA_SHADER_GEOMETRY] = 256,
227 },
228 },
229 .timestamp_frequency = 12500000,
230 .simulator_id = -1,
231 };
232
/* Designated initializers shared by every ver 7 table in this file (used
 * directly by the ivb and byt tables and, through HSW_FEATURES, by hsw).
 * Must be expanded inside a struct intel_device_info initializer; fields
 * listed after the expansion override the values given here.
 */
#define GFX7_FEATURES \
   .ver = 7, \
   .has_hiz_and_separate_stencil = true, \
   .must_use_separate_stencil = true, \
   .has_llc = true, \
   .has_pln = true, \
   .has_64bit_float = true, \
   .has_surface_tile_offset = true, \
   .grf_size = 32, \
   .timestamp_frequency = 12500000, \
   .max_constant_urb_size_kb = 16
244
245 static const struct intel_device_info intel_device_info_ivb_gt1 = {
246 GFX7_FEATURES, .platform = INTEL_PLATFORM_IVB, .gt = 1,
247 .num_slices = 1,
248 .num_subslices = { 1, },
249 .max_eus_per_subslice = 6,
250 .num_thread_per_eu = 6,
251 .l3_banks = 2,
252 .max_vs_threads = 36,
253 .max_tcs_threads = 36,
254 .max_tes_threads = 36,
255 .max_gs_threads = 36,
256 .max_wm_threads = 48,
257 .max_cs_threads = 36,
258 .urb = {
259 .min_entries = {
260 [MESA_SHADER_VERTEX] = 32,
261 [MESA_SHADER_TESS_EVAL] = 10,
262 },
263 .max_entries = {
264 [MESA_SHADER_VERTEX] = 512,
265 [MESA_SHADER_TESS_CTRL] = 32,
266 [MESA_SHADER_TESS_EVAL] = 288,
267 [MESA_SHADER_GEOMETRY] = 192,
268 },
269 },
270 .simulator_id = 7,
271 };
272
273 static const struct intel_device_info intel_device_info_ivb_gt2 = {
274 GFX7_FEATURES, .platform = INTEL_PLATFORM_IVB, .gt = 2,
275 .num_slices = 1,
276 .num_subslices = { 1, },
277 .max_eus_per_subslice = 12,
278 .num_thread_per_eu = 8, /* Not sure why this isn't a multiple of
279 * @max_wm_threads ... */
280 .l3_banks = 4,
281 .max_vs_threads = 128,
282 .max_tcs_threads = 128,
283 .max_tes_threads = 128,
284 .max_gs_threads = 128,
285 .max_wm_threads = 172,
286 .max_cs_threads = 64,
287 .urb = {
288 .min_entries = {
289 [MESA_SHADER_VERTEX] = 32,
290 [MESA_SHADER_TESS_EVAL] = 10,
291 },
292 .max_entries = {
293 [MESA_SHADER_VERTEX] = 704,
294 [MESA_SHADER_TESS_CTRL] = 64,
295 [MESA_SHADER_TESS_EVAL] = 448,
296 [MESA_SHADER_GEOMETRY] = 320,
297 },
298 },
299 .simulator_id = 7,
300 };
301
302 static const struct intel_device_info intel_device_info_byt = {
303 GFX7_FEATURES, .platform = INTEL_PLATFORM_BYT, .gt = 1,
304 .num_slices = 1,
305 .num_subslices = { 1, },
306 .max_eus_per_subslice = 4,
307 .num_thread_per_eu = 8,
308 .l3_banks = 1,
309 .has_llc = false,
310 .max_vs_threads = 36,
311 .max_tcs_threads = 36,
312 .max_tes_threads = 36,
313 .max_gs_threads = 36,
314 .max_wm_threads = 48,
315 .max_cs_threads = 32,
316 .urb = {
317 .min_entries = {
318 [MESA_SHADER_VERTEX] = 32,
319 [MESA_SHADER_TESS_EVAL] = 10,
320 },
321 .max_entries = {
322 [MESA_SHADER_VERTEX] = 512,
323 [MESA_SHADER_TESS_CTRL] = 32,
324 [MESA_SHADER_TESS_EVAL] = 288,
325 [MESA_SHADER_GEOMETRY] = 192,
326 },
327 },
328 .simulator_id = 10,
329 };
330
/* Designated initializers shared by the three hsw tables: GFX7_FEATURES plus
 * the HSW platform id, verx10 75 and SIMD16 three-source support.
 */
#define HSW_FEATURES \
   GFX7_FEATURES, \
   .platform = INTEL_PLATFORM_HSW, \
   .verx10 = 75, \
   .supports_simd16_3src = true
336
337 static const struct intel_device_info intel_device_info_hsw_gt1 = {
338 HSW_FEATURES, .gt = 1,
339 .num_slices = 1,
340 .num_subslices = { 1, },
341 .max_eus_per_subslice = 10,
342 .num_thread_per_eu = 7,
343 .l3_banks = 2,
344 .max_vs_threads = 70,
345 .max_tcs_threads = 70,
346 .max_tes_threads = 70,
347 .max_gs_threads = 70,
348 .max_wm_threads = 102,
349 .max_cs_threads = 70,
350 .urb = {
351 .min_entries = {
352 [MESA_SHADER_VERTEX] = 32,
353 [MESA_SHADER_TESS_EVAL] = 10,
354 },
355 .max_entries = {
356 [MESA_SHADER_VERTEX] = 640,
357 [MESA_SHADER_TESS_CTRL] = 64,
358 [MESA_SHADER_TESS_EVAL] = 384,
359 [MESA_SHADER_GEOMETRY] = 256,
360 },
361 },
362 .simulator_id = 9,
363 };
364
365 static const struct intel_device_info intel_device_info_hsw_gt2 = {
366 HSW_FEATURES, .gt = 2,
367 .num_slices = 1,
368 .num_subslices = { 2, },
369 .max_eus_per_subslice = 10,
370 .num_thread_per_eu = 7,
371 .l3_banks = 4,
372 .max_vs_threads = 280,
373 .max_tcs_threads = 256,
374 .max_tes_threads = 280,
375 .max_gs_threads = 256,
376 .max_wm_threads = 204,
377 .max_cs_threads = 70,
378 .urb = {
379 .min_entries = {
380 [MESA_SHADER_VERTEX] = 64,
381 [MESA_SHADER_TESS_EVAL] = 10,
382 },
383 .max_entries = {
384 [MESA_SHADER_VERTEX] = 1664,
385 [MESA_SHADER_TESS_CTRL] = 128,
386 [MESA_SHADER_TESS_EVAL] = 960,
387 [MESA_SHADER_GEOMETRY] = 640,
388 },
389 },
390 .simulator_id = 9,
391 };
392
393 static const struct intel_device_info intel_device_info_hsw_gt3 = {
394 HSW_FEATURES, .gt = 3,
395 .num_slices = 2,
396 .num_subslices = { 2, 2, },
397 .max_eus_per_subslice = 10,
398 .num_thread_per_eu = 7,
399 .l3_banks = 8,
400 .max_vs_threads = 280,
401 .max_tcs_threads = 256,
402 .max_tes_threads = 280,
403 .max_gs_threads = 256,
404 .max_wm_threads = 408,
405 .max_cs_threads = 70,
406 .urb = {
407 .min_entries = {
408 [MESA_SHADER_VERTEX] = 64,
409 [MESA_SHADER_TESS_EVAL] = 10,
410 },
411 .max_entries = {
412 [MESA_SHADER_VERTEX] = 1664,
413 [MESA_SHADER_TESS_CTRL] = 128,
414 [MESA_SHADER_TESS_EVAL] = 960,
415 [MESA_SHADER_GEOMETRY] = 640,
416 },
417 },
418 .max_constant_urb_size_kb = 32,
419 .simulator_id = 9,
420 };
421
/* Designated initializers used as the base of every ver >= 8 table and
 * feature macro in this file; later fields override what is set here.
 *
 * It's unclear how well supported sampling from the hiz buffer is on GFX8,
 * so keep things conservative for now and set has_sample_with_hiz = false.
 */
#define GFX8_FEATURES \
   .ver = 8, \
   .has_hiz_and_separate_stencil = true, \
   .must_use_separate_stencil = true, \
   .has_llc = true, \
   .has_sample_with_hiz = false, \
   .has_pln = true, \
   .has_integer_dword_mul = true, \
   .has_64bit_float = true, \
   .has_64bit_int = true, \
   .supports_simd16_3src = true, \
   .has_surface_tile_offset = true, \
   .num_thread_per_eu = 7, \
   .grf_size = 32, \
   .max_vs_threads = 504, \
   .max_tcs_threads = 504, \
   .max_tes_threads = 504, \
   .max_gs_threads = 504, \
   .max_wm_threads = 384, \
   .max_threads_per_psd = 64, \
   .timestamp_frequency = 12500000, \
   .max_constant_urb_size_kb = 32
447
448 static const struct intel_device_info intel_device_info_bdw_gt1 = {
449 GFX8_FEATURES, .gt = 1,
450 .platform = INTEL_PLATFORM_BDW,
451 .num_slices = 1,
452 .num_subslices = { 2, },
453 .max_eus_per_subslice = 6,
454 .l3_banks = 2,
455 .max_cs_threads = 42,
456 .urb = {
457 .min_entries = {
458 [MESA_SHADER_VERTEX] = 64,
459 [MESA_SHADER_TESS_EVAL] = 34,
460 },
461 .max_entries = {
462 [MESA_SHADER_VERTEX] = 2560,
463 [MESA_SHADER_TESS_CTRL] = 504,
464 [MESA_SHADER_TESS_EVAL] = 1536,
465 /* Reduced from 960, seems to be similar to the bug on Gfx9 GT1. */
466 [MESA_SHADER_GEOMETRY] = 690,
467 },
468 },
469 .simulator_id = 11,
470 };
471
472 static const struct intel_device_info intel_device_info_bdw_gt2 = {
473 GFX8_FEATURES, .gt = 2,
474 .platform = INTEL_PLATFORM_BDW,
475 .num_slices = 1,
476 .num_subslices = { 3, },
477 .max_eus_per_subslice = 8,
478 .l3_banks = 4,
479 .max_cs_threads = 56,
480 .urb = {
481 .min_entries = {
482 [MESA_SHADER_VERTEX] = 64,
483 [MESA_SHADER_TESS_EVAL] = 34,
484 },
485 .max_entries = {
486 [MESA_SHADER_VERTEX] = 2560,
487 [MESA_SHADER_TESS_CTRL] = 504,
488 [MESA_SHADER_TESS_EVAL] = 1536,
489 [MESA_SHADER_GEOMETRY] = 960,
490 },
491 },
492 .simulator_id = 11,
493 };
494
495 static const struct intel_device_info intel_device_info_bdw_gt3 = {
496 GFX8_FEATURES, .gt = 3,
497 .platform = INTEL_PLATFORM_BDW,
498 .num_slices = 2,
499 .num_subslices = { 3, 3, },
500 .max_eus_per_subslice = 8,
501 .l3_banks = 8,
502 .max_cs_threads = 56,
503 .urb = {
504 .min_entries = {
505 [MESA_SHADER_VERTEX] = 64,
506 [MESA_SHADER_TESS_EVAL] = 34,
507 },
508 .max_entries = {
509 [MESA_SHADER_VERTEX] = 2560,
510 [MESA_SHADER_TESS_CTRL] = 504,
511 [MESA_SHADER_TESS_EVAL] = 1536,
512 [MESA_SHADER_GEOMETRY] = 960,
513 },
514 },
515 .simulator_id = 11,
516 };
517
518 static const struct intel_device_info intel_device_info_chv = {
519 GFX8_FEATURES, .platform = INTEL_PLATFORM_CHV, .gt = 1,
520 .has_llc = false,
521 .has_integer_dword_mul = false,
522 .num_slices = 1,
523 .num_subslices = { 2, },
524 .max_eus_per_subslice = 8,
525 .l3_banks = 2,
526 .max_vs_threads = 80,
527 .max_tcs_threads = 80,
528 .max_tes_threads = 80,
529 .max_gs_threads = 80,
530 .max_wm_threads = 128,
531 .max_cs_threads = 6 * 7,
532 .urb = {
533 .min_entries = {
534 [MESA_SHADER_VERTEX] = 34,
535 [MESA_SHADER_TESS_EVAL] = 34,
536 },
537 .max_entries = {
538 [MESA_SHADER_VERTEX] = 640,
539 [MESA_SHADER_TESS_CTRL] = 80,
540 [MESA_SHADER_TESS_EVAL] = 384,
541 [MESA_SHADER_GEOMETRY] = 256,
542 },
543 },
544 .simulator_id = 13,
545 };
546
/* Ver 9 thread limits, timestamp frequency and URB entry bounds. Expanded
 * after GFX8_FEATURES by GFX9_FEATURES and GFX9_LP_FEATURES, so the values
 * here replace the ver 8 ones.
 */
#define GFX9_HW_INFO \
   .ver = 9, \
   .max_vs_threads = 336, \
   .max_gs_threads = 336, \
   .max_tcs_threads = 336, \
   .max_tes_threads = 336, \
   .max_threads_per_psd = 64, \
   .max_cs_threads = 56, \
   .timestamp_frequency = 12000000, \
   .urb = { \
      .min_entries = { \
         [MESA_SHADER_VERTEX] = 64, \
         [MESA_SHADER_TESS_EVAL] = 34, \
      }, \
      .max_entries = { \
         [MESA_SHADER_VERTEX] = 1856, \
         [MESA_SHADER_TESS_CTRL] = 672, \
         [MESA_SHADER_TESS_EVAL] = 1120, \
         [MESA_SHADER_GEOMETRY] = 640, \
      }, \
   }
568
/* Base initializers for the GFX9 "LP" tables (reached through the _3X6 and
 * _2X6 variants used by the bxt and glk tables). Starts from GFX8_FEATURES
 * and GFX9_HW_INFO, then overrides thread limits, timestamp frequency and
 * the whole .urb member with smaller values.
 */
#define GFX9_LP_FEATURES \
   GFX8_FEATURES, \
   GFX9_HW_INFO, \
   .has_integer_dword_mul = false, \
   .gt = 1, \
   .has_llc = false, \
   .has_sample_with_hiz = true, \
   .has_illegal_ccs_values = true, \
   .num_slices = 1, \
   .num_thread_per_eu = 6, \
   .max_vs_threads = 112, \
   .max_tcs_threads = 112, \
   .max_tes_threads = 112, \
   .max_gs_threads = 112, \
   .max_cs_threads = 6 * 6, \
   .timestamp_frequency = 19200000, \
   .urb = { \
      .min_entries = { \
         [MESA_SHADER_VERTEX] = 34, \
         [MESA_SHADER_TESS_EVAL] = 34, \
      }, \
      .max_entries = { \
         [MESA_SHADER_VERTEX] = 704, \
         [MESA_SHADER_TESS_CTRL] = 256, \
         [MESA_SHADER_TESS_EVAL] = 416, \
         [MESA_SHADER_GEOMETRY] = 256, \
      }, \
   }
597
/* GFX9_LP_FEATURES with three subslices of at most 6 EUs each. */
#define GFX9_LP_FEATURES_3X6 \
   GFX9_LP_FEATURES, \
   .num_subslices = { 3, }, \
   .max_eus_per_subslice = 6
602
/* GFX9_LP_FEATURES with two subslices of at most 6 EUs each; thread limits
 * and URB maximum entries are half of the GFX9_LP_FEATURES values.
 */
#define GFX9_LP_FEATURES_2X6 \
   GFX9_LP_FEATURES, \
   .num_subslices = { 2, }, \
   .max_eus_per_subslice = 6, \
   .max_vs_threads = 56, \
   .max_tcs_threads = 56, \
   .max_tes_threads = 56, \
   .max_gs_threads = 56, \
   .max_cs_threads = 6 * 6, \
   .urb = { \
      .min_entries = { \
         [MESA_SHADER_VERTEX] = 34, \
         [MESA_SHADER_TESS_EVAL] = 34, \
      }, \
      .max_entries = { \
         [MESA_SHADER_VERTEX] = 352, \
         [MESA_SHADER_TESS_CTRL] = 128, \
         [MESA_SHADER_TESS_EVAL] = 208, \
         [MESA_SHADER_GEOMETRY] = 128, \
      }, \
   }
624
/* Base initializers for the skl, kbl and cfl tables: GFX8_FEATURES and
 * GFX9_HW_INFO, with has_sample_with_hiz enabled, has_illegal_ccs_values
 * set, and three subgroup-scope cooperative matrix configurations.
 */
#define GFX9_FEATURES \
   GFX8_FEATURES, \
   GFX9_HW_INFO, \
   .has_sample_with_hiz = true, \
   .has_illegal_ccs_values = true, \
   .cooperative_matrix_configurations = { \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \
   }
635
636 static const struct intel_device_info intel_device_info_skl_gt1 = {
637 GFX9_FEATURES, .gt = 1,
638 .platform = INTEL_PLATFORM_SKL,
639 .num_slices = 1,
640 .num_subslices = { 2, },
641 .max_eus_per_subslice = 6,
642 .l3_banks = 2,
643 /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
644 * leading to some vertices to go missing if we use too much URB.
645 */
646 .urb.max_entries[MESA_SHADER_VERTEX] = 928,
647 .simulator_id = 12,
648 };
649
650 static const struct intel_device_info intel_device_info_skl_gt2 = {
651 GFX9_FEATURES, .gt = 2,
652 .platform = INTEL_PLATFORM_SKL,
653 .num_slices = 1,
654 .num_subslices = { 3, },
655 .max_eus_per_subslice = 8,
656 .l3_banks = 4,
657 .simulator_id = 12,
658 };
659
660 static const struct intel_device_info intel_device_info_skl_gt3 = {
661 GFX9_FEATURES, .gt = 3,
662 .platform = INTEL_PLATFORM_SKL,
663 .num_slices = 2,
664 .num_subslices = { 3, 3, },
665 .max_eus_per_subslice = 8,
666 .l3_banks = 8,
667 .simulator_id = 12,
668 };
669
670 static const struct intel_device_info intel_device_info_skl_gt4 = {
671 GFX9_FEATURES, .gt = 4,
672 .platform = INTEL_PLATFORM_SKL,
673 .num_slices = 3,
674 .num_subslices = { 3, 3, 3, },
675 .max_eus_per_subslice = 8,
676 .l3_banks = 12,
677 /* From the "L3 Allocation and Programming" documentation:
678 *
679 * "URB is limited to 1008KB due to programming restrictions. This is not a
680 * restriction of the L3 implementation, but of the FF and other clients.
681 * Therefore, in a GT4 implementation it is possible for the programmed
682 * allocation of the L3 data array to provide 3*384KB=1152KB for URB, but
683 * only 1008KB of this will be used."
684 */
685 .simulator_id = 12,
686 };
687
688 static const struct intel_device_info intel_device_info_bxt = {
689 GFX9_LP_FEATURES_3X6,
690 .platform = INTEL_PLATFORM_BXT,
691 .l3_banks = 2,
692 .simulator_id = 14,
693 };
694
695 static const struct intel_device_info intel_device_info_bxt_2x6 = {
696 GFX9_LP_FEATURES_2X6,
697 .platform = INTEL_PLATFORM_BXT,
698 .l3_banks = 1,
699 .simulator_id = 14,
700 };
/*
 * Note: for all KBL SKUs, the PRM says SKL for GS entries, not SKL+.
 * There's no KBL entry, so the default SKL (GFX9) GS entries value is used.
 */
705
706 static const struct intel_device_info intel_device_info_kbl_gt1 = {
707 GFX9_FEATURES,
708 .platform = INTEL_PLATFORM_KBL,
709 .gt = 1,
710
711 .max_cs_threads = 7 * 6,
712 .num_slices = 1,
713 .num_subslices = { 2, },
714 .max_eus_per_subslice = 6,
715 .l3_banks = 2,
716 /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
717 * leading to some vertices to go missing if we use too much URB.
718 */
719 .urb.max_entries[MESA_SHADER_VERTEX] = 928,
720 .urb.max_entries[MESA_SHADER_GEOMETRY] = 256,
721 .simulator_id = 16,
722 };
723
724 static const struct intel_device_info intel_device_info_kbl_gt1_5 = {
725 GFX9_FEATURES,
726 .platform = INTEL_PLATFORM_KBL,
727 .gt = 1,
728
729 .max_cs_threads = 7 * 6,
730 .num_slices = 1,
731 .num_subslices = { 3, },
732 .max_eus_per_subslice = 6,
733 .l3_banks = 4,
734 .simulator_id = 16,
735 };
736
737 static const struct intel_device_info intel_device_info_kbl_gt2 = {
738 GFX9_FEATURES,
739 .platform = INTEL_PLATFORM_KBL,
740 .gt = 2,
741
742 .num_slices = 1,
743 .num_subslices = { 3, },
744 .max_eus_per_subslice = 8,
745 .l3_banks = 4,
746 .simulator_id = 16,
747 };
748
749 static const struct intel_device_info intel_device_info_kbl_gt3 = {
750 GFX9_FEATURES,
751 .platform = INTEL_PLATFORM_KBL,
752 .gt = 3,
753
754 .num_slices = 2,
755 .num_subslices = { 3, 3, },
756 .max_eus_per_subslice = 8,
757 .l3_banks = 8,
758 .simulator_id = 16,
759 };
760
761 static const struct intel_device_info intel_device_info_kbl_gt4 = {
762 GFX9_FEATURES,
763 .platform = INTEL_PLATFORM_KBL,
764 .gt = 4,
765
766 /*
767 * From the "L3 Allocation and Programming" documentation:
768 *
769 * "URB is limited to 1008KB due to programming restrictions. This
770 * is not a restriction of the L3 implementation, but of the FF and
771 * other clients. Therefore, in a GT4 implementation it is
772 * possible for the programmed allocation of the L3 data array to
773 * provide 3*384KB=1152KB for URB, but only 1008KB of this
774 * will be used."
775 */
776 .num_slices = 3,
777 .num_subslices = { 3, 3, 3, },
778 .max_eus_per_subslice = 8,
779 .l3_banks = 12,
780 .simulator_id = 16,
781 };
782
783 static const struct intel_device_info intel_device_info_glk = {
784 GFX9_LP_FEATURES_3X6,
785 .platform = INTEL_PLATFORM_GLK,
786 .l3_banks = 2,
787 .simulator_id = 17,
788 };
789
790 static const struct intel_device_info intel_device_info_glk_2x6 = {
791 GFX9_LP_FEATURES_2X6,
792 .platform = INTEL_PLATFORM_GLK,
793 .l3_banks = 2,
794 .simulator_id = 17,
795 };
796
797 static const struct intel_device_info intel_device_info_cfl_gt1 = {
798 GFX9_FEATURES,
799 .platform = INTEL_PLATFORM_CFL,
800 .gt = 1,
801
802 .num_slices = 1,
803 .num_subslices = { 2, },
804 .max_eus_per_subslice = 6,
805 .l3_banks = 2,
806 /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
807 * leading to some vertices to go missing if we use too much URB.
808 */
809 .urb.max_entries[MESA_SHADER_VERTEX] = 928,
810 .urb.max_entries[MESA_SHADER_GEOMETRY] = 256,
811 .simulator_id = 24,
812 };
813 static const struct intel_device_info intel_device_info_cfl_gt2 = {
814 GFX9_FEATURES,
815 .platform = INTEL_PLATFORM_CFL,
816 .gt = 2,
817
818 .num_slices = 1,
819 .num_subslices = { 3, },
820 .max_eus_per_subslice = 8,
821 .l3_banks = 4,
822 .simulator_id = 24,
823 };
824
825 static const struct intel_device_info intel_device_info_cfl_gt3 = {
826 GFX9_FEATURES,
827 .platform = INTEL_PLATFORM_CFL,
828 .gt = 3,
829
830 .num_slices = 2,
831 .num_subslices = { 3, 3, },
832 .max_eus_per_subslice = 8,
833 .l3_banks = 8,
834 .simulator_id = 24,
835 };
836
/* Wraps a comma-separated list of per-slice subslice counts in braces so the
 * list can be passed as a single macro argument (see GFX11_FEATURES).
 * Requires at least one argument. Uses the standard C99 variadic form
 * instead of the GNU named-variadic extension.
 */
#define subslices(...) { __VA_ARGS__, }
838
/* Ver 11 overrides applied on top of GFX8_FEATURES by GFX11_FEATURES:
 * version number, has_pln disabled, and per-stage thread limits.
 */
#define GFX11_HW_INFO \
   .ver = 11, \
   .has_pln = false, \
   .max_vs_threads = 364, \
   .max_gs_threads = 224, \
   .max_tcs_threads = 224, \
   .max_tes_threads = 364, \
   .max_threads_per_psd = 64, \
   .max_cs_threads = 56
848
/* Base initializers for the ver 11 (icl, ehl) tables.
 *
 * _gt, _slices and _l3 are stored in .gt, .num_slices and .l3_banks;
 * _subslices must be a braced list (build it with subslices()); _platform is
 * stored in .platform. Every table built from this macro gets 8 EUs per
 * subslice unless it overrides max_eus_per_subslice afterwards.
 */
#define GFX11_FEATURES(_gt, _slices, _subslices, _l3, _platform) \
   GFX8_FEATURES, \
   GFX11_HW_INFO, \
   .platform = _platform, \
   .has_64bit_float = false, \
   .has_64bit_int = false, \
   .has_integer_dword_mul = false, \
   .has_sample_with_hiz = false, \
   .has_illegal_ccs_values = true, \
   .gt = _gt, .num_slices = _slices, .l3_banks = _l3, \
   .num_subslices = _subslices, \
   .max_eus_per_subslice = 8, \
   .cooperative_matrix_configurations = { \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \
   }
866
/* URB minimum/maximum entry counts shared by the ver 11 tables. Expands to
 * the .min_entries and .max_entries members, so it must be used inside a
 * `.urb = { ... }` initializer.
 */
#define GFX11_URB_MIN_MAX_ENTRIES \
   .min_entries = { \
      [MESA_SHADER_VERTEX] = 64, \
      [MESA_SHADER_TESS_EVAL] = 34, \
   }, \
   .max_entries = { \
      [MESA_SHADER_VERTEX] = 2384, \
      [MESA_SHADER_TESS_CTRL] = 1032, \
      [MESA_SHADER_TESS_EVAL] = 2384, \
      [MESA_SHADER_GEOMETRY] = 1032, \
   }
878
879 static const struct intel_device_info intel_device_info_icl_gt2 = {
880 GFX11_FEATURES(2, 1, subslices(8), 8, INTEL_PLATFORM_ICL),
881 .urb = {
882 GFX11_URB_MIN_MAX_ENTRIES,
883 },
884 .simulator_id = 19,
885 };
886
887 static const struct intel_device_info intel_device_info_icl_gt1_5 = {
888 GFX11_FEATURES(1, 1, subslices(6), 6, INTEL_PLATFORM_ICL),
889 .urb = {
890 GFX11_URB_MIN_MAX_ENTRIES,
891 },
892 .simulator_id = 19,
893 };
894
895 static const struct intel_device_info intel_device_info_icl_gt1 = {
896 GFX11_FEATURES(1, 1, subslices(4), 6, INTEL_PLATFORM_ICL),
897 .urb = {
898 GFX11_URB_MIN_MAX_ENTRIES,
899 },
900 .simulator_id = 19,
901 };
902
903 static const struct intel_device_info intel_device_info_icl_gt0_5 = {
904 GFX11_FEATURES(1, 1, subslices(1), 6, INTEL_PLATFORM_ICL),
905 .urb = {
906 GFX11_URB_MIN_MAX_ENTRIES,
907 },
908 .simulator_id = 19,
909 };
910
/* Extra initializers shared by the ehl tables; expanded after
 * GFX11_FEATURES(). Sets the URB entry bounds, disables CCS repack and
 * selects simulator id 28.
 */
#define GFX11_LP_FEATURES \
   .urb = { \
      GFX11_URB_MIN_MAX_ENTRIES, \
   }, \
   .disable_ccs_repack = true, \
   .has_illegal_ccs_values = true, \
   .simulator_id = 28
918
919 static const struct intel_device_info intel_device_info_ehl_4x8 = {
920 GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
921 GFX11_LP_FEATURES,
922 };
923
924 static const struct intel_device_info intel_device_info_ehl_4x6 = {
925 GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
926 GFX11_LP_FEATURES,
927 .max_eus_per_subslice = 6,
928 };
929
930 static const struct intel_device_info intel_device_info_ehl_4x5 = {
931 GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
932 GFX11_LP_FEATURES,
933 .max_eus_per_subslice = 5,
934 };
935
936 static const struct intel_device_info intel_device_info_ehl_4x4 = {
937 GFX11_FEATURES(1, 1, subslices(4), 4, INTEL_PLATFORM_EHL),
938 GFX11_LP_FEATURES,
939 .max_eus_per_subslice = 4,
940 };
941
942 static const struct intel_device_info intel_device_info_ehl_2x8 = {
943 GFX11_FEATURES(1, 1, subslices(2), 4, INTEL_PLATFORM_EHL),
944 GFX11_LP_FEATURES,
945 };
946
947 static const struct intel_device_info intel_device_info_ehl_2x4 = {
948 GFX11_FEATURES(1, 1, subslices(2), 4, INTEL_PLATFORM_EHL),
949 GFX11_LP_FEATURES,
950 .max_eus_per_subslice = 4,
951 };
952
/* Ver 12 overrides applied on top of GFX8_FEATURES by GFX12_FEATURES:
 * version number, feature bits, per-stage thread limits and the URB
 * description (size plus entry bounds).
 */
#define GFX12_HW_INFO \
   .ver = 12, \
   .has_pln = false, \
   .has_sample_with_hiz = false, \
   .has_aux_map = true, \
   .max_vs_threads = 546, \
   .max_gs_threads = 336, \
   .max_tcs_threads = 336, \
   .max_tes_threads = 546, \
   .max_threads_per_psd = 64, \
   .max_cs_threads = 112, /* threads per DSS */ \
   .urb = { \
      .size = 512, /* For intel_stub_gpu */ \
      .min_entries = { \
         [MESA_SHADER_VERTEX] = 64, \
         [MESA_SHADER_TESS_EVAL] = 34, \
      }, \
      .max_entries = { \
         [MESA_SHADER_VERTEX] = 3576, \
         [MESA_SHADER_TESS_CTRL] = 1548, \
         [MESA_SHADER_TESS_EVAL] = 3576, \
         [MESA_SHADER_GEOMETRY] = 1548, \
      }, \
   }
977
/* Base initializers for the ver 12 tables (tgl, rkl, adl, rpl, dg1, sg1).
 *
 * _gt, _slices and _l3 are stored in .gt, .num_slices and .l3_banks. The
 * macro does not set .num_subslices or .platform; callers add those.
 */
#define GFX12_FEATURES(_gt, _slices, _l3) \
   GFX8_FEATURES, \
   GFX12_HW_INFO, \
   .has_64bit_float = false, \
   .has_64bit_int = false, \
   .has_integer_dword_mul = false, \
   .gt = _gt, .num_slices = _slices, .l3_banks = _l3, \
   .simulator_id = 22, \
   .max_eus_per_subslice = 16, \
   /* BSpec 45101 (r51017) */ \
   .pat = { \
      /* CPU: WB, GPU: PAT 0 => WB, 2WAY */ \
      .cached_coherent = PAT_ENTRY(0, WB), \
      /* CPU: WC, GPU: PAT 1 => WC */ \
      .scanout = PAT_ENTRY(1, WC), \
      /* CPU: WB, GPU: PAT 0 => WB, 2WAY */ \
      .writeback_incoherent = PAT_ENTRY(0, WB), \
      /* CPU: WC, GPU: PAT 1 => WC */ \
      .writecombining = PAT_ENTRY(1, WC), \
   }, \
   .cooperative_matrix_configurations = { \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \
   }
1003
/* Wraps a comma-separated list of per-slice counts in braces for use as a
 * .num_subslices initializer. Requires at least one argument. Uses the
 * standard C99 variadic form instead of the GNU named-variadic extension.
 */
#define dual_subslices(...) { __VA_ARGS__, }
1005
/* GFX12_FEATURES for the "gt05" tables: .gt = 1, one slice, 4 L3 banks and a
 * single entry of 1 in .num_subslices.
 */
#define GFX12_GT05_FEATURES \
   GFX12_FEATURES(1, 1, 4), \
   .num_subslices = dual_subslices(1)
1009
/* GFX12_FEATURES for a given GT level on one slice.
 *
 * _gt == 1 selects 4 L3 banks and a .num_subslices entry of 2; any other
 * value selects 8 L3 banks and an entry of 6. The argument is parenthesized
 * in the comparisons so that an expression argument compares as a whole.
 */
#define GFX12_GT_FEATURES(_gt) \
   GFX12_FEATURES(_gt, 1, (_gt) == 1 ? 4 : 8), \
   .num_subslices = dual_subslices((_gt) == 1 ? 2 : 6)
1013
1014 static const struct intel_device_info intel_device_info_tgl_gt1 = {
1015 GFX12_GT_FEATURES(1),
1016 .platform = INTEL_PLATFORM_TGL,
1017 };
1018
1019 static const struct intel_device_info intel_device_info_tgl_gt2 = {
1020 GFX12_GT_FEATURES(2),
1021 .platform = INTEL_PLATFORM_TGL,
1022 };
1023
1024 static const struct intel_device_info intel_device_info_rkl_gt05 = {
1025 GFX12_GT05_FEATURES,
1026 .platform = INTEL_PLATFORM_RKL,
1027 };
1028
1029 static const struct intel_device_info intel_device_info_rkl_gt1 = {
1030 GFX12_GT_FEATURES(1),
1031 .platform = INTEL_PLATFORM_RKL,
1032 };
1033
1034 static const struct intel_device_info intel_device_info_adl_gt05 = {
1035 GFX12_GT05_FEATURES,
1036 .platform = INTEL_PLATFORM_ADL,
1037 };
1038
1039 static const struct intel_device_info intel_device_info_adl_gt1 = {
1040 GFX12_GT_FEATURES(1),
1041 .platform = INTEL_PLATFORM_ADL,
1042 };
1043
1044 static const struct intel_device_info intel_device_info_adl_n = {
1045 GFX12_GT_FEATURES(1),
1046 .platform = INTEL_PLATFORM_ADL,
1047 .is_adl_n = true,
1048 };
1049
1050 static const struct intel_device_info intel_device_info_adl_gt2 = {
1051 GFX12_GT_FEATURES(2),
1052 .platform = INTEL_PLATFORM_ADL,
1053 };
1054
1055 static const struct intel_device_info intel_device_info_rpl = {
1056 GFX12_FEATURES(1, 1, 4),
1057 .num_subslices = dual_subslices(2),
1058 .platform = INTEL_PLATFORM_RPL,
1059 };
1060
1061 static const struct intel_device_info intel_device_info_rpl_p = {
1062 GFX12_GT_FEATURES(2),
1063 .platform = INTEL_PLATFORM_RPL,
1064 };
1065
/* Initializers shared by the dg1 and sg1 tables: the GT2 ver 12 base with
 * INTEL_PLATFORM_DG1, no LLC, local memory, a 768 URB size and simulator
 * id 30 (the last two override the GFX12 defaults).
 */
#define GFX12_DG1_SG1_FEATURES \
   GFX12_GT_FEATURES(2), \
   .platform = INTEL_PLATFORM_DG1, \
   .has_llc = false, \
   .has_local_mem = true, \
   .urb.size = 768, \
   .simulator_id = 30
1073
1074 static const struct intel_device_info intel_device_info_dg1 = {
1075 GFX12_DG1_SG1_FEATURES,
1076 };
1077
1078 static const struct intel_device_info intel_device_info_sg1 = {
1079 GFX12_DG1_SG1_FEATURES,
1080 };
1081
/* URB minimum/maximum entry counts used by XEHP_FEATURES. Expands to the
 * .min_entries and .max_entries members, so it must be used inside a
 * `.urb = { ... }` initializer.
 */
#define XEHP_URB_MIN_MAX_ENTRIES \
   .min_entries = { \
      [MESA_SHADER_VERTEX] = 64, \
      [MESA_SHADER_TESS_EVAL] = 34, \
   }, \
   .max_entries = { \
      [MESA_SHADER_VERTEX] = 3832, /* BSpec 47138 */ \
      [MESA_SHADER_TESS_CTRL] = 1548, /* BSpec 47137 */ \
      [MESA_SHADER_TESS_EVAL] = 3576, /* BSpec 47135 */ \
      [MESA_SHADER_GEOMETRY] = 1548, /* BSpec 47136 */ \
   }
1093
/* Base initializers for the verx10 125 tables (used through DG2_FEATURES).
 *
 * _gt, _slices and _l3 are stored in .gt, .num_slices and .l3_banks. Starts
 * from GFX8_FEATURES and then overrides the version, thread limits, URB and
 * feature bits listed below.
 */
#define XEHP_FEATURES(_gt, _slices, _l3) \
   GFX8_FEATURES, \
   .needs_null_push_constant_tbimr_workaround = true, \
   .has_64bit_float = false, \
   .has_64bit_int = false, \
   .has_integer_dword_mul = false, \
   .gt = _gt, .num_slices = _slices, .l3_banks = _l3, \
   .num_subslices = dual_subslices(1), /* updated by topology */ \
   .ver = 12, \
   .has_pln = false, \
   .has_sample_with_hiz = false, \
   .max_vs_threads = 546, /* BSpec 46312 */ \
   .max_gs_threads = 336, /* BSpec 46299 */ \
   .max_tcs_threads = 336, /* BSpec 46300 */ \
   .max_tes_threads = 546, /* BSpec 46298 */ \
   .max_threads_per_psd = 64, \
   .max_cs_threads = 112, /* threads per DSS */ \
   .urb = { \
      .size = 768, /* For intel_stub_gpu */ \
      XEHP_URB_MIN_MAX_ENTRIES, \
   }, \
   .num_thread_per_eu = 8 /* BSpec 44472 */, \
   .max_eus_per_subslice = 16, \
   .verx10 = 125, \
   .has_llc = false, \
   .has_lsc = true, \
   .has_local_mem = true, \
   .has_aux_map = false, \
   .simulator_id = 29, \
   .cooperative_matrix_configurations = { \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 }, \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 8, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 }, \
   }
1128
/* Initializer list shared by all DG2 and ATS-M descriptions.
 *
 * Expands XEHP_FEATURES(0, 1, 0) and then adds a default revision, the
 * coarse-pixel / mesh-shading / ray-tracing / flat-CCS feature bits and the
 * PAT entries used for each caching mode.
 */
#define DG2_FEATURES                                            \
   /* (Sub)slice info comes from the kernel topology info */    \
   XEHP_FEATURES(0, 1, 0),                                      \
   .revision = 4, /* For offline compiler */                    \
   .has_coarse_pixel_primitive_and_cb = true,                   \
   .has_mesh_shading = true,                                    \
   .has_ray_tracing = true,                                     \
   .has_flat_ccs = true,                                        \
   /* There is no PAT table for DG2, using TGL ones */          \
   /* BSpec 45101 (r51017) */                                   \
   .pat = {                                                     \
         /* CPU: WB, GPU: PAT 0 => WB, 2WAY */                  \
         .cached_coherent = PAT_ENTRY(0, WB),                   \
         /* CPU: WC, GPU: PAT 1 => WC */                        \
         .scanout = PAT_ENTRY(1, WC),                           \
         /* CPU: WB, GPU: PAT 0 => WB, 2WAY */                  \
         .writeback_incoherent = PAT_ENTRY(0, WB),              \
         /* CPU: WC, GPU: PAT 1 => WC */                        \
         .writecombining = PAT_ENTRY(1, WC),                    \
   }
1149
/* DG2-G10: DG2_FEATURES plus its own platform identifier. */
static const struct intel_device_info intel_device_info_dg2_g10 = {
   DG2_FEATURES,
   .platform = INTEL_PLATFORM_DG2_G10,
};
1154
/* DG2-G11: DG2_FEATURES plus its own platform identifier. */
static const struct intel_device_info intel_device_info_dg2_g11 = {
   DG2_FEATURES,
   .platform = INTEL_PLATFORM_DG2_G11,
};
1159
/* DG2-G12: DG2_FEATURES plus its own platform identifier. */
static const struct intel_device_info intel_device_info_dg2_g12 = {
   DG2_FEATURES,
   .platform = INTEL_PLATFORM_DG2_G12,
};
1164
/* ATS-M G10: reuses DG2_FEATURES unchanged, with its own platform identifier. */
static const struct intel_device_info intel_device_info_atsm_g10 = {
   DG2_FEATURES,
   .platform = INTEL_PLATFORM_ATSM_G10,
};
1169
/* ATS-M G11: reuses DG2_FEATURES unchanged, with its own platform identifier. */
static const struct intel_device_info intel_device_info_atsm_g11 = {
   DG2_FEATURES,
   .platform = INTEL_PLATFORM_ATSM_G11,
};
1174
/* Initializer list shared by the MTL and ARL descriptions.
 *
 * Expands XEHP_FEATURES(0, 1, 0) and then flips the members that differ
 * from that base: no local memory, aux-map enabled, 64-bit float available
 * (through the math pipe), plus the coarse-pixel / mesh / ray-tracing bits
 * and a platform-specific PAT table.  .has_integer_dword_mul repeats the
 * value XEHP_FEATURES already sets.
 */
#define MTL_FEATURES                                            \
   /* (Sub)slice info comes from the kernel topology info */    \
   XEHP_FEATURES(0, 1, 0),                                      \
   .has_local_mem = false,                                      \
   .has_aux_map = true,                                         \
   .has_64bit_float = true,                                     \
   .has_64bit_float_via_math_pipe = true,                       \
   .has_integer_dword_mul = false,                              \
   .has_coarse_pixel_primitive_and_cb = true,                   \
   .has_mesh_shading = true,                                    \
   .has_ray_tracing = true,                                     \
   /* BSpec 45101 (r51017) */                                   \
   .pat = {                                                     \
         /* CPU: WB, GPU: PAT 3 => WB, 1WAY */                  \
         .cached_coherent = PAT_ENTRY(3, WB),                   \
         /* CPU: WC, GPU: PAT 1 => WC */                        \
         .scanout = PAT_ENTRY(1, WC),                           \
         /* CPU: WB, GPU: PAT 0 => WB, 0WAY */                  \
         .writeback_incoherent = PAT_ENTRY(0, WB),              \
         /* CPU: WC, GPU: PAT 1 => WC */                        \
         .writecombining = PAT_ENTRY(1, WC),                    \
   }
1197
/* MTL-U: MTL_FEATURES plus its own platform identifier. */
static const struct intel_device_info intel_device_info_mtl_u = {
   MTL_FEATURES,
   .platform = INTEL_PLATFORM_MTL_U,
};
1202
/* MTL-H: MTL_FEATURES plus its own platform identifier. */
static const struct intel_device_info intel_device_info_mtl_h = {
   MTL_FEATURES,
   .platform = INTEL_PLATFORM_MTL_H,
};
1207
/* ARL-U: reuses MTL_FEATURES unchanged, with its own platform identifier. */
static const struct intel_device_info intel_device_info_arl_u = {
   MTL_FEATURES,
   .platform = INTEL_PLATFORM_ARL_U,
};
1212
/* ARL-H: reuses MTL_FEATURES unchanged, with its own platform identifier. */
static const struct intel_device_info intel_device_info_arl_h = {
   MTL_FEATURES,
   .platform = INTEL_PLATFORM_ARL_H,
};
1217
/* Initializer list shared by the Xe2 descriptions (BMG and LNL).
 *
 * Expands XEHP_FEATURES(0, 1, 0) and then overrides the version numbers
 * (.ver = 20, .verx10 = 200), the GRF size, several feature bits, the PAT
 * table and the cooperative-matrix configurations.  .has_local_mem is left
 * for each device description to set.  The PAT table here has no
 * .writeback_incoherent entry, unlike the DG2 and MTL tables.
 */
#define XE2_FEATURES                                            \
   /* (Sub)slice info comes from the kernel topology info */    \
   XEHP_FEATURES(0, 1, 0),                                      \
   .ver = 20,                                                   \
   .verx10 = 200,                                               \
   .num_subslices = dual_subslices(1),                          \
   .grf_size = 64,                                              \
   .needs_null_push_constant_tbimr_workaround = false,          \
   .has_64bit_float = true,                                     \
   .has_64bit_int = true,                                       \
   .has_integer_dword_mul = false,                              \
   .has_coarse_pixel_primitive_and_cb = true,                   \
   .has_mesh_shading = true,                                    \
   .has_ray_tracing = true,                                     \
   .has_indirect_unroll = true,                                 \
   /* BSpec 71582 (r59285) */                                   \
   .pat = {                                                     \
      /* CPU: WB, GPU: PAT 1 => WB, 1WAY */                     \
      .cached_coherent = PAT_ENTRY(1, WB),                      \
      /* CPU: WC, GPU: PAT 6 => XD */                           \
      .scanout = PAT_ENTRY(6, WC),                              \
      /* CPU: WC, GPU: PAT 0 => WB */                           \
      .writecombining = PAT_ENTRY(0, WC),                       \
      /* CPU: WC, GPU: PAT 11 => XD, compressed */              \
      .compressed = PAT_ENTRY(11, WC)                           \
   },                                                           \
   .cooperative_matrix_configurations = {                       \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 16, 16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT16, INTEL_CMAT_FLOAT32, INTEL_CMAT_FLOAT32 }, \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 16, 32, INTEL_CMAT_SINT8, INTEL_CMAT_SINT8, INTEL_CMAT_SINT32, INTEL_CMAT_SINT32 },       \
      { INTEL_CMAT_SCOPE_SUBGROUP, 8, 16, 32, INTEL_CMAT_UINT8, INTEL_CMAT_UINT8, INTEL_CMAT_UINT32, INTEL_CMAT_UINT32 },       \
   },                                                           \
   .has_flat_ccs = true
1250
/* BMG: XE2_FEATURES with local memory enabled. */
static const struct intel_device_info intel_device_info_bmg = {
   XE2_FEATURES,
   .platform = INTEL_PLATFORM_BMG,
   .has_local_mem = true,
};
1256
/* LNL: XE2_FEATURES with local memory disabled, overriding the value
 * inherited from XEHP_FEATURES.
 */
static const struct intel_device_info intel_device_info_lnl = {
   XE2_FEATURES,
   .platform = INTEL_PLATFORM_LNL,
   .has_local_mem = false,
};
1262
1263 void
intel_device_info_topology_reset_masks(struct intel_device_info * devinfo)1264 intel_device_info_topology_reset_masks(struct intel_device_info *devinfo)
1265 {
1266 devinfo->subslice_slice_stride = 0;
1267 devinfo->eu_subslice_stride = 0;
1268 devinfo->eu_slice_stride = 0;
1269
1270 devinfo->num_slices = 0;
1271 memset(devinfo->num_subslices, 0, sizeof(devinfo->num_subslices));
1272
1273 memset(&devinfo->slice_masks, 0, sizeof(devinfo->slice_masks));
1274 memset(devinfo->subslice_masks, 0, sizeof(devinfo->subslice_masks));
1275 memset(devinfo->eu_masks, 0, sizeof(devinfo->eu_masks));
1276 memset(devinfo->ppipe_subslices, 0, sizeof(devinfo->ppipe_subslices));
1277 }
1278
1279 void
intel_device_info_topology_update_counts(struct intel_device_info * devinfo)1280 intel_device_info_topology_update_counts(struct intel_device_info *devinfo)
1281 {
1282 devinfo->num_slices = __builtin_popcount(devinfo->slice_masks);
1283 devinfo->subslice_total = 0;
1284 for (int s = 0; s < devinfo->max_slices; s++) {
1285 if (!intel_device_info_slice_available(devinfo, s))
1286 continue;
1287
1288 for (int b = 0; b < devinfo->subslice_slice_stride; b++) {
1289 devinfo->num_subslices[s] +=
1290 __builtin_popcount(devinfo->subslice_masks[s * devinfo->subslice_slice_stride + b]);
1291 }
1292 devinfo->subslice_total += devinfo->num_subslices[s];
1293 }
1294 assert(devinfo->num_slices > 0);
1295 assert(devinfo->subslice_total > 0);
1296 }
1297
1298 void
intel_device_info_update_pixel_pipes(struct intel_device_info * devinfo,uint8_t * subslice_masks)1299 intel_device_info_update_pixel_pipes(struct intel_device_info *devinfo, uint8_t *subslice_masks)
1300 {
1301 if (devinfo->ver < 11)
1302 return;
1303
1304 /* The kernel only reports one slice on all existing ICL+ platforms, even
1305 * if multiple slices are present. The slice mask is allowed to have the
1306 * accurate value greater than 1 on gfx12.5+ platforms though, in order to
1307 * be tolerant with the behavior of our simulation environment.
1308 */
1309 assert(devinfo->slice_masks == 1 || devinfo->verx10 >= 125);
1310
1311 /* Count the number of subslices on each pixel pipe. Assume that every
1312 * contiguous group of 4 subslices in the mask belong to the same pixel
1313 * pipe. However note that on TGL+ the kernel returns a mask of enabled
1314 * *dual* subslices instead of actual subslices somewhat confusingly, so
1315 * each pixel pipe only takes 2 bits in the mask even though it's still 4
1316 * subslices.
1317 */
1318 const unsigned ppipe_bits = devinfo->ver >= 12 ? 2 : 4;
1319 for (unsigned p = 0; p < INTEL_DEVICE_MAX_PIXEL_PIPES; p++) {
1320 const unsigned offset = p * ppipe_bits;
1321 const unsigned subslice_idx = offset /
1322 devinfo->max_subslices_per_slice * devinfo->subslice_slice_stride;
1323 const unsigned ppipe_mask =
1324 BITFIELD_RANGE(offset % devinfo->max_subslices_per_slice, ppipe_bits);
1325
1326 if (subslice_idx < ARRAY_SIZE(devinfo->subslice_masks))
1327 devinfo->ppipe_subslices[p] =
1328 __builtin_popcount(subslice_masks[subslice_idx] & ppipe_mask);
1329 else
1330 devinfo->ppipe_subslices[p] = 0;
1331 }
1332 }
1333
1334 void
intel_device_info_update_l3_banks(struct intel_device_info * devinfo)1335 intel_device_info_update_l3_banks(struct intel_device_info *devinfo)
1336 {
1337 if (devinfo->ver != 12)
1338 return;
1339
1340 if (devinfo->verx10 >= 125) {
1341 if (devinfo->subslice_total > 16) {
1342 assert(devinfo->subslice_total <= 32);
1343 devinfo->l3_banks = 32;
1344 } else if (devinfo->subslice_total > 8) {
1345 devinfo->l3_banks = 16;
1346 } else {
1347 devinfo->l3_banks = 8;
1348 }
1349 } else {
1350 assert(devinfo->num_slices == 1);
1351 if (devinfo->subslice_total >= 6) {
1352 assert(devinfo->subslice_total == 6);
1353 devinfo->l3_banks = 8;
1354 } else if (devinfo->subslice_total > 2) {
1355 devinfo->l3_banks = 6;
1356 } else {
1357 devinfo->l3_banks = 4;
1358 }
1359 }
1360 }
1361
1362 /* Returns the number of EUs of the first subslice enabled */
1363 uint32_t
intel_device_info_get_eu_count_first_subslice(const struct intel_device_info * devinfo)1364 intel_device_info_get_eu_count_first_subslice(const struct intel_device_info *devinfo)
1365 {
1366 uint32_t first_subslice, first_slice, offset, i;
1367 uint32_t eu_count = 0;
1368
1369 first_slice = ffs(devinfo->slice_masks);
1370 first_slice--;
1371 offset = first_slice * devinfo->subslice_slice_stride;
1372
1373 for (i = 0; i < DIV_ROUND_UP(devinfo->max_subslices_per_slice, 8); i++) {
1374 first_subslice = ffs(devinfo->subslice_masks[offset + i]);
1375
1376 if (first_subslice == 0)
1377 continue;
1378
1379 break;
1380 }
1381
1382 assert(first_subslice > 0);
1383 first_subslice--;
1384 offset = first_slice * devinfo->eu_slice_stride +
1385 first_subslice * devinfo->eu_subslice_stride;
1386 for (i = 0; i < DIV_ROUND_UP(devinfo->max_eus_per_subslice, 8); i++)
1387 eu_count += __builtin_popcount(devinfo->eu_masks[offset + i]);
1388
1389 assert(eu_count > 0);
1390 return eu_count;
1391 }
1392
1393 /* Generate mask from the device data. */
1394 static void
fill_masks(struct intel_device_info * devinfo)1395 fill_masks(struct intel_device_info *devinfo)
1396 {
1397 /* All of our internal device descriptions assign the same number of
1398 * subslices for each slice. Just verify that this is true.
1399 */
1400 for (int s = 1; s < devinfo->num_slices; s++)
1401 assert(devinfo->num_subslices[0] == devinfo->num_subslices[s]);
1402
1403 intel_device_info_i915_update_from_masks(devinfo,
1404 (1U << devinfo->num_slices) - 1,
1405 (1U << devinfo->num_subslices[0]) - 1,
1406 devinfo->num_slices * devinfo->num_subslices[0] *
1407 devinfo->max_eus_per_subslice);
1408 }
1409
1410 void
intel_device_info_update_cs_workgroup_threads(struct intel_device_info * devinfo)1411 intel_device_info_update_cs_workgroup_threads(struct intel_device_info *devinfo)
1412 {
1413 /* GPGPU_WALKER::ThreadWidthCounterMaximum is U6-1 so the most threads we
1414 * can program is 64 without going up to a rectangular group. This only
1415 * impacts Haswell and TGL which have higher thread counts.
1416 *
1417 * INTERFACE_DESCRIPTOR_DATA::NumberofThreadsinGPGPUThreadGroup on Xe-HP+
1418 * is 10 bits so we have no such restrictions.
1419 */
1420 devinfo->max_cs_workgroup_threads =
1421 devinfo->verx10 >= 125 ? devinfo->max_cs_threads :
1422 MIN2(devinfo->max_cs_threads, 64);
1423 }
1424
/* Parse one INTEL_FORCE_PROBE entry and apply it if it matches pci_id.
 *
 * Accepted forms are "<hex-id>", "*", "!<hex-id>" and "!*"; anything with
 * missing or trailing characters is rejected.  The id is read by strtol()
 * in base 16.
 *
 * On a match, *force_on is set for a plain entry and *force_off for a
 * negated one (the other flag is cleared).  Both flags are left untouched
 * when the entry is malformed or does not match.
 *
 * Returns true only when the entry is well formed and matches pci_id.
 */
static bool
parse_force_probe_entry(int pci_id, const char *entry, bool *force_on,
                        bool *force_off)
{
   const char *cursor = entry;

   /* Optional leading '!' turns the entry into a "force off" request. */
   bool negated = false;
   if (*cursor == '!') {
      negated = true;
      cursor++;
   }

   /* Nothing after the optional '!': not a usable entry. */
   if (*cursor == '\0')
      return false;

   bool matched;
   if (*cursor == '*') {
      /* Wildcard matches every device. */
      matched = true;
      cursor++;
   } else {
      char *end;
      const long id = strtol(cursor, &end, 16);
      if (end == cursor)
         return false;

      matched = (long)pci_id == id;
      cursor = end;
   }

   /* Trailing characters make the entry malformed. */
   if (*cursor != '\0')
      return false;

   if (!matched)
      return false;

   *force_on = !negated;
   *force_off = negated;
   return true;
}
1462
/* Evaluate the INTEL_FORCE_PROBE environment variable for pci_id.
 *
 * The variable is a comma-separated list of entries understood by
 * parse_force_probe_entry(); empty entries are skipped.  Every entry is
 * evaluated in order, so the last matching entry decides the result.
 *
 * Both outputs are cleared first and stay false when the variable is unset,
 * empty, or cannot be duplicated.
 */
static void
scan_for_force_probe(int pci_id, bool *force_on, bool *force_off)
{
   *force_on = false;
   *force_off = false;

   const char *env = getenv("INTEL_FORCE_PROBE");
   if (env == NULL)
      return;

   size_t len = strlen(env);
   if (len == 0)
      return;

   /* Work on a private copy: entries are NUL-terminated in place. */
   char *dup = strndup(env, len);
   if (dup == NULL)
      return;

   /* Split by hand instead of using strtok(): strtok() keeps hidden
    * process-wide state, so it is not reentrant and could race with any
    * other strtok() user in the application this library is loaded into.
    */
   char *entry = dup;
   for (;;) {
      const size_t entry_len = strcspn(entry, ",");
      const bool is_last = entry[entry_len] == '\0';

      entry[entry_len] = '\0';
      if (entry_len > 0)
         parse_force_probe_entry(pci_id, entry, force_on, force_off);

      if (is_last)
         break;
      entry += entry_len + 1;
   }

   free(dup);
   assert(!*force_on || !*force_off);
}
1487
/* Per-PCI-ID options, built from the optional trailing arguments of a
 * CHIPSET() entry in intel_device_info_init_common().
 */
struct device_init_config {
   /* When set, the device is only accepted if probing is forced. */
   bool require_force_probe;
};

/* Example PCI ID entry using FORCE_PROBE:
 *
 * CHIPSET(0x1234, foo, "FOO", "Intel(R) Graphics", FORCE_PROBE)
 */
#define FORCE_PROBE .require_force_probe = true
1497
/* Initialize *devinfo from the built-in description that matches pci_id and
 * compute the values that can be derived without talking to the kernel.
 *
 * pci_id   - PCI device id to look up in the pci_ids tables.
 * building - when true, probing is treated as forced and the
 *            INTEL_FORCE_PROBE environment variable is not consulted.
 * devinfo  - overwritten with the device description.
 *
 * Returns false when the id is unknown, when the device was disabled through
 * INTEL_FORCE_PROBE, or when it requires a forced probe that was not
 * requested.  On those paths *devinfo is not fully initialized.
 */
static bool
intel_device_info_init_common(int pci_id, bool building,
                              struct intel_device_info *devinfo)
{
   struct device_init_config device_config = { 0 };

   /* First pass over the PCI id tables: copy the matching static
    * description and capture the optional per-entry config.  Ids from the
    * i915 table carry no config and all map to intel_device_info_gfx3.
    */
   switch (pci_id) {
#undef CHIPSET
#define CHIPSET(id, family, fam_str, name, ...) \
   case id: \
      *devinfo = intel_device_info_##family; \
      device_config = *&(struct device_init_config) { __VA_ARGS__ }; \
      break;
#include "pci_ids/crocus_pci_ids.h"
#include "pci_ids/iris_pci_ids.h"

#undef CHIPSET
#define CHIPSET(id, fam_str, name) \
   case id: *devinfo = intel_device_info_gfx3; break;
#include "pci_ids/i915_pci_ids.h"

   default:
      mesa_logw("Driver does not support the 0x%x PCI ID.", pci_id);
      return false;
   }

   /* Second pass: fill in the human readable name.  Ids that only appear in
    * the i915 table are not listed here and get the default name.
    */
   switch (pci_id) {
#undef CHIPSET
#define CHIPSET(_id, _family, _fam_str, _name, ...) \
   case _id: \
      /* sizeof(str_literal) includes the null */ \
      STATIC_ASSERT(sizeof(_name) + sizeof(_fam_str) + 2 <= \
                    sizeof(devinfo->name)); \
      strncpy(devinfo->name, _name " (" _fam_str ")", sizeof(devinfo->name)); \
      break;
#include "pci_ids/crocus_pci_ids.h"
#include "pci_ids/iris_pci_ids.h"
   default:
      strncpy(devinfo->name, "Intel Unknown", sizeof(devinfo->name));
   }

   /* Force-probe handling: an explicit "off" always rejects the device; a
    * device flagged FORCE_PROBE is rejected unless probing was forced.
    */
   bool force_on = false;
   bool force_off = false;
   if (building)
      force_on = true;
   else
      scan_for_force_probe(pci_id, &force_on, &force_off);
   devinfo->probe_forced = force_on;
   if (force_off) {
      mesa_logw("%s (0x%x) disabled with INTEL_FORCE_PROBE", devinfo->name,
                pci_id);
      return false;
   } else if (device_config.require_force_probe) {
      if (force_on) {
         /* No warning for build-time use, where forcing is implicit. */
         if (!building)
            mesa_logw("Forcing probe of unsupported: %s (0x%x)", devinfo->name,
                      pci_id);
      } else {
         mesa_loge("%s (0x%x) requires INTEL_FORCE_PROBE", devinfo->name,
                   pci_id);
         return false;
      }
   }

   devinfo->pci_device_id = pci_id;

   /* Derive topology masks from the static slice/subslice counts. */
   fill_masks(devinfo);

   /* From the Skylake PRM, 3DSTATE_PS::Scratch Space Base Pointer:
    *
    * "Scratch Space per slice is computed based on 4 sub-slices.  SW must
    *  allocate scratch space enough so that each slice has 4 slices allowed."
    *
    * The equivalent internal documentation says that this programming note
    * applies to all Gfx9+ platforms.
    *
    * The hardware typically calculates the scratch space pointer by taking
    * the base address, and adding per-thread-scratch-space * thread ID.
    * Extra padding can be necessary depending how the thread IDs are
    * calculated for a particular shader stage.
    */

   switch(devinfo->ver) {
   case 9:
      devinfo->max_wm_threads = 64 /* threads-per-PSD */
                              * devinfo->num_slices
                              * 4; /* effective subslices per slice */
      break;
   case 11:
   case 12:
   case 20:
      devinfo->max_wm_threads = 128 /* threads-per-PSD */
                              * devinfo->num_slices
                              * 8; /* subslices per slice */
      break;
   default:
      /* Older versions keep max_wm_threads from the static description. */
      assert(devinfo->ver < 9);
      break;
   }

   assert(devinfo->num_slices <= ARRAY_SIZE(devinfo->num_subslices));

   /* Descriptions that do not set verx10 explicitly get ver * 10. */
   if (devinfo->verx10 == 0)
      devinfo->verx10 = devinfo->ver * 10;

   uint16_t major = devinfo->ver;
   uint16_t minor = (devinfo->verx10 - (devinfo->ver * 10)) * 10;
   /* When supported gfx_ip_ver will be overwritten by values read from KMD.
    * This is an approximation for platforms that do not support GMD ID or
    * when running offline tools.
    * verx10 125 becomes GFX_IP_VER(12, 50) for example.
    */
   devinfo->gfx_ip_ver = GFX_IP_VER(major, minor);

   if (devinfo->has_mesh_shading) {
      /* Half of push constant space matches the size used in the simplest
       * primitive pipeline (VS + FS). Tweaking this affects performance.
       */
      devinfo->mesh_max_constant_urb_size_kb =
         devinfo->max_constant_urb_size_kb / 2;
   }

   /*
    * Gfx 12.5 moved scratch to a surface and SURFTYPE_SCRATCH has this pitch
    * restriction:
    *
    * BSpec 43862 (r52666)
    * RENDER_SURFACE_STATE::Surface Pitch
    *    For surfaces of type SURFTYPE_SCRATCH, valid range of pitch is:
    *    [63,262143] -> [64B, 256KB]
    *
    * The pitch of the surface is the scratch size per thread and the surface
    * should be large enough to accommodate every physical thread.
    */
   devinfo->max_scratch_size_per_thread = devinfo->verx10 >= 125 ?
                                          (256 * 1024) : (2 * 1024 * 1024);
   intel_device_info_update_cs_workgroup_threads(devinfo);

   return true;
}
1637
1638 static void
intel_device_info_apply_workarounds(struct intel_device_info * devinfo)1639 intel_device_info_apply_workarounds(struct intel_device_info *devinfo)
1640 {
1641 if (intel_needs_workaround(devinfo, 18012660806))
1642 devinfo->urb.max_entries[MESA_SHADER_GEOMETRY] = 1536;
1643
1644 /* Fixes issues with:
1645 * dEQP-GLES31.functional.geometry_shading.layered.render_with_default_layer_cubemap
1646 * when running on GFX12 platforms with small EU count.
1647 */
1648 const uint32_t eu_total = intel_device_info_eu_total(devinfo);
1649 if (devinfo->verx10 == 120 && eu_total <= 32)
1650 devinfo->urb.max_entries[MESA_SHADER_GEOMETRY] = 1024;
1651 }
1652
1653 static bool
intel_get_device_info_from_pci_id_common(int pci_id,bool building,struct intel_device_info * devinfo)1654 intel_get_device_info_from_pci_id_common(int pci_id, bool building,
1655 struct intel_device_info *devinfo)
1656 {
1657 intel_device_info_init_common(pci_id, building, devinfo);
1658
1659 /* This is a placeholder until a proper value is set. */
1660 devinfo->kmd_type = INTEL_KMD_TYPE_I915;
1661
1662 intel_device_info_init_was(devinfo);
1663 intel_device_info_apply_workarounds(devinfo);
1664
1665 return true;
1666 }
1667
/* Fill *devinfo from pci_id for runtime use: calls
 * intel_get_device_info_from_pci_id_common() with building == false and
 * returns its result.
 */
bool
intel_get_device_info_from_pci_id(int pci_id,
                                  struct intel_device_info *devinfo)
{
   return intel_get_device_info_from_pci_id_common(pci_id, false, devinfo);
}
1674
/* Fill *devinfo from pci_id for build-time use: calls
 * intel_get_device_info_from_pci_id_common() with building == true and
 * returns its result.
 */
bool
intel_get_device_info_for_build(int pci_id,
                                struct intel_device_info *devinfo)
{
   return intel_get_device_info_from_pci_id_common(pci_id, true, devinfo);
}
1681
1682 bool
intel_device_info_compute_system_memory(struct intel_device_info * devinfo,bool update)1683 intel_device_info_compute_system_memory(struct intel_device_info *devinfo, bool update)
1684 {
1685 if (!update) {
1686 if (!os_get_total_physical_memory(&devinfo->mem.sram.mappable.size))
1687 return false;
1688 }
1689
1690 os_get_available_system_memory(&devinfo->mem.sram.mappable.free);
1691
1692 return true;
1693 }
1694
1695 static void
intel_device_info_adjust_memory(struct intel_device_info * devinfo)1696 intel_device_info_adjust_memory(struct intel_device_info *devinfo)
1697 {
1698 uint64_t available;
1699
1700 /* Applications running without elevated privileges don't report valid
1701 * numbers for free sram
1702 */
1703 if (os_get_available_system_memory(&available)) {
1704 devinfo->mem.sram.mappable.free = MIN3(devinfo->mem.sram.mappable.free,
1705 devinfo->mem.sram.mappable.size,
1706 available);
1707 }
1708 }
1709
1710 static void
init_max_scratch_ids(struct intel_device_info * devinfo)1711 init_max_scratch_ids(struct intel_device_info *devinfo)
1712 {
1713 /* Determine the max number of subslices that potentially might be used in
1714 * scratch space ids.
1715 *
1716 * For, Gfx11+, scratch space allocation is based on the number of threads
1717 * in the base configuration.
1718 *
1719 * For Gfx9, devinfo->subslice_total is the TOTAL number of subslices and
1720 * we wish to view that there are 4 subslices per slice instead of the
1721 * actual number of subslices per slice. The documentation for 3DSTATE_PS
1722 * "Scratch Space Base Pointer" says:
1723 *
1724 * "Scratch Space per slice is computed based on 4 sub-slices. SW
1725 * must allocate scratch space enough so that each slice has 4
1726 * slices allowed."
1727 *
1728 * According to the other driver team, this applies to compute shaders
1729 * as well. This is not currently documented at all.
1730 *
1731 * For Gfx8 and older we user devinfo->subslice_total.
1732 */
1733 unsigned subslices;
1734 if (devinfo->verx10 == 125)
1735 subslices = 32;
1736 else if (devinfo->ver == 12)
1737 subslices = (devinfo->platform == INTEL_PLATFORM_DG1 || devinfo->gt == 2 ? 6 : 2);
1738 else if (devinfo->ver == 11)
1739 subslices = 8;
1740 else if (devinfo->ver >= 9 && devinfo->ver < 11)
1741 subslices = 4 * devinfo->num_slices;
1742 else
1743 subslices = devinfo->subslice_total;
1744 assert(subslices >= devinfo->subslice_total);
1745
1746 unsigned scratch_ids_per_subslice;
1747 if (devinfo->ver >= 12) {
1748 /* Same as ICL below, but with 16 EUs. */
1749 scratch_ids_per_subslice = 16 * 8;
1750 } else if (devinfo->ver >= 11) {
1751 /* The MEDIA_VFE_STATE docs say:
1752 *
1753 * "Starting with this configuration, the Maximum Number of
1754 * Threads must be set to (#EU * 8) for GPGPU dispatches.
1755 *
1756 * Although there are only 7 threads per EU in the configuration,
1757 * the FFTID is calculated as if there are 8 threads per EU,
1758 * which in turn requires a larger amount of Scratch Space to be
1759 * allocated by the driver."
1760 */
1761 scratch_ids_per_subslice = 8 * 8;
1762 } else if (devinfo->platform == INTEL_PLATFORM_HSW) {
1763 /* WaCSScratchSize:hsw
1764 *
1765 * Haswell's scratch space address calculation appears to be sparse
1766 * rather than tightly packed. The Thread ID has bits indicating
1767 * which subslice, EU within a subslice, and thread within an EU it
1768 * is. There's a maximum of two slices and two subslices, so these
1769 * can be stored with a single bit. Even though there are only 10 EUs
1770 * per subslice, this is stored in 4 bits, so there's an effective
1771 * maximum value of 16 EUs. Similarly, although there are only 7
1772 * threads per EU, this is stored in a 3 bit number, giving an
1773 * effective maximum value of 8 threads per EU.
1774 *
1775 * This means that we need to use 16 * 8 instead of 10 * 7 for the
1776 * number of threads per subslice.
1777 */
1778 scratch_ids_per_subslice = 16 * 8;
1779 } else if (devinfo->platform == INTEL_PLATFORM_CHV) {
1780 /* Cherryview devices have either 6 or 8 EUs per subslice, and each
1781 * EU has 7 threads. The 6 EU devices appear to calculate thread IDs
1782 * as if it had 8 EUs.
1783 */
1784 scratch_ids_per_subslice = 8 * 7;
1785 } else {
1786 scratch_ids_per_subslice = devinfo->max_cs_threads;
1787 }
1788
1789 unsigned max_thread_ids = scratch_ids_per_subslice * subslices;
1790
1791 if (devinfo->verx10 >= 125) {
1792 /* On GFX version 12.5, scratch access changed to a surface-based model.
1793 * Instead of each shader type having its own layout based on IDs passed
1794 * from the relevant fixed-function unit, all scratch access is based on
1795 * thread IDs like it always has been for compute.
1796 */
1797 for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_STAGES; i++)
1798 devinfo->max_scratch_ids[i] = max_thread_ids;
1799 } else {
1800 unsigned max_scratch_ids[] = {
1801 [MESA_SHADER_VERTEX] = devinfo->max_vs_threads,
1802 [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
1803 [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
1804 [MESA_SHADER_GEOMETRY] = devinfo->max_gs_threads,
1805 [MESA_SHADER_FRAGMENT] = devinfo->max_wm_threads,
1806 [MESA_SHADER_COMPUTE] = max_thread_ids,
1807 };
1808 STATIC_ASSERT(sizeof(devinfo->max_scratch_ids) == sizeof(max_scratch_ids));
1809 memcpy(devinfo->max_scratch_ids, max_scratch_ids,
1810 sizeof(devinfo->max_scratch_ids));
1811 }
1812 }
1813
1814 static unsigned
intel_device_info_calc_engine_prefetch(const struct intel_device_info * devinfo,enum intel_engine_class engine_class)1815 intel_device_info_calc_engine_prefetch(const struct intel_device_info *devinfo,
1816 enum intel_engine_class engine_class)
1817 {
1818 if (devinfo->verx10 >= 200) {
1819 switch (engine_class) {
1820 case INTEL_ENGINE_CLASS_RENDER:
1821 return 4096;
1822 case INTEL_ENGINE_CLASS_COMPUTE:
1823 return 1024;
1824 default:
1825 return 512;
1826 }
1827 }
1828
1829 if (intel_device_info_is_mtl_or_arl(devinfo)) {
1830 switch (engine_class) {
1831 case INTEL_ENGINE_CLASS_RENDER:
1832 return 2048;
1833 case INTEL_ENGINE_CLASS_COMPUTE:
1834 return 1024;
1835 default:
1836 return 512;
1837 }
1838 }
1839
1840 /* DG2 */
1841 if (devinfo->verx10 == 125)
1842 return 1024;
1843
1844 /* Older than DG2/MTL */
1845 return 512;
1846 }
1847
/* Fill *devinfo for the device behind the DRM file descriptor fd.
 *
 * fd      - open DRM device file descriptor.
 * devinfo - receives the description.
 * min_ver - when > 0, devices with devinfo->ver below it are rejected.
 * max_ver - when > 0, devices with devinfo->ver above it are rejected.
 *
 * Returns true on success.  Returns false when the DRM device cannot be
 * queried, the PCI id is rejected by intel_device_info_init_common(), the
 * version is out of the requested range, the kernel driver is unknown, or
 * the kernel queries fail.
 */
bool
intel_get_device_info_from_fd(int fd, struct intel_device_info *devinfo, int min_ver, int max_ver)
{
   if (NULL != getenv("INTEL_STUB_GPU_JSON")) {
      /* This call will succeed when shim-drm has been initialized with a
       * serialized intel_device_info structure.
       *
       * If the ioctl fails, fall through to the regular probing path.
       */
      struct drm_intel_stub_devinfo arg = {
         .addr = (uintptr_t)devinfo,
         .size = sizeof(*devinfo),
      };
      if (0 == intel_ioctl(fd, DRM_IOCTL_INTEL_STUB_DEVINFO, &arg)) {
         intel_device_info_init_was(devinfo);
         intel_device_info_apply_workarounds(devinfo);
         return true;
      }
   }

   /* Get PCI info.
    *
    * Some callers may already have a valid drm device which holds values of
    * PCI fields queried here prior to calling this function. But making this
    * query optional leads to a more cumbersome implementation. These callers
    * still need to initialize the fields somewhere out of this function and
    * rely on an ioctl to get PCI device id for the next step when skipping
    * this drm query.
    */
   drmDevicePtr drmdev = NULL;
   if (drmGetDevice2(fd, DRM_DEVICE_GET_PCI_REVISION, &drmdev)) {
      mesa_loge("Failed to query drm device.");
      return false;
   }
   /* Every early return below has to release drmdev first. */
   if (!intel_device_info_init_common(drmdev->deviceinfo.pci->device_id,
                                      false, devinfo)) {
      drmFreeDevice(&drmdev);
      return false;
   }

   /* A bound of 0 (or less) disables that side of the version check. */
   if ((min_ver > 0 && devinfo->ver < min_ver) || (max_ver > 0 && devinfo->ver > max_ver)) {
      drmFreeDevice(&drmdev);
      return false;
   }

   devinfo->pci_domain = drmdev->businfo.pci->domain;
   devinfo->pci_bus = drmdev->businfo.pci->bus;
   devinfo->pci_dev = drmdev->businfo.pci->dev;
   devinfo->pci_func = drmdev->businfo.pci->func;
   devinfo->pci_device_id = drmdev->deviceinfo.pci->device_id;
   devinfo->pci_revision_id = drmdev->deviceinfo.pci->revision_id;
   drmFreeDevice(&drmdev);
   devinfo->no_hw = debug_get_bool_option("INTEL_NO_HW", false);

   devinfo->kmd_type = intel_get_kmd_type(fd);
   if (devinfo->kmd_type == INTEL_KMD_TYPE_INVALID) {
      mesa_loge("Unknown kernel mode driver");
      return false;
   }

   /* remaining initialization queries the kernel for device info */
   if (devinfo->no_hw) {
      /* Provide some sensible values for NO_HW. */
      devinfo->gtt_size =
         devinfo->ver >= 8 ? (1ull << 48) : 2ull * 1024 * 1024 * 1024;
      /* The result of the memory query is not checked on this path, and the
       * scratch id, prefetch and workaround setup below is skipped.
       */
      intel_device_info_compute_system_memory(devinfo, false);
      return true;
   }

   bool ret;
   switch (devinfo->kmd_type) {
   case INTEL_KMD_TYPE_I915:
      ret = intel_device_info_i915_get_info_from_fd(fd, devinfo);
      break;
   case INTEL_KMD_TYPE_XE:
      ret = intel_device_info_xe_get_info_from_fd(fd, devinfo);
      if (devinfo->verx10 < 200)
         mesa_logw("Support for this platform is experimental with Xe KMD, bug reports may be ignored.");
      break;
   default:
      ret = false;
      unreachable("Missing");
   }
   if (!ret) {
      mesa_logw("Could not get intel_device_info.");
      return false;
   }

   /* region info is required for lmem support */
   if (devinfo->has_local_mem && !devinfo->mem.use_class_instance) {
      mesa_logw("Could not query local memory size.");
      return false;
   }

   intel_device_info_adjust_memory(devinfo);

   /* Gfx7 and older do not support EU/Subslice info */
   assert(devinfo->subslice_total >= 1 || devinfo->ver <= 7);
   devinfo->subslice_total = MAX2(devinfo->subslice_total, 1);

   init_max_scratch_ids(devinfo);

   for (enum intel_engine_class engine = INTEL_ENGINE_CLASS_RENDER;
        engine < ARRAY_SIZE(devinfo->engine_class_prefetch); engine++)
      devinfo->engine_class_prefetch[engine] =
            intel_device_info_calc_engine_prefetch(devinfo, engine);

   intel_device_info_init_was(devinfo);
   intel_device_info_apply_workarounds(devinfo);

   return true;
}
1958
intel_device_info_update_memory_info(struct intel_device_info * devinfo,int fd)1959 bool intel_device_info_update_memory_info(struct intel_device_info *devinfo, int fd)
1960 {
1961 bool ret;
1962
1963 switch (devinfo->kmd_type) {
1964 case INTEL_KMD_TYPE_I915:
1965 ret = intel_device_info_i915_query_regions(devinfo, fd, true);
1966 break;
1967 case INTEL_KMD_TYPE_XE:
1968 ret = intel_device_info_xe_query_regions(fd, devinfo, true);
1969 break;
1970 default:
1971 ret = false;
1972 }
1973
1974 if (ret)
1975 intel_device_info_adjust_memory(devinfo);
1976 return ret;
1977 }
1978
/* Recompute the values derived from fields that hwconfig may have changed:
 * max_cs_threads (EUs per subslice times threads per EU) and, from it,
 * max_cs_workgroup_threads.
 */
void
intel_device_info_update_after_hwconfig(struct intel_device_info *devinfo)
{
   /* After applying hwconfig values, some items need to be recalculated. */
   devinfo->max_cs_threads =
      devinfo->max_eus_per_subslice * devinfo->num_thread_per_eu;

   /* Depends on the max_cs_threads value written just above. */
   intel_device_info_update_cs_workgroup_threads(devinfo);
}
1988
1989 enum intel_wa_steppings
intel_device_info_wa_stepping(struct intel_device_info * devinfo)1990 intel_device_info_wa_stepping(struct intel_device_info *devinfo)
1991 {
1992 /* When adding platforms to this function, check to see if
1993 * stepping-specific workarounds impact the compiler.
1994 *
1995 * If a stepping specific compiler workaround is required on a released
1996 * platform, intel_device_info->revision must be added as a
1997 * 'compiler_field' in intel_device_info.py
1998 */
1999
2000 if (devinfo->platform == INTEL_PLATFORM_BMG) {
2001 switch (devinfo->revision) {
2002 case 0:
2003 return INTEL_STEPPING_A0;
2004 case 1:
2005 return INTEL_STEPPING_A1;
2006 case 4:
2007 return INTEL_STEPPING_B0;
2008 default:
2009 return INTEL_STEPPING_RELEASE;
2010 }
2011 } else if (devinfo->platform == INTEL_PLATFORM_LNL) {
2012 switch (devinfo->revision) {
2013 case 0:
2014 return INTEL_STEPPING_A0;
2015 case 1:
2016 return INTEL_STEPPING_A1;
2017 case 4:
2018 return INTEL_STEPPING_B0;
2019 default:
2020 return INTEL_STEPPING_RELEASE;
2021 }
2022 } else if (devinfo->platform == INTEL_PLATFORM_TGL) {
2023 /* TGL production steppings: B0 and C0 */
2024 switch (devinfo->revision) {
2025 case 1:
2026 return INTEL_STEPPING_B0;
2027 case 3:
2028 return INTEL_STEPPING_C0;
2029 default:
2030 return INTEL_STEPPING_RELEASE;
2031 }
2032 }
2033
2034 /* all other platforms support only released steppings */
2035 return INTEL_STEPPING_RELEASE;
2036 }
2037
2038 uint32_t
intel_device_info_get_max_slm_size(const struct intel_device_info * devinfo)2039 intel_device_info_get_max_slm_size(const struct intel_device_info *devinfo)
2040 {
2041 uint32_t bytes = 0;
2042
2043 if (devinfo->verx10 >= 200) {
2044 bytes = intel_device_info_get_max_preferred_slm_size(devinfo);
2045 } else {
2046 bytes = 64 * 1024;
2047 }
2048
2049 return bytes;
2050 }
2051
2052 uint32_t
intel_device_info_get_max_preferred_slm_size(const struct intel_device_info * devinfo)2053 intel_device_info_get_max_preferred_slm_size(const struct intel_device_info *devinfo)
2054 {
2055 uint32_t k_bytes = 0;
2056
2057 if (devinfo->verx10 >= 200) {
2058 if (intel_needs_workaround(devinfo, 16018610683))
2059 k_bytes = 128;
2060 else
2061 k_bytes = 160;
2062 } else {
2063 k_bytes = 128;
2064 }
2065
2066 return k_bytes * 1024;
2067 }
2068