xref: /aosp_15_r20/external/mesa3d/src/freedreno/common/freedreno_dev_info.h (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2020 Valve Corporation
3  * SPDX-License-Identifier: MIT
4  *
5  */
6 
7 #ifndef FREEDRENO_DEVICE_INFO_H
8 #define FREEDRENO_DEVICE_INFO_H
9 
10 #include <assert.h>
11 #include <stdbool.h>
12 #include <stdint.h>
13 
14 #ifdef __cplusplus
15 extern "C" {
16 #endif
17 
18 /**
19  * Freedreno hardware description and quirks
20  */
21 
struct fd_dev_info {
   uint8_t chip;

   /* Tile sizes are aligned to these values. */
   uint32_t tile_align_w;
   uint32_t tile_align_h;

   /* Granularity of gmem loads and stores. */
   uint32_t gmem_align_w;
   uint32_t gmem_align_h;

   /* Largest permitted tile size. */
   uint32_t tile_max_w;
   uint32_t tile_max_h;

   uint32_t num_vsc_pipes;

   uint32_t cs_shared_mem_size;

   int wave_granularity;

   /* Input to the private memory calculations. */
   uint32_t fibers_per_sp;

   uint32_t threadsize_base;

   uint32_t max_waves;

   /* The CCU count always matches the SP count, so both names share
    * one storage location.
    */
   union {
      uint32_t num_sp_cores;
      uint32_t num_ccu;
   };

   struct {
      uint32_t reg_size_vec4;

      /* Size of the instruction cache a shader is preloaded into, in
       * instrlen units (128 bytes each).  Loading more than this could
       * trigger a hang on gen3 and later.
       */
      uint32_t instr_cache_size;

      bool has_hw_multiview;

      bool has_fs_tex_prefetch;

      /* Set when the PC_MULTIVIEW_MASK register exists. */
      bool supports_multiview_mask;

      /* Inputs for setting RB_CCU_CNTL. */
      bool concurrent_resolve;
      bool has_z24uint_s8uint;

      bool tess_use_shared;

      /* Set when the hw supports GL_QCOM_shading_rate. */
      bool has_shading_rate;

      /* Set when a 16-bit descriptor can be used. */
      bool storage_16bit;

      /* With the latest known a630_sqe.fw, CP_DRAW_INDIRECT_MULTI reads
       * the indirect buffer without first waiting for WFI, so a fallback
       * to CP_WAIT_FOR_ME is required everywhere except a650, whose
       * firmware is fixed.
       *
       * TODO: A newer a630_sqe.fw may be released that fixes this; if
       * so it should be detected and the workaround skipped.  Once there
       * is uapi to query the fw version, this can be replaced with a
       * minimum fw version.
       */
      bool indirect_draw_wfm_quirk;

      /* Some GPUs need the depth test enabled whenever the depth bounds
       * test is enabled and the depth attachment uses UBWC.
       */
      bool depth_bounds_require_depth_test_quirk;

      bool has_tex_filter_cubic;
      bool has_separate_chroma_filter;

      bool has_sample_locations;

      /* Firmware on newer a6xx drops CP_REG_WRITE support, since direct
       * register writes can now be used for these regs.
       */
      bool has_cp_reg_write;

      bool has_8bpp_ubwc;

      bool has_lpac;

      bool has_getfiberid;

      bool has_dp2acc;
      bool has_dp4acc;

      /* LRZ fast-clear works on every gen, but the blob disables it on
       * gen1 and gen2.  It is disabled here on those gens as well: the
       * gains are close to none, it adds complexity, and it seems to
       * behave a bit differently from gen3+.  That difference creates at
       * least one edge case: if the first draw using LRZ fast-clear does
       * not lock the LRZ direction, the fast-clear value is undefined.
       * For details see
       * https://gitlab.freedesktop.org/mesa/mesa/-/issues/6829
       */
      bool enable_lrz_fast_clear;
      bool has_lrz_dir_tracking;
      bool lrz_track_quirk;
      bool has_lrz_feedback;

      /* Some generations have a bit that adds the multiview index to the
       * viewport index, which makes per-view scaling possible.
       */
      bool has_per_view_viewport;
      bool has_gmem_fast_clear;

      /* GMEM amount reserved per CCU for each of the DEPTH and COLOR
       * caches in sysmem rendering.
       */
      uint32_t sysmem_per_ccu_depth_cache_size;
      uint32_t sysmem_per_ccu_color_cache_size;

      /* GMEM amount reserved per CCU for the color cache used by GMEM
       * resolves that require color cache (non-BLIT event case),
       * expressed as a fraction of the ccu cache used by sysmem
       * rendering.  When a GMEM resolve requires color cache, the driver
       * must make sure it does not overwrite pixel data in GMEM that is
       * still needed.
       *
       * See enum a6xx_ccu_cache_size.
       */
      uint32_t gmem_ccu_color_cache_fraction;

      /* Corresponds to HLSQ_CONTROL_1_REG::PRIMALLOCTHRESHOLD. */
      uint32_t prim_alloc_threshold;

      uint32_t vs_max_inputs_count;

      bool supports_double_threadsize;

      bool has_sampler_minmax;

      bool broken_ds_ubwc_quirk;

      /* See ir3_compiler::has_scalar_alu. */
      bool has_scalar_alu;
      /* See ir3_compiler::has_early_preamble. */
      bool has_early_preamble;

      bool has_isam_v;
      bool has_ssbo_imm_offsets;

      /* Set when writing to a UBWC attachment and reading the same image
       * as an input attachment or texture reads correct values.  When
       * false, stale values may be read from the flag buffer, giving
       * incorrect image contents.  Happens with
       * VK_EXT_attachment_feedback_loop_layout.
       */
      bool has_coherent_ubwc_flag_caches;

      struct {
         uint32_t PC_POWER_CNTL;
         uint32_t TPL1_DBG_ECO_CNTL;
         uint32_t GRAS_DBG_ECO_CNTL;
         uint32_t SP_CHICKEN_BITS;
         uint32_t UCHE_CLIENT_PF;
         uint32_t PC_MODE_CNTL;
         uint32_t SP_DBG_ECO_CNTL;
         uint32_t RB_DBG_ECO_CNTL;
         uint32_t RB_DBG_ECO_CNTL_blit;
         uint32_t HLSQ_DBG_ECO_CNTL;
         uint32_t RB_UNKNOWN_8E01;
         uint32_t VPC_DBG_ECO_CNTL;
         uint32_t UCHE_UNKNOWN_0E12;

         uint32_t RB_UNKNOWN_8E06;
      } magic;

      struct {
         uint32_t reg;
         uint32_t value;
      } magic_raw[64];

      /* Maximum number of descriptor sets. */
      uint32_t max_sets;

      float line_width_min;
      float line_width_max;
   } a6xx;

   struct {
      /* stsc may have to be issued twice for the same range to work
       * around _something_; observed in the blob's disassembly.
       */
      bool stsc_duplication_quirk;

      /* Set when CP_EVENT_WRITE7::WRITE_SAMPLE_COUNT exists. */
      bool has_event_write_sample_count;

      /* The blob executes a special compute dispatch at the start of
       * each command buffer.  That dispatch is copied as is.
       */
      bool cmdbuf_start_a725_quirk;

      bool load_inline_uniforms_via_preamble_ldgk;
      bool load_shader_consts_via_preamble;

      bool has_gmem_vpc_attr_buf;
      /* Size of the buffer in gmem for VPC attributes. */
      uint32_t sysmem_vpc_attr_buf_size;
      uint32_t gmem_vpc_attr_buf_size;

      /* Set when UBWC is supported on all IBOs.  Before this, only
       * readonly or writeonly IBOs could use UBWC, and mixing reads and
       * writes was not permitted.
       */
      bool supports_ibo_ubwc;

      /* Set when the UBWC fast-clear values for snorm, unorm, and int
       * formats are identical, which is the case from a740 onwards.
       * These formats were otherwise already UBWC-compatible, so this
       * makes them fully compatible.
       */
      bool ubwc_unorm_snorm_int_compatible;

      /* Zero consts in one FS may corrupt consts in following FSs; on
       * such GPUs the blob never has zero consts in an FS.  The
       * corruption mechanism is unknown.
       */
      bool fs_must_have_non_zero_constlen_quirk;

      /* a750 hardware bug: certain VPC sizes in a GS whose input
       * primitive type is a triangle with adjacency can hang once the
       * vertex count is high enough.
       */
      bool gs_vpc_adjacency_quirk;

      /* On a740, TPL1_DBG_ECO_CNTL1.TP_UBWC_FLAG_HINT must be identical
       * across all drivers in the system; differing values somehow
       * affect BLIT_OP_SCALE.  The blob's value cannot be matched
       * automatically, so a toggle is the best available option.
       */
      bool enable_tp_ubwc_flag_hint;

      bool storage_8bit;

      /* A750+ added a special flag that lets the HW interpret UBWC
       * correctly, including UBWC fast-clear, when an image is cast to a
       * different format permitted by Vulkan.  UBWC can therefore stay
       * enabled for an image whose mutable formats list holds e.g.
       * R32_UINT and R8G8B8A8_UNORM.
       */
      bool ubwc_all_formats_compatible;

      bool has_compliant_dp4acc;

      /* Set when one clear blit can serve both sysmem and gmem. */
      bool has_generic_clear;

      /* a750 bug: writing and then reading a UBWC-compressed IBO
       * requires flushing UCHE.  Reproducible in many CTS tests, for
       * example dEQP-VK.image.load_store.with_format.2d.*.
       */
      bool ubwc_coherency_quirk;
   } a7xx;
};
281 
/**
 * Device identifier: a gpu-id and/or a chip-id.
 */
struct fd_dev_id {
   uint32_t gpu_id;
   uint64_t chip_id;
};

/**
 * Return the gpu-id of a device.
 *
 * Note that gpu-id should be considered deprecated.  For newer a6xx, if
 * there is no gpu-id, this attempts to generate one from the chip-id.
 * But that may not work forever, so avoid depending on this for newer
 * gens.
 *
 * @param id  device identifier; at least one of gpu_id and chip_id must
 *            be non-zero (checked with assert()).
 * @return id->gpu_id when it is non-zero, otherwise a value derived from
 *         bits 8..31 of id->chip_id.
 */
static inline uint32_t
fd_dev_gpu_id(const struct fd_dev_id *id)
{
   assert(id->gpu_id || id->chip_id);

   if (id->gpu_id)
      return id->gpu_id;

   /* Bytes 3, 2 and 1 of the chip-id are weighted by 100, 10 and 1.
    * The sum is at most 255 * 111, so narrowing to 32 bits is lossless.
    */
   return (uint32_t)(((id->chip_id >> 24) & 0xff) * 100 +
                     ((id->chip_id >> 16) & 0xff) * 10 +
                     ((id->chip_id >>  8) & 0xff));
}
305 
/* Dev info exactly as defined in freedreno_devices.py, with nothing
 * applied on top.
 */
const struct fd_dev_info *
fd_dev_info_raw(const struct fd_dev_id *id);

/* Final dev info, returned by value, with dbg options and everything
 * else applied.
 */
const struct fd_dev_info
fd_dev_info(const struct fd_dev_id *id);

/* Same kind of result as fd_dev_info_raw(), but selected by a name
 * string rather than by a device id.
 */
const struct fd_dev_info *
fd_dev_info_raw_by_name(const char *name);
313 
314 static uint8_t
fd_dev_gen(const struct fd_dev_id * id)315 fd_dev_gen(const struct fd_dev_id *id)
316 {
317    return fd_dev_info_raw(id)->chip;
318 }
319 
/**
 * Return true when fd_dev_gen() reports 5 or higher for the device.
 */
static inline bool
fd_dev_64b(const struct fd_dev_id *id)
{
   return fd_dev_gen(id) >= 5;
}
325 
/* GMEM amount reserved per CCU for the depth cache in direct rendering. */
#define A6XX_CCU_DEPTH_SIZE (64 * 1024)

/* GMEM amount reserved per CCU for the color cache used by GMEM resolves
 * that require color cache (non-BLIT event case).
 *
 * This is smaller than what direct rendering normally uses; the
 * RB_CCU_CNTL.GMEM bit enables this smaller size.  When a GMEM resolve
 * requires color cache, the driver must make sure it does not overwrite
 * pixel data in GMEM that is still needed.
 */
#define A6XX_CCU_GMEM_COLOR_SIZE (16 * 1024)
336 
/* Returns a string for the given device id.
 * NOTE(review): presumably a human-readable device name; confirm
 * ownership and lifetime of the returned pointer in the implementation.
 */
const char *
fd_dev_name(const struct fd_dev_id *id);

/* Takes a mutable dev info.
 * NOTE(review): presumably adjusts it in place according to debug
 * options; confirm against the implementation.
 */
void
fd_dev_info_apply_dbg_options(struct fd_dev_info *info);
341 
342 #ifdef __cplusplus
343 } /* end of extern "C" */
344 #endif
345 
346 #endif /* FREEDRENO_DEVICE_INFO_H */
347