/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#ifndef AC_SHADER_UTIL_H
#define AC_SHADER_UTIL_H

#include "ac_binary.h"
#include "amd_family.h"
#include "compiler/nir/nir.h"
#include "compiler/shader_enums.h"
#include "util/format/u_format.h"

#include <stdbool.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

#define AC_SENDMSG_GS           2
#define AC_SENDMSG_GS_DONE      3
#define AC_SENDMSG_GS_ALLOC_REQ 9

#define AC_SENDMSG_GS_OP_NOP      (0 << 4)
#define AC_SENDMSG_GS_OP_CUT      (1 << 4)
#define AC_SENDMSG_GS_OP_EMIT     (2 << 4)
#define AC_SENDMSG_GS_OP_EMIT_CUT (3 << 4)

/* An extension of gl_access_qualifier describing other aspects of memory operations
 * for code generation.
 */
enum {
   /* Only one of LOAD/STORE/ATOMIC can be set. */
   ACCESS_TYPE_LOAD            = BITFIELD_BIT(27),
   ACCESS_TYPE_STORE           = BITFIELD_BIT(28),
   ACCESS_TYPE_ATOMIC          = BITFIELD_BIT(29),

   /* This access is expected to use an SMEM instruction if source operands are non-divergent.
    * Only loads can set this.
    */
   ACCESS_TYPE_SMEM            = BITFIELD_BIT(30),

   /* Whether a store offset or size alignment is less than 4. */
   ACCESS_MAY_STORE_SUBDWORD   = BITFIELD_BIT(31),
};

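/* Example (illustrative sketch, not part of this header's API): these bits are meant to be
 * ORed into the gl_access_qualifier bits of a memory operation before querying the hardware
 * cache flags declared below, e.g. for a uniform load expected to go through SMEM:
 *
 *    enum gl_access_qualifier access = ac_get_mem_access_flags(intrin);
 *    access |= ACCESS_TYPE_LOAD | ACCESS_TYPE_SMEM;
 *    union ac_hw_cache_flags cache = ac_get_hw_cache_flags(gfx_level, access);
 *
 * "intrin" and "gfx_level" are assumed to come from the caller's context.
 */
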
/* GFX6-11. The meaning of these flags differs between chips. They match LLVM definitions,
 * but they can also be used by ACO. Use ac_get_hw_cache_flags to get these.
 */
enum ac_cache_flags
{
   ac_glc = BITFIELD_BIT(0),
   ac_slc = BITFIELD_BIT(1),
   ac_dlc = BITFIELD_BIT(2),
   ac_swizzled = BITFIELD_BIT(3),
};

/* Cache-agnostic scope flags. */
enum gfx12_scope
{
   /* Memory access is coherent within a workgroup in CU mode.
    * There is no coherency between VMEM and SMEM.
    */
   gfx12_scope_cu,

   /* Memory access is coherent within an SE.
    * If there is no SE cache, this resolves to the device scope in the gfx domain.
    */
   gfx12_scope_se,

   /* Memory access is globally coherent within the device for all gfx blocks except CP and GE
    * depending on the chip (see below). This is referred to as the device scope. It's not coherent
    * with non-gfx blocks like DCN and VCN.
    *
    * If there is a single global GL2 cache:
    *    - The device scope in the gfx domain resolves to GL2 scope in hw.
    *    - Memory access is cached in GL2.
    *    - radeon_info::cp_sdma_ge_use_system_memory_scope says whether CP, SDMA, and GE are
    *      not coherent. If true, some features need special handling. The list of the features
    *      and the suggested programming is:
    *      * tess factor ring for GE: use ACCESS_CP_GE_COHERENT_AMD (it selects the correct scope
    *        automatically)
    *      * query results read by shaders and SET_PREDICATION: use AMDGPU_VM_MTYPE_UC,
    *        but use VRAM for queries not read by the CPU for better performance
    *      * vertex indices for GE: flush GL2 after buffer stores, but don't invalidate
    *      * draw indirect for CP: flush GL2 after buffer stores, but don't invalidate
    *      * shader uploads via SDMA: invalidate GL2 at the beginning of IBs
    *      * PRIME buffer read by SDMA: the kernel flushes GL2 at the end of IBs
    *      * CP DMA clears/copies: use compute shaders or range-flush/invalidate GL2 around it
    *      * CP DMA prefetch: no change
    *      * COPY_DATA - FILLED_SIZE state for streamout: range-flush/invalidate GL2
    *      * WRITE_DATA - bindless descriptors: range-invalidate GL2
    *
    * If there is a separate GL2 cache per SE:
    *    - The device scope resolves to memory scope in hw.
    *    - Memory access is cached in MALL if MALL (infinity cache) is present.
    *    - radeon_info::cp_sdma_ge_use_system_memory_scope is always false in this case.
    */
   gfx12_scope_device,

   /* Memory scope. It's cached if MALL is present. This is called "system scope" in the ISA
    * documentation.
    */
   gfx12_scope_memory,
};

enum gfx12_load_temporal_hint
{
   /* VMEM and SMEM */
   gfx12_load_regular_temporal,
   gfx12_load_non_temporal,
   gfx12_load_high_temporal,
   /* VMEM$ treats SCOPE=3 and TH=3 as MALL bypass on GFX12. Don't use this combination in shaders. */
   gfx12_load_last_use_discard,
   /* VMEM only; "far" means the last-level cache, "near" means the other caches. */
   gfx12_load_near_non_temporal_far_regular_temporal,
   gfx12_load_near_regular_temporal_far_non_temporal,
   gfx12_load_near_non_temporal_far_high_temporal,
   gfx12_load_reserved,
};

enum gfx12_store_temporal_hint
{
   gfx12_store_regular_temporal,
   gfx12_store_non_temporal,
   gfx12_store_high_temporal,
   gfx12_store_high_temporal_stay_dirty,
   gfx12_store_near_non_temporal_far_regular_temporal,
   gfx12_store_near_regular_temporal_far_non_temporal,
   gfx12_store_near_non_temporal_far_high_temporal,
   gfx12_store_near_non_temporal_far_writeback,
};

enum gfx12_atomic_temporal_hint
{
   gfx12_atomic_return = BITFIELD_BIT(0),
   gfx12_atomic_non_temporal = BITFIELD_BIT(1),
   gfx12_atomic_accum_deferred_scope = BITFIELD_BIT(2), /* requires no return */
};

enum gfx12_speculative_data_read
{
   gfx12_spec_read_auto,
   gfx12_spec_read_force_on,
   gfx12_spec_read_force_off,
};

union ac_hw_cache_flags
{
   struct {
      /* This matches LLVM, but it can also be used by ACO for translation of ac_memop_flags. */
      uint8_t temporal_hint:3;   /* gfx12_{load,store,atomic}_temporal_hint */
      uint8_t scope:2;           /* gfx12_scope */
      uint8_t _reserved:1;
      uint8_t swizzled:1;        /* for swizzled buffer access (attribute ring) */
      uint8_t _pad:1;
   } gfx12;

   uint8_t value; /* ac_cache_flags (GFX6-11) or the gfx12 structure */
};
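
/* Example (illustrative sketch): on GFX12, a swizzled attribute-ring store at device scope
 * could be described as:
 *
 *    union ac_hw_cache_flags flags = {0};
 *    flags.gfx12.temporal_hint = gfx12_store_regular_temporal;
 *    flags.gfx12.scope = gfx12_scope_device;
 *    flags.gfx12.swizzled = 1;
 *
 * On GFX6-11 only "value" is used and it simply carries ac_cache_flags bits,
 * e.g. flags.value = ac_glc | ac_dlc.
 */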

enum ac_image_dim
{
   ac_image_1d,
   ac_image_2d,
   ac_image_3d,
   ac_image_cube, // includes cube arrays
   ac_image_1darray,
   ac_image_2darray,
   ac_image_2dmsaa,
   ac_image_2darraymsaa,
};

struct ac_data_format_info {
   uint8_t element_size;
   uint8_t num_channels;
   uint8_t chan_byte_size;
   uint8_t chan_format;
};

enum ac_vs_input_alpha_adjust {
   AC_ALPHA_ADJUST_NONE = 0,
   AC_ALPHA_ADJUST_SNORM = 1,
   AC_ALPHA_ADJUST_SSCALED = 2,
   AC_ALPHA_ADJUST_SINT = 3,
};

struct ac_vtx_format_info {
   uint16_t dst_sel;
   uint8_t element_size;
   uint8_t num_channels;
   uint8_t chan_byte_size; /* 0 for packed formats */

   /* These last three fields depend on the family. */

   uint8_t has_hw_format;
   /* The index is the number of channels minus one; use any index for packed formats.
    * On GFX6-8, the value encodes dfmt in bits [0:3] and nfmt in bits [4:7].
    */
   uint8_t hw_format[4];
   enum ac_vs_input_alpha_adjust alpha_adjust : 8;
};
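
/* Example (illustrative sketch): unpacking the GFX6-8 encoding described above for a format
 * with "num_channels" channels:
 *
 *    const struct ac_vtx_format_info *vtx = ac_get_vtx_format_info(level, family, fmt);
 *    uint8_t hw = vtx->hw_format[vtx->num_channels - 1];
 *    unsigned dfmt = hw & 0xf;  // bits [0:3]
 *    unsigned nfmt = hw >> 4;   // bits [4:7]
 *
 * "level", "family", and "fmt" are assumed to come from the caller's context.
 */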

struct ac_spi_color_formats {
   unsigned normal : 8;
   unsigned alpha : 8;
   unsigned blend : 8;
   unsigned blend_alpha : 8;
};

/* For ac_build_fetch_format.
 *
 * Note: FLOAT must be 0 (used for convenience of encoding in radeonsi).
 */
enum ac_fetch_format
{
   AC_FETCH_FORMAT_FLOAT = 0,
   AC_FETCH_FORMAT_FIXED,
   AC_FETCH_FORMAT_UNORM,
   AC_FETCH_FORMAT_SNORM,
   AC_FETCH_FORMAT_USCALED,
   AC_FETCH_FORMAT_SSCALED,
   AC_FETCH_FORMAT_UINT,
   AC_FETCH_FORMAT_SINT,
   AC_FETCH_FORMAT_NONE,
};

enum ac_descriptor_type
{
   AC_DESC_IMAGE,
   AC_DESC_FMASK,
   AC_DESC_SAMPLER,
   AC_DESC_BUFFER,
   AC_DESC_PLANE_0,
   AC_DESC_PLANE_1,
   AC_DESC_PLANE_2,
};

void ac_set_nir_options(struct radeon_info *info, bool use_llvm,
                        nir_shader_compiler_options *options);

bool ac_nir_mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size,
                                   unsigned num_components, nir_intrinsic_instr *low,
                                   nir_intrinsic_instr *high, void *data);

unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask,
                                    bool writes_mrt0_alpha);

unsigned ac_get_cb_shader_mask(unsigned spi_shader_col_format);

uint32_t ac_vgt_gs_mode(unsigned gs_max_vert_out, enum amd_gfx_level gfx_level);

unsigned ac_get_tbuffer_format(enum amd_gfx_level gfx_level, unsigned dfmt, unsigned nfmt);

const struct ac_data_format_info *ac_get_data_format_info(unsigned dfmt);

const struct ac_vtx_format_info *ac_get_vtx_format_info_table(enum amd_gfx_level level,
                                                              enum radeon_family family);

const struct ac_vtx_format_info *ac_get_vtx_format_info(enum amd_gfx_level level,
                                                        enum radeon_family family,
                                                        enum pipe_format fmt);

unsigned ac_get_safe_fetch_size(const enum amd_gfx_level gfx_level, const struct ac_vtx_format_info* vtx_info,
                                const unsigned offset, const unsigned max_channels, const unsigned alignment,
                                const unsigned num_channels);

enum ac_image_dim ac_get_sampler_dim(enum amd_gfx_level gfx_level, enum glsl_sampler_dim dim,
                                     bool is_array);

enum ac_image_dim ac_get_image_dim(enum amd_gfx_level gfx_level, enum glsl_sampler_dim sdim,
                                   bool is_array);

unsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config,
                                  uint8_t *num_fragcoord_components);

uint16_t ac_get_ps_iter_mask(unsigned ps_iter_samples);

void ac_choose_spi_color_formats(unsigned format, unsigned swap, unsigned ntype,
                                 bool is_depth, bool use_rbplus,
                                 struct ac_spi_color_formats *formats);

void ac_compute_late_alloc(const struct radeon_info *info, bool ngg, bool ngg_culling,
                           bool uses_scratch, unsigned *late_alloc_wave64, unsigned *cu_mask);

unsigned ac_compute_cs_workgroup_size(const uint16_t sizes[3], bool variable, unsigned max);

unsigned ac_compute_lshs_workgroup_size(enum amd_gfx_level gfx_level, gl_shader_stage stage,
                                        unsigned tess_num_patches,
                                        unsigned tess_patch_in_vtx,
                                        unsigned tess_patch_out_vtx);

unsigned ac_compute_esgs_workgroup_size(enum amd_gfx_level gfx_level, unsigned wave_size,
                                        unsigned es_verts, unsigned gs_inst_prims);

unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims,
                                       unsigned max_vtx_out, unsigned prim_amp_factor);

uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t num_tcs_input_cp,
                                     uint32_t num_tcs_output_cp, uint32_t vram_per_patch,
                                     uint32_t lds_per_patch, uint32_t wave_size,
                                     bool tess_uses_primid);

uint32_t ac_compute_tess_lds_size(const struct radeon_info *info,
                                  uint32_t lds_per_patch, uint32_t num_patches);

uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift,
                        const struct radeon_info *info);

void ac_get_scratch_tmpring_size(const struct radeon_info *info,
                                 unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
                                 uint32_t *tmpring_size);

unsigned
ac_ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,
                                   unsigned shader_num_outputs,
                                   bool streamout_enabled,
                                   bool export_prim_id,
                                   bool has_user_edgeflags,
                                   bool can_cull,
                                   bool uses_instance_id,
                                   bool uses_primitive_id);

unsigned
ac_ngg_get_scratch_lds_size(gl_shader_stage stage,
                            unsigned workgroup_size,
                            unsigned wave_size,
                            bool streamout_enabled,
                            bool can_cull);

enum gl_access_qualifier ac_get_mem_access_flags(const nir_intrinsic_instr *instr);

union ac_hw_cache_flags ac_get_hw_cache_flags(enum amd_gfx_level gfx_level,
                                              enum gl_access_qualifier access);

unsigned ac_get_all_edge_flag_bits(enum amd_gfx_level gfx_level);

unsigned ac_shader_io_get_unique_index_patch(unsigned semantic);

#ifdef __cplusplus
}
#endif

#endif