/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#ifndef AC_SHADER_UTIL_H
#define AC_SHADER_UTIL_H

#include "ac_binary.h"
#include "amd_family.h"
#include "compiler/nir/nir.h"
#include "compiler/shader_enums.h"
#include "util/format/u_format.h"

#include <stdbool.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

/* s_sendmsg message types and GS operations (op is encoded in bits [4:5]). */
#define AC_SENDMSG_GS 2
#define AC_SENDMSG_GS_DONE 3
#define AC_SENDMSG_GS_ALLOC_REQ 9

#define AC_SENDMSG_GS_OP_NOP (0 << 4)
#define AC_SENDMSG_GS_OP_CUT (1 << 4)
#define AC_SENDMSG_GS_OP_EMIT (2 << 4)
#define AC_SENDMSG_GS_OP_EMIT_CUT (3 << 4)

/* An extension of gl_access_qualifier describing other aspects of memory operations
 * for code generation. These occupy the high bits so they can be OR'd with
 * gl_access_qualifier values without colliding.
 */
enum {
   /* Only one of LOAD/STORE/ATOMIC can be set. */
   ACCESS_TYPE_LOAD = BITFIELD_BIT(27),
   ACCESS_TYPE_STORE = BITFIELD_BIT(28),
   ACCESS_TYPE_ATOMIC = BITFIELD_BIT(29),

   /* This access is expected to use an SMEM instruction if source operands are non-divergent.
    * Only loads can set this.
    */
   ACCESS_TYPE_SMEM = BITFIELD_BIT(30),

   /* Whether a store offset or size alignment is less than 4. */
   ACCESS_MAY_STORE_SUBDWORD = BITFIELD_BIT(31),
};

/* GFX6-11. The meaning of these enums is different between chips. They match LLVM definitions,
 * but they can also be used by ACO. Use ac_get_hw_cache_flags to get these.
 */
enum ac_cache_flags
{
   ac_glc = BITFIELD_BIT(0),
   ac_slc = BITFIELD_BIT(1),
   ac_dlc = BITFIELD_BIT(2),
   ac_swizzled = BITFIELD_BIT(3),
};

/* Cache-agnostic scope flags. */
enum gfx12_scope
{
   /* Memory access is coherent within a workgroup in CU mode.
    * There is no coherency between VMEM and SMEM.
    */
   gfx12_scope_cu,

   /* Memory access is coherent within an SE.
    * If there is no SE cache, this resolves to the device scope in the gfx domain.
    */
   gfx12_scope_se,

   /* Memory access is globally coherent within the device for all gfx blocks except CP and GE
    * depending on the chip (see below). This is referred to as the device scope. It's not coherent
    * with non-gfx blocks like DCN and VCN.
    *
    * If there is a single global GL2 cache:
    * - The device scope in the gfx domain resolves to GL2 scope in hw.
    * - Memory access is cached in GL2.
    * - radeon_info::cp_sdma_ge_use_system_memory_scope says whether CP, SDMA, and GE are
    *   not coherent. If true, some features need special handling. The list of the features
    *   and the suggested programming is:
    *   * tess factor ring for GE: use ACCESS_CP_GE_COHERENT_AMD (it selects the correct scope
    *     automatically)
    *   * query results read by shaders and SET_PREDICATION: use AMDGPU_VM_MTYPE_UC,
    *     but use VRAM for queries not read by the CPU for better performance
    *   * vertex indices for GE: flush GL2 after buffer stores, but don't invalidate
    *   * draw indirect for CP: flush GL2 after buffer stores, but don't invalidate
    *   * shader uploads via SDMA: invalidate GL2 at the beginning of IBs
    *   * PRIME buffer read by SDMA: the kernel flushes GL2 at the end of IBs
    *   * CP DMA clears/copies: use compute shaders or range-flush/invalidate GL2 around it
    *   * CP DMA prefetch: no change
    *   * COPY_DATA - FILLED_SIZE state for streamout, range-flush/invalidate GL2
    *   * WRITE_DATA - bindless descriptors: range-invalidate GL2
    *
    * If there is a separate GL2 cache per SE:
    * - The device scope resolves to memory scope in hw.
    * - Memory access is cached in MALL if MALL (infinity cache) is present.
    * - radeon_info::cp_sdma_ge_use_system_memory_scope is always false in this case.
    */
   gfx12_scope_device,

   /* Memory scope. It's cached if MALL is present. This is called "system scope" in the ISA
    * documentation.
    */
   gfx12_scope_memory,
};

/* Temporal hint (TH) values for load instructions. */
enum gfx12_load_temporal_hint
{
   /* VMEM and SMEM */
   gfx12_load_regular_temporal,
   gfx12_load_non_temporal,
   gfx12_load_high_temporal,
   /* VMEM$ treats SCOPE=3 and TH=3 as MALL bypass on GFX12. Don't use this combination in shaders. */
   gfx12_load_last_use_discard,
   /* VMEM only, far means the last level cache, near means other caches. */
   gfx12_load_near_non_temporal_far_regular_temporal,
   gfx12_load_near_regular_temporal_far_non_temporal,
   gfx12_load_near_non_temporal_far_high_temporal,
   gfx12_load_reserved,
};

/* Temporal hint (TH) values for store instructions. */
enum gfx12_store_temporal_hint
{
   gfx12_store_regular_temporal,
   gfx12_store_non_temporal,
   gfx12_store_high_temporal,
   gfx12_store_high_temporal_stay_dirty,
   gfx12_store_near_non_temporal_far_regular_temporal,
   gfx12_store_near_regular_temporal_far_non_temporal,
   gfx12_store_near_non_temporal_far_high_temporal,
   gfx12_store_near_non_temporal_far_writeback,
};

/* Temporal hint (TH) bits for atomic instructions. */
enum gfx12_atomic_temporal_hint
{
   gfx12_atomic_return = BITFIELD_BIT(0),
   gfx12_atomic_non_temporal = BITFIELD_BIT(1),
   gfx12_atomic_accum_deferred_scope = BITFIELD_BIT(2), /* requires no return */
};

enum gfx12_speculative_data_read
{
   gfx12_spec_read_auto,
   gfx12_spec_read_force_on,
   gfx12_spec_read_force_off,
};

/* Hardware cache-policy bits for memory instructions; the interpretation of
 * "value" depends on the gfx level (ac_cache_flags on GFX6-11, the gfx12
 * bitfield layout on GFX12+).
 */
union ac_hw_cache_flags
{
   struct {
      /* This matches LLVM, but it can also be used by ACO for translation of ac_memop_flags.
       */
      uint8_t temporal_hint:3; /* gfx12_{load,store,atomic}_temporal_hint */
      uint8_t scope:2;         /* gfx12_scope */
      uint8_t _reserved:1;
      uint8_t swizzled:1;      /* for swizzled buffer access (attribute ring) */
      uint8_t _pad:1;
   } gfx12;

   uint8_t value; /* ac_cache_flags (GFX6-11) or the gfx12 structure */
};

enum ac_image_dim
{
   ac_image_1d,
   ac_image_2d,
   ac_image_3d,
   ac_image_cube, // includes cube arrays
   ac_image_1darray,
   ac_image_2darray,
   ac_image_2dmsaa,
   ac_image_2darraymsaa,
};

struct ac_data_format_info {
   uint8_t element_size;
   uint8_t num_channels;
   uint8_t chan_byte_size;
   uint8_t chan_format;
};

enum ac_vs_input_alpha_adjust {
   AC_ALPHA_ADJUST_NONE = 0,
   AC_ALPHA_ADJUST_SNORM = 1,
   AC_ALPHA_ADJUST_SSCALED = 2,
   AC_ALPHA_ADJUST_SINT = 3,
};

struct ac_vtx_format_info {
   uint16_t dst_sel;
   uint8_t element_size;
   uint8_t num_channels;
   uint8_t chan_byte_size; /* 0 for packed formats */

   /* These last three are dependent on the family. */

   uint8_t has_hw_format;
   /* Index is number of channels minus one. Use any index for packed formats.
    * GFX6-8 is dfmt[0:3],nfmt[4:7].
    */
   uint8_t hw_format[4];
   enum ac_vs_input_alpha_adjust alpha_adjust : 8;
};

struct ac_spi_color_formats {
   unsigned normal : 8;
   unsigned alpha : 8;
   unsigned blend : 8;
   unsigned blend_alpha : 8;
};

/* For ac_build_fetch_format.
 *
 * Note: FLOAT must be 0 (used for convenience of encoding in radeonsi).
 */
enum ac_fetch_format
{
   AC_FETCH_FORMAT_FLOAT = 0,
   AC_FETCH_FORMAT_FIXED,
   AC_FETCH_FORMAT_UNORM,
   AC_FETCH_FORMAT_SNORM,
   AC_FETCH_FORMAT_USCALED,
   AC_FETCH_FORMAT_SSCALED,
   AC_FETCH_FORMAT_UINT,
   AC_FETCH_FORMAT_SINT,
   AC_FETCH_FORMAT_NONE,
};

enum ac_descriptor_type
{
   AC_DESC_IMAGE,
   AC_DESC_FMASK,
   AC_DESC_SAMPLER,
   AC_DESC_BUFFER,
   AC_DESC_PLANE_0,
   AC_DESC_PLANE_1,
   AC_DESC_PLANE_2,
};

/* NIR compiler-option setup and memory-access vectorization callback. */
void ac_set_nir_options(struct radeon_info *info, bool use_llvm,
                        nir_shader_compiler_options *options);

bool ac_nir_mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size,
                                   unsigned num_components, nir_intrinsic_instr *low,
                                   nir_intrinsic_instr *high, void *data);

/* SPI/CB export format helpers. */
unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask,
                                    bool writes_mrt0_alpha);

unsigned ac_get_cb_shader_mask(unsigned spi_shader_col_format);

uint32_t ac_vgt_gs_mode(unsigned gs_max_vert_out, enum amd_gfx_level gfx_level);

/* Vertex/buffer format queries. */
unsigned ac_get_tbuffer_format(enum amd_gfx_level gfx_level, unsigned dfmt, unsigned nfmt);

const struct ac_data_format_info *ac_get_data_format_info(unsigned dfmt);

const struct ac_vtx_format_info *ac_get_vtx_format_info_table(enum amd_gfx_level level,
                                                              enum radeon_family family);

const struct ac_vtx_format_info *ac_get_vtx_format_info(enum amd_gfx_level level,
                                                        enum radeon_family family,
                                                        enum pipe_format fmt);

unsigned ac_get_safe_fetch_size(const enum amd_gfx_level gfx_level, const struct ac_vtx_format_info* vtx_info,
                                const unsigned offset, const unsigned max_channels, const unsigned alignment,
                                const unsigned num_channels);

/* Image/sampler dimension mapping to hw image dims. */
enum ac_image_dim ac_get_sampler_dim(enum amd_gfx_level gfx_level, enum glsl_sampler_dim dim,
                                     bool is_array);

enum ac_image_dim ac_get_image_dim(enum amd_gfx_level gfx_level, enum glsl_sampler_dim sdim,
                                   bool is_array);

unsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config,
                                  uint8_t *num_fragcoord_components);

uint16_t ac_get_ps_iter_mask(unsigned ps_iter_samples);

void ac_choose_spi_color_formats(unsigned format, unsigned swap, unsigned ntype,
                                 bool is_depth, bool use_rbplus,
                                 struct ac_spi_color_formats *formats);

void ac_compute_late_alloc(const struct radeon_info *info, bool ngg, bool ngg_culling,
                           bool uses_scratch, unsigned *late_alloc_wave64, unsigned *cu_mask);

/* Workgroup-size helpers for compute, LS/HS, ES/GS, and NGG. */
unsigned ac_compute_cs_workgroup_size(const uint16_t sizes[3], bool variable, unsigned max);

unsigned ac_compute_lshs_workgroup_size(enum amd_gfx_level gfx_level, gl_shader_stage stage,
                                        unsigned tess_num_patches,
                                        unsigned tess_patch_in_vtx,
                                        unsigned tess_patch_out_vtx);

unsigned ac_compute_esgs_workgroup_size(enum amd_gfx_level gfx_level, unsigned wave_size,
                                        unsigned es_verts, unsigned gs_inst_prims);

unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims,
                                       unsigned max_vtx_out, unsigned prim_amp_factor);

/* Tessellation sizing helpers. */
uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t num_tcs_input_cp,
                                     uint32_t num_tcs_output_cp, uint32_t vram_per_patch,
                                     uint32_t lds_per_patch, uint32_t wave_size,
                                     bool tess_uses_primid);

uint32_t ac_compute_tess_lds_size(const struct radeon_info *info,
                                  uint32_t lds_per_patch, uint32_t num_patches);

uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift,
                        const struct radeon_info *info);

void ac_get_scratch_tmpring_size(const struct radeon_info *info,
                                 unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
                                 uint32_t *tmpring_size);

/* NGG LDS sizing helpers. */
unsigned
ac_ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,
                                   unsigned shader_num_outputs,
                                   bool streamout_enabled,
                                   bool export_prim_id,
                                   bool has_user_edgeflags,
                                   bool can_cull,
                                   bool uses_instance_id,
                                   bool uses_primitive_id);

unsigned
ac_ngg_get_scratch_lds_size(gl_shader_stage stage,
                            unsigned workgroup_size,
                            unsigned wave_size,
                            bool streamout_enabled,
                            bool can_cull);

/* Memory-access flag derivation; see ACCESS_TYPE_* and ac_hw_cache_flags above. */
enum gl_access_qualifier ac_get_mem_access_flags(const nir_intrinsic_instr *instr);

union ac_hw_cache_flags ac_get_hw_cache_flags(enum amd_gfx_level gfx_level,
                                              enum gl_access_qualifier access);

unsigned ac_get_all_edge_flag_bits(enum amd_gfx_level gfx_level);

unsigned ac_shader_io_get_unique_index_patch(unsigned semantic);

#ifdef __cplusplus
}
#endif

#endif