1 /* 2 * Copyright 2018-2021 Alyssa Rosenzweig 3 * SPDX-License-Identifier: MIT 4 */ 5 6 #pragma once 7 8 #include "compiler/nir/nir.h" 9 #include "util/u_dynarray.h" 10 #include "shader_enums.h" 11 12 struct agx_cf_binding { 13 /* Base coefficient register */ 14 unsigned cf_base; 15 16 /* Slot being bound */ 17 gl_varying_slot slot; 18 19 /* First component bound. 20 * 21 * Must be 2 (Z) or 3 (W) if slot == VARYING_SLOT_POS. 22 */ 23 unsigned offset : 2; 24 25 /* Number of components bound */ 26 unsigned count : 3; 27 28 /* Is smooth shading enabled? If false, flat shading is used */ 29 bool smooth : 1; 30 31 /* Perspective correct interpolation */ 32 bool perspective : 1; 33 }; 34 35 /* Conservative bound, * 4 due to offsets (TODO: maybe worth eliminating 36 * coefficient register aliasing?) 37 */ 38 #define AGX_MAX_CF_BINDINGS (VARYING_SLOT_MAX * 4) 39 40 struct agx_varyings_fs { 41 /* Number of coefficient registers used */ 42 unsigned nr_cf; 43 44 /* Number of coefficient register bindings */ 45 unsigned nr_bindings; 46 47 /* Whether gl_FragCoord.z is read */ 48 bool reads_z; 49 50 /* Coefficient register bindings */ 51 struct agx_cf_binding bindings[AGX_MAX_CF_BINDINGS]; 52 }; 53 54 union agx_varyings { 55 struct agx_varyings_fs fs; 56 }; 57 58 struct agx_interp_info { 59 /* Bit masks indexed by I/O location of flat and linear varyings */ 60 uint64_t flat; 61 uint64_t linear; 62 }; 63 static_assert(sizeof(struct agx_interp_info) == 16, "packed"); 64 65 struct agx_shader_info { 66 enum pipe_shader_type stage; 67 68 union agx_varyings varyings; 69 70 /* Number of uniforms */ 71 unsigned push_count; 72 73 /* Local memory allocation in bytes */ 74 unsigned local_size; 75 76 /* Local imageblock allocation in bytes per thread */ 77 unsigned imageblock_stride; 78 79 /* Scratch memory allocation in bytes for main/preamble respectively */ 80 unsigned scratch_size, preamble_scratch_size; 81 82 /* Size in bytes of the main sahder */ 83 unsigned main_size; 84 85 /* Does the shader have a preamble? If so, it is at offset preamble_offset. 86 * The main shader is at offset main_offset. The preamble is executed first. 87 */ 88 bool has_preamble; 89 unsigned preamble_offset, main_offset; 90 91 /* Does the shader read the tilebuffer? */ 92 bool reads_tib; 93 94 /* Does the shader require early fragment tests? */ 95 bool early_fragment_tests; 96 97 /* Does the shader potentially draw to a nonzero viewport? */ 98 bool nonzero_viewport; 99 100 /* Does the shader write layer and/or viewport index? Written together */ 101 bool writes_layer_viewport; 102 103 /* Does the shader control the sample mask? */ 104 bool writes_sample_mask; 105 106 /* Depth layout, never equal to NONE */ 107 enum gl_frag_depth_layout depth_layout; 108 109 /* Based only the compiled shader, should tag writes be disabled? This is set 110 * based on what is outputted. Note if rasterizer discard is used, that needs 111 * to disable tag writes regardless of this flag. 112 */ 113 bool tag_write_disable; 114 115 /* Shader is incompatible with triangle merging */ 116 bool disable_tri_merging; 117 118 /* Reads draw ID system value */ 119 bool uses_draw_id; 120 121 /* Reads base vertex/instance */ 122 bool uses_base_param; 123 124 /* Uses txf and hence needs a txf sampler mapped */ 125 bool uses_txf; 126 127 /* Number of 16-bit registers used by the main shader and preamble 128 * respectively. 129 */ 130 unsigned nr_gprs, nr_preamble_gprs; 131 132 /* Output mask set during driver lowering */ 133 uint64_t outputs; 134 135 /* Immediate data that must be uploaded and mapped as uniform registers */ 136 unsigned immediate_base_uniform; 137 unsigned immediate_size_16; 138 uint16_t immediates[512]; 139 }; 140 141 struct agx_shader_part { 142 struct agx_shader_info info; 143 void *binary; 144 size_t binary_size; 145 }; 146 147 #define AGX_MAX_RTS (8) 148 149 enum agx_format { 150 AGX_FORMAT_I8 = 0, 151 AGX_FORMAT_I16 = 1, 152 AGX_FORMAT_I32 = 2, 153 AGX_FORMAT_F16 = 3, 154 AGX_FORMAT_U8NORM = 4, 155 AGX_FORMAT_S8NORM = 5, 156 AGX_FORMAT_U16NORM = 6, 157 AGX_FORMAT_S16NORM = 7, 158 AGX_FORMAT_RGB10A2 = 8, 159 AGX_FORMAT_SRGBA8 = 10, 160 AGX_FORMAT_RG11B10F = 12, 161 AGX_FORMAT_RGB9E5 = 13, 162 163 /* Keep last */ 164 AGX_NUM_FORMATS, 165 }; 166 167 struct agx_fs_shader_key { 168 /* Normally, access to the tilebuffer must be guarded by appropriate fencing 169 * instructions to ensure correct results in the presence of out-of-order 170 * hardware optimizations. However, specially dispatched clear shaders are 171 * not subject to these conditions and can omit the wait instructions. 172 * 173 * Must (only) be set for special clear shaders. 174 * 175 * Must not be used with sample mask writes (including discards) or 176 * tilebuffer loads (including blending). 177 */ 178 bool ignore_tib_dependencies; 179 180 /* When dynamic sample shading is used, the fragment shader is wrapped in a 181 * loop external to the API shader. This bit indicates that we are compiling 182 * inside the sample loop, meaning the execution nesting counter is already 183 * zero and must be preserved. 184 */ 185 bool inside_sample_loop; 186 187 /* Base coefficient register. 0 for API shaders but nonzero for FS prolog */ 188 uint8_t cf_base; 189 }; 190 191 struct agx_device_key { 192 /* Does the target GPU need explicit cluster coherency for atomics? 193 * Only used on G13X. 194 */ 195 bool needs_g13x_coherency; 196 197 /* Is soft fault enabled? This is technically system-wide policy set by the 198 * kernel, but that's functionally a hardware feature. 199 */ 200 bool soft_fault; 201 }; 202 203 struct agx_shader_key { 204 /* Device info */ 205 struct agx_device_key dev; 206 207 /* Number of reserved preamble slots at the start */ 208 unsigned reserved_preamble; 209 210 /* Library routines to link against */ 211 const nir_shader *libagx; 212 213 /* Whether scratch memory is available in the given shader stage */ 214 bool has_scratch; 215 216 /* Whether we're compiling the helper program used for scratch allocation. 217 * This has special register allocation requirements. 218 */ 219 bool is_helper; 220 221 /* Whether the driver supports uploading constants for this shader. If 222 * false, constants will not be promoted to uniforms. 223 */ 224 bool promote_constants; 225 226 /* Set if this is a non-monolithic shader that must be linked with additional 227 * shader parts before the program can be used. This suppresses omission of 228 * `stop` instructions, which the linker must insert instead. 229 */ 230 bool no_stop; 231 232 /* Set if this is a secondary shader part (prolog or epilog). This prevents 233 * the compiler from allocating uniform registers. For example, this turns 234 * off preambles. 235 */ 236 bool secondary; 237 238 union { 239 struct agx_fs_shader_key fs; 240 }; 241 }; 242 243 struct agx_interp_info agx_gather_interp_info(nir_shader *nir); 244 uint64_t agx_gather_texcoords(nir_shader *nir); 245 246 void agx_link_libagx(nir_shader *nir, const nir_shader *libagx); 247 void agx_preprocess_nir(nir_shader *nir, const nir_shader *libagx); 248 bool agx_nir_lower_discard_zs_emit(nir_shader *s); 249 bool agx_nir_lower_sample_mask(nir_shader *s); 250 bool agx_nir_lower_interpolation(nir_shader *s); 251 252 bool agx_nir_lower_cull_distance_vs(struct nir_shader *s); 253 bool agx_nir_lower_cull_distance_fs(struct nir_shader *s, 254 unsigned nr_distances); 255 256 void agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key, 257 struct util_debug_callback *debug, 258 struct agx_shader_part *out); 259 260 struct agx_occupancy { 261 unsigned max_registers; 262 unsigned max_threads; 263 }; 264 265 struct agx_occupancy agx_occupancy_for_register_count(unsigned halfregs); 266 unsigned agx_max_registers_for_occupancy(unsigned occupancy); 267 268 static const nir_shader_compiler_options agx_nir_options = { 269 .lower_fdiv = true, 270 .fuse_ffma16 = true, 271 .fuse_ffma32 = true, 272 .lower_flrp16 = true, 273 .lower_flrp32 = true, 274 .lower_fpow = true, 275 .lower_fmod = true, 276 .lower_bitfield_insert = true, 277 .lower_ifind_msb = true, 278 .lower_find_lsb = true, 279 .lower_uadd_carry = true, 280 .lower_usub_borrow = true, 281 .lower_fisnormal = true, 282 .lower_scmp = true, 283 .lower_isign = true, 284 .lower_fsign = true, 285 .lower_iabs = true, 286 .lower_fminmax_signed_zero = true, 287 .lower_fdph = true, 288 .lower_ffract = true, 289 .lower_ldexp = true, 290 .lower_pack_half_2x16 = true, 291 .lower_pack_unorm_2x16 = true, 292 .lower_pack_snorm_2x16 = true, 293 .lower_pack_unorm_4x8 = true, 294 .lower_pack_snorm_4x8 = true, 295 .lower_pack_64_2x32 = true, 296 .lower_unpack_half_2x16 = true, 297 .lower_unpack_unorm_2x16 = true, 298 .lower_unpack_snorm_2x16 = true, 299 .lower_unpack_unorm_4x8 = true, 300 .lower_unpack_snorm_4x8 = true, 301 .lower_extract_byte = true, 302 .lower_insert_byte = true, 303 .lower_insert_word = true, 304 .has_cs_global_id = true, 305 .lower_device_index_to_zero = true, 306 .lower_hadd = true, 307 .vectorize_io = true, 308 .use_interpolated_input_intrinsics = true, 309 .has_isub = true, 310 .support_16bit_alu = true, 311 .max_unroll_iterations = 32, 312 .lower_uniforms_to_ubo = true, 313 .lower_int64_options = 314 (nir_lower_int64_options) ~(nir_lower_iadd64 | nir_lower_imul_2x32_64), 315 .lower_doubles_options = (nir_lower_doubles_options)(~0), 316 .lower_fquantize2f16 = true, 317 .compact_arrays = true, 318 .discard_is_demote = true, 319 .has_ddx_intrinsics = true, 320 .scalarize_ddx = true, 321 }; 322