/*
 * Copyright © 2015 Rob Clark <[email protected]>
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <[email protected]>
 */

#include "util/ralloc.h"

#include "freedreno_dev_info.h"

#include "ir3_compiler.h"

static const struct debug_named_value shader_debug_options[] = {
   /* clang-format off */
   {"vs", IR3_DBG_SHADER_VS, "Print shader disasm for vertex shaders"},
   {"tcs", IR3_DBG_SHADER_TCS, "Print shader disasm for tess ctrl shaders"},
   {"tes", IR3_DBG_SHADER_TES, "Print shader disasm for tess eval shaders"},
   {"gs", IR3_DBG_SHADER_GS, "Print shader disasm for geometry shaders"},
   {"fs", IR3_DBG_SHADER_FS, "Print shader disasm for fragment shaders"},
   {"cs", IR3_DBG_SHADER_CS, "Print shader disasm for compute shaders"},
   {"internal", IR3_DBG_SHADER_INTERNAL, "Print shader disasm for internal shaders (normally not included in vs/fs/cs/etc)"},
   {"disasm", IR3_DBG_DISASM, "Dump NIR and adreno shader disassembly"},
   {"optmsgs", IR3_DBG_OPTMSGS, "Enable optimizer debug messages"},
   {"forces2en", IR3_DBG_FORCES2EN, "Force s2en mode for tex sampler instructions"},
   {"nouboopt", IR3_DBG_NOUBOOPT, "Disable lowering UBO to uniform"},
   {"nofp16", IR3_DBG_NOFP16, "Don't lower mediump to fp16"},
   {"nocache", IR3_DBG_NOCACHE, "Disable shader cache"},
   {"spillall", IR3_DBG_SPILLALL, "Spill as much as possible to test the spiller"},
   {"nopreamble", IR3_DBG_NOPREAMBLE, "Disable the preamble pass"},
   {"fullsync", IR3_DBG_FULLSYNC, "Add (sy) + (ss) after each cat5/cat6"},
   {"fullnop", IR3_DBG_FULLNOP, "Add nops before each instruction"},
   {"noearlypreamble", IR3_DBG_NOEARLYPREAMBLE, "Disable early preambles"},
   {"nodescprefetch", IR3_DBG_NODESCPREFETCH, "Disable descriptor prefetch optimization"},
   {"expandrpt", IR3_DBG_EXPANDRPT, "Expand rptN instructions"},
#if MESA_DEBUG
   /* MESA_DEBUG-only options: */
   {"schedmsgs", IR3_DBG_SCHEDMSGS, "Enable scheduler debug messages"},
   {"ramsgs", IR3_DBG_RAMSGS, "Enable register-allocation debug messages"},
#endif
   DEBUG_NAMED_VALUE_END
   /* clang-format on */
};

DEBUG_GET_ONCE_FLAGS_OPTION(ir3_shader_debug, "IR3_SHADER_DEBUG",
                            shader_debug_options, 0)
DEBUG_GET_ONCE_OPTION(ir3_shader_override_path, "IR3_SHADER_OVERRIDE_PATH",
                      NULL)
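
/* Example usage (a sketch; flag parsing follows the usual comma-separated
 * Mesa debug-option syntax):
 *
 *    IR3_SHADER_DEBUG=fs,disasm ./app       # dump NIR + disasm for all FS
 *    IR3_SHADER_OVERRIDE_PATH=/tmp/shaders  # load replacement shaders,
 *                                           # implicitly disabling the cache
 */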

enum ir3_shader_debug ir3_shader_debug = 0;
const char *ir3_shader_override_path = NULL;

void
ir3_compiler_destroy(struct ir3_compiler *compiler)
{
   disk_cache_destroy(compiler->disk_cache);
   ralloc_free(compiler);
}

static const nir_shader_compiler_options ir3_base_options = {
   .compact_arrays = true,
   .lower_fpow = true,
   .lower_scmp = true,
   .lower_flrp16 = true,
   .lower_flrp32 = true,
   .lower_flrp64 = true,
   .lower_ffract = true,
   .lower_fmod = true,
   .lower_fdiv = true,
   .lower_isign = true,
   .lower_ldexp = true,
   .lower_uadd_carry = true,
   .lower_usub_borrow = true,
   .lower_mul_high = true,
   .lower_mul_2x32_64 = true,
   .fuse_ffma16 = true,
   .fuse_ffma32 = true,
   .fuse_ffma64 = true,
   .vertex_id_zero_based = false,
   .lower_extract_byte = true,
   .lower_extract_word = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .lower_helper_invocation = true,
   .lower_bitfield_insert = true,
   .lower_bitfield_extract = true,
   .lower_pack_half_2x16 = true,
   .lower_pack_snorm_4x8 = true,
   .lower_pack_snorm_2x16 = true,
   .lower_pack_unorm_4x8 = true,
   .lower_pack_unorm_2x16 = true,
   .lower_unpack_half_2x16 = true,
   .lower_unpack_snorm_4x8 = true,
   .lower_unpack_snorm_2x16 = true,
   .lower_unpack_unorm_4x8 = true,
   .lower_unpack_unorm_2x16 = true,
   .lower_pack_split = true,
   .use_interpolated_input_intrinsics = true,
   .lower_to_scalar = true,
   .has_imul24 = true,
   .has_fsub = true,
   .has_isub = true,
   .force_indirect_unrolling_sampler = true,
   .lower_uniforms_to_ubo = true,
   .max_unroll_iterations = 32,

   .lower_cs_local_index_to_id = true,
   .lower_wpos_pntc = true,

   .lower_int64_options = (nir_lower_int64_options)~0,
   .lower_doubles_options = (nir_lower_doubles_options)~0,

   .divergence_analysis_options = nir_divergence_uniform_load_tears,
   .has_ddx_intrinsics = true,
   .scalarize_ddx = true,
};

struct ir3_compiler *
ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
                    const struct fd_dev_info *dev_info,
                    const struct ir3_compiler_options *options)
{
   struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler);

   ir3_shader_debug = debug_get_option_ir3_shader_debug();
   ir3_shader_override_path =
      __normal_user() ? debug_get_option_ir3_shader_override_path() : NULL;

   if (ir3_shader_override_path) {
      ir3_shader_debug |= IR3_DBG_NOCACHE;
   }

   compiler->dev = dev;
   compiler->dev_id = dev_id;
   compiler->gen = fd_dev_gen(dev_id);
   compiler->is_64bit = fd_dev_64b(dev_id);
   compiler->options = *options;

   /* TODO: see if older GPUs were different here */
   compiler->branchstack_size = 64;
   compiler->wave_granularity = dev_info->wave_granularity;
   compiler->max_waves = dev_info->max_waves;

   compiler->max_variable_workgroup_size = 1024;

   compiler->local_mem_size = dev_info->cs_shared_mem_size;

   compiler->num_predicates = 1;
   compiler->bitops_can_write_predicates = false;
   compiler->has_branch_and_or = false;
   compiler->has_rpt_bary_f = false;

   if (compiler->gen >= 6) {
      compiler->samgq_workaround = true;
      /* a6xx split the pipeline state into geometry and fragment state, in
       * order to let the VS run ahead of the FS. As a result there are now
       * separate const files for the fragment shader and everything else,
       * with separate limits. There seems to be a shared limit as well, but
       * it is higher than the vert or frag limits.
       *
       * Also, based on observations on a630/a650/a660, max_const_pipeline
       * has to be 512 when all geometry stages are present; otherwise the
       * GPU hangs. Accordingly, the maximum safe size for each stage should
       * stay below (max_const_pipeline / 5 stages), taking the 4-vec4
       * alignment of the const files into account.
       *
       * When only the VS and FS stages are present, the limit is 640.
       *
       * TODO: The shared limit seems to differ between models.
       */
      compiler->max_const_pipeline = 512;
      compiler->max_const_frag = 512;
      compiler->max_const_geom = 512;
      compiler->max_const_safe = 100;
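      /* Worked example of the rule above (a sketch of the arithmetic, not an
       * independently verified hardware limit): 512 (max_const_pipeline) / 5
       * stages = 102.4 vec4s per stage, rounded down to the 4-vec4 alignment
       * of the const files => the 100 used for max_const_safe.
       */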

      /* Compute shaders don't share a const file with the FS. Instead they
       * have their own file, which is smaller than the FS one. On a7xx the
       * size was doubled.
       *
       * TODO: is this true on earlier gens?
       */
      compiler->max_const_compute = compiler->gen >= 7 ? 512 : 256;

      /* TODO: implement clip+cull distances on earlier gens */
      compiler->has_clip_cull = true;

      compiler->has_preamble = true;

      compiler->tess_use_shared = dev_info->a6xx.tess_use_shared;

      compiler->has_getfiberid = dev_info->a6xx.has_getfiberid;

      compiler->has_dp2acc = dev_info->a6xx.has_dp2acc;
      compiler->has_dp4acc = dev_info->a6xx.has_dp4acc;
      compiler->has_compliant_dp4acc = dev_info->a7xx.has_compliant_dp4acc;

      if (compiler->gen == 6 && options->shared_push_consts) {
         compiler->shared_consts_base_offset = 504;
         compiler->shared_consts_size = 8;
         compiler->geom_shared_consts_size_quirk = 16;
      } else {
         compiler->shared_consts_base_offset = -1;
         compiler->shared_consts_size = 0;
         compiler->geom_shared_consts_size_quirk = 0;
      }

      compiler->has_fs_tex_prefetch = dev_info->a6xx.has_fs_tex_prefetch;
      compiler->stsc_duplication_quirk = dev_info->a7xx.stsc_duplication_quirk;
      compiler->load_shader_consts_via_preamble =
         dev_info->a7xx.load_shader_consts_via_preamble;
      compiler->load_inline_uniforms_via_preamble_ldgk =
         dev_info->a7xx.load_inline_uniforms_via_preamble_ldgk;
      compiler->num_predicates = 4;
      compiler->bitops_can_write_predicates = true;
      compiler->has_branch_and_or = true;
      compiler->has_predication = true;
      compiler->has_scalar_alu = dev_info->a6xx.has_scalar_alu;
      compiler->has_isam_v = dev_info->a6xx.has_isam_v;
      compiler->has_ssbo_imm_offsets = dev_info->a6xx.has_ssbo_imm_offsets;
      compiler->fs_must_have_non_zero_constlen_quirk =
         dev_info->a7xx.fs_must_have_non_zero_constlen_quirk;
      compiler->has_early_preamble = dev_info->a6xx.has_early_preamble;
      compiler->has_rpt_bary_f = true;
   } else {
      compiler->max_const_pipeline = 512;
      compiler->max_const_geom = 512;
      compiler->max_const_frag = 512;
      compiler->max_const_compute = 512;

      /* Note: this will have to change if/when we support tess+GS on
       * earlier gens.
       */
      compiler->max_const_safe = 256;

      compiler->has_scalar_alu = false;
      compiler->has_isam_v = false;
      compiler->has_ssbo_imm_offsets = false;
      compiler->has_early_preamble = false;
   }

   /* This is just a guess for a4xx. */
   compiler->pvtmem_per_fiber_align = compiler->gen >= 4 ? 512 : 128;
   /* TODO: implement private memory on earlier gens */
   compiler->has_pvtmem = compiler->gen >= 5;

   compiler->has_isam_ssbo = compiler->gen >= 6;

   if (compiler->gen >= 6) {
      compiler->reg_size_vec4 = dev_info->a6xx.reg_size_vec4;
   } else if (compiler->gen >= 4) {
      /* On a4xx-a5xx, using r24.x and above requires using the smallest
       * threadsize.
       */
      compiler->reg_size_vec4 = 48;
   } else {
      /* TODO: confirm this */
      compiler->reg_size_vec4 = 96;
   }

   compiler->threadsize_base = dev_info->threadsize_base;

   if (compiler->gen >= 4) {
      /* need special handling for "flat" */
      compiler->flat_bypass = true;
      compiler->levels_add_one = false;
      compiler->unminify_coords = false;
      compiler->txf_ms_with_isaml = false;
      compiler->array_index_add_half = true;
      compiler->instr_align = 16;
      compiler->const_upload_unit = 4;
   } else {
      /* no special handling for "flat" */
      compiler->flat_bypass = false;
      compiler->levels_add_one = true;
      compiler->unminify_coords = true;
      compiler->txf_ms_with_isaml = true;
      compiler->array_index_add_half = false;
      compiler->instr_align = 4;
      compiler->const_upload_unit = 8;
   }
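   /* A note on units (our reading, stated as an assumption rather than
    * verified on every gen): const_upload_unit is in vec4s, i.e. const state
    * is uploaded in units of 4 vec4s on a4xx+ and 8 vec4s on earlier gens,
    * so shader constlen is expected to be aligned to this value.
    */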

   compiler->bool_type = (compiler->gen >= 5) ? TYPE_U16 : TYPE_U32;
   compiler->has_shared_regfile = compiler->gen >= 5;

   /* The driver can't request this unless preambles are supported. */
   if (options->push_ubo_with_preamble)
      assert(compiler->has_preamble);

   /* Set up nir shader compiler options, using device-specific overrides of
    * our base settings.
    */
   compiler->nir_options = ir3_base_options;

   if (compiler->gen >= 6) {
      compiler->nir_options.vectorize_io = true;
      compiler->nir_options.force_indirect_unrolling = nir_var_all;
      compiler->nir_options.lower_device_index_to_zero = true;

      if (dev_info->a6xx.has_dp2acc || dev_info->a6xx.has_dp4acc) {
         compiler->nir_options.has_udot_4x8 =
            compiler->nir_options.has_udot_4x8_sat = true;
         compiler->nir_options.has_sudot_4x8 =
            compiler->nir_options.has_sudot_4x8_sat = true;
      }

      if (dev_info->a6xx.has_dp4acc && dev_info->a7xx.has_compliant_dp4acc) {
         compiler->nir_options.has_sdot_4x8 =
            compiler->nir_options.has_sdot_4x8_sat = true;
      }
   } else if (compiler->gen >= 3 && compiler->gen <= 5) {
      compiler->nir_options.vertex_id_zero_based = true;
   } else if (compiler->gen <= 2) {
      /* a2xx compiler doesn't handle indirect: */
      compiler->nir_options.force_indirect_unrolling = nir_var_all;
   }

   if (options->lower_base_vertex) {
      compiler->nir_options.lower_base_vertex = true;
   }

   /* 16-bit ALU op generation is mostly controlled by frontend compiler
    * options, but this core NIR option enables some optimizations of 16-bit
    * operations.
    */
   if (compiler->gen >= 5 && !(ir3_shader_debug & IR3_DBG_NOFP16))
      compiler->nir_options.support_16bit_alu = true;

   if (!options->disable_cache)
      ir3_disk_cache_init(compiler);

   return compiler;
}

const nir_shader_compiler_options *
ir3_get_compiler_options(struct ir3_compiler *compiler)
{
   return &compiler->nir_options;
}