/*
 * Copyright 2016 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "ac_nir.h"
#include "ac_nir_to_llvm.h"
#include "ac_rtld.h"
#include "si_pipe.h"
#include "si_shader_internal.h"
#include "si_shader_llvm.h"
#include "sid.h"
#include "util/u_memory.h"
#include "util/u_prim.h"

struct si_llvm_diagnostics {
   struct util_debug_callback *debug;
   unsigned retval;
};

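/* Diagnostic handler registered with the LLVM context: forwards LLVM warnings
 * and errors to the debug callback, and flags errors in diag->retval.
 */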
static void si_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context)
{
   struct si_llvm_diagnostics *diag = (struct si_llvm_diagnostics *)context;
   LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
   const char *severity_str = NULL;

   switch (severity) {
   case LLVMDSError:
      severity_str = "error";
      break;
   case LLVMDSWarning:
      severity_str = "warning";
      break;
   case LLVMDSRemark:
   case LLVMDSNote:
   default:
      return;
   }

   char *description = LLVMGetDiagInfoDescription(di);

   util_debug_message(diag->debug, SHADER_INFO, "LLVM diagnostic (%s): %s", severity_str,
                      description);

   if (severity == LLVMDSError) {
      diag->retval = 1;
      fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n", description);
   }

   LLVMDisposeMessage(description);
}

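/* Run the compiler passes on ac->module to produce an ELF binary, then use
 * ac_rtld to read the shader config back out of it.
 */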
bool si_compile_llvm(struct si_screen *sscreen, struct si_shader_binary *binary,
                     struct ac_shader_config *conf, struct ac_llvm_compiler *compiler,
                     struct ac_llvm_context *ac, struct util_debug_callback *debug,
                     gl_shader_stage stage, const char *name, bool less_optimized)
{
   unsigned count = p_atomic_inc_return(&sscreen->num_compilations);

   if (si_can_dump_shader(sscreen, stage, SI_DUMP_LLVM_IR)) {
      fprintf(stderr, "radeonsi: Compiling shader %d\n", count);

      fprintf(stderr, "%s LLVM IR:\n\n", name);
      ac_dump_module(ac->module);
      fprintf(stderr, "\n");
   }

   if (sscreen->record_llvm_ir) {
      char *ir = LLVMPrintModuleToString(ac->module);
      binary->llvm_ir_string = strdup(ir);
      LLVMDisposeMessage(ir);
   }

   if (!si_replace_shader(count, binary)) {
      struct ac_compiler_passes *passes = compiler->passes;

      if (less_optimized && compiler->low_opt_passes)
         passes = compiler->low_opt_passes;

      struct si_llvm_diagnostics diag = {debug};
      LLVMContextSetDiagnosticHandler(ac->context, si_diagnostic_handler, &diag);

      if (!ac_compile_module_to_elf(passes, ac->module, (char **)&binary->code_buffer,
                                    &binary->code_size))
         diag.retval = 1;

      if (diag.retval != 0) {
         util_debug_message(debug, SHADER_INFO, "LLVM compilation failed");
         return false;
      }

      binary->type = SI_SHADER_BINARY_ELF;
   }

   struct ac_rtld_binary rtld;
   if (!ac_rtld_open(&rtld, (struct ac_rtld_open_info){
                               .info = &sscreen->info,
                               .shader_type = stage,
                               .wave_size = ac->wave_size,
                               .num_parts = 1,
                               .elf_ptrs = &binary->code_buffer,
                               .elf_sizes = &binary->code_size}))
      return false;

   bool ok = ac_rtld_read_config(&sscreen->info, &rtld, conf);
   ac_rtld_close(&rtld);
   return ok;
}

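/* Zero-initialize the shader context and set up the underlying ac_llvm_context
 * for the given wave size and export requirements.
 */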
void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscreen,
                          struct ac_llvm_compiler *compiler, unsigned wave_size,
                          bool exports_color_null, bool exports_mrtz,
                          enum ac_float_mode float_mode)
{
   memset(ctx, 0, sizeof(*ctx));
   ctx->screen = sscreen;
   ctx->compiler = compiler;

   ac_llvm_context_init(&ctx->ac, compiler, &sscreen->info, float_mode,
                        wave_size, 64, exports_color_null, exports_mrtz);
}

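/* Create the main LLVM function with an optional struct return type and the
 * calling convention matching the hardware stage the shader will run as
 * (merged LS/ES stages use the HS/GS conventions on GFX9+).
 */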
void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTypeRef *return_types,
                         unsigned num_return_elems, unsigned max_workgroup_size)
{
   LLVMTypeRef ret_type;
   enum ac_llvm_calling_convention call_conv;

   if (num_return_elems)
      ret_type = LLVMStructTypeInContext(ctx->ac.context, return_types, num_return_elems, true);
   else
      ret_type = ctx->ac.voidt;

   gl_shader_stage real_stage = ctx->stage;

   /* LS is merged into HS (TCS), and ES is merged into GS. */
   if (ctx->screen->info.gfx_level >= GFX9 && ctx->stage <= MESA_SHADER_GEOMETRY) {
      if (ctx->shader->key.ge.as_ls)
         real_stage = MESA_SHADER_TESS_CTRL;
      else if (ctx->shader->key.ge.as_es || ctx->shader->key.ge.as_ngg)
         real_stage = MESA_SHADER_GEOMETRY;
   }

   switch (real_stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_EVAL:
      call_conv = AC_LLVM_AMDGPU_VS;
      break;
   case MESA_SHADER_TESS_CTRL:
      call_conv = AC_LLVM_AMDGPU_HS;
      break;
   case MESA_SHADER_GEOMETRY:
      call_conv = AC_LLVM_AMDGPU_GS;
      break;
   case MESA_SHADER_FRAGMENT:
      call_conv = AC_LLVM_AMDGPU_PS;
      break;
   case MESA_SHADER_COMPUTE:
      call_conv = AC_LLVM_AMDGPU_CS;
      break;
   default:
      unreachable("Unhandled shader type");
   }

   /* Set up the function. */
   ctx->return_type = ret_type;
   ctx->main_fn = ac_build_main(&ctx->args->ac, &ctx->ac, call_conv, name, ret_type, ctx->ac.module);
   ctx->return_value = LLVMGetUndef(ctx->return_type);

   if (ctx->screen->info.address32_hi) {
      ac_llvm_add_target_dep_function_attr(ctx->main_fn.value, "amdgpu-32bit-address-high-bits",
                                           ctx->screen->info.address32_hi);
   }

   if (ctx->screen->info.gfx_level < GFX12 && ctx->stage <= MESA_SHADER_GEOMETRY &&
       ctx->shader->key.ge.as_ngg && si_shader_uses_streamout(ctx->shader))
      ac_llvm_add_target_dep_function_attr(ctx->main_fn.value, "amdgpu-gds-size", 256);

   ac_llvm_set_workgroup_size(ctx->main_fn.value, max_workgroup_size);
   ac_llvm_set_target_features(ctx->main_fn.value, &ctx->ac, false);
}

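/* Create the "main" function: returned SGPRs are i32, returned VGPRs are f32.
 * Also declares the LS/HS LDS symbol and initializes the vertex-shader ABI
 * inputs where applicable.
 */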
void si_llvm_create_main_func(struct si_shader_context *ctx)
{
   struct si_shader *shader = ctx->shader;
   LLVMTypeRef returns[AC_MAX_ARGS];
   unsigned i;

   for (i = 0; i < ctx->args->ac.num_sgprs_returned; i++)
      returns[i] = ctx->ac.i32; /* SGPR */
   for (; i < ctx->args->ac.return_count; i++)
      returns[i] = ctx->ac.f32; /* VGPR */

   si_llvm_create_func(ctx, "main", returns, ctx->args->ac.return_count,
                       si_get_max_workgroup_size(shader));

   /* Reserve register locations for VGPR inputs the PS prolog may need. */
   if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
      ac_llvm_add_target_dep_function_attr(
         ctx->main_fn.value, "InitialPSInputAddr", SI_SPI_PS_INPUT_ADDR_FOR_PROLOG);
   }

   if (ctx->stage <= MESA_SHADER_GEOMETRY &&
       (shader->key.ge.as_ls || ctx->stage == MESA_SHADER_TESS_CTRL)) {
      /* The LSHS size is not known until draw time, so we append it
       * at the end of whatever LDS use there may be in the rest of
       * the shader (currently none, unless LLVM decides to do its
       * own LDS-based lowering).
       */
      ctx->ac.lds = (struct ac_llvm_pointer) {
         .value = LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
                                              "__lds_end", AC_ADDR_SPACE_LDS),
         .pointee_type = LLVMArrayType(ctx->ac.i32, 0)
      };
      LLVMSetAlignment(ctx->ac.lds.value, 256);
   }

   if (ctx->stage == MESA_SHADER_VERTEX) {
      ctx->abi.vertex_id = ac_get_arg(&ctx->ac, ctx->args->ac.vertex_id);
      ctx->abi.instance_id = ac_get_arg(&ctx->ac, ctx->args->ac.instance_id);
      if (ctx->args->ac.vs_rel_patch_id.used)
         ctx->abi.vs_rel_patch_id = ac_get_arg(&ctx->ac, ctx->args->ac.vs_rel_patch_id);

      /* Apply the LS-HS input VGPR hw bug workaround. */
      if (shader->key.ge.as_ls && ctx->screen->info.has_ls_vgpr_init_bug)
         ac_fixup_ls_hs_input_vgprs(&ctx->ac, &ctx->abi, &ctx->args->ac);
   }
}

void si_llvm_optimize_module(struct si_shader_context *ctx)
{
   /* Dump LLVM IR before any optimization passes */
   if (si_can_dump_shader(ctx->screen, ctx->stage, SI_DUMP_INIT_LLVM_IR))
      ac_dump_module(ctx->ac.module);

   /* Run the pass */
   LLVMRunPassManager(ctx->compiler->passmgr, ctx->ac.module);
}

void si_llvm_dispose(struct si_shader_context *ctx)
{
   LLVMDisposeModule(ctx->ac.module);
   LLVMContextDispose(ctx->ac.context);
   ac_llvm_context_dispose(&ctx->ac);
}

/**
 * Load a dword from a constant buffer.
 */
LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, LLVMValueRef resource,
                                  LLVMValueRef offset)
{
   return ac_build_buffer_load(&ctx->ac, resource, 1, NULL, offset, NULL, ctx->ac.f32,
                               0, true, true);
}

void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret)
{
   if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind)
      LLVMBuildRetVoid(ctx->ac.builder);
   else
      LLVMBuildRet(ctx->ac.builder, ret);
}

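/* Helpers for building the return value of a shader part: insert an argument
 * into the return struct as-is, as a float, or as a pointer cast to i32.
 */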
LLVMValueRef si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
                                 struct ac_arg param, unsigned return_index)
{
   return LLVMBuildInsertValue(ctx->ac.builder, ret, ac_get_arg(&ctx->ac, param), return_index, "");
}

LLVMValueRef si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
                                       struct ac_arg param, unsigned return_index)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef p = ac_get_arg(&ctx->ac, param);

   return LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, p), return_index, "");
}

LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret,
                                 struct ac_arg param, unsigned return_index)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef ptr = ac_get_arg(&ctx->ac, param);
   ptr = LLVMBuildPtrToInt(builder, ptr, ctx->ac.i32, "");
   return LLVMBuildInsertValue(builder, ret, ptr, return_index, "");
}

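/* Used by shader prologs: load the v4i32 descriptor at 'slot' from the
 * internal-bindings descriptor list into SGPRs.
 */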
LLVMValueRef si_prolog_get_internal_binding_slot(struct si_shader_context *ctx, unsigned slot)
{
   LLVMValueRef list = LLVMBuildIntToPtr(
      ctx->ac.builder, ac_get_arg(&ctx->ac, ctx->args->internal_bindings),
      ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
   LLVMValueRef index = LLVMConstInt(ctx->ac.i32, slot, 0);

   return ac_build_load_to_sgpr(&ctx->ac,
                                (struct ac_llvm_pointer) { .t = ctx->ac.v4i32, .v = list },
                                index);
}

/* Ensure that the ESGS ring is declared.
 *
 * We declare it with 64KB alignment as a hint that the
 * pointer value will always be 0.
 */
static void si_llvm_declare_lds_esgs_ring(struct si_shader_context *ctx)
{
   if (ctx->ac.lds.value)
      return;

   assert(!LLVMGetNamedGlobal(ctx->ac.module, "esgs_ring"));

   LLVMValueRef esgs_ring =
      LLVMAddGlobalInAddressSpace(ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0),
                                  "esgs_ring", AC_ADDR_SPACE_LDS);
   LLVMSetLinkage(esgs_ring, LLVMExternalLinkage);
   LLVMSetAlignment(esgs_ring, 64 * 1024);

   ctx->ac.lds.value = esgs_ring;
   ctx->ac.lds.pointee_type = ctx->ac.i32;
}

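/* Set the EXEC mask from a thread count carried at 'bitoffset' in an SGPR
 * argument, via the llvm.amdgcn.init.exec.from.input intrinsic.
 */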
static void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param,
                                    unsigned bitoffset)
{
   LLVMValueRef args[] = {
      ac_get_arg(&ctx->ac, param),
      LLVMConstInt(ctx->ac.i32, bitoffset, 0),
   };
   ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.init.exec.from.input", ctx->ac.voidt, args, 2, 0);
}

/**
 * Get the value of a shader input parameter and extract a bitfield.
 */
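/* E.g. rshift=8, bitwidth=5 yields (value >> 8) & 0x1f. */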
static LLVMValueRef unpack_llvm_param(struct si_shader_context *ctx, LLVMValueRef value,
                                      unsigned rshift, unsigned bitwidth)
{
   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMFloatTypeKind)
      value = ac_to_integer(&ctx->ac, value);

   if (rshift)
      value = LLVMBuildLShr(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, rshift, 0), "");

   if (rshift + bitwidth < 32) {
      unsigned mask = (1 << bitwidth) - 1;
      value = LLVMBuildAnd(ctx->ac.builder, value, LLVMConstInt(ctx->ac.i32, mask, 0), "");
   }

   return value;
}

LLVMValueRef si_unpack_param(struct si_shader_context *ctx, struct ac_arg param, unsigned rshift,
                             unsigned bitwidth)
{
   LLVMValueRef value = ac_get_arg(&ctx->ac, param);

   return unpack_llvm_param(ctx, value, rshift, bitwidth);
}

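/* Declare the compute shader's shared memory (LDS) block and store it in
 * ctx->ac.lds as an i8 array of the declared shared_size.
 */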
static void si_llvm_declare_compute_memory(struct si_shader_context *ctx)
{
   struct si_shader_selector *sel = ctx->shader->selector;
   unsigned lds_size = sel->info.base.shared_size;

   LLVMTypeRef i8p = LLVMPointerType(ctx->ac.i8, AC_ADDR_SPACE_LDS);
   LLVMValueRef var;

   assert(!ctx->ac.lds.value);

   LLVMTypeRef type = LLVMArrayType(ctx->ac.i8, lds_size);
   var = LLVMAddGlobalInAddressSpace(ctx->ac.module, type,
                                     "compute_lds", AC_ADDR_SPACE_LDS);
   LLVMSetAlignment(var, 64 * 1024);

   ctx->ac.lds = (struct ac_llvm_pointer) {
      .value = LLVMBuildBitCast(ctx->ac.builder, var, i8p, ""),
      .pointee_type = type,
   };
}

/**
 * Given two parts (LS/HS or ES/GS) of a merged shader, build a wrapper function that
 * runs them in sequence to form a monolithic shader.
 */
static void si_build_wrapper_function(struct si_shader_context *ctx,
                                      struct ac_llvm_pointer parts[2],
                                      bool same_thread_count)
{
   LLVMBuilderRef builder = ctx->ac.builder;

   for (unsigned i = 0; i < 2; ++i) {
      ac_add_function_attr(ctx->ac.context, parts[i].value, -1, "alwaysinline");
      LLVMSetLinkage(parts[i].value, LLVMPrivateLinkage);
   }

   si_llvm_create_func(ctx, "wrapper", NULL, 0, si_get_max_workgroup_size(ctx->shader));

   if (same_thread_count) {
      si_init_exec_from_input(ctx, ctx->args->ac.merged_wave_info, 0);
   } else {
      ac_init_exec_full_mask(&ctx->ac);

      LLVMValueRef count = ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info);
      count = LLVMBuildAnd(builder, count, LLVMConstInt(ctx->ac.i32, 0x7f, 0), "");

      LLVMValueRef ena = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, "");
      ac_build_ifcc(&ctx->ac, ena, 6506);
   }

   LLVMValueRef params[AC_MAX_ARGS];
   unsigned num_params = LLVMCountParams(ctx->main_fn.value);
   LLVMGetParams(ctx->main_fn.value, params);

   /* The wrapper function has the same parameters as the first-part shader. */
   LLVMValueRef ret =
      ac_build_call(&ctx->ac, parts[0].pointee_type, parts[0].value, params, num_params);

   if (same_thread_count) {
      LLVMTypeRef type = LLVMTypeOf(ret);
      assert(LLVMGetTypeKind(type) == LLVMStructTypeKind);

      /* The output of the first-part shader is the input of the second part. */
      num_params = LLVMCountStructElementTypes(type);
      assert(num_params == LLVMCountParams(parts[1].value));

      for (unsigned i = 0; i < num_params; i++) {
         params[i] = LLVMBuildExtractValue(builder, ret, i, "");

         /* Convert the return value to the same type as the next shader's input param. */
         LLVMTypeRef ret_type = LLVMTypeOf(params[i]);
         LLVMTypeRef param_type = LLVMTypeOf(LLVMGetParam(parts[1].value, i));
         assert(ac_get_type_size(ret_type) == 4);
         assert(ac_get_type_size(param_type) == 4);

         if (ret_type != param_type) {
            if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
               assert(LLVMGetPointerAddressSpace(param_type) == AC_ADDR_SPACE_CONST_32BIT);
               assert(ret_type == ctx->ac.i32);

               params[i] = LLVMBuildIntToPtr(builder, params[i], param_type, "");
            } else {
               params[i] = LLVMBuildBitCast(builder, params[i], param_type, "");
            }
         }
      }
   } else {
      ac_build_endif(&ctx->ac, 6506);

      if (ctx->stage == MESA_SHADER_TESS_CTRL) {
         LLVMValueRef count = ac_get_arg(&ctx->ac, ctx->args->ac.merged_wave_info);
         count = LLVMBuildLShr(builder, count, LLVMConstInt(ctx->ac.i32, 8, 0), "");
         count = LLVMBuildAnd(builder, count, LLVMConstInt(ctx->ac.i32, 0x7f, 0), "");

         LLVMValueRef ena = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, "");
         ac_build_ifcc(&ctx->ac, ena, 6507);
      }

      /* The second half of the merged shader should use
       * the inputs from the toplevel (wrapper) function,
       * not the return value from the last call.
       *
       * That's because the last call was executed
       * conditionally, so we can't consume it in the main
       * block.
       */

      /* The second part's params are the same as the preceding params of the first part. */
      num_params = LLVMCountParams(parts[1].value);
   }

   ac_build_call(&ctx->ac, parts[1].pointee_type, parts[1].value, params, num_params);

   /* Close the conditional wrapping the second shader. */
   if (ctx->stage == MESA_SHADER_TESS_CTRL && !same_thread_count)
      ac_build_endif(&ctx->ac, 6507);

   LLVMBuildRetVoid(builder);
}

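/* ABI callback: resolve radeonsi-specific NIR intrinsics that load the
 * addresses of NGG LDS symbols (scratch base and GS output vertex base).
 */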
static LLVMValueRef si_llvm_load_intrinsic(struct ac_shader_abi *abi, nir_intrinsic_instr *intrin)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);

   switch (intrin->intrinsic) {
   case nir_intrinsic_load_lds_ngg_scratch_base_amd:
      return LLVMBuildPtrToInt(ctx->ac.builder, ctx->gs_ngg_scratch.value, ctx->ac.i32, "");

   case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd:
      return LLVMBuildPtrToInt(ctx->ac.builder, ctx->gs_ngg_emit, ctx->ac.i32, "");

   default:
      return NULL;
   }
}

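/* ABI callback: turn an i32 index into the combined samplers-and-images
 * descriptor list into the requested descriptor, loaded into SGPRs. The
 * bracketed ranges in the cases below are dword offsets within one 16-dword
 * list slot; the index arithmetic below is in v4i32 or v8i32 element units.
 */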
static LLVMValueRef si_llvm_load_sampler_desc(struct ac_shader_abi *abi, LLVMValueRef index,
                                              enum ac_descriptor_type desc_type)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   LLVMBuilderRef builder = ctx->ac.builder;

   if (index && LLVMTypeOf(index) == ctx->ac.i32) {
      bool is_vec4 = false;

      switch (desc_type) {
      case AC_DESC_IMAGE:
         /* The image is at [0:7]. */
         index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, 2, 0), "");
         break;
      case AC_DESC_BUFFER:
         /* The buffer is in [4:7]. */
         index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), ctx->ac.i32_1);
         is_vec4 = true;
         break;
      case AC_DESC_FMASK:
         /* The FMASK is at [8:15]. */
         assert(ctx->screen->info.gfx_level < GFX11);
         index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1);
         break;
      case AC_DESC_SAMPLER:
         /* The sampler state is at [12:15]. */
         index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0),
                               LLVMConstInt(ctx->ac.i32, 3, 0));
         is_vec4 = true;
         break;
      default:
         unreachable("invalid desc");
      }

      struct ac_llvm_pointer list = {
         .value = ac_get_arg(&ctx->ac, ctx->args->samplers_and_images),
         .pointee_type = is_vec4 ? ctx->ac.v4i32 : ctx->ac.v8i32,
      };

      return ac_build_load_to_sgpr(&ctx->ac, list, index);
   }

   return index;
}

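/* Translate one NIR shader into LLVM IR inside ctx: create the main function
 * and ABI callbacks, declare stage-specific LDS symbols, wrap merged-shader
 * parts in thread-enable conditionals and barriers, run ac_nir_translate, and
 * emit the stage-specific ending and the return.
 */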
static bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shader,
                                  struct nir_shader *nir, bool free_nir)
{
   struct si_shader_selector *sel = shader->selector;
   const struct si_shader_info *info = &sel->info;

   ctx->shader = shader;
   ctx->stage = shader->is_gs_copy_shader ? MESA_SHADER_VERTEX : sel->stage;

   ctx->num_const_buffers = info->base.num_ubos;
   ctx->num_shader_buffers = info->base.num_ssbos;

   ctx->num_samplers = BITSET_LAST_BIT(info->base.textures_used);
   ctx->num_images = info->base.num_images;

   ctx->abi.intrinsic_load = si_llvm_load_intrinsic;
   ctx->abi.load_sampler_desc = si_llvm_load_sampler_desc;

   si_llvm_create_main_func(ctx);

   switch (ctx->stage) {
   case MESA_SHADER_VERTEX:
      break;

   case MESA_SHADER_TESS_CTRL:
      si_llvm_init_tcs_callbacks(ctx);
      break;

   case MESA_SHADER_GEOMETRY:
      if (ctx->shader->key.ge.as_ngg) {
         LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader));
         ctx->gs_ngg_scratch = (struct ac_llvm_pointer) {
            .value = LLVMAddGlobalInAddressSpace(ctx->ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS),
            .pointee_type = ai32
         };
         LLVMSetInitializer(ctx->gs_ngg_scratch.value, LLVMGetUndef(ai32));
         LLVMSetAlignment(ctx->gs_ngg_scratch.value, 8);

         ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace(
            ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS);
         LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage);
         LLVMSetAlignment(ctx->gs_ngg_emit, 4);
      }
      break;

   case MESA_SHADER_FRAGMENT: {
      ctx->abi.kill_ps_if_inf_interp =
         ctx->screen->options.no_infinite_interp &&
         (ctx->shader->selector->info.uses_persp_center ||
          ctx->shader->selector->info.uses_persp_centroid ||
          ctx->shader->selector->info.uses_persp_sample);
      break;
   }

   case MESA_SHADER_COMPUTE:
      if (ctx->shader->selector->info.base.shared_size)
         si_llvm_declare_compute_memory(ctx);
      break;

   default:
      break;
   }

   bool is_merged_esgs_stage =
      ctx->screen->info.gfx_level >= GFX9 && ctx->stage <= MESA_SHADER_GEOMETRY &&
      (ctx->shader->key.ge.as_es || ctx->stage == MESA_SHADER_GEOMETRY);

   bool is_nogs_ngg_stage =
      (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
      shader->key.ge.as_ngg && !shader->key.ge.as_es;

   /* Declare the ESGS ring as an explicit LDS symbol.
    * For NGG VS/TES, declare it unconditionally for streamout and vertex compaction.
    * Whether space is actually allocated is determined during linking / PM4 creation.
    */
   if (is_merged_esgs_stage || is_nogs_ngg_stage)
      si_llvm_declare_lds_esgs_ring(ctx);

   /* This is really only needed when streamout and / or vertex
    * compaction is enabled.
    */
   if (is_nogs_ngg_stage &&
       (si_shader_uses_streamout(shader) || shader->key.ge.opt.ngg_culling)) {
      LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader));
      ctx->gs_ngg_scratch = (struct ac_llvm_pointer) {
         .value = LLVMAddGlobalInAddressSpace(ctx->ac.module, asi32, "ngg_scratch",
                                              AC_ADDR_SPACE_LDS),
         .pointee_type = asi32
      };
      LLVMSetInitializer(ctx->gs_ngg_scratch.value, LLVMGetUndef(asi32));
      LLVMSetAlignment(ctx->gs_ngg_scratch.value, 8);
   }

   /* For merged shaders (VS-TCS, VS-GS, TES-GS): */
   if (ctx->screen->info.gfx_level >= GFX9 && si_is_merged_shader(shader)) {
      /* Set EXEC = ~0 before the first shader. For monolithic shaders, the wrapper
       * function does this.
       */
      if (ctx->stage == MESA_SHADER_TESS_EVAL) {
         /* TES has only 1 shader part, therefore it doesn't use the wrapper function. */
         if (!shader->is_monolithic || !shader->key.ge.as_es)
            ac_init_exec_full_mask(&ctx->ac);
      } else if (ctx->stage == MESA_SHADER_VERTEX) {
         if (shader->is_monolithic) {
            /* Only monolithic VS with TCS/GS present has a wrapper function. */
            if (!shader->key.ge.as_ls && !shader->key.ge.as_es)
               ac_init_exec_full_mask(&ctx->ac);
         } else {
            ac_init_exec_full_mask(&ctx->ac);
         }
      }

      /* NGG VS and NGG TES: NIR NGG lowering sends gs_alloc_req at the beginning when culling
       * is disabled, but GFX10 may hang if not all waves are launched before gs_alloc_req.
       * We work around this HW bug by inserting a barrier before gs_alloc_req.
       */
      if (ctx->screen->info.gfx_level == GFX10 &&
          (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
          shader->key.ge.as_ngg && !shader->key.ge.as_es && !shader->key.ge.opt.ngg_culling)
         ac_build_s_barrier(&ctx->ac, ctx->stage);

      LLVMValueRef thread_enabled = NULL;

      if ((ctx->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) ||
          (ctx->stage == MESA_SHADER_TESS_CTRL && !shader->is_monolithic)) {
         /* Wrap both shaders in an if statement according to the number of enabled threads.
          * For monolithic TCS, the if statement is inserted by the wrapper function,
          * not here. For NGG GS, the if statement is inserted by NIR lowering.
          */
         thread_enabled = si_is_gs_thread(ctx); /* 2nd shader: thread enabled bool */
      } else if ((shader->key.ge.as_ls || shader->key.ge.as_es) && !shader->is_monolithic) {
         /* For monolithic LS (VS before TCS) and ES (VS before GS and TES before GS),
          * the if statement is inserted by the wrapper function.
          */
         thread_enabled = si_is_es_thread(ctx); /* 1st shader: thread enabled bool */
      }

      if (thread_enabled) {
         ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
         ctx->merged_wrap_if_label = 11500;
         ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);
      }

      /* Execute a barrier before the second shader in
       * a merged shader.
       *
       * Execute the barrier inside the conditional block,
       * so that empty waves can jump directly to s_endpgm,
       * which will also signal the barrier.
       *
       * This is possible in gfx9, because an empty wave for the second shader does not insert
       * any ending. With NGG, empty waves may still be required to export data (e.g. GS output
       * vertices), so we cannot let them exit early.
       *
       * If the shader is TCS and the TCS epilog is present
       * and contains a barrier, it will wait there and then
       * reach s_endpgm.
       */
      if (ctx->stage == MESA_SHADER_TESS_CTRL) {
         /* We need the barrier only if TCS inputs are read from LDS. */
         if (!shader->key.ge.opt.same_patch_vertices ||
             shader->selector->info.base.inputs_read &
                ~shader->selector->info.tcs_vgpr_only_inputs) {
            ac_build_waitcnt(&ctx->ac, AC_WAIT_DS);

            /* If both input and output patches are wholly in one wave, we don't need a barrier.
             * That's true when both VS and TCS have the same number of patch vertices and
             * the wave size is a multiple of the number of patch vertices.
             */
            if (!shader->key.ge.opt.same_patch_vertices ||
                ctx->ac.wave_size % sel->info.base.tess.tcs_vertices_out != 0)
               ac_build_s_barrier(&ctx->ac, ctx->stage);
         }
      } else if (ctx->stage == MESA_SHADER_GEOMETRY) {
         ac_build_waitcnt(&ctx->ac, AC_WAIT_DS);
         ac_build_s_barrier(&ctx->ac, ctx->stage);
      }
   }

   ctx->abi.clamp_shadow_reference = true;
   ctx->abi.robust_buffer_access = true;
   ctx->abi.load_grid_size_from_user_sgpr = true;
   ctx->abi.clamp_div_by_zero = ctx->screen->options.clamp_div_by_zero ||
                                info->options & SI_PROFILE_CLAMP_DIV_BY_ZERO;
   ctx->abi.disable_aniso_single_level = true;

   bool ls_need_output =
      ctx->stage == MESA_SHADER_VERTEX && shader->key.ge.as_ls &&
      shader->key.ge.opt.same_patch_vertices;

   bool ps_need_output = ctx->stage == MESA_SHADER_FRAGMENT;

   if (ls_need_output || ps_need_output) {
      for (unsigned i = 0; i < info->num_outputs; i++) {
         LLVMTypeRef type = ctx->ac.f32;

         /* Only FS uses unpacked f16. Other stages pack 16-bit outputs into low and high bits of f32. */
         if (nir->info.stage == MESA_SHADER_FRAGMENT &&
             nir_alu_type_get_type_size(ctx->shader->selector->info.output_type[i]) == 16)
            type = ctx->ac.f16;

         for (unsigned j = 0; j < 4; j++) {
            ctx->abi.outputs[i * 4 + j] = ac_build_alloca_undef(&ctx->ac, type, "");
            ctx->abi.is_16bit[i * 4 + j] = type == ctx->ac.f16;
         }
      }
   }

   if (!ac_nir_translate(&ctx->ac, &ctx->abi, &ctx->args->ac, nir))
      return false;

   switch (ctx->stage) {
   case MESA_SHADER_VERTEX:
      if (shader->key.ge.as_ls)
         si_llvm_ls_build_end(ctx);
      else if (shader->key.ge.as_es)
         si_llvm_es_build_end(ctx);
      break;

   case MESA_SHADER_TESS_CTRL:
      if (!shader->is_monolithic)
         si_llvm_tcs_build_end(ctx);
      break;

   case MESA_SHADER_TESS_EVAL:
      if (ctx->shader->key.ge.as_es)
         si_llvm_es_build_end(ctx);
      break;

   case MESA_SHADER_GEOMETRY:
      if (!ctx->shader->key.ge.as_ngg)
         si_llvm_gs_build_end(ctx);
      break;

   case MESA_SHADER_FRAGMENT:
      if (!shader->is_monolithic)
         si_llvm_ps_build_end(ctx);
      break;

   default:
      break;
   }

   si_llvm_build_ret(ctx, ctx->return_value);

   if (free_nir)
      ralloc_free(nir);
   return true;
}

static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
                                    struct si_shader_selector *sel)
{
   if (!compiler->low_opt_passes)
      return false;

   /* Assume a slow CPU. */
   assert(!sel->screen->info.has_dedicated_vram && sel->screen->info.gfx_level <= GFX8);

   /* For a crazy dEQP test containing 2597 memory opcodes, mostly
    * buffer stores. */
   return sel->stage == MESA_SHADER_COMPUTE && sel->info.num_memory_stores > 1000;
}

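/* Entry point of the LLVM backend: translate the NIR shader (and, for merged
 * GFX9+ monolithic TCS/GS, the previous-stage NIR plus a wrapper function),
 * optimize the module, and compile it to a shader binary.
 */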
bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                            struct si_shader *shader, struct si_shader_args *args,
                            struct util_debug_callback *debug, struct nir_shader *nir)
{
   struct si_shader_selector *sel = shader->selector;
   struct si_shader_context ctx;
   enum ac_float_mode float_mode = nir->info.stage == MESA_SHADER_KERNEL ?
                                      AC_FLOAT_MODE_DEFAULT : AC_FLOAT_MODE_DEFAULT_OPENGL;
   bool exports_color_null = false;
   bool exports_mrtz = false;

   if (sel->stage == MESA_SHADER_FRAGMENT) {
      exports_color_null = sel->info.colors_written;
      exports_mrtz = sel->info.writes_z || sel->info.writes_stencil || shader->ps.writes_samplemask;
      if (!exports_mrtz && !exports_color_null)
         exports_color_null = si_shader_uses_discard(shader) || sscreen->info.gfx_level < GFX10;
   }

   si_llvm_context_init(&ctx, sscreen, compiler, shader->wave_size, exports_color_null, exports_mrtz,
                        float_mode);
   ctx.args = args;

   if (!si_llvm_translate_nir(&ctx, shader, nir, false)) {
      si_llvm_dispose(&ctx);
      return false;
   }

   /* For merged shader stages. */
   if (shader->is_monolithic && sscreen->info.gfx_level >= GFX9 &&
       (sel->stage == MESA_SHADER_TESS_CTRL || sel->stage == MESA_SHADER_GEOMETRY)) {
      /* LS or ES shader. */
      struct si_shader prev_shader = {};

      bool free_nir;
      nir = si_get_prev_stage_nir_shader(shader, &prev_shader, ctx.args, &free_nir);

      struct ac_llvm_pointer parts[2];
      parts[1] = ctx.main_fn;

      if (!si_llvm_translate_nir(&ctx, &prev_shader, nir, free_nir)) {
         si_llvm_dispose(&ctx);
         return false;
      }

      parts[0] = ctx.main_fn;

      /* Reset the shader context. */
      ctx.shader = shader;
      ctx.stage = sel->stage;

      bool same_thread_count = shader->key.ge.opt.same_patch_vertices;
      si_build_wrapper_function(&ctx, parts, same_thread_count);
   }

   si_llvm_optimize_module(&ctx);

   /* Make sure the input is a pointer and not integer followed by inttoptr. */
   assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn.value, 0))) == LLVMPointerTypeKind);

   /* Compile to bytecode. */
   if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, &ctx.ac, debug,
                        sel->stage, si_get_shader_name(shader),
                        si_should_optimize_less(compiler, shader->selector))) {
      si_llvm_dispose(&ctx);
      fprintf(stderr, "LLVM failed to compile shader\n");
      return false;
   }

   si_llvm_dispose(&ctx);
   return true;
}

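/* Build and compile a standalone shader part (currently only PS prologs and
 * epilogs) described by result->key, storing the binary in result.
 */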
bool si_llvm_build_shader_part(struct si_screen *sscreen, gl_shader_stage stage,
                               bool prolog, struct ac_llvm_compiler *compiler,
                               struct util_debug_callback *debug, const char *name,
                               struct si_shader_part *result)
{
   union si_shader_part_key *key = &result->key;

   struct si_shader_selector sel = {};
   sel.screen = sscreen;

   struct si_shader shader = {};
   shader.selector = &sel;
   bool wave32 = false;
   bool exports_color_null = false;
   bool exports_mrtz = false;

   switch (stage) {
   case MESA_SHADER_FRAGMENT:
      if (prolog) {
         shader.key.ps.part.prolog = key->ps_prolog.states;
         wave32 = key->ps_prolog.wave32;
         exports_color_null = key->ps_prolog.states.poly_stipple;
      } else {
         shader.key.ps.part.epilog = key->ps_epilog.states;
         wave32 = key->ps_epilog.wave32;
         exports_color_null = key->ps_epilog.colors_written;
         exports_mrtz = key->ps_epilog.writes_z || key->ps_epilog.writes_stencil ||
                        key->ps_epilog.writes_samplemask;
         if (!exports_mrtz && !exports_color_null)
            exports_color_null = key->ps_epilog.uses_discard || sscreen->info.gfx_level < GFX10;
      }
      break;
   default:
      unreachable("bad shader part");
   }

   struct si_shader_context ctx;
   si_llvm_context_init(&ctx, sscreen, compiler, wave32 ? 32 : 64, exports_color_null, exports_mrtz,
                        AC_FLOAT_MODE_DEFAULT_OPENGL);

   ctx.shader = &shader;
   ctx.stage = stage;

   struct si_shader_args args;
   ctx.args = &args;

   void (*build)(struct si_shader_context *, union si_shader_part_key *);

   switch (stage) {
   case MESA_SHADER_FRAGMENT:
      build = prolog ? si_llvm_build_ps_prolog : si_llvm_build_ps_epilog;
      break;
   default:
      unreachable("bad shader part");
   }

   build(&ctx, key);

   /* Compile. */
   si_llvm_optimize_module(&ctx);

   bool ret = si_compile_llvm(sscreen, &result->binary, &result->config, compiler,
                              &ctx.ac, debug, ctx.stage, name, false);

   si_llvm_dispose(&ctx);
   return ret;
}