/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */
6
7 #include "si_pipe.h"
8 #include "si_shader_internal.h"
9 #include "si_shader_llvm.h"
10 #include "sid.h"
11
/**
 * Emit the read of one channel of a fragment shader input attribute.
 *
 * If at least one of the weights is non-NULL, the attribute is interpolated
 * through ac_build_fs_interp(). If both are NULL, the parameter is fetched
 * through ac_build_fs_interp_mov() instead.
 *
 * @param ctx         shader context
 * @param attr_index  index of the attribute to read
 * @param chan        channel of the attribute to read
 * @param prim_mask   primitive mask forwarded to the intrinsic
 * @param i           first interpolation weight, or NULL
 * @param j           second interpolation weight, or NULL
 * @return the value returned by the emitted intrinsic helper
 */
static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx, unsigned attr_index,
                                       unsigned chan, LLVMValueRef prim_mask, LLVMValueRef i,
                                       LLVMValueRef j)
{
   LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, 0);
   LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, attr_index, 0);

   if (!i && !j) {
      /* No weights at all: move the parameter; the selector 0 means P0. */
      return ac_build_fs_interp_mov(&ctx->ac, 0, llvm_chan, attr_number, prim_mask);
   }

   return ac_build_fs_interp(&ctx->ac, llvm_chan, attr_number, prim_mask, i, j);
}
24
25 /**
26 * Interpolate a fragment shader input.
27 *
28 * @param ctx context
29 * @param input_index index of the input in hardware
30 * @param semantic_index semantic index
31 * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset)
32 * @param colors_read_mask color components read (4 bits for each color, 8 bits in total)
33 * @param interp_param interpolation weights (i,j)
34 * @param prim_mask SI_PARAM_PRIM_MASK
35 * @param face SI_PARAM_FRONT_FACE
36 * @param result the return value (4 components)
37 */
interp_fs_color(struct si_shader_context * ctx,unsigned input_index,unsigned semantic_index,unsigned num_interp_inputs,unsigned colors_read_mask,LLVMValueRef interp_param,LLVMValueRef prim_mask,LLVMValueRef face,LLVMValueRef result[4])38 static void interp_fs_color(struct si_shader_context *ctx, unsigned input_index,
39 unsigned semantic_index, unsigned num_interp_inputs,
40 unsigned colors_read_mask, LLVMValueRef interp_param,
41 LLVMValueRef prim_mask, LLVMValueRef face, LLVMValueRef result[4])
42 {
43 LLVMValueRef i = NULL, j = NULL;
44 unsigned chan;
45
46 /* fs.constant returns the param from the middle vertex, so it's not
47 * really useful for flat shading. It's meant to be used for custom
48 * interpolation (but the intrinsic can't fetch from the other two
49 * vertices).
50 *
51 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
52 * to do the right thing. The only reason we use fs.constant is that
53 * fs.interp cannot be used on integers, because they can be equal
54 * to NaN.
55 *
56 * When interp is false we will use fs.constant or for newer llvm,
57 * amdgcn.interp.mov.
58 */
59 bool interp = interp_param != NULL;
60
61 if (interp) {
62 i = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ctx->ac.i32_0, "");
63 j = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ctx->ac.i32_1, "");
64 }
65
66 if (ctx->shader->key.ps.part.prolog.color_two_side) {
67 LLVMValueRef is_face_positive;
68
69 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
70 * otherwise it's at offset "num_inputs".
71 */
72 unsigned back_attr_offset = num_interp_inputs;
73 if (semantic_index == 1 && colors_read_mask & 0xf)
74 back_attr_offset += 1;
75
76 is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, face, ctx->ac.i32_0, "");
77
78 for (chan = 0; chan < 4; chan++) {
79 LLVMValueRef front, back;
80
81 front = si_build_fs_interp(ctx, input_index, chan, prim_mask, i, j);
82 back = si_build_fs_interp(ctx, back_attr_offset, chan, prim_mask, i, j);
83
84 result[chan] = LLVMBuildSelect(ctx->ac.builder, is_face_positive, front, back, "");
85 }
86 } else {
87 for (chan = 0; chan < 4; chan++) {
88 result[chan] = si_build_fs_interp(ctx, input_index, chan, prim_mask, i, j);
89 }
90 }
91 }
92
/**
 * Emit the alpha test selected by the epilog key.
 *
 * For PIPE_FUNC_NEVER, ac_build_kill_if_false() is called with a constant
 * false condition. For any other function, alpha is compared against the
 * alpha reference argument with the matching ordered float predicate and the
 * comparison result is handed to ac_build_kill_if_false().
 *
 * @param ctx    shader context
 * @param alpha  alpha value to test; if it is f16, the reference value is
 *               truncated to f16 before the comparison
 */
static void si_alpha_test(struct si_shader_context *ctx, LLVMValueRef alpha)
{
   /* Read-only translation table from PIPE_FUNC_* to LLVM predicates.
    * Functions that are not listed keep the zero-initialized value, which the
    * assertion below rejects.
    */
   static const LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = {
      [PIPE_FUNC_LESS] = LLVMRealOLT,     [PIPE_FUNC_EQUAL] = LLVMRealOEQ,
      [PIPE_FUNC_LEQUAL] = LLVMRealOLE,   [PIPE_FUNC_GREATER] = LLVMRealOGT,
      [PIPE_FUNC_NOTEQUAL] = LLVMRealONE, [PIPE_FUNC_GEQUAL] = LLVMRealOGE,
   };

   if (ctx->shader->key.ps.part.epilog.alpha_func == PIPE_FUNC_NEVER) {
      ac_build_kill_if_false(&ctx->ac, ctx->ac.i1false);
      return;
   }

   const LLVMRealPredicate cond = cond_map[ctx->shader->key.ps.part.epilog.alpha_func];
   assert(cond);

   LLVMValueRef alpha_ref = ac_get_arg(&ctx->ac, ctx->args->alpha_reference);
   /* Both operands of the comparison must have the same type. */
   if (LLVMTypeOf(alpha) == ctx->ac.f16)
      alpha_ref = LLVMBuildFPTrunc(ctx->ac.builder, alpha_ref, ctx->ac.f16, "");

   LLVMValueRef alpha_pass = LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, "");
   ac_build_kill_if_false(&ctx->ac, alpha_pass);
}
114
115 struct si_ps_exports {
116 unsigned num;
117 struct ac_export_args args[10];
118 };
119
/**
 * Gather two values into a vector and bitcast the result to v2f16.
 *
 * @param ctx   LLVM context
 * @param args  the two values to pack
 * @return the gathered pair, bitcast to v2f16
 */
static LLVMValueRef pack_two_16bit(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef pair = ac_build_gather_values(ctx, args, 2);

   pair = LLVMBuildBitCast(ctx->builder, pair, ctx->v2f16, "");
   return pair;
}
125
/**
 * Widen a color component to a 32-bit value of float type.
 *
 * @param ctx         shader context
 * @param color_type  one of SI_TYPE_ANY32, SI_TYPE_FLOAT16, SI_TYPE_INT16,
 *                    SI_TYPE_UINT16
 * @param value       the component to convert
 * @return value itself for SI_TYPE_ANY32, the float-extended value for
 *         SI_TYPE_FLOAT16, the sign-/zero-extended integer cast back to float
 *         for SI_TYPE_INT16/SI_TYPE_UINT16, or NULL for any other type
 */
static LLVMValueRef get_color_32bit(struct si_shader_context *ctx, unsigned color_type,
                                    LLVMValueRef value)
{
   if (color_type == SI_TYPE_ANY32)
      return value;

   if (color_type == SI_TYPE_FLOAT16)
      return LLVMBuildFPExt(ctx->ac.builder, value, ctx->ac.f32, "");

   if (color_type == SI_TYPE_INT16 || color_type == SI_TYPE_UINT16) {
      LLVMValueRef narrow = ac_to_integer(&ctx->ac, value);
      LLVMValueRef wide;

      /* Signed inputs keep their sign, unsigned inputs are zero-filled. */
      if (color_type == SI_TYPE_INT16)
         wide = LLVMBuildSExt(ctx->ac.builder, narrow, ctx->ac.i32, "");
      else
         wide = LLVMBuildZExt(ctx->ac.builder, narrow, ctx->ac.i32, "");

      return ac_to_float(&ctx->ac, wide);
   }

   return NULL;
}
145
146 /* Initialize arguments for the shader export intrinsic */
si_llvm_init_ps_export_args(struct si_shader_context * ctx,LLVMValueRef * values,unsigned cbuf,unsigned compacted_mrt_index,unsigned color_type,struct ac_export_args * args)147 static bool si_llvm_init_ps_export_args(struct si_shader_context *ctx, LLVMValueRef *values,
148 unsigned cbuf, unsigned compacted_mrt_index,
149 unsigned color_type, struct ac_export_args *args)
150 {
151 const union si_shader_key *key = &ctx->shader->key;
152 unsigned col_formats = key->ps.part.epilog.spi_shader_col_format;
153 LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32);
154 unsigned spi_shader_col_format;
155 unsigned chan;
156 bool is_int8, is_int10;
157
158 assert(cbuf < 8);
159
160 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
161 if (spi_shader_col_format == V_028714_SPI_SHADER_ZERO)
162 return false;
163
164 is_int8 = (key->ps.part.epilog.color_is_int8 >> cbuf) & 0x1;
165 is_int10 = (key->ps.part.epilog.color_is_int10 >> cbuf) & 0x1;
166
167 /* Default is 0xf. Adjusted below depending on the format. */
168 args->enabled_channels = 0xf; /* writemask */
169
170 /* Specify whether the EXEC mask represents the valid mask */
171 args->valid_mask = 0;
172
173 /* Specify whether this is the last export */
174 args->done = 0;
175
176 /* Specify the target we are exporting */
177 args->target = V_008DFC_SQ_EXP_MRT + compacted_mrt_index;
178
179 if (key->ps.part.epilog.dual_src_blend_swizzle &&
180 (compacted_mrt_index == 0 || compacted_mrt_index == 1)) {
181 assert(ctx->ac.gfx_level >= GFX11);
182 args->target += 21;
183 }
184
185 args->compr = false;
186 args->out[0] = f32undef;
187 args->out[1] = f32undef;
188 args->out[2] = f32undef;
189 args->out[3] = f32undef;
190
191 LLVMValueRef (*packf)(struct ac_llvm_context * ctx, LLVMValueRef args[2]) = NULL;
192 LLVMValueRef (*packi)(struct ac_llvm_context * ctx, LLVMValueRef args[2], unsigned bits,
193 bool hi) = NULL;
194
195 switch (spi_shader_col_format) {
196 case V_028714_SPI_SHADER_32_R:
197 args->enabled_channels = 1; /* writemask */
198 args->out[0] = get_color_32bit(ctx, color_type, values[0]);
199 break;
200
201 case V_028714_SPI_SHADER_32_GR:
202 args->enabled_channels = 0x3; /* writemask */
203 args->out[0] = get_color_32bit(ctx, color_type, values[0]);
204 args->out[1] = get_color_32bit(ctx, color_type, values[1]);
205 break;
206
207 case V_028714_SPI_SHADER_32_AR:
208 if (ctx->screen->info.gfx_level >= GFX10) {
209 args->enabled_channels = 0x3; /* writemask */
210 args->out[0] = get_color_32bit(ctx, color_type, values[0]);
211 args->out[1] = get_color_32bit(ctx, color_type, values[3]);
212 } else {
213 args->enabled_channels = 0x9; /* writemask */
214 args->out[0] = get_color_32bit(ctx, color_type, values[0]);
215 args->out[3] = get_color_32bit(ctx, color_type, values[3]);
216 }
217 break;
218
219 case V_028714_SPI_SHADER_FP16_ABGR:
220 if (color_type != SI_TYPE_ANY32)
221 packf = pack_two_16bit;
222 else
223 packf = ac_build_cvt_pkrtz_f16;
224 break;
225
226 case V_028714_SPI_SHADER_UNORM16_ABGR:
227 if (color_type != SI_TYPE_ANY32)
228 packf = ac_build_cvt_pknorm_u16_f16;
229 else
230 packf = ac_build_cvt_pknorm_u16;
231 break;
232
233 case V_028714_SPI_SHADER_SNORM16_ABGR:
234 if (color_type != SI_TYPE_ANY32)
235 packf = ac_build_cvt_pknorm_i16_f16;
236 else
237 packf = ac_build_cvt_pknorm_i16;
238 break;
239
240 case V_028714_SPI_SHADER_UINT16_ABGR:
241 if (color_type != SI_TYPE_ANY32)
242 packf = pack_two_16bit;
243 else
244 packi = ac_build_cvt_pk_u16;
245 break;
246
247 case V_028714_SPI_SHADER_SINT16_ABGR:
248 if (color_type != SI_TYPE_ANY32)
249 packf = pack_two_16bit;
250 else
251 packi = ac_build_cvt_pk_i16;
252 break;
253
254 case V_028714_SPI_SHADER_32_ABGR:
255 for (unsigned i = 0; i < 4; i++)
256 args->out[i] = get_color_32bit(ctx, color_type, values[i]);
257 break;
258 }
259
260 /* Pack f16 or norm_i16/u16. */
261 if (packf) {
262 for (chan = 0; chan < 2; chan++) {
263 LLVMValueRef pack_args[2] = {values[2 * chan], values[2 * chan + 1]};
264 LLVMValueRef packed;
265
266 packed = packf(&ctx->ac, pack_args);
267 args->out[chan] = ac_to_float(&ctx->ac, packed);
268 }
269 }
270 /* Pack i16/u16. */
271 if (packi) {
272 for (chan = 0; chan < 2; chan++) {
273 LLVMValueRef pack_args[2] = {ac_to_integer(&ctx->ac, values[2 * chan]),
274 ac_to_integer(&ctx->ac, values[2 * chan + 1])};
275 LLVMValueRef packed;
276
277 packed = packi(&ctx->ac, pack_args, is_int8 ? 8 : is_int10 ? 10 : 16, chan == 1);
278 args->out[chan] = ac_to_float(&ctx->ac, packed);
279 }
280 }
281 if (packf || packi) {
282 if (ctx->screen->info.gfx_level >= GFX11)
283 args->enabled_channels = 0x3;
284 else
285 args->compr = 1; /* COMPR flag */
286 }
287
288 return true;
289 }
290
/**
 * Apply the per-color epilog states to one color output, in this order:
 * color clamping, alpha-to-one, and (for color 0 only) the alpha test.
 *
 * @param ctx    shader context
 * @param color  the 4 components of the color, modified in place
 * @param index  index of the color output
 */
static void si_llvm_build_clamp_alpha_test(struct si_shader_context *ctx,
                                           LLVMValueRef *color, unsigned index)
{
   const union si_shader_key *key = &ctx->shader->key;

   /* Clamp color */
   if (key->ps.part.epilog.clamp_color) {
      for (unsigned c = 0; c < 4; c++)
         color[c] = ac_build_clamp(&ctx->ac, color[c]);
   }

   /* Alpha to one: the constant takes the type of the first component. */
   if (key->ps.part.epilog.alpha_to_one)
      color[3] = LLVMConstReal(LLVMTypeOf(color[0]), 1);

   /* Alpha test: only color 0 is tested, and PIPE_FUNC_ALWAYS needs no code. */
   if (index != 0)
      return;
   if (key->ps.part.epilog.alpha_func == PIPE_FUNC_ALWAYS)
      return;

   si_alpha_test(ctx, color[3]);
}
309
/**
 * Append the export arguments of one color output to the export list.
 *
 * If last_cbuf > 0 in the epilog key (FS_COLOR0_WRITES_ALL_CBUFS is true), the
 * color is exported to every color buffer from 0 to last_cbuf. Otherwise it is
 * exported to the color buffer "index" only. Color buffers for which
 * si_llvm_init_ps_export_args() returns false do not consume an export slot.
 *
 * @param ctx                 shader context
 * @param color               the 4 components of the color
 * @param index               index of the color output
 * @param first_color_export  position of the first color export in exp
 * @param color_type          SI_TYPE_* of the color
 * @param exp                 export list to append to
 */
static void si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *color, unsigned index,
                                unsigned first_color_export, unsigned color_type,
                                struct si_ps_exports *exp)
{
   unsigned first_cbuf = index;
   unsigned num_cbufs = 1;

   /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
   if (ctx->shader->key.ps.part.epilog.last_cbuf > 0) {
      assert(exp->num == first_color_export);

      first_cbuf = 0;
      num_cbufs = ctx->shader->key.ps.part.epilog.last_cbuf + 1;
   }

   /* Get the export arguments; exp->num ends up after the last one. */
   for (unsigned n = 0; n < num_cbufs; n++) {
      struct ac_export_args *slot = &exp->args[exp->num];

      if (!si_llvm_init_ps_export_args(ctx, color, first_cbuf + n,
                                       exp->num - first_color_export, color_type, slot))
         continue;

      assert(slot->enabled_channels);
      exp->num++;
   }
}
335
336 /**
337 * Return PS outputs in this order:
338 *
339 * v[0:3] = color0.xyzw
340 * v[4:7] = color1.xyzw
341 * ...
342 * vN+0 = Depth
343 * vN+1 = Stencil
344 * vN+2 = SampleMask
345 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
346 *
347 * The alpha-ref SGPR is returned via its original location.
348 */
si_llvm_ps_build_end(struct si_shader_context * ctx)349 void si_llvm_ps_build_end(struct si_shader_context *ctx)
350 {
351 struct si_shader *shader = ctx->shader;
352 struct si_shader_info *info = &shader->selector->info;
353 LLVMBuilderRef builder = ctx->ac.builder;
354 unsigned i, j, vgpr;
355 LLVMValueRef *addrs = ctx->abi.outputs;
356
357 LLVMValueRef color[8][4] = {};
358 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
359 LLVMValueRef ret;
360
361 /* Read the output values. */
362 for (i = 0; i < info->num_outputs; i++) {
363 unsigned semantic = info->output_semantic[i];
364 LLVMTypeRef type = ctx->abi.is_16bit[4 * i] ? ctx->ac.f16 : ctx->ac.f32;
365
366 switch (semantic) {
367 case FRAG_RESULT_DEPTH:
368 depth = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], "");
369 break;
370 case FRAG_RESULT_STENCIL:
371 stencil = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], "");
372 break;
373 case FRAG_RESULT_SAMPLE_MASK:
374 samplemask = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], "");
375 break;
376 default:
377 if (semantic >= FRAG_RESULT_DATA0 && semantic <= FRAG_RESULT_DATA7) {
378 unsigned index = semantic - FRAG_RESULT_DATA0;
379
380 for (j = 0; j < 4; j++) {
381 LLVMValueRef ptr = addrs[4 * i + j];
382 type = ctx->abi.is_16bit[4 * i + j] ? ctx->ac.f16 : ctx->ac.f32;
383 LLVMValueRef result = LLVMBuildLoad2(builder, type, ptr, "");
384 color[index][j] = result;
385 }
386 } else {
387 fprintf(stderr, "Warning: Unhandled fs output type:%d\n", semantic);
388 }
389 break;
390 }
391 }
392
393 /* Fill the return structure. */
394 ret = ctx->return_value;
395
396 /* Set SGPRs. */
397 ret = LLVMBuildInsertValue(
398 builder, ret, ac_to_integer(&ctx->ac, LLVMGetParam(ctx->main_fn.value, SI_PARAM_ALPHA_REF)),
399 SI_SGPR_ALPHA_REF, "");
400
401 /* Set VGPRs */
402 vgpr = SI_SGPR_ALPHA_REF + 1;
403 for (i = 0; i < ARRAY_SIZE(color); i++) {
404 if (!color[i][0])
405 continue;
406
407 if (LLVMTypeOf(color[i][0]) == ctx->ac.f16) {
408 for (j = 0; j < 2; j++) {
409 LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, &color[i][j * 2], 2);
410 tmp = LLVMBuildBitCast(builder, tmp, ctx->ac.f32, "");
411 ret = LLVMBuildInsertValue(builder, ret, tmp, vgpr++, "");
412 }
413 vgpr += 2;
414 } else {
415 for (j = 0; j < 4; j++)
416 ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
417 }
418 }
419 if (depth)
420 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
421 if (stencil)
422 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
423 if (samplemask)
424 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
425
426 ctx->return_value = ret;
427 }
428
si_llvm_emit_polygon_stipple(struct si_shader_context * ctx)429 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx)
430 {
431 LLVMBuilderRef builder = ctx->ac.builder;
432 LLVMValueRef desc, offset, row, bit, address[2];
433
434 /* Use the fixed-point gl_FragCoord input.
435 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
436 * per coordinate to get the repeating effect.
437 */
438 address[0] = si_unpack_param(ctx, ctx->args->ac.pos_fixed_pt, 0, 5);
439 address[1] = si_unpack_param(ctx, ctx->args->ac.pos_fixed_pt, 16, 5);
440
441 /* Load the buffer descriptor. */
442 desc = si_prolog_get_internal_binding_slot(ctx, SI_PS_CONST_POLY_STIPPLE);
443
444 /* The stipple pattern is 32x32, each row has 32 bits. */
445 offset = LLVMBuildMul(builder, address[1], LLVMConstInt(ctx->ac.i32, 4, 0), "");
446 row = si_buffer_load_const(ctx, desc, offset);
447 row = ac_to_integer(&ctx->ac, row);
448 bit = LLVMBuildLShr(builder, row, address[0], "");
449 bit = LLVMBuildTrunc(builder, bit, ctx->ac.i1, "");
450 ac_build_kill_if_false(&ctx->ac, bit);
451 }
452
/**
 * Store a value into the return-value slot(s) that correspond to a function
 * argument.
 *
 * The slot is the argument's register offset; VGPR arguments are placed after
 * all used SGPRs. An argument of size 1 is stored as is, an argument of size 2
 * is split into its two vector elements, which are stored in consecutive
 * slots.
 *
 * @param ctx        shader context
 * @param ret        return value to update
 * @param data       value to store (a 2-element vector if the size is 2)
 * @param arg_index  index of the argument in ctx->args->ac.args
 * @return the updated return value
 */
static LLVMValueRef insert_ret_of_arg(struct si_shader_context *ctx, LLVMValueRef ret,
                                      LLVMValueRef data, unsigned arg_index)
{
   unsigned slot = ctx->args->ac.args[arg_index].offset;

   if (ctx->args->ac.args[arg_index].file == AC_ARG_VGPR)
      slot += ctx->args->ac.num_sgprs_used;

   if (ctx->args->ac.args[arg_index].size == 1)
      return LLVMBuildInsertValue(ctx->ac.builder, ret, data, slot, "");

   assert(ctx->args->ac.args[arg_index].size == 2);

   LLVMValueRef lane_index[2] = {ctx->ac.i32_0, ctx->ac.i32_1};
   for (unsigned lane = 0; lane < 2; lane++) {
      LLVMValueRef elem = LLVMBuildExtractElement(ctx->ac.builder, data, lane_index[lane], "");

      ret = LLVMBuildInsertValue(ctx->ac.builder, ret, elem, slot + lane, "");
   }
   return ret;
}
471
472 /**
473 * Build the pixel shader prolog function. This handles:
474 * - two-side color selection and interpolation
475 * - overriding interpolation parameters for the API PS
476 * - polygon stippling
477 *
478 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
479 * overridden by other states. (e.g. per-sample interpolation)
480 * Interpolated colors are stored after the preloaded VGPRs.
481 */
si_llvm_build_ps_prolog(struct si_shader_context * ctx,union si_shader_part_key * key)482 void si_llvm_build_ps_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
483 {
484 struct si_shader_args *args = ctx->args;
485 si_get_ps_prolog_args(args, key);
486
487 /* Declare outputs (same as inputs + add colors if needed) */
488 LLVMTypeRef return_types[AC_MAX_ARGS];
489 int num_returns = 0;
490
491 for (int i = 0; i < args->ac.num_sgprs_used; i++)
492 return_types[num_returns++] = ctx->ac.i32;
493
494 unsigned num_color_channels = util_bitcount(key->ps_prolog.colors_read);
495 unsigned num_output_vgprs = args->ac.num_vgprs_used + num_color_channels;
496 for (int i = 0; i < num_output_vgprs; i++)
497 return_types[num_returns++] = ctx->ac.f32;
498
499 /* Create the function. */
500 si_llvm_create_func(ctx, "ps_prolog", return_types, num_returns, 0);
501 LLVMValueRef func = ctx->main_fn.value;
502
503 /* Copy inputs to outputs. This should be no-op, as the registers match,
504 * but it will prevent the compiler from overwriting them unintentionally.
505 */
506 LLVMValueRef ret = ctx->return_value;
507 for (int i = 0; i < args->ac.arg_count; i++) {
508 LLVMValueRef p = LLVMGetParam(func, i);
509 ret = insert_ret_of_arg(ctx, ret, p, i);
510 }
511
512 /* Polygon stippling. */
513 if (key->ps_prolog.states.poly_stipple)
514 si_llvm_emit_polygon_stipple(ctx);
515
516 if (key->ps_prolog.states.bc_optimize_for_persp ||
517 key->ps_prolog.states.bc_optimize_for_linear) {
518 LLVMValueRef center, centroid, tmp;
519
520 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
521 * The hw doesn't compute CENTROID if the whole wave only
522 * contains fully-covered quads.
523 */
524 LLVMValueRef bc_optimize = ac_get_arg(&ctx->ac, args->ac.prim_mask);
525 bc_optimize =
526 LLVMBuildLShr(ctx->ac.builder, bc_optimize, LLVMConstInt(ctx->ac.i32, 31, 0), "");
527 bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize, ctx->ac.i1, "");
528
529 if (key->ps_prolog.states.bc_optimize_for_persp) {
530 center = ac_get_arg(&ctx->ac, args->ac.persp_center);
531 centroid = ac_get_arg(&ctx->ac, args->ac.persp_centroid);
532 /* Select PERSP_CENTROID. */
533 tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, center, centroid, "");
534 ret = insert_ret_of_arg(ctx, ret, tmp, args->ac.persp_centroid.arg_index);
535 }
536 if (key->ps_prolog.states.bc_optimize_for_linear) {
537 center = ac_get_arg(&ctx->ac, args->ac.linear_center);
538 centroid = ac_get_arg(&ctx->ac, args->ac.linear_centroid);
539 /* Select PERSP_CENTROID. */
540 tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, center, centroid, "");
541 ret = insert_ret_of_arg(ctx, ret, tmp, args->ac.linear_centroid.arg_index);
542 }
543 }
544
545 /* Force per-sample interpolation. */
546 if (key->ps_prolog.states.force_persp_sample_interp) {
547 LLVMValueRef persp_sample = ac_get_arg(&ctx->ac, args->ac.persp_sample);
548 /* Overwrite PERSP_CENTER. */
549 ret = insert_ret_of_arg(ctx, ret, persp_sample, args->ac.persp_center.arg_index);
550 /* Overwrite PERSP_CENTROID. */
551 ret = insert_ret_of_arg(ctx, ret, persp_sample, args->ac.persp_centroid.arg_index);
552 }
553 if (key->ps_prolog.states.force_linear_sample_interp) {
554 LLVMValueRef linear_sample = ac_get_arg(&ctx->ac, args->ac.linear_sample);
555 /* Overwrite LINEAR_CENTER. */
556 ret = insert_ret_of_arg(ctx, ret, linear_sample, args->ac.linear_center.arg_index);
557 /* Overwrite LINEAR_CENTROID. */
558 ret = insert_ret_of_arg(ctx, ret, linear_sample, args->ac.linear_centroid.arg_index);
559 }
560
561 /* Force center interpolation. */
562 if (key->ps_prolog.states.force_persp_center_interp) {
563 LLVMValueRef persp_center = ac_get_arg(&ctx->ac, args->ac.persp_center);
564 /* Overwrite PERSP_SAMPLE. */
565 ret = insert_ret_of_arg(ctx, ret, persp_center, args->ac.persp_sample.arg_index);
566 /* Overwrite PERSP_CENTROID. */
567 ret = insert_ret_of_arg(ctx, ret, persp_center, args->ac.persp_centroid.arg_index);
568 }
569 if (key->ps_prolog.states.force_linear_center_interp) {
570 LLVMValueRef linear_center = ac_get_arg(&ctx->ac, args->ac.linear_center);
571 /* Overwrite LINEAR_SAMPLE. */
572 ret = insert_ret_of_arg(ctx, ret, linear_center, args->ac.linear_sample.arg_index);
573 /* Overwrite LINEAR_CENTROID. */
574 ret = insert_ret_of_arg(ctx, ret, linear_center, args->ac.linear_centroid.arg_index);
575 }
576
577 /* Interpolate colors. */
578 unsigned color_out_idx = 0;
579 unsigned num_input_gprs = args->ac.num_sgprs_used + args->ac.num_vgprs_used;
580 for (int i = 0; i < 2; i++) {
581 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
582
583 if (!writemask)
584 continue;
585
586 /* If the interpolation qualifier is not CONSTANT (-1). */
587 LLVMValueRef interp_ij = NULL;
588 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
589 unsigned index =
590 args->ac.num_sgprs_used + key->ps_prolog.color_interp_vgpr_index[i];
591
592 /* Get the (i,j) updated by bc_optimize handling. */
593 LLVMValueRef interp[2] = {
594 LLVMBuildExtractValue(ctx->ac.builder, ret, index, ""),
595 LLVMBuildExtractValue(ctx->ac.builder, ret, index + 1, ""),
596 };
597 interp_ij = ac_build_gather_values(&ctx->ac, interp, 2);
598 }
599
600 LLVMValueRef prim_mask = ac_get_arg(&ctx->ac, args->ac.prim_mask);
601
602 LLVMValueRef face = NULL;
603 if (key->ps_prolog.states.color_two_side) {
604 face = ac_get_arg(&ctx->ac, args->ac.front_face);
605 face = ac_to_integer(&ctx->ac, face);
606 }
607
608 LLVMValueRef color[4];
609 interp_fs_color(ctx, key->ps_prolog.color_attr_index[i], i, key->ps_prolog.num_interp_inputs,
610 key->ps_prolog.colors_read, interp_ij, prim_mask, face, color);
611
612 while (writemask) {
613 unsigned chan = u_bit_scan(&writemask);
614 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan],
615 num_input_gprs + color_out_idx++, "");
616 }
617 }
618
619 /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
620 * says:
621 *
622 * "When per-sample shading is active due to the use of a fragment
623 * input qualified by sample or due to the use of the gl_SampleID
624 * or gl_SamplePosition variables, only the bit for the current
625 * sample is set in gl_SampleMaskIn. When state specifies multiple
626 * fragment shader invocations for a given fragment, the sample
627 * mask for any single fragment shader invocation may specify a
628 * subset of the covered samples for the fragment. In this case,
629 * the bit corresponding to each covered sample will be set in
630 * exactly one fragment shader invocation."
631 *
632 * The samplemask loaded by hardware is always the coverage of the
633 * entire pixel/fragment, so mask bits out based on the sample ID.
634 */
635 if (key->ps_prolog.states.samplemask_log_ps_iter) {
636 uint32_t ps_iter_mask =
637 ac_get_ps_iter_mask(1 << key->ps_prolog.states.samplemask_log_ps_iter);
638 LLVMValueRef sampleid = si_unpack_param(ctx, args->ac.ancillary, 8, 4);
639 LLVMValueRef samplemask = ac_get_arg(&ctx->ac, args->ac.sample_coverage);
640
641 samplemask = ac_to_integer(&ctx->ac, samplemask);
642 samplemask =
643 LLVMBuildAnd(ctx->ac.builder, samplemask,
644 LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, ps_iter_mask, false),
645 sampleid, ""),
646 "");
647 samplemask = ac_to_float(&ctx->ac, samplemask);
648
649 ret = insert_ret_of_arg(ctx, ret, samplemask, args->ac.sample_coverage.arg_index);
650 }
651
652 /* Tell LLVM to insert WQM instruction sequence when needed. */
653 if (key->ps_prolog.wqm) {
654 LLVMAddTargetDependentFunctionAttr(func, "amdgpu-ps-wqm-outputs", "");
655 }
656
657 si_llvm_build_ret(ctx, ret);
658 }
659
660 /**
661 * Build the pixel shader epilog function. This handles everything that must be
662 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
663 */
si_llvm_build_ps_epilog(struct si_shader_context * ctx,union si_shader_part_key * key)664 void si_llvm_build_ps_epilog(struct si_shader_context *ctx, union si_shader_part_key *key)
665 {
666 int i;
667 struct si_ps_exports exp = {};
668 LLVMValueRef color[8][4] = {};
669
670 struct si_shader_args *args = ctx->args;
671 struct ac_arg color_args[MAX_DRAW_BUFFERS];
672 struct ac_arg depth_arg, stencil_arg, samplemask_arg;
673 si_get_ps_epilog_args(args, key, color_args, &depth_arg, &stencil_arg, &samplemask_arg);
674
675 /* Create the function. */
676 si_llvm_create_func(ctx, "ps_epilog", NULL, 0, 0);
677 /* Disable elimination of unused inputs. */
678 ac_llvm_add_target_dep_function_attr(ctx->main_fn.value, "InitialPSInputAddr", 0xffffff);
679
680 /* Prepare color. */
681 unsigned colors_written = key->ps_epilog.colors_written;
682
683 while (colors_written) {
684 int write_i = u_bit_scan(&colors_written);
685 unsigned color_type = (key->ps_epilog.color_types >> (write_i * 2)) & 0x3;
686 LLVMValueRef arg = ac_get_arg(&ctx->ac, color_args[write_i]);
687
688 if (color_type != SI_TYPE_ANY32)
689 arg = LLVMBuildBitCast(ctx->ac.builder, arg, LLVMVectorType(ctx->ac.f16, 8), "");
690
691 for (i = 0; i < 4; i++)
692 color[write_i][i] = ac_llvm_extract_elem(&ctx->ac, arg, i);
693
694 si_llvm_build_clamp_alpha_test(ctx, color[write_i], write_i);
695 }
696
697 LLVMValueRef mrtz_alpha =
698 key->ps_epilog.states.alpha_to_coverage_via_mrtz ? color[0][3] : NULL;
699
700 /* Prepare the mrtz export. */
701 if (key->ps_epilog.writes_z ||
702 key->ps_epilog.writes_stencil ||
703 key->ps_epilog.writes_samplemask ||
704 mrtz_alpha) {
705 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
706
707 if (key->ps_epilog.writes_z)
708 depth = ac_get_arg(&ctx->ac, depth_arg);
709 if (key->ps_epilog.writes_stencil)
710 stencil = ac_get_arg(&ctx->ac, stencil_arg);
711 if (key->ps_epilog.writes_samplemask)
712 samplemask = ac_get_arg(&ctx->ac, samplemask_arg);
713
714 ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, mrtz_alpha, false,
715 &exp.args[exp.num++]);
716 }
717
718 /* Prepare color exports. */
719 const unsigned first_color_export = exp.num;
720 colors_written = key->ps_epilog.colors_written;
721
722 while (colors_written) {
723 int write_i = u_bit_scan(&colors_written);
724 unsigned color_type = (key->ps_epilog.color_types >> (write_i * 2)) & 0x3;
725
726 si_export_mrt_color(ctx, color[write_i], write_i, first_color_export, color_type, &exp);
727 }
728
729 if (exp.num) {
730 exp.args[exp.num - 1].valid_mask = 1; /* whether the EXEC mask is valid */
731 exp.args[exp.num - 1].done = 1; /* DONE bit */
732
733 if (key->ps_epilog.states.dual_src_blend_swizzle) {
734 assert(ctx->ac.gfx_level >= GFX11);
735 assert((key->ps_epilog.colors_written & 0x3) == 0x3);
736 ac_build_dual_src_blend_swizzle(&ctx->ac, &exp.args[first_color_export],
737 &exp.args[first_color_export + 1]);
738 }
739
740 for (unsigned i = 0; i < exp.num; i++)
741 ac_build_export(&ctx->ac, &exp.args[i]);
742 } else {
743 ac_build_export_null(&ctx->ac, key->ps_epilog.uses_discard);
744 }
745
746 /* Compile. */
747 LLVMBuildRetVoid(ctx->ac.builder);
748 }
749
750