xref: /aosp_15_r20/external/mesa3d/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2020 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "si_pipe.h"
8 #include "si_shader_internal.h"
9 #include "si_shader_llvm.h"
10 #include "sid.h"
11 
/* Fetch one channel of a PS input attribute.
 *
 * When at least one of the weights (i, j) is supplied, the value is
 * interpolated with ac_build_fs_interp. When both are NULL, the value is
 * read with ac_build_fs_interp_mov using parameter 0 (P0) instead.
 */
static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx, unsigned attr_index,
                                       unsigned chan, LLVMValueRef prim_mask, LLVMValueRef i,
                                       LLVMValueRef j)
{
   LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, 0);
   LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, attr_index, 0);

   /* No weights at all: non-interpolated fetch. */
   if (!i && !j) {
      return ac_build_fs_interp_mov(&ctx->ac, 0, /* P0 */
                                    llvm_chan, attr_number, prim_mask);
   }

   return ac_build_fs_interp(&ctx->ac, llvm_chan, attr_number, prim_mask, i, j);
}
24 
25 /**
26  * Interpolate a fragment shader input.
27  *
28  * @param ctx                context
29  * @param input_index        index of the input in hardware
30  * @param semantic_index     semantic index
31  * @param num_interp_inputs  number of all interpolated inputs (= BCOLOR offset)
32  * @param colors_read_mask   color components read (4 bits for each color, 8 bits in total)
33  * @param interp_param       interpolation weights (i,j)
34  * @param prim_mask          SI_PARAM_PRIM_MASK
35  * @param face               SI_PARAM_FRONT_FACE
36  * @param result             the return value (4 components)
37  */
interp_fs_color(struct si_shader_context * ctx,unsigned input_index,unsigned semantic_index,unsigned num_interp_inputs,unsigned colors_read_mask,LLVMValueRef interp_param,LLVMValueRef prim_mask,LLVMValueRef face,LLVMValueRef result[4])38 static void interp_fs_color(struct si_shader_context *ctx, unsigned input_index,
39                             unsigned semantic_index, unsigned num_interp_inputs,
40                             unsigned colors_read_mask, LLVMValueRef interp_param,
41                             LLVMValueRef prim_mask, LLVMValueRef face, LLVMValueRef result[4])
42 {
43    LLVMValueRef i = NULL, j = NULL;
44    unsigned chan;
45 
46    /* fs.constant returns the param from the middle vertex, so it's not
47     * really useful for flat shading. It's meant to be used for custom
48     * interpolation (but the intrinsic can't fetch from the other two
49     * vertices).
50     *
51     * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
52     * to do the right thing. The only reason we use fs.constant is that
53     * fs.interp cannot be used on integers, because they can be equal
54     * to NaN.
55     *
56     * When interp is false we will use fs.constant or for newer llvm,
57     * amdgcn.interp.mov.
58     */
59    bool interp = interp_param != NULL;
60 
61    if (interp) {
62       i = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ctx->ac.i32_0, "");
63       j = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ctx->ac.i32_1, "");
64    }
65 
66    if (ctx->shader->key.ps.part.prolog.color_two_side) {
67       LLVMValueRef is_face_positive;
68 
69       /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
70        * otherwise it's at offset "num_inputs".
71        */
72       unsigned back_attr_offset = num_interp_inputs;
73       if (semantic_index == 1 && colors_read_mask & 0xf)
74          back_attr_offset += 1;
75 
76       is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, face, ctx->ac.i32_0, "");
77 
78       for (chan = 0; chan < 4; chan++) {
79          LLVMValueRef front, back;
80 
81          front = si_build_fs_interp(ctx, input_index, chan, prim_mask, i, j);
82          back = si_build_fs_interp(ctx, back_attr_offset, chan, prim_mask, i, j);
83 
84          result[chan] = LLVMBuildSelect(ctx->ac.builder, is_face_positive, front, back, "");
85       }
86    } else {
87       for (chan = 0; chan < 4; chan++) {
88          result[chan] = si_build_fs_interp(ctx, input_index, chan, prim_mask, i, j);
89       }
90    }
91 }
92 
/* Emit the alpha test: kill the pixel unless "alpha <func> alpha_ref" holds.
 *
 * PIPE_FUNC_NEVER kills unconditionally. The caller is expected to skip this
 * function for PIPE_FUNC_ALWAYS, which has no entry in the predicate table.
 */
static void si_alpha_test(struct si_shader_context *ctx, LLVMValueRef alpha)
{
   if (ctx->shader->key.ps.part.epilog.alpha_func == PIPE_FUNC_NEVER) {
      ac_build_kill_if_false(&ctx->ac, ctx->ac.i1false);
      return;
   }

   /* Map the gallium compare function to an ordered LLVM float predicate. */
   static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = {
      [PIPE_FUNC_LESS] = LLVMRealOLT,
      [PIPE_FUNC_EQUAL] = LLVMRealOEQ,
      [PIPE_FUNC_LEQUAL] = LLVMRealOLE,
      [PIPE_FUNC_GREATER] = LLVMRealOGT,
      [PIPE_FUNC_NOTEQUAL] = LLVMRealONE,
      [PIPE_FUNC_GEQUAL] = LLVMRealOGE,
   };
   LLVMRealPredicate cond = cond_map[ctx->shader->key.ps.part.epilog.alpha_func];
   assert(cond);

   /* The reference value must have the same type as the alpha being tested. */
   LLVMValueRef alpha_ref = ac_get_arg(&ctx->ac, ctx->args->alpha_reference);
   if (LLVMTypeOf(alpha) == ctx->ac.f16)
      alpha_ref = LLVMBuildFPTrunc(ctx->ac.builder, alpha_ref, ctx->ac.f16, "");

   ac_build_kill_if_false(&ctx->ac,
                          LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, ""));
}
114 
/* Export argument sets collected by the PS epilog before they are emitted. */
struct si_ps_exports {
   /* Number of entries of args[] filled in so far. */
   unsigned num;
   /* Prepared exports; the epilog appends the mrtz export (if any) first,
    * followed by the color exports.
    */
   struct ac_export_args args[10];
};
119 
/* Gather two values into a vector and reinterpret it as v2f16. */
static LLVMValueRef pack_two_16bit(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef pair = ac_build_gather_values(ctx, args, 2);

   return LLVMBuildBitCast(ctx->builder, pair, ctx->v2f16, "");
}
125 
/* Widen a color channel to 32 bits according to its SI_TYPE_* tag.
 *
 * 16-bit floats are extended to f32; 16-bit integers are sign- or
 * zero-extended to i32 and returned through ac_to_float. SI_TYPE_ANY32
 * values are returned untouched. Unknown types yield NULL.
 */
static LLVMValueRef get_color_32bit(struct si_shader_context *ctx, unsigned color_type,
                                    LLVMValueRef value)
{
   if (color_type == SI_TYPE_ANY32)
      return value;

   if (color_type == SI_TYPE_FLOAT16)
      return LLVMBuildFPExt(ctx->ac.builder, value, ctx->ac.f32, "");

   if (color_type == SI_TYPE_INT16 || color_type == SI_TYPE_UINT16) {
      LLVMValueRef as_int = ac_to_integer(&ctx->ac, value);
      LLVMValueRef widened;

      if (color_type == SI_TYPE_INT16)
         widened = LLVMBuildSExt(ctx->ac.builder, as_int, ctx->ac.i32, "");
      else
         widened = LLVMBuildZExt(ctx->ac.builder, as_int, ctx->ac.i32, "");

      return ac_to_float(&ctx->ac, widened);
   }

   return NULL;
}
145 
146 /* Initialize arguments for the shader export intrinsic */
si_llvm_init_ps_export_args(struct si_shader_context * ctx,LLVMValueRef * values,unsigned cbuf,unsigned compacted_mrt_index,unsigned color_type,struct ac_export_args * args)147 static bool si_llvm_init_ps_export_args(struct si_shader_context *ctx, LLVMValueRef *values,
148                                         unsigned cbuf, unsigned compacted_mrt_index,
149                                         unsigned color_type, struct ac_export_args *args)
150 {
151    const union si_shader_key *key = &ctx->shader->key;
152    unsigned col_formats = key->ps.part.epilog.spi_shader_col_format;
153    LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32);
154    unsigned spi_shader_col_format;
155    unsigned chan;
156    bool is_int8, is_int10;
157 
158    assert(cbuf < 8);
159 
160    spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
161    if (spi_shader_col_format == V_028714_SPI_SHADER_ZERO)
162       return false;
163 
164    is_int8 = (key->ps.part.epilog.color_is_int8 >> cbuf) & 0x1;
165    is_int10 = (key->ps.part.epilog.color_is_int10 >> cbuf) & 0x1;
166 
167    /* Default is 0xf. Adjusted below depending on the format. */
168    args->enabled_channels = 0xf; /* writemask */
169 
170    /* Specify whether the EXEC mask represents the valid mask */
171    args->valid_mask = 0;
172 
173    /* Specify whether this is the last export */
174    args->done = 0;
175 
176    /* Specify the target we are exporting */
177    args->target = V_008DFC_SQ_EXP_MRT + compacted_mrt_index;
178 
179    if (key->ps.part.epilog.dual_src_blend_swizzle &&
180        (compacted_mrt_index == 0 || compacted_mrt_index == 1)) {
181       assert(ctx->ac.gfx_level >= GFX11);
182       args->target += 21;
183    }
184 
185    args->compr = false;
186    args->out[0] = f32undef;
187    args->out[1] = f32undef;
188    args->out[2] = f32undef;
189    args->out[3] = f32undef;
190 
191    LLVMValueRef (*packf)(struct ac_llvm_context * ctx, LLVMValueRef args[2]) = NULL;
192    LLVMValueRef (*packi)(struct ac_llvm_context * ctx, LLVMValueRef args[2], unsigned bits,
193                          bool hi) = NULL;
194 
195    switch (spi_shader_col_format) {
196    case V_028714_SPI_SHADER_32_R:
197       args->enabled_channels = 1; /* writemask */
198       args->out[0] = get_color_32bit(ctx, color_type, values[0]);
199       break;
200 
201    case V_028714_SPI_SHADER_32_GR:
202       args->enabled_channels = 0x3; /* writemask */
203       args->out[0] = get_color_32bit(ctx, color_type, values[0]);
204       args->out[1] = get_color_32bit(ctx, color_type, values[1]);
205       break;
206 
207    case V_028714_SPI_SHADER_32_AR:
208       if (ctx->screen->info.gfx_level >= GFX10) {
209          args->enabled_channels = 0x3; /* writemask */
210          args->out[0] = get_color_32bit(ctx, color_type, values[0]);
211          args->out[1] = get_color_32bit(ctx, color_type, values[3]);
212       } else {
213          args->enabled_channels = 0x9; /* writemask */
214          args->out[0] = get_color_32bit(ctx, color_type, values[0]);
215          args->out[3] = get_color_32bit(ctx, color_type, values[3]);
216       }
217       break;
218 
219    case V_028714_SPI_SHADER_FP16_ABGR:
220       if (color_type != SI_TYPE_ANY32)
221          packf = pack_two_16bit;
222       else
223          packf = ac_build_cvt_pkrtz_f16;
224       break;
225 
226    case V_028714_SPI_SHADER_UNORM16_ABGR:
227       if (color_type != SI_TYPE_ANY32)
228          packf = ac_build_cvt_pknorm_u16_f16;
229       else
230          packf = ac_build_cvt_pknorm_u16;
231       break;
232 
233    case V_028714_SPI_SHADER_SNORM16_ABGR:
234       if (color_type != SI_TYPE_ANY32)
235          packf = ac_build_cvt_pknorm_i16_f16;
236       else
237          packf = ac_build_cvt_pknorm_i16;
238       break;
239 
240    case V_028714_SPI_SHADER_UINT16_ABGR:
241       if (color_type != SI_TYPE_ANY32)
242          packf = pack_two_16bit;
243       else
244          packi = ac_build_cvt_pk_u16;
245       break;
246 
247    case V_028714_SPI_SHADER_SINT16_ABGR:
248       if (color_type != SI_TYPE_ANY32)
249          packf = pack_two_16bit;
250       else
251          packi = ac_build_cvt_pk_i16;
252       break;
253 
254    case V_028714_SPI_SHADER_32_ABGR:
255       for (unsigned i = 0; i < 4; i++)
256          args->out[i] = get_color_32bit(ctx, color_type, values[i]);
257       break;
258    }
259 
260    /* Pack f16 or norm_i16/u16. */
261    if (packf) {
262       for (chan = 0; chan < 2; chan++) {
263          LLVMValueRef pack_args[2] = {values[2 * chan], values[2 * chan + 1]};
264          LLVMValueRef packed;
265 
266          packed = packf(&ctx->ac, pack_args);
267          args->out[chan] = ac_to_float(&ctx->ac, packed);
268       }
269    }
270    /* Pack i16/u16. */
271    if (packi) {
272       for (chan = 0; chan < 2; chan++) {
273          LLVMValueRef pack_args[2] = {ac_to_integer(&ctx->ac, values[2 * chan]),
274                                       ac_to_integer(&ctx->ac, values[2 * chan + 1])};
275          LLVMValueRef packed;
276 
277          packed = packi(&ctx->ac, pack_args, is_int8 ? 8 : is_int10 ? 10 : 16, chan == 1);
278          args->out[chan] = ac_to_float(&ctx->ac, packed);
279       }
280    }
281    if (packf || packi) {
282       if (ctx->screen->info.gfx_level >= GFX11)
283          args->enabled_channels = 0x3;
284       else
285          args->compr = 1; /* COMPR flag */
286    }
287 
288    return true;
289 }
290 
/* Apply the epilog color fix-ups to one color output, in place:
 * optional clamping of all four channels, optional alpha-to-one, and
 * (for color output 0 only) the alpha test.
 */
static void si_llvm_build_clamp_alpha_test(struct si_shader_context *ctx,
                                           LLVMValueRef *color, unsigned index)
{
   const union si_shader_key *key = &ctx->shader->key;

   /* Clamp color */
   if (key->ps.part.epilog.clamp_color) {
      for (unsigned c = 0; c < 4; c++)
         color[c] = ac_build_clamp(&ctx->ac, color[c]);
   }

   /* Alpha to one: the constant takes the type of the other channels. */
   if (key->ps.part.epilog.alpha_to_one)
      color[3] = LLVMConstReal(LLVMTypeOf(color[0]), 1);

   /* Alpha test: only color 0 is tested, and ALWAYS needs no code. */
   if (index != 0)
      return;
   if (key->ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS)
      si_alpha_test(ctx, color[3]);
}
309 
/* Append the export(s) for one written color output to "exp".
 *
 * When last_cbuf > 0 (FS_COLOR0_WRITES_ALL_CBUFS), the color is exported to
 * every color buffer from 0 to last_cbuf; otherwise it is exported to color
 * buffer "index" only. Color buffers for which
 * si_llvm_init_ps_export_args() returns false are skipped. The MRT slot of
 * each export is its position among the color exports emitted so far.
 */
static void si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *color, unsigned index,
                                unsigned first_color_export, unsigned color_type,
                                struct si_ps_exports *exp)
{
   const unsigned last_cbuf = ctx->shader->key.ps.part.epilog.last_cbuf;
   unsigned first = index;
   unsigned last = index;

   /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
   if (last_cbuf > 0) {
      assert(exp->num == first_color_export);
      first = 0;
      last = last_cbuf;
   }

   for (unsigned cbuf = first; cbuf <= last; cbuf++) {
      struct ac_export_args *slot = &exp->args[exp->num];

      if (!si_llvm_init_ps_export_args(ctx, color, cbuf, exp->num - first_color_export,
                                       color_type, slot))
         continue;

      assert(slot->enabled_channels);
      exp->num++;
   }
}
335 
336 /**
337  * Return PS outputs in this order:
338  *
339  * v[0:3] = color0.xyzw
340  * v[4:7] = color1.xyzw
341  * ...
342  * vN+0 = Depth
343  * vN+1 = Stencil
344  * vN+2 = SampleMask
345  * vN+3 = SampleMaskIn (used for OpenGL smoothing)
346  *
347  * The alpha-ref SGPR is returned via its original location.
348  */
si_llvm_ps_build_end(struct si_shader_context * ctx)349 void si_llvm_ps_build_end(struct si_shader_context *ctx)
350 {
351    struct si_shader *shader = ctx->shader;
352    struct si_shader_info *info = &shader->selector->info;
353    LLVMBuilderRef builder = ctx->ac.builder;
354    unsigned i, j, vgpr;
355    LLVMValueRef *addrs = ctx->abi.outputs;
356 
357    LLVMValueRef color[8][4] = {};
358    LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
359    LLVMValueRef ret;
360 
361    /* Read the output values. */
362    for (i = 0; i < info->num_outputs; i++) {
363       unsigned semantic = info->output_semantic[i];
364       LLVMTypeRef type = ctx->abi.is_16bit[4 * i] ? ctx->ac.f16 : ctx->ac.f32;
365 
366       switch (semantic) {
367       case FRAG_RESULT_DEPTH:
368          depth = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], "");
369          break;
370       case FRAG_RESULT_STENCIL:
371          stencil = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], "");
372          break;
373       case FRAG_RESULT_SAMPLE_MASK:
374          samplemask = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], "");
375          break;
376       default:
377          if (semantic >= FRAG_RESULT_DATA0 && semantic <= FRAG_RESULT_DATA7) {
378             unsigned index = semantic - FRAG_RESULT_DATA0;
379 
380             for (j = 0; j < 4; j++) {
381                LLVMValueRef ptr = addrs[4 * i + j];
382                type = ctx->abi.is_16bit[4 * i + j] ? ctx->ac.f16 : ctx->ac.f32;
383                LLVMValueRef result = LLVMBuildLoad2(builder, type, ptr, "");
384                color[index][j] = result;
385             }
386          } else {
387             fprintf(stderr, "Warning: Unhandled fs output type:%d\n", semantic);
388          }
389          break;
390       }
391    }
392 
393    /* Fill the return structure. */
394    ret = ctx->return_value;
395 
396    /* Set SGPRs. */
397    ret = LLVMBuildInsertValue(
398       builder, ret, ac_to_integer(&ctx->ac, LLVMGetParam(ctx->main_fn.value, SI_PARAM_ALPHA_REF)),
399       SI_SGPR_ALPHA_REF, "");
400 
401    /* Set VGPRs */
402    vgpr = SI_SGPR_ALPHA_REF + 1;
403    for (i = 0; i < ARRAY_SIZE(color); i++) {
404       if (!color[i][0])
405          continue;
406 
407       if (LLVMTypeOf(color[i][0]) == ctx->ac.f16) {
408          for (j = 0; j < 2; j++) {
409             LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, &color[i][j * 2], 2);
410             tmp = LLVMBuildBitCast(builder, tmp, ctx->ac.f32, "");
411             ret = LLVMBuildInsertValue(builder, ret, tmp, vgpr++, "");
412          }
413          vgpr += 2;
414       } else {
415          for (j = 0; j < 4; j++)
416             ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
417       }
418    }
419    if (depth)
420       ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
421    if (stencil)
422       ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
423    if (samplemask)
424       ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
425 
426    ctx->return_value = ret;
427 }
428 
si_llvm_emit_polygon_stipple(struct si_shader_context * ctx)429 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx)
430 {
431    LLVMBuilderRef builder = ctx->ac.builder;
432    LLVMValueRef desc, offset, row, bit, address[2];
433 
434    /* Use the fixed-point gl_FragCoord input.
435     * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
436     * per coordinate to get the repeating effect.
437     */
438    address[0] = si_unpack_param(ctx, ctx->args->ac.pos_fixed_pt, 0, 5);
439    address[1] = si_unpack_param(ctx, ctx->args->ac.pos_fixed_pt, 16, 5);
440 
441    /* Load the buffer descriptor. */
442    desc = si_prolog_get_internal_binding_slot(ctx, SI_PS_CONST_POLY_STIPPLE);
443 
444    /* The stipple pattern is 32x32, each row has 32 bits. */
445    offset = LLVMBuildMul(builder, address[1], LLVMConstInt(ctx->ac.i32, 4, 0), "");
446    row = si_buffer_load_const(ctx, desc, offset);
447    row = ac_to_integer(&ctx->ac, row);
448    bit = LLVMBuildLShr(builder, row, address[0], "");
449    bit = LLVMBuildTrunc(builder, bit, ctx->ac.i1, "");
450    ac_build_kill_if_false(&ctx->ac, bit);
451 }
452 
/* Insert "data" into the return value "ret" at the slot(s) that correspond
 * to shader argument "arg_index".
 *
 * SGPR arguments map to their own offset; VGPR arguments are placed after
 * all used SGPRs. An argument of size 1 is inserted as is; otherwise the
 * argument must have size 2 and "data" is a vector whose two elements go
 * into consecutive slots.
 */
static LLVMValueRef insert_ret_of_arg(struct si_shader_context *ctx, LLVMValueRef ret,
                                      LLVMValueRef data, unsigned arg_index)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   const unsigned arg_size = ctx->args->ac.args[arg_index].size;
   unsigned slot = ctx->args->ac.args[arg_index].offset;

   if (ctx->args->ac.args[arg_index].file == AC_ARG_VGPR)
      slot += ctx->args->ac.num_sgprs_used;

   if (arg_size == 1)
      return LLVMBuildInsertValue(builder, ret, data, slot, "");

   assert(arg_size == 2);

   LLVMValueRef elem = LLVMBuildExtractElement(builder, data, ctx->ac.i32_0, "");
   ret = LLVMBuildInsertValue(builder, ret, elem, slot, "");
   elem = LLVMBuildExtractElement(builder, data, ctx->ac.i32_1, "");
   return LLVMBuildInsertValue(builder, ret, elem, slot + 1, "");
}
471 
/**
 * Build the pixel shader prolog function. This handles:
 * - two-side color selection and interpolation
 * - overriding interpolation parameters for the API PS
 * - polygon stippling
 * - masking the input sample coverage per shader invocation
 *
 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
 * overridden by other states. (e.g. per-sample interpolation)
 * Interpolated colors are stored after the preloaded VGPRs.
 *
 * The overrides are applied to the return value in a fixed order
 * (bc_optimize, forced per-sample, forced center), and the color
 * interpolation reads the (i,j) weights back from the return value, so the
 * order of the sections below matters.
 *
 * @param ctx  shader context; ctx->args is filled in by this function
 * @param key  shader part key; only key->ps_prolog is read here
 */
void si_llvm_build_ps_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
{
   struct si_shader_args *args = ctx->args;
   si_get_ps_prolog_args(args, key);

   /* Declare outputs (same as inputs + add colors if needed) */
   LLVMTypeRef return_types[AC_MAX_ARGS];
   int num_returns = 0;

   /* SGPRs are returned as i32 ... */
   for (int i = 0; i < args->ac.num_sgprs_used; i++)
      return_types[num_returns++] = ctx->ac.i32;

   /* ... followed by the VGPRs as f32, plus one f32 per color channel read. */
   unsigned num_color_channels = util_bitcount(key->ps_prolog.colors_read);
   unsigned num_output_vgprs = args->ac.num_vgprs_used + num_color_channels;
   for (int i = 0; i < num_output_vgprs; i++)
      return_types[num_returns++] = ctx->ac.f32;

   /* Create the function. */
   si_llvm_create_func(ctx, "ps_prolog", return_types, num_returns, 0);
   LLVMValueRef func = ctx->main_fn.value;

   /* Copy inputs to outputs. This should be no-op, as the registers match,
    * but it will prevent the compiler from overwriting them unintentionally.
    */
   LLVMValueRef ret = ctx->return_value;
   for (int i = 0; i < args->ac.arg_count; i++) {
      LLVMValueRef p = LLVMGetParam(func, i);
      ret = insert_ret_of_arg(ctx, ret, p, i);
   }

   /* Polygon stippling. */
   if (key->ps_prolog.states.poly_stipple)
      si_llvm_emit_polygon_stipple(ctx);

   if (key->ps_prolog.states.bc_optimize_for_persp ||
       key->ps_prolog.states.bc_optimize_for_linear) {
      LLVMValueRef center, centroid, tmp;

      /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
       * The hw doesn't compute CENTROID if the whole wave only
       * contains fully-covered quads.
       */
      LLVMValueRef bc_optimize = ac_get_arg(&ctx->ac, args->ac.prim_mask);
      /* Extract bit 31 of PRIM_MASK as an i1. */
      bc_optimize =
         LLVMBuildLShr(ctx->ac.builder, bc_optimize, LLVMConstInt(ctx->ac.i32, 31, 0), "");
      bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize, ctx->ac.i1, "");

      if (key->ps_prolog.states.bc_optimize_for_persp) {
         center = ac_get_arg(&ctx->ac, args->ac.persp_center);
         centroid = ac_get_arg(&ctx->ac, args->ac.persp_centroid);
         /* Select PERSP_CENTROID. */
         tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, center, centroid, "");
         ret = insert_ret_of_arg(ctx, ret, tmp, args->ac.persp_centroid.arg_index);
      }
      if (key->ps_prolog.states.bc_optimize_for_linear) {
         center = ac_get_arg(&ctx->ac, args->ac.linear_center);
         centroid = ac_get_arg(&ctx->ac, args->ac.linear_centroid);
         /* Select LINEAR_CENTROID. */
         tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, center, centroid, "");
         ret = insert_ret_of_arg(ctx, ret, tmp, args->ac.linear_centroid.arg_index);
      }
   }

   /* Force per-sample interpolation. */
   if (key->ps_prolog.states.force_persp_sample_interp) {
      LLVMValueRef persp_sample = ac_get_arg(&ctx->ac, args->ac.persp_sample);
      /* Overwrite PERSP_CENTER. */
      ret = insert_ret_of_arg(ctx, ret, persp_sample, args->ac.persp_center.arg_index);
      /* Overwrite PERSP_CENTROID. */
      ret = insert_ret_of_arg(ctx, ret, persp_sample, args->ac.persp_centroid.arg_index);
   }
   if (key->ps_prolog.states.force_linear_sample_interp) {
      LLVMValueRef linear_sample = ac_get_arg(&ctx->ac, args->ac.linear_sample);
      /* Overwrite LINEAR_CENTER. */
      ret = insert_ret_of_arg(ctx, ret, linear_sample, args->ac.linear_center.arg_index);
      /* Overwrite LINEAR_CENTROID. */
      ret = insert_ret_of_arg(ctx, ret, linear_sample, args->ac.linear_centroid.arg_index);
   }

   /* Force center interpolation. */
   if (key->ps_prolog.states.force_persp_center_interp) {
      LLVMValueRef persp_center = ac_get_arg(&ctx->ac, args->ac.persp_center);
      /* Overwrite PERSP_SAMPLE. */
      ret = insert_ret_of_arg(ctx, ret, persp_center, args->ac.persp_sample.arg_index);
      /* Overwrite PERSP_CENTROID. */
      ret = insert_ret_of_arg(ctx, ret, persp_center, args->ac.persp_centroid.arg_index);
   }
   if (key->ps_prolog.states.force_linear_center_interp) {
      LLVMValueRef linear_center = ac_get_arg(&ctx->ac, args->ac.linear_center);
      /* Overwrite LINEAR_SAMPLE. */
      ret = insert_ret_of_arg(ctx, ret, linear_center, args->ac.linear_sample.arg_index);
      /* Overwrite LINEAR_CENTROID. */
      ret = insert_ret_of_arg(ctx, ret, linear_center, args->ac.linear_centroid.arg_index);
   }

   /* Interpolate colors. The results are appended after all input
    * SGPRs/VGPRs, one return slot per color channel that is read.
    */
   unsigned color_out_idx = 0;
   unsigned num_input_gprs = args->ac.num_sgprs_used + args->ac.num_vgprs_used;
   for (int i = 0; i < 2; i++) {
      /* 4 bits of colors_read per color: COLOR0 in bits 0-3, COLOR1 in 4-7. */
      unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;

      if (!writemask)
         continue;

      /* If the interpolation qualifier is not CONSTANT (-1). */
      LLVMValueRef interp_ij = NULL;
      if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
         unsigned index =
            args->ac.num_sgprs_used + key->ps_prolog.color_interp_vgpr_index[i];

         /* Get the (i,j) updated by bc_optimize handling. */
         LLVMValueRef interp[2] = {
            LLVMBuildExtractValue(ctx->ac.builder, ret, index, ""),
            LLVMBuildExtractValue(ctx->ac.builder, ret, index + 1, ""),
         };
         interp_ij = ac_build_gather_values(&ctx->ac, interp, 2);
      }

      LLVMValueRef prim_mask = ac_get_arg(&ctx->ac, args->ac.prim_mask);

      /* The face input is only needed for two-sided color selection. */
      LLVMValueRef face = NULL;
      if (key->ps_prolog.states.color_two_side) {
         face = ac_get_arg(&ctx->ac, args->ac.front_face);
         face = ac_to_integer(&ctx->ac, face);
      }

      LLVMValueRef color[4];
      interp_fs_color(ctx, key->ps_prolog.color_attr_index[i], i, key->ps_prolog.num_interp_inputs,
                      key->ps_prolog.colors_read, interp_ij, prim_mask, face, color);

      /* Only the channels that are actually read get a return slot. */
      while (writemask) {
         unsigned chan = u_bit_scan(&writemask);
         ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan],
                                    num_input_gprs + color_out_idx++, "");
      }
   }

   /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
    * says:
    *
    *    "When per-sample shading is active due to the use of a fragment
    *     input qualified by sample or due to the use of the gl_SampleID
    *     or gl_SamplePosition variables, only the bit for the current
    *     sample is set in gl_SampleMaskIn. When state specifies multiple
    *     fragment shader invocations for a given fragment, the sample
    *     mask for any single fragment shader invocation may specify a
    *     subset of the covered samples for the fragment. In this case,
    *     the bit corresponding to each covered sample will be set in
    *     exactly one fragment shader invocation."
    *
    * The samplemask loaded by hardware is always the coverage of the
    * entire pixel/fragment, so mask bits out based on the sample ID.
    */
   if (key->ps_prolog.states.samplemask_log_ps_iter) {
      uint32_t ps_iter_mask =
         ac_get_ps_iter_mask(1 << key->ps_prolog.states.samplemask_log_ps_iter);
      /* NOTE(review): presumably the sample ID is the 4-bit field at bit 8
       * of the ancillary input - confirm against si_unpack_param.
       */
      LLVMValueRef sampleid = si_unpack_param(ctx, args->ac.ancillary, 8, 4);
      LLVMValueRef samplemask = ac_get_arg(&ctx->ac, args->ac.sample_coverage);

      /* coverage &= ps_iter_mask << sampleid, done on the integer view. */
      samplemask = ac_to_integer(&ctx->ac, samplemask);
      samplemask =
         LLVMBuildAnd(ctx->ac.builder, samplemask,
                      LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, ps_iter_mask, false),
                                   sampleid, ""),
                      "");
      samplemask = ac_to_float(&ctx->ac, samplemask);

      ret = insert_ret_of_arg(ctx, ret, samplemask, args->ac.sample_coverage.arg_index);
   }

   /* Tell LLVM to insert WQM instruction sequence when needed. */
   if (key->ps_prolog.wqm) {
      LLVMAddTargetDependentFunctionAttr(func, "amdgpu-ps-wqm-outputs", "");
   }

   si_llvm_build_ret(ctx, ret);
}
659 
660 /**
661  * Build the pixel shader epilog function. This handles everything that must be
662  * emulated for pixel shader exports. (alpha-test, format conversions, etc)
663  */
si_llvm_build_ps_epilog(struct si_shader_context * ctx,union si_shader_part_key * key)664 void si_llvm_build_ps_epilog(struct si_shader_context *ctx, union si_shader_part_key *key)
665 {
666    int i;
667    struct si_ps_exports exp = {};
668    LLVMValueRef color[8][4] = {};
669 
670    struct si_shader_args *args = ctx->args;
671    struct ac_arg color_args[MAX_DRAW_BUFFERS];
672    struct ac_arg depth_arg, stencil_arg, samplemask_arg;
673    si_get_ps_epilog_args(args, key, color_args, &depth_arg, &stencil_arg, &samplemask_arg);
674 
675    /* Create the function. */
676    si_llvm_create_func(ctx, "ps_epilog", NULL, 0, 0);
677    /* Disable elimination of unused inputs. */
678    ac_llvm_add_target_dep_function_attr(ctx->main_fn.value, "InitialPSInputAddr", 0xffffff);
679 
680    /* Prepare color. */
681    unsigned colors_written = key->ps_epilog.colors_written;
682 
683    while (colors_written) {
684       int write_i = u_bit_scan(&colors_written);
685       unsigned color_type = (key->ps_epilog.color_types >> (write_i * 2)) & 0x3;
686       LLVMValueRef arg = ac_get_arg(&ctx->ac, color_args[write_i]);
687 
688       if (color_type != SI_TYPE_ANY32)
689          arg = LLVMBuildBitCast(ctx->ac.builder, arg, LLVMVectorType(ctx->ac.f16, 8), "");
690 
691       for (i = 0; i < 4; i++)
692          color[write_i][i] = ac_llvm_extract_elem(&ctx->ac, arg, i);
693 
694       si_llvm_build_clamp_alpha_test(ctx, color[write_i], write_i);
695    }
696 
697    LLVMValueRef mrtz_alpha =
698       key->ps_epilog.states.alpha_to_coverage_via_mrtz ? color[0][3] : NULL;
699 
700    /* Prepare the mrtz export. */
701    if (key->ps_epilog.writes_z ||
702        key->ps_epilog.writes_stencil ||
703        key->ps_epilog.writes_samplemask ||
704        mrtz_alpha) {
705       LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
706 
707       if (key->ps_epilog.writes_z)
708          depth = ac_get_arg(&ctx->ac, depth_arg);
709       if (key->ps_epilog.writes_stencil)
710          stencil = ac_get_arg(&ctx->ac, stencil_arg);
711       if (key->ps_epilog.writes_samplemask)
712          samplemask = ac_get_arg(&ctx->ac, samplemask_arg);
713 
714       ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, mrtz_alpha, false,
715                       &exp.args[exp.num++]);
716    }
717 
718    /* Prepare color exports. */
719    const unsigned first_color_export = exp.num;
720    colors_written = key->ps_epilog.colors_written;
721 
722    while (colors_written) {
723       int write_i = u_bit_scan(&colors_written);
724       unsigned color_type = (key->ps_epilog.color_types >> (write_i * 2)) & 0x3;
725 
726       si_export_mrt_color(ctx, color[write_i], write_i, first_color_export, color_type, &exp);
727    }
728 
729    if (exp.num) {
730       exp.args[exp.num - 1].valid_mask = 1;  /* whether the EXEC mask is valid */
731       exp.args[exp.num - 1].done = 1;        /* DONE bit */
732 
733       if (key->ps_epilog.states.dual_src_blend_swizzle) {
734          assert(ctx->ac.gfx_level >= GFX11);
735          assert((key->ps_epilog.colors_written & 0x3) == 0x3);
736          ac_build_dual_src_blend_swizzle(&ctx->ac, &exp.args[first_color_export],
737                                          &exp.args[first_color_export + 1]);
738       }
739 
740       for (unsigned i = 0; i < exp.num; i++)
741          ac_build_export(&ctx->ac, &exp.args[i]);
742    } else {
743       ac_build_export_null(&ctx->ac, key->ps_epilog.uses_discard);
744    }
745 
746    /* Compile. */
747    LLVMBuildRetVoid(ctx->ac.builder);
748 }
749 
750