xref: /aosp_15_r20/external/mesa3d/src/amd/llvm/ac_llvm_build.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2014 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 /* based on pieces from si_pipe.c and radeon_llvm_emit.c */
7 #include "ac_llvm_build.h"
8 #include "ac_gpu_info.h"
9 #include "ac_nir.h"
10 #include "ac_llvm_util.h"
11 #include "ac_shader_util.h"
12 #include "c11/threads.h"
13 #include "shader_enums.h"
14 #include "sid.h"
15 #include "util/bitscan.h"
16 #include "util/macros.h"
17 #include "util/u_atomic.h"
18 #include "util/u_math.h"
19 #include <llvm-c/Core.h>
20 #include <llvm/Config/llvm-config.h>
21 
22 #include <assert.h>
23 #include <stdio.h>
24 
25 #define AC_LLVM_INITIAL_CF_DEPTH 4
26 
27 /* Data for if/else/endif and bgnloop/endloop control flow structures.
28  */
29 struct ac_llvm_flow {
30    /* Loop exit or next part of if/else/endif. */
31    LLVMBasicBlockRef next_block;
32    LLVMBasicBlockRef loop_entry_block;
33 };
34 
35 /* Initialize module-independent parts of the context.
36  *
37  * The caller is responsible for initializing ctx::module and ctx::builder.
38  */
ac_llvm_context_init(struct ac_llvm_context * ctx,struct ac_llvm_compiler * compiler,const struct radeon_info * info,enum ac_float_mode float_mode,unsigned wave_size,unsigned ballot_mask_bits,bool exports_color_null,bool exports_mrtz)39 void ac_llvm_context_init(struct ac_llvm_context *ctx, struct ac_llvm_compiler *compiler,
40                           const struct radeon_info *info, enum ac_float_mode float_mode,
41                           unsigned wave_size, unsigned ballot_mask_bits, bool exports_color_null,
42                           bool exports_mrtz)
43 {
44    ctx->context = LLVMContextCreate();
45 
46    ctx->info = info;
47    ctx->gfx_level = info->gfx_level;
48    ctx->wave_size = wave_size;
49    ctx->ballot_mask_bits = ballot_mask_bits;
50    ctx->float_mode = float_mode;
51    ctx->exports_color_null = exports_color_null;
52    ctx->exports_mrtz = exports_mrtz;
53    ctx->module = ac_create_module(compiler->tm, ctx->context);
54    ctx->builder = ac_create_builder(ctx->context, float_mode);
55 
56    ctx->voidt = LLVMVoidTypeInContext(ctx->context);
57    ctx->i1 = LLVMInt1TypeInContext(ctx->context);
58    ctx->i8 = LLVMInt8TypeInContext(ctx->context);
59    ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
60    ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
61    ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
62    ctx->i128 = LLVMIntTypeInContext(ctx->context, 128);
63    ctx->intptr = ctx->i32;
64    ctx->f16 = LLVMHalfTypeInContext(ctx->context);
65    ctx->f32 = LLVMFloatTypeInContext(ctx->context);
66    ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
67    ctx->v4i8 = LLVMVectorType(ctx->i8, 4);
68    ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
69    ctx->v4i16 = LLVMVectorType(ctx->i16, 4);
70    ctx->v2f16 = LLVMVectorType(ctx->f16, 2);
71    ctx->v4f16 = LLVMVectorType(ctx->f16, 4);
72    ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
73    ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
74    ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
75    ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
76    ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
77    ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
78    ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
79    ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size);
80    ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits);
81 
82    ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
83    ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
84    ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
85    ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
86    ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
87    ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
88    ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
89    ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
90    ctx->i128_0 = LLVMConstInt(ctx->i128, 0, false);
91    ctx->i128_1 = LLVMConstInt(ctx->i128, 1, false);
92    ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
93    ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
94    ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
95    ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
96    ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
97    ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);
98 
99    ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
100    ctx->i1true = LLVMConstInt(ctx->i1, 1, false);
101 
102    ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, "range", 5);
103    ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, "invariant.load", 14);
104    ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14);
105    ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6);
106 
107    ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
108 
109    LLVMValueRef three = LLVMConstReal(ctx->f32, 3);
110    ctx->three_md = LLVMMDNodeInContext(ctx->context, &three, 1);
111 
112    ctx->flow = calloc(1, sizeof(*ctx->flow));
113 
114    ctx->ring_offsets_index = INT32_MAX;
115 }
116 
ac_llvm_context_dispose(struct ac_llvm_context * ctx)117 void ac_llvm_context_dispose(struct ac_llvm_context *ctx)
118 {
119    free(ctx->flow->stack);
120    free(ctx->flow);
121    ctx->flow = NULL;
122 
123    LLVMDisposeBuilder(ctx->builder);
124 }
125 
/* Return the number of components of a value: the vector size for vector
 * values, 1 for everything else.
 */
int ac_get_llvm_num_components(LLVMValueRef value)
{
   LLVMTypeRef value_type = LLVMTypeOf(value);

   if (LLVMGetTypeKind(value_type) != LLVMVectorTypeKind)
      return 1;

   return LLVMGetVectorSize(value_type);
}
133 
/* Extract component `index` from `value`.
 *
 * A non-vector value is returned unchanged; in that case only index 0 is
 * valid (checked with an assertion).
 */
LLVMValueRef ac_llvm_extract_elem(struct ac_llvm_context *ac, LLVMValueRef value, int index)
{
   const bool is_vector = LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind;

   if (is_vector) {
      LLVMValueRef lane = LLVMConstInt(ac->i32, index, false);
      return LLVMBuildExtractElement(ac->builder, value, lane, "");
   }

   assert(index == 0);
   return value;
}
143 
/* Return the bit width of a scalar type, or of the element type when `type`
 * is a vector.
 *
 * Handled: integers (their declared width), pointers into the LDS address
 * space (32), and the f16/f32/f64 types cached in ctx. Anything else is
 * treated as unreachable.
 */
int ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
{
   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
      type = LLVMGetElementType(type);

   switch (LLVMGetTypeKind(type)) {
   case LLVMIntegerTypeKind:
      return LLVMGetIntTypeWidth(type);
   case LLVMPointerTypeKind:
      if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_LDS)
         return 32;
      /* Other address spaces fall through to the checks below. */
      break;
   default:
      break;
   }

   if (type == ctx->f16)
      return 16;
   if (type == ctx->f32)
      return 32;
   if (type == ctx->f64)
      return 64;

   unreachable("Unhandled type kind in get_elem_bits");
}
166 
/* Return the size of an LLVM type in bytes.
 *
 * Integers report width / 8 (so types narrower than 8 bits report 0),
 * pointers are 4 bytes in the 32-bit constant address space and 8 bytes
 * otherwise, and vectors/arrays are element count times element size.
 * Unsupported kinds assert and return 0.
 */
unsigned ac_get_type_size(LLVMTypeRef type)
{
   const LLVMTypeKind kind = LLVMGetTypeKind(type);

   if (kind == LLVMIntegerTypeKind)
      return LLVMGetIntTypeWidth(type) / 8;
   if (kind == LLVMHalfTypeKind)
      return 2;
   if (kind == LLVMFloatTypeKind)
      return 4;
   if (kind == LLVMDoubleTypeKind)
      return 8;
   if (kind == LLVMPointerTypeKind)
      return LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT ? 4 : 8;
   if (kind == LLVMVectorTypeKind)
      return LLVMGetVectorSize(type) * ac_get_type_size(LLVMGetElementType(type));
   if (kind == LLVMArrayTypeKind)
      return LLVMGetArrayLength(type) * ac_get_type_size(LLVMGetElementType(type));

   assert(0);
   return 0;
}
193 
/* Map a scalar type to the integer type of the same width.
 *
 * i1 and i8 map to themselves; 16/32/64-bit integer and float types map to
 * i16/i32/i64. Any other type is treated as unreachable.
 */
static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i1 || t == ctx->i8)
      return t;
   if (t == ctx->i16 || t == ctx->f16)
      return ctx->i16;
   if (t == ctx->i32 || t == ctx->f32)
      return ctx->i32;
   if (t == ctx->i64 || t == ctx->f64)
      return ctx->i64;

   unreachable("Unhandled integer size");
}
209 
/* Return the integer type matching `t`.
 *
 * Vectors keep their length and get an integer element type. Pointers map
 * to i64 (global and constant address spaces) or i32 (32-bit constant and
 * LDS address spaces); other address spaces are treated as unreachable.
 * Scalars are converted by to_integer_type_scalar().
 */
LLVMTypeRef ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   const LLVMTypeKind kind = LLVMGetTypeKind(t);

   if (kind == LLVMVectorTypeKind) {
      LLVMTypeRef int_elem = to_integer_type_scalar(ctx, LLVMGetElementType(t));
      return LLVMVectorType(int_elem, LLVMGetVectorSize(t));
   }

   if (kind != LLVMPointerTypeKind)
      return to_integer_type_scalar(ctx, t);

   const unsigned addr_space = LLVMGetPointerAddressSpace(t);

   if (addr_space == AC_ADDR_SPACE_GLOBAL || addr_space == AC_ADDR_SPACE_CONST)
      return ctx->i64;
   if (addr_space == AC_ADDR_SPACE_CONST_32BIT || addr_space == AC_ADDR_SPACE_LDS)
      return ctx->i32;

   unreachable("unhandled address space");
}
230 
/* Convert a value to its integer equivalent (see ac_to_integer_type()):
 * pointers are converted with ptrtoint, everything else with a bitcast.
 */
LLVMValueRef ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef src_type = LLVMTypeOf(v);
   LLVMTypeRef int_type = ac_to_integer_type(ctx, src_type);

   if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
      return LLVMBuildPtrToInt(ctx->builder, v, int_type, "");

   return LLVMBuildBitCast(ctx->builder, v, int_type, "");
}
239 
/* Like ac_to_integer(), except that pointer values are returned untouched. */
LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   const bool is_pointer = LLVMGetTypeKind(LLVMTypeOf(v)) == LLVMPointerTypeKind;

   return is_pointer ? v : ac_to_integer(ctx, v);
}
247 
/* Map a scalar type to the floating-point type of the same width.
 *
 * 16/32/64-bit integer and float types map to f16/f32/f64. i8 has no float
 * counterpart and is returned as is. Any other type is treated as
 * unreachable.
 */
static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i8)
      return t;
   if (t == ctx->f16 || t == ctx->i16)
      return ctx->f16;
   if (t == ctx->f32 || t == ctx->i32)
      return ctx->f32;
   if (t == ctx->f64 || t == ctx->i64)
      return ctx->f64;

   unreachable("Unhandled float size");
}
261 
/* Return the floating-point type matching `t`. Vectors keep their length
 * and get their element type converted; scalars are converted directly.
 */
LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (LLVMGetTypeKind(t) != LLVMVectorTypeKind)
      return to_float_type_scalar(ctx, t);

   LLVMTypeRef float_elem = to_float_type_scalar(ctx, LLVMGetElementType(t));
   return LLVMVectorType(float_elem, LLVMGetVectorSize(t));
}
270 
/* Bitcast a value to the floating-point type given by ac_to_float_type(). */
LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef float_type = ac_to_float_type(ctx, LLVMTypeOf(v));

   return LLVMBuildBitCast(ctx->builder, v, float_type, "");
}
276 
/* Emit a call to the function (typically an LLVM intrinsic) called `name`.
 *
 * The function type is derived from `return_type` and the types of the
 * `param_count` values in `params` (at most 32, all non-NULL). If the module
 * has no function of that name yet, it is declared with the C calling
 * convention and external linkage.
 *
 * `attrib_mask` is a combination of AC_ATTR_* flags:
 *  - AC_ATTR_INVARIANT_LOAD attaches empty "invariant.load" metadata,
 *  - AC_ATTR_CONVERGENT adds the "convergent" call-site attribute.
 * "nounwind" is always added. Returns the call instruction.
 */
LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
                                LLVMTypeRef return_type, LLVMValueRef *params, unsigned param_count,
                                unsigned attrib_mask)
{
   LLVMTypeRef arg_types[32];

   assert(param_count <= 32);
   for (unsigned arg = 0; arg < param_count; ++arg) {
      assert(params[arg]);
      arg_types[arg] = LLVMTypeOf(params[arg]);
   }

   LLVMTypeRef fn_type = LLVMFunctionType(return_type, arg_types, param_count, 0);

   /* Reuse an existing declaration, otherwise create one. */
   LLVMValueRef fn = LLVMGetNamedFunction(ctx->module, name);
   if (fn == NULL) {
      fn = LLVMAddFunction(ctx->module, name, fn_type);

      LLVMSetFunctionCallConv(fn, LLVMCCallConv);
      LLVMSetLinkage(fn, LLVMExternalLinkage);
   }

   LLVMValueRef call_inst = LLVMBuildCall2(ctx->builder, fn_type, fn, params, param_count, "");

   if (attrib_mask & AC_ATTR_INVARIANT_LOAD)
      LLVMSetMetadata(call_inst, ctx->invariant_load_md_kind, ctx->empty_md);

   /* Index -1 addresses the function itself rather than a parameter. */
   if (attrib_mask & AC_ATTR_CONVERGENT)
      LLVMAddCallSiteAttribute(call_inst, -1, ac_get_llvm_attribute(ctx->context, "convergent"));

   LLVMAddCallSiteAttribute(call_inst, -1, ac_get_llvm_attribute(ctx->context, "nounwind"));

   return call_inst;
}
311 
312 /**
313  * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
314  * intrinsic names).
315  */
ac_build_type_name_for_intr(LLVMTypeRef type,char * buf,unsigned bufsize)316 void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
317 {
318    LLVMTypeRef elem_type = type;
319 
320    if (LLVMGetTypeKind(type) == LLVMStructTypeKind) {
321       unsigned count = LLVMCountStructElementTypes(type);
322       int ret = snprintf(buf, bufsize, "sl_");
323       buf += ret;
324       bufsize -= ret;
325 
326       LLVMTypeRef *elems = alloca(count * sizeof(LLVMTypeRef));
327       LLVMGetStructElementTypes(type, elems);
328 
329       for (unsigned i = 0; i < count; i++) {
330          ac_build_type_name_for_intr(elems[i], buf, bufsize);
331          ret = strlen(buf);
332          buf += ret;
333          bufsize -= ret;
334       }
335 
336       snprintf(buf, bufsize, "s");
337       return;
338    }
339 
340    assert(bufsize >= 8);
341    if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
342       int ret = snprintf(buf, bufsize, "v%u", LLVMGetVectorSize(type));
343       if (ret < 0) {
344          char *type_name = LLVMPrintTypeToString(type);
345          fprintf(stderr, "Error building type name for: %s\n", type_name);
346          LLVMDisposeMessage(type_name);
347          return;
348       }
349       elem_type = LLVMGetElementType(type);
350       buf += ret;
351       bufsize -= ret;
352    }
353    switch (LLVMGetTypeKind(elem_type)) {
354    default:
355       break;
356    case LLVMIntegerTypeKind:
357       snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
358       break;
359    case LLVMHalfTypeKind:
360       snprintf(buf, bufsize, "f16");
361       break;
362    case LLVMFloatTypeKind:
363       snprintf(buf, bufsize, "f32");
364       break;
365    case LLVMDoubleTypeKind:
366       snprintf(buf, bufsize, "f64");
367       break;
368    }
369 }
370 
371 /**
372  * Helper function that builds an LLVM IR PHI node and immediately adds
373  * incoming edges.
374  */
ac_build_phi(struct ac_llvm_context * ctx,LLVMTypeRef type,unsigned count_incoming,LLVMValueRef * values,LLVMBasicBlockRef * blocks)375 LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigned count_incoming,
376                           LLVMValueRef *values, LLVMBasicBlockRef *blocks)
377 {
378    LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
379    LLVMAddIncoming(phi, values, blocks, count_incoming);
380    return phi;
381 }
382 
/* Emit llvm.amdgcn.s.barrier, except for tessellation control shaders on
 * GFX6.
 */
void ac_build_s_barrier(struct ac_llvm_context *ctx, gl_shader_stage stage)
{
   /* GFX6 only: s_barrier is not needed in TCS because an entire patch
    * always fits into a single wave due to a bug workaround disallowing
    * multi-wave HS workgroups.
    */
   const bool barrier_not_needed = ctx->gfx_level == GFX6 && stage == MESA_SHADER_TESS_CTRL;

   if (!barrier_not_needed)
      ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0, 0);
}
393 
394 /* Prevent optimizations (at least of memory accesses) across the current
395  * point in the program by emitting empty inline assembly that is marked as
396  * having side effects.
397  *
398  * Optionally, a value can be passed through the inline assembly to prevent
399  * LLVM from hoisting calls to ReadNone functions.
400  */
ac_build_optimization_barrier(struct ac_llvm_context * ctx,LLVMValueRef * pgpr,bool sgpr)401 void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pgpr, bool sgpr)
402 {
403    static int counter = 0;
404 
405    LLVMBuilderRef builder = ctx->builder;
406    char code[16];
407    const char *constraint = sgpr ? "=s,0" : "=v,0";
408 
409    snprintf(code, sizeof(code), "; %d", (int)p_atomic_inc_return(&counter));
410 
411    if (!pgpr) {
412       LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
413       LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
414       LLVMBuildCall2(builder, ftype, inlineasm, NULL, 0, "");
415    } else {
416       LLVMTypeRef old_type = LLVMTypeOf(*pgpr);
417 
418       if (old_type == ctx->i1)
419          *pgpr = LLVMBuildZExt(builder, *pgpr, ctx->i32, "");
420 
421       if (old_type == LLVMVectorType(ctx->i16, 3))
422          *pgpr = ac_build_expand_to_vec4(ctx, *pgpr, 4);
423 
424       LLVMTypeRef type = LLVMTypeOf(*pgpr);
425       LLVMTypeRef ftype = LLVMFunctionType(type, &type, 1, false);
426       LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);
427 
428       *pgpr = LLVMBuildCall2(builder, ftype, inlineasm, pgpr, 1, "");
429 
430       if (old_type == ctx->i1)
431          *pgpr = LLVMBuildTrunc(builder, *pgpr, old_type, "");
432 
433       if (old_type == LLVMVectorType(ctx->i16, 3))
434          *pgpr = ac_extract_components(ctx, *pgpr, 0, 3);
435    }
436 }
437 
ac_build_shader_clock(struct ac_llvm_context * ctx,mesa_scope scope)438 LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, mesa_scope scope)
439 {
440    if (ctx->gfx_level >= GFX11 && scope == SCOPE_DEVICE) {
441       const char *name = "llvm.amdgcn.s.sendmsg.rtn.i64";
442       LLVMValueRef arg = LLVMConstInt(ctx->i32, 0x83 /* realtime */, 0);
443       LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, &arg, 1, 0);
444       return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
445    }
446 
447    const char *subgroup = "llvm.readcyclecounter";
448    const char *name = scope == SCOPE_DEVICE ? "llvm.amdgcn.s.memrealtime" : subgroup;
449 
450    LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, NULL, 0, 0);
451    return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
452 }
453 
/* Build a wave-wide mask from `value` by comparing it against zero ("not
 * equal") with llvm.amdgcn.icmp. The result has the iN_wavemask type.
 *
 * i1 inputs are zero-extended to i32 first; the value is converted to an
 * integer before the comparison.
 */
LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef src = value;

   if (LLVMTypeOf(src) == ctx->i1)
      src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   const char *intr_name =
      ctx->wave_size == 64 ? "llvm.amdgcn.icmp.i64.i32" : "llvm.amdgcn.icmp.i32.i32";
   LLVMValueRef predicate = LLVMConstInt(ctx->i32, LLVMIntNE, 0);

   /* We currently have no other way to prevent LLVM from lifting the icmp
    * calls to a dominating basic block.
    */
   ac_build_optimization_barrier(ctx, &src, false);

   LLVMValueRef icmp_args[3];
   icmp_args[0] = ac_to_integer(ctx, src);
   icmp_args[1] = ctx->i32_0;
   icmp_args[2] = predicate;

   return ac_build_intrinsic(ctx, intr_name, ctx->iN_wavemask, icmp_args, 3, 0);
}
477 
/* Turn an i1 value into a wave-wide mask of the iN_wavemask type by
 * comparing it against false ("not equal") with the i1 variant of
 * llvm.amdgcn.icmp.
 */
LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *intr_name =
      ctx->wave_size == 64 ? "llvm.amdgcn.icmp.i64.i1" : "llvm.amdgcn.icmp.i32.i1";

   LLVMValueRef icmp_args[3];
   icmp_args[0] = value;
   icmp_args[1] = ctx->i1false;
   icmp_args[2] = LLVMConstInt(ctx->i32, LLVMIntNE, 0);

   return ac_build_intrinsic(ctx, intr_name, ctx->iN_wavemask, icmp_args, 3, 0);
}
495 
/* Return an i1 that is true when the ballot of `value` equals the ballot of
 * the constant 1, i.e. the mask of all lanes taking part in the ballot.
 */
LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef active_mask = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef vote_mask = ac_build_ballot(ctx, value);

   return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_mask, active_mask, "");
}
502 
/* Return an i1 that is true when the ballot of `value` is non-zero. */
LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef vote_mask = ac_build_ballot(ctx, value);
   LLVMValueRef empty_mask = LLVMConstInt(ctx->iN_wavemask, 0, 0);

   return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_mask, empty_mask, "");
}
509 
/* Return an i1 that is true when the ballot of `value` is either equal to
 * the ballot of the constant 1 or completely zero.
 */
LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMBuilderRef b = ctx->builder;
   LLVMValueRef active_mask = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef vote_mask = ac_build_ballot(ctx, value);
   LLVMValueRef empty_mask = LLVMConstInt(ctx->iN_wavemask, 0, 0);

   LLVMValueRef all_set = LLVMBuildICmp(b, LLVMIntEQ, vote_mask, active_mask, "");
   LLVMValueRef none_set = LLVMBuildICmp(b, LLVMIntEQ, vote_mask, empty_mask, "");

   return LLVMBuildOr(b, all_set, none_set, "");
}
520 
/* Gather values[component] .. values[component + value_count - 1] into a
 * vector of value_count elements.
 *
 * With value_count == 1 the single value is returned as a scalar;
 * value_count == 0 is treated as unreachable.
 */
LLVMValueRef ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                            unsigned value_count, unsigned component)
{
   if (!value_count)
      unreachable("value_count is 0");

   if (value_count == 1)
      return values[component];

   LLVMValueRef *src = values + component;
   LLVMValueRef result = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(src[0]), value_count));

   for (unsigned lane = 0; lane < value_count; lane++) {
      LLVMValueRef lane_index = LLVMConstInt(ctx->i32, lane, false);

      result = LLVMBuildInsertElement(ctx->builder, result, src[lane], lane_index, "");
   }

   return result;
}
541 
/* Gather value_count values, taken from `values` at a stride of
 * value_stride entries, into a vector.
 *
 * A single value is returned as a scalar unless always_vector is set;
 * value_count == 0 is treated as unreachable.
 */
LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                             unsigned value_count, unsigned value_stride,
                                             bool always_vector)
{
   if (!value_count)
      unreachable("value_count is 0");

   if (value_count == 1 && !always_vector)
      return values[0];

   /* The element type is taken from the first gathered value. */
   LLVMValueRef result = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(values[0]), value_count));

   for (unsigned lane = 0; lane < value_count; lane++) {
      LLVMValueRef element = values[lane * value_stride];
      LLVMValueRef lane_index = LLVMConstInt(ctx->i32, lane, false);

      result = LLVMBuildInsertElement(ctx->builder, result, element, lane_index, "");
   }

   return result;
}
565 
/* Gather value_count consecutive values into a vector; a single value is
 * returned as a scalar.
 */
LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                    unsigned value_count)
{
   const unsigned contiguous_stride = 1;
   const bool always_vector = false;

   return ac_build_gather_values_extended(ctx, values, value_count, contiguous_stride,
                                          always_vector);
}
571 
/* Concatenate the components of `a` and `b` (scalars count as one
 * component) into a single vector. If `a` is NULL, `b` is returned as is.
 */
LLVMValueRef ac_build_concat(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   if (!a)
      return b;

   const unsigned num_a = ac_get_llvm_num_components(a);
   const unsigned num_b = ac_get_llvm_num_components(b);
   const unsigned total = num_a + num_b;

   LLVMValueRef *lanes = alloca(total * sizeof(LLVMValueRef));
   unsigned out = 0;

   for (unsigned i = 0; i < num_a; i++)
      lanes[out++] = ac_llvm_extract_elem(ctx, a, i);
   for (unsigned i = 0; i < num_b; i++)
      lanes[out++] = ac_llvm_extract_elem(ctx, b, i);

   return ac_build_gather_values(ctx, lanes, total);
}
588 
589 /* Expand a scalar or vector to <dst_channels x type> by filling the remaining
590  * channels with undef. Extract at most src_channels components from the input.
591  */
ac_build_expand(struct ac_llvm_context * ctx,LLVMValueRef value,unsigned src_channels,unsigned dst_channels)592 LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx, LLVMValueRef value,
593                              unsigned src_channels, unsigned dst_channels)
594 {
595    LLVMTypeRef elemtype;
596    LLVMValueRef *const chan = alloca(dst_channels * sizeof(LLVMValueRef));
597 
598    if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
599       unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
600 
601       if (src_channels == dst_channels && vec_size == dst_channels)
602          return value;
603 
604       src_channels = MIN2(src_channels, vec_size);
605 
606       for (unsigned i = 0; i < src_channels; i++)
607          chan[i] = ac_llvm_extract_elem(ctx, value, i);
608 
609       elemtype = LLVMGetElementType(LLVMTypeOf(value));
610    } else {
611       if (src_channels) {
612          assert(src_channels == 1);
613          chan[0] = value;
614       }
615       elemtype = LLVMTypeOf(value);
616    }
617 
618    for (unsigned i = src_channels; i < dst_channels; i++)
619       chan[i] = LLVMGetUndef(elemtype);
620 
621    return ac_build_gather_values(ctx, chan, dst_channels);
622 }
623 
624 /* Extract components [start, start + channels) from a vector.
625  */
ac_extract_components(struct ac_llvm_context * ctx,LLVMValueRef value,unsigned start,unsigned channels)626 LLVMValueRef ac_extract_components(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned start,
627                                    unsigned channels)
628 {
629    LLVMValueRef *const chan = alloca(channels * sizeof(LLVMValueRef));
630 
631    for (unsigned i = 0; i < channels; i++)
632       chan[i] = ac_llvm_extract_elem(ctx, value, i + start);
633 
634    return ac_build_gather_values(ctx, chan, channels);
635 }
636 
637 /* Expand a scalar or vector to <4 x type> by filling the remaining channels
638  * with undef. Extract at most num_channels components from the input.
639  */
ac_build_expand_to_vec4(struct ac_llvm_context * ctx,LLVMValueRef value,unsigned num_channels)640 LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, LLVMValueRef value,
641                                      unsigned num_channels)
642 {
643    return ac_build_expand(ctx, value, num_channels, 4);
644 }
645 
/* Emit llvm.rint for `value`. The f16/f32/f64 variant is picked from the
 * byte size of the value's type (2, 4, anything else).
 */
LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMTypeRef value_type = LLVMTypeOf(value);
   const char *intr_name;

   switch (ac_get_type_size(value_type)) {
   case 2:
      intr_name = "llvm.rint.f16";
      break;
   case 4:
      intr_name = "llvm.rint.f32";
      break;
   default:
      intr_name = "llvm.rint.f64";
      break;
   }

   return ac_build_intrinsic(ctx, intr_name, value_type, &value, 1, 0);
}
660 
/* Emit a floating-point division as num * rcp(den), using
 * llvm.amdgcn.rcp. The f16/f32/f64 variant is picked from the byte size of
 * the denominator's type (2, 4, anything else).
 */
LLVMValueRef ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den)
{
   LLVMTypeRef den_type = LLVMTypeOf(den);
   const char *rcp_name;

   switch (ac_get_type_size(den_type)) {
   case 2:
      rcp_name = "llvm.amdgcn.rcp.f16";
      break;
   case 4:
      rcp_name = "llvm.amdgcn.rcp.f32";
      break;
   default:
      rcp_name = "llvm.amdgcn.rcp.f64";
      break;
   }

   LLVMValueRef reciprocal = ac_build_intrinsic(ctx, rcp_name, den_type, &den, 1, 0);

   return LLVMBuildFMul(ctx->builder, num, reciprocal, "");
}
678 
679 /* See fast_idiv_by_const.h. */
680 /* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
ac_build_fast_udiv(struct ac_llvm_context * ctx,LLVMValueRef num,LLVMValueRef multiplier,LLVMValueRef pre_shift,LLVMValueRef post_shift,LLVMValueRef increment)681 LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, LLVMValueRef num,
682                                 LLVMValueRef multiplier, LLVMValueRef pre_shift,
683                                 LLVMValueRef post_shift, LLVMValueRef increment)
684 {
685    LLVMBuilderRef builder = ctx->builder;
686 
687    num = LLVMBuildLShr(builder, num, pre_shift, "");
688    num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
689                       LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
690    num = LLVMBuildAdd(builder, num, LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
691    num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
692    num = LLVMBuildTrunc(builder, num, ctx->i32, "");
693    return LLVMBuildLShr(builder, num, post_shift, "");
694 }
695 
696 /* See fast_idiv_by_const.h. */
697 /* If num != UINT_MAX, this more efficient version can be used. */
698 /* Set: increment = util_fast_udiv_info::increment; */
ac_build_fast_udiv_nuw(struct ac_llvm_context * ctx,LLVMValueRef num,LLVMValueRef multiplier,LLVMValueRef pre_shift,LLVMValueRef post_shift,LLVMValueRef increment)699 LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, LLVMValueRef num,
700                                     LLVMValueRef multiplier, LLVMValueRef pre_shift,
701                                     LLVMValueRef post_shift, LLVMValueRef increment)
702 {
703    LLVMBuilderRef builder = ctx->builder;
704 
705    num = LLVMBuildLShr(builder, num, pre_shift, "");
706    num = LLVMBuildNUWAdd(builder, num, increment, "");
707    num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
708                       LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
709    num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
710    num = LLVMBuildTrunc(builder, num, ctx->i32, "");
711    return LLVMBuildLShr(builder, num, post_shift, "");
712 }
713 
714 /* See fast_idiv_by_const.h. */
715 /* Both operands must fit in 31 bits and the divisor must not be 1. */
ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context * ctx,LLVMValueRef num,LLVMValueRef multiplier,LLVMValueRef post_shift)716 LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, LLVMValueRef num,
717                                               LLVMValueRef multiplier, LLVMValueRef post_shift)
718 {
719    LLVMBuilderRef builder = ctx->builder;
720 
721    num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
722                       LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
723    num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
724    num = LLVMBuildTrunc(builder, num, ctx->i32, "");
725    return LLVMBuildLShr(builder, num, post_shift, "");
726 }
727 
/* Emit the intrinsic sequence that interpolates one f32 channel of a
 * fragment shader input with the coordinates (i, j).
 *
 * GFX11 and newer: llvm.amdgcn.lds.param.load(chan, attr, params) followed
 * by llvm.amdgcn.interp.inreg.p10(p, i, p) and
 * llvm.amdgcn.interp.inreg.p2(p, j, p10).
 *
 * Older chips: llvm.amdgcn.interp.p1(i, chan, attr, params) followed by
 * llvm.amdgcn.interp.p2(p1, j, chan, attr, params).
 *
 * Returns the result of the final intrinsic.
 */
LLVMValueRef ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                LLVMValueRef j)
{
   if (ctx->gfx_level < GFX11) {
      LLVMValueRef p1_args[4] = {i, llvm_chan, attr_number, params};
      LLVMValueRef p1 =
         ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", ctx->f32, p1_args, 4, 0);

      LLVMValueRef p2_args[5] = {p1, j, llvm_chan, attr_number, params};
      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", ctx->f32, p2_args, 5, 0);
   }

   LLVMValueRef load_args[3] = {llvm_chan, attr_number, params};
   LLVMValueRef p =
      ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load", ctx->f32, load_args, 3, 0);

   LLVMValueRef p10_args[3] = {p, i, p};
   LLVMValueRef p10 =
      ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p10", ctx->f32, p10_args, 3, 0);

   LLVMValueRef p2_args[3] = {p, j, p10};
   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p2", ctx->f32, p2_args, 3, 0);
}
780 
/* 16-bit variant of ac_build_fs_interp(): emits the ".f16" interpolation
 * intrinsics and returns an f16 value.
 *
 * \p high_16bits is forwarded to the intrinsics as an i1 constant.
 */
LLVMValueRef ac_build_fs_interp_f16(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                    LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                    LLVMValueRef j, bool high_16bits)
{
   LLVMValueRef high = high_16bits ? ctx->i1true : ctx->i1false;

   if (ctx->gfx_level < GFX11) {
      LLVMValueRef p1_args[5] = {i, llvm_chan, attr_number, high, params};
      LLVMValueRef p1 =
         ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", ctx->f32, p1_args, 5, 0);

      LLVMValueRef p2_args[6] = {p1, j, llvm_chan, attr_number, high, params};
      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", ctx->f16, p2_args, 6, 0);
   }

   /* GFX11+: the parameter is loaded as f32; only the final step yields f16. */
   LLVMValueRef load_args[3] = {llvm_chan, attr_number, params};
   LLVMValueRef p =
      ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load", ctx->f32, load_args, 3, 0);

   LLVMValueRef p10_args[4] = {p, i, p, high};
   LLVMValueRef p10 =
      ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p10.f16", ctx->f32, p10_args, 4, 0);

   LLVMValueRef p2_args[4] = {p, j, p10, high};
   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p2.f16", ctx->f16, p2_args, 4, 0);
}
837 
/* Emit a fragment shader input read without interpolation.
 *
 * GFX11+: load the parameter, then broadcast lane \p parameter of each quad
 *         with a quad swizzle, wrapping both steps in llvm.amdgcn.wqm.f32.
 * older:  llvm.amdgcn.interp.mov with (parameter + 2) % 3 as its first operand.
 */
LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, unsigned parameter,
                                    LLVMValueRef llvm_chan, LLVMValueRef attr_number,
                                    LLVMValueRef params)
{
   if (ctx->gfx_level < GFX11) {
      LLVMValueRef mov_args[4] = {
         LLVMConstInt(ctx->i32, (parameter + 2) % 3, 0),
         llvm_chan,
         attr_number,
         params,
      };

      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", ctx->f32, mov_args, 4, 0);
   }

   LLVMValueRef load_args[3] = {llvm_chan, attr_number, params};
   LLVMValueRef value =
      ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load", ctx->f32, load_args, 3, 0);

   value = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32, &value, 1, 0);
   value = ac_build_quad_swizzle(ctx, value, parameter, parameter, parameter, parameter);
   return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32, &value, 1, 0);
}
865 
/* Build a single-index GEP: the address of element \p index of \p type,
 * counted from \p base_ptr.
 */
LLVMValueRef ac_build_gep_ptr(struct ac_llvm_context *ctx, LLVMTypeRef type, LLVMValueRef base_ptr,
                              LLVMValueRef index)
{
   LLVMValueRef indices[1] = {index};

   return LLVMBuildGEP2(ctx->builder, type, base_ptr, indices, 1, "");
}
871 
/* Return the type addressed by a gep0 (see ac_build_gep0) on \p pointee_type.
 *
 * Only pointer, array and struct types are expected; anything else trips an
 * assertion and yields NULL.
 */
LLVMTypeRef ac_build_gep0_type(LLVMTypeRef pointee_type, LLVMValueRef index)
{
   const LLVMTypeKind kind = LLVMGetTypeKind(pointee_type);

   if (kind == LLVMPointerTypeKind)
      return pointee_type;

   /* Pointer to an array: GEP2 yields a pointer to the array's element type. */
   if (kind == LLVMArrayTypeKind)
      return LLVMGetElementType(pointee_type);

   /* Pointer to a struct: GEP2 yields a pointer to the index-th field,
    * so report that field's type.
    */
   if (kind == LLVMStructTypeKind)
      return LLVMStructGetTypeAtIndex(pointee_type, LLVMConstIntGetZExtValue(index));

   /* gep0 shouldn't receive any other types. */
   assert(false);
   return NULL;
}
893 
/* Build a two-index GEP {0, index} on \p ptr, using ptr.t as the source
 * element type and ptr.v as the base address.
 */
LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr, LLVMValueRef index)
{
   LLVMValueRef gep_indices[2];

   gep_indices[0] = ctx->i32_0;
   gep_indices[1] = index;

   return LLVMBuildGEP2(ctx->builder, ptr.t, ptr.v, gep_indices, 2, "");
}
903 
/* Advance \p ptr by \p index elements of \p type using a single-index GEP. */
LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMTypeRef type, LLVMValueRef ptr, LLVMValueRef index)
{
   LLVMValueRef step[1] = {index};

   return LLVMBuildGEP2(ctx->builder, type, ptr, step, 1, "");
}
908 
/* Store \p value at the address computed by ac_build_gep0(ptr, index). */
void ac_build_indexed_store(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr, LLVMValueRef index,
                            LLVMValueRef value)
{
   LLVMValueRef dst = ac_build_gep0(ctx, ptr, index);

   LLVMBuildStore(ctx->builder, value, dst);
}
914 
915 /**
916  * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
917  * It's equivalent to doing a load from &base_ptr[index].
918  *
919  * \param base_ptr  Where the array starts.
920  * \param index     The element index into the array.
921  * \param uniform   Whether the base_ptr and index can be assumed to be
922  *                  dynamically uniform (i.e. load to an SGPR)
923  * \param invariant Whether the load is invariant (no other opcodes affect it)
924  * \param no_unsigned_wraparound
925  *    For all possible re-associations and re-distributions of an expression
926  *    "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
927  *    without inbounds in base_ptr), this parameter is true if "addr + offset"
928  *    does not result in an unsigned integer wraparound. This is used for
929  *    optimal code generation of 32-bit pointer arithmetic.
930  *
931  *    For example, a 32-bit immediate offset that causes a 32-bit unsigned
932  *    integer wraparound can't be an imm offset in s_load_dword, because
933  *    the instruction performs "addr + offset" in 64 bits.
934  *
935  *    Expected usage for bindless textures by chaining GEPs:
936  *      // possible unsigned wraparound, don't use InBounds:
937  *      ptr1 = LLVMBuildGEP(base_ptr, index);
938  *      image = load(ptr1); // becomes "s_load ptr1, 0"
939  *
940  *      ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
941  *      sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
942  */
ac_build_load_custom(struct ac_llvm_context * ctx,LLVMTypeRef type,LLVMValueRef base_ptr,LLVMValueRef index,bool uniform,bool invariant,bool no_unsigned_wraparound)943 static LLVMValueRef ac_build_load_custom(struct ac_llvm_context *ctx, LLVMTypeRef type,
944                                          LLVMValueRef base_ptr, LLVMValueRef index,
945                                          bool uniform, bool invariant, bool no_unsigned_wraparound)
946 {
947    LLVMValueRef pointer, result;
948 
949    if (no_unsigned_wraparound &&
950        LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
951       pointer = LLVMBuildInBoundsGEP2(ctx->builder, type, base_ptr, &index, 1, "");
952    else
953       pointer = LLVMBuildGEP2(ctx->builder, type, base_ptr, &index, 1, "");
954 
955    if (uniform)
956       LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
957    result = LLVMBuildLoad2(ctx->builder, type, pointer, "");
958    if (invariant)
959       LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
960    LLVMSetAlignment(result, 4);
961    return result;
962 }
963 
/* Plain indexed load: not marked uniform or invariant, and no InBounds GEP. */
LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr, LLVMValueRef index)
{
   const bool uniform = false;
   const bool invariant = false;
   const bool no_unsigned_wraparound = false;

   return ac_build_load_custom(ctx, ptr.t, ptr.v, index, uniform, invariant,
                               no_unsigned_wraparound);
}
968 
/* Indexed load marked invariant, but neither uniform nor InBounds. */
LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr,
                                     LLVMValueRef index)
{
   const bool uniform = false;
   const bool invariant = true;
   const bool no_unsigned_wraparound = false;

   return ac_build_load_custom(ctx, ptr.t, ptr.v, index, uniform, invariant,
                               no_unsigned_wraparound);
}
974 
975 /* This assumes that there is no unsigned integer wraparound during the address
976  * computation, excluding all GEPs within base_ptr. */
ac_build_load_to_sgpr(struct ac_llvm_context * ctx,struct ac_llvm_pointer ptr,LLVMValueRef index)977 LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr,
978                                    LLVMValueRef index)
979 {
980    return ac_build_load_custom(ctx, ptr.t, ptr.v, index, true, true, true);
981 }
982 
983 /* See ac_build_load_custom() documentation. */
ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context * ctx,struct ac_llvm_pointer ptr,LLVMValueRef index)984 LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr, LLVMValueRef index)
985 {
986    return ac_build_load_custom(ctx, ptr.t, ptr.v, index, true, true, false);
987 }
988 
get_cache_flags(struct ac_llvm_context * ctx,enum gl_access_qualifier access)989 static unsigned get_cache_flags(struct ac_llvm_context *ctx, enum gl_access_qualifier access)
990 {
991    return ac_get_hw_cache_flags(ctx->gfx_level, access).value;
992 }
993 
/* Emit a buffer store intrinsic.
 *
 * The "struct" intrinsic variant is used when \p vindex is non-NULL, otherwise
 * the "raw" variant. With \p use_format the ".format" flavour is selected.
 * The intrinsic's type suffix is derived from the type of \p data.
 *
 * \param rsrc     Buffer resource; bitcast to v4i32 before use.
 * \param data     The value to store.
 * \param vindex   Optional index operand (NULL selects the raw intrinsic).
 * \param voffset  Optional offset operand; NULL means 0.
 * \param soffset  Optional offset operand; NULL means 0.
 * \param access   Access qualifiers; ACCESS_TYPE_STORE is OR'ed in before they
 *                 are turned into hardware cache flags.
 */
static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef data, LLVMValueRef vindex,
                                         LLVMValueRef voffset, LLVMValueRef soffset,
                                         enum gl_access_qualifier access, bool use_format)
{
   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = data;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   /* Only the "struct" variant takes an index operand. */
   if (vindex)
      args[idx++] = vindex;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_STORE), 0);
   const char *indexing_kind = vindex ? "struct" : "raw";
   char name[256], type_name[8];

   ac_build_type_name_for_intr(LLVMTypeOf(data), type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", indexing_kind, type_name);
   }

   ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, 0);
}
1022 
/* Format-converting buffer store: forwards to ac_build_buffer_store_common()
 * with no soffset and use_format enabled.
 */
void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data,
                                  LLVMValueRef vindex, LLVMValueRef voffset, enum gl_access_qualifier access)
{
   LLVMValueRef no_soffset = NULL;
   const bool use_format = true;

   ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, no_soffset, access,
                                use_format);
}
1028 
1029 /* buffer_store_dword(,x2,x3,x4) <- the suffix is selected by the type of vdata. */
ac_build_buffer_store_dword(struct ac_llvm_context * ctx,LLVMValueRef rsrc,LLVMValueRef vdata,LLVMValueRef vindex,LLVMValueRef voffset,LLVMValueRef soffset,enum gl_access_qualifier access)1030 void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
1031                                  LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
1032                                  enum gl_access_qualifier access)
1033 {
1034    unsigned num_channels = ac_get_llvm_num_components(vdata);
1035 
1036    /* Split 3 channel stores if unsupported. */
1037    if (num_channels == 3 && !ac_has_vec3_support(ctx->gfx_level, false)) {
1038       LLVMValueRef v[3], v01, voffset2;
1039 
1040       for (int i = 0; i < 3; i++) {
1041          v[i] = LLVMBuildExtractElement(ctx->builder, vdata, LLVMConstInt(ctx->i32, i, 0), "");
1042       }
1043       v01 = ac_build_gather_values(ctx, v, 2);
1044 
1045       voffset2 = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0,
1046                               LLVMConstInt(ctx->i32, 8, 0), "");
1047 
1048       ac_build_buffer_store_dword(ctx, rsrc, v01, vindex, voffset, soffset, access);
1049       ac_build_buffer_store_dword(ctx, rsrc, v[2], vindex, voffset2, soffset, access);
1050       return;
1051    }
1052 
1053    ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), vindex, voffset, soffset,
1054                                 access, false);
1055 }
1056 
/* Emit a buffer load intrinsic and return its result.
 *
 * The "struct" intrinsic variant is used when \p vindex is non-NULL, otherwise
 * the "raw" variant; \p use_format selects the ".format" flavour. NULL
 * \p voffset / \p soffset are replaced by 0. A 3-channel load is widened to
 * 4 channels when ac_has_vec3_support() reports no support, and the result is
 * trimmed back to \p num_channels afterwards. With \p can_speculate the call
 * is tagged AC_ATTR_INVARIANT_LOAD.
 */
static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                                LLVMValueRef vindex, LLVMValueRef voffset,
                                                LLVMValueRef soffset, unsigned num_channels,
                                                LLVMTypeRef channel_type, enum gl_access_qualifier access,
                                                bool can_speculate, bool use_format)
{
   LLVMValueRef operands[5];
   int count = 0;

   operands[count++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (vindex)
      operands[count++] = vindex;
   operands[count++] = voffset ? voffset : ctx->i32_0;
   operands[count++] = soffset ? soffset : ctx->i32_0;
   operands[count++] =
      LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD), 0);

   /* Number of channels actually fetched by the intrinsic. */
   unsigned fetched = num_channels;
   if (!ac_has_vec3_support(ctx->gfx_level, use_format) && num_channels == 3)
      fetched = 4;

   /* D16 is only supported on gfx8+ */
   assert(!use_format || (channel_type != ctx->f16 && channel_type != ctx->i16) ||
          ctx->gfx_level >= GFX8);

   LLVMTypeRef ret_type = channel_type;
   if (fetched > 1)
      ret_type = LLVMVectorType(channel_type, fetched);

   char type_name[8];
   ac_build_type_name_for_intr(ret_type, type_name, sizeof(type_name));

   const char *indexing_kind = vindex ? "struct" : "raw";
   char name[256];
   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", indexing_kind, type_name);
   }

   LLVMValueRef loaded = ac_build_intrinsic(ctx, name, ret_type, operands, count,
                                            can_speculate ? AC_ATTR_INVARIANT_LOAD : 0);

   /* Drop the padding channel again if the fetch was widened. */
   return fetched > num_channels ? ac_trim_vector(ctx, loaded, num_channels) : loaded;
}
1096 
/* Build a buffer load of \p num_channels values of \p channel_type.
 *
 * If \p allow_smem is set (and the access is not coherent, or the chip is
 * GFX8+), one llvm.amdgcn.s.buffer.load is emitted per channel at
 * voffset + soffset + i * sizeof(channel) and the results are gathered.
 * \p vindex must be NULL on that path.
 *
 * Otherwise the load is split into raw/struct buffer loads of at most
 * 4 channels each, and the pieces are concatenated.
 *
 * \param voffset  Optional offset; NULL means 0 on both paths.
 * \param soffset  Optional offset; NULL means 0.
 * \return the loaded value, or NULL if the non-SMEM path is taken with
 *         num_channels == 0.
 */
LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels,
                                  LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
                                  LLVMTypeRef channel_type, enum gl_access_qualifier access,
                                  bool can_speculate, bool allow_smem)
{
   /* Treat a missing voffset as zero everywhere; the per-piece offset
    * computation below must not be handed a NULL operand.
    */
   LLVMValueRef base_voffset = voffset ? voffset : ctx->i32_0;

   if (allow_smem && (!(access & ACCESS_COHERENT) || ctx->gfx_level >= GFX8)) {
      assert(vindex == NULL);

      LLVMValueRef result[32];

      /* One result slot per channel. */
      assert(num_channels <= 32);

      LLVMValueRef offset = base_voffset;
      if (soffset)
         offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");

      char name[256], type_name[8];
      ac_build_type_name_for_intr(channel_type, type_name, sizeof(type_name));
      snprintf(name, sizeof(name), "llvm.amdgcn.s.buffer.load.%s", type_name);

      LLVMValueRef channel_size = LLVMConstInt(ctx->i32, ac_get_type_size(channel_type), 0);

      for (int i = 0; i < num_channels; i++) {
         /* Every channel after the first advances by one channel size. */
         if (i) {
            offset = LLVMBuildAdd(ctx->builder, offset, channel_size, "");
         }
         LLVMValueRef args[3] = {
            rsrc,
            offset,
            LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD |
                                                        ACCESS_TYPE_SMEM), 0),
         };
         result[i] = ac_build_intrinsic(ctx, name, channel_type, args, 3, AC_ATTR_INVARIANT_LOAD);
      }
      if (num_channels == 1)
         return result[0];

      return ac_build_gather_values(ctx, result, num_channels);
   }

   /* LLVM is unable to select instructions for num_channels > 4, so we
    * workaround that by manually splitting larger buffer loads.
    */
   LLVMValueRef result = NULL;
   for (unsigned i = 0, fetch_num_channels; i < num_channels; i += fetch_num_channels) {
      fetch_num_channels = MIN2(4, num_channels - i);
      LLVMValueRef fetch_voffset =
            LLVMBuildAdd(ctx->builder, base_voffset,
                         LLVMConstInt(ctx->i32, i * ac_get_type_size(channel_type), 0), "");
      LLVMValueRef item =
         ac_build_buffer_load_common(ctx, rsrc, vindex, fetch_voffset, soffset, fetch_num_channels,
                                     channel_type, access, can_speculate, false);
      result = ac_build_concat(ctx, result, item);
   }

   return result;
}
1152 
/* Build a format-converting buffer load.
 *
 * Without \p tfe this forwards to ac_build_buffer_load_common() with
 * use_format enabled and f16 (d16) or f32 channels.
 *
 * With \p tfe (d16 is not allowed) the load is emitted as inline assembly:
 * v0..v4 are zeroed, buffer_load_format_xyzw is issued with "tfe", and
 * vmcnt(0) is waited for. The returned value is the first \p num_channels
 * elements of the 5-element result concatenated with element 4.
 */
LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef vindex, LLVMValueRef voffset,
                                         unsigned num_channels, enum gl_access_qualifier access,
                                         bool can_speculate, bool d16, bool tfe)
{
   if (!tfe) {
      return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
                                         num_channels, d16 ? ctx->f16 : ctx->f32, access,
                                         can_speculate, true);
   }

   assert(!d16);

   const union ac_hw_cache_flags cache_flags =
      ac_get_hw_cache_flags(ctx->gfx_level, access | ACCESS_TYPE_LOAD);
   char code[1024];

   /* The definition in the assembly and the one in the constraint string
    * differs because of an assembler bug.
    */
   if (ctx->gfx_level >= GFX12) {
      /* Ordered (value, modifier text) tables; the first match wins and a
       * value that is not listed yields an empty modifier.
       */
      const struct {
         int value;
         const char *text;
      } scope_table[] = {
         {gfx12_scope_se, "scope:SCOPE_SE"},
         {gfx12_scope_device, "scope:SCOPE_DEV"},
         {gfx12_scope_memory, "scope:SCOPE_SYS"},
      }, hint_table[] = {
         {gfx12_load_non_temporal, "th:TH_LOAD_NT"},
         {gfx12_load_high_temporal, "th:TH_LOAD_HT"},
         {gfx12_load_last_use_discard, "th:TH_LOAD_LU"},
         {gfx12_load_near_non_temporal_far_regular_temporal, "th:TH_LOAD_NT_RT"},
         {gfx12_load_near_regular_temporal_far_non_temporal, "th:TH_LOAD_RT_NT"},
         {gfx12_load_near_non_temporal_far_high_temporal, "th:TH_LOAD_NT_HT"},
      };

      const char *scope = "";
      for (unsigned k = 0; k < sizeof(scope_table) / sizeof(scope_table[0]); k++) {
         if (cache_flags.gfx12.scope == scope_table[k].value) {
            scope = scope_table[k].text;
            break;
         }
      }

      const char *temporal_hint = "";
      for (unsigned k = 0; k < sizeof(hint_table) / sizeof(hint_table[0]); k++) {
         if (cache_flags.gfx12.temporal_hint == hint_table[k].value) {
            temporal_hint = hint_table[k].text;
            break;
         }
      }

      snprintf(code, sizeof(code),
               "v_mov_b32 v0, 0\n"
               "v_mov_b32 v1, 0\n"
               "v_mov_b32 v2, 0\n"
               "v_mov_b32 v3, 0\n"
               "v_mov_b32 v4, 0\n"
               "buffer_load_format_xyzw v[0:3], $1, $2, 0, idxen offen %s %s tfe\n"
               "s_waitcnt vmcnt(0)",
               temporal_hint, scope);
   } else {
      const char *glc = (cache_flags.value & ac_glc) ? "glc" : "";
      const char *slc = (cache_flags.value & ac_slc) ? "slc" : "";
      const char *dlc = (cache_flags.value & ac_dlc) ? "dlc" : "";

      snprintf(code, sizeof(code),
               "v_mov_b32 v0, 0\n"
               "v_mov_b32 v1, 0\n"
               "v_mov_b32 v2, 0\n"
               "v_mov_b32 v3, 0\n"
               "v_mov_b32 v4, 0\n"
               "buffer_load_format_xyzw v[0:3], $1, $2, 0, idxen offen %s %s tfe %s\n"
               "s_waitcnt vmcnt(0)",
               glc, slc, dlc);
   }

   /* The asm takes (v2i32 address, v4i32 resource) and produces 5 floats. */
   LLVMTypeRef param_types[] = {ctx->v2i32, ctx->v4i32};
   LLVMTypeRef asm_type = LLVMFunctionType(LLVMVectorType(ctx->f32, 5), param_types, 2, false);
   LLVMValueRef asm_fn = LLVMConstInlineAsm(asm_type, code, "=&{v[0:4]},v,s", false, false);

   LLVMValueRef addr_comp[2];
   addr_comp[0] = vindex ? vindex : ctx->i32_0;
   addr_comp[1] = voffset ? voffset : ctx->i32_0;

   LLVMValueRef call_args[2];
   call_args[0] = ac_build_gather_values(ctx, addr_comp, 2);
   call_args[1] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");

   LLVMValueRef res = LLVMBuildCall2(ctx->builder, asm_type, asm_fn, call_args, 2, "");

   LLVMValueRef texels = ac_trim_vector(ctx, res, num_channels);
   LLVMValueRef last_elem = ac_llvm_extract_elem(ctx, res, 4);
   return ac_build_concat(ctx, texels, last_elem);
}
1234 
/* Emit a typed buffer load (llvm.amdgcn.{raw,struct}.tbuffer.load).
 *
 * The "struct" variant is used when \p vindex is non-NULL. NULL \p voffset /
 * \p soffset are replaced by 0. \p tbuffer_format is passed through as an i32
 * constant. With \p can_speculate the call is tagged AC_ATTR_INVARIANT_LOAD.
 */
static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                          LLVMValueRef vindex, LLVMValueRef voffset,
                                          LLVMValueRef soffset, unsigned num_channels,
                                          unsigned tbuffer_format, LLVMTypeRef channel_type,
                                          enum gl_access_qualifier access, bool can_speculate)
{
   LLVMValueRef operands[6];
   int count = 0;

   operands[count++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (vindex)
      operands[count++] = vindex;
   operands[count++] = voffset ? voffset : ctx->i32_0;
   operands[count++] = soffset ? soffset : ctx->i32_0;
   operands[count++] = LLVMConstInt(ctx->i32, tbuffer_format, 0);
   operands[count++] =
      LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD), 0);

   /* Scalar result for a single channel, vector otherwise. */
   LLVMTypeRef ret_type = channel_type;
   if (num_channels > 1)
      ret_type = LLVMVectorType(channel_type, num_channels);

   char type_name[8];
   ac_build_type_name_for_intr(ret_type, type_name, sizeof(type_name));

   char name[256];
   snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s", vindex ? "struct" : "raw",
            type_name);

   return ac_build_intrinsic(ctx, name, ret_type, operands, count,
                             can_speculate ? AC_ATTR_INVARIANT_LOAD : 0);
}
1261 
/* Build a typed buffer load that is split into pieces of a safe fetch size.
 *
 * The number of channels fetched per piece comes from ac_get_safe_fetch_size(),
 * based on the vertex format info for \p format and the alignment described
 * by \p align_mul / \p align_offset. All pieces are loaded as i32 channels and
 * concatenated.
 *
 * If \p channel_bit_size is 16, each channel is then narrowed to i16: formats
 * whose first channel is not pure-integer go through an f32 -> f16 conversion,
 * the others are truncated.
 *
 * \param base_voffset  Offset operand; \p const_offset is added to it.
 * \param align_mul     Must be non-zero (it is used as a modulus).
 * \param num_channels  Number of channels to load; at most 4 when
 *                      channel_bit_size is 16.
 */
LLVMValueRef ac_build_safe_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                        LLVMValueRef vidx, LLVMValueRef base_voffset,
                                        LLVMValueRef soffset,
                                        const enum pipe_format format,
                                        unsigned channel_bit_size,
                                        unsigned const_offset,
                                        unsigned align_offset,
                                        unsigned align_mul,
                                        unsigned num_channels,
                                        enum gl_access_qualifier access,
                                        bool can_speculate)
{
   const struct ac_vtx_format_info *vtx_info = ac_get_vtx_format_info(ctx->gfx_level, ctx->info->family, format);
   const unsigned max_channels = vtx_info->num_channels;
   LLVMValueRef voffset_plus_const =
      LLVMBuildAdd(ctx->builder, base_voffset, LLVMConstInt(ctx->i32, const_offset, 0), "");

   /* align_mul is used as a modulus below. */
   assert(align_mul != 0);

   /* Split the specified load into several MTBUF instructions,
    * according to a safe fetch size determined by alignment information.
    */
   LLVMValueRef result = NULL;
   for (unsigned i = 0, fetch_num_channels; i < num_channels; i += fetch_num_channels) {
      /* Packed formats (determined here by chan_byte_size == 0) should never be split. */
      assert(i == 0 || vtx_info->chan_byte_size);

      const unsigned fetch_const_offset = const_offset + i * vtx_info->chan_byte_size;
      const unsigned fetch_align_offset = (align_offset + i * vtx_info->chan_byte_size) % align_mul;
      /* Alignment of this piece: the lowest set bit of its offset within
       * align_mul, or align_mul itself when that offset is 0. The shift is
       * done on an unsigned operand so that bit 31 is well-defined.
       */
      const unsigned fetch_alignment = fetch_align_offset ? 1u << (ffs(fetch_align_offset) - 1) : align_mul;

      fetch_num_channels =
         ac_get_safe_fetch_size(ctx->gfx_level, vtx_info, fetch_const_offset,
                                max_channels - i, fetch_alignment, num_channels - i);
      const unsigned fetch_format = vtx_info->hw_format[fetch_num_channels - 1];
      LLVMValueRef fetch_voffset =
            LLVMBuildAdd(ctx->builder, voffset_plus_const,
                         LLVMConstInt(ctx->i32, i * vtx_info->chan_byte_size, 0), "");
      LLVMValueRef item =
         ac_build_tbuffer_load(ctx, rsrc, vidx, fetch_voffset, soffset,
                               fetch_num_channels, fetch_format, ctx->i32,
                               access, can_speculate);
      result = ac_build_concat(ctx, result, item);
   }

   /*
    * LLVM is not able to select 16-bit typed loads. Load 32-bit values instead and
    * manually truncate them to the required size.
    * TODO: Do this in NIR instead.
    */
   const struct util_format_description *desc = util_format_description(format);
   bool is_float = !desc->channel[0].pure_integer;

   if (channel_bit_size == 16) {
      LLVMValueRef channels[4];

      /* One slot per channel in the fixed-size array above. */
      assert(num_channels <= 4);

      for (unsigned i = 0; i < num_channels; i++) {
         LLVMValueRef channel = result;
         if (num_channels > 1)
            channel = LLVMBuildExtractElement(ctx->builder, result, LLVMConstInt(ctx->i32, i, false), "");

         if (is_float) {
            /* Reinterpret as f32, convert to f16, then view the bits as i16. */
            channel = LLVMBuildBitCast(ctx->builder, channel, ctx->f32, "");
            channel = LLVMBuildFPTrunc(ctx->builder, channel, ctx->f16, "");
            channel = LLVMBuildBitCast(ctx->builder, channel, ctx->i16, "");
         } else {
            channel = LLVMBuildTrunc(ctx->builder, channel, ctx->i16, "");
         }
         channels[i] = channel;
      }
      result = ac_build_gather_values(ctx, channels, num_channels);
   }

   return result;
}
1334 
1335 
/* Load a single i16 through the raw (no vindex), non-format buffer load path;
 * the load is not marked speculatable.
 */
LLVMValueRef ac_build_buffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                        LLVMValueRef voffset, LLVMValueRef soffset,
                                        enum gl_access_qualifier access)
{
   const unsigned num_channels = 1;
   const bool can_speculate = false;
   const bool use_format = false;

   return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, num_channels,
                                      ctx->i16, access, can_speculate, use_format);
}
1343 
/* Load a single i8 through the raw (no vindex), non-format buffer load path;
 * the load is not marked speculatable.
 */
LLVMValueRef ac_build_buffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                       LLVMValueRef voffset, LLVMValueRef soffset,
                                       enum gl_access_qualifier access)
{
   const unsigned num_channels = 1;
   const bool can_speculate = false;
   const bool use_format = false;

   return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, num_channels,
                                      ctx->i8, access, can_speculate, use_format);
}
1351 
/* Store \p vdata, bitcast to i16, through the raw (no vindex), non-format
 * buffer store path.
 */
void ac_build_buffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                 LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset,
                                 enum gl_access_qualifier access)
{
   LLVMValueRef as_i16 = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
   const bool use_format = false;

   ac_build_buffer_store_common(ctx, rsrc, as_i16, NULL, voffset, soffset, access, use_format);
}
1360 
/* Store \p vdata, bitcast to i8, through the raw (no vindex), non-format
 * buffer store path.
 */
void ac_build_buffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                LLVMValueRef voffset, LLVMValueRef soffset, enum gl_access_qualifier access)
{
   LLVMValueRef as_i8 = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");
   const bool use_format = false;

   ac_build_buffer_store_common(ctx, rsrc, as_i8, NULL, voffset, soffset, access, use_format);
}
1368 
1369 /**
1370  * Set range metadata on an instruction.  This can only be used on load and
1371  * call instructions.  If you know an instruction can only produce the values
1372  * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1373  * \p lo is the minimum value inclusive.
1374  * \p hi is the maximum value exclusive.
1375  */
ac_set_range_metadata(struct ac_llvm_context * ctx,LLVMValueRef value,unsigned lo,unsigned hi)1376 void ac_set_range_metadata(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned lo,
1377                            unsigned hi)
1378 {
1379    LLVMValueRef range_md, md_args[2];
1380    LLVMTypeRef type = LLVMTypeOf(value);
1381    LLVMContextRef context = LLVMGetTypeContext(type);
1382 
1383    md_args[0] = LLVMConstInt(type, lo, false);
1384    md_args[1] = LLVMConstInt(type, hi, false);
1385    range_md = LLVMMDNodeInContext(context, md_args, 2);
1386    LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1387 }
1388 
ac_get_thread_id(struct ac_llvm_context * ctx)1389 LLVMValueRef ac_get_thread_id(struct ac_llvm_context *ctx)
1390 {
1391    return ac_build_mbcnt(ctx, LLVMConstInt(ctx->iN_wavemask, ~0ull, 0));
1392 }
1393 
1394 /*
1395  * AMD GCN implements derivatives using the local data store (LDS)
1396  * All writes to the LDS happen in all executing threads at
1397  * the same time. TID is the Thread ID for the current
1398  * thread and is a value between 0 and 63, representing
1399  * the thread's position in the wavefront.
1400  *
1401  * For the pixel shader threads are grouped into quads of four pixels.
1402  * The TIDs of the pixels of a quad are:
1403  *
1404  *  +------+------+
1405  *  |4n + 0|4n + 1|
1406  *  +------+------+
1407  *  |4n + 2|4n + 3|
1408  *  +------+------+
1409  *
1410  * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
1411  * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
1412  * the current pixel's column, and masking with 0xfffffffe yields the TID
1413  * of the left pixel of the current pixel's row.
1414  *
1415  * Adding 1 yields the TID of the pixel to the right of the left pixel, and
1416  * adding 2 yields the TID of the pixel below the top pixel.
1417  */
ac_build_ddxy(struct ac_llvm_context * ctx,uint32_t mask,int idx,LLVMValueRef val)1418 LLVMValueRef ac_build_ddxy(struct ac_llvm_context *ctx, uint32_t mask, int idx, LLVMValueRef val)
1419 {
1420    unsigned tl_lanes[4], trbl_lanes[4];
1421    char name[32], type[8];
1422    LLVMValueRef tl, trbl;
1423    LLVMTypeRef result_type;
1424    LLVMValueRef result;
1425 
1426    result_type = ac_to_float_type(ctx, LLVMTypeOf(val));
1427 
1428    if (result_type == ctx->f16)
1429       val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
1430    else if (result_type == ctx->v2f16)
1431       val = LLVMBuildBitCast(ctx->builder, val, ctx->i32, "");
1432 
1433    for (unsigned i = 0; i < 4; ++i) {
1434       tl_lanes[i] = i & mask;
1435       trbl_lanes[i] = (i & mask) + idx;
1436    }
1437 
1438    tl = ac_build_quad_swizzle(ctx, val, tl_lanes[0], tl_lanes[1], tl_lanes[2], tl_lanes[3]);
1439    trbl =
1440       ac_build_quad_swizzle(ctx, val, trbl_lanes[0], trbl_lanes[1], trbl_lanes[2], trbl_lanes[3]);
1441 
1442    if (result_type == ctx->f16) {
1443       tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
1444       trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
1445    }
1446 
1447    tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
1448    trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
1449    result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
1450 
1451    ac_build_type_name_for_intr(result_type, type, sizeof(type));
1452    snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);
1453 
1454    return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
1455 }
1456 
/* Emit a call to llvm.amdgcn.s.sendmsg with \p imm as an i32 constant and
 * \p m0_content as the second operand.
 */
void ac_build_sendmsg(struct ac_llvm_context *ctx, uint32_t imm, LLVMValueRef m0_content)
{
   LLVMValueRef operands[2] = {
      LLVMConstInt(ctx->i32, imm, false),
      m0_content,
   };

   ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, operands, 2, 0);
}
1464 
/* Build the signed "find most significant bit" of a 32-bit value.
 *
 * Calls llvm.amdgcn.sffbh.i32 (declared with return type \p dst_type) and
 * converts its MSB-relative result into an LSB-relative index by computing
 * "31 - result", because NIR/TGSI wants the index counted from the LSB.
 * When \p arg is 0 or -1 the result is -1 instead.
 */
LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef thirty_one = LLVMConstInt(ctx->i32, 31, false);
   LLVMValueRef minus_one = LLVMConstInt(ctx->i32, -1, true);

   LLVMValueRef from_msb = ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", dst_type, &arg, 1, 0);
   LLVMValueRef from_lsb = LLVMBuildSub(builder, thirty_one, from_msb, "");

   /* Inputs for which the -1 result is selected instead. */
   LLVMValueRef is_zero = LLVMBuildICmp(builder, LLVMIntEQ, arg, ctx->i32_0, "");
   LLVMValueRef is_all_ones = LLVMBuildICmp(builder, LLVMIntEQ, arg, minus_one, "");
   LLVMValueRef no_bit_found = LLVMBuildOr(builder, is_zero, is_all_ones, "");

   return LLVMBuildSelect(builder, no_bit_found, minus_one, from_lsb, "");
}
1481 
/* Build the unsigned "find most significant bit" of an 8/16/32/64-bit value.
 *
 * Uses llvm.ctlz for the element width of \p arg, with the second operand set
 * to ctx->i1true.  Unless \p rev is set, the leading-zero count is turned
 * into an LSB-relative index by computing "(bitsize - 1) - count".  The
 * result is then brought to 32 bits (truncated for 64-bit inputs,
 * sign-extended for 8- and 16-bit inputs), and -1 is selected when \p arg is
 * zero.
 *
 * \p dst_type is not used by this function.
 */
LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type,
                           bool rev)
{
   LLVMBuilderRef builder = ctx->builder;
   const unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
   const char *ctlz_name;
   LLVMTypeRef arg_type;
   LLVMValueRef zero;

   switch (bitsize) {
   case 8:
      ctlz_name = "llvm.ctlz.i8";
      arg_type = ctx->i8;
      zero = ctx->i8_0;
      break;
   case 16:
      ctlz_name = "llvm.ctlz.i16";
      arg_type = ctx->i16;
      zero = ctx->i16_0;
      break;
   case 32:
      ctlz_name = "llvm.ctlz.i32";
      arg_type = ctx->i32;
      zero = ctx->i32_0;
      break;
   case 64:
      ctlz_name = "llvm.ctlz.i64";
      arg_type = ctx->i64;
      zero = ctx->i64_0;
      break;
   default:
      unreachable("invalid bitsize");
   }

   /* Index of the topmost bit: 7, 15, 31 or 63. */
   LLVMValueRef top_bit = LLVMConstInt(arg_type, bitsize - 1, false);

   LLVMValueRef ctlz_args[2] = {arg, ctx->i1true};
   LLVMValueRef msb = ac_build_intrinsic(ctx, ctlz_name, arg_type, ctlz_args, 2, 0);

   /* ctlz counts from the MSB, but TGSI/NIR wants the index from the LSB. */
   if (!rev)
      msb = LLVMBuildSub(builder, top_bit, msb, "");

   if (bitsize == 64)
      msb = LLVMBuildTrunc(builder, msb, ctx->i32, "");
   else if (bitsize < 32)
      msb = LLVMBuildSExt(builder, msb, ctx->i32, "");

   /* A zero input yields -1. */
   LLVMValueRef is_zero = LLVMBuildICmp(builder, LLVMIntEQ, arg, zero, "");
   return LLVMBuildSelect(builder, is_zero, LLVMConstInt(ctx->i32, -1, true), msb, "");
}
1545 
/* Emit llvm.minnum on \p a and \p b.  The intrinsic overload and the return
 * type are both taken from the type of \p a.
 */
LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMTypeRef operand_type = LLVMTypeOf(a);
   LLVMValueRef operands[2] = {a, b};
   char type_name[64];
   char intr_name[64];

   ac_build_type_name_for_intr(operand_type, type_name, sizeof(type_name));
   snprintf(intr_name, sizeof(intr_name), "llvm.minnum.%s", type_name);

   return ac_build_intrinsic(ctx, intr_name, operand_type, operands, 2, 0);
}
1555 
/* Emit llvm.maxnum on \p a and \p b.  The intrinsic overload and the return
 * type are both taken from the type of \p a.
 */
LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMTypeRef operand_type = LLVMTypeOf(a);
   LLVMValueRef operands[2] = {a, b};
   char type_name[64];
   char intr_name[64];

   ac_build_type_name_for_intr(operand_type, type_name, sizeof(type_name));
   snprintf(intr_name, sizeof(intr_name), "llvm.maxnum.%s", type_name);

   return ac_build_intrinsic(ctx, intr_name, operand_type, operands, 2, 0);
}
1565 
/* Signed integer minimum: selects \p a when a <= b (signed), otherwise \p b. */
LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef pick_a = LLVMBuildICmp(builder, LLVMIntSLE, a, b, "");

   return LLVMBuildSelect(builder, pick_a, a, b, "");
}
1571 
/* Signed integer maximum: selects \p a when a > b (signed), otherwise \p b. */
LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef pick_a = LLVMBuildICmp(builder, LLVMIntSGT, a, b, "");

   return LLVMBuildSelect(builder, pick_a, a, b, "");
}
1577 
/* Unsigned integer minimum: selects \p a when a <= b (unsigned), otherwise \p b. */
LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef pick_a = LLVMBuildICmp(builder, LLVMIntULE, a, b, "");

   return LLVMBuildSelect(builder, pick_a, a, b, "");
}
1583 
/* Unsigned integer maximum: selects \p a when a >= b (unsigned), otherwise \p b. */
LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef pick_a = LLVMBuildICmp(builder, LLVMIntUGE, a, b, "");

   return LLVMBuildSelect(builder, pick_a, a, b, "");
}
1589 
/* Clamp a floating-point value to [0.0, 1.0]: the maximum with 0.0 is built
 * first via ac_build_fmax(), then the minimum with 1.0 via ac_build_fmin().
 * Both constants use the type of \p value.
 */
LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMTypeRef value_type = LLVMTypeOf(value);
   LLVMValueRef zero = LLVMConstReal(value_type, 0.0);
   LLVMValueRef one = LLVMConstReal(value_type, 1.0);
   LLVMValueRef at_least_zero = ac_build_fmax(ctx, value, zero);

   return ac_build_fmin(ctx, at_least_zero, one);
}
1596 
ac_build_export(struct ac_llvm_context * ctx,struct ac_export_args * a)1597 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
1598 {
1599    LLVMValueRef args[9];
1600 
1601    args[0] = LLVMConstInt(ctx->i32, a->target, 0);
1602    args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
1603 
1604    if (a->compr) {
1605       assert(ctx->gfx_level < GFX11);
1606 
1607       args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->v2i16, "");
1608       args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->v2i16, "");
1609       args[4] = LLVMConstInt(ctx->i1, a->done, 0);
1610       args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1611 
1612       ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", ctx->voidt, args, 6, 0);
1613    } else {
1614       args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->f32, "");
1615       args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->f32, "");
1616       args[4] = LLVMBuildBitCast(ctx->builder, a->out[2], ctx->f32, "");
1617       args[5] = LLVMBuildBitCast(ctx->builder, a->out[3], ctx->f32, "");
1618       args[6] = LLVMConstInt(ctx->i1, a->done, 0);
1619       args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1620 
1621       ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", ctx->voidt, args, 8, 0);
1622    }
1623 }
1624 
ac_build_export_null(struct ac_llvm_context * ctx,bool uses_discard)1625 void ac_build_export_null(struct ac_llvm_context *ctx, bool uses_discard)
1626 {
1627    struct ac_export_args args;
1628 
1629    /* Gfx10+ doesn't need to export anything if we don't need to export the EXEC mask
1630     * for discard.
1631     */
1632    if (ctx->gfx_level >= GFX10 && !uses_discard)
1633       return;
1634 
1635    args.enabled_channels = 0x0; /* enabled channels */
1636    args.valid_mask = 1;         /* whether the EXEC mask is valid */
1637    args.done = 1;               /* DONE bit */
1638    /* Gfx11 doesn't support null exports, and mrt0 should be exported instead. */
1639    args.target = ctx->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
1640    args.compr = 0;                       /* COMPR flag (0 = 32-bit export) */
1641    args.out[0] = LLVMGetUndef(ctx->f32); /* R */
1642    args.out[1] = LLVMGetUndef(ctx->f32); /* G */
1643    args.out[2] = LLVMGetUndef(ctx->f32); /* B */
1644    args.out[3] = LLVMGetUndef(ctx->f32); /* A */
1645 
1646    ac_build_export(ctx, &args);
1647 }
1648 
ac_num_coords(enum ac_image_dim dim)1649 static unsigned ac_num_coords(enum ac_image_dim dim)
1650 {
1651    switch (dim) {
1652    case ac_image_1d:
1653       return 1;
1654    case ac_image_2d:
1655    case ac_image_1darray:
1656       return 2;
1657    case ac_image_3d:
1658    case ac_image_cube:
1659    case ac_image_2darray:
1660    case ac_image_2dmsaa:
1661       return 3;
1662    case ac_image_2darraymsaa:
1663       return 4;
1664    default:
1665       unreachable("ac_num_coords: bad dim");
1666    }
1667 }
1668 
ac_num_derivs(enum ac_image_dim dim)1669 static unsigned ac_num_derivs(enum ac_image_dim dim)
1670 {
1671    switch (dim) {
1672    case ac_image_1d:
1673    case ac_image_1darray:
1674       return 2;
1675    case ac_image_2d:
1676    case ac_image_2darray:
1677    case ac_image_cube:
1678       return 4;
1679    case ac_image_3d:
1680       return 6;
1681    case ac_image_2dmsaa:
1682    case ac_image_2darraymsaa:
1683    default:
1684       unreachable("derivatives not supported");
1685    }
1686 }
1687 
get_atomic_name(enum ac_atomic_op op)1688 static const char *get_atomic_name(enum ac_atomic_op op)
1689 {
1690    switch (op) {
1691    case ac_atomic_swap:
1692       return "swap";
1693    case ac_atomic_add:
1694       return "add";
1695    case ac_atomic_sub:
1696       return "sub";
1697    case ac_atomic_smin:
1698       return "smin";
1699    case ac_atomic_umin:
1700       return "umin";
1701    case ac_atomic_smax:
1702       return "smax";
1703    case ac_atomic_umax:
1704       return "umax";
1705    case ac_atomic_and:
1706       return "and";
1707    case ac_atomic_or:
1708       return "or";
1709    case ac_atomic_xor:
1710       return "xor";
1711    case ac_atomic_inc_wrap:
1712       return "inc";
1713    case ac_atomic_dec_wrap:
1714       return "dec";
1715    case ac_atomic_fmin:
1716       return "fmin";
1717    case ac_atomic_fmax:
1718       return "fmax";
1719    }
1720    unreachable("bad atomic op");
1721 }
1722 
ac_build_image_opcode(struct ac_llvm_context * ctx,struct ac_image_args * a)1723 LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_args *a)
1724 {
1725    const char *overload[3] = {"", "", ""};
1726    unsigned num_overloads = 0;
1727    LLVMValueRef args[18];
1728    unsigned num_args = 0;
1729    enum ac_image_dim dim = a->dim;
1730 
1731    assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || !a->level_zero);
1732    assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
1733            a->opcode != ac_image_store_mip) ||
1734           a->lod);
1735    assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
1736           (!a->compare && !a->offset));
1737    assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
1738            a->opcode == ac_image_get_lod) ||
1739           !a->bias);
1740    assert((a->bias ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) + (a->derivs[0] ? 1 : 0) <=
1741           1);
1742    assert((a->min_lod ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) <= 1);
1743    assert(!a->d16 || (ctx->gfx_level >= GFX8 && a->opcode != ac_image_atomic &&
1744                       a->opcode != ac_image_atomic_cmpswap && a->opcode != ac_image_get_lod &&
1745                       a->opcode != ac_image_get_resinfo));
1746    assert(!a->a16 || ctx->gfx_level >= GFX9);
1747    assert(!a->derivs[0] || a->g16 == a->a16 || ctx->gfx_level >= GFX10);
1748 
1749    assert(!a->offset ||
1750           ac_get_elem_bits(ctx, LLVMTypeOf(a->offset)) == 32);
1751    assert(!a->bias ||
1752           ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == (a->a16 ? 16 : 32));
1753    assert(!a->compare ||
1754           ac_get_elem_bits(ctx, LLVMTypeOf(a->compare)) == 32);
1755    assert(!a->derivs[0] ||
1756           ((!a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 16) &&
1757            (a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 32)));
1758    assert(!a->coords[0] ||
1759           ((!a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 16) &&
1760            (a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 32)));
1761    assert(!a->lod ||
1762           ((a->opcode != ac_image_get_resinfo || ac_get_elem_bits(ctx, LLVMTypeOf(a->lod))) &&
1763            (a->opcode == ac_image_get_resinfo ||
1764             ac_get_elem_bits(ctx, LLVMTypeOf(a->lod)) ==
1765             ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])))));
1766    assert(!a->min_lod ||
1767           ac_get_elem_bits(ctx, LLVMTypeOf(a->min_lod)) ==
1768           ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])));
1769 
1770    if (a->opcode == ac_image_get_lod) {
1771       switch (dim) {
1772       case ac_image_1darray:
1773          dim = ac_image_1d;
1774          break;
1775       case ac_image_2darray:
1776       case ac_image_cube:
1777          dim = ac_image_2d;
1778          break;
1779       default:
1780          break;
1781       }
1782    }
1783 
1784    bool sample = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
1785                  a->opcode == ac_image_get_lod;
1786    bool atomic = a->opcode == ac_image_atomic || a->opcode == ac_image_atomic_cmpswap;
1787    bool load = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
1788                a->opcode == ac_image_load || a->opcode == ac_image_load_mip;
1789    LLVMTypeRef coord_type = sample ? (a->a16 ? ctx->f16 : ctx->f32) : (a->a16 ? ctx->i16 : ctx->i32);
1790    uint8_t dmask = a->dmask;
1791    LLVMTypeRef data_type;
1792    char data_type_str[32];
1793 
1794    if (atomic) {
1795       data_type = LLVMTypeOf(a->data[0]);
1796    } else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
1797       /* Image stores might have been shrunk using the format. */
1798       data_type = LLVMTypeOf(a->data[0]);
1799       dmask = (1 << ac_get_llvm_num_components(a->data[0])) - 1;
1800    } else {
1801       data_type = a->d16 ? ctx->v4f16 : ctx->v4f32;
1802    }
1803 
1804    if (a->tfe) {
1805       data_type = LLVMStructTypeInContext(
1806          ctx->context, (LLVMTypeRef[]){data_type, ctx->i32}, 2, false);
1807    }
1808 
1809    if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
1810       args[num_args++] = a->data[0];
1811       if (a->opcode == ac_image_atomic_cmpswap)
1812          args[num_args++] = a->data[1];
1813    }
1814 
1815    if (!atomic)
1816       args[num_args++] = LLVMConstInt(ctx->i32, dmask, false);
1817 
1818    if (a->offset)
1819       args[num_args++] = ac_to_integer(ctx, a->offset);
1820    if (a->bias) {
1821       args[num_args++] = ac_to_float(ctx, a->bias);
1822       overload[num_overloads++] = ".f32";
1823    }
1824    if (a->compare)
1825       args[num_args++] = ac_to_float(ctx, a->compare);
1826    if (a->derivs[0]) {
1827       unsigned count = ac_num_derivs(dim);
1828       for (unsigned i = 0; i < count; ++i)
1829          args[num_args++] = ac_to_float(ctx, a->derivs[i]);
1830       overload[num_overloads++] = a->g16 ? ".f16" : ".f32";
1831    }
1832    unsigned num_coords = a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
1833    for (unsigned i = 0; i < num_coords; ++i)
1834       args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
1835    if (a->lod)
1836       args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
1837    if (a->min_lod)
1838       args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, "");
1839 
1840    overload[num_overloads++] = sample ? (a->a16 ? ".f16" : ".f32") : (a->a16 ? ".i16" : ".i32");
1841 
1842    args[num_args++] = a->resource;
1843    if (sample) {
1844       args[num_args++] = a->sampler;
1845       args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
1846    }
1847 
1848    args[num_args++] = a->tfe ? ctx->i32_1 : ctx->i32_0; /* texfailctrl */
1849    args[num_args++] = LLVMConstInt(
1850       ctx->i32, get_cache_flags(ctx,
1851                                 a->access |
1852                                 (atomic ? ACCESS_TYPE_ATOMIC :
1853                                  load ? ACCESS_TYPE_LOAD : ACCESS_TYPE_STORE)),
1854       false);
1855 
1856    const char *name;
1857    const char *atomic_subop = "";
1858    switch (a->opcode) {
1859    case ac_image_sample:
1860       name = "sample";
1861       break;
1862    case ac_image_gather4:
1863       name = "gather4";
1864       break;
1865    case ac_image_load:
1866       name = "load";
1867       break;
1868    case ac_image_load_mip:
1869       name = "load.mip";
1870       break;
1871    case ac_image_store:
1872       name = "store";
1873       break;
1874    case ac_image_store_mip:
1875       name = "store.mip";
1876       break;
1877    case ac_image_atomic:
1878       name = "atomic.";
1879       atomic_subop = get_atomic_name(a->atomic);
1880       break;
1881    case ac_image_atomic_cmpswap:
1882       name = "atomic.";
1883       atomic_subop = "cmpswap";
1884       break;
1885    case ac_image_get_lod:
1886       name = "getlod";
1887       break;
1888    case ac_image_get_resinfo:
1889       name = "getresinfo";
1890       break;
1891    default:
1892       unreachable("invalid image opcode");
1893    }
1894 
1895    const char *dimname;
1896    switch (dim) {
1897    case ac_image_1d:
1898       dimname = "1d";
1899       break;
1900    case ac_image_2d:
1901       dimname = "2d";
1902       break;
1903    case ac_image_3d:
1904       dimname = "3d";
1905       break;
1906    case ac_image_cube:
1907       dimname = "cube";
1908       break;
1909    case ac_image_1darray:
1910       dimname = "1darray";
1911       break;
1912    case ac_image_2darray:
1913       dimname = "2darray";
1914       break;
1915    case ac_image_2dmsaa:
1916       dimname = "2dmsaa";
1917       break;
1918    case ac_image_2darraymsaa:
1919       dimname = "2darraymsaa";
1920       break;
1921    default:
1922       unreachable("invalid dim");
1923    }
1924 
1925    ac_build_type_name_for_intr(data_type, data_type_str, sizeof(data_type_str));
1926 
1927    bool lod_suffix = a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
1928    char intr_name[96];
1929    snprintf(intr_name, sizeof(intr_name),
1930             "llvm.amdgcn.image.%s%s" /* base name */
1931             "%s%s%s%s"               /* sample/gather modifiers */
1932             ".%s.%s%s%s%s",          /* dimension and type overloads */
1933             name, atomic_subop, a->compare ? ".c" : "",
1934             a->bias ? ".b" : lod_suffix ? ".l" : a->derivs[0] ? ".d" : a->level_zero ? ".lz" : "",
1935             a->min_lod ? ".cl" : "", a->offset ? ".o" : "", dimname,
1936             data_type_str, overload[0], overload[1], overload[2]);
1937 
1938    LLVMTypeRef retty;
1939    if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
1940       retty = ctx->voidt;
1941    else
1942       retty = data_type;
1943 
1944    LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, retty, args, num_args, a->attributes);
1945    if (a->tfe) {
1946       LLVMValueRef texel = LLVMBuildExtractValue(ctx->builder, result, 0, "");
1947       LLVMValueRef code = LLVMBuildExtractValue(ctx->builder, result, 1, "");
1948       result = ac_build_concat(ctx, texel, ac_to_float(ctx, code));
1949    }
1950 
1951    if (!sample && !atomic && retty != ctx->voidt)
1952       result = ac_to_integer(ctx, result);
1953 
1954    return result;
1955 }
1956 
/* Compute the sample count from an image descriptor.
 *
 * The value is read from the descriptor directly because the hardware doesn't
 * have any instruction for this: element 3 of \p rsrc is shifted right by 16,
 * masked with 0xf, and the result is used as the shift amount of "1 << n".
 */
LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, LLVMValueRef rsrc)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef elem_index = LLVMConstInt(ctx->i32, 3, 0);
   LLVMValueRef field_shift = LLVMConstInt(ctx->i32, 16, 0);
   LLVMValueRef field_mask = LLVMConstInt(ctx->i32, 0xf, 0);

   LLVMValueRef dword = LLVMBuildExtractElement(builder, rsrc, elem_index, "");
   LLVMValueRef shifted = LLVMBuildLShr(builder, dword, field_shift, "");
   LLVMValueRef log2_samples = LLVMBuildAnd(builder, shifted, field_mask, "");

   return LLVMBuildShl(builder, ctx->i32_1, log2_samples, "");
}
1970 
/* Emit llvm.amdgcn.cvt.pkrtz on the two values in \p args and return its
 * ctx->v2f16 result.
 */
LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef packed = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", ctx->v2f16, args, 2, 0);

   return packed;
}
1975 
/* Emit llvm.amdgcn.cvt.pknorm.i16 on the two values in \p args and return
 * the ctx->v2i16 result bitcast to ctx->i32.
 */
LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef packed =
      ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", ctx->v2i16, args, 2, 0);

   return LLVMBuildBitCast(ctx->builder, packed, ctx->i32, "");
}
1981 
/* Emit llvm.amdgcn.cvt.pknorm.u16 on the two values in \p args and return
 * the ctx->v2i16 result bitcast to ctx->i32.
 */
LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef packed =
      ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", ctx->v2i16, args, 2, 0);

   return LLVMBuildBitCast(ctx->builder, packed, ctx->i32, "");
}
1987 
/* Pack two f16 values with an inline-asm instruction that returns i32.
 *
 * The mnemonic is "v_cvt_pk_norm_i16_f16" on GFX11 and newer and
 * "v_cvt_pknorm_i16_f16" before that.  The asm is created without side
 * effects and without stack alignment, using the constraints "=v,v,v".
 */
LLVMValueRef ac_build_cvt_pknorm_i16_f16(struct ac_llvm_context *ctx,
                                         LLVMValueRef args[2])
{
   const char *asm_text;
   if (ctx->gfx_level >= GFX11)
      asm_text = "v_cvt_pk_norm_i16_f16 $0, $1, $2";
   else
      asm_text = "v_cvt_pknorm_i16_f16 $0, $1, $2";

   LLVMTypeRef operand_types[2] = {ctx->f16, ctx->f16};
   LLVMTypeRef fn_type = LLVMFunctionType(ctx->i32, operand_types, 2, false);
   LLVMValueRef asm_fn = LLVMConstInlineAsm(fn_type, asm_text, "=v,v,v", false, false);

   return LLVMBuildCall2(ctx->builder, fn_type, asm_fn, args, 2, "");
}
2000 
/* Pack two f16 values with an inline-asm instruction that returns i32.
 *
 * The mnemonic is "v_cvt_pk_norm_u16_f16" on GFX11 and newer and
 * "v_cvt_pknorm_u16_f16" before that.  The asm is created without side
 * effects and without stack alignment, using the constraints "=v,v,v".
 */
LLVMValueRef ac_build_cvt_pknorm_u16_f16(struct ac_llvm_context *ctx,
                                         LLVMValueRef args[2])
{
   const char *asm_text;
   if (ctx->gfx_level >= GFX11)
      asm_text = "v_cvt_pk_norm_u16_f16 $0, $1, $2";
   else
      asm_text = "v_cvt_pknorm_u16_f16 $0, $1, $2";

   LLVMTypeRef operand_types[2] = {ctx->f16, ctx->f16};
   LLVMTypeRef fn_type = LLVMFunctionType(ctx->i32, operand_types, 2, false);
   LLVMValueRef asm_fn = LLVMConstInlineAsm(fn_type, asm_text, "=v,v,v", false, false);

   return LLVMBuildCall2(ctx->builder, fn_type, asm_fn, args, 2, "");
}
2013 
2014 /* The 8-bit and 10-bit clamping is for HW workarounds. */
ac_build_cvt_pk_i16(struct ac_llvm_context * ctx,LLVMValueRef args[2],unsigned bits,bool hi)2015 LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
2016                                  bool hi)
2017 {
2018    assert(bits == 8 || bits == 10 || bits == 16);
2019 
2020    LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
2021    LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
2022    LLVMValueRef max_alpha = bits != 10 ? max_rgb : ctx->i32_1;
2023    LLVMValueRef min_alpha = bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2024 
2025    /* Clamp. */
2026    if (bits != 16) {
2027       for (int i = 0; i < 2; i++) {
2028          bool alpha = hi && i == 1;
2029          args[i] = ac_build_imin(ctx, args[i], alpha ? max_alpha : max_rgb);
2030          args[i] = ac_build_imax(ctx, args[i], alpha ? min_alpha : min_rgb);
2031       }
2032    }
2033 
2034    LLVMValueRef res =
2035       ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", ctx->v2i16, args, 2, 0);
2036    return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2037 }
2038 
2039 /* The 8-bit and 10-bit clamping is for HW workarounds. */
ac_build_cvt_pk_u16(struct ac_llvm_context * ctx,LLVMValueRef args[2],unsigned bits,bool hi)2040 LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
2041                                  bool hi)
2042 {
2043    assert(bits == 8 || bits == 10 || bits == 16);
2044 
2045    LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
2046    LLVMValueRef max_alpha = bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2047 
2048    /* Clamp. */
2049    if (bits != 16) {
2050       for (int i = 0; i < 2; i++) {
2051          bool alpha = hi && i == 1;
2052          args[i] = ac_build_umin(ctx, args[i], alpha ? max_alpha : max_rgb);
2053       }
2054    }
2055 
2056    LLVMValueRef res =
2057       ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", ctx->v2i16, args, 2, 0);
2058    return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2059 }
2060 
/* Emit llvm.amdgcn.wqm.vote on the boolean \p i1 and return its i1 result. */
LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   LLVMValueRef vote = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, &i1, 1, 0);

   return vote;
}
2065 
/* Emit llvm.amdgcn.kill with \p i1 as its only operand. */
void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   LLVMValueRef operand = i1;

   ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, &operand, 1, 0);
}
2070 
/* Emit a 32-bit bitfield extract of \p width bits starting at \p offset from
 * \p input, using llvm.amdgcn.sbfe.i32 when \p is_signed is set and
 * llvm.amdgcn.ubfe.i32 otherwise.
 */
LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, LLVMValueRef offset,
                          LLVMValueRef width, bool is_signed)
{
   const char *intr_name;
   if (is_signed)
      intr_name = "llvm.amdgcn.sbfe.i32";
   else
      intr_name = "llvm.amdgcn.ubfe.i32";

   LLVMValueRef operands[3] = {input, offset, width};

   return ac_build_intrinsic(ctx, intr_name, ctx->i32, operands, 3, 0);
}
2083 
/* Integer multiply-add: builds s0 * s1 and then adds \p s2. */
LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                           LLVMValueRef s2)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef product = LLVMBuildMul(builder, s0, s1, "");

   return LLVMBuildAdd(builder, product, s2, "");
}
2089 
/* Floating-point multiply-add: s0 * s1 + s2.
 *
 * On GFX10 and newer this emits the llvm.fma intrinsic, because FMA is better
 * there (the hardware has FMA units instead of MUL-ADD units).  The intrinsic
 * overload and return type are derived from the type of \p s0, so the FMA
 * path accepts the same operand types as the generic path; f32 operands
 * produce "llvm.fma.f32" exactly as before.
 *
 * On older chips a separate multiply and add are built.
 */
LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                           LLVMValueRef s2)
{
   /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */
   if (ctx->gfx_level >= GFX10) {
      LLVMTypeRef type = LLVMTypeOf(s0);
      LLVMValueRef args[3] = {s0, s1, s2};
      char name[64], type_name[64];

      ac_build_type_name_for_intr(type, type_name, sizeof(type_name));
      snprintf(name, sizeof(name), "llvm.fma.%s", type_name);
      return ac_build_intrinsic(ctx, name, type, args, 3, 0);
   }

   return LLVMBuildFAdd(ctx->builder, LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, "");
}
2099 
ac_build_waitcnt(struct ac_llvm_context * ctx,unsigned wait_flags)2100 void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
2101 {
2102    if (!wait_flags)
2103       return;
2104 
2105    if (ctx->gfx_level >= GFX12) {
2106       if (wait_flags & AC_WAIT_DS)
2107          ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.dscnt", ctx->voidt, &ctx->i16_0, 1, 0);
2108       if (wait_flags & AC_WAIT_KM)
2109          ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.kmcnt", ctx->voidt, &ctx->i16_0, 1, 0);
2110       if (wait_flags & AC_WAIT_EXP)
2111          ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.expcnt", ctx->voidt, &ctx->i16_0, 1, 0);
2112       if (wait_flags & AC_WAIT_LOAD)
2113          ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.loadcnt", ctx->voidt, &ctx->i16_0, 1, 0);
2114       if (wait_flags & AC_WAIT_STORE)
2115          ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.storecnt", ctx->voidt, &ctx->i16_0, 1, 0);
2116       if (wait_flags & AC_WAIT_SAMPLE)
2117          ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.samplecnt", ctx->voidt, &ctx->i16_0, 1, 0);
2118       if (wait_flags & AC_WAIT_BVH)
2119          ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.bvhcnt", ctx->voidt, &ctx->i16_0, 1, 0);
2120    } else {
2121       unsigned expcnt = 7;
2122       unsigned lgkmcnt = 63;
2123       unsigned vmcnt = ctx->gfx_level >= GFX9 ? 63 : 15;
2124       unsigned vscnt = 63;
2125 
2126       if (wait_flags & AC_WAIT_EXP)
2127          expcnt = 0;
2128       if (wait_flags & (AC_WAIT_DS | AC_WAIT_KM))
2129          lgkmcnt = 0;
2130       if (wait_flags & (AC_WAIT_LOAD | AC_WAIT_SAMPLE | AC_WAIT_BVH))
2131          vmcnt = 0;
2132 
2133       if (wait_flags & AC_WAIT_STORE) {
2134          if (ctx->gfx_level >= GFX10)
2135             vscnt = 0;
2136          else
2137             vmcnt = 0;
2138       }
2139 
2140       /* There is no intrinsic for vscnt(0), so use a fence. It waits for everything except expcnt. */
2141       if (vscnt == 0) {
2142          assert(!(wait_flags & AC_WAIT_EXP));
2143          LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
2144          return;
2145       }
2146 
2147       unsigned simm16;
2148 
2149       if (ctx->gfx_level >= GFX11)
2150          simm16 = expcnt | (lgkmcnt << 4) | (vmcnt << 10);
2151       else
2152          simm16 = (lgkmcnt << 8) | (expcnt << 4) | (vmcnt & 0xf) | ((vmcnt >> 4) << 14);
2153 
2154       LLVMValueRef args[1] = {
2155          LLVMConstInt(ctx->i32, simm16, false),
2156       };
2157       ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0);
2158    }
2159 }
2160 
/* Saturate a floating-point value to the range [0, 1].
 *
 * @param ctx   LLVM build context
 * @param src   value to saturate
 * @param type  LLVM type of src; used to create the 0.0 and 1.0 constants and
 *              to determine the element bit size
 * @return the saturated value
 *
 * 64-bit values, 16-bit values on GFX8 and older, and v2f16 values are clamped
 * with fmax/fmin. Everything else uses the fmed3 intrinsic, i.e.
 * fmed3(0, 1, src).
 */
LLVMValueRef ac_build_fsat(struct ac_llvm_context *ctx, LLVMValueRef src,
                           LLVMTypeRef type)
{
   unsigned bitsize = ac_get_elem_bits(ctx, type);
   LLVMValueRef zero = LLVMConstReal(type, 0.0);
   LLVMValueRef one = LLVMConstReal(type, 1.0);
   LLVMValueRef result;

   if (bitsize == 64 || (bitsize == 16 && ctx->gfx_level <= GFX8) || type == ctx->v2f16) {
      /* Use fmin/fmax for 64-bit fsat or 16-bit on GFX6-GFX8 because LLVM
       * doesn't expose an intrinsic.
       */
      result = ac_build_fmin(ctx, ac_build_fmax(ctx, src, zero), one);
   } else {
      /* Named distinctly from the "type" parameter so that it is not shadowed. */
      LLVMTypeRef intr_type;
      const char *intr;

      if (bitsize == 16) {
         intr = "llvm.amdgcn.fmed3.f16";
         intr_type = ctx->f16;
      } else {
         assert(bitsize == 32);
         intr = "llvm.amdgcn.fmed3.f32";
         intr_type = ctx->f32;
      }

      LLVMValueRef params[] = {
         zero,
         one,
         src,
      };

      result = ac_build_intrinsic(ctx, intr, intr_type, params, 3, 0);
   }

   if (ctx->gfx_level < GFX9 && bitsize == 32) {
      /* Only pre-GFX9 chips do not flush denorms. */
      result = ac_build_canonicalize(ctx, result, bitsize);
   }

   return result;
}
2203 
/* Call the "llvm.amdgcn.fract" intrinsic on src0.
 *
 * bitsize selects the f16 (16) or f32 (32) variant; any other value selects
 * the f64 variant.
 */
LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   const char *intrin;
   LLVMTypeRef result_type;

   switch (bitsize) {
   case 16:
      intrin = "llvm.amdgcn.fract.f16";
      result_type = ctx->f16;
      break;
   case 32:
      intrin = "llvm.amdgcn.fract.f32";
      result_type = ctx->f32;
      break;
   default:
      intrin = "llvm.amdgcn.fract.f64";
      result_type = ctx->f64;
      break;
   }

   LLVMValueRef operands[] = {src0};

   return ac_build_intrinsic(ctx, intrin, result_type, operands, 1, 0);
}
2225 
ac_const_uint_vec(struct ac_llvm_context * ctx,LLVMTypeRef type,uint64_t value)2226 LLVMValueRef ac_const_uint_vec(struct ac_llvm_context *ctx, LLVMTypeRef type, uint64_t value)
2227 {
2228 
2229    if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
2230       LLVMValueRef scalar = LLVMConstInt(LLVMGetElementType(type), value, 0);
2231       unsigned vec_size = LLVMGetVectorSize(type);
2232       LLVMValueRef *scalars = alloca(vec_size * sizeof(LLVMValueRef));
2233 
2234       for (unsigned i = 0; i < vec_size; i++)
2235          scalars[i] = scalar;
2236       return LLVMConstVector(scalars, vec_size);
2237    }
2238    return LLVMConstInt(type, value, 0);
2239 }
2240 
/* Integer sign: clamps src0 to the range [-1, 1] using imax followed by imin.
 * The constants are created with the same type as src0.
 */
LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   LLVMTypeRef type = LLVMTypeOf(src0);
   LLVMValueRef minus_one = ac_const_uint_vec(ctx, type, -1);
   LLVMValueRef plus_one = ac_const_uint_vec(ctx, type, 1);

   /* v_med3 is selected only when max is first. (LLVM bug?) */
   LLVMValueRef clamped_low = ac_build_imax(ctx, src0, minus_one);

   return ac_build_imin(ctx, clamped_low, plus_one);
}
2250 
/* Return val + 0, built between ac_enable_signed_zeros() and
 * ac_disable_signed_zeros(). The addition converts negative zero to positive
 * zero.
 */
static LLVMValueRef ac_eliminate_negative_zero(struct ac_llvm_context *ctx, LLVMValueRef val)
{
   LLVMValueRef sum;

   ac_enable_signed_zeros(ctx);
   sum = LLVMBuildFAdd(ctx->builder, val, LLVMConstNull(LLVMTypeOf(val)), "");
   ac_disable_signed_zeros(ctx);

   return sum;
}
2259 
/* Floating-point sign of src: the result has the same type as src for 16-bit
 * and 32-bit inputs and is an f64 for 64-bit inputs.
 */
LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   LLVMTypeRef type = LLVMTypeOf(src);
   unsigned bitsize = ac_get_elem_bits(ctx, type);

   /* The standard version leads to this:
    *   v_cmp_ngt_f32_e64 s[0:1], s4, 0                       ; D40B0000 00010004
    *   v_cndmask_b32_e64 v4, 1.0, s4, s[0:1]                 ; D5010004 000008F2
    *   v_cmp_le_f32_e32 vcc, 0, v4                           ; 7C060880
    *   v_cndmask_b32_e32 v4, -1.0, v4, vcc                   ; 020808F3
    *
    * The isign version:
    *   v_add_f32_e64 v4, s4, 0                               ; D5030004 00010004
    *   v_med3_i32 v4, v4, -1, 1                              ; D5580004 02058304
    *   v_cvt_f32_i32_e32 v4, v4                              ; 7E080B04
    *
    * (src0 + 0) converts negative zero to positive zero.
    * After that, int(fsign(x)) == isign(floatBitsToInt(x)).
    *
    * For FP64, use the standard version, which doesn't suffer from the huge DP rate
    * reduction. (FP64 comparisons are as fast as int64 comparisons)
    */
   if (bitsize != 16 && bitsize != 32) {
      assert(bitsize == 64);

      LLVMValueRef is_pos = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src, ctx->f64_0, "");
      LLVMValueRef is_neg = LLVMBuildFCmp(ctx->builder, LLVMRealOLT, src, ctx->f64_0, "");

      /* Only the high dword differs between +1.0 (0x3FF00000), -1.0 (0xBFF00000)
       * and 0.0; the low dword is always zero.
       */
      LLVMValueRef hi_if_not_pos = LLVMBuildSelect(
         ctx->builder, is_neg, LLVMConstInt(ctx->i32, 0xBFF00000, 0), ctx->i32_0, "");
      LLVMValueRef hi = LLVMBuildSelect(ctx->builder, is_pos,
                                        LLVMConstInt(ctx->i32, 0x3FF00000, 0), hi_if_not_pos, "");
      LLVMValueRef dwords[2] = {ctx->i32_0, hi};

      return LLVMBuildBitCast(ctx->builder, ac_build_gather_values(ctx, dwords, 2), ctx->f64, "");
   }

   /* 16-bit and 32-bit: integer sign of the bit pattern, converted back to FP. */
   LLVMValueRef bits = ac_to_integer(ctx, ac_eliminate_negative_zero(ctx, src));
   LLVMValueRef sign = ac_build_isign(ctx, bits);

   return LLVMBuildSIToFP(ctx->builder, sign, type, "");
}
2299 
/* Population count of src0, returned as an i32.
 *
 * The "llvm.ctpop" variant matching the element bit size of src0 (8, 16, 32,
 * 64 or 128) is called, and its result is truncated or zero-extended to i32.
 */
LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *intrin;
   LLVMTypeRef intrin_type;

   switch (bitsize) {
   case 128:
      intrin = "llvm.ctpop.i128";
      intrin_type = ctx->i128;
      break;
   case 64:
      intrin = "llvm.ctpop.i64";
      intrin_type = ctx->i64;
      break;
   case 32:
      intrin = "llvm.ctpop.i32";
      intrin_type = ctx->i32;
      break;
   case 16:
      intrin = "llvm.ctpop.i16";
      intrin_type = ctx->i16;
      break;
   case 8:
      intrin = "llvm.ctpop.i8";
      intrin_type = ctx->i8;
      break;
   default:
      unreachable("invalid bitsize");
      break;
   }

   LLVMValueRef operands[] = {src0};
   LLVMValueRef count = ac_build_intrinsic(ctx, intrin, intrin_type, operands, 1, 0);

   /* Normalize the result width to 32 bits. */
   if (bitsize > 32)
      count = LLVMBuildTrunc(ctx->builder, count, ctx->i32, "");
   else if (bitsize < 32)
      count = LLVMBuildZExt(ctx->builder, count, ctx->i32, "");

   return count;
}
2334 
/* Reverse the bits of src0 and return the result as an i32.
 *
 * The "llvm.bitreverse" variant matching the element bit size of src0 (8, 16,
 * 32 or 64) is called. A 64-bit result is truncated to i32; 8-bit and 16-bit
 * results are zero-extended to i32.
 */
LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *intrin;
   LLVMTypeRef intrin_type;

   switch (bitsize) {
   case 64:
      intrin = "llvm.bitreverse.i64";
      intrin_type = ctx->i64;
      break;
   case 32:
      intrin = "llvm.bitreverse.i32";
      intrin_type = ctx->i32;
      break;
   case 16:
      intrin = "llvm.bitreverse.i16";
      intrin_type = ctx->i16;
      break;
   case 8:
      intrin = "llvm.bitreverse.i8";
      intrin_type = ctx->i8;
      break;
   default:
      unreachable("invalid bitsize");
      break;
   }

   LLVMValueRef operands[] = {src0};
   LLVMValueRef reversed = ac_build_intrinsic(ctx, intrin, intrin_type, operands, 1, 0);

   /* Normalize the result width to 32 bits. */
   if (bitsize > 32)
      reversed = LLVMBuildTrunc(ctx->builder, reversed, ctx->i32, "");
   else if (bitsize < 32)
      reversed = LLVMBuildZExt(ctx->builder, reversed, ctx->i32, "");

   return reversed;
}
2365 
/* Call the "llvm.amdgcn.sudot4" intrinsic and return its i32 result.
 *
 * The six operands are, in order: an i1 built from bit 0 of neg_lo, s0, an i1
 * built from bit 1 of neg_lo, s1, s2, and an i1 built from clamp.
 */
LLVMValueRef ac_build_sudot_4x8(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                                LLVMValueRef s2, bool clamp, unsigned neg_lo)
{
   LLVMValueRef operands[6] = {
      LLVMConstInt(ctx->i1, (neg_lo & 0x1) != 0, false),
      s0,
      LLVMConstInt(ctx->i1, (neg_lo & 0x2) != 0, false),
      s1,
      s2,
      LLVMConstInt(ctx->i1, clamp, false),
   };

   return ac_build_intrinsic(ctx, "llvm.amdgcn.sudot4", ctx->i32, operands, 6, 0);
}
2381 
ac_init_exec_full_mask(struct ac_llvm_context * ctx)2382 void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
2383 {
2384    LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
2385    ac_build_intrinsic(ctx, "llvm.amdgcn.init.exec", ctx->voidt, &full_mask, 1, 0);
2386 }
2387 
ac_declare_lds_as_pointer(struct ac_llvm_context * ctx)2388 void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
2389 {
2390    unsigned lds_size = ctx->gfx_level >= GFX7 ? 65536 : 32768;
2391    LLVMTypeRef type = LLVMArrayType(ctx->i32, lds_size / 4);
2392    ctx->lds = (struct ac_llvm_pointer) {
2393       .value = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0,
2394                   LLVMPointerType(type, AC_ADDR_SPACE_LDS), "lds"),
2395       .pointee_type = type
2396    };
2397 }
2398 
/* Load one i32 from ctx->lds at the element selected by dw_addr
 * (addressed through ac_build_gep0).
 */
LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, LLVMValueRef dw_addr)
{
   LLVMValueRef elem_ptr = ac_build_gep0(ctx, ctx->lds, dw_addr);

   return LLVMBuildLoad2(ctx->builder, ctx->i32, elem_ptr, "");
}
2404 
/* Store value into ctx->lds at index dw_addr. The value is passed through
 * ac_to_integer() before being stored.
 */
void ac_lds_store(struct ac_llvm_context *ctx, LLVMValueRef dw_addr, LLVMValueRef value)
{
   LLVMValueRef int_value = ac_to_integer(ctx, value);

   ac_build_indexed_store(ctx, ctx->lds, dw_addr, int_value);
}
2410 
/* Find the index of the least significant set bit of src0.
 *
 * Returns an i32 holding the index, or -1 when src0 is zero. The element bit
 * size of src0 must be 8, 16, 32 or 64. dst_type is accepted but is not
 * referenced by this implementation.
 */
LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMTypeRef dst_type, LLVMValueRef src0)
{
   const unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *cttz_name;
   LLVMTypeRef cttz_type;
   LLVMValueRef typed_zero;

   switch (bitsize) {
   case 64:
      cttz_name = "llvm.cttz.i64";
      cttz_type = ctx->i64;
      typed_zero = ctx->i64_0;
      break;
   case 32:
      cttz_name = "llvm.cttz.i32";
      cttz_type = ctx->i32;
      typed_zero = ctx->i32_0;
      break;
   case 16:
      cttz_name = "llvm.cttz.i16";
      cttz_type = ctx->i16;
      typed_zero = ctx->i16_0;
      break;
   case 8:
      cttz_name = "llvm.cttz.i8";
      cttz_type = ctx->i8;
      typed_zero = ctx->i8_0;
      break;
   default:
      unreachable("invalid bitsize");
   }

   /* The second operand, set to 1, means that ffs(x=0) = undef, so LLVM won't
    * add special code to check for x=0. The reason is that the LLVM behavior
    * for x=0 is different from what we need here. However, LLVM also assumes
    * that ffs(x) is in [0, 31], but GLSL expects that ffs(0) = -1, so a
    * conditional assignment to handle 0 is still required.
    *
    * The hardware already implements the correct behavior.
    */
   LLVMValueRef operands[2] = {src0, ctx->i1true};
   LLVMValueRef index = ac_build_intrinsic(ctx, cttz_name, cttz_type, operands, 2, 0);

   /* Bring the index to 32 bits. */
   if (bitsize == 64)
      index = LLVMBuildTrunc(ctx->builder, index, ctx->i32, "");
   else if (bitsize < 32)
      index = LLVMBuildSExt(ctx->builder, index, ctx->i32, "");

   /* TODO: We need an intrinsic to skip this conditional. */
   /* Check for zero: */
   LLVMValueRef is_zero = LLVMBuildICmp(ctx->builder, LLVMIntEQ, src0, typed_zero, "");

   return LLVMBuildSelect(ctx->builder, is_zero, LLVMConstInt(ctx->i32, -1, 0), index, "");
}
2471 
ac_arg_type_to_pointee_type(struct ac_llvm_context * ctx,enum ac_arg_type type)2472 LLVMTypeRef ac_arg_type_to_pointee_type(struct ac_llvm_context *ctx, enum ac_arg_type type) {
2473    switch (type) {
2474    case AC_ARG_CONST_PTR:
2475       return ctx->i8;
2476       break;
2477    case AC_ARG_CONST_FLOAT_PTR:
2478       return ctx->f32;
2479       break;
2480    case AC_ARG_CONST_PTR_PTR:
2481       return ac_array_in_const32_addr_space(ctx->i8);
2482       break;
2483    case AC_ARG_CONST_DESC_PTR:
2484       return ctx->v4i32;
2485       break;
2486    case AC_ARG_CONST_IMAGE_PTR:
2487       return ctx->v8i32;
2488    default:
2489       /* Other ac_arg_type values aren't pointers. */
2490       assert(false);
2491       return NULL;
2492    }
2493 }
2494 
/* Return the pointer type to elem_type in address space AC_ADDR_SPACE_CONST. */
LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
{
   LLVMTypeRef ptr_type = LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST);

   return ptr_type;
}
2499 
/* Return the pointer type to elem_type in address space
 * AC_ADDR_SPACE_CONST_32BIT.
 */
LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
{
   LLVMTypeRef ptr_type = LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT);

   return ptr_type;
}
2504 
get_current_flow(struct ac_llvm_context * ctx)2505 static struct ac_llvm_flow *get_current_flow(struct ac_llvm_context *ctx)
2506 {
2507    if (ctx->flow->depth > 0)
2508       return &ctx->flow->stack[ctx->flow->depth - 1];
2509    return NULL;
2510 }
2511 
get_innermost_loop(struct ac_llvm_context * ctx)2512 static struct ac_llvm_flow *get_innermost_loop(struct ac_llvm_context *ctx)
2513 {
2514    for (unsigned i = ctx->flow->depth; i > 0; --i) {
2515       if (ctx->flow->stack[i - 1].loop_entry_block)
2516          return &ctx->flow->stack[i - 1];
2517    }
2518    return NULL;
2519 }
2520 
push_flow(struct ac_llvm_context * ctx)2521 static struct ac_llvm_flow *push_flow(struct ac_llvm_context *ctx)
2522 {
2523    struct ac_llvm_flow *flow;
2524 
2525    if (ctx->flow->depth >= ctx->flow->depth_max) {
2526       unsigned new_max = MAX2(ctx->flow->depth << 1, AC_LLVM_INITIAL_CF_DEPTH);
2527 
2528       ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack));
2529       ctx->flow->depth_max = new_max;
2530    }
2531 
2532    flow = &ctx->flow->stack[ctx->flow->depth];
2533    ctx->flow->depth++;
2534 
2535    flow->next_block = NULL;
2536    flow->loop_entry_block = NULL;
2537    return flow;
2538 }
2539 
/* Name the basic block "<base><label_id>". The name is formatted into a
 * 32-byte buffer, so longer names are truncated by snprintf.
 */
static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, int label_id)
{
   char name[32];
   LLVMValueRef bb_value = LLVMBasicBlockAsValue(bb);

   snprintf(name, sizeof(name), "%s%d", base, label_id);
   LLVMSetValueName(bb_value, name);
}
2546 
2547 /* Append a basic block at the level of the parent flow.
2548  */
append_basic_block(struct ac_llvm_context * ctx,const char * name)2549 static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, const char *name)
2550 {
2551    assert(ctx->flow->depth >= 1);
2552 
2553    if (ctx->flow->depth >= 2) {
2554       struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2];
2555 
2556       return LLVMInsertBasicBlockInContext(ctx->context, flow->next_block, name);
2557    }
2558 
2559    LLVMValueRef main_fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
2560    return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
2561 }
2562 
2563 /* Emit a branch to the given default target for the current block if
2564  * applicable -- that is, if the current block does not already contain a
2565  * branch from a break or continue.
2566  */
emit_default_branch(LLVMBuilderRef builder,LLVMBasicBlockRef target)2567 static void emit_default_branch(LLVMBuilderRef builder, LLVMBasicBlockRef target)
2568 {
2569    if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
2570       LLVMBuildBr(builder, target);
2571 }
2572 
ac_build_bgnloop(struct ac_llvm_context * ctx,int label_id)2573 void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
2574 {
2575    struct ac_llvm_flow *flow = push_flow(ctx);
2576    flow->loop_entry_block = append_basic_block(ctx, "LOOP");
2577    flow->next_block = append_basic_block(ctx, "ENDLOOP");
2578    set_basicblock_name(flow->loop_entry_block, "loop", label_id);
2579    LLVMBuildBr(ctx->builder, flow->loop_entry_block);
2580    LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
2581 }
2582 
ac_build_break(struct ac_llvm_context * ctx)2583 void ac_build_break(struct ac_llvm_context *ctx)
2584 {
2585    struct ac_llvm_flow *flow = get_innermost_loop(ctx);
2586    LLVMBuildBr(ctx->builder, flow->next_block);
2587 }
2588 
ac_build_continue(struct ac_llvm_context * ctx)2589 void ac_build_continue(struct ac_llvm_context *ctx)
2590 {
2591    struct ac_llvm_flow *flow = get_innermost_loop(ctx);
2592    LLVMBuildBr(ctx->builder, flow->loop_entry_block);
2593 }
2594 
ac_build_else(struct ac_llvm_context * ctx,int label_id)2595 void ac_build_else(struct ac_llvm_context *ctx, int label_id)
2596 {
2597    struct ac_llvm_flow *current_branch = get_current_flow(ctx);
2598    LLVMBasicBlockRef endif_block;
2599 
2600    assert(!current_branch->loop_entry_block);
2601 
2602    endif_block = append_basic_block(ctx, "ENDIF");
2603    emit_default_branch(ctx->builder, endif_block);
2604 
2605    LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
2606    set_basicblock_name(current_branch->next_block, "else", label_id);
2607 
2608    current_branch->next_block = endif_block;
2609 }
2610 
ac_build_endif(struct ac_llvm_context * ctx,int label_id)2611 void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
2612 {
2613    struct ac_llvm_flow *current_branch = get_current_flow(ctx);
2614 
2615    assert(!current_branch->loop_entry_block);
2616 
2617    emit_default_branch(ctx->builder, current_branch->next_block);
2618    LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
2619    set_basicblock_name(current_branch->next_block, "endif", label_id);
2620 
2621    ctx->flow->depth--;
2622 }
2623 
ac_build_endloop(struct ac_llvm_context * ctx,int label_id)2624 void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
2625 {
2626    struct ac_llvm_flow *current_loop = get_current_flow(ctx);
2627 
2628    assert(current_loop->loop_entry_block);
2629 
2630    emit_default_branch(ctx->builder, current_loop->loop_entry_block);
2631 
2632    LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
2633    set_basicblock_name(current_loop->next_block, "endloop", label_id);
2634    ctx->flow->depth--;
2635 }
2636 
/* Begin an "if" on the i1 condition cond.
 *
 * Pushes a flow entry, creates the "if<label_id>" block and the block that
 * follows it, emits a conditional branch between the two and positions the
 * builder at the end of the "if" block.
 */
void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
{
   struct ac_llvm_flow *branch = push_flow(ctx);

   LLVMBasicBlockRef then_block = append_basic_block(ctx, "IF");
   LLVMBasicBlockRef else_block = append_basic_block(ctx, "ELSE");
   branch->next_block = else_block;

   set_basicblock_name(then_block, "if", label_id);
   LLVMBuildCondBr(ctx->builder, cond, then_block, else_block);
   LLVMPositionBuilderAtEnd(ctx->builder, then_block);
}
2648 
ac_build_alloca_undef(struct ac_llvm_context * ac,LLVMTypeRef type,const char * name)2649 LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
2650 {
2651    LLVMBuilderRef builder = ac->builder;
2652    LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
2653    LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
2654    LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
2655    LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
2656    LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
2657    LLVMValueRef res;
2658 
2659    if (first_instr) {
2660       LLVMPositionBuilderBefore(first_builder, first_instr);
2661    } else {
2662       LLVMPositionBuilderAtEnd(first_builder, first_block);
2663    }
2664 
2665    res = LLVMBuildAlloca(first_builder, type, name);
2666    LLVMDisposeBuilder(first_builder);
2667    return res;
2668 }
2669 
ac_build_alloca(struct ac_llvm_context * ac,LLVMTypeRef type,const char * name)2670 LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
2671 {
2672    LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);
2673    LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);
2674    return ptr;
2675 }
2676 
/* Create an alloca with the type of val (see ac_build_alloca_undef()) and
 * store val into it at the current insert position.
 */
LLVMValueRef ac_build_alloca_init(struct ac_llvm_context *ac, LLVMValueRef val, const char *name)
{
   LLVMTypeRef val_type = LLVMTypeOf(val);
   LLVMValueRef slot = ac_build_alloca_undef(ac, val_type, name);

   LLVMBuildStore(ac->builder, val, slot);
   return slot;
}
2683 
/* Bitcast ptr to a pointer to the given type, keeping the address space of
 * ptr.
 */
LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMTypeRef type)
{
   int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
   LLVMTypeRef dst_ptr_type = LLVMPointerType(type, addr_space);

   return LLVMBuildBitCast(ctx->builder, ptr, dst_ptr_type, "");
}
2689 
/* Return the first count components of value.
 *
 * - If value already has exactly count components, it is returned unchanged.
 * - If count is 1, component 0 is extracted as a scalar.
 * - Otherwise a shuffle selecting components 0..count-1 is built.
 */
LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned count)
{
   if (ac_get_llvm_num_components(value) == count)
      return value;

   if (count == 1)
      return LLVMBuildExtractElement(ctx->builder, value, ctx->i32_0, "");

   /* At least two slots are allocated because the first two indices are
    * always written.
    */
   LLVMValueRef *const indices = alloca(MAX2(count, 2) * sizeof(LLVMValueRef));

   indices[0] = ctx->i32_0;
   indices[1] = ctx->i32_1;
   for (unsigned i = 2; i < count; i++)
      indices[i] = LLVMConstInt(ctx->i32, i, false);

   LLVMValueRef shuffle_mask = LLVMConstVector(indices, count);

   return LLVMBuildShuffleVector(ctx->builder, value, value, shuffle_mask, "");
}
2708 
2709 /* If param is i64 and bitwidth <= 32, the return value will be i32. */
ac_unpack_param(struct ac_llvm_context * ctx,LLVMValueRef param,unsigned rshift,unsigned bitwidth)2710 LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, unsigned rshift,
2711                              unsigned bitwidth)
2712 {
2713    LLVMValueRef value = param;
2714    if (rshift)
2715       value = LLVMBuildLShr(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), rshift, false), "");
2716 
2717    if (rshift + bitwidth < 32) {
2718       uint64_t mask = (1ull << bitwidth) - 1;
2719       value = LLVMBuildAnd(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), mask, false), "");
2720    }
2721 
2722    if (bitwidth <= 32 && LLVMTypeOf(param) == ctx->i64)
2723       value = LLVMBuildTrunc(ctx->builder, value, ctx->i32, "");
2724    return value;
2725 }
2726 
/* Read a value of at most 32 bits from another lane.
 *
 * src (and lane, when given) are zero-extended to i32. With lane == NULL the
 * "llvm.amdgcn.readfirstlane" intrinsic is used, otherwise
 * "llvm.amdgcn.readlane". The i32 result is truncated back to the original
 * type of src. When with_opt_barrier is set, an optimization barrier is
 * applied to src first.
 */
static LLVMValueRef _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src,
                                       LLVMValueRef lane, bool with_opt_barrier)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   LLVMValueRef lane_value;

   if (with_opt_barrier)
      ac_build_optimization_barrier(ctx, &src, false);

   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   if (lane == NULL) {
      LLVMValueRef operands[] = {src};

      lane_value = ac_build_intrinsic(ctx, "llvm.amdgcn.readfirstlane", ctx->i32, operands, 1, 0);
   } else {
      lane = LLVMBuildZExt(ctx->builder, lane, ctx->i32, "");

      LLVMValueRef operands[] = {src, lane};

      lane_value = ac_build_intrinsic(ctx, "llvm.amdgcn.readlane", ctx->i32, operands, 2, 0);
   }

   return LLVMBuildTrunc(ctx->builder, lane_value, orig_type, "");
}
2746 
/* Read src from another lane, for any scalar type.
 *
 * src is converted to an integer with ac_to_integer(). Values of up to 32 bits
 * are read with a single _ac_build_readlane() call. Wider values (whose width
 * must be a multiple of 32) are split into a vector of i32, read one dword at
 * a time and reassembled. The result is converted back to the original type of
 * src.
 *
 * @param ctx               LLVM build context
 * @param src               value to read
 * @param lane              lane id, or NULL for the first active lane
 * @param with_opt_barrier  forwarded to _ac_build_readlane()
 * @return the value of src in the selected lane, with the type of src
 */
static LLVMValueRef ac_build_readlane_common(struct ac_llvm_context *ctx, LLVMValueRef src,
                                             LLVMValueRef lane, bool with_opt_barrier)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   src = ac_to_integer(ctx, src);
   LLVMTypeRef int_type = LLVMTypeOf(src);
   unsigned bits = LLVMGetIntTypeWidth(int_type);
   LLVMValueRef ret;

   if (bits > 32) {
      assert(bits % 32 == 0);
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
      LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
      ret = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < bits / 32; i++) {
         LLVMValueRef index = LLVMConstInt(ctx->i32, i, 0);
         LLVMValueRef comp = LLVMBuildExtractElement(ctx->builder, src_vector, index, "");

         comp = _ac_build_readlane(ctx, comp, lane, with_opt_barrier);

         ret = LLVMBuildInsertElement(ctx->builder, ret, comp, index, "");
      }
   } else {
      ret = _ac_build_readlane(ctx, src, lane, with_opt_barrier);
   }

   if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind) {
      /* inttoptr needs a scalar integer operand to produce a scalar pointer,
       * so a result assembled as a vector of dwords is first cast back to the
       * integer type of the converted source.
       */
      if (bits > 32)
         ret = LLVMBuildBitCast(ctx->builder, ret, int_type, "");
      return LLVMBuildIntToPtr(ctx->builder, ret, src_type, "");
   }
   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
2778 
2779 /**
2780  * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
2781  *
2782  * The optimization barrier is not needed if the value is the same in all lanes
2783  * or if this is called in the outermost block.
2784  *
2785  * @param ctx
2786  * @param src
2787  * @param lane - id of the lane or NULL for the first active lane
2788  * @return value of the lane
2789  */
ac_build_readlane_no_opt_barrier(struct ac_llvm_context * ctx,LLVMValueRef src,LLVMValueRef lane)2790 LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx, LLVMValueRef src,
2791                                               LLVMValueRef lane)
2792 {
2793    return ac_build_readlane_common(ctx, src, lane, false);
2794 }
2795 
/* Read src from the given lane, or from the first active lane when lane is
 * NULL, with an optimization barrier applied to the source.
 */
LLVMValueRef ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
{
   const bool with_opt_barrier = true;

   return ac_build_readlane_common(ctx, src, lane, with_opt_barrier);
}
2800 
/* Call the "llvm.amdgcn.writelane" intrinsic and return its i32 result.
 * The operands are passed in the order (value, lane, src).
 */
LLVMValueRef ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value,
                                LLVMValueRef lane)
{
   LLVMValueRef operands[] = {value, lane, src};

   return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32, operands, 3, 0);
}
2807 
/* Build mbcnt(mask) + add_src.
 *
 * In wave32 only "llvm.amdgcn.mbcnt.lo" is used (an i64 mask is truncated to
 * i32). Otherwise the mask is split into two dwords and mbcnt.lo is chained
 * into "llvm.amdgcn.mbcnt.hi". With LLVM 16 and newer, add_src is passed
 * directly as the addend operand of mbcnt.lo; older versions pass zero and use
 * a separate add.
 */
LLVMValueRef ac_build_mbcnt_add(struct ac_llvm_context *ctx, LLVMValueRef mask, LLVMValueRef add_src)
{
   const bool add_in_intrinsic = LLVM_VERSION_MAJOR >= 16;
   LLVMValueRef base = add_in_intrinsic ? add_src : ctx->i32_0;
   LLVMValueRef count;

   if (ctx->wave_size != 32) {
      LLVMValueRef halves = LLVMBuildBitCast(ctx->builder, mask, ctx->v2i32, "");
      LLVMValueRef lo = LLVMBuildExtractElement(ctx->builder, halves, ctx->i32_0, "");
      LLVMValueRef hi = LLVMBuildExtractElement(ctx->builder, halves, ctx->i32_1, "");
      LLVMValueRef lo_operands[] = {lo, base};

      count = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, lo_operands, 2, 0);

      LLVMValueRef hi_operands[] = {hi, count};

      count = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, hi_operands, 2, 0);
   } else {
      if (LLVMTypeOf(mask) == ctx->i64)
         mask = LLVMBuildTrunc(ctx->builder, mask, ctx->i32, "");

      LLVMValueRef lo_operands[] = {mask, base};

      count = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, lo_operands, 2, 0);
   }

   /* The [0, wave_size) range is only attached when nothing was added inside
    * the intrinsic.
    */
   if (base == ctx->i32_0)
      ac_set_range_metadata(ctx, count, 0, ctx->wave_size);

   if (!add_in_intrinsic) {
      /* Bug workaround. LLVM always believes the upper bound of mbcnt to be the wave size,
       * regardless of ac_set_range_metadata. Use an extra add instruction to work around it.
       */
      ac_set_range_metadata(ctx, count, 0, ctx->wave_size);
      count = LLVMBuildAdd(ctx->builder, count, add_src, "");
   }

   return count;
}
2842 
/* Build mbcnt(mask) with a zero addend; see ac_build_mbcnt_add(). */
LLVMValueRef ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
{
   LLVMValueRef no_addend = ctx->i32_0;

   return ac_build_mbcnt_add(ctx, mask, no_addend);
}
2847 
/* Values for the dpp_ctrl operand that _ac_build_dpp() passes to
 * llvm.amdgcn.update.dpp.
 *
 * Entries with a leading underscore are base values; helpers such as
 * dpp_quad_perm() and dpp_row_sr() OR a lane selector or shift amount into
 * them. The remaining entries are used as-is.
 */
enum dpp_ctrl
{
   /* Base values (combine with a selector/amount). */
   _dpp_quad_perm = 0x000,
   _dpp_row_sl = 0x100,
   _dpp_row_sr = 0x110,
   _dpp_row_rr = 0x120,

   /* Whole-wavefront shifts/rotates by one lane. */
   dpp_wf_sl1 = 0x130,
   dpp_wf_rl1 = 0x134,
   dpp_wf_sr1 = 0x138,
   dpp_wf_rr1 = 0x13C,

   /* Row mirror and row broadcast modes. */
   dpp_row_mirror = 0x140,
   dpp_row_half_mirror = 0x141,
   dpp_row_bcast15 = 0x142,
   dpp_row_bcast31 = 0x143,
};
2863 
dpp_quad_perm(unsigned lane0,unsigned lane1,unsigned lane2,unsigned lane3)2864 static inline enum dpp_ctrl dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2,
2865                                           unsigned lane3)
2866 {
2867    assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
2868    return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
2869 }
2870 
dpp_row_sr(unsigned amount)2871 static inline enum dpp_ctrl dpp_row_sr(unsigned amount)
2872 {
2873    assert(amount > 0 && amount < 16);
2874    return _dpp_row_sr | amount;
2875 }
2876 
/* Emit a single llvm.amdgcn.update.dpp.i32 call.
 *
 * \p old and \p src are integers of at most 32 bits; both are zero-extended
 * to i32 for the intrinsic and the result is truncated back to the type
 * \p src had on entry.
 */
static LLVMValueRef _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
                                  enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
                                  bool bound_ctrl)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMTypeRef result_type = LLVMTypeOf(src);

   LLVMValueRef old32 = LLVMBuildZExt(builder, old, ctx->i32, "");
   LLVMValueRef src32 = LLVMBuildZExt(builder, src, ctx->i32, "");

   LLVMValueRef args[6] = {
      old32,
      src32,
      LLVMConstInt(ctx->i32, dpp_ctrl, 0),
      LLVMConstInt(ctx->i32, row_mask, 0),
      LLVMConstInt(ctx->i32, bank_mask, 0),
      LLVMConstInt(ctx->i1, bound_ctrl, 0),
   };
   LLVMValueRef wide = ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32", ctx->i32, args, 6, 0);

   return LLVMBuildTrunc(builder, wide, result_type, "");
}
2896 
/* DPP move for values of any supported width.
 *
 * \p src and \p old are converted to integers. Values of up to 32 bits are
 * handled with one update.dpp call; wider values (whose width must be a
 * multiple of 32) are processed one dword at a time. The result is bitcast
 * back to the original type of \p src.
 */
static LLVMValueRef ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
                                 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
                                 bool bound_ctrl)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   src = ac_to_integer(ctx, src);
   old = ac_to_integer(ctx, old);

   const unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));

   if (bits <= 32) {
      LLVMValueRef single =
         _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
      return LLVMBuildBitCast(builder, single, orig_type, "");
   }

   assert(bits % 32 == 0);
   const unsigned num_dwords = bits / 32;
   LLVMTypeRef dwords_type = LLVMVectorType(ctx->i32, num_dwords);
   LLVMValueRef src_dwords = LLVMBuildBitCast(builder, src, dwords_type, "");
   LLVMValueRef old_dwords = LLVMBuildBitCast(builder, old, dwords_type, "");
   LLVMValueRef merged = LLVMGetUndef(dwords_type);

   for (unsigned dw = 0; dw < num_dwords; dw++) {
      LLVMValueRef src_dw =
         LLVMBuildExtractElement(builder, src_dwords, LLVMConstInt(ctx->i32, dw, 0), "");
      LLVMValueRef old_dw =
         LLVMBuildExtractElement(builder, old_dwords, LLVMConstInt(ctx->i32, dw, 0), "");
      LLVMValueRef moved =
         _ac_build_dpp(ctx, old_dw, src_dw, dpp_ctrl, row_mask, bank_mask, bound_ctrl);

      merged = LLVMBuildInsertElement(builder, merged, moved, LLVMConstInt(ctx->i32, dw, 0), "");
   }

   return LLVMBuildBitCast(builder, merged, orig_type, "");
}
2925 
/* Emit a single llvm.amdgcn.permlane16 / llvm.amdgcn.permlanex16 call.
 *
 * \param src            integer of at most 32 bits; zero-extended to i32 and
 *                       passed as both of the first two operands
 * \param sel            64-bit lane selector; its low and high dwords become
 *                       the third and fourth operands
 * \param exchange_rows  selects the "permlanex16" variant
 * \param bound_ctrl     forwarded as the last operand
 * \return the result truncated back to the type \p src had on entry
 */
static LLVMValueRef _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src,
                                         uint64_t sel, bool exchange_rows, bool bound_ctrl)
{
   LLVMTypeRef type = LLVMTypeOf(src);
   LLVMValueRef result;

   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   /* Split the selector explicitly instead of handing a value wider than
    * 32 bits to LLVMConstInt() for an i32 constant and relying on it being
    * truncated.
    */
   const uint32_t sel_lo = (uint32_t)sel;
   const uint32_t sel_hi = (uint32_t)(sel >> 32);

   LLVMValueRef args[6] = {
      src,
      src,
      LLVMConstInt(ctx->i32, sel_lo, false),
      LLVMConstInt(ctx->i32, sel_hi, false),
      ctx->i1true, /* fi */
      bound_ctrl ? ctx->i1true : ctx->i1false,
   };

   result =
      ac_build_intrinsic(ctx, exchange_rows ? "llvm.amdgcn.permlanex16" : "llvm.amdgcn.permlane16",
                         ctx->i32, args, 6, 0);

   return LLVMBuildTrunc(ctx->builder, result, type, "");
}
2949 
/* permlane16/permlanex16 for values of any supported width.
 *
 * \p src is converted to an integer. Values of up to 32 bits use a single
 * intrinsic call; wider values (whose width must be a multiple of 32) are
 * processed one dword at a time. The result is bitcast back to the original
 * type of \p src.
 */
static LLVMValueRef ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
                                        bool exchange_rows, bool bound_ctrl)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   src = ac_to_integer(ctx, src);
   const unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));

   if (bits <= 32) {
      LLVMValueRef single = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);
      return LLVMBuildBitCast(builder, single, orig_type, "");
   }

   assert(bits % 32 == 0);
   const unsigned num_dwords = bits / 32;
   LLVMTypeRef dwords_type = LLVMVectorType(ctx->i32, num_dwords);
   LLVMValueRef src_dwords = LLVMBuildBitCast(builder, src, dwords_type, "");
   LLVMValueRef merged = LLVMGetUndef(dwords_type);

   for (unsigned dw = 0; dw < num_dwords; dw++) {
      LLVMValueRef src_dw =
         LLVMBuildExtractElement(builder, src_dwords, LLVMConstInt(ctx->i32, dw, 0), "");
      LLVMValueRef moved = _ac_build_permlane16(ctx, src_dw, sel, exchange_rows, bound_ctrl);

      merged = LLVMBuildInsertElement(builder, merged, moved, LLVMConstInt(ctx->i32, dw, 0), "");
   }

   return LLVMBuildBitCast(builder, merged, orig_type, "");
}
2973 
/* Pack three 5-bit masks into a ds_swizzle bit-mode pattern:
 * and_mask in bits 4:0, or_mask in bits 9:5, xor_mask in bits 14:10.
 */
static inline unsigned ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
{
   assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);

   unsigned pattern = xor_mask << 10;
   pattern |= or_mask << 5;
   pattern |= and_mask;
   return pattern;
}
2979 
/* Emit a single llvm.amdgcn.ds.swizzle call on an integer of at most
 * 32 bits: \p src is zero-extended to i32 and the result is truncated back
 * to the type \p src had on entry. \p mask is passed as an i32 constant.
 */
static LLVMValueRef _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
                                         unsigned mask)
{
   LLVMTypeRef result_type = LLVMTypeOf(src);
   LLVMValueRef src32 = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef args[2] = {src32, LLVMConstInt(ctx->i32, mask, 0)};
   LLVMValueRef swizzled = ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", ctx->i32, args, 2, 0);

   return LLVMBuildTrunc(ctx->builder, swizzled, result_type, "");
}
2994 
/* ds_swizzle for values of any supported width.
 *
 * \p src is converted to an integer. Values of up to 32 bits use a single
 * swizzle; wider values (whose width must be a multiple of 32) are swizzled
 * one dword at a time with the same \p mask. The result is bitcast back to
 * the original type of \p src.
 */
LLVMValueRef ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   src = ac_to_integer(ctx, src);
   const unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));

   if (bits <= 32) {
      LLVMValueRef single = _ac_build_ds_swizzle(ctx, src, mask);
      return LLVMBuildBitCast(builder, single, orig_type, "");
   }

   assert(bits % 32 == 0);
   const unsigned num_dwords = bits / 32;
   LLVMTypeRef dwords_type = LLVMVectorType(ctx->i32, num_dwords);
   LLVMValueRef src_dwords = LLVMBuildBitCast(builder, src, dwords_type, "");
   LLVMValueRef merged = LLVMGetUndef(dwords_type);

   for (unsigned dw = 0; dw < num_dwords; dw++) {
      LLVMValueRef src_dw =
         LLVMBuildExtractElement(builder, src_dwords, LLVMConstInt(ctx->i32, dw, 0), "");
      LLVMValueRef swizzled = _ac_build_ds_swizzle(ctx, src_dw, mask);

      merged =
         LLVMBuildInsertElement(builder, merged, swizzled, LLVMConstInt(ctx->i32, dw, 0), "");
   }

   return LLVMBuildBitCast(builder, merged, orig_type, "");
}
3017 
/* Emit the one-operand intrinsic "llvm.amdgcn.<mode>.<type>" on \p src.
 *
 * \p src is converted to an integer first. Values whose element size is
 * below 32 bits are zero-extended to i32 for the call and truncated again
 * afterwards. The result is bitcast back to the original type of \p src.
 */
static LLVMValueRef ac_build_mode(struct ac_llvm_context *ctx, LLVMValueRef src, const char *mode)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   const bool needs_widening = ac_get_elem_bits(ctx, orig_type) < 32;
   char intr_name[32], type_suffix[8];

   LLVMValueRef operand = ac_to_integer(ctx, src);
   if (needs_widening)
      operand = LLVMBuildZExt(builder, operand, ctx->i32, "");

   LLVMTypeRef operand_type = LLVMTypeOf(operand);
   ac_build_type_name_for_intr(operand_type, type_suffix, sizeof(type_suffix));
   snprintf(intr_name, sizeof(intr_name), "llvm.amdgcn.%s.%s", mode, type_suffix);

   LLVMValueRef args[1] = {operand};
   LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, operand_type, args, 1, 0);

   if (needs_widening)
      result = LLVMBuildTrunc(builder, result, ac_to_integer_type(ctx, orig_type), "");

   return LLVMBuildBitCast(builder, result, orig_type, "");
}
3039 
/* Wrap \p src in the "wwm" mode intrinsic (see ac_build_mode()). */
static LLVMValueRef ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   const char *mode = "wwm";

   return ac_build_mode(ctx, src, mode);
}
3044 
/* Wrap \p src in the "wqm" mode intrinsic (see ac_build_mode()). */
LLVMValueRef ac_build_wqm(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   const char *mode = "wqm";

   return ac_build_mode(ctx, src, mode);
}
3049 
/* Emit "llvm.amdgcn.set.inactive.<type>" with operands \p src and
 * \p inactive.
 *
 * Both operands are converted to integers, and zero-extended to i32 when the
 * element size of \p src is below 32 bits.
 *
 * \return an integer value with the same bit width as \p src; callers
 *         bitcast it to the type they need.
 */
static LLVMValueRef ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
                                          LLVMValueRef inactive)
{
   char name[33], type[8];
   LLVMTypeRef src_type = LLVMTypeOf(src);
   const bool needs_widening = ac_get_elem_bits(ctx, src_type) < 32;

   src = ac_to_integer(ctx, src);
   inactive = ac_to_integer(ctx, inactive);

   if (needs_widening) {
      src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
      inactive = LLVMBuildZExt(ctx->builder, inactive, ctx->i32, "");
   }

   ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
   LLVMValueRef ret =
      ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src, inactive}, 2, 0);

   /* Truncate to the integer type matching the source, not to the source
    * type itself: a trunc to a floating-point type (e.g. f16) is not valid,
    * and the >= 32-bit path returns an integer as well.
    */
   if (needs_widening)
      ret = LLVMBuildTrunc(ctx->builder, ret, ac_to_integer_type(ctx, src_type), "");

   return ret;
}
3073 
get_reduction_identity(struct ac_llvm_context * ctx,nir_op op,unsigned type_size)3074 static LLVMValueRef get_reduction_identity(struct ac_llvm_context *ctx, nir_op op,
3075                                            unsigned type_size)
3076 {
3077 
3078    if (type_size == 0) {
3079       switch (op) {
3080       case nir_op_ior:
3081       case nir_op_ixor:
3082          return ctx->i1false;
3083       case nir_op_iand:
3084          return ctx->i1true;
3085       default:
3086          unreachable("bad reduction intrinsic");
3087       }
3088    } else if (type_size == 1) {
3089       switch (op) {
3090       case nir_op_iadd:
3091          return ctx->i8_0;
3092       case nir_op_imul:
3093          return ctx->i8_1;
3094       case nir_op_imin:
3095          return LLVMConstInt(ctx->i8, INT8_MAX, 0);
3096       case nir_op_umin:
3097          return LLVMConstInt(ctx->i8, UINT8_MAX, 0);
3098       case nir_op_imax:
3099          return LLVMConstInt(ctx->i8, INT8_MIN, 0);
3100       case nir_op_umax:
3101          return ctx->i8_0;
3102       case nir_op_iand:
3103          return LLVMConstInt(ctx->i8, -1, 0);
3104       case nir_op_ior:
3105          return ctx->i8_0;
3106       case nir_op_ixor:
3107          return ctx->i8_0;
3108       default:
3109          unreachable("bad reduction intrinsic");
3110       }
3111    } else if (type_size == 2) {
3112       switch (op) {
3113       case nir_op_iadd:
3114          return ctx->i16_0;
3115       case nir_op_fadd:
3116          return ctx->f16_0;
3117       case nir_op_imul:
3118          return ctx->i16_1;
3119       case nir_op_fmul:
3120          return ctx->f16_1;
3121       case nir_op_imin:
3122          return LLVMConstInt(ctx->i16, INT16_MAX, 0);
3123       case nir_op_umin:
3124          return LLVMConstInt(ctx->i16, UINT16_MAX, 0);
3125       case nir_op_fmin:
3126          return LLVMConstReal(ctx->f16, INFINITY);
3127       case nir_op_imax:
3128          return LLVMConstInt(ctx->i16, INT16_MIN, 0);
3129       case nir_op_umax:
3130          return ctx->i16_0;
3131       case nir_op_fmax:
3132          return LLVMConstReal(ctx->f16, -INFINITY);
3133       case nir_op_iand:
3134          return LLVMConstInt(ctx->i16, -1, 0);
3135       case nir_op_ior:
3136          return ctx->i16_0;
3137       case nir_op_ixor:
3138          return ctx->i16_0;
3139       default:
3140          unreachable("bad reduction intrinsic");
3141       }
3142    } else if (type_size == 4) {
3143       switch (op) {
3144       case nir_op_iadd:
3145          return ctx->i32_0;
3146       case nir_op_fadd:
3147          return ctx->f32_0;
3148       case nir_op_imul:
3149          return ctx->i32_1;
3150       case nir_op_fmul:
3151          return ctx->f32_1;
3152       case nir_op_imin:
3153          return LLVMConstInt(ctx->i32, INT32_MAX, 0);
3154       case nir_op_umin:
3155          return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
3156       case nir_op_fmin:
3157          return LLVMConstReal(ctx->f32, INFINITY);
3158       case nir_op_imax:
3159          return LLVMConstInt(ctx->i32, INT32_MIN, 0);
3160       case nir_op_umax:
3161          return ctx->i32_0;
3162       case nir_op_fmax:
3163          return LLVMConstReal(ctx->f32, -INFINITY);
3164       case nir_op_iand:
3165          return LLVMConstInt(ctx->i32, -1, 0);
3166       case nir_op_ior:
3167          return ctx->i32_0;
3168       case nir_op_ixor:
3169          return ctx->i32_0;
3170       default:
3171          unreachable("bad reduction intrinsic");
3172       }
3173    } else { /* type_size == 64bit */
3174       switch (op) {
3175       case nir_op_iadd:
3176          return ctx->i64_0;
3177       case nir_op_fadd:
3178          return ctx->f64_0;
3179       case nir_op_imul:
3180          return ctx->i64_1;
3181       case nir_op_fmul:
3182          return ctx->f64_1;
3183       case nir_op_imin:
3184          return LLVMConstInt(ctx->i64, INT64_MAX, 0);
3185       case nir_op_umin:
3186          return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
3187       case nir_op_fmin:
3188          return LLVMConstReal(ctx->f64, INFINITY);
3189       case nir_op_imax:
3190          return LLVMConstInt(ctx->i64, INT64_MIN, 0);
3191       case nir_op_umax:
3192          return ctx->i64_0;
3193       case nir_op_fmax:
3194          return LLVMConstReal(ctx->f64, -INFINITY);
3195       case nir_op_iand:
3196          return LLVMConstInt(ctx->i64, -1, 0);
3197       case nir_op_ior:
3198          return ctx->i64_0;
3199       case nir_op_ixor:
3200          return ctx->i64_0;
3201       default:
3202          unreachable("bad reduction intrinsic");
3203       }
3204    }
3205 }
3206 
/* Emit the binary operation \p op on \p lhs and \p rhs.
 *
 * Integer min/max are built as compare + select; float min/max use the
 * llvm.minnum / llvm.maxnum intrinsic whose suffix is chosen from the byte
 * size of \p lhs (8 -> f64, 4 -> f32, anything else -> f16).
 */
static LLVMValueRef ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs,
                                    nir_op op)
{
   LLVMBuilderRef builder = ctx->builder;
   const unsigned byte_size = ac_get_type_size(LLVMTypeOf(lhs));

   switch (op) {
   case nir_op_iadd:
      return LLVMBuildAdd(builder, lhs, rhs, "");
   case nir_op_fadd:
      return LLVMBuildFAdd(builder, lhs, rhs, "");
   case nir_op_imul:
      return LLVMBuildMul(builder, lhs, rhs, "");
   case nir_op_fmul:
      return LLVMBuildFMul(builder, lhs, rhs, "");
   case nir_op_iand:
      return LLVMBuildAnd(builder, lhs, rhs, "");
   case nir_op_ior:
      return LLVMBuildOr(builder, lhs, rhs, "");
   case nir_op_ixor:
      return LLVMBuildXor(builder, lhs, rhs, "");

   case nir_op_imin:
   case nir_op_umin:
   case nir_op_imax:
   case nir_op_umax: {
      /* Keep lhs when the comparison holds, otherwise take rhs. */
      LLVMIntPredicate keep_lhs;

      if (op == nir_op_imin)
         keep_lhs = LLVMIntSLT;
      else if (op == nir_op_umin)
         keep_lhs = LLVMIntULT;
      else if (op == nir_op_imax)
         keep_lhs = LLVMIntSGT;
      else
         keep_lhs = LLVMIntUGT;

      LLVMValueRef cond = LLVMBuildICmp(builder, keep_lhs, lhs, rhs, "");
      return LLVMBuildSelect(builder, cond, lhs, rhs, "");
   }

   case nir_op_fmin:
   case nir_op_fmax: {
      const bool is_min = op == nir_op_fmin;
      const char *intr_name;
      LLVMTypeRef float_type;

      if (byte_size == 8) {
         intr_name = is_min ? "llvm.minnum.f64" : "llvm.maxnum.f64";
         float_type = ctx->f64;
      } else if (byte_size == 4) {
         intr_name = is_min ? "llvm.minnum.f32" : "llvm.maxnum.f32";
         float_type = ctx->f32;
      } else {
         intr_name = is_min ? "llvm.minnum.f16" : "llvm.maxnum.f16";
         float_type = ctx->f16;
      }

      LLVMValueRef args[2] = {lhs, rhs};
      return ac_build_intrinsic(ctx, intr_name, float_type, args, 2, 0);
   }

   default:
      unreachable("bad reduction intrinsic");
   }
}
3251 
3252 /**
3253  * \param src The value to shift.
3254  * \param identity The value to use the first lane.
3255  * \param maxprefix specifies that the result only needs to be correct for a
3256  *     prefix of this many threads
3257  * \return src, shifted 1 lane up, and identity shifted into lane 0.
3258  */
ac_wavefront_shift_right_1(struct ac_llvm_context * ctx,LLVMValueRef src,LLVMValueRef identity,unsigned maxprefix)3259 static LLVMValueRef ac_wavefront_shift_right_1(struct ac_llvm_context *ctx, LLVMValueRef src,
3260                                                LLVMValueRef identity, unsigned maxprefix)
3261 {
3262    if (ctx->gfx_level >= GFX10) {
3263       /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */
3264       LLVMValueRef active, tmp1, tmp2;
3265       LLVMValueRef tid = ac_get_thread_id(ctx);
3266 
3267       tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
3268 
3269       tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);
3270 
3271       if (maxprefix > 32) {
3272          active =
3273             LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, false), "");
3274 
3275          tmp2 = LLVMBuildSelect(ctx->builder, active,
3276                                 ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, false)),
3277                                 tmp2, "");
3278 
3279          active = LLVMBuildOr(
3280             ctx->builder, active,
3281             LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3282                           LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, false), ""),
3283                           LLVMConstInt(ctx->i32, 0x10, false), ""),
3284             "");
3285          return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3286       } else if (maxprefix > 16) {
3287          active =
3288             LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 16, false), "");
3289 
3290          return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3291       }
3292    } else if (ctx->gfx_level >= GFX8) {
3293       return ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
3294    }
3295 
3296    /* wavefront shift_right by 1 on SI/CI */
3297    LLVMValueRef active, tmp1, tmp2;
3298    LLVMValueRef tid = ac_get_thread_id(ctx);
3299    tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
3300    tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00));
3301    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3302                           LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""),
3303                           LLVMConstInt(ctx->i32, 0x4, 0), "");
3304    tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3305    tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00));
3306    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3307                           LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""),
3308                           LLVMConstInt(ctx->i32, 0x8, 0), "");
3309    tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3310    tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00));
3311    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
3312                           LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""),
3313                           LLVMConstInt(ctx->i32, 0x10, 0), "");
3314    tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3315    tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0));
3316    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), "");
3317    tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
3318    active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, ctx->i32_0, "");
3319    return LLVMBuildSelect(ctx->builder, active, identity, tmp1, "");
3320 }
3321 
3322 /**
3323  * \param maxprefix specifies that the result only needs to be correct for a
3324  *     prefix of this many threads
3325  */
ac_build_scan(struct ac_llvm_context * ctx,nir_op op,LLVMValueRef src,LLVMValueRef identity,unsigned maxprefix,bool inclusive)3326 static LLVMValueRef ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src,
3327                                   LLVMValueRef identity, unsigned maxprefix, bool inclusive)
3328 {
3329    LLVMValueRef result, tmp;
3330 
3331    if (!inclusive)
3332       src = ac_wavefront_shift_right_1(ctx, src, identity, maxprefix);
3333 
3334    result = src;
3335 
3336    if (ctx->gfx_level <= GFX7) {
3337       assert(maxprefix == 64);
3338       LLVMValueRef tid = ac_get_thread_id(ctx);
3339       LLVMValueRef active;
3340       tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00));
3341       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3342                              LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), ctx->i32_0, "");
3343       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3344       result = ac_build_alu_op(ctx, result, tmp, op);
3345       tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00));
3346       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3347                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""),
3348                              ctx->i32_0, "");
3349       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3350       result = ac_build_alu_op(ctx, result, tmp, op);
3351       tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00));
3352       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3353                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""),
3354                              ctx->i32_0, "");
3355       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3356       result = ac_build_alu_op(ctx, result, tmp, op);
3357       tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00));
3358       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3359                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""),
3360                              ctx->i32_0, "");
3361       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3362       result = ac_build_alu_op(ctx, result, tmp, op);
3363       tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00));
3364       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3365                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""),
3366                              ctx->i32_0, "");
3367       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3368       result = ac_build_alu_op(ctx, result, tmp, op);
3369       tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0));
3370       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3371                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""),
3372                              ctx->i32_0, "");
3373       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3374       result = ac_build_alu_op(ctx, result, tmp, op);
3375       return result;
3376    }
3377 
3378    if (maxprefix <= 1)
3379       return result;
3380    tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
3381    result = ac_build_alu_op(ctx, result, tmp, op);
3382    if (maxprefix <= 2)
3383       return result;
3384    tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
3385    result = ac_build_alu_op(ctx, result, tmp, op);
3386    if (maxprefix <= 3)
3387       return result;
3388    tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
3389    result = ac_build_alu_op(ctx, result, tmp, op);
3390    if (maxprefix <= 4)
3391       return result;
3392    tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
3393    result = ac_build_alu_op(ctx, result, tmp, op);
3394    if (maxprefix <= 8)
3395       return result;
3396    tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
3397    result = ac_build_alu_op(ctx, result, tmp, op);
3398    if (maxprefix <= 16)
3399       return result;
3400 
3401    if (ctx->gfx_level >= GFX10) {
3402       LLVMValueRef tid = ac_get_thread_id(ctx);
3403       LLVMValueRef active;
3404 
3405       tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false);
3406 
3407       active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
3408                              LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, false), ""),
3409                              ctx->i32_0, "");
3410 
3411       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3412 
3413       result = ac_build_alu_op(ctx, result, tmp, op);
3414 
3415       if (maxprefix <= 32)
3416          return result;
3417 
3418       tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
3419 
3420       active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, LLVMConstInt(ctx->i32, 32, false), "");
3421 
3422       tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
3423 
3424       result = ac_build_alu_op(ctx, result, tmp, op);
3425       return result;
3426    }
3427 
3428    tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
3429    result = ac_build_alu_op(ctx, result, tmp, op);
3430    if (maxprefix <= 32)
3431       return result;
3432    tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
3433    result = ac_build_alu_op(ctx, result, tmp, op);
3434    return result;
3435 }
3436 
/* Build an inclusive scan of \p src with operator \p op over the wave.
 *
 * An i1 source with nir_op_iadd takes a shortcut: the result is
 * mbcnt(ballot(src)) + src, computed on the value zero-extended to i32.
 * Otherwise the source is passed through set_inactive with the operator's
 * identity, scanned with ac_build_scan() and wrapped in the wwm intrinsic.
 */
LLVMValueRef ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
   const bool is_bool_add = LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd;

   if (is_bool_add) {
      LLVMValueRef src32 = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
      LLVMValueRef ballot = ac_build_ballot(ctx, src32);
      LLVMValueRef below = ac_build_mbcnt(ctx, ballot);

      return LLVMBuildAdd(ctx->builder, below, src32, "");
   }

   ac_build_optimization_barrier(ctx, &src, false);

   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   LLVMValueRef seeded = ac_build_set_inactive(ctx, src, identity);
   LLVMValueRef scanned;

   seeded = LLVMBuildBitCast(ctx->builder, seeded, LLVMTypeOf(identity), "");
   scanned = ac_build_scan(ctx, op, seeded, identity, ctx->wave_size, true);

   return ac_build_wwm(ctx, scanned);
}
3459 
/* Build an exclusive scan of \p src with operator \p op over the wave.
 *
 * An i1 source with nir_op_iadd takes a shortcut: the result is
 * mbcnt(ballot(src)), computed on the value zero-extended to i32.
 * Otherwise the source is passed through set_inactive with the operator's
 * identity, scanned with ac_build_scan() and wrapped in the wwm intrinsic.
 */
LLVMValueRef ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
   const bool is_bool_add = LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd;

   if (is_bool_add) {
      LLVMValueRef src32 = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
      LLVMValueRef ballot = ac_build_ballot(ctx, src32);

      return ac_build_mbcnt(ctx, ballot);
   }

   ac_build_optimization_barrier(ctx, &src, false);

   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   LLVMValueRef seeded = ac_build_set_inactive(ctx, src, identity);
   LLVMValueRef scanned;

   seeded = LLVMBuildBitCast(ctx->builder, seeded, LLVMTypeOf(identity), "");
   scanned = ac_build_scan(ctx, op, seeded, identity, ctx->wave_size, false);

   return ac_build_wwm(ctx, scanned);
}
3481 
/* Build a reduction of \p src with operator \p op.
 *
 * \param cluster_size  1 returns \p src unchanged; 2, 4, 8, 16 and 32 stop
 *                      after the corresponding stage; any other value runs
 *                      all stages (full-wave reduction).
 *
 * The source is passed through set_inactive with the operator's identity.
 * Every stage then combines the running value with a permuted copy of
 * itself, and the final value is wrapped in the wwm intrinsic.
 */
LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op,
                             unsigned cluster_size)
{
   if (cluster_size == 1)
      return src;

   const bool has_dpp = ctx->gfx_level >= GFX8;
   LLVMValueRef acc, other;

   ac_build_optimization_barrier(ctx, &src, false);

   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   LLVMValueRef seeded = ac_build_set_inactive(ctx, src, identity);
   acc = LLVMBuildBitCast(ctx->builder, seeded, LLVMTypeOf(identity), "");

   /* Pairs within a quad. */
   other = ac_build_quad_swizzle(ctx, acc, 1, 0, 3, 2);
   acc = ac_build_alu_op(ctx, acc, other, op);
   if (cluster_size == 2)
      return ac_build_wwm(ctx, acc);

   /* Whole quad. */
   other = ac_build_quad_swizzle(ctx, acc, 2, 3, 0, 1);
   acc = ac_build_alu_op(ctx, acc, other, op);
   if (cluster_size == 4)
      return ac_build_wwm(ctx, acc);

   /* 8 lanes. */
   if (has_dpp)
      other = ac_build_dpp(ctx, identity, acc, dpp_row_half_mirror, 0xf, 0xf, false);
   else
      other = ac_build_ds_swizzle(ctx, acc, ds_pattern_bitmode(0x1f, 0, 0x04));
   acc = ac_build_alu_op(ctx, acc, other, op);
   if (cluster_size == 8)
      return ac_build_wwm(ctx, acc);

   /* 16 lanes. */
   if (has_dpp)
      other = ac_build_dpp(ctx, identity, acc, dpp_row_mirror, 0xf, 0xf, false);
   else
      other = ac_build_ds_swizzle(ctx, acc, ds_pattern_bitmode(0x1f, 0, 0x08));
   acc = ac_build_alu_op(ctx, acc, other, op);
   if (cluster_size == 16)
      return ac_build_wwm(ctx, acc);

   /* 32 lanes. */
   if (ctx->gfx_level >= GFX10)
      other = ac_build_permlane16(ctx, acc, 0, true, false);
   else if (has_dpp && cluster_size != 32)
      other = ac_build_dpp(ctx, identity, acc, dpp_row_bcast15, 0xa, 0xf, false);
   else
      other = ac_build_ds_swizzle(ctx, acc, ds_pattern_bitmode(0x1f, 0, 0x10));
   acc = ac_build_alu_op(ctx, acc, other, op);
   if (cluster_size == 32)
      return ac_build_wwm(ctx, acc);

   /* Full wave. */
   if (!has_dpp) {
      /* Combine the values held by lane 0 and lane 32. */
      other = ac_build_readlane(ctx, acc, ctx->i32_0);
      acc = ac_build_readlane(ctx, acc, LLVMConstInt(ctx->i32, 32, 0));
      acc = ac_build_alu_op(ctx, acc, other, op);
      return ac_build_wwm(ctx, acc);
   }

   if (ctx->wave_size == 64) {
      if (ctx->gfx_level >= GFX10)
         other = ac_build_readlane(ctx, acc, LLVMConstInt(ctx->i32, 31, false));
      else
         other = ac_build_dpp(ctx, identity, acc, dpp_row_bcast31, 0xc, 0xf, false);
      acc = ac_build_alu_op(ctx, acc, other, op);

      /* The complete result is read from lane 63. */
      acc = ac_build_readlane(ctx, acc, LLVMConstInt(ctx->i32, 63, 0));
   }

   return ac_build_wwm(ctx, acc);
}
3546 
/* Rearrange one channel of a dual-source blend export pair across lanes.
 *
 * Both values are reinterpreted as i32. The sequence is:
 *   1. swap adjacent lanes of *arg0 (dpp8 selector 0xde54c1 decodes, in 3-bit
 *      fields from the LSB, to lanes 1,0,3,2,5,4,7,6),
 *   2. on even lanes, exchange the swapped *arg0 with *arg1,
 *   3. swap adjacent lanes of *arg0 once more.
 *
 * The results are written back through arg0/arg1 as i32 values.
 */
static void _ac_build_dual_src_blend_swizzle(struct ac_llvm_context *ctx,
                                             LLVMValueRef *arg0, LLVMValueRef *arg1)
{
   LLVMValueRef first = LLVMBuildBitCast(ctx->builder, *arg0, ctx->i32, "");
   LLVMValueRef second = LLVMBuildBitCast(ctx->builder, *arg1, ctx->i32, "");

   /* Step 1: exchange each odd/even lane pair of the first value. */
   LLVMValueRef dpp8_args[2];
   dpp8_args[0] = first;
   dpp8_args[1] = LLVMConstInt(ctx->i32, 0xde54c1, 0);
   LLVMValueRef first_swapped =
      ac_build_intrinsic(ctx, "llvm.amdgcn.mov.dpp8.i32", ctx->i32, dpp8_args, 2, 0);

   /* Step 2: even lanes (thread id bit 0 clear) trade values between the two. */
   LLVMValueRef lane_id = ac_get_thread_id(ctx);
   LLVMValueRef low_bit = LLVMBuildAnd(ctx->builder, lane_id, ctx->i32_1, "");
   LLVMValueRef lane_is_even =
      LLVMBuildICmp(ctx->builder, LLVMIntEQ, low_bit, ctx->i32_0, "");
   LLVMValueRef first_mixed =
      LLVMBuildSelect(ctx->builder, lane_is_even, second, first_swapped, "");
   LLVMValueRef second_mixed =
      LLVMBuildSelect(ctx->builder, lane_is_even, first_swapped, second, "");

   /* Step 3: exchange the odd/even lane pairs of the first value again. */
   dpp8_args[0] = first_mixed;
   dpp8_args[1] = LLVMConstInt(ctx->i32, 0xde54c1, 0);
   *arg0 = ac_build_intrinsic(ctx, "llvm.amdgcn.mov.dpp8.i32", ctx->i32, dpp8_args, 2, 0);
   *arg1 = second_mixed;
}
3583 
ac_build_dual_src_blend_swizzle(struct ac_llvm_context * ctx,struct ac_export_args * mrt0,struct ac_export_args * mrt1)3584 void ac_build_dual_src_blend_swizzle(struct ac_llvm_context *ctx,
3585                                      struct ac_export_args *mrt0,
3586                                      struct ac_export_args *mrt1)
3587 {
3588    assert(ctx->gfx_level >= GFX11);
3589    assert(mrt0->enabled_channels == mrt1->enabled_channels);
3590 
3591    for (int i = 0; i < 4; i++) {
3592       if (mrt0->enabled_channels & (1 << i) && mrt1->enabled_channels & (1 << i))
3593          _ac_build_dual_src_blend_swizzle(ctx, &mrt0->out[i], &mrt1->out[i]);
3594    }
3595 }
3596 
/* Permute src within each quad: lane N of a quad receives the value selected
 * by laneN (as encoded by dpp_quad_perm).
 *
 * GFX8 and newer use a DPP move; older chips fall back to ds_swizzle with
 * bit 15 set in the pattern.
 */
LLVMValueRef ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned lane0,
                                   unsigned lane1, unsigned lane2, unsigned lane3)
{
   const unsigned quad_perm = dpp_quad_perm(lane0, lane1, lane2, lane3);

   if (ctx->gfx_level < GFX8)
      return ac_build_ds_swizzle(ctx, src, (1 << 15) | quad_perm);

   return ac_build_dpp(ctx, src, src, quad_perm, 0xf, 0xf, false);
}
3607 
/* Read src from the lane selected by index using llvm.amdgcn.ds.bpermute.
 *
 * The index is multiplied by 4 before being handed to the intrinsic
 * (presumably because it addresses lanes in bytes; confirm against the LLVM
 * AMDGPU documentation). src is zero-extended to i32 for the permute and the
 * result is truncated back to the original type of src.
 */
LLVMValueRef ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);

   LLVMValueRef scaled_index =
      LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
   LLVMValueRef widened_src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef bpermute_args[2] = {scaled_index, widened_src};
   LLVMValueRef permuted =
      ac_build_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", ctx->i32, bpermute_args, 2, 0);

   return LLVMBuildTrunc(ctx->builder, permuted, src_type, "");
}
3620 
/* Build a call to the llvm.amdgcn.frexp.exp intrinsic for src0.
 *
 * bitsize selects the variant:
 *   16    -> f16 operand, i16 result
 *   32    -> f32 operand, i32 result
 *   other -> treated as 64: f64 operand, i32 result
 *
 * Returns the value produced by the intrinsic call.
 */
LLVMValueRef ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   /* Only ever points at string literals, so it must not be writable. */
   const char *intr;

   switch (bitsize) {
   case 16:
      intr = "llvm.amdgcn.frexp.exp.i16.f16";
      type = ctx->i16;
      break;
   case 32:
      intr = "llvm.amdgcn.frexp.exp.i32.f32";
      type = ctx->i32;
      break;
   default:
      /* Every remaining bit size falls back to the f64 variant. */
      intr = "llvm.amdgcn.frexp.exp.i32.f64";
      type = ctx->i32;
      break;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, 0);
}
/* Build a call to the llvm.amdgcn.frexp.mant intrinsic for src0.
 *
 * bitsize selects the variant and the result type:
 *   16    -> f16
 *   32    -> f32
 *   other -> treated as 64: f64
 *
 * Returns the value produced by the intrinsic call.
 */
LLVMValueRef ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   /* Only ever points at string literals, so it must not be writable. */
   const char *intr;

   switch (bitsize) {
   case 16:
      intr = "llvm.amdgcn.frexp.mant.f16";
      type = ctx->f16;
      break;
   case 32:
      intr = "llvm.amdgcn.frexp.mant.f32";
      type = ctx->f32;
      break;
   default:
      /* Every remaining bit size falls back to the f64 variant. */
      intr = "llvm.amdgcn.frexp.mant.f64";
      type = ctx->f64;
      break;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, 0);
}
3663 
/* Build a call to the llvm.canonicalize intrinsic for src0.
 *
 * bitsize selects the variant and the result type:
 *   16    -> f16
 *   32    -> f32
 *   other -> treated as 64: f64
 *
 * Returns the value produced by the intrinsic call.
 */
LLVMValueRef ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   /* Only ever points at string literals, so it must not be writable. */
   const char *intr;

   switch (bitsize) {
   case 16:
      intr = "llvm.canonicalize.f16";
      type = ctx->f16;
      break;
   case 32:
      intr = "llvm.canonicalize.f32";
      type = ctx->f32;
      break;
   default:
      /* Every remaining bit size falls back to the f64 variant. */
      intr = "llvm.canonicalize.f64";
      type = ctx->f64;
      break;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, 0);
}
3685 
3686 /*
3687  * this takes an I,J coordinate pair,
3688  * and works out the X and Y derivatives.
3689  * it returns DDX(I), DDX(J), DDY(I), DDY(J).
3690  */
ac_build_ddxy_interp(struct ac_llvm_context * ctx,LLVMValueRef interp_ij)3691 LLVMValueRef ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij)
3692 {
3693    LLVMValueRef result[4], a;
3694    unsigned i;
3695 
3696    for (i = 0; i < 2; i++) {
3697       a = LLVMBuildExtractElement(ctx->builder, interp_ij, LLVMConstInt(ctx->i32, i, false), "");
3698       result[i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, a);
3699       result[2 + i] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, a);
3700    }
3701    return ac_build_gather_values(ctx, result, 4);
3702 }
3703 
ac_build_load_helper_invocation(struct ac_llvm_context * ctx)3704 LLVMValueRef ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
3705 {
3706    LLVMValueRef result = ac_build_intrinsic(ctx, "llvm.amdgcn.live.mask", ctx->i1, NULL, 0, 0);
3707 
3708    return LLVMBuildNot(ctx->builder, result, "");
3709 }
3710 
/* Emit a call to func (of type fn_type) with num_args arguments taken from
 * args, and give the call instruction the same calling convention as the
 * callee. Returns the call instruction.
 */
LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMTypeRef fn_type, LLVMValueRef func, LLVMValueRef *args,
                           unsigned num_args)
{
   const unsigned callee_conv = LLVMGetFunctionCallConv(func);
   LLVMValueRef call_inst = LLVMBuildCall2(ctx->builder, fn_type, func, args, num_args, "");

   /* The call site must agree with the callee's convention. */
   LLVMSetInstructionCallConv(call_inst, callee_conv);
   return call_inst;
}
3718 
/* Fill *args with the export arguments for the MRTZ target.
 *
 * depth, stencil, samplemask and mrt0_alpha are optional (NULL when absent);
 * at least one of depth, stencil or samplemask must be given. When is_last is
 * set, the export is flagged as done with a valid EXEC mask.
 *
 * The export layout depends on the format chosen by
 * ac_get_spi_shader_z_format:
 *  - UINT16_ABGR: stencil is shifted into bits [23:16] of channel X and the
 *    sample mask goes to channel Y. Before GFX11 the export is compressed and
 *    each value enables two writemask bits.
 *  - any other format: depth, stencil, sample mask and MRT0 alpha go to
 *    channels X, Y, Z and W respectively.
 * Channels that receive no value stay undef.
 */
void ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
                     LLVMValueRef samplemask, LLVMValueRef mrt0_alpha, bool is_last,
                     struct ac_export_args *args)
{
   const bool is_gfx11_plus = ctx->gfx_level >= GFX11;
   unsigned format = ac_get_spi_shader_z_format(depth != NULL, stencil != NULL, samplemask != NULL,
                                                mrt0_alpha != NULL);
   unsigned channels = 0;

   assert(depth || stencil || samplemask);

   memset(args, 0, sizeof(*args));

   if (is_last) {
      /* Final export: EXEC mask is valid and the DONE bit is set. */
      args->valid_mask = 1;
      args->done = 1;
   }

   args->target = V_008DFC_SQ_EXP_MRTZ;
   args->compr = 0;

   /* Start with every channel undefined; the used ones are overwritten below. */
   for (unsigned chan = 0; chan < 4; chan++)
      args->out[chan] = LLVMGetUndef(ctx->f32);

   if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
      assert(!depth);
      args->compr = !is_gfx11_plus; /* compressed export only before GFX11 */

      if (stencil) {
         /* Stencil belongs in X[23:16]. */
         LLVMValueRef stencil_bits = ac_to_integer(ctx, stencil);
         stencil_bits =
            LLVMBuildShl(ctx->builder, stencil_bits, LLVMConstInt(ctx->i32, 16, 0), "");
         args->out[0] = ac_to_float(ctx, stencil_bits);
         channels |= is_gfx11_plus ? 0x1 : 0x3;
      }
      if (samplemask) {
         /* Sample mask belongs in Y[15:0]. */
         args->out[1] = samplemask;
         channels |= is_gfx11_plus ? 0x2 : 0xc;
      }
   } else {
      /* One full channel per value, in X/Y/Z/W order. */
      LLVMValueRef sources[4] = {depth, stencil, samplemask, mrt0_alpha};

      for (unsigned chan = 0; chan < 4; chan++) {
         if (!sources[chan])
            continue;
         args->out[chan] = sources[chan];
         channels |= 1u << chan;
      }
   }

   /* GFX6 other than OLAND and HAINAN only honours the X writemask bit,
    * so X has to be enabled there unconditionally.
    */
   if (ctx->gfx_level == GFX6 && ctx->info->family != CHIP_OLAND &&
       ctx->info->family != CHIP_HAINAN)
      channels |= 0x1;

   args->enabled_channels = channels;
}
3790 
arg_llvm_type(enum ac_arg_type type,unsigned size,struct ac_llvm_context * ctx)3791 static LLVMTypeRef arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx)
3792 {
3793    LLVMTypeRef base;
3794    switch (type) {
3795       case AC_ARG_FLOAT:
3796          return size == 1 ? ctx->f32 : LLVMVectorType(ctx->f32, size);
3797       case AC_ARG_INT:
3798          return size == 1 ? ctx->i32 : LLVMVectorType(ctx->i32, size);
3799       case AC_ARG_CONST_PTR:
3800          base = ctx->i8;
3801          break;
3802       case AC_ARG_CONST_FLOAT_PTR:
3803          base = ctx->f32;
3804          break;
3805       case AC_ARG_CONST_PTR_PTR:
3806          base = ac_array_in_const32_addr_space(ctx->i8);
3807          break;
3808       case AC_ARG_CONST_DESC_PTR:
3809          base = ctx->v4i32;
3810          break;
3811       case AC_ARG_CONST_IMAGE_PTR:
3812          base = ctx->v8i32;
3813          break;
3814       default:
3815          assert(false);
3816          return NULL;
3817    }
3818 
3819    assert(base);
3820    if (size == 1) {
3821       return ac_array_in_const32_addr_space(base);
3822    } else {
3823       assert(size == 2);
3824       return ac_array_in_const_addr_space(base);
3825    }
3826 }
3827 
/* Create the shader entry function `name` in `module` and position the
 * builder at the start of its body.
 *
 * The LLVM parameter list is derived from `args`, except that the
 * ring_offsets argument (when used) gets no parameter: its index is recorded
 * in ctx->ring_offsets_index and its value is fetched through
 * llvm.amdgcn.implicit.buffer.ptr instead, because LLVM can allocate it
 * itself for scratch memory purposes.
 *
 * SGPR parameters are marked "inreg"; SGPR pointer parameters additionally
 * get noalias, dereferenceable and 4-byte alignment attributes. Denormal
 * modes are set on the function, and pixel shaders also get the depth/color
 * export attributes.
 *
 * Stores the function and its type in ctx->main_function and returns it.
 */
struct ac_llvm_pointer ac_build_main(const struct ac_shader_args *args, struct ac_llvm_context *ctx,
                           enum ac_llvm_calling_convention convention, const char *name,
                           LLVMTypeRef ret_type, LLVMModuleRef module)
{
   LLVMTypeRef param_types[AC_MAX_ARGS];
   enum ac_arg_regfile param_regfiles[AC_MAX_ARGS];
   unsigned num_params = 0;

   /* Collect the type and register file of every argument that becomes a parameter. */
   for (unsigned src = 0; src < args->arg_count; src++) {
      if (args->ring_offsets.used && src == args->ring_offsets.arg_index) {
         ctx->ring_offsets_index = src;
         continue;
      }

      param_regfiles[num_params] = args->args[src].file;
      param_types[num_params] = arg_llvm_type(args->args[src].type, args->args[src].size, ctx);
      num_params++;
   }

   LLVMTypeRef fn_type = LLVMFunctionType(ret_type, param_types, num_params, 0);
   LLVMValueRef fn = LLVMAddFunction(module, name, fn_type);
   LLVMBasicBlockRef entry_block = LLVMAppendBasicBlockInContext(ctx->context, fn, "main_body");
   LLVMPositionBuilderAtEnd(ctx->builder, entry_block);

   LLVMSetFunctionCallConv(fn, convention);

   /* Only SGPR parameters receive attributes. */
   for (unsigned idx = 0; idx < num_params; idx++) {
      if (param_regfiles[idx] != AC_ARG_SGPR)
         continue;

      LLVMValueRef param = LLVMGetParam(fn, idx);
      const unsigned attr_slot = idx + 1;

      ac_add_function_attr(ctx->context, fn, attr_slot, "inreg");

      if (LLVMGetTypeKind(LLVMTypeOf(param)) != LLVMPointerTypeKind)
         continue;

      ac_add_function_attr(ctx->context, fn, attr_slot, "noalias");
      ac_add_attr_dereferenceable(param, UINT64_MAX);
      ac_add_attr_alignment(param, 4);
   }

   if (args->ring_offsets.used) {
      LLVMValueRef implicit_ptr =
         ac_build_intrinsic(ctx, "llvm.amdgcn.implicit.buffer.ptr",
                            LLVMPointerType(ctx->i8, AC_ADDR_SPACE_CONST), NULL, 0, 0);

      ctx->ring_offsets = LLVMBuildBitCast(ctx->builder, implicit_ptr,
                                           ac_array_in_const_addr_space(ctx->v4i32), "");
   }

   ctx->main_function.value = fn;
   ctx->main_function.pointee_type = fn_type;

   /* FP16 and FP64 keep denormals. */
   LLVMAddTargetDependentFunctionAttr(fn, "denormal-fp-math", "ieee,ieee");
   /* FP32 flushes denormals. */
   LLVMAddTargetDependentFunctionAttr(fn, "denormal-fp-math-f32",
                                      "preserve-sign,preserve-sign");

   if (convention == AC_LLVM_AMDGPU_PS) {
      const char *depth_export = ctx->exports_mrtz ? "1" : "0";
      const char *color_export = ctx->exports_color_null ? "1" : "0";

      LLVMAddTargetDependentFunctionAttr(fn, "amdgpu-depth-export", depth_export);
      LLVMAddTargetDependentFunctionAttr(fn, "amdgpu-color-export", color_export);
   }

   return ctx->main_function;
}
3899 
ac_build_s_endpgm(struct ac_llvm_context * ctx)3900 void ac_build_s_endpgm(struct ac_llvm_context *ctx)
3901 {
3902    LLVMTypeRef calltype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3903    LLVMValueRef code = LLVMConstInlineAsm(calltype, "s_endpgm", "", true, false);
3904    LLVMBuildCall2(ctx->builder, calltype, code, NULL, 0, "");
3905 }
3906 
/* Return an i1 telling whether the f32 value `a` is a signaling NaN, a quiet
 * NaN, negative infinity or positive infinity, using llvm.amdgcn.class.f32.
 */
LLVMValueRef ac_build_is_inf_or_nan(struct ac_llvm_context *ctx, LLVMValueRef a)
{
   LLVMValueRef class_args[2];

   class_args[0] = a;
   /* Class mask: every NaN and infinity category. */
   class_args[1] = LLVMConstInt(ctx->i32, S_NAN | Q_NAN | N_INFINITY | P_INFINITY, 0);

   return ac_build_intrinsic(ctx, "llvm.amdgcn.class.f32", ctx->i1, class_args, 2, 0);
}
3915