1 /*
2 * Copyright 2014 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6 /* based on pieces from si_pipe.c and radeon_llvm_emit.c */
7 #include "ac_llvm_build.h"
8 #include "ac_gpu_info.h"
9 #include "ac_nir.h"
10 #include "ac_llvm_util.h"
11 #include "ac_shader_util.h"
12 #include "c11/threads.h"
13 #include "shader_enums.h"
14 #include "sid.h"
15 #include "util/bitscan.h"
16 #include "util/macros.h"
17 #include "util/u_atomic.h"
18 #include "util/u_math.h"
19 #include <llvm-c/Core.h>
20 #include <llvm/Config/llvm-config.h>
21
22 #include <assert.h>
23 #include <stdio.h>
24
25 #define AC_LLVM_INITIAL_CF_DEPTH 4
26
27 /* Data for if/else/endif and bgnloop/endloop control flow structures.
28 */
29 struct ac_llvm_flow {
30 /* Loop exit or next part of if/else/endif. */
31 LLVMBasicBlockRef next_block;
32 LLVMBasicBlockRef loop_entry_block;
33 };
34
35 /* Initialize module-independent parts of the context.
36 *
37 * The caller is responsible for initializing ctx::module and ctx::builder.
38 */
/* Initialize the per-shader LLVM context: creates an LLVMContext, a module
 * and a builder, and caches commonly-used types, constants and metadata kinds
 * so later code can reference them without repeated LLVM-C lookups.
 *
 * \param compiler          provides the target machine used to create the module
 * \param info              GPU info; only gfx_level is cached separately
 * \param float_mode        float behavior passed to the builder (denorms, etc.)
 * \param wave_size         lanes per wave (32 or 64); sizes iN_wavemask
 * \param ballot_mask_bits  width of ballot results; sizes iN_ballotmask
 */
void ac_llvm_context_init(struct ac_llvm_context *ctx, struct ac_llvm_compiler *compiler,
                          const struct radeon_info *info, enum ac_float_mode float_mode,
                          unsigned wave_size, unsigned ballot_mask_bits, bool exports_color_null,
                          bool exports_mrtz)
{
   ctx->context = LLVMContextCreate();

   ctx->info = info;
   ctx->gfx_level = info->gfx_level;
   ctx->wave_size = wave_size;
   ctx->ballot_mask_bits = ballot_mask_bits;
   ctx->float_mode = float_mode;
   ctx->exports_color_null = exports_color_null;
   ctx->exports_mrtz = exports_mrtz;
   ctx->module = ac_create_module(compiler->tm, ctx->context);
   ctx->builder = ac_create_builder(ctx->context, float_mode);

   /* Cached scalar and vector types. */
   ctx->voidt = LLVMVoidTypeInContext(ctx->context);
   ctx->i1 = LLVMInt1TypeInContext(ctx->context);
   ctx->i8 = LLVMInt8TypeInContext(ctx->context);
   ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
   ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
   ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
   ctx->i128 = LLVMIntTypeInContext(ctx->context, 128);
   ctx->intptr = ctx->i32;
   ctx->f16 = LLVMHalfTypeInContext(ctx->context);
   ctx->f32 = LLVMFloatTypeInContext(ctx->context);
   ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
   ctx->v4i8 = LLVMVectorType(ctx->i8, 4);
   ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
   ctx->v4i16 = LLVMVectorType(ctx->i16, 4);
   ctx->v2f16 = LLVMVectorType(ctx->f16, 2);
   ctx->v4f16 = LLVMVectorType(ctx->f16, 4);
   ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
   ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
   ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
   ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
   ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
   ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
   ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
   /* Mask types sized by the wave/ballot configuration. */
   ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size);
   ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits);

   /* Cached constants. */
   ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
   ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
   ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
   ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
   ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
   ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
   ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
   ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
   ctx->i128_0 = LLVMConstInt(ctx->i128, 0, false);
   ctx->i128_1 = LLVMConstInt(ctx->i128, 1, false);
   ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
   ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
   ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
   ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
   ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
   ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);

   ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
   ctx->i1true = LLVMConstInt(ctx->i1, 1, false);

   /* Metadata kind IDs; the length argument is strlen of the name. */
   ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, "range", 5);
   ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, "invariant.load", 14);
   ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14);
   ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6);

   ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);

   /* "fpmath" node requesting 3 ULPs of precision. */
   LLVMValueRef three = LLVMConstReal(ctx->f32, 3);
   ctx->three_md = LLVMMDNodeInContext(ctx->context, &three, 1);

   /* Control-flow stack; grown lazily, freed in ac_llvm_context_dispose. */
   ctx->flow = calloc(1, sizeof(*ctx->flow));

   /* INT32_MAX marks "no ring offsets argument". */
   ctx->ring_offsets_index = INT32_MAX;
}
116
ac_llvm_context_dispose(struct ac_llvm_context * ctx)117 void ac_llvm_context_dispose(struct ac_llvm_context *ctx)
118 {
119 free(ctx->flow->stack);
120 free(ctx->flow);
121 ctx->flow = NULL;
122
123 LLVMDisposeBuilder(ctx->builder);
124 }
125
/* Number of components in a value: vector size for vectors, 1 for scalars. */
int ac_get_llvm_num_components(LLVMValueRef value)
{
   LLVMTypeRef type = LLVMTypeOf(value);

   if (LLVMGetTypeKind(type) != LLVMVectorTypeKind)
      return 1;

   return LLVMGetVectorSize(type);
}
133
/* Extract component \p index from \p value. Scalars are returned as-is
 * (only index 0 is valid for them).
 */
LLVMValueRef ac_llvm_extract_elem(struct ac_llvm_context *ac, LLVMValueRef value, int index)
{
   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
      LLVMValueRef idx = LLVMConstInt(ac->i32, index, false);
      return LLVMBuildExtractElement(ac->builder, value, idx, "");
   }

   assert(index == 0);
   return value;
}
143
/* Bit width of the (element) type of \p type; vectors are reduced to their
 * element type first. LDS pointers count as 32 bits.
 */
int ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
{
   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
      type = LLVMGetElementType(type);

   switch (LLVMGetTypeKind(type)) {
   case LLVMIntegerTypeKind:
      return LLVMGetIntTypeWidth(type);
   case LLVMPointerTypeKind:
      if (LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_LDS)
         return 32;
      break;
   default:
      break;
   }

   if (type == ctx->f16)
      return 16;
   if (type == ctx->f32)
      return 32;
   if (type == ctx->f64)
      return 64;

   unreachable("Unhandled type kind in get_elem_bits");
}
166
/* Size of \p type in bytes. Pointers are 8 bytes except 32-bit constant
 * address-space pointers; vectors and arrays recurse on the element type.
 */
unsigned ac_get_type_size(LLVMTypeRef type)
{
   const LLVMTypeKind kind = LLVMGetTypeKind(type);

   if (kind == LLVMIntegerTypeKind)
      return LLVMGetIntTypeWidth(type) / 8;
   if (kind == LLVMHalfTypeKind)
      return 2;
   if (kind == LLVMFloatTypeKind)
      return 4;
   if (kind == LLVMDoubleTypeKind)
      return 8;
   if (kind == LLVMPointerTypeKind)
      return LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT ? 4 : 8;
   if (kind == LLVMVectorTypeKind)
      return LLVMGetVectorSize(type) * ac_get_type_size(LLVMGetElementType(type));
   if (kind == LLVMArrayTypeKind)
      return LLVMGetArrayLength(type) * ac_get_type_size(LLVMGetElementType(type));

   assert(0);
   return 0;
}
193
/* Map a scalar type to the integer type of the same bit width. */
static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i1)
      return ctx->i1;
   if (t == ctx->i8)
      return ctx->i8;
   if (t == ctx->f16 || t == ctx->i16)
      return ctx->i16;
   if (t == ctx->f32 || t == ctx->i32)
      return ctx->i32;
   if (t == ctx->f64 || t == ctx->i64)
      return ctx->i64;

   unreachable("Unhandled integer size");
}
209
/* Integer type of the same width as \p t. Vectors map element-wise; pointers
 * map to i64 or i32 depending on their address space.
 */
LLVMTypeRef ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   const LLVMTypeKind kind = LLVMGetTypeKind(t);

   if (kind == LLVMVectorTypeKind) {
      LLVMTypeRef elem = to_integer_type_scalar(ctx, LLVMGetElementType(t));
      return LLVMVectorType(elem, LLVMGetVectorSize(t));
   }

   if (kind == LLVMPointerTypeKind) {
      switch (LLVMGetPointerAddressSpace(t)) {
      case AC_ADDR_SPACE_GLOBAL:
      case AC_ADDR_SPACE_CONST:
         return ctx->i64;   /* 64-bit address spaces */
      case AC_ADDR_SPACE_CONST_32BIT:
      case AC_ADDR_SPACE_LDS:
         return ctx->i32;   /* 32-bit address spaces */
      default:
         unreachable("unhandled address space");
      }
   }

   return to_integer_type_scalar(ctx, t);
}
230
/* Reinterpret \p v as the integer type of the same width.
 * Pointers use ptrtoint; everything else is a plain bitcast.
 */
LLVMValueRef ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef src_type = LLVMTypeOf(v);
   LLVMTypeRef dst_type = ac_to_integer_type(ctx, src_type);

   if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
      return LLVMBuildPtrToInt(ctx->builder, v, dst_type, "");

   return LLVMBuildBitCast(ctx->builder, v, dst_type, "");
}
239
/* Like ac_to_integer, except pointers pass through unchanged. */
LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   if (LLVMGetTypeKind(LLVMTypeOf(v)) == LLVMPointerTypeKind)
      return v;

   return ac_to_integer(ctx, v);
}
247
/* Map a scalar type to the float type of the same bit width.
 * i8 has no float counterpart and is returned unchanged.
 */
static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i8)
      return ctx->i8;
   if (t == ctx->i16 || t == ctx->f16)
      return ctx->f16;
   if (t == ctx->i32 || t == ctx->f32)
      return ctx->f32;
   if (t == ctx->i64 || t == ctx->f64)
      return ctx->f64;

   unreachable("Unhandled float size");
}
261
/* Float type of the same width as \p t; vectors map element-wise. */
LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
      LLVMTypeRef elem = to_float_type_scalar(ctx, LLVMGetElementType(t));
      return LLVMVectorType(elem, LLVMGetVectorSize(t));
   }

   return to_float_type_scalar(ctx, t);
}
270
/* Bitcast \p v to the float type of the same width. */
LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, LLVMTypeOf(v)), "");
}
276
/* Build a call to the named intrinsic, declaring it in the module on first
 * use. The function type is derived from \p return_type and the types of
 * \p params, so all calls to the same name must agree on the signature.
 *
 * \param attrib_mask  bitmask of AC_ATTR_* flags applied to the call site
 * \return the call instruction
 */
LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
                                LLVMTypeRef return_type, LLVMValueRef *params, unsigned param_count,
                                unsigned attrib_mask)
{
   LLVMValueRef call;

   LLVMTypeRef param_types[32];
   assert(param_count <= 32);
   for (unsigned i = 0; i < param_count; ++i) {
      assert(params[i]);
      param_types[i] = LLVMTypeOf(params[i]);
   }

   LLVMTypeRef function_type = LLVMFunctionType(return_type, param_types, param_count, 0);
   LLVMValueRef function = LLVMGetNamedFunction(ctx->module, name);

   if (!function) {
      /* First use: add the declaration with external linkage so the backend
       * resolves it as an intrinsic.
       */
      function = LLVMAddFunction(ctx->module, name, function_type);

      LLVMSetFunctionCallConv(function, LLVMCCallConv);
      LLVMSetLinkage(function, LLVMExternalLinkage);
   }

   call = LLVMBuildCall2(ctx->builder, function_type, function, params, param_count, "");

   if (attrib_mask & AC_ATTR_INVARIANT_LOAD)
      LLVMSetMetadata(call, ctx->invariant_load_md_kind, ctx->empty_md);

   /* Index -1 attaches the attribute to the function of the call site. */
   if (attrib_mask & AC_ATTR_CONVERGENT)
      LLVMAddCallSiteAttribute(call, -1, ac_get_llvm_attribute(ctx->context, "convergent"));

   LLVMAddCallSiteAttribute(call, -1, ac_get_llvm_attribute(ctx->context, "nounwind"));
   return call;
}
311
312 /**
313 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
314 * intrinsic names).
315 */
/**
 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
 * intrinsic names).
 *
 * Fix: snprintf returns the length that *would* have been written; on
 * truncation the old code advanced \p buf past the end of the buffer and
 * underflowed the unsigned \p bufsize. Clamp the advance to what was
 * actually stored so the cursor always stays inside the buffer.
 */
void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
{
   LLVMTypeRef elem_type = type;

   if (LLVMGetTypeKind(type) == LLVMStructTypeKind) {
      unsigned count = LLVMCountStructElementTypes(type);
      int ret = snprintf(buf, bufsize, "sl_");
      /* Clamp to the characters actually written (snprintf NUL-terminates
       * within bufsize), so buf/bufsize never go out of range on truncation.
       */
      if (ret < 0 || (unsigned)ret >= bufsize)
         ret = bufsize ? (int)bufsize - 1 : 0;
      buf += ret;
      bufsize -= ret;

      LLVMTypeRef *elems = alloca(count * sizeof(LLVMTypeRef));
      LLVMGetStructElementTypes(type, elems);

      for (unsigned i = 0; i < count; i++) {
         ac_build_type_name_for_intr(elems[i], buf, bufsize);
         /* strlen(buf) <= bufsize - 1 because the recursion NUL-terminates. */
         ret = strlen(buf);
         buf += ret;
         bufsize -= ret;
      }

      snprintf(buf, bufsize, "s");
      return;
   }

   assert(bufsize >= 8);
   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
      int ret = snprintf(buf, bufsize, "v%u", LLVMGetVectorSize(type));
      if (ret < 0) {
         char *type_name = LLVMPrintTypeToString(type);
         fprintf(stderr, "Error building type name for: %s\n", type_name);
         LLVMDisposeMessage(type_name);
         return;
      }
      /* Truncated: stop at the last stored byte (bufsize >= 8 was asserted). */
      if ((unsigned)ret >= bufsize)
         ret = bufsize - 1;
      elem_type = LLVMGetElementType(type);
      buf += ret;
      bufsize -= ret;
   }
   switch (LLVMGetTypeKind(elem_type)) {
   default:
      break;
   case LLVMIntegerTypeKind:
      snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
      break;
   case LLVMHalfTypeKind:
      snprintf(buf, bufsize, "f16");
      break;
   case LLVMFloatTypeKind:
      snprintf(buf, bufsize, "f32");
      break;
   case LLVMDoubleTypeKind:
      snprintf(buf, bufsize, "f64");
      break;
   }
}
370
371 /**
372 * Helper function that builds an LLVM IR PHI node and immediately adds
373 * incoming edges.
374 */
/**
 * Helper function that builds an LLVM IR PHI node and immediately adds
 * incoming edges.
 */
LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigned count_incoming,
                          LLVMValueRef *values, LLVMBasicBlockRef *blocks)
{
   LLVMValueRef node = LLVMBuildPhi(ctx->builder, type, "");

   LLVMAddIncoming(node, values, blocks, count_incoming);
   return node;
}
382
void ac_build_s_barrier(struct ac_llvm_context *ctx, gl_shader_stage stage)
{
   /* GFX6 only: s_barrier isn't needed in TCS because an entire patch always
    * fits into a single wave due to a bug workaround disallowing multi-wave
    * HS workgroups.
    */
   const bool skip = ctx->gfx_level == GFX6 && stage == MESA_SHADER_TESS_CTRL;

   if (!skip)
      ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0, 0);
}
393
394 /* Prevent optimizations (at least of memory accesses) across the current
395 * point in the program by emitting empty inline assembly that is marked as
396 * having side effects.
397 *
398 * Optionally, a value can be passed through the inline assembly to prevent
399 * LLVM from hoisting calls to ReadNone functions.
400 */
/* Prevent optimizations (at least of memory accesses) across the current
 * point in the program by emitting empty inline assembly that is marked as
 * having side effects.
 *
 * Optionally, a value can be passed through the inline assembly to prevent
 * LLVM from hoisting calls to ReadNone functions.
 *
 * \param pgpr  if non-NULL, the value routed through the asm; updated in place
 * \param sgpr  constrain the pass-through to an SGPR instead of a VGPR
 */
void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pgpr, bool sgpr)
{
   /* Unique comment text per barrier so LLVM can't merge/CSE the asm blobs. */
   static int counter = 0;

   LLVMBuilderRef builder = ctx->builder;
   char code[16];
   /* "=s,0"/"=v,0": output in an S/VGPR, input tied to the same register. */
   const char *constraint = sgpr ? "=s,0" : "=v,0";

   snprintf(code, sizeof(code), "; %d", (int)p_atomic_inc_return(&counter));

   if (!pgpr) {
      /* No pass-through value: a bare side-effecting asm call suffices. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
      LLVMBuildCall2(builder, ftype, inlineasm, NULL, 0, "");
   } else {
      LLVMTypeRef old_type = LLVMTypeOf(*pgpr);

      /* i1 can't satisfy a register constraint; widen to i32 first. */
      if (old_type == ctx->i1)
         *pgpr = LLVMBuildZExt(builder, *pgpr, ctx->i32, "");

      /* v3i16 isn't register-sized either; pad to a vec4 for the asm. */
      if (old_type == LLVMVectorType(ctx->i16, 3))
         *pgpr = ac_build_expand_to_vec4(ctx, *pgpr, 4);

      LLVMTypeRef type = LLVMTypeOf(*pgpr);
      LLVMTypeRef ftype = LLVMFunctionType(type, &type, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);

      *pgpr = LLVMBuildCall2(builder, ftype, inlineasm, pgpr, 1, "");

      /* Undo the widenings so the caller gets the original type back. */
      if (old_type == ctx->i1)
         *pgpr = LLVMBuildTrunc(builder, *pgpr, old_type, "");

      if (old_type == LLVMVectorType(ctx->i16, 3))
         *pgpr = ac_extract_components(ctx, *pgpr, 0, 3);
   }
}
437
ac_build_shader_clock(struct ac_llvm_context * ctx,mesa_scope scope)438 LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, mesa_scope scope)
439 {
440 if (ctx->gfx_level >= GFX11 && scope == SCOPE_DEVICE) {
441 const char *name = "llvm.amdgcn.s.sendmsg.rtn.i64";
442 LLVMValueRef arg = LLVMConstInt(ctx->i32, 0x83 /* realtime */, 0);
443 LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, &arg, 1, 0);
444 return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
445 }
446
447 const char *subgroup = "llvm.readcyclecounter";
448 const char *name = scope == SCOPE_DEVICE ? "llvm.amdgcn.s.memrealtime" : subgroup;
449
450 LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, NULL, 0, 0);
451 return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
452 }
453
/* Return a wave mask (iN_wavemask) with a bit set for every active lane in
 * which \p value is non-zero, implemented via llvm.amdgcn.icmp against 0.
 */
LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *name;

   /* icmp wants an i32 operand; widen booleans. */
   if (LLVMTypeOf(value) == ctx->i1)
      value = LLVMBuildZExt(ctx->builder, value, ctx->i32, "");

   if (ctx->wave_size == 64)
      name = "llvm.amdgcn.icmp.i64.i32";
   else
      name = "llvm.amdgcn.icmp.i32.i32";

   /* args: (value, 0, NE) -> per-lane "value != 0" gathered into a mask. */
   LLVMValueRef args[3] = {value, ctx->i32_0, LLVMConstInt(ctx->i32, LLVMIntNE, 0)};

   /* We currently have no other way to prevent LLVM from lifting the icmp
    * calls to a dominating basic block.
    */
   ac_build_optimization_barrier(ctx, &args[0], false);

   args[0] = ac_to_integer(ctx, args[0]);

   return ac_build_intrinsic(ctx, name, ctx->iN_wavemask, args, 3, 0);
}
477
/* Turn a per-lane i1 into a scalar wave mask (one bit per lane where the
 * value is true), using llvm.amdgcn.icmp on the boolean directly.
 */
LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *intr = ctx->wave_size == 64 ? "llvm.amdgcn.icmp.i64.i1"
                                           : "llvm.amdgcn.icmp.i32.i1";
   /* (value, false, NE) -> lanes where value != false. */
   LLVMValueRef args[3] = {
      value,
      ctx->i1false,
      LLVMConstInt(ctx->i32, LLVMIntNE, 0),
   };

   return ac_build_intrinsic(ctx, intr, ctx->iN_wavemask, args, 3, 0);
}
495
/* True iff every active lane has \p value non-zero: the vote mask must equal
 * the mask of all active lanes.
 */
LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef exec_mask = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef vote_mask = ac_build_ballot(ctx, value);

   return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_mask, exec_mask, "");
}
502
/* True iff at least one active lane has \p value non-zero. */
LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef vote_mask = ac_build_ballot(ctx, value);
   LLVMValueRef zero = LLVMConstInt(ctx->iN_wavemask, 0, 0);

   return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_mask, zero, "");
}
509
/* True iff all active lanes agree on \p value: either everyone voted true
 * or no one did.
 */
LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef exec_mask = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef vote_mask = ac_build_ballot(ctx, value);
   LLVMValueRef zero = LLVMConstInt(ctx->iN_wavemask, 0, 0);

   LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_mask, exec_mask, "");
   LLVMValueRef none = LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_mask, zero, "");
   return LLVMBuildOr(ctx->builder, all, none, "");
}
520
/* Gather values[component .. component+value_count-1] into a vector.
 * A single value is returned as a scalar.
 */
LLVMValueRef ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                            unsigned value_count, unsigned component)
{
   if (!value_count)
      unreachable("value_count is 0");
   if (value_count == 1)
      return values[component];

   LLVMValueRef vec =
      LLVMGetUndef(LLVMVectorType(LLVMTypeOf(values[component]), value_count));

   for (unsigned i = 0; i < value_count; i++) {
      LLVMValueRef slot = LLVMConstInt(ctx->i32, i, false);
      vec = LLVMBuildInsertElement(ctx->builder, vec, values[component + i], slot, "");
   }

   return vec;
}
541
/* Gather value_count values, strided by value_stride, into a vector.
 * A single value is returned as a scalar unless \p always_vector is set.
 */
LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                             unsigned value_count, unsigned value_stride,
                                             bool always_vector)
{
   if (!value_count)
      unreachable("value_count is 0");
   if (value_count == 1 && !always_vector)
      return values[0];

   LLVMValueRef vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(values[0]), value_count));

   for (unsigned i = 0; i < value_count; i++) {
      LLVMValueRef slot = LLVMConstInt(ctx->i32, i, false);
      vec = LLVMBuildInsertElement(ctx->builder, vec, values[i * value_stride], slot, "");
   }

   return vec;
}
565
/* Gather contiguous values into a vector (scalar if value_count == 1). */
LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                    unsigned value_count)
{
   /* Stride 1, and a lone value stays scalar. */
   return ac_build_gather_values_extended(ctx, values, value_count, 1, false);
}
571
/* Concatenate the components of \p a and \p b into one vector.
 * A NULL \p a acts as the empty prefix, yielding \p b unchanged.
 */
LLVMValueRef ac_build_concat(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   if (!a)
      return b;

   const unsigned na = ac_get_llvm_num_components(a);
   const unsigned nb = ac_get_llvm_num_components(b);
   LLVMValueRef *comps = alloca((na + nb) * sizeof(LLVMValueRef));
   unsigned n = 0;

   for (unsigned i = 0; i < na; i++)
      comps[n++] = ac_llvm_extract_elem(ctx, a, i);
   for (unsigned i = 0; i < nb; i++)
      comps[n++] = ac_llvm_extract_elem(ctx, b, i);

   return ac_build_gather_values(ctx, comps, n);
}
588
589 /* Expand a scalar or vector to <dst_channels x type> by filling the remaining
590 * channels with undef. Extract at most src_channels components from the input.
591 */
/* Expand a scalar or vector to <dst_channels x type> by filling the remaining
 * channels with undef. Extract at most src_channels components from the input.
 */
LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx, LLVMValueRef value,
                             unsigned src_channels, unsigned dst_channels)
{
   LLVMValueRef *const comps = alloca(dst_channels * sizeof(LLVMValueRef));
   LLVMTypeRef value_type = LLVMTypeOf(value);
   LLVMTypeRef elem_type;

   if (LLVMGetTypeKind(value_type) == LLVMVectorTypeKind) {
      const unsigned vec_size = LLVMGetVectorSize(value_type);

      /* Already exactly the requested shape — nothing to do. */
      if (src_channels == dst_channels && vec_size == dst_channels)
         return value;

      src_channels = MIN2(src_channels, vec_size);

      for (unsigned i = 0; i < src_channels; i++)
         comps[i] = ac_llvm_extract_elem(ctx, value, i);

      elem_type = LLVMGetElementType(value_type);
   } else {
      if (src_channels) {
         assert(src_channels == 1);
         comps[0] = value;
      }
      elem_type = value_type;
   }

   /* Pad the tail with undef. */
   for (unsigned i = src_channels; i < dst_channels; i++)
      comps[i] = LLVMGetUndef(elem_type);

   return ac_build_gather_values(ctx, comps, dst_channels);
}
623
624 /* Extract components [start, start + channels) from a vector.
625 */
/* Extract components [start, start + channels) from a vector.
 */
LLVMValueRef ac_extract_components(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned start,
                                   unsigned channels)
{
   LLVMValueRef *const comps = alloca(channels * sizeof(LLVMValueRef));

   for (unsigned i = 0; i < channels; i++)
      comps[i] = ac_llvm_extract_elem(ctx, value, start + i);

   return ac_build_gather_values(ctx, comps, channels);
}
636
637 /* Expand a scalar or vector to <4 x type> by filling the remaining channels
638 * with undef. Extract at most num_channels components from the input.
639 */
/* Expand a scalar or vector to <4 x type> by filling the remaining channels
 * with undef. Extract at most num_channels components from the input.
 */
LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, LLVMValueRef value,
                                     unsigned num_channels)
{
   /* Fixed destination width of 4. */
   return ac_build_expand(ctx, value, num_channels, 4);
}
645
/* Round \p value using the llvm.rint variant matching its float width. */
LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *intr;

   switch (ac_get_type_size(LLVMTypeOf(value))) {
   case 2:
      intr = "llvm.rint.f16";
      break;
   case 4:
      intr = "llvm.rint.f32";
      break;
   default:
      intr = "llvm.rint.f64";
      break;
   }

   return ac_build_intrinsic(ctx, intr, LLVMTypeOf(value), &value, 1, 0);
}
660
/* num / den computed as num * rcp(den), using the hardware reciprocal
 * intrinsic matching the operand width.
 */
LLVMValueRef ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den)
{
   const char *intr;

   switch (ac_get_type_size(LLVMTypeOf(den))) {
   case 2:
      intr = "llvm.amdgcn.rcp.f16";
      break;
   case 4:
      intr = "llvm.amdgcn.rcp.f32";
      break;
   default:
      intr = "llvm.amdgcn.rcp.f64";
      break;
   }

   LLVMValueRef rcp = ac_build_intrinsic(ctx, intr, LLVMTypeOf(den), &den, 1, 0);

   return LLVMBuildFMul(ctx->builder, num, rcp, "");
}
678
679 /* See fast_idiv_by_const.h. */
680 /* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
/* See fast_idiv_by_const.h. */
/* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, LLVMValueRef num,
                                LLVMValueRef multiplier, LLVMValueRef pre_shift,
                                LLVMValueRef post_shift, LLVMValueRef increment)
{
   LLVMBuilderRef b = ctx->builder;

   /* q = (((num >> pre_shift) * multiplier + increment) >> 32) >> post_shift,
    * with the multiply-add done in 64 bits.
    */
   LLVMValueRef shifted = LLVMBuildLShr(b, num, pre_shift, "");
   LLVMValueRef wide = LLVMBuildMul(b, LLVMBuildZExt(b, shifted, ctx->i64, ""),
                                    LLVMBuildZExt(b, multiplier, ctx->i64, ""), "");
   wide = LLVMBuildAdd(b, wide, LLVMBuildZExt(b, increment, ctx->i64, ""), "");
   wide = LLVMBuildLShr(b, wide, LLVMConstInt(ctx->i64, 32, 0), "");
   LLVMValueRef hi = LLVMBuildTrunc(b, wide, ctx->i32, "");
   return LLVMBuildLShr(b, hi, post_shift, "");
}
695
696 /* See fast_idiv_by_const.h. */
697 /* If num != UINT_MAX, this more efficient version can be used. */
698 /* Set: increment = util_fast_udiv_info::increment; */
/* See fast_idiv_by_const.h. */
/* If num != UINT_MAX, this more efficient version can be used. */
/* Set: increment = util_fast_udiv_info::increment; */
LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, LLVMValueRef num,
                                    LLVMValueRef multiplier, LLVMValueRef pre_shift,
                                    LLVMValueRef post_shift, LLVMValueRef increment)
{
   LLVMBuilderRef b = ctx->builder;

   /* The increment is added before widening; "nuw" encodes that the add
    * cannot wrap (caller guarantees num != UINT_MAX).
    */
   LLVMValueRef shifted = LLVMBuildLShr(b, num, pre_shift, "");
   shifted = LLVMBuildNUWAdd(b, shifted, increment, "");
   LLVMValueRef wide = LLVMBuildMul(b, LLVMBuildZExt(b, shifted, ctx->i64, ""),
                                    LLVMBuildZExt(b, multiplier, ctx->i64, ""), "");
   wide = LLVMBuildLShr(b, wide, LLVMConstInt(ctx->i64, 32, 0), "");
   LLVMValueRef hi = LLVMBuildTrunc(b, wide, ctx->i32, "");
   return LLVMBuildLShr(b, hi, post_shift, "");
}
713
714 /* See fast_idiv_by_const.h. */
715 /* Both operands must fit in 31 bits and the divisor must not be 1. */
/* See fast_idiv_by_const.h. */
/* Both operands must fit in 31 bits and the divisor must not be 1. */
LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, LLVMValueRef num,
                                              LLVMValueRef multiplier, LLVMValueRef post_shift)
{
   LLVMBuilderRef b = ctx->builder;

   /* q = ((num * multiplier) >> 32) >> post_shift, no pre-shift/increment. */
   LLVMValueRef wide = LLVMBuildMul(b, LLVMBuildZExt(b, num, ctx->i64, ""),
                                    LLVMBuildZExt(b, multiplier, ctx->i64, ""), "");
   wide = LLVMBuildLShr(b, wide, LLVMConstInt(ctx->i64, 32, 0), "");
   LLVMValueRef hi = LLVMBuildTrunc(b, wide, ctx->i32, "");
   return LLVMBuildLShr(b, hi, post_shift, "");
}
727
/* Interpolate an FS input at barycentric coords (i, j) and return an f32.
 *
 * GFX11+ loads the per-vertex parameter from LDS first, then uses the
 * inreg p10/p2 two-step interpolation; older chips use interp.p1/p2 which
 * read the attribute directly.
 *
 * \param llvm_chan    component within the attribute (0..3)
 * \param attr_number  attribute index
 * \param params       prim mask / NewPrimMask argument for the intrinsics
 */
LLVMValueRef ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                LLVMValueRef j)
{
   LLVMValueRef args[5];

   if (ctx->gfx_level >= GFX11) {
      LLVMValueRef p;
      LLVMValueRef p10;

      args[0] = llvm_chan;
      args[1] = attr_number;
      args[2] = params;

      p = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
                             ctx->f32, args, 3, 0);

      /* p10 = p * i + p (first linear step) */
      args[0] = p;
      args[1] = i;
      args[2] = p;

      p10 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p10",
                               ctx->f32, args, 3, 0);

      /* result = p * j + p10 (second linear step) */
      args[0] = p;
      args[1] = j;
      args[2] = p10;

      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p2",
                                ctx->f32, args, 3, 0);

   } else {
      LLVMValueRef p1;

      args[0] = i;
      args[1] = llvm_chan;
      args[2] = attr_number;
      args[3] = params;

      p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
                              ctx->f32, args, 4, 0);

      args[0] = p1;
      args[1] = j;
      args[2] = llvm_chan;
      args[3] = attr_number;
      args[4] = params;

      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
                                ctx->f32, args, 5, 0);
   }
}
780
/* f16 variant of ac_build_fs_interp: interpolate an FS input at barycentric
 * coords (i, j) and return an f16.
 *
 * \param high_16bits  select the high half of the 32-bit attribute slot
 */
LLVMValueRef ac_build_fs_interp_f16(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                    LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                    LLVMValueRef j, bool high_16bits)
{
   LLVMValueRef args[6];

   if (ctx->gfx_level >= GFX11) {
      LLVMValueRef p;
      LLVMValueRef p10;

      args[0] = llvm_chan;
      args[1] = attr_number;
      args[2] = params;

      p = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
                             ctx->f32, args, 3, 0);

      /* First interpolation step; the extra i1 selects the 16-bit half. */
      args[0] = p;
      args[1] = i;
      args[2] = p;
      args[3] = high_16bits ? ctx->i1true : ctx->i1false;

      p10 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p10.f16",
                               ctx->f32, args, 4, 0);

      /* Second step produces the final f16 value. */
      args[0] = p;
      args[1] = j;
      args[2] = p10;
      args[3] = high_16bits ? ctx->i1true : ctx->i1false;

      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p2.f16",
                                ctx->f16, args, 4, 0);

   } else {
      LLVMValueRef p1;

      args[0] = i;
      args[1] = llvm_chan;
      args[2] = attr_number;
      args[3] = high_16bits ? ctx->i1true : ctx->i1false;
      args[4] = params;

      p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", ctx->f32, args, 5,
                              0);

      args[0] = p1;
      args[1] = j;
      args[2] = llvm_chan;
      args[3] = attr_number;
      args[4] = high_16bits ? ctx->i1true : ctx->i1false;
      args[5] = params;

      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", ctx->f16, args, 6,
                                0);
   }
}
837
/* Read an FS input without interpolation (flat / per-vertex read).
 *
 * \param parameter  which vertex's value to read (P0/P10/P20 selector)
 *
 * GFX11+ loads the parameter from LDS and broadcasts the requested lane
 * within the quad; the wqm wrappers keep the value defined in whole-quad
 * mode. Older chips use interp.mov directly.
 */
LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, unsigned parameter,
                                    LLVMValueRef llvm_chan, LLVMValueRef attr_number,
                                    LLVMValueRef params)
{
   LLVMValueRef args[4];

   if (ctx->gfx_level >= GFX11) {
      LLVMValueRef p;

      args[0] = llvm_chan;
      args[1] = attr_number;
      args[2] = params;

      p = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
                             ctx->f32, args, 3, 0);
      p = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32, &p, 1, 0);
      /* Replicate the selected quad lane into all four lanes. */
      p = ac_build_quad_swizzle(ctx, p, parameter, parameter, parameter, parameter);
      return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32, &p, 1, 0);
   } else {
      /* interp.mov encodes the vertex selector as (parameter + 2) % 3. */
      args[0] = LLVMConstInt(ctx->i32, (parameter + 2) % 3, 0);
      args[1] = llvm_chan;
      args[2] = attr_number;
      args[3] = params;

      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", ctx->f32, args, 4, 0);
   }
}
865
/* Emit a single-index GEP: &base_ptr[index], where base_ptr points to `type`. */
LLVMValueRef ac_build_gep_ptr(struct ac_llvm_context *ctx, LLVMTypeRef type, LLVMValueRef base_ptr,
                              LLVMValueRef index)
{
   LLVMValueRef gep_index[1] = {index};
   return LLVMBuildGEP2(ctx->builder, type, base_ptr, gep_index, 1, "");
}
871
/* Return the pointee type that ac_build_gep0 would produce for a pointer to
 * `pointee_type` indexed by `index` (which must be a constant for structs). */
LLVMTypeRef ac_build_gep0_type(LLVMTypeRef pointee_type, LLVMValueRef index)
{
   LLVMTypeKind kind = LLVMGetTypeKind(pointee_type);

   if (kind == LLVMPointerTypeKind)
      return pointee_type;

   if (kind == LLVMArrayTypeKind) {
      /* GEP into an array yields a pointer to the element type. */
      return LLVMGetElementType(pointee_type);
   }

   if (kind == LLVMStructTypeKind) {
      /* GEP into a struct yields a pointer to the index-th field. */
      return LLVMStructGetTypeAtIndex(pointee_type, LLVMConstIntGetZExtValue(index));
   }

   /* gep0 shouldn't receive any other types. */
   assert(false);
   return NULL;
}
893
/* Emit &ptr[0][index]: a two-index GEP with a leading zero, the standard
 * pattern for indexing through a pointer-to-array. */
LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr, LLVMValueRef index)
{
   LLVMValueRef gep_indices[] = {ctx->i32_0, index};
   return LLVMBuildGEP2(ctx->builder, ptr.t, ptr.v, gep_indices, 2, "");
}
903
/* Advance `ptr` (pointing to `type`) by `index` elements. */
LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMTypeRef type, LLVMValueRef ptr, LLVMValueRef index)
{
   LLVMValueRef offset[1] = {index};
   return LLVMBuildGEP2(ctx->builder, type, ptr, offset, 1, "");
}
908
/* Store `value` into ptr[0][index] (see ac_build_gep0). */
void ac_build_indexed_store(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr, LLVMValueRef index,
                            LLVMValueRef value)
{
   LLVMValueRef elem_ptr = ac_build_gep0(ctx, ptr, index);
   LLVMBuildStore(ctx->builder, value, elem_ptr);
}
914
915 /**
916 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
917 * It's equivalent to doing a load from &base_ptr[index].
918 *
919 * \param base_ptr Where the array starts.
920 * \param index The element index into the array.
921 * \param uniform Whether the base_ptr and index can be assumed to be
922 * dynamically uniform (i.e. load to an SGPR)
923 * \param invariant Whether the load is invariant (no other opcodes affect it)
924 * \param no_unsigned_wraparound
925 * For all possible re-associations and re-distributions of an expression
926 * "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
927 * without inbounds in base_ptr), this parameter is true if "addr + offset"
928 * does not result in an unsigned integer wraparound. This is used for
929 * optimal code generation of 32-bit pointer arithmetic.
930 *
931 * For example, a 32-bit immediate offset that causes a 32-bit unsigned
932 * integer wraparound can't be an imm offset in s_load_dword, because
933 * the instruction performs "addr + offset" in 64 bits.
934 *
935 * Expected usage for bindless textures by chaining GEPs:
936 * // possible unsigned wraparound, don't use InBounds:
937 * ptr1 = LLVMBuildGEP(base_ptr, index);
938 * image = load(ptr1); // becomes "s_load ptr1, 0"
939 *
940 * ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
941 * sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
942 */
static LLVMValueRef ac_build_load_custom(struct ac_llvm_context *ctx, LLVMTypeRef type,
                                         LLVMValueRef base_ptr, LLVMValueRef index,
                                         bool uniform, bool invariant, bool no_unsigned_wraparound)
{
   LLVMValueRef pointer, result;

   /* InBounds is only legal when the address computation cannot wrap in
    * 32 bits (see the big comment above); it lets the backend fold the
    * offset into the load instruction. Only applied to 32-bit const space. */
   if (no_unsigned_wraparound &&
       LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
      pointer = LLVMBuildInBoundsGEP2(ctx->builder, type, base_ptr, &index, 1, "");
   else
      pointer = LLVMBuildGEP2(ctx->builder, type, base_ptr, &index, 1, "");

   /* Uniform metadata on the pointer marks the load as dynamically uniform
    * (candidate for a scalar/SGPR load). */
   if (uniform)
      LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
   result = LLVMBuildLoad2(ctx->builder, type, pointer, "");
   /* Invariant-load metadata: no other instruction may modify the memory. */
   if (invariant)
      LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
   /* NOTE(review): alignment is hard-coded to 4 bytes regardless of `type` —
    * presumably all callers load dword-aligned data; confirm before reuse. */
   LLVMSetAlignment(result, 4);
   return result;
}
963
/* Plain indexed load: not uniform, not invariant, wraparound possible. */
LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr, LLVMValueRef index)
{
   return ac_build_load_custom(ctx, ptr.t, ptr.v, index,
                               /*uniform=*/false, /*invariant=*/false,
                               /*no_unsigned_wraparound=*/false);
}
968
/* Indexed load marked invariant (memory is never modified elsewhere). */
LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr,
                                     LLVMValueRef index)
{
   return ac_build_load_custom(ctx, ptr.t, ptr.v, index,
                               /*uniform=*/false, /*invariant=*/true,
                               /*no_unsigned_wraparound=*/false);
}
974
975 /* This assumes that there is no unsigned integer wraparound during the address
976 * computation, excluding all GEPs within base_ptr. */
/* Uniform, invariant load intended for SGPRs. Assumes no unsigned integer
 * wraparound during the address computation, excluding GEPs within base_ptr
 * (see ac_build_load_custom's documentation). */
LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr,
                                   LLVMValueRef index)
{
   return ac_build_load_custom(ctx, ptr.t, ptr.v, index,
                               /*uniform=*/true, /*invariant=*/true,
                               /*no_unsigned_wraparound=*/true);
}
982
983 /* See ac_build_load_custom() documentation. */
/* Like ac_build_load_to_sgpr, but unsigned wraparound may occur, so the
 * GEP is not marked inbounds. See ac_build_load_custom() documentation. */
LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx, struct ac_llvm_pointer ptr, LLVMValueRef index)
{
   return ac_build_load_custom(ctx, ptr.t, ptr.v, index,
                               /*uniform=*/true, /*invariant=*/true,
                               /*no_unsigned_wraparound=*/false);
}
988
get_cache_flags(struct ac_llvm_context * ctx,enum gl_access_qualifier access)989 static unsigned get_cache_flags(struct ac_llvm_context *ctx, enum gl_access_qualifier access)
990 {
991 return ac_get_hw_cache_flags(ctx->gfx_level, access).value;
992 }
993
/* Emit an amdgcn buffer store intrinsic.
 *
 * The intrinsic name is built dynamically:
 *   llvm.amdgcn.{raw|struct}.buffer.store[.format].<type>
 * "struct" indexing is used when vindex is non-NULL; NULL voffset/soffset
 * default to 0. The cache policy is derived from `access`.
 */
static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef data, LLVMValueRef vindex,
                                         LLVMValueRef voffset, LLVMValueRef soffset,
                                         enum gl_access_qualifier access, bool use_format)
{
   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = data;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   /* Fixed: the old `vindex ? vindex : ctx->i32_0` inside this if was a
    * redundant ternary — vindex is known non-NULL here. */
   if (vindex)
      args[idx++] = vindex;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_STORE), 0);
   const char *indexing_kind = vindex ? "struct" : "raw";
   char name[256], type_name[8];

   /* The intrinsic is overloaded on the stored value's type. */
   ac_build_type_name_for_intr(LLVMTypeOf(data), type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", indexing_kind, type_name);
   }

   ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, 0);
}
1022
/* Store with format conversion (buffer.store.format); no scalar offset. */
void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data,
                                  LLVMValueRef vindex, LLVMValueRef voffset, enum gl_access_qualifier access)
{
   ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, /*soffset=*/NULL,
                                access, /*use_format=*/true);
}
1028
1029 /* buffer_store_dword(,x2,x3,x4) <- the suffix is selected by the type of vdata. */
void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                 LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
                                 enum gl_access_qualifier access)
{
   unsigned num_channels = ac_get_llvm_num_components(vdata);

   /* When the target has no vec3 store support, split a 3-channel store
    * into a vec2 store plus a scalar store 8 bytes (2 dwords) further. */
   if (num_channels == 3 && !ac_has_vec3_support(ctx->gfx_level, false)) {
      LLVMValueRef chan[3];
      for (unsigned c = 0; c < 3; c++)
         chan[c] = LLVMBuildExtractElement(ctx->builder, vdata, LLVMConstInt(ctx->i32, c, 0), "");

      LLVMValueRef first_two = ac_build_gather_values(ctx, chan, 2);
      LLVMValueRef voffset_plus8 = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0,
                                                LLVMConstInt(ctx->i32, 8, 0), "");

      ac_build_buffer_store_dword(ctx, rsrc, first_two, vindex, voffset, soffset, access);
      ac_build_buffer_store_dword(ctx, rsrc, chan[2], vindex, voffset_plus8, soffset, access);
      return;
   }

   ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), vindex, voffset, soffset,
                                access, false);
}
1056
/* Emit an amdgcn raw/struct buffer load (optionally the .format variant).
 *
 * Argument list built dynamically: rsrc (v4i32), [vindex], voffset, soffset,
 * cache-policy. NULL voffset/soffset default to 0; a non-NULL vindex selects
 * the "struct" indexing flavor, otherwise "raw" is used.
 */
static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                                LLVMValueRef vindex, LLVMValueRef voffset,
                                                LLVMValueRef soffset, unsigned num_channels,
                                                LLVMTypeRef channel_type, enum gl_access_qualifier access,
                                                bool can_speculate, bool use_format)
{
   LLVMValueRef args[5];
   int idx = 0;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (vindex)
      args[idx++] = vindex;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD), 0);
   /* Widen 3-channel loads to 4 when the target has no vec3 support;
    * the extra channel is trimmed off after the intrinsic call below. */
   unsigned func =
      !ac_has_vec3_support(ctx->gfx_level, use_format) && num_channels == 3 ? 4 : num_channels;
   const char *indexing_kind = vindex ? "struct" : "raw";
   char name[256], type_name[8];

   /* D16 is only supported on gfx8+ */
   assert(!use_format || (channel_type != ctx->f16 && channel_type != ctx->i16) ||
          ctx->gfx_level >= GFX8);

   /* The intrinsic is overloaded on the (possibly widened) result type. */
   LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", indexing_kind, type_name);
   }

   LLVMValueRef result = ac_build_intrinsic(ctx, name, type, args, idx,
                                            can_speculate ? AC_ATTR_INVARIANT_LOAD : 0);
   /* Drop the padding channel added for the vec3 workaround. */
   if (func > num_channels)
      result = ac_trim_vector(ctx, result, num_channels);
   return result;
}
1096
/* Load num_channels elements of channel_type from a buffer resource.
 *
 * Two code paths:
 *  - SMEM (scalar) path, taken when allow_smem is set and the access is
 *    either non-coherent or the chip is gfx8+: emits one
 *    llvm.amdgcn.s.buffer.load per channel, advancing the offset manually.
 *    This path requires vindex == NULL and always marks the loads
 *    invariant (can_speculate is not consulted here).
 *  - VMEM path otherwise: splits the load into fetches of at most 4
 *    channels and concatenates the pieces.
 */
LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels,
                                  LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
                                  LLVMTypeRef channel_type, enum gl_access_qualifier access,
                                  bool can_speculate, bool allow_smem)
{
   /* Coherent SMEM access only works on gfx8+; older chips fall through to VMEM. */
   if (allow_smem && (!(access & ACCESS_COHERENT) || ctx->gfx_level >= GFX8)) {
      assert(vindex == NULL);

      LLVMValueRef result[32];

      /* SMEM takes a single combined offset; fold soffset into voffset. */
      LLVMValueRef offset = voffset ? voffset : ctx->i32_0;
      if (soffset)
         offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");

      char name[256], type_name[8];
      ac_build_type_name_for_intr(channel_type, type_name, sizeof(type_name));
      snprintf(name, sizeof(name), "llvm.amdgcn.s.buffer.load.%s", type_name);

      LLVMValueRef channel_size = LLVMConstInt(ctx->i32, ac_get_type_size(channel_type), 0);

      /* One scalar load per channel, stepping the offset by the element size. */
      for (int i = 0; i < num_channels; i++) {
         if (i) {
            offset = LLVMBuildAdd(ctx->builder, offset, channel_size, "");
         }
         LLVMValueRef args[3] = {
            rsrc,
            offset,
            LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD |
                                                        ACCESS_TYPE_SMEM), 0),
         };
         result[i] = ac_build_intrinsic(ctx, name, channel_type, args, 3, AC_ATTR_INVARIANT_LOAD);
      }
      if (num_channels == 1)
         return result[0];

      return ac_build_gather_values(ctx, result, num_channels);
   }

   /* LLVM is unable to select instructions for num_channels > 4, so we
    * workaround that by manually splitting larger buffer loads.
    */
   LLVMValueRef result = NULL;
   for (unsigned i = 0, fetch_num_channels; i < num_channels; i += fetch_num_channels) {
      fetch_num_channels = MIN2(4, num_channels - i);
      LLVMValueRef fetch_voffset =
         LLVMBuildAdd(ctx->builder, voffset,
                      LLVMConstInt(ctx->i32, i * ac_get_type_size(channel_type), 0), "");
      LLVMValueRef item =
         ac_build_buffer_load_common(ctx, rsrc, vindex, fetch_voffset, soffset, fetch_num_channels,
                                     channel_type, access, can_speculate, false);
      result = ac_build_concat(ctx, result, item);
   }

   return result;
}
1152
/* Load with format conversion (buffer_load_format_xyzw).
 *
 * When tfe (the status/fault dword) is requested, the instruction is emitted
 * as inline assembly returning 5 VGPRs — 4 data channels plus the extra
 * dword — since the plain intrinsic has no way to return it. Without tfe it
 * falls through to the regular buffer.load.format path.
 */
LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef vindex, LLVMValueRef voffset,
                                         unsigned num_channels, enum gl_access_qualifier access,
                                         bool can_speculate, bool d16, bool tfe)
{
   if (tfe) {
      /* d16 with tfe is not implemented in this inline-asm path. */
      assert(!d16);

      union ac_hw_cache_flags cache_flags =
         ac_get_hw_cache_flags(ctx->gfx_level, access | ACCESS_TYPE_LOAD);
      char code[1024];

      /* The definition in the assembly and the one in the constraint string
       * differs because of an assembler bug.
       */
      if (ctx->gfx_level >= GFX12) {
         /* GFX12 expresses cache policy as scope + temporal-hint modifiers. */
         const char *scope = "";
         const char *temporal_hint = "";

         if (cache_flags.gfx12.scope == gfx12_scope_se)
            scope = "scope:SCOPE_SE";
         else if (cache_flags.gfx12.scope == gfx12_scope_device)
            scope = "scope:SCOPE_DEV";
         else if (cache_flags.gfx12.scope == gfx12_scope_memory)
            scope = "scope:SCOPE_SYS";

         if (cache_flags.gfx12.temporal_hint == gfx12_load_non_temporal)
            temporal_hint = "th:TH_LOAD_NT";
         else if (cache_flags.gfx12.temporal_hint == gfx12_load_high_temporal)
            temporal_hint = "th:TH_LOAD_HT";
         else if (cache_flags.gfx12.temporal_hint == gfx12_load_last_use_discard)
            temporal_hint = "th:TH_LOAD_LU";
         else if (cache_flags.gfx12.temporal_hint == gfx12_load_near_non_temporal_far_regular_temporal)
            temporal_hint = "th:TH_LOAD_NT_RT";
         else if (cache_flags.gfx12.temporal_hint == gfx12_load_near_regular_temporal_far_non_temporal)
            temporal_hint = "th:TH_LOAD_RT_NT";
         else if (cache_flags.gfx12.temporal_hint == gfx12_load_near_non_temporal_far_high_temporal)
            temporal_hint = "th:TH_LOAD_NT_HT";

         /* Zero-init v0..v4, do the load, then wait for it to complete. */
         snprintf(code, sizeof(code),
                  "v_mov_b32 v0, 0\n"
                  "v_mov_b32 v1, 0\n"
                  "v_mov_b32 v2, 0\n"
                  "v_mov_b32 v3, 0\n"
                  "v_mov_b32 v4, 0\n"
                  "buffer_load_format_xyzw v[0:3], $1, $2, 0, idxen offen %s %s tfe\n"
                  "s_waitcnt vmcnt(0)",
                  temporal_hint, scope);
      } else {
         /* Pre-GFX12 cache policy: glc/slc/dlc bits. */
         snprintf(code, sizeof(code),
                  "v_mov_b32 v0, 0\n"
                  "v_mov_b32 v1, 0\n"
                  "v_mov_b32 v2, 0\n"
                  "v_mov_b32 v3, 0\n"
                  "v_mov_b32 v4, 0\n"
                  "buffer_load_format_xyzw v[0:3], $1, $2, 0, idxen offen %s %s tfe %s\n"
                  "s_waitcnt vmcnt(0)",
                  cache_flags.value & ac_glc ? "glc" : "",
                  cache_flags.value & ac_slc ? "slc" : "",
                  cache_flags.value & ac_dlc ? "dlc" : "");
      }

      /* asm signature: (vindex+voffset pair, rsrc) -> 5 x f32 in v[0:4]. */
      LLVMTypeRef param_types[] = {ctx->v2i32, ctx->v4i32};
      LLVMTypeRef calltype = LLVMFunctionType(LLVMVectorType(ctx->f32, 5), param_types, 2, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(calltype, code, "=&{v[0:4]},v,s", false, false);

      LLVMValueRef addr_comp[2] = {vindex ? vindex : ctx->i32_0,
                                   voffset ? voffset : ctx->i32_0};

      LLVMValueRef args[] = {ac_build_gather_values(ctx, addr_comp, 2),
                             LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "")};
      LLVMValueRef res = LLVMBuildCall2(ctx->builder, calltype, inlineasm, args, 2, "");

      /* Return the requested channels with the tfe dword appended last. */
      return ac_build_concat(ctx, ac_trim_vector(ctx, res, num_channels),
                             ac_llvm_extract_elem(ctx, res, 4));
   }

   return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
                                      num_channels, d16 ? ctx->f16 : ctx->f32, access,
                                      can_speculate, true);
}
1234
/* Emit a typed buffer (MTBUF) load:
 * llvm.amdgcn.{raw|struct}.tbuffer.load.<type>, with the hardware format
 * passed explicitly. NULL voffset/soffset default to 0. */
static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                          LLVMValueRef vindex, LLVMValueRef voffset,
                                          LLVMValueRef soffset, unsigned num_channels,
                                          unsigned tbuffer_format, LLVMTypeRef channel_type,
                                          enum gl_access_qualifier access, bool can_speculate)
{
   LLVMValueRef args[6];
   int nargs = 0;

   args[nargs++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (vindex)
      args[nargs++] = vindex;
   args[nargs++] = voffset ? voffset : ctx->i32_0;
   args[nargs++] = soffset ? soffset : ctx->i32_0;
   args[nargs++] = LLVMConstInt(ctx->i32, tbuffer_format, 0);
   args[nargs++] = LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD), 0);

   /* The intrinsic is overloaded on the result type. */
   LLVMTypeRef ret_type =
      num_channels > 1 ? LLVMVectorType(channel_type, num_channels) : channel_type;

   char type_name[8];
   ac_build_type_name_for_intr(ret_type, type_name, sizeof(type_name));

   char name[256];
   snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s",
            vindex ? "struct" : "raw", type_name);

   return ac_build_intrinsic(ctx, name, ret_type, args, nargs,
                             can_speculate ? AC_ATTR_INVARIANT_LOAD : 0);
}
1261
/* Robust typed-buffer load: split the fetch into MTBUF instructions whose
 * size is safe for the given format and alignment, then reassemble the
 * result (optionally narrowing each channel to 16 bits). */
LLVMValueRef ac_build_safe_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                        LLVMValueRef vidx, LLVMValueRef base_voffset,
                                        LLVMValueRef soffset,
                                        const enum pipe_format format,
                                        unsigned channel_bit_size,
                                        unsigned const_offset,
                                        unsigned align_offset,
                                        unsigned align_mul,
                                        unsigned num_channels,
                                        enum gl_access_qualifier access,
                                        bool can_speculate)
{
   const struct ac_vtx_format_info *vtx_info = ac_get_vtx_format_info(ctx->gfx_level, ctx->info->family, format);
   const unsigned max_channels = vtx_info->num_channels;
   /* Fold the constant offset into the vertex offset up front. */
   LLVMValueRef voffset_plus_const =
      LLVMBuildAdd(ctx->builder, base_voffset, LLVMConstInt(ctx->i32, const_offset, 0), "");

   /* Split the specified load into several MTBUF instructions,
    * according to a safe fetch size determined by aligmnent information.
    */
   LLVMValueRef result = NULL;
   for (unsigned i = 0, fetch_num_channels; i < num_channels; i += fetch_num_channels) {
      /* Packed formats (determined here by chan_byte_size == 0) should never be split. */
      assert(i == 0 || vtx_info->chan_byte_size);

      const unsigned fetch_const_offset = const_offset + i * vtx_info->chan_byte_size;
      const unsigned fetch_align_offset = (align_offset + i * vtx_info->chan_byte_size) % align_mul;
      /* Effective alignment of this fetch: the lowest set bit of the
       * misalignment, or align_mul when the fetch is perfectly aligned. */
      const unsigned fetch_alignment = fetch_align_offset ? 1 << (ffs(fetch_align_offset) - 1) : align_mul;

      fetch_num_channels =
         ac_get_safe_fetch_size(ctx->gfx_level, vtx_info, fetch_const_offset,
                                max_channels - i, fetch_alignment, num_channels - i);
      /* Pick the HW format matching the number of channels actually fetched. */
      const unsigned fetch_format = vtx_info->hw_format[fetch_num_channels - 1];
      LLVMValueRef fetch_voffset =
         LLVMBuildAdd(ctx->builder, voffset_plus_const,
                      LLVMConstInt(ctx->i32, i * vtx_info->chan_byte_size, 0), "");
      LLVMValueRef item =
         ac_build_tbuffer_load(ctx, rsrc, vidx, fetch_voffset, soffset,
                               fetch_num_channels, fetch_format, ctx->i32,
                               access, can_speculate);
      result = ac_build_concat(ctx, result, item);
   }

   /*
    * LLVM is not able to select 16-bit typed loads. Load 32-bit values instead and
    * manually truncate them to the required size.
    * TODO: Do this in NIR instead.
    */
   const struct util_format_description *desc = util_format_description(format);
   bool is_float = !desc->channel[0].pure_integer;

   if (channel_bit_size == 16) {
      LLVMValueRef channels[4];
      for (unsigned i = 0; i < num_channels; i++) {
         LLVMValueRef channel = result;
         if (num_channels > 1)
            channel = LLVMBuildExtractElement(ctx->builder, result, LLVMConstInt(ctx->i32, i, false), "");

         if (is_float) {
            /* Float formats: numeric f32 -> f16 conversion, reinterpreted as i16. */
            channel = LLVMBuildBitCast(ctx->builder, channel, ctx->f32, "");
            channel = LLVMBuildFPTrunc(ctx->builder, channel, ctx->f16, "");
            channel = LLVMBuildBitCast(ctx->builder, channel, ctx->i16, "");
         } else {
            /* Integer formats: plain bit truncation. */
            channel = LLVMBuildTrunc(ctx->builder, channel, ctx->i16, "");
         }
         channels[i] = channel;
      }
      result = ac_build_gather_values(ctx, channels, num_channels);
   }

   return result;
}
1334
1335
/* Load one 16-bit value from a raw buffer. */
LLVMValueRef ac_build_buffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                        LLVMValueRef voffset, LLVMValueRef soffset,
                                        enum gl_access_qualifier access)
{
   return ac_build_buffer_load_common(ctx, rsrc, /*vindex=*/NULL, voffset, soffset,
                                      /*num_channels=*/1, ctx->i16, access,
                                      /*can_speculate=*/false, /*use_format=*/false);
}
1343
/* Load one 8-bit value from a raw buffer. */
LLVMValueRef ac_build_buffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                       LLVMValueRef voffset, LLVMValueRef soffset,
                                       enum gl_access_qualifier access)
{
   return ac_build_buffer_load_common(ctx, rsrc, /*vindex=*/NULL, voffset, soffset,
                                      /*num_channels=*/1, ctx->i8, access,
                                      /*can_speculate=*/false, /*use_format=*/false);
}
1351
/* Store one 16-bit value to a raw buffer. */
void ac_build_buffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                 LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset,
                                 enum gl_access_qualifier access)
{
   /* Reinterpret the payload as i16 for the overloaded store intrinsic. */
   LLVMValueRef data_i16 = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
   ac_build_buffer_store_common(ctx, rsrc, data_i16, NULL, voffset, soffset, access, false);
}
1360
/* Store one 8-bit value to a raw buffer. */
void ac_build_buffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                LLVMValueRef voffset, LLVMValueRef soffset, enum gl_access_qualifier access)
{
   /* Reinterpret the payload as i8 for the overloaded store intrinsic. */
   LLVMValueRef data_i8 = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");
   ac_build_buffer_store_common(ctx, rsrc, data_i8, NULL, voffset, soffset, access, false);
}
1368
1369 /**
1370 * Set range metadata on an instruction. This can only be used on load and
1371 * call instructions. If you know an instruction can only produce the values
1372 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1373 * \p lo is the minimum value inclusive.
1374 * \p hi is the maximum value exclusive.
1375 */
void ac_set_range_metadata(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned lo,
                           unsigned hi)
{
   /* Attach !range metadata ([lo, hi), hi exclusive) — see the comment
    * above; valid only on load/call instructions. */
   LLVMTypeRef type = LLVMTypeOf(value);
   LLVMContextRef context = LLVMGetTypeContext(type);
   LLVMValueRef bounds[2] = {
      LLVMConstInt(type, lo, false),
      LLVMConstInt(type, hi, false),
   };
   LLVMValueRef range_md = LLVMMDNodeInContext(context, bounds, 2);
   LLVMSetMetadata(value, ctx->range_md_kind, range_md);
}
1388
ac_get_thread_id(struct ac_llvm_context * ctx)1389 LLVMValueRef ac_get_thread_id(struct ac_llvm_context *ctx)
1390 {
1391 return ac_build_mbcnt(ctx, LLVMConstInt(ctx->iN_wavemask, ~0ull, 0));
1392 }
1393
1394 /*
1395 * AMD GCN implements derivatives using the local data store (LDS)
1396 * All writes to the LDS happen in all executing threads at
1397 * the same time. TID is the Thread ID for the current
1398 * thread and is a value between 0 and 63, representing
1399 * the thread's position in the wavefront.
1400 *
1401 * For the pixel shader threads are grouped into quads of four pixels.
1402 * The TIDs of the pixels of a quad are:
1403 *
1404 * +------+------+
1405 * |4n + 0|4n + 1|
1406 * +------+------+
1407 * |4n + 2|4n + 3|
1408 * +------+------+
1409 *
1410 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
1411 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
1412 * the current pixel's column, and masking with 0xfffffffe yields the TID
1413 * of the left pixel of the current pixel's row.
1414 *
1415 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
1416 * adding 2 yields the TID of the pixel below the top pixel.
1417 */
/* Emit a screen-space derivative of `val` via quad swizzles.
 * `mask` selects the base lane within each pixel quad and `idx` the
 * neighbor offset; see the quad-layout comment above for the encoding. */
LLVMValueRef ac_build_ddxy(struct ac_llvm_context *ctx, uint32_t mask, int idx, LLVMValueRef val)
{
   unsigned tl_lanes[4], trbl_lanes[4];
   char name[32], type[8];
   LLVMValueRef tl, trbl;
   LLVMTypeRef result_type;
   LLVMValueRef result;

   result_type = ac_to_float_type(ctx, LLVMTypeOf(val));

   /* Quad swizzles operate on 32-bit values: widen f16, reinterpret v2f16. */
   if (result_type == ctx->f16)
      val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
   else if (result_type == ctx->v2f16)
      val = LLVMBuildBitCast(ctx->builder, val, ctx->i32, "");

   /* For each lane i of the quad: tl = the base pixel (i & mask),
    * trbl = its neighbor idx lanes away. */
   for (unsigned i = 0; i < 4; ++i) {
      tl_lanes[i] = i & mask;
      trbl_lanes[i] = (i & mask) + idx;
   }

   tl = ac_build_quad_swizzle(ctx, val, tl_lanes[0], tl_lanes[1], tl_lanes[2], tl_lanes[3]);
   trbl =
      ac_build_quad_swizzle(ctx, val, trbl_lanes[0], trbl_lanes[1], trbl_lanes[2], trbl_lanes[3]);

   /* Undo the f16 widening before reinterpreting and subtracting. */
   if (result_type == ctx->f16) {
      tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
      trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
   }

   tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
   trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
   /* Derivative = neighbor - base. */
   result = LLVMBuildFSub(ctx->builder, trbl, tl, "");

   /* Wrap the result in the overloaded wqm intrinsic.
    * NOTE(review): presumably to keep helper lanes' values valid for
    * whole-quad mode — confirm against the amdgcn.wqm semantics. */
   ac_build_type_name_for_intr(result_type, type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);

   return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
}
1456
/* Emit s_sendmsg with the given immediate and m0 payload. */
void ac_build_sendmsg(struct ac_llvm_context *ctx, uint32_t imm, LLVMValueRef m0_content)
{
   LLVMValueRef msg_args[2] = {
      LLVMConstInt(ctx->i32, imm, false),
      m0_content,
   };
   ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, msg_args, 2, 0);
}
1464
/* Signed "find most significant bit" returning an LSB-based index;
 * 0 and -1 (which have no significant bit) yield -1. */
LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
{
   LLVMValueRef msb =
      ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", dst_type, &arg, 1, 0);

   /* The HW returns the last bit index from MSB, but NIR/TGSI wants
    * the index from LSB. Invert it by doing "31 - msb". */
   msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), msb, "");

   /* Degenerate inputs (0 and -1) map to -1. */
   LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
   LLVMValueRef is_zero = LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, ctx->i32_0, "");
   LLVMValueRef is_all_ones = LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, all_ones, "");
   LLVMValueRef degenerate = LLVMBuildOr(ctx->builder, is_zero, is_all_ones, "");

   return LLVMBuildSelect(ctx->builder, degenerate, all_ones, msb, "");
}
1481
/* Unsigned "find most significant set bit", result always i32.
 * rev=false converts the ctlz result to an LSB-based index; rev=true
 * returns the raw leading-zero count. arg == 0 yields -1. */
LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type,
                           bool rev)
{
   unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
   const char *ctlz_name;
   LLVMTypeRef op_type;
   LLVMValueRef top_bit, zero_of_type;

   switch (bitsize) {
   case 64:
      ctlz_name = "llvm.ctlz.i64";
      op_type = ctx->i64;
      top_bit = LLVMConstInt(ctx->i64, 63, false);
      zero_of_type = ctx->i64_0;
      break;
   case 32:
      ctlz_name = "llvm.ctlz.i32";
      op_type = ctx->i32;
      top_bit = LLVMConstInt(ctx->i32, 31, false);
      zero_of_type = ctx->i32_0;
      break;
   case 16:
      ctlz_name = "llvm.ctlz.i16";
      op_type = ctx->i16;
      top_bit = LLVMConstInt(ctx->i16, 15, false);
      zero_of_type = ctx->i16_0;
      break;
   case 8:
      ctlz_name = "llvm.ctlz.i8";
      op_type = ctx->i8;
      top_bit = LLVMConstInt(ctx->i8, 7, false);
      zero_of_type = ctx->i8_0;
      break;
   default:
      unreachable("invalid bitsize");
      break;
   }

   /* Second operand is llvm.ctlz's zero-is-undefined flag; the explicit
    * zero check below supplies the result for that case. */
   LLVMValueRef ctlz_args[2] = {arg, ctx->i1true};
   LLVMValueRef msb = ac_build_intrinsic(ctx, ctlz_name, op_type, ctlz_args, 2, 0);

   if (!rev) {
      /* The HW returns the last bit index from MSB, but TGSI/NIR wants
       * the index from LSB. Invert it by doing "31 - msb". */
      msb = LLVMBuildSub(ctx->builder, top_bit, msb, "");
   }

   /* Normalize the result width to i32. */
   if (bitsize == 64)
      msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, "");
   else if (bitsize < 32)
      msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, "");

   /* arg == 0 has no set bit: return -1. */
   LLVMValueRef arg_is_zero = LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero_of_type, "");
   return LLVMBuildSelect(ctx->builder, arg_is_zero,
                          LLVMConstInt(ctx->i32, -1, true), msb, "");
}
1545
/* Floating-point minimum via the overloaded llvm.minnum intrinsic. */
LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMTypeRef t = LLVMTypeOf(a);
   char type_name[64], intr_name[64];

   ac_build_type_name_for_intr(t, type_name, sizeof(type_name));
   snprintf(intr_name, sizeof(intr_name), "llvm.minnum.%s", type_name);

   LLVMValueRef operands[2] = {a, b};
   return ac_build_intrinsic(ctx, intr_name, t, operands, 2, 0);
}
1555
/* Floating-point maximum via the overloaded llvm.maxnum intrinsic. */
LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMTypeRef t = LLVMTypeOf(a);
   char type_name[64], intr_name[64];

   ac_build_type_name_for_intr(t, type_name, sizeof(type_name));
   snprintf(intr_name, sizeof(intr_name), "llvm.maxnum.%s", type_name);

   LLVMValueRef operands[2] = {a, b};
   return ac_build_intrinsic(ctx, intr_name, t, operands, 2, 0);
}
1565
/* Signed integer minimum, built as icmp sle + select. */
LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef a_le_b = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_le_b, a, b, "");
}
1571
/* Signed integer maximum, built as icmp sgt + select. */
LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef a_gt_b = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_gt_b, a, b, "");
}
1577
/* Unsigned integer minimum, built as icmp ule + select. */
LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef a_le_b = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_le_b, a, b, "");
}
1583
/* Unsigned integer maximum, built as icmp uge + select. */
LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef a_ge_b = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_ge_b, a, b, "");
}
1589
/* Clamp a float value to [0.0, 1.0] in its own type (GLSL-style saturate). */
LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMTypeRef float_type = LLVMTypeOf(value);
   LLVMValueRef lower = LLVMConstReal(float_type, 0.0);
   LLVMValueRef upper = LLVMConstReal(float_type, 1.0);

   return ac_build_fmin(ctx, ac_build_fmax(ctx, value, lower), upper);
}
1596
/* Emit an export (exp) instruction described by "a".
 *
 * Depending on a->compr this is either a compressed export (two packed 2x16
 * channels, not available on GFX11+) or a full 32-bit four-channel export.
 */
void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
{
   LLVMValueRef args[9];

   args[0] = LLVMConstInt(ctx->i32, a->target, 0);           /* export target */
   args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0); /* channel write mask */

   if (a->compr) {
      /* Compressed exports were removed on GFX11 (see ac_build_export_null). */
      assert(ctx->gfx_level < GFX11);

      args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->v2i16, "");
      args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->v2i16, "");
      args[4] = LLVMConstInt(ctx->i1, a->done, 0);       /* DONE bit */
      args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0); /* VM (valid EXEC mask) bit */

      ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", ctx->voidt, args, 6, 0);
   } else {
      args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->f32, "");
      args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->f32, "");
      args[4] = LLVMBuildBitCast(ctx->builder, a->out[2], ctx->f32, "");
      args[5] = LLVMBuildBitCast(ctx->builder, a->out[3], ctx->f32, "");
      args[6] = LLVMConstInt(ctx->i1, a->done, 0);       /* DONE bit */
      args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0); /* VM (valid EXEC mask) bit */

      ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", ctx->voidt, args, 8, 0);
   }
}
1624
/* Emit a "null" export that only terminates the pixel shader's exports.
 *
 * uses_discard indicates the shader may kill pixels, in which case the EXEC
 * mask must still be exported even on chips that could otherwise skip it.
 */
void ac_build_export_null(struct ac_llvm_context *ctx, bool uses_discard)
{
   struct ac_export_args args;

   /* Gfx10+ doesn't need to export anything if we don't need to export the EXEC mask
    * for discard.
    */
   if (ctx->gfx_level >= GFX10 && !uses_discard)
      return;

   args.enabled_channels = 0x0; /* enabled channels */
   args.valid_mask = 1;         /* whether the EXEC mask is valid */
   args.done = 1;               /* DONE bit */
   /* Gfx11 doesn't support null exports, and mrt0 should be exported instead. */
   args.target = ctx->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
   args.compr = 0;                       /* COMPR flag (0 = 32-bit export) */
   args.out[0] = LLVMGetUndef(ctx->f32); /* R */
   args.out[1] = LLVMGetUndef(ctx->f32); /* G */
   args.out[2] = LLVMGetUndef(ctx->f32); /* B */
   args.out[3] = LLVMGetUndef(ctx->f32); /* A */

   ac_build_export(ctx, &args);
}
1648
ac_num_coords(enum ac_image_dim dim)1649 static unsigned ac_num_coords(enum ac_image_dim dim)
1650 {
1651 switch (dim) {
1652 case ac_image_1d:
1653 return 1;
1654 case ac_image_2d:
1655 case ac_image_1darray:
1656 return 2;
1657 case ac_image_3d:
1658 case ac_image_cube:
1659 case ac_image_2darray:
1660 case ac_image_2dmsaa:
1661 return 3;
1662 case ac_image_2darraymsaa:
1663 return 4;
1664 default:
1665 unreachable("ac_num_coords: bad dim");
1666 }
1667 }
1668
ac_num_derivs(enum ac_image_dim dim)1669 static unsigned ac_num_derivs(enum ac_image_dim dim)
1670 {
1671 switch (dim) {
1672 case ac_image_1d:
1673 case ac_image_1darray:
1674 return 2;
1675 case ac_image_2d:
1676 case ac_image_2darray:
1677 case ac_image_cube:
1678 return 4;
1679 case ac_image_3d:
1680 return 6;
1681 case ac_image_2dmsaa:
1682 case ac_image_2darraymsaa:
1683 default:
1684 unreachable("derivatives not supported");
1685 }
1686 }
1687
get_atomic_name(enum ac_atomic_op op)1688 static const char *get_atomic_name(enum ac_atomic_op op)
1689 {
1690 switch (op) {
1691 case ac_atomic_swap:
1692 return "swap";
1693 case ac_atomic_add:
1694 return "add";
1695 case ac_atomic_sub:
1696 return "sub";
1697 case ac_atomic_smin:
1698 return "smin";
1699 case ac_atomic_umin:
1700 return "umin";
1701 case ac_atomic_smax:
1702 return "smax";
1703 case ac_atomic_umax:
1704 return "umax";
1705 case ac_atomic_and:
1706 return "and";
1707 case ac_atomic_or:
1708 return "or";
1709 case ac_atomic_xor:
1710 return "xor";
1711 case ac_atomic_inc_wrap:
1712 return "inc";
1713 case ac_atomic_dec_wrap:
1714 return "dec";
1715 case ac_atomic_fmin:
1716 return "fmin";
1717 case ac_atomic_fmax:
1718 return "fmax";
1719 }
1720 unreachable("bad atomic op");
1721 }
1722
/* Build a single llvm.amdgcn.image.* intrinsic call (sample, gather4, load,
 * store, atomic, getlod or getresinfo) from the description in "a".
 *
 * The argument order, the ".c/.b/.l/.d/.lz/.cl/.o" name modifiers and the
 * type-overload suffixes must exactly match LLVM's AMDGPU image intrinsic
 * definitions — do not reorder anything here.
 */
LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_args *a)
{
   const char *overload[3] = {"", "", ""};
   unsigned num_overloads = 0;
   LLVMValueRef args[18];
   unsigned num_args = 0;
   enum ac_image_dim dim = a->dim;

   /* Validate mutually-exclusive fields (bias/lod/level_zero/derivs and
    * min_lod/lod/level_zero), gfx-level requirements (d16, a16, g16), and
    * that operand bit sizes match the a16/g16 modes.
    */
   assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || !a->level_zero);
   assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
           a->opcode != ac_image_store_mip) ||
          a->lod);
   assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
          (!a->compare && !a->offset));
   assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
           a->opcode == ac_image_get_lod) ||
          !a->bias);
   assert((a->bias ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) + (a->derivs[0] ? 1 : 0) <=
          1);
   assert((a->min_lod ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) <= 1);
   assert(!a->d16 || (ctx->gfx_level >= GFX8 && a->opcode != ac_image_atomic &&
                      a->opcode != ac_image_atomic_cmpswap && a->opcode != ac_image_get_lod &&
                      a->opcode != ac_image_get_resinfo));
   assert(!a->a16 || ctx->gfx_level >= GFX9);
   assert(!a->derivs[0] || a->g16 == a->a16 || ctx->gfx_level >= GFX10);

   assert(!a->offset ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->offset)) == 32);
   assert(!a->bias ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == (a->a16 ? 16 : 32));
   assert(!a->compare ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->compare)) == 32);
   assert(!a->derivs[0] ||
          ((!a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 16) &&
           (a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 32)));
   assert(!a->coords[0] ||
          ((!a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 16) &&
           (a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 32)));
   assert(!a->lod ||
          ((a->opcode != ac_image_get_resinfo || ac_get_elem_bits(ctx, LLVMTypeOf(a->lod))) &&
           (a->opcode == ac_image_get_resinfo ||
            ac_get_elem_bits(ctx, LLVMTypeOf(a->lod)) ==
               ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])))));
   assert(!a->min_lod ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->min_lod)) ==
             ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])));

   /* getlod doesn't take the array layer / cube face, so demote the dim. */
   if (a->opcode == ac_image_get_lod) {
      switch (dim) {
      case ac_image_1darray:
         dim = ac_image_1d;
         break;
      case ac_image_2darray:
      case ac_image_cube:
         dim = ac_image_2d;
         break;
      default:
         break;
      }
   }

   bool sample = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
                 a->opcode == ac_image_get_lod;
   bool atomic = a->opcode == ac_image_atomic || a->opcode == ac_image_atomic_cmpswap;
   bool load = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
               a->opcode == ac_image_load || a->opcode == ac_image_load_mip;
   /* Sampling ops take float coords, the rest integer; a16 halves the width. */
   LLVMTypeRef coord_type = sample ? (a->a16 ? ctx->f16 : ctx->f32) : (a->a16 ? ctx->i16 : ctx->i32);
   uint8_t dmask = a->dmask;
   LLVMTypeRef data_type;
   char data_type_str[32];

   /* Determine the data (return or store-operand) type. */
   if (atomic) {
      data_type = LLVMTypeOf(a->data[0]);
   } else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
      /* Image stores might have been shrunk using the format. */
      data_type = LLVMTypeOf(a->data[0]);
      dmask = (1 << ac_get_llvm_num_components(a->data[0])) - 1;
   } else {
      data_type = a->d16 ? ctx->v4f16 : ctx->v4f32;
   }

   /* With TFE enabled the intrinsic returns {data, i32 status}. */
   if (a->tfe) {
      data_type = LLVMStructTypeInContext(
         ctx->context, (LLVMTypeRef[]){data_type, ctx->i32}, 2, false);
   }

   /* Assemble the argument list in the order the intrinsic expects:
    * [data], [cmpswap src], [dmask], [offset], [bias], [compare], [derivs...],
    * coords..., [lod], [min_lod], rsrc, [sampler, unorm], texfailctrl,
    * cachepolicy.
    */
   if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
      args[num_args++] = a->data[0];
      if (a->opcode == ac_image_atomic_cmpswap)
         args[num_args++] = a->data[1];
   }

   if (!atomic)
      args[num_args++] = LLVMConstInt(ctx->i32, dmask, false);

   if (a->offset)
      args[num_args++] = ac_to_integer(ctx, a->offset);
   if (a->bias) {
      args[num_args++] = ac_to_float(ctx, a->bias);
      overload[num_overloads++] = ".f32";
   }
   if (a->compare)
      args[num_args++] = ac_to_float(ctx, a->compare);
   if (a->derivs[0]) {
      unsigned count = ac_num_derivs(dim);
      for (unsigned i = 0; i < count; ++i)
         args[num_args++] = ac_to_float(ctx, a->derivs[i]);
      overload[num_overloads++] = a->g16 ? ".f16" : ".f32";
   }
   unsigned num_coords = a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
   for (unsigned i = 0; i < num_coords; ++i)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
   if (a->lod)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
   if (a->min_lod)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, "");

   overload[num_overloads++] = sample ? (a->a16 ? ".f16" : ".f32") : (a->a16 ? ".i16" : ".i32");

   args[num_args++] = a->resource;
   if (sample) {
      args[num_args++] = a->sampler;
      args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
   }

   args[num_args++] = a->tfe ? ctx->i32_1 : ctx->i32_0; /* texfailctrl */
   args[num_args++] = LLVMConstInt(
      ctx->i32, get_cache_flags(ctx,
                                a->access |
                                (atomic ? ACCESS_TYPE_ATOMIC :
                                 load ? ACCESS_TYPE_LOAD : ACCESS_TYPE_STORE)),
      false);

   /* Base intrinsic name plus the atomic sub-operation, if any. */
   const char *name;
   const char *atomic_subop = "";
   switch (a->opcode) {
   case ac_image_sample:
      name = "sample";
      break;
   case ac_image_gather4:
      name = "gather4";
      break;
   case ac_image_load:
      name = "load";
      break;
   case ac_image_load_mip:
      name = "load.mip";
      break;
   case ac_image_store:
      name = "store";
      break;
   case ac_image_store_mip:
      name = "store.mip";
      break;
   case ac_image_atomic:
      name = "atomic.";
      atomic_subop = get_atomic_name(a->atomic);
      break;
   case ac_image_atomic_cmpswap:
      name = "atomic.";
      atomic_subop = "cmpswap";
      break;
   case ac_image_get_lod:
      name = "getlod";
      break;
   case ac_image_get_resinfo:
      name = "getresinfo";
      break;
   default:
      unreachable("invalid image opcode");
   }

   const char *dimname;
   switch (dim) {
   case ac_image_1d:
      dimname = "1d";
      break;
   case ac_image_2d:
      dimname = "2d";
      break;
   case ac_image_3d:
      dimname = "3d";
      break;
   case ac_image_cube:
      dimname = "cube";
      break;
   case ac_image_1darray:
      dimname = "1darray";
      break;
   case ac_image_2darray:
      dimname = "2darray";
      break;
   case ac_image_2dmsaa:
      dimname = "2dmsaa";
      break;
   case ac_image_2darraymsaa:
      dimname = "2darraymsaa";
      break;
   default:
      unreachable("invalid dim");
   }

   ac_build_type_name_for_intr(data_type, data_type_str, sizeof(data_type_str));

   /* ".l" only applies to sample/gather; other mip ops encode it in the name. */
   bool lod_suffix = a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
   char intr_name[96];
   snprintf(intr_name, sizeof(intr_name),
            "llvm.amdgcn.image.%s%s" /* base name */
            "%s%s%s%s" /* sample/gather modifiers */
            ".%s.%s%s%s%s", /* dimension and type overloads */
            name, atomic_subop, a->compare ? ".c" : "",
            a->bias ? ".b" : lod_suffix ? ".l" : a->derivs[0] ? ".d" : a->level_zero ? ".lz" : "",
            a->min_lod ? ".cl" : "", a->offset ? ".o" : "", dimname,
            data_type_str, overload[0], overload[1], overload[2]);

   LLVMTypeRef retty;
   if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
      retty = ctx->voidt;
   else
      retty = data_type;

   LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, retty, args, num_args, a->attributes);
   if (a->tfe) {
      /* Flatten the {data, status} struct into one vector for the caller. */
      LLVMValueRef texel = LLVMBuildExtractValue(ctx->builder, result, 0, "");
      LLVMValueRef code = LLVMBuildExtractValue(ctx->builder, result, 1, "");
      result = ac_build_concat(ctx, texel, ac_to_float(ctx, code));
   }

   if (!sample && !atomic && retty != ctx->voidt)
      result = ac_to_integer(ctx, result);

   return result;
}
1956
/* Return the sample count of an MSAA image as an i32.
 *
 * log2(samples) is stored in bits [19:16] of dword 3 of the image
 * descriptor; this extracts it and computes 1 << log2(samples).
 */
LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, LLVMValueRef rsrc)
{
   LLVMValueRef samples;

   /* Read the samples from the descriptor directly.
    * Hardware doesn't have any instruction for this.
    */
   samples = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 3, 0), "");
   samples = LLVMBuildLShr(ctx->builder, samples, LLVMConstInt(ctx->i32, 16, 0), "");
   samples = LLVMBuildAnd(ctx->builder, samples, LLVMConstInt(ctx->i32, 0xf, 0), "");
   samples = LLVMBuildShl(ctx->builder, ctx->i32_1, samples, "");
   return samples;
}
1970
/* Pack two f32 values into a v2f16 with round-toward-zero (v_cvt_pkrtz). */
LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", ctx->v2f16, args, 2, 0);
}
1975
/* Pack two f32 values as signed-normalized i16s into one i32 (v_cvt_pknorm_i16). */
LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", ctx->v2i16, args, 2, 0);
   return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
}
1981
/* Pack two f32 values as unsigned-normalized u16s into one i32 (v_cvt_pknorm_u16). */
LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", ctx->v2i16, args, 2, 0);
   return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
}
1987
/* Pack two f16 values as signed-normalized i16s into one i32.
 *
 * Emitted as inline assembly (no LLVM intrinsic exists for this
 * instruction); the mnemonic gained an underscore on GFX11.
 */
LLVMValueRef ac_build_cvt_pknorm_i16_f16(struct ac_llvm_context *ctx,
                                         LLVMValueRef args[2])
{
   LLVMTypeRef param_types[] = {ctx->f16, ctx->f16};
   LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false);
   LLVMValueRef code = LLVMConstInlineAsm(calltype,
                                          ctx->gfx_level >= GFX11 ?
                                             "v_cvt_pk_norm_i16_f16 $0, $1, $2" :
                                             "v_cvt_pknorm_i16_f16 $0, $1, $2",
                                          "=v,v,v", false, false);
   return LLVMBuildCall2(ctx->builder, calltype, code, args, 2, "");
}
2000
/* Pack two f16 values as unsigned-normalized u16s into one i32.
 *
 * Emitted as inline assembly (no LLVM intrinsic exists for this
 * instruction); the mnemonic gained an underscore on GFX11.
 */
LLVMValueRef ac_build_cvt_pknorm_u16_f16(struct ac_llvm_context *ctx,
                                         LLVMValueRef args[2])
{
   LLVMTypeRef param_types[] = {ctx->f16, ctx->f16};
   LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false);
   LLVMValueRef code = LLVMConstInlineAsm(calltype,
                                          ctx->gfx_level >= GFX11 ?
                                             "v_cvt_pk_norm_u16_f16 $0, $1, $2" :
                                             "v_cvt_pknorm_u16_f16 $0, $1, $2",
                                          "=v,v,v", false, false);
   return LLVMBuildCall2(ctx->builder, calltype, code, args, 2, "");
}
2013
2014 /* The 8-bit and 10-bit clamping is for HW workarounds. */
ac_build_cvt_pk_i16(struct ac_llvm_context * ctx,LLVMValueRef args[2],unsigned bits,bool hi)2015 LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
2016 bool hi)
2017 {
2018 assert(bits == 8 || bits == 10 || bits == 16);
2019
2020 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
2021 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
2022 LLVMValueRef max_alpha = bits != 10 ? max_rgb : ctx->i32_1;
2023 LLVMValueRef min_alpha = bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2024
2025 /* Clamp. */
2026 if (bits != 16) {
2027 for (int i = 0; i < 2; i++) {
2028 bool alpha = hi && i == 1;
2029 args[i] = ac_build_imin(ctx, args[i], alpha ? max_alpha : max_rgb);
2030 args[i] = ac_build_imax(ctx, args[i], alpha ? min_alpha : min_rgb);
2031 }
2032 }
2033
2034 LLVMValueRef res =
2035 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", ctx->v2i16, args, 2, 0);
2036 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2037 }
2038
2039 /* The 8-bit and 10-bit clamping is for HW workarounds. */
ac_build_cvt_pk_u16(struct ac_llvm_context * ctx,LLVMValueRef args[2],unsigned bits,bool hi)2040 LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
2041 bool hi)
2042 {
2043 assert(bits == 8 || bits == 10 || bits == 16);
2044
2045 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
2046 LLVMValueRef max_alpha = bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2047
2048 /* Clamp. */
2049 if (bits != 16) {
2050 for (int i = 0; i < 2; i++) {
2051 bool alpha = hi && i == 1;
2052 args[i] = ac_build_umin(ctx, args[i], alpha ? max_alpha : max_rgb);
2053 }
2054 }
2055
2056 LLVMValueRef res =
2057 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", ctx->v2i16, args, 2, 0);
2058 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2059 }
2060
/* Emit llvm.amdgcn.wqm.vote on the i1 condition and return the i1 result. */
LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, &i1, 1, 0);
}
2065
/* Kill the invocations where i1 is false (llvm.amdgcn.kill). */
void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, &i1, 1, 0);
}
2070
/* 32-bit bitfield extract: v_bfe_i32 (sign-extending) or v_bfe_u32. */
LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, LLVMValueRef offset,
                          LLVMValueRef width, bool is_signed)
{
   const char *intr_name = is_signed ? "llvm.amdgcn.sbfe.i32" : "llvm.amdgcn.ubfe.i32";
   LLVMValueRef operands[] = {input, offset, width};

   return ac_build_intrinsic(ctx, intr_name, ctx->i32, operands, 3, 0);
}
2083
/* Integer multiply-add: s0 * s1 + s2. */
LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                           LLVMValueRef s2)
{
   LLVMValueRef product = LLVMBuildMul(ctx->builder, s0, s1, "");
   return LLVMBuildAdd(ctx->builder, product, s2, "");
}
2089
/* Float multiply-add: s0 * s1 + s2. */
LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                           LLVMValueRef s2)
{
   /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */
   if (ctx->gfx_level >= GFX10) {
      LLVMValueRef operands[] = {s0, s1, s2};
      return ac_build_intrinsic(ctx, "llvm.fma.f32", ctx->f32, operands, 3, 0);
   }

   LLVMValueRef product = LLVMBuildFMul(ctx->builder, s0, s1, "");
   return LLVMBuildFAdd(ctx->builder, product, s2, "");
}
2099
/* Emit the wait instruction(s) implied by "wait_flags" (AC_WAIT_* bits).
 *
 * GFX12 has a separate s_wait_*cnt instruction per counter. Older chips pack
 * everything into a single s_waitcnt immediate; "wait for counter X" is
 * expressed by setting that counter's field to 0, and "don't wait" by its
 * maximum value.
 */
void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
{
   if (!wait_flags)
      return;

   if (ctx->gfx_level >= GFX12) {
      if (wait_flags & AC_WAIT_DS)
         ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.dscnt", ctx->voidt, &ctx->i16_0, 1, 0);
      if (wait_flags & AC_WAIT_KM)
         ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.kmcnt", ctx->voidt, &ctx->i16_0, 1, 0);
      if (wait_flags & AC_WAIT_EXP)
         ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.expcnt", ctx->voidt, &ctx->i16_0, 1, 0);
      if (wait_flags & AC_WAIT_LOAD)
         ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.loadcnt", ctx->voidt, &ctx->i16_0, 1, 0);
      if (wait_flags & AC_WAIT_STORE)
         ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.storecnt", ctx->voidt, &ctx->i16_0, 1, 0);
      if (wait_flags & AC_WAIT_SAMPLE)
         ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.samplecnt", ctx->voidt, &ctx->i16_0, 1, 0);
      if (wait_flags & AC_WAIT_BVH)
         ac_build_intrinsic(ctx, "llvm.amdgcn.s.wait.bvhcnt", ctx->voidt, &ctx->i16_0, 1, 0);
   } else {
      /* Default each counter to "don't wait" (its maximum value). */
      unsigned expcnt = 7;
      unsigned lgkmcnt = 63;
      unsigned vmcnt = ctx->gfx_level >= GFX9 ? 63 : 15;
      unsigned vscnt = 63;

      if (wait_flags & AC_WAIT_EXP)
         expcnt = 0;
      if (wait_flags & (AC_WAIT_DS | AC_WAIT_KM))
         lgkmcnt = 0;
      if (wait_flags & (AC_WAIT_LOAD | AC_WAIT_SAMPLE | AC_WAIT_BVH))
         vmcnt = 0;

      /* Stores have a separate counter (vscnt) only on GFX10+. */
      if (wait_flags & AC_WAIT_STORE) {
         if (ctx->gfx_level >= GFX10)
            vscnt = 0;
         else
            vmcnt = 0;
      }

      /* There is no intrinsic for vscnt(0), so use a fence. It waits for everything except expcnt. */
      if (vscnt == 0) {
         assert(!(wait_flags & AC_WAIT_EXP));
         LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
         return;
      }

      unsigned simm16;

      /* The s_waitcnt immediate layout changed on GFX11. */
      if (ctx->gfx_level >= GFX11)
         simm16 = expcnt | (lgkmcnt << 4) | (vmcnt << 10);
      else
         simm16 = (lgkmcnt << 8) | (expcnt << 4) | (vmcnt & 0xf) | ((vmcnt >> 4) << 14);

      LLVMValueRef args[1] = {
         LLVMConstInt(ctx->i32, simm16, false),
      };
      ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0);
   }
}
2160
/* Clamp "src" of the given float type to [0, 1] (GLSL saturate).
 *
 * Uses a single v_med3 (llvm.amdgcn.fmed3) where LLVM exposes it; falls back
 * to fmax+fmin for 64-bit, packed v2f16, and 16-bit on GFX6-GFX8.
 *
 * Fixes: the local "type" variable shadowed the "type" parameter (-Wshadow
 * hazard), and string literals were assigned to a non-const char pointer.
 */
LLVMValueRef ac_build_fsat(struct ac_llvm_context *ctx, LLVMValueRef src,
                           LLVMTypeRef type)
{
   unsigned bitsize = ac_get_elem_bits(ctx, type);
   LLVMValueRef zero = LLVMConstReal(type, 0.0);
   LLVMValueRef one = LLVMConstReal(type, 1.0);
   LLVMValueRef result;

   if (bitsize == 64 || (bitsize == 16 && ctx->gfx_level <= GFX8) || type == ctx->v2f16) {
      /* Use fmin/fmax for 64-bit fsat or 16-bit on GFX6-GFX8 because LLVM
       * doesn't expose an intrinsic.
       */
      result = ac_build_fmin(ctx, ac_build_fmax(ctx, src, zero), one);
   } else {
      LLVMTypeRef med3_type; /* renamed: don't shadow the "type" parameter */
      const char *intr;      /* const: points at string literals */

      if (bitsize == 16) {
         intr = "llvm.amdgcn.fmed3.f16";
         med3_type = ctx->f16;
      } else {
         assert(bitsize == 32);
         intr = "llvm.amdgcn.fmed3.f32";
         med3_type = ctx->f32;
      }

      /* med3(0, 1, src) == clamp(src, 0, 1). */
      LLVMValueRef params[] = {
         zero,
         one,
         src,
      };

      result = ac_build_intrinsic(ctx, intr, med3_type, params, 3, 0);
   }

   if (ctx->gfx_level < GFX9 && bitsize == 32) {
      /* Only pre-GFX9 chips do not flush denorms. */
      result = ac_build_canonicalize(ctx, result, bitsize);
   }

   return result;
}
2203
/* Return the fractional part of src0 via llvm.amdgcn.fract.{f16,f32,f64},
 * chosen by "bitsize" (16, 32, or anything else meaning 64).
 *
 * Fix: string literals are now bound to a const char pointer.
 */
LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   const char *intr; /* const: points at string literals */

   if (bitsize == 16) {
      intr = "llvm.amdgcn.fract.f16";
      type = ctx->f16;
   } else if (bitsize == 32) {
      intr = "llvm.amdgcn.fract.f32";
      type = ctx->f32;
   } else {
      intr = "llvm.amdgcn.fract.f64";
      type = ctx->f64;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, 0);
}
2225
ac_const_uint_vec(struct ac_llvm_context * ctx,LLVMTypeRef type,uint64_t value)2226 LLVMValueRef ac_const_uint_vec(struct ac_llvm_context *ctx, LLVMTypeRef type, uint64_t value)
2227 {
2228
2229 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
2230 LLVMValueRef scalar = LLVMConstInt(LLVMGetElementType(type), value, 0);
2231 unsigned vec_size = LLVMGetVectorSize(type);
2232 LLVMValueRef *scalars = alloca(vec_size * sizeof(LLVMValueRef));
2233
2234 for (unsigned i = 0; i < vec_size; i++)
2235 scalars[i] = scalar;
2236 return LLVMConstVector(scalars, vec_size);
2237 }
2238 return LLVMConstInt(type, value, 0);
2239 }
2240
/* Integer sign: returns -1, 0, or 1 in the (possibly vector) type of src0,
 * computed as clamp(src0, -1, 1) so it can map to a single v_med3.
 */
LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   LLVMTypeRef type = LLVMTypeOf(src0);
   LLVMValueRef val;

   /* v_med3 is selected only when max is first. (LLVM bug?) */
   val = ac_build_imax(ctx, src0, ac_const_uint_vec(ctx, type, -1));
   return ac_build_imin(ctx, val, ac_const_uint_vec(ctx, type, 1));
}
2250
/* Turn -0.0 into +0.0 (all other values pass through unchanged).
 *
 * Signed zeros must be honored around the FAdd, otherwise the optimizer
 * could fold "val + 0" away — hence the temporary fast-math toggle.
 */
static LLVMValueRef ac_eliminate_negative_zero(struct ac_llvm_context *ctx, LLVMValueRef val)
{
   ac_enable_signed_zeros(ctx);
   /* (val + 0) converts negative zero to positive zero. */
   val = LLVMBuildFAdd(ctx->builder, val, LLVMConstNull(LLVMTypeOf(val)), "");
   ac_disable_signed_zeros(ctx);
   return val;
}
2259
/* Float sign: returns -1.0, 0.0, or 1.0 in the type of src.
 *
 * For 16/32-bit floats, implemented via ac_build_isign on the bit pattern
 * (after eliminating -0.0); for 64-bit, by selecting the f64 constant's
 * high dword directly.
 */
LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   LLVMTypeRef type = LLVMTypeOf(src);
   LLVMValueRef pos, neg, dw[2], val;
   unsigned bitsize = ac_get_elem_bits(ctx, type);

   /* The standard version leads to this:
    * v_cmp_ngt_f32_e64 s[0:1], s4, 0 ; D40B0000 00010004
    * v_cndmask_b32_e64 v4, 1.0, s4, s[0:1] ; D5010004 000008F2
    * v_cmp_le_f32_e32 vcc, 0, v4 ; 7C060880
    * v_cndmask_b32_e32 v4, -1.0, v4, vcc ; 020808F3
    *
    * The isign version:
    * v_add_f32_e64 v4, s4, 0 ; D5030004 00010004
    * v_med3_i32 v4, v4, -1, 1 ; D5580004 02058304
    * v_cvt_f32_i32_e32 v4, v4 ; 7E080B04
    *
    * (src0 + 0) converts negative zero to positive zero.
    * After that, int(fsign(x)) == isign(floatBitsToInt(x)).
    *
    * For FP64, use the standard version, which doesn't suffer from the huge DP rate
    * reduction. (FP64 comparisons are as fast as int64 comparisons)
    */
   if (bitsize == 16 || bitsize == 32) {
      val = ac_to_integer(ctx, ac_eliminate_negative_zero(ctx, src));
      val = ac_build_isign(ctx, val);
      return LLVMBuildSIToFP(ctx->builder, val, type, "");
   }

   assert(bitsize == 64);
   pos = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src, ctx->f64_0, "");
   neg = LLVMBuildFCmp(ctx->builder, LLVMRealOLT, src, ctx->f64_0, "");
   dw[0] = ctx->i32_0;
   /* High dword of +1.0 (0x3FF00000), -1.0 (0xBFF00000), or 0.0. */
   dw[1] = LLVMBuildSelect(
      ctx->builder, pos, LLVMConstInt(ctx->i32, 0x3FF00000, 0),
      LLVMBuildSelect(ctx->builder, neg, LLVMConstInt(ctx->i32, 0xBFF00000, 0), ctx->i32_0, ""),
      "");
   return LLVMBuildBitCast(ctx->builder, ac_build_gather_values(ctx, dw, 2), ctx->f64, "");
}
2299
/* Population count of src0, always returned as an i32. */
LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   unsigned bits = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *intr_name;
   LLVMTypeRef op_type;

   switch (bits) {
   case 128:
      intr_name = "llvm.ctpop.i128";
      op_type = ctx->i128;
      break;
   case 64:
      intr_name = "llvm.ctpop.i64";
      op_type = ctx->i64;
      break;
   case 32:
      intr_name = "llvm.ctpop.i32";
      op_type = ctx->i32;
      break;
   case 16:
      intr_name = "llvm.ctpop.i16";
      op_type = ctx->i16;
      break;
   case 8:
      intr_name = "llvm.ctpop.i8";
      op_type = ctx->i8;
      break;
   default:
      unreachable("invalid bitsize");
      break;
   }

   LLVMValueRef count = ac_build_intrinsic(ctx, intr_name, op_type, (LLVMValueRef[]){src0}, 1, 0);

   /* The result type is always i32; narrow or widen (zero-extend) as needed. */
   if (bits > 32)
      count = LLVMBuildTrunc(ctx->builder, count, ctx->i32, "");
   else if (bits < 32)
      count = LLVMBuildZExt(ctx->builder, count, ctx->i32, "");

   return count;
}
2334
/* Reverse the bits of src0, always returned as an i32. */
LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   unsigned bits = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *intr_name;
   LLVMTypeRef op_type;

   switch (bits) {
   case 64:
      intr_name = "llvm.bitreverse.i64";
      op_type = ctx->i64;
      break;
   case 32:
      intr_name = "llvm.bitreverse.i32";
      op_type = ctx->i32;
      break;
   case 16:
      intr_name = "llvm.bitreverse.i16";
      op_type = ctx->i16;
      break;
   case 8:
      intr_name = "llvm.bitreverse.i8";
      op_type = ctx->i8;
      break;
   default:
      unreachable("invalid bitsize");
      break;
   }

   LLVMValueRef reversed = ac_build_intrinsic(ctx, intr_name, op_type, (LLVMValueRef[]){src0}, 1, 0);

   /* The result type is always i32; narrow or widen (zero-extend) as needed. */
   if (bits > 32)
      reversed = LLVMBuildTrunc(ctx->builder, reversed, ctx->i32, "");
   else if (bits < 32)
      reversed = LLVMBuildZExt(ctx->builder, reversed, ctx->i32, "");

   return reversed;
}
2365
/* Mixed-signedness 4x8-bit dot product with i32 accumulate
 * (llvm.amdgcn.sudot4). neg_lo bits 0/1 negate s0/s1 respectively.
 */
LLVMValueRef ac_build_sudot_4x8(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                                LLVMValueRef s2, bool clamp, unsigned neg_lo)
{
   LLVMValueRef operands[6] = {
      LLVMConstInt(ctx->i1, (neg_lo & 0x1) != 0, false),
      s0,
      LLVMConstInt(ctx->i1, (neg_lo & 0x2) != 0, false),
      s1,
      s2,
      LLVMConstInt(ctx->i1, clamp, false),
   };

   return ac_build_intrinsic(ctx, "llvm.amdgcn.sudot4", ctx->i32, operands, 6, 0);
}
2381
/* Set the EXEC mask to all-ones at wave start (llvm.amdgcn.init.exec). */
void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
{
   LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
   ac_build_intrinsic(ctx, "llvm.amdgcn.init.exec", ctx->voidt, &full_mask, 1, 0);
}
2387
/* Declare all of LDS (64KB on GFX7+, 32KB on GFX6) as an i32 array based at
 * LDS address 0, so ac_lds_load/ac_lds_store can index it with dword
 * addresses via ac_build_gep0.
 */
void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
{
   unsigned lds_size = ctx->gfx_level >= GFX7 ? 65536 : 32768;
   LLVMTypeRef type = LLVMArrayType(ctx->i32, lds_size / 4);
   ctx->lds = (struct ac_llvm_pointer) {
      .value = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0,
                                 LLVMPointerType(type, AC_ADDR_SPACE_LDS), "lds"),
      .pointee_type = type
   };
}
2398
/* Load one dword from LDS at dword index "dw_addr" (requires
 * ac_declare_lds_as_pointer to have set up ctx->lds).
 */
LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, LLVMValueRef dw_addr)
{
   LLVMValueRef v = ac_build_gep0(ctx, ctx->lds, dw_addr);
   return LLVMBuildLoad2(ctx->builder, ctx->i32, v, "");
}
2404
/* Store one dword to LDS at the given dword index; value is bitcast to an
 * integer type first. */
void ac_lds_store(struct ac_llvm_context *ctx, LLVMValueRef dw_addr, LLVMValueRef value)
{
   ac_build_indexed_store(ctx, ctx->lds, dw_addr, ac_to_integer(ctx, value));
}
2410
/* Return the index of the least significant set bit of src0 as an i32,
 * or -1 when src0 == 0 (GLSL findLSB semantics).
 *
 * NOTE(review): dst_type is currently unused -- the result is always i32;
 * confirm whether callers expect conversion to dst_type.
 */
LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMTypeRef dst_type, LLVMValueRef src0)
{
   unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *intrin_name;
   LLVMTypeRef type;
   LLVMValueRef zero;

   /* Select the llvm.cttz overload matching the source width. */
   switch (src0_bitsize) {
   case 64:
      intrin_name = "llvm.cttz.i64";
      type = ctx->i64;
      zero = ctx->i64_0;
      break;
   case 32:
      intrin_name = "llvm.cttz.i32";
      type = ctx->i32;
      zero = ctx->i32_0;
      break;
   case 16:
      intrin_name = "llvm.cttz.i16";
      type = ctx->i16;
      zero = ctx->i16_0;
      break;
   case 8:
      intrin_name = "llvm.cttz.i8";
      type = ctx->i8;
      zero = ctx->i8_0;
      break;
   default:
      unreachable("invalid bitsize");
   }

   LLVMValueRef params[2] = {
      src0,

      /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
       * add special code to check for x=0. The reason is that
       * the LLVM behavior for x=0 is different from what we
       * need here. However, LLVM also assumes that ffs(x) is
       * in [0, 31], but GLSL expects that ffs(0) = -1, so
       * a conditional assignment to handle 0 is still required.
       *
       * The hardware already implements the correct behavior.
       */
      ctx->i1true,
   };

   LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, 0);

   /* Normalize the result width to i32 (sign-extension is harmless because
    * cttz of a nonzero narrow value fits in the narrow type). */
   if (src0_bitsize == 64) {
      lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
   } else if (src0_bitsize < 32) {
      lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, "");
   }

   /* TODO: We need an intrinsic to skip this conditional. */
   /* Check for zero: */
   return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, src0, zero, ""),
                          LLVMConstInt(ctx->i32, -1, 0), lsb, "");
}
2471
ac_arg_type_to_pointee_type(struct ac_llvm_context * ctx,enum ac_arg_type type)2472 LLVMTypeRef ac_arg_type_to_pointee_type(struct ac_llvm_context *ctx, enum ac_arg_type type) {
2473 switch (type) {
2474 case AC_ARG_CONST_PTR:
2475 return ctx->i8;
2476 break;
2477 case AC_ARG_CONST_FLOAT_PTR:
2478 return ctx->f32;
2479 break;
2480 case AC_ARG_CONST_PTR_PTR:
2481 return ac_array_in_const32_addr_space(ctx->i8);
2482 break;
2483 case AC_ARG_CONST_DESC_PTR:
2484 return ctx->v4i32;
2485 break;
2486 case AC_ARG_CONST_IMAGE_PTR:
2487 return ctx->v8i32;
2488 default:
2489 /* Other ac_arg_type values aren't pointers. */
2490 assert(false);
2491 return NULL;
2492 }
2493 }
2494
/* Return a pointer type to elem_type in the 64-bit constant address space. */
LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
{
   return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST);
}
2499
/* Return a pointer type to elem_type in the 32-bit constant address space. */
LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
{
   return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT);
}
2504
get_current_flow(struct ac_llvm_context * ctx)2505 static struct ac_llvm_flow *get_current_flow(struct ac_llvm_context *ctx)
2506 {
2507 if (ctx->flow->depth > 0)
2508 return &ctx->flow->stack[ctx->flow->depth - 1];
2509 return NULL;
2510 }
2511
get_innermost_loop(struct ac_llvm_context * ctx)2512 static struct ac_llvm_flow *get_innermost_loop(struct ac_llvm_context *ctx)
2513 {
2514 for (unsigned i = ctx->flow->depth; i > 0; --i) {
2515 if (ctx->flow->stack[i - 1].loop_entry_block)
2516 return &ctx->flow->stack[i - 1];
2517 }
2518 return NULL;
2519 }
2520
push_flow(struct ac_llvm_context * ctx)2521 static struct ac_llvm_flow *push_flow(struct ac_llvm_context *ctx)
2522 {
2523 struct ac_llvm_flow *flow;
2524
2525 if (ctx->flow->depth >= ctx->flow->depth_max) {
2526 unsigned new_max = MAX2(ctx->flow->depth << 1, AC_LLVM_INITIAL_CF_DEPTH);
2527
2528 ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack));
2529 ctx->flow->depth_max = new_max;
2530 }
2531
2532 flow = &ctx->flow->stack[ctx->flow->depth];
2533 ctx->flow->depth++;
2534
2535 flow->next_block = NULL;
2536 flow->loop_entry_block = NULL;
2537 return flow;
2538 }
2539
/* Name a basic block "<base><label_id>" for readable IR dumps. */
static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, int label_id)
{
   char name[32];
   snprintf(name, sizeof(name), "%s%d", base, label_id);
   LLVMSetValueName(LLVMBasicBlockAsValue(bb), name);
}
2546
2547 /* Append a basic block at the level of the parent flow.
2548 */
append_basic_block(struct ac_llvm_context * ctx,const char * name)2549 static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, const char *name)
2550 {
2551 assert(ctx->flow->depth >= 1);
2552
2553 if (ctx->flow->depth >= 2) {
2554 struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2];
2555
2556 return LLVMInsertBasicBlockInContext(ctx->context, flow->next_block, name);
2557 }
2558
2559 LLVMValueRef main_fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
2560 return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
2561 }
2562
2563 /* Emit a branch to the given default target for the current block if
2564 * applicable -- that is, if the current block does not already contain a
2565 * branch from a break or continue.
2566 */
emit_default_branch(LLVMBuilderRef builder,LLVMBasicBlockRef target)2567 static void emit_default_branch(LLVMBuilderRef builder, LLVMBasicBlockRef target)
2568 {
2569 if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
2570 LLVMBuildBr(builder, target);
2571 }
2572
ac_build_bgnloop(struct ac_llvm_context * ctx,int label_id)2573 void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
2574 {
2575 struct ac_llvm_flow *flow = push_flow(ctx);
2576 flow->loop_entry_block = append_basic_block(ctx, "LOOP");
2577 flow->next_block = append_basic_block(ctx, "ENDLOOP");
2578 set_basicblock_name(flow->loop_entry_block, "loop", label_id);
2579 LLVMBuildBr(ctx->builder, flow->loop_entry_block);
2580 LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
2581 }
2582
ac_build_break(struct ac_llvm_context * ctx)2583 void ac_build_break(struct ac_llvm_context *ctx)
2584 {
2585 struct ac_llvm_flow *flow = get_innermost_loop(ctx);
2586 LLVMBuildBr(ctx->builder, flow->next_block);
2587 }
2588
ac_build_continue(struct ac_llvm_context * ctx)2589 void ac_build_continue(struct ac_llvm_context *ctx)
2590 {
2591 struct ac_llvm_flow *flow = get_innermost_loop(ctx);
2592 LLVMBuildBr(ctx->builder, flow->loop_entry_block);
2593 }
2594
/* Switch from the then-branch of the current if to its else-branch.
 * Must be called between ac_build_ifcc() and ac_build_endif().
 */
void ac_build_else(struct ac_llvm_context *ctx, int label_id)
{
   struct ac_llvm_flow *current_branch = get_current_flow(ctx);
   LLVMBasicBlockRef endif_block;

   /* The innermost flow must be an if, not a loop. */
   assert(!current_branch->loop_entry_block);

   /* Create the merge block and close the then-branch with a jump to it,
    * unless a break/continue already terminated the block. */
   endif_block = append_basic_block(ctx, "ENDIF");
   emit_default_branch(ctx->builder, endif_block);

   /* Continue emitting in the block targeted by the false edge of the
    * conditional branch, which now becomes the else-branch. */
   LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
   set_basicblock_name(current_branch->next_block, "else", label_id);

   /* After this point, the merge block is where the flow resumes. */
   current_branch->next_block = endif_block;
}
2610
/* Close the current if/else construct opened by ac_build_ifcc(): branch to
 * the merge block, continue emitting there, and pop the flow stack.
 */
void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
{
   struct ac_llvm_flow *current_branch = get_current_flow(ctx);

   /* The innermost flow must be an if, not a loop. */
   assert(!current_branch->loop_entry_block);

   /* Jump to the merge block unless a break/continue already terminated
    * the current block. */
   emit_default_branch(ctx->builder, current_branch->next_block);
   LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
   set_basicblock_name(current_branch->next_block, "endif", label_id);

   ctx->flow->depth--;
}
2623
/* Close the current loop opened by ac_build_bgnloop(): branch back to the
 * loop header, continue emitting in the loop-exit block, and pop the flow
 * stack.
 */
void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
{
   struct ac_llvm_flow *current_loop = get_current_flow(ctx);

   /* The innermost flow must be a loop. */
   assert(current_loop->loop_entry_block);

   /* Back-edge to the header unless a break/continue already terminated
    * the current block. */
   emit_default_branch(ctx->builder, current_loop->loop_entry_block);

   LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
   set_basicblock_name(current_loop->next_block, "endloop", label_id);
   ctx->flow->depth--;
}
2636
/* Open an if-construct on an i1 condition: emit a conditional branch to a
 * new then-block (taken) or to the flow's next block (not taken), then
 * continue emitting inside the then-block. Closed by ac_build_endif(),
 * optionally with ac_build_else() in between.
 */
void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
{
   struct ac_llvm_flow *flow = push_flow(ctx);

   LLVMBasicBlockRef then_block = append_basic_block(ctx, "IF");
   flow->next_block = append_basic_block(ctx, "ELSE");
   set_basicblock_name(then_block, "if", label_id);

   LLVMBuildCondBr(ctx->builder, cond, then_block, flow->next_block);
   LLVMPositionBuilderAtEnd(ctx->builder, then_block);
}
2648
ac_build_alloca_undef(struct ac_llvm_context * ac,LLVMTypeRef type,const char * name)2649 LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
2650 {
2651 LLVMBuilderRef builder = ac->builder;
2652 LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
2653 LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
2654 LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
2655 LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
2656 LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
2657 LLVMValueRef res;
2658
2659 if (first_instr) {
2660 LLVMPositionBuilderBefore(first_builder, first_instr);
2661 } else {
2662 LLVMPositionBuilderAtEnd(first_builder, first_block);
2663 }
2664
2665 res = LLVMBuildAlloca(first_builder, type, name);
2666 LLVMDisposeBuilder(first_builder);
2667 return res;
2668 }
2669
ac_build_alloca(struct ac_llvm_context * ac,LLVMTypeRef type,const char * name)2670 LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
2671 {
2672 LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);
2673 LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);
2674 return ptr;
2675 }
2676
/* Like ac_build_alloca_undef(), but store the given initial value at the
 * current insert point; the slot type is taken from the value. */
LLVMValueRef ac_build_alloca_init(struct ac_llvm_context *ac, LLVMValueRef val, const char *name)
{
   LLVMValueRef slot = ac_build_alloca_undef(ac, LLVMTypeOf(val), name);
   LLVMBuildStore(ac->builder, val, slot);
   return slot;
}
2683
/* Bitcast a pointer to point at `type`, preserving its address space. */
LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMTypeRef type)
{
   unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
   LLVMTypeRef dst_type = LLVMPointerType(type, addr_space);
   return LLVMBuildBitCast(ctx->builder, ptr, dst_type, "");
}
2689
/* Reduce a vector to its first `count` components (or extract a scalar when
 * count == 1). Returns the value unchanged if it already has `count`
 * components.
 */
LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned count)
{
   unsigned src_components = ac_get_llvm_num_components(value);
   if (count == src_components)
      return value;

   /* Shuffle selectors 0..count-1 (always reserve two slots). */
   LLVMValueRef *const selectors = alloca(MAX2(count, 2) * sizeof(LLVMValueRef));
   selectors[0] = ctx->i32_0;
   selectors[1] = ctx->i32_1;
   for (unsigned i = 2; i < count; i++)
      selectors[i] = LLVMConstInt(ctx->i32, i, false);

   if (count == 1)
      return LLVMBuildExtractElement(ctx->builder, value, selectors[0], "");

   return LLVMBuildShuffleVector(ctx->builder, value, value,
                                 LLVMConstVector(selectors, count), "");
}
2708
/* Extract a bitfield [rshift, rshift+bitwidth) from an integer parameter.
 * If param is i64 and bitwidth <= 32, the return value will be i32. */
LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, unsigned rshift,
                             unsigned bitwidth)
{
   LLVMValueRef value = param;
   if (rshift)
      value = LLVMBuildLShr(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), rshift, false), "");

   /* NOTE(review): the mask is only applied when the field ends below bit 32.
    * A field in an i32 ending exactly at bit 31 is fully cleared by the shift
    * alone, but an i64 field ending between bits 32 and 63 would keep high
    * bits -- confirm callers never unpack such a field. */
   if (rshift + bitwidth < 32) {
      uint64_t mask = (1ull << bitwidth) - 1;
      value = LLVMBuildAnd(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), mask, false), "");
   }

   /* Narrow i64 results to i32 as promised above. */
   if (bitwidth <= 32 && LLVMTypeOf(param) == ctx->i64)
      value = LLVMBuildTrunc(ctx->builder, value, ctx->i32, "");
   return value;
}
2726
/* Read a <=32-bit value from the given lane (readlane) or from the first
 * active lane when lane is NULL (readfirstlane). Operands are widened to i32
 * around the intrinsic and the result is truncated back.
 */
static LLVMValueRef _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src,
                                       LLVMValueRef lane, bool with_opt_barrier)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   if (with_opt_barrier)
      ac_build_optimization_barrier(ctx, &src, false);

   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
   if (lane)
      lane = LLVMBuildZExt(ctx->builder, lane, ctx->i32, "");

   const char *intr_name = lane ? "llvm.amdgcn.readlane" : "llvm.amdgcn.readfirstlane";
   LLVMValueRef lane_value = ac_build_intrinsic(ctx, intr_name, ctx->i32,
                                                (LLVMValueRef[]){src, lane}, lane ? 2 : 1, 0);

   return LLVMBuildTrunc(ctx->builder, lane_value, orig_type, "");
}
2746
/* Read `src` from the given lane (or the first active lane when lane is
 * NULL), handling arbitrary source types: pointers and floats are bitcast to
 * integers, and values wider than 32 bits are processed per 32-bit component.
 */
static LLVMValueRef ac_build_readlane_common(struct ac_llvm_context *ctx, LLVMValueRef src,
                                             LLVMValueRef lane, bool with_opt_barrier)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
   LLVMValueRef ret;

   if (bits > 32) {
      /* Wide value: split into i32 components and read each lane-wise. */
      assert(bits % 32 == 0);
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
      LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
      ret = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < bits / 32; i++) {
         LLVMValueRef ret_comp;

         src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");

         ret_comp = _ac_build_readlane(ctx, src, lane, with_opt_barrier);

         ret =
            LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
      }
   } else {
      ret = _ac_build_readlane(ctx, src, lane, with_opt_barrier);
   }

   /* Restore the caller's original type. */
   if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
      return LLVMBuildIntToPtr(ctx->builder, ret, src_type, "");
   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
2778
2779 /**
2780 * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
2781 *
2782 * The optimization barrier is not needed if the value is the same in all lanes
2783 * or if this is called in the outermost block.
2784 *
2785 * @param ctx
2786 * @param src
2787 * @param lane - id of the lane or NULL for the first active lane
2788 * @return value of the lane
2789 */
ac_build_readlane_no_opt_barrier(struct ac_llvm_context * ctx,LLVMValueRef src,LLVMValueRef lane)2790 LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx, LLVMValueRef src,
2791 LLVMValueRef lane)
2792 {
2793 return ac_build_readlane_common(ctx, src, lane, false);
2794 }
2795
/* Same as ac_build_readlane_no_opt_barrier(), but with an optimization
 * barrier on src so LLVM can't sink the computation into divergent control
 * flow. */
LLVMValueRef ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
{
   return ac_build_readlane_common(ctx, src, lane, true);
}
2800
/* Build llvm.amdgcn.writelane: return src with `value` written into the
 * component belonging to lane `lane`. All operands must be i32. */
LLVMValueRef ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value,
                                LLVMValueRef lane)
{
   return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32,
                             (LLVMValueRef[]){value, lane, src}, 3, 0);
}
2807
/* Count the bits of `mask` set in lanes below the current lane (mbcnt) and
 * add `add_src`. With the EXEC mask as `mask` this yields the index of the
 * current lane among the active lanes, offset by add_src.
 *
 * Fix: removed a duplicated ac_set_range_metadata() call inside the
 * LLVM < 16 branch -- on that path `add` is always i32_0, so the metadata had
 * already been applied by the preceding conditional.
 */
LLVMValueRef ac_build_mbcnt_add(struct ac_llvm_context *ctx, LLVMValueRef mask, LLVMValueRef add_src)
{
   /* LLVM < 16: pass 0 to the intrinsic and add add_src afterwards (see the
    * workaround comment below). */
   LLVMValueRef add = LLVM_VERSION_MAJOR >= 16 ? add_src : ctx->i32_0;
   LLVMValueRef val;

   if (ctx->wave_size == 32) {
      if (LLVMTypeOf(mask) == ctx->i64)
         mask = LLVMBuildTrunc(ctx->builder, mask, ctx->i32, "");

      val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
                               (LLVMValueRef[]){mask, add}, 2, 0);
   } else {
      /* Wave64: count the low 32 lanes first, then continue the count into
       * the high 32 lanes. */
      LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask, ctx->v2i32, "");
      LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_0, "");
      LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec, ctx->i32_1, "");
      val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
                               (LLVMValueRef[]){mask_lo, add}, 2, 0);
      val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, (LLVMValueRef[]){mask_hi, val},
                               2, 0);
   }

   /* Without an extra add, the result is bounded by the wave size. */
   if (add == ctx->i32_0)
      ac_set_range_metadata(ctx, val, 0, ctx->wave_size);

   if (LLVM_VERSION_MAJOR < 16) {
      /* Bug workaround. LLVM always believes the upper bound of mbcnt to be the wave size,
       * regardless of ac_set_range_metadata. Use an extra add instruction to work around it.
       */
      val = LLVMBuildAdd(ctx->builder, val, add_src, "");
   }

   return val;
}
2842
/* ac_build_mbcnt_add() with a zero addend: count of set bits of `mask` in
 * lanes below the current lane. */
LLVMValueRef ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
{
   return ac_build_mbcnt_add(ctx, mask, ctx->i32_0);
}
2847
/* dpp_ctrl encodings passed to llvm.amdgcn.update.dpp. Entries with a
 * leading underscore are bases that get OR'ed with an operand (see
 * dpp_quad_perm() and dpp_row_sr() below). */
enum dpp_ctrl
{
   _dpp_quad_perm = 0x000,      /* base: quad permute, 2-bit selector per lane */
   _dpp_row_sl = 0x100,         /* base: row shift left by 1..15 */
   _dpp_row_sr = 0x110,         /* base: row shift right by 1..15 */
   _dpp_row_rr = 0x120,         /* base: row rotate right by 1..15 */
   dpp_wf_sl1 = 0x130,          /* wavefront shift left by 1 */
   dpp_wf_rl1 = 0x134,          /* wavefront rotate left by 1 */
   dpp_wf_sr1 = 0x138,          /* wavefront shift right by 1 */
   dpp_wf_rr1 = 0x13C,          /* wavefront rotate right by 1 */
   dpp_row_mirror = 0x140,      /* mirror lanes within a row */
   dpp_row_half_mirror = 0x141, /* mirror lanes within a half-row */
   dpp_row_bcast15 = 0x142,     /* broadcast lane 15 to the next row */
   dpp_row_bcast31 = 0x143      /* broadcast lane 31 to the next rows */
};
2863
dpp_quad_perm(unsigned lane0,unsigned lane1,unsigned lane2,unsigned lane3)2864 static inline enum dpp_ctrl dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2,
2865 unsigned lane3)
2866 {
2867 assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
2868 return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
2869 }
2870
dpp_row_sr(unsigned amount)2871 static inline enum dpp_ctrl dpp_row_sr(unsigned amount)
2872 {
2873 assert(amount > 0 && amount < 16);
2874 return _dpp_row_sr | amount;
2875 }
2876
/* Emit llvm.amdgcn.update.dpp.i32 for a <=32-bit integer value; operands are
 * widened to i32 around the call and the result truncated back. */
static LLVMValueRef _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
                                  enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
                                  bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   old = LLVMBuildZExt(ctx->builder, old, ctx->i32, "");
   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef args[6] = {
      old,
      src,
      LLVMConstInt(ctx->i32, dpp_ctrl, 0),
      LLVMConstInt(ctx->i32, row_mask, 0),
      LLVMConstInt(ctx->i32, bank_mask, 0),
      LLVMConstInt(ctx->i1, bound_ctrl, 0),
   };
   LLVMValueRef result =
      ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32", ctx->i32, args, 6, 0);

   return LLVMBuildTrunc(ctx->builder, result, orig_type, "");
}
2896
/* Type-generic wrapper around _ac_build_dpp(): bitcasts the operands to
 * integers and processes values wider than 32 bits per 32-bit component.
 */
static LLVMValueRef ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
                                 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
                                 bool bound_ctrl)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   src = ac_to_integer(ctx, src);
   old = ac_to_integer(ctx, old);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
   LLVMValueRef ret;
   if (bits > 32) {
      /* Wide value: apply DPP to each 32-bit component. */
      assert(bits % 32 == 0);
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
      LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
      LLVMValueRef old_vector = LLVMBuildBitCast(ctx->builder, old, vec_type, "");
      ret = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < bits / 32; i++) {
         src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
         old = LLVMBuildExtractElement(ctx->builder, old_vector, LLVMConstInt(ctx->i32, i, 0), "");
         LLVMValueRef ret_comp =
            _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
         ret =
            LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
      }
   } else {
      ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
   }
   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
2925
/* Emit llvm.amdgcn.permlane16 (or permlanex16 to exchange rows) for a
 * <=32-bit value. The 64-bit `sel` supplies the two 32-bit lane-select
 * operands; fi (fetch inactive) is always enabled. */
static LLVMValueRef _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src,
                                         uint64_t sel, bool exchange_rows, bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef args[6] = {
      src,
      src,
      LLVMConstInt(ctx->i32, sel, false),
      LLVMConstInt(ctx->i32, sel >> 32, false),
      ctx->i1true, /* fi */
      bound_ctrl ? ctx->i1true : ctx->i1false,
   };

   const char *intr_name =
      exchange_rows ? "llvm.amdgcn.permlanex16" : "llvm.amdgcn.permlane16";
   LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, ctx->i32, args, 6, 0);

   return LLVMBuildTrunc(ctx->builder, result, orig_type, "");
}
2949
/* Type-generic wrapper around _ac_build_permlane16(): bitcasts the source to
 * an integer and processes values wider than 32 bits per 32-bit component.
 */
static LLVMValueRef ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
                                        bool exchange_rows, bool bound_ctrl)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
   LLVMValueRef ret;
   if (bits > 32) {
      /* Wide value: apply permlane to each 32-bit component. */
      assert(bits % 32 == 0);
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
      LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
      ret = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < bits / 32; i++) {
         src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
         LLVMValueRef ret_comp = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);
         ret =
            LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
      }
   } else {
      ret = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);
   }
   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
2973
/* Pack a ds_swizzle "bit mode" pattern: bits [4:0] AND mask, [9:5] OR mask,
 * [14:10] XOR mask. Each mask covers the 5 lane-index bits. */
static inline unsigned ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
{
   assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);
   return (xor_mask << 10) | (or_mask << 5) | and_mask;
}
2979
/* Emit llvm.amdgcn.ds.swizzle for a <=32-bit value with the given swizzle
 * mask; the operand is widened to i32 and the result truncated back. */
static LLVMValueRef _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
                                         unsigned mask)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef result =
      ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", ctx->i32,
                         (LLVMValueRef[]){src, LLVMConstInt(ctx->i32, mask, 0)}, 2, 0);

   return LLVMBuildTrunc(ctx->builder, result, orig_type, "");
}
2994
/* Type-generic wrapper around _ac_build_ds_swizzle(): bitcasts the source to
 * an integer and processes values wider than 32 bits per 32-bit component.
 */
LLVMValueRef ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
   LLVMValueRef ret;
   if (bits > 32) {
      /* Wide value: swizzle each 32-bit component. */
      assert(bits % 32 == 0);
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
      LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
      ret = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < bits / 32; i++) {
         src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");
         LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src, mask);
         ret =
            LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
      }
   } else {
      ret = _ac_build_ds_swizzle(ctx, src, mask);
   }
   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
3017
/* Wrap `src` in an llvm.amdgcn.<mode>.<type> intrinsic call (used for "wwm"
 * and "wqm"). Sub-32-bit operands are widened to i32 around the call.
 */
static LLVMValueRef ac_build_mode(struct ac_llvm_context *ctx, LLVMValueRef src, const char *mode)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   unsigned bitsize = ac_get_elem_bits(ctx, orig_type);
   char intr_name[32], type_name[8];

   src = ac_to_integer(ctx, src);

   if (bitsize < 32)
      src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   /* Build the overloaded intrinsic name from the (possibly widened) type. */
   ac_build_type_name_for_intr(LLVMTypeOf(src), type_name, sizeof(type_name));
   snprintf(intr_name, sizeof(intr_name), "llvm.amdgcn.%s.%s", mode, type_name);
   LLVMValueRef ret =
      ac_build_intrinsic(ctx, intr_name, LLVMTypeOf(src), (LLVMValueRef[]){src}, 1, 0);

   if (bitsize < 32)
      ret = LLVMBuildTrunc(ctx->builder, ret, ac_to_integer_type(ctx, orig_type), "");

   return LLVMBuildBitCast(ctx->builder, ret, orig_type, "");
}
3039
/* Wrap src in the llvm.amdgcn.wwm intrinsic via ac_build_mode(). */
static LLVMValueRef ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   return ac_build_mode(ctx, src, "wwm");
}
3044
/* Wrap src in the llvm.amdgcn.wqm intrinsic via ac_build_mode(). */
LLVMValueRef ac_build_wqm(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   return ac_build_mode(ctx, src, "wqm");
}
3049
/* Build llvm.amdgcn.set.inactive with `src` and the given value for inactive
 * lanes. Sub-32-bit operands are widened to i32 around the intrinsic call.
 */
static LLVMValueRef ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
                                          LLVMValueRef inactive)
{
   char name[33], type[8];
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned bitsize = ac_get_elem_bits(ctx, src_type);
   src = ac_to_integer(ctx, src);
   inactive = ac_to_integer(ctx, inactive);

   if (bitsize < 32) {
      src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
      inactive = LLVMBuildZExt(ctx->builder, inactive, ctx->i32, "");
   }

   /* The intrinsic name is overloaded on the operand type. */
   ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
   LLVMValueRef ret =
      ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src, inactive}, 2, 0);
   if (bitsize < 32)
      ret = LLVMBuildTrunc(ctx->builder, ret, src_type, "");

   return ret;
}
3073
/* Return the identity element for a wave reduction/scan with the given NIR
 * op: the value that leaves any operand unchanged when combined with it
 * (0 for add/or/xor, 1 for mul, all-ones for and, type extremes for min/max).
 *
 * type_size is in bytes: 0 selects i1 (booleans), 1/2/4 select 8/16/32-bit
 * types, and anything else is treated as 64-bit.
 */
static LLVMValueRef get_reduction_identity(struct ac_llvm_context *ctx, nir_op op,
                                           unsigned type_size)
{

   if (type_size == 0) {
      /* i1: only bitwise reductions are meaningful. */
      switch (op) {
      case nir_op_ior:
      case nir_op_ixor:
         return ctx->i1false;
      case nir_op_iand:
         return ctx->i1true;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else if (type_size == 1) {
      switch (op) {
      case nir_op_iadd:
         return ctx->i8_0;
      case nir_op_imul:
         return ctx->i8_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i8, INT8_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i8, UINT8_MAX, 0);
      case nir_op_imax:
         return LLVMConstInt(ctx->i8, INT8_MIN, 0);
      case nir_op_umax:
         return ctx->i8_0;
      case nir_op_iand:
         return LLVMConstInt(ctx->i8, -1, 0);
      case nir_op_ior:
         return ctx->i8_0;
      case nir_op_ixor:
         return ctx->i8_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else if (type_size == 2) {
      switch (op) {
      case nir_op_iadd:
         return ctx->i16_0;
      case nir_op_fadd:
         return ctx->f16_0;
      case nir_op_imul:
         return ctx->i16_1;
      case nir_op_fmul:
         return ctx->f16_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i16, INT16_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i16, UINT16_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f16, INFINITY);
      case nir_op_imax:
         return LLVMConstInt(ctx->i16, INT16_MIN, 0);
      case nir_op_umax:
         return ctx->i16_0;
      case nir_op_fmax:
         return LLVMConstReal(ctx->f16, -INFINITY);
      case nir_op_iand:
         return LLVMConstInt(ctx->i16, -1, 0);
      case nir_op_ior:
         return ctx->i16_0;
      case nir_op_ixor:
         return ctx->i16_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else if (type_size == 4) {
      switch (op) {
      case nir_op_iadd:
         return ctx->i32_0;
      case nir_op_fadd:
         return ctx->f32_0;
      case nir_op_imul:
         return ctx->i32_1;
      case nir_op_fmul:
         return ctx->f32_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i32, INT32_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f32, INFINITY);
      case nir_op_imax:
         return LLVMConstInt(ctx->i32, INT32_MIN, 0);
      case nir_op_umax:
         return ctx->i32_0;
      case nir_op_fmax:
         return LLVMConstReal(ctx->f32, -INFINITY);
      case nir_op_iand:
         return LLVMConstInt(ctx->i32, -1, 0);
      case nir_op_ior:
         return ctx->i32_0;
      case nir_op_ixor:
         return ctx->i32_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else { /* type_size == 64bit */
      switch (op) {
      case nir_op_iadd:
         return ctx->i64_0;
      case nir_op_fadd:
         return ctx->f64_0;
      case nir_op_imul:
         return ctx->i64_1;
      case nir_op_fmul:
         return ctx->f64_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i64, INT64_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f64, INFINITY);
      case nir_op_imax:
         return LLVMConstInt(ctx->i64, INT64_MIN, 0);
      case nir_op_umax:
         return ctx->i64_0;
      case nir_op_fmax:
         return LLVMConstReal(ctx->f64, -INFINITY);
      case nir_op_iand:
         return LLVMConstInt(ctx->i64, -1, 0);
      case nir_op_ior:
         return ctx->i64_0;
      case nir_op_ixor:
         return ctx->i64_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   }
}
3206
/* Combine two values with the given NIR reduction op. Integer min/max are
 * lowered to compare+select; float min/max use the llvm.minnum/llvm.maxnum
 * overload chosen from the operand size.
 */
static LLVMValueRef ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs,
                                    nir_op op)
{
   const unsigned size = ac_get_type_size(LLVMTypeOf(lhs));
   const bool is_64bit = size == 8;
   const bool is_32bit = size == 4;

   switch (op) {
   case nir_op_iadd:
      return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
   case nir_op_fadd:
      return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
   case nir_op_imul:
      return LLVMBuildMul(ctx->builder, lhs, rhs, "");
   case nir_op_fmul:
      return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
   case nir_op_imin:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_umin:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_fmin:
      return ac_build_intrinsic(
         ctx, is_64bit ? "llvm.minnum.f64" : is_32bit ? "llvm.minnum.f32" : "llvm.minnum.f16",
         is_64bit ? ctx->f64 : is_32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2, 0);
   case nir_op_imax:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_umax:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_fmax:
      return ac_build_intrinsic(
         ctx, is_64bit ? "llvm.maxnum.f64" : is_32bit ? "llvm.maxnum.f32" : "llvm.maxnum.f16",
         is_64bit ? ctx->f64 : is_32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2, 0);
   case nir_op_iand:
      return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
   case nir_op_ior:
      return LLVMBuildOr(ctx->builder, lhs, rhs, "");
   case nir_op_ixor:
      return LLVMBuildXor(ctx->builder, lhs, rhs, "");
   default:
      unreachable("bad reduction intrinsic");
   }
}
3251
3252 /**
3253 * \param src The value to shift.
3254 * \param identity The value to use the first lane.
3255 * \param maxprefix specifies that the result only needs to be correct for a
3256 * prefix of this many threads
3257 * \return src, shifted 1 lane up, and identity shifted into lane 0.
3258 */
static LLVMValueRef ac_wavefront_shift_right_1(struct ac_llvm_context *ctx, LLVMValueRef src,
                                               LLVMValueRef identity, unsigned maxprefix)
{
   if (ctx->gfx_level >= GFX10) {
      /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */
      LLVMValueRef active, tmp1, tmp2;
      LLVMValueRef tid = ac_get_thread_id(ctx);

      /* Shift right by 1 within each 16-lane row; the first lane of a row
       * gets "identity" as the bound value. */
      tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);

      /* Candidate fix-up value for row-boundary lanes, fetched across rows
       * with permlane16. */
      tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);

      if (maxprefix > 32) {
         /* Lane 32 needs lane 31's value, which the row/permlane operations
          * above do not deliver (NOTE(review): presumably because they stay
          * within a 32-lane half — confirm); patch it via readlane. */
         active =
            LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, false), "");

         tmp2 = LLVMBuildSelect(ctx->builder, active,
                                ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, false)),
                                tmp2, "");

         /* Use the fix-up value on lane 32 and on the first lane of every odd
          * row (tid % 32 == 16); all other lanes keep the row shift. */
         active = LLVMBuildOr(
            ctx->builder, active,
            LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, false), ""),
                          LLVMConstInt(ctx->i32, 0x10, false), ""),
            "");
         return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
      } else if (maxprefix > 16) {
         /* Only the first 32 lanes matter: just fix up lane 16. */
         active =
            LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 16, false), "");

         return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
      }
   } else if (ctx->gfx_level >= GFX8) {
      /* GFX8-9 have a native whole-wave shift-right DPP mode. */
      return ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
   }

   /* wavefront shift_right by 1 on SI/CI */
   LLVMValueRef active, tmp1, tmp2;
   LLVMValueRef tid = ac_get_thread_id(ctx);
   /* Shift within each quad; lane 0 of the quad duplicates lane 0 and is
    * corrected by the fix-ups below. */
   tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
   /* Fix up the first lane of each 8-lane group (tid % 8 == 4). */
   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""),
                          LLVMConstInt(ctx->i32, 0x4, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   /* Fix up the first lane of each 16-lane group (tid % 16 == 8). */
   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""),
                          LLVMConstInt(ctx->i32, 0x8, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   /* Fix up the first lane of each 32-lane group (tid % 32 == 16). */
   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""),
                          LLVMConstInt(ctx->i32, 0x10, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   /* Lane 32 must read lane 31 across the 32-lane boundary: use readlane. */
   tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   /* Finally, lane 0 receives the identity. */
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, ctx->i32_0, "");
   return LLVMBuildSelect(ctx->builder, active, identity, tmp1, "");
}
3321
3322 /**
3323 * \param maxprefix specifies that the result only needs to be correct for a
3324 * prefix of this many threads
3325 */
static LLVMValueRef ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src,
                                  LLVMValueRef identity, unsigned maxprefix, bool inclusive)
{
   LLVMValueRef result, tmp;

   /* An exclusive scan is an inclusive scan of the input shifted up by one
    * lane, with the identity placed in lane 0. */
   if (!inclusive)
      src = ac_wavefront_shift_right_1(ctx, src, identity, maxprefix);

   result = src;

   /* SI/CI have no DPP: emulate the log-step scan with ds_swizzle/readlane.
    * Each step combines the running value with the value 2^k lanes below;
    * lanes with no such neighbor in their group use the identity instead. */
   if (ctx->gfx_level <= GFX7) {
      assert(maxprefix == 64);
      LLVMValueRef tid = ac_get_thread_id(ctx);
      LLVMValueRef active;
      /* Step: +1 lane (applies when tid & 1). */
      tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      /* Step: +2 lanes (applies when tid & 2). */
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      /* Step: +4 lanes (applies when tid & 4). */
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      /* Step: +8 lanes (applies when tid & 8). */
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      /* Step: +16 lanes (applies when tid & 16). */
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      /* Step: +32 lanes; ds_swizzle can't cross the 32-lane boundary, so the
       * upper half reads lane 31's partial result via readlane. */
      tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      return result;
   }

   /* GFX8+: log-step scan using DPP row shifts, stopping as soon as the
    * requested prefix length is covered. */
   if (maxprefix <= 1)
      return result;
   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 2)
      return result;
   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 3)
      return result;
   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 4)
      return result;
   /* From here on, shift the partial results (not src); the bank/row masks
    * restrict the update to lanes that have a neighbor 4/8 lanes below. */
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 8)
      return result;
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 16)
      return result;

   if (ctx->gfx_level >= GFX10) {
      /* GFX10 removed the row_bcast DPP modes; cross 16- and 32-lane
       * boundaries with permlane16/readlane instead. */
      LLVMValueRef tid = ac_get_thread_id(ctx);
      LLVMValueRef active;

      tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false);

      /* Only lanes in the upper half of each 32-lane group (tid & 16)
       * consume the previous row's partial result. */
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, false), ""),
                             ctx->i32_0, "");

      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");

      result = ac_build_alu_op(ctx, result, tmp, op);

      if (maxprefix <= 32)
         return result;

      /* Lanes 32..63 fold in the partial result of lane 31. */
      tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));

      active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, LLVMConstInt(ctx->i32, 32, false), "");

      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");

      result = ac_build_alu_op(ctx, result, tmp, op);
      return result;
   }

   /* GFX8-9: broadcast lane 15 to the upper rows, then lane 31 to the upper
    * half, using row_bcast DPP with masks selecting the receiving rows. */
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 32)
      return result;
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   return result;
}
3436
/* Compute an inclusive prefix scan of "src" across the wave with the given
 * reduction opcode. Boolean iadd has a fast path: the inclusive sum of 1-bit
 * values is mbcnt of the ballot plus the lane's own bit. Everything else runs
 * the generic scan in WWM with inactive lanes preset to the identity.
 */
LLVMValueRef ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
   LLVMBuilderRef builder = ctx->builder;

   if (op == nir_op_iadd && LLVMTypeOf(src) == ctx->i1) {
      LLVMValueRef bit = LLVMBuildZExt(builder, src, ctx->i32, "");
      LLVMValueRef below = ac_build_mbcnt(ctx, ac_build_ballot(ctx, bit));
      return LLVMBuildAdd(builder, below, bit, "");
   }

   ac_build_optimization_barrier(ctx, &src, false);

   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   LLVMValueRef value = ac_build_set_inactive(ctx, src, identity);
   value = LLVMBuildBitCast(builder, value, LLVMTypeOf(identity), "");
   value = ac_build_scan(ctx, op, value, identity, ctx->wave_size, true);
   return ac_build_wwm(ctx, value);
}
3459
/* Compute an exclusive prefix scan of "src" across the wave with the given
 * reduction opcode. Boolean iadd reduces to mbcnt of the ballot (count of set
 * bits strictly below this lane); everything else runs the generic scan in
 * WWM with inactive lanes preset to the identity.
 */
LLVMValueRef ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
   LLVMBuilderRef builder = ctx->builder;

   if (op == nir_op_iadd && LLVMTypeOf(src) == ctx->i1) {
      LLVMValueRef bit = LLVMBuildZExt(builder, src, ctx->i32, "");
      return ac_build_mbcnt(ctx, ac_build_ballot(ctx, bit));
   }

   ac_build_optimization_barrier(ctx, &src, false);

   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   LLVMValueRef value = ac_build_set_inactive(ctx, src, identity);
   value = LLVMBuildBitCast(builder, value, LLVMTypeOf(identity), "");
   value = ac_build_scan(ctx, op, value, identity, ctx->wave_size, false);
   return ac_build_wwm(ctx, value);
}
3481
/* Reduce "src" with "op" over clusters of "cluster_size" lanes; every lane of
 * a cluster ends up with the cluster's reduced value. Works by doubling the
 * cluster width step by step (butterfly), with per-gfx-level lane shuffles.
 */
LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op,
                             unsigned cluster_size)
{
   if (cluster_size == 1)
      return src;
   ac_build_optimization_barrier(ctx, &src, false);
   LLVMValueRef result, swap;
   /* Inactive lanes must contribute the identity so they don't corrupt the
    * cluster results when running in whole-wave mode. */
   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
                             LLVMTypeOf(identity), "");
   /* Width 2: combine with the adjacent lane (swap pairs within the quad). */
   swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 2)
      return ac_build_wwm(ctx, result);

   /* Width 4: combine the two halves of each quad. */
   swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 4)
      return ac_build_wwm(ctx, result);

   /* Width 8: half-row mirror (DPP on GFX8+, ds_swizzle xor-4 on SI/CI). */
   if (ctx->gfx_level >= GFX8)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 8)
      return ac_build_wwm(ctx, result);

   /* Width 16: full-row mirror (ds_swizzle xor-8 on SI/CI). */
   if (ctx->gfx_level >= GFX8)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 16)
      return ac_build_wwm(ctx, result);

   /* Width 32: exchange between 16-lane rows. GFX10 uses permlane16; GFX8-9
    * use row_bcast15 when continuing to a 64-wide reduction; otherwise fall
    * back to ds_swizzle xor-16. */
   if (ctx->gfx_level >= GFX10)
      swap = ac_build_permlane16(ctx, result, 0, true, false);
   else if (ctx->gfx_level >= GFX8 && cluster_size != 32)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 32)
      return ac_build_wwm(ctx, result);

   if (ctx->gfx_level >= GFX8) {
      /* Width 64 (wave64 only): combine the two 32-lane halves, then
       * broadcast the final value from lane 63 to the whole wave. */
      if (ctx->wave_size == 64) {
         if (ctx->gfx_level >= GFX10)
            swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
         else
            swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
         result = ac_build_alu_op(ctx, result, swap, op);
         result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
      }

      return ac_build_wwm(ctx, result);
   } else {
      /* SI/CI width 64: lanes 0 and 32 hold the two half-wave results;
       * combine them with readlanes (result is uniform across the wave). */
      swap = ac_build_readlane(ctx, result, ctx->i32_0);
      result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
      result = ac_build_alu_op(ctx, result, swap, op);
      return ac_build_wwm(ctx, result);
   }
}
3546
/* Swizzle one channel of the two dual-source-blend MRT outputs so that each
 * even/odd lane pair holds (mrt0, mrt1) in the layout the hardware expects.
 * The net effect: even lanes get arg1's neighboring odd-lane value into arg0,
 * odd lanes keep their arg0; arg1's even lanes receive the original arg0.
 */
static void _ac_build_dual_src_blend_swizzle(struct ac_llvm_context *ctx,
                                             LLVMValueRef *arg0, LLVMValueRef *arg1)
{
   LLVMValueRef tid;
   LLVMValueRef src0, src1;
   LLVMValueRef tmp0;
   LLVMValueRef params[2];
   LLVMValueRef is_even;

   /* Work on i32 bit patterns regardless of the export's float type. */
   src0 = LLVMBuildBitCast(ctx->builder, *arg0, ctx->i32, "");
   src1 = LLVMBuildBitCast(ctx->builder, *arg1, ctx->i32, "");

   /* swap odd,even lanes of arg_0*/
   params[0] = src0;
   /* dpp8 selector 0xde54c1 packs 3 bits per lane: 1,0,3,2,5,4,7,6 —
    * i.e. exchange each even lane with its odd neighbor. */
   params[1] = LLVMConstInt(ctx->i32, 0xde54c1, 0);
   src0 = ac_build_intrinsic(ctx, "llvm.amdgcn.mov.dpp8.i32",
                             ctx->i32, params, 2, 0);

   /* swap even lanes between arg_0 and arg_1 */
   tid = ac_get_thread_id(ctx);
   is_even = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                           LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""),
                           ctx->i32_0, "");
   tmp0 = src0;
   src0 = LLVMBuildSelect(ctx->builder, is_even, src1, src0, "");
   src1 = LLVMBuildSelect(ctx->builder, is_even, tmp0, src1, "");

   /* swap odd,even lanes again for arg_0*/
   params[0] = src0;
   params[1] = LLVMConstInt(ctx->i32, 0xde54c1, 0);
   src0 = ac_build_intrinsic(ctx, "llvm.amdgcn.mov.dpp8.i32",
                             ctx->i32, params, 2, 0);

   *arg0 = src0;
   *arg1 = src1;
}
3583
ac_build_dual_src_blend_swizzle(struct ac_llvm_context * ctx,struct ac_export_args * mrt0,struct ac_export_args * mrt1)3584 void ac_build_dual_src_blend_swizzle(struct ac_llvm_context *ctx,
3585 struct ac_export_args *mrt0,
3586 struct ac_export_args *mrt1)
3587 {
3588 assert(ctx->gfx_level >= GFX11);
3589 assert(mrt0->enabled_channels == mrt1->enabled_channels);
3590
3591 for (int i = 0; i < 4; i++) {
3592 if (mrt0->enabled_channels & (1 << i) && mrt1->enabled_channels & (1 << i))
3593 _ac_build_dual_src_blend_swizzle(ctx, &mrt0->out[i], &mrt1->out[i]);
3594 }
3595 }
3596
/* Permute values within each quad: lane i of the quad reads lane "lane_i". */
LLVMValueRef ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned lane0,
                                   unsigned lane1, unsigned lane2, unsigned lane3)
{
   const unsigned perm = dpp_quad_perm(lane0, lane1, lane2, lane3);

   /* Pre-GFX8 chips lack DPP; emulate the quad permute with ds_swizzle. */
   if (ctx->gfx_level < GFX8)
      return ac_build_ds_swizzle(ctx, src, (1 << 15) | perm);

   return ac_build_dpp(ctx, src, src, perm, 0xf, 0xf, false);
}
3607
/* Read "src" from the lane selected by "index" using ds_bpermute, preserving
 * the original (possibly narrower-than-32-bit) type of src.
 */
LLVMValueRef ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMTypeRef src_type = LLVMTypeOf(src);

   /* ds_bpermute addresses lanes in byte units: scale the lane index by 4. */
   LLVMValueRef byte_index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
   LLVMValueRef value = LLVMBuildZExt(builder, src, ctx->i32, "");

   LLVMValueRef args[] = {byte_index, value};
   LLVMValueRef shuffled =
      ac_build_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", ctx->i32, args, 2, 0);
   return LLVMBuildTrunc(builder, shuffled, src_type, "");
}
3620
/* Return the exponent part of frexp(src0) via the amdgcn frexp.exp intrinsic.
 *
 * \param src0    the float value to decompose
 * \param bitsize bit width of src0 (16, 32 or 64); 16-bit input yields an
 *                i16 result, 32/64-bit input an i32 result.
 */
LLVMValueRef ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   /* String literals are immutable; keep the pointer const-qualified. */
   const char *intr;

   if (bitsize == 16) {
      intr = "llvm.amdgcn.frexp.exp.i16.f16";
      type = ctx->i16;
   } else if (bitsize == 32) {
      intr = "llvm.amdgcn.frexp.exp.i32.f32";
      type = ctx->i32;
   } else {
      intr = "llvm.amdgcn.frexp.exp.i32.f64";
      type = ctx->i32;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, 0);
}
/* Return the mantissa part of frexp(src0) via the amdgcn frexp.mant
 * intrinsic. The result has the same float type as the input.
 *
 * \param src0    the float value to decompose
 * \param bitsize bit width of src0 (16, 32 or 64)
 */
LLVMValueRef ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   /* String literals are immutable; keep the pointer const-qualified. */
   const char *intr;

   if (bitsize == 16) {
      intr = "llvm.amdgcn.frexp.mant.f16";
      type = ctx->f16;
   } else if (bitsize == 32) {
      intr = "llvm.amdgcn.frexp.mant.f32";
      type = ctx->f32;
   } else {
      intr = "llvm.amdgcn.frexp.mant.f64";
      type = ctx->f64;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, 0);
}
3663
/* Canonicalize a float value via llvm.canonicalize of the matching width.
 *
 * \param src0    the float value to canonicalize
 * \param bitsize bit width of src0 (16, 32 or 64)
 */
LLVMValueRef ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   /* String literals are immutable; keep the pointer const-qualified. */
   const char *intr;

   if (bitsize == 16) {
      intr = "llvm.canonicalize.f16";
      type = ctx->f16;
   } else if (bitsize == 32) {
      intr = "llvm.canonicalize.f32";
      type = ctx->f32;
   } else {
      intr = "llvm.canonicalize.f64";
      type = ctx->f64;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, 0);
}
3685
3686 /*
3687 * this takes an I,J coordinate pair,
3688 * and works out the X and Y derivatives.
3689 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
3690 */
LLVMValueRef ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij)
{
   LLVMValueRef derivs[4];

   for (unsigned chan = 0; chan < 2; chan++) {
      LLVMValueRef coord = LLVMBuildExtractElement(ctx->builder, interp_ij,
                                                   LLVMConstInt(ctx->i32, chan, false), "");
      derivs[chan] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, coord);     /* DDX */
      derivs[2 + chan] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, coord); /* DDY */
   }
   return ac_build_gather_values(ctx, derivs, 4);
}
3703
ac_build_load_helper_invocation(struct ac_llvm_context * ctx)3704 LLVMValueRef ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
3705 {
3706 LLVMValueRef result = ac_build_intrinsic(ctx, "llvm.amdgcn.live.mask", ctx->i1, NULL, 0, 0);
3707
3708 return LLVMBuildNot(ctx->builder, result, "");
3709 }
3710
/* Emit a call to "func" and tag the call site with the callee's calling
 * convention so the two always match.
 */
LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMTypeRef fn_type, LLVMValueRef func, LLVMValueRef *args,
                           unsigned num_args)
{
   LLVMValueRef call = LLVMBuildCall2(ctx->builder, fn_type, func, args, num_args, "");
   LLVMSetInstructionCallConv(call, LLVMGetFunctionCallConv(func));
   return call;
}
3718
/* Build the MRTZ export arguments for depth/stencil/samplemask/alpha output.
 * The channel layout depends on the SPI Z export format chosen from which
 * outputs are present; at least one of depth/stencil/samplemask is required.
 */
void ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
                     LLVMValueRef samplemask, LLVMValueRef mrt0_alpha, bool is_last,
                     struct ac_export_args *args)
{
   unsigned mask = 0;
   unsigned format = ac_get_spi_shader_z_format(depth != NULL, stencil != NULL, samplemask != NULL,
                                                mrt0_alpha != NULL);

   assert(depth || stencil || samplemask);

   memset(args, 0, sizeof(*args));

   if (is_last) {
      args->valid_mask = 1; /* whether the EXEC mask is valid */
      args->done = 1;       /* DONE bit */
   }

   /* Specify the target we are exporting */
   args->target = V_008DFC_SQ_EXP_MRTZ;

   args->compr = 0;                       /* COMP flag */
   args->out[0] = LLVMGetUndef(ctx->f32); /* R, depth */
   args->out[1] = LLVMGetUndef(ctx->f32); /* G, stencil test val[0:7], stencil op val[8:15] */
   args->out[2] = LLVMGetUndef(ctx->f32); /* B, sample mask */
   args->out[3] = LLVMGetUndef(ctx->f32); /* A, alpha to mask */

   if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
      /* Packed 16-bit export: stencil and samplemask share fewer channels. */
      assert(!depth);
      args->compr = ctx->gfx_level < GFX11; /* COMPR flag */

      if (stencil) {
         /* Stencil should be in X[23:16]. */
         stencil = ac_to_integer(ctx, stencil);
         stencil = LLVMBuildShl(ctx->builder, stencil, LLVMConstInt(ctx->i32, 16, 0), "");
         args->out[0] = ac_to_float(ctx, stencil);
         /* GFX11 enables only X; older chips use the compressed-export
          * channel-pair encoding (X and Y bits). */
         mask |= ctx->gfx_level >= GFX11 ? 0x1 : 0x3;
      }
      if (samplemask) {
         /* SampleMask should be in Y[15:0]. */
         args->out[1] = samplemask;
         mask |= ctx->gfx_level >= GFX11 ? 0x2 : 0xc;
      }
   } else {
      /* 32-bit export: one value per channel in R/G/B/A order. */
      if (depth) {
         args->out[0] = depth;
         mask |= 0x1;
      }
      if (stencil) {
         args->out[1] = stencil;
         mask |= 0x2;
      }
      if (samplemask) {
         args->out[2] = samplemask;
         mask |= 0x4;
      }
      if (mrt0_alpha) {
         args->out[3] = mrt0_alpha;
         mask |= 0x8;
      }
   }

   /* GFX6 (except OLAND and HAINAN) has a bug that it only looks
    * at the X writemask component. */
   if (ctx->gfx_level == GFX6 &&
       ctx->info->family != CHIP_OLAND &&
       ctx->info->family != CHIP_HAINAN)
      mask |= 0x1;

   /* Specify which components to enable */
   args->enabled_channels = mask;
}
3790
arg_llvm_type(enum ac_arg_type type,unsigned size,struct ac_llvm_context * ctx)3791 static LLVMTypeRef arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx)
3792 {
3793 LLVMTypeRef base;
3794 switch (type) {
3795 case AC_ARG_FLOAT:
3796 return size == 1 ? ctx->f32 : LLVMVectorType(ctx->f32, size);
3797 case AC_ARG_INT:
3798 return size == 1 ? ctx->i32 : LLVMVectorType(ctx->i32, size);
3799 case AC_ARG_CONST_PTR:
3800 base = ctx->i8;
3801 break;
3802 case AC_ARG_CONST_FLOAT_PTR:
3803 base = ctx->f32;
3804 break;
3805 case AC_ARG_CONST_PTR_PTR:
3806 base = ac_array_in_const32_addr_space(ctx->i8);
3807 break;
3808 case AC_ARG_CONST_DESC_PTR:
3809 base = ctx->v4i32;
3810 break;
3811 case AC_ARG_CONST_IMAGE_PTR:
3812 base = ctx->v8i32;
3813 break;
3814 default:
3815 assert(false);
3816 return NULL;
3817 }
3818
3819 assert(base);
3820 if (size == 1) {
3821 return ac_array_in_const32_addr_space(base);
3822 } else {
3823 assert(size == 2);
3824 return ac_array_in_const_addr_space(base);
3825 }
3826 }
3827
/* Create the shader's main LLVM function from the ABI argument list, position
 * the builder at its entry block, and apply calling-convention, SGPR and
 * float-mode attributes. Returns the function and its type.
 */
struct ac_llvm_pointer ac_build_main(const struct ac_shader_args *args, struct ac_llvm_context *ctx,
                                     enum ac_llvm_calling_convention convention, const char *name,
                                     LLVMTypeRef ret_type, LLVMModuleRef module)
{
   LLVMTypeRef arg_types[AC_MAX_ARGS];
   enum ac_arg_regfile arg_regfiles[AC_MAX_ARGS];

   /* ring_offsets doesn't have a corresponding function parameter because LLVM can allocate it
    * itself for scratch memory purposes and gives us access through llvm.amdgcn.implicit.buffer.ptr
    */
   unsigned arg_count = 0;
   for (unsigned i = 0; i < args->arg_count; i++) {
      if (args->ring_offsets.used && i == args->ring_offsets.arg_index) {
         ctx->ring_offsets_index = i;
         continue;
      }
      arg_regfiles[arg_count] = args->args[i].file;
      arg_types[arg_count++] = arg_llvm_type(args->args[i].type, args->args[i].size, ctx);
   }

   LLVMTypeRef main_function_type = LLVMFunctionType(ret_type, arg_types, arg_count, 0);

   LLVMValueRef main_function = LLVMAddFunction(module, name, main_function_type);
   LLVMBasicBlockRef main_function_body =
      LLVMAppendBasicBlockInContext(ctx->context, main_function, "main_body");
   LLVMPositionBuilderAtEnd(ctx->builder, main_function_body);

   LLVMSetFunctionCallConv(main_function, convention);
   /* Mark SGPR arguments "inreg"; SGPR pointers additionally get noalias,
    * unbounded dereferenceability and 4-byte alignment.
    * (Attribute indices are 1-based; 0 is the return value.) */
   for (unsigned i = 0; i < arg_count; ++i) {
      LLVMValueRef P = LLVMGetParam(main_function, i);

      if (arg_regfiles[i] != AC_ARG_SGPR)
         continue;

      ac_add_function_attr(ctx->context, main_function, i + 1, "inreg");

      if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
         ac_add_function_attr(ctx->context, main_function, i + 1, "noalias");
         ac_add_attr_dereferenceable(P, UINT64_MAX);
         ac_add_attr_alignment(P, 4);
      }
   }

   /* Fetch the ring-offsets descriptor pointer through the implicit buffer
    * pointer intrinsic instead of a real function argument (see above). */
   if (args->ring_offsets.used) {
      ctx->ring_offsets =
         ac_build_intrinsic(ctx, "llvm.amdgcn.implicit.buffer.ptr",
                            LLVMPointerType(ctx->i8, AC_ADDR_SPACE_CONST), NULL, 0, 0);
      ctx->ring_offsets = LLVMBuildBitCast(ctx->builder, ctx->ring_offsets,
                                           ac_array_in_const_addr_space(ctx->v4i32), "");
   }

   ctx->main_function = (struct ac_llvm_pointer) {
      .value = main_function,
      .pointee_type = main_function_type
   };

   /* Enable denormals for FP16 and FP64: */
   LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math", "ieee,ieee");
   /* Disable denormals for FP32: */
   LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math-f32",
                                      "preserve-sign,preserve-sign");

   /* Pixel shaders: tell the backend whether this shader exports depth and
    * whether a null color export is present. */
   if (convention == AC_LLVM_AMDGPU_PS) {
      LLVMAddTargetDependentFunctionAttr(main_function, "amdgpu-depth-export",
                                         ctx->exports_mrtz ? "1" : "0");
      LLVMAddTargetDependentFunctionAttr(main_function, "amdgpu-color-export",
                                         ctx->exports_color_null ? "1" : "0");
   }

   return ctx->main_function;
}
3899
ac_build_s_endpgm(struct ac_llvm_context * ctx)3900 void ac_build_s_endpgm(struct ac_llvm_context *ctx)
3901 {
3902 LLVMTypeRef calltype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
3903 LLVMValueRef code = LLVMConstInlineAsm(calltype, "s_endpgm", "", true, false);
3904 LLVMBuildCall2(ctx->builder, calltype, code, NULL, 0, "");
3905 }
3906
/* Return i1 true if "a" (f32) is any NaN or either infinity, using the
 * amdgcn float-class intrinsic.
 */
LLVMValueRef ac_build_is_inf_or_nan(struct ac_llvm_context *ctx, LLVMValueRef a)
{
   /* Class mask: signaling NaN, quiet NaN, -inf, +inf. */
   LLVMValueRef class_mask =
      LLVMConstInt(ctx->i32, S_NAN | Q_NAN | N_INFINITY | P_INFINITY, 0);
   LLVMValueRef args[] = {a, class_mask};
   return ac_build_intrinsic(ctx, "llvm.amdgcn.class.f32", ctx->i1, args, 2, 0);
}
3915