1//===- IntrinsicsAMDGPU.td - Defines AMDGPU intrinsics -----*- tablegen -*-===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// 9// This file defines all of the R600-specific intrinsics. 10// 11//===----------------------------------------------------------------------===// 12 13def global_ptr_ty : LLVMQualPointerType<1>; 14 15class AMDGPUReadPreloadRegisterIntrinsic 16 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; 17 18class AMDGPUReadPreloadRegisterIntrinsicNamed<string name> 19 : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, ClangBuiltin<name>; 20 21// Used to tag image and resource intrinsics with information used to generate 22// mem operands. 23class AMDGPURsrcIntrinsic<int rsrcarg, bit isimage = false> { 24 int RsrcArg = rsrcarg; 25 bit IsImage = isimage; 26} 27 28let TargetPrefix = "r600" in { 29 30multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz { 31 def _x : AMDGPUReadPreloadRegisterIntrinsic; 32 def _y : AMDGPUReadPreloadRegisterIntrinsic; 33 def _z : AMDGPUReadPreloadRegisterIntrinsic; 34} 35 36multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named<string prefix> { 37 def _x : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_x")>; 38 def _y : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_y")>; 39 def _z : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_z")>; 40} 41 42defm int_r600_read_global_size : AMDGPUReadPreloadRegisterIntrinsic_xyz_named 43 <"__builtin_r600_read_global_size">; 44defm int_r600_read_ngroups : AMDGPUReadPreloadRegisterIntrinsic_xyz_named 45 <"__builtin_r600_read_ngroups">; 46defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named 47 <"__builtin_r600_read_tgid">; 48 49defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz; 50defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz; 51 52def int_r600_group_barrier : ClangBuiltin<"__builtin_r600_group_barrier">, 53 Intrinsic<[], [], [IntrConvergent, IntrWillReturn]>; 54 55// AS 7 is PARAM_I_ADDRESS, used for kernel arguments 56def int_r600_implicitarg_ptr : 57 ClangBuiltin<"__builtin_r600_implicitarg_ptr">, 58 DefaultAttrsIntrinsic<[LLVMQualPointerType<7>], [], 59 [IntrNoMem, IntrSpeculatable]>; 60 61def int_r600_rat_store_typed : 62 // 1st parameter: Data 63 // 2nd parameter: Index 64 // 3rd parameter: Constant RAT ID 65 DefaultAttrsIntrinsic<[], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], []>, 66 ClangBuiltin<"__builtin_r600_rat_store_typed">; 67 68def int_r600_recipsqrt_ieee : DefaultAttrsIntrinsic< 69 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 70>; 71 72def int_r600_recipsqrt_clamped : DefaultAttrsIntrinsic< 73 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 74>; 75 76def int_r600_cube : DefaultAttrsIntrinsic< 77 [llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable] 78>; 79 80def int_r600_store_stream_output : DefaultAttrsIntrinsic< 81 [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [] 82>; 83 84class TextureIntrinsicFloatInput : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [ 85 llvm_v4f32_ty, // Coord 86 llvm_i32_ty, // offset_x 87 llvm_i32_ty, // offset_y, 88 llvm_i32_ty, // offset_z, 89 llvm_i32_ty, // resource_id 90 llvm_i32_ty, // samplerid 91 llvm_i32_ty, // coord_type_x 92 llvm_i32_ty, // coord_type_y 93 llvm_i32_ty, // coord_type_z 94 llvm_i32_ty], // coord_type_w 95 [IntrNoMem] 96>; 97 98class TextureIntrinsicInt32Input : DefaultAttrsIntrinsic<[llvm_v4i32_ty], [ 99 llvm_v4i32_ty, // Coord 100 llvm_i32_ty, // offset_x 101 llvm_i32_ty, // offset_y, 102 llvm_i32_ty, // offset_z, 103 llvm_i32_ty, // resource_id 104 llvm_i32_ty, // samplerid 105 llvm_i32_ty, // coord_type_x 106 llvm_i32_ty, // coord_type_y 107 llvm_i32_ty, // coord_type_z 108 llvm_i32_ty], // coord_type_w 109 [IntrNoMem] 110>; 111 112def int_r600_store_swizzle : 113 Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree] 114>; 115 116def int_r600_tex : TextureIntrinsicFloatInput; 117def int_r600_texc : TextureIntrinsicFloatInput; 118def int_r600_txl : TextureIntrinsicFloatInput; 119def int_r600_txlc : TextureIntrinsicFloatInput; 120def int_r600_txb : TextureIntrinsicFloatInput; 121def int_r600_txbc : TextureIntrinsicFloatInput; 122def int_r600_txf : TextureIntrinsicInt32Input; 123def int_r600_txq : TextureIntrinsicInt32Input; 124def int_r600_ddx : TextureIntrinsicFloatInput; 125def int_r600_ddy : TextureIntrinsicFloatInput; 126 127def int_r600_dot4 : DefaultAttrsIntrinsic<[llvm_float_ty], 128 [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable] 129>; 130 131def int_r600_kill : DefaultAttrsIntrinsic<[], [llvm_float_ty], []>; 132 133} // End TargetPrefix = "r600" 134 135let TargetPrefix = "amdgcn" in { 136 137//===----------------------------------------------------------------------===// 138// ABI Special Intrinsics 139//===----------------------------------------------------------------------===// 140 141defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz; 142defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named 143 <"__builtin_amdgcn_workgroup_id">; 144 145def int_amdgcn_dispatch_ptr : 146 DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [], 147 [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>; 148 149def int_amdgcn_queue_ptr : 150 ClangBuiltin<"__builtin_amdgcn_queue_ptr">, 151 DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [], 152 [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>; 153 154def int_amdgcn_kernarg_segment_ptr : 155 ClangBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">, 156 DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [], 157 [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>; 158 159def int_amdgcn_implicitarg_ptr : 160 ClangBuiltin<"__builtin_amdgcn_implicitarg_ptr">, 161 DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [], 162 [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>; 163 164def int_amdgcn_groupstaticsize : 165 ClangBuiltin<"__builtin_amdgcn_groupstaticsize">, 166 DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; 167 168def int_amdgcn_dispatch_id : 169 ClangBuiltin<"__builtin_amdgcn_dispatch_id">, 170 DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable]>; 171 172// For internal use. Coordinates LDS lowering between IR transform and backend. 173def int_amdgcn_lds_kernel_id : 174 DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; 175 176def int_amdgcn_implicit_buffer_ptr : 177 ClangBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">, 178 DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [], 179 [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>; 180 181// Set EXEC to the 64-bit value given. 182// This is always moved to the beginning of the basic block. 183// FIXME: Should be mangled for wave size. 184def int_amdgcn_init_exec : Intrinsic<[], 185 [llvm_i64_ty], // 64-bit literal constant 186 [IntrConvergent, IntrNoMem, IntrHasSideEffects, IntrNoCallback, 187 IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<0>>]>; 188 189// Set EXEC according to a thread count packed in an SGPR input: 190// thread_count = (input >> bitoffset) & 0x7f; 191// This is always moved to the beginning of the basic block. 192// Note: only inreg arguments to the parent function are valid as 193// inputs to this intrinsic, computed values cannot be used. 194def int_amdgcn_init_exec_from_input : Intrinsic<[], 195 [llvm_i32_ty, // 32-bit SGPR input 196 llvm_i32_ty], // bit offset of the thread count 197 [IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback, 198 IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>; 199 200def int_amdgcn_wavefrontsize : 201 ClangBuiltin<"__builtin_amdgcn_wavefrontsize">, 202 DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; 203 204// Represent a relocation constant. 205def int_amdgcn_reloc_constant : DefaultAttrsIntrinsic< 206 [llvm_i32_ty], [llvm_metadata_ty], 207 [IntrNoMem, IntrSpeculatable] 208>; 209 210//===----------------------------------------------------------------------===// 211// Instruction Intrinsics 212//===----------------------------------------------------------------------===// 213 214// The first parameter is s_sendmsg immediate (i16), 215// the second one is copied to m0 216def int_amdgcn_s_sendmsg : ClangBuiltin<"__builtin_amdgcn_s_sendmsg">, 217 Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], 218 [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>; 219def int_amdgcn_s_sendmsghalt : ClangBuiltin<"__builtin_amdgcn_s_sendmsghalt">, 220 Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], 221 [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>; 222 223 224// gfx11 intrinsic 225// The first parameter is s_sendmsg immediate (i16). Return type is i32 or i64. 226def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty], 227 [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>; 228 229def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">, 230 Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 231 232def int_amdgcn_s_barrier_signal : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal">, 233 Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, 234 IntrNoCallback, IntrNoFree]>; 235 236def int_amdgcn_s_barrier_signal_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_var">, 237 Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, 238 IntrNoCallback, IntrNoFree]>; 239 240def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst">, 241 Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, 242 IntrWillReturn, IntrNoCallback, IntrNoFree]>; 243 244def int_amdgcn_s_barrier_signal_isfirst_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst_var">, 245 Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, 246 IntrNoCallback, IntrNoFree]>; 247 248def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">, 249 Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, 250 IntrWillReturn, IntrNoCallback, IntrNoFree]>; 251 252def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">, 253 Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, 254 IntrNoCallback, IntrNoFree]>; 255 256def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">, 257 Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, 258 IntrNoCallback, IntrNoFree]>; 259 260def int_amdgcn_s_barrier_wait : ClangBuiltin<"__builtin_amdgcn_s_barrier_wait">, 261 Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, 262 IntrWillReturn, IntrNoCallback, IntrNoFree]>; 263 264def int_amdgcn_s_barrier_leave : ClangBuiltin<"__builtin_amdgcn_s_barrier_leave">, 265 Intrinsic<[llvm_i1_ty], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 266 267def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrier_state">, 268 Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, 269 IntrNoCallback, IntrNoFree]>; 270 271def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">, 272 Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 273 274// The 1st parameter is a mask for the types of instructions that may be allowed 275// to cross the SCHED_BARRIER during scheduling. 276// MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER. 277// MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be 278// scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass. 279// MASK = 0x0000 0002: VALU instructions may be scheduled across SCHED_BARRIER. 280// MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER. 281// MASK = 0x0000 0008: MFMA/WMMA instructions may be scheduled across SCHED_BARRIER. 282// MASK = 0x0000 0010: ALL VMEM instructions may be scheduled across SCHED_BARRIER. 283// MASK = 0x0000 0020: VMEM read instructions may be scheduled across SCHED_BARRIER. 284// MASK = 0x0000 0040: VMEM write instructions may be scheduled across SCHED_BARRIER. 285// MASK = 0x0000 0080: ALL DS instructions may be scheduled across SCHED_BARRIER. 286// MASK = 0x0000 0100: ALL DS read instructions may be scheduled accoss SCHED_BARRIER. 287// MASK = 0x0000 0200: ALL DS write instructions may be scheduled across SCHED_BARRIER. 288def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_amdgcn_sched_barrier">, 289 Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, 290 IntrWillReturn, IntrNoCallback, IntrNoFree]>; 291 292// The first parameter is a mask that determines the types of instructions that 293// you would like to synchronize around and add to a scheduling group. The 294// values of the mask are defined above for sched_barrier. These instructions 295// will be selected from the bottom up starting from the sched_group_barrier's 296// location during instruction scheduling. The second parameter is the number of 297// matching instructions that will be associated with this sched_group_barrier. 298// The third parameter is an identifier which is used to describe what other 299// sched_group_barriers should be synchronized with. 300def int_amdgcn_sched_group_barrier : ClangBuiltin<"__builtin_amdgcn_sched_group_barrier">, 301 Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 302 [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects, 303 IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 304 305// Scheduler optimization hint. 306// MASK = 0: Small gemm opt 307def int_amdgcn_iglp_opt : ClangBuiltin<"__builtin_amdgcn_iglp_opt">, 308 Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, 309 IntrWillReturn, IntrNoCallback, IntrNoFree]>; 310 311def int_amdgcn_s_waitcnt : ClangBuiltin<"__builtin_amdgcn_s_waitcnt">, 312 Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 313 314// GFX12 intrinsics 315class AMDGPUWaitIntrinsic : 316 Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 317def int_amdgcn_s_wait_bvhcnt : AMDGPUWaitIntrinsic; 318def int_amdgcn_s_wait_dscnt : AMDGPUWaitIntrinsic; 319def int_amdgcn_s_wait_expcnt : AMDGPUWaitIntrinsic; 320def int_amdgcn_s_wait_kmcnt : AMDGPUWaitIntrinsic; 321def int_amdgcn_s_wait_loadcnt : AMDGPUWaitIntrinsic; 322def int_amdgcn_s_wait_samplecnt : AMDGPUWaitIntrinsic; 323def int_amdgcn_s_wait_storecnt : AMDGPUWaitIntrinsic; 324 325def int_amdgcn_div_scale : DefaultAttrsIntrinsic< 326 // 1st parameter: Numerator 327 // 2nd parameter: Denominator 328 // 3rd parameter: Select quotient. Must equal Numerator or Denominator. 329 // (0 = Denominator, 1 = Numerator). 330 [llvm_anyfloat_ty, llvm_i1_ty], 331 [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty], 332 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>] 333>; 334 335def int_amdgcn_div_fmas : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], 336 [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty], 337 [IntrNoMem, IntrSpeculatable] 338>; 339 340def int_amdgcn_div_fixup : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], 341 [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], 342 [IntrNoMem, IntrSpeculatable] 343>; 344 345// Look Up 2.0 / pi src0 with segment select src1[4:0] 346def int_amdgcn_trig_preop : DefaultAttrsIntrinsic< 347 [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], 348 [IntrNoMem, IntrSpeculatable] 349>; 350 351def int_amdgcn_sin : DefaultAttrsIntrinsic< 352 [llvm_anyfloat_ty], [LLVMMatchType<0>], 353 [IntrNoMem, IntrSpeculatable] 354>; 355 356def int_amdgcn_cos : DefaultAttrsIntrinsic< 357 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 358>; 359 360// v_log_{f16|f32}, performs log2. f32 version does not handle 361// denormals. There is no reason to use this for f16 as it does 362// support denormals, and the generic log2 intrinsic should be 363// preferred. 364def int_amdgcn_log : DefaultAttrsIntrinsic< 365 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 366>; 367 368// v_exp_{f16|f32} (int_amdgcn_exp was taken by export 369// already). Performs exp2. f32 version does not handle 370// denormals. There is no reason to use this for f16 as it does 371// support denormals, and the generic exp2 intrinsic should be 372// preferred. 373def int_amdgcn_exp2 : DefaultAttrsIntrinsic< 374 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 375>; 376 377def int_amdgcn_log_clamp : DefaultAttrsIntrinsic< 378 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 379>; 380 381def int_amdgcn_fmul_legacy : ClangBuiltin<"__builtin_amdgcn_fmul_legacy">, 382 DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], 383 [IntrNoMem, IntrSpeculatable, Commutative] 384>; 385 386// Fused single-precision multiply-add with legacy behaviour for the multiply, 387// which is that +/- 0.0 * anything (even NaN or infinity) is +0.0. This is 388// intended for use on subtargets that have the v_fma_legacy_f32 and/or 389// v_fmac_legacy_f32 instructions. (Note that v_fma_legacy_f16 is unrelated and 390// has a completely different kind of legacy behaviour.) 391def int_amdgcn_fma_legacy : 392 DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], 393 [IntrNoMem, IntrSpeculatable, Commutative] 394>; 395 396def int_amdgcn_rcp : DefaultAttrsIntrinsic< 397 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 398>; 399 400def int_amdgcn_rcp_legacy : ClangBuiltin<"__builtin_amdgcn_rcp_legacy">, 401 DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], 402 [IntrNoMem, IntrSpeculatable] 403>; 404 405def int_amdgcn_sqrt : DefaultAttrsIntrinsic< 406 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 407>; 408 409def int_amdgcn_rsq : DefaultAttrsIntrinsic< 410 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 411>; 412 413def int_amdgcn_rsq_legacy : ClangBuiltin<"__builtin_amdgcn_rsq_legacy">, 414 DefaultAttrsIntrinsic< 415 [llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable] 416>; 417 418// out = 1.0 / sqrt(a) result clamped to +/- max_float. 419def int_amdgcn_rsq_clamp : DefaultAttrsIntrinsic< 420 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; 421 422def int_amdgcn_frexp_mant : DefaultAttrsIntrinsic< 423 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 424>; 425 426def int_amdgcn_frexp_exp : DefaultAttrsIntrinsic< 427 [llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem, IntrSpeculatable] 428>; 429 430// v_fract is buggy on SI/CI. It mishandles infinities, may return 1.0 431// and always uses rtz, so is not suitable for implementing the OpenCL 432// fract function. It should be ok on VI. 433def int_amdgcn_fract : DefaultAttrsIntrinsic< 434 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] 435>; 436 437def int_amdgcn_cvt_pkrtz : ClangBuiltin<"__builtin_amdgcn_cvt_pkrtz">, 438 DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty], 439 [IntrNoMem, IntrSpeculatable] 440>; 441 442def int_amdgcn_cvt_pknorm_i16 : 443 ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_i16">, 444 DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty], 445 [IntrNoMem, IntrSpeculatable] 446>; 447 448def int_amdgcn_cvt_pknorm_u16 : 449 ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_u16">, 450 DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty], 451 [IntrNoMem, IntrSpeculatable] 452>; 453 454def int_amdgcn_cvt_pk_i16 : 455 ClangBuiltin<"__builtin_amdgcn_cvt_pk_i16">, 456 DefaultAttrsIntrinsic< 457 [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty], 458 [IntrNoMem, IntrSpeculatable] 459>; 460 461def int_amdgcn_cvt_pk_u16 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_u16">, 462 DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty], 463 [IntrNoMem, IntrSpeculatable] 464>; 465 466def int_amdgcn_class : DefaultAttrsIntrinsic< 467 [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty], 468 [IntrNoMem, IntrSpeculatable] 469>; 470 471def int_amdgcn_fmed3 : 472 DefaultAttrsIntrinsic<[llvm_anyfloat_ty], 473 [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], 474 [IntrNoMem, IntrSpeculatable] 475>; 476 477def int_amdgcn_cubeid : ClangBuiltin<"__builtin_amdgcn_cubeid">, 478 DefaultAttrsIntrinsic<[llvm_float_ty], 479 [llvm_float_ty, llvm_float_ty, llvm_float_ty], 480 [IntrNoMem, IntrSpeculatable] 481>; 482 483def int_amdgcn_cubema : ClangBuiltin<"__builtin_amdgcn_cubema">, 484 DefaultAttrsIntrinsic<[llvm_float_ty], 485 [llvm_float_ty, llvm_float_ty, llvm_float_ty], 486 [IntrNoMem, IntrSpeculatable] 487>; 488 489def int_amdgcn_cubesc : ClangBuiltin<"__builtin_amdgcn_cubesc">, 490 DefaultAttrsIntrinsic<[llvm_float_ty], 491 [llvm_float_ty, llvm_float_ty, llvm_float_ty], 492 [IntrNoMem, IntrSpeculatable] 493>; 494 495def int_amdgcn_cubetc : ClangBuiltin<"__builtin_amdgcn_cubetc">, 496 DefaultAttrsIntrinsic<[llvm_float_ty], 497 [llvm_float_ty, llvm_float_ty, llvm_float_ty], 498 [IntrNoMem, IntrSpeculatable] 499>; 500 501// v_ffbh_i32, as opposed to v_ffbh_u32. For v_ffbh_u32, llvm.ctlz 502// should be used. 503def int_amdgcn_sffbh : 504 DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], 505 [IntrNoMem, IntrSpeculatable] 506>; 507 508// v_mad_f32|f16/v_mac_f32|f16, selected regardless of denorm support. 509def int_amdgcn_fmad_ftz : 510 DefaultAttrsIntrinsic<[llvm_anyfloat_ty], 511 [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], 512 [IntrNoMem, IntrSpeculatable] 513>; 514 515class AMDGPULDSIntrin : 516 Intrinsic<[llvm_any_ty], 517 [LLVMQualPointerType<3>, 518 LLVMMatchType<0>, 519 llvm_i32_ty, // ordering 520 llvm_i32_ty, // scope 521 llvm_i1_ty], // isVolatile 522 [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, 523 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree] 524>; 525 526// FIXME: The m0 argument should be moved after the normal arguments 527class AMDGPUDSOrderedIntrinsic : Intrinsic< 528 [llvm_i32_ty], 529 // M0 = {hi16:address, lo16:waveID}. Allow passing M0 as a pointer, so that 530 // the bit packing can be optimized at the IR level. 531 [LLVMQualPointerType<2>, // IntToPtr(M0) 532 llvm_i32_ty, // value to add or swap 533 llvm_i32_ty, // ordering 534 llvm_i32_ty, // scope 535 llvm_i1_ty, // isVolatile 536 llvm_i32_ty, // ordered count index (OA index), also added to the address 537 // gfx10: bits 24-27 indicate the number of active threads/dwords 538 llvm_i1_ty, // wave release, usually set to 1 539 llvm_i1_ty], // wave done, set to 1 for the last ordered instruction 540 [IntrWillReturn, NoCapture<ArgIndex<0>>, 541 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, 542 ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree 543 ] 544>; 545 546class AMDGPUDSAppendConsumedIntrinsic : Intrinsic< 547 [llvm_i32_ty], 548 [llvm_anyptr_ty, // LDS or GDS ptr 549 llvm_i1_ty], // isVolatile 550 [IntrConvergent, IntrWillReturn, IntrArgMemOnly, 551 NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree], 552 "", 553 [SDNPMemOperand] 554>; 555 556def int_amdgcn_ds_ordered_add : AMDGPUDSOrderedIntrinsic; 557def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic; 558 559// The pointer argument is assumed to be dynamically uniform if a VGPR. 560def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic; 561def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic; 562 563def int_amdgcn_ds_fadd : AMDGPULDSIntrin; 564def int_amdgcn_ds_fmin : AMDGPULDSIntrin; 565def int_amdgcn_ds_fmax : AMDGPULDSIntrin; 566 567} // TargetPrefix = "amdgcn" 568 569// New-style image intrinsics 570 571////////////////////////////////////////////////////////////////////////// 572// Dimension-aware image intrinsics framework 573////////////////////////////////////////////////////////////////////////// 574 575// Helper class to represent (type, name) combinations of arguments. The 576// argument names are explanatory and used as DAG operand names for codegen 577// pattern matching. 578class AMDGPUArg<LLVMType ty, string name> { 579 LLVMType Type = ty; 580 string Name = name; 581} 582 583// Return [AMDGPUArg<basety, names[0]>, AMDGPUArg<LLVMMatchType<0>, names[1]>, ...] 584class makeArgList<list<string> names, LLVMType basety> { 585 list<AMDGPUArg> ret = 586 !listconcat([AMDGPUArg<basety, names[0]>], 587 !foreach(name, !tail(names), AMDGPUArg<LLVMMatchType<0>, name>)); 588} 589 590// Return arglist, with LLVMMatchType's references shifted by 'shift'. 591class arglistmatchshift<list<AMDGPUArg> arglist, int shift> { 592 list<AMDGPUArg> ret = 593 !foreach(arg, arglist, 594 !if(!isa<LLVMMatchType>(arg.Type), 595 AMDGPUArg<LLVMMatchType<!add(!cast<LLVMMatchType>(arg.Type).Number, shift)>, 596 arg.Name>, 597 arg)); 598} 599 600// Return the concatenation of the given arglists. LLVMMatchType's are adjusted 601// accordingly, and shifted by an additional 'shift'. 602class arglistconcat<list<list<AMDGPUArg>> arglists, int shift = 0> { 603 list<AMDGPUArg> ret = 604 !foldl([]<AMDGPUArg>, arglists, lhs, rhs, 605 !listconcat( 606 lhs, 607 arglistmatchshift<rhs, 608 !add(shift, !foldl(0, lhs, a, b, 609 !add(a, b.Type.isAny)))>.ret)); 610} 611 612// Represent texture/image types / dimensionality. 613class AMDGPUDimProps<bits<3> enc, string name, string asmsuffix, 614 list<string> coord_names, list<string> slice_names, 615 bit msaa = 0> { 616 AMDGPUDimProps Dim = !cast<AMDGPUDimProps>(NAME); 617 string Name = name; // e.g. "2darraymsaa" 618 string AsmSuffix = asmsuffix; // e.g. 2D_MSAA_ARRAY (used in assembly strings) 619 bits<3> Encoding = enc; 620 bit DA = 0; // DA bit in MIMG encoding 621 bit MSAA = msaa; 622 623 list<AMDGPUArg> CoordSliceArgs = 624 makeArgList<!listconcat(coord_names, slice_names), llvm_anyfloat_ty>.ret; 625 list<AMDGPUArg> CoordSliceIntArgs = 626 makeArgList<!listconcat(coord_names, slice_names), llvm_anyint_ty>.ret; 627 list<AMDGPUArg> GradientArgs = 628 makeArgList<!listconcat(!foreach(name, coord_names, "d" # name # "dh"), 629 !foreach(name, coord_names, "d" # name # "dv")), 630 llvm_anyfloat_ty>.ret; 631 632 bits<8> NumCoords = !size(CoordSliceArgs); 633 bits<8> NumGradients = !size(GradientArgs); 634} 635 636def AMDGPUDim1D : AMDGPUDimProps<0x0, "1d", "1D", ["s"], []>; 637def AMDGPUDim2D : AMDGPUDimProps<0x1, "2d", "2D", ["s", "t"], []>; 638def AMDGPUDim3D : AMDGPUDimProps<0x2, "3d", "3D", ["s", "t", "r"], []>; 639let DA = 1 in { 640 def AMDGPUDimCube : AMDGPUDimProps<0x3, "cube", "CUBE", ["s", "t"], ["face"]>; 641 def AMDGPUDim1DArray : AMDGPUDimProps<0x4, "1darray", "1D_ARRAY", ["s"], ["slice"]>; 642 def AMDGPUDim2DArray : AMDGPUDimProps<0x5, "2darray", "2D_ARRAY", ["s", "t"], ["slice"]>; 643} 644def AMDGPUDim2DMsaa : AMDGPUDimProps<0x6, "2dmsaa", "2D_MSAA", ["s", "t"], ["fragid"], 1>; 645let DA = 1 in { 646 def AMDGPUDim2DArrayMsaa : AMDGPUDimProps<0x7, "2darraymsaa", "2D_MSAA_ARRAY", ["s", "t"], ["slice", "fragid"], 1>; 647} 648 649def AMDGPUDims { 650 list<AMDGPUDimProps> NoMsaa = [AMDGPUDim1D, AMDGPUDim2D, AMDGPUDim3D, 651 AMDGPUDimCube, AMDGPUDim1DArray, 652 AMDGPUDim2DArray]; 653 list<AMDGPUDimProps> Msaa = [AMDGPUDim2DMsaa, AMDGPUDim2DArrayMsaa]; 654 list<AMDGPUDimProps> All = !listconcat(NoMsaa, Msaa); 655} 656 657// Represent sample variants, i.e. _C, _O, _B, ... and combinations thereof. 658class AMDGPUSampleVariant<string ucmod, string lcmod, list<AMDGPUArg> extra_addr> { 659 string UpperCaseMod = ucmod; 660 string LowerCaseMod = lcmod; 661 662 // {offset} {bias} {z-compare} 663 list<AMDGPUArg> ExtraAddrArgs = extra_addr; 664 bit Offset = false; 665 bit Bias = false; 666 bit ZCompare = false; 667 bit Gradients = false; 668 669 // Name of the {lod} or {clamp} argument that is appended to the coordinates, 670 // if any. 671 string LodOrClamp = ""; 672} 673 674// AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE 675// AMDGPUSampleVariantsNoGradients: variants supported by IMAGE_GATHER4 676defset list<AMDGPUSampleVariant> AMDGPUSampleVariants = { 677 multiclass AMDGPUSampleHelper_Offset<string ucmod, string lcmod, 678 list<AMDGPUArg> extra_addr> { 679 def NAME#lcmod : AMDGPUSampleVariant<ucmod, lcmod, extra_addr>; 680 let Offset = true in 681 def NAME#lcmod#_o : AMDGPUSampleVariant< 682 ucmod#"_O", lcmod#"_o", !listconcat([AMDGPUArg<llvm_i32_ty, "offset">], extra_addr)>; 683 } 684 685 multiclass AMDGPUSampleHelper_Compare<string ucmod, string lcmod, 686 list<AMDGPUArg> extra_addr> { 687 defm NAME : AMDGPUSampleHelper_Offset<ucmod, lcmod, extra_addr>; 688 let ZCompare = true in 689 defm NAME : AMDGPUSampleHelper_Offset< 690 "_C"#ucmod, "_c"#lcmod, !listconcat(extra_addr, [AMDGPUArg<llvm_float_ty, "zcompare">])>; 691 } 692 693 multiclass AMDGPUSampleHelper_Clamp<string ucmod, string lcmod, 694 list<AMDGPUArg> extra_addr> { 695 defm NAME : AMDGPUSampleHelper_Compare<ucmod, lcmod, extra_addr>; 696 let LodOrClamp = "clamp" in 697 defm NAME : AMDGPUSampleHelper_Compare<ucmod#"_CL", lcmod#"_cl", extra_addr>; 698 } 699 700 defset list<AMDGPUSampleVariant> AMDGPUSampleVariantsNoGradients = { 701 defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>; 702 let Bias = true in 703 defm AMDGPUSample : AMDGPUSampleHelper_Clamp< 704 "_B", "_b", [AMDGPUArg<llvm_anyfloat_ty, "bias">]>; 705 let LodOrClamp = "lod" in 706 defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_L", "_l", []>; 707 defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_LZ", "_lz", []>; 708 } 709 710 let Gradients = true in { 711 defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_D", "_d", []>; 712 defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_CD", "_cd", []>; 713 } 714} 715 716// Helper class to capture the profile of a dimension-aware image intrinsic. 717// This information is used to generate the intrinsic's type and to inform 718// codegen pattern matching. 719class AMDGPUDimProfile<string opmod, 720 AMDGPUDimProps dim> { 721 AMDGPUDimProps Dim = dim; 722 string OpMod = opmod; // the corresponding instruction is named IMAGE_OpMod 723 724 // These are intended to be overwritten by subclasses 725 bit IsSample = false; 726 bit IsAtomic = false; 727 list<LLVMType> RetTypes = []; 728 list<AMDGPUArg> DataArgs = []; 729 list<AMDGPUArg> ExtraAddrArgs = []; 730 bit Offset = false; 731 bit Bias = false; 732 bit ZCompare = false; 733 bit Gradients = false; 734 string LodClampMip = ""; 735 736 int NumRetAndDataAnyTypes = 737 !foldl(0, !listconcat(RetTypes, !foreach(arg, DataArgs, arg.Type)), a, b, 738 !add(a, b.isAny)); 739 740 list<AMDGPUArg> AddrArgs = 741 arglistconcat<[ExtraAddrArgs, 742 !if(Gradients, dim.GradientArgs, []), 743 !listconcat(!if(IsSample, dim.CoordSliceArgs, dim.CoordSliceIntArgs), 744 !if(!empty(LodClampMip), 745 []<AMDGPUArg>, 746 [AMDGPUArg<LLVMMatchType<0>, LodClampMip>]))], 747 NumRetAndDataAnyTypes>.ret; 748 list<LLVMType> AddrTypes = !foreach(arg, AddrArgs, arg.Type); 749 list<AMDGPUArg> AddrDefaultArgs = 750 !foreach(arg, AddrArgs, 751 AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)), 752 !if(IsSample, llvm_float_ty, llvm_i32_ty), arg.Type), 753 arg.Name>); 754 list<AMDGPUArg> AddrA16Args = 755 !foreach(arg, AddrArgs, 756 AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)), 757 !if(IsSample, llvm_half_ty, llvm_i16_ty), arg.Type), 758 arg.Name>); 759} 760 761class AMDGPUDimProfileCopy<AMDGPUDimProfile base> : AMDGPUDimProfile<base.OpMod, base.Dim> { 762 let IsSample = base.IsSample; 763 let IsAtomic = base.IsAtomic; 764 let RetTypes = base.RetTypes; 765 let DataArgs = base.DataArgs; 766 let ExtraAddrArgs = base.ExtraAddrArgs; 767 let Offset = base.Offset; 768 let Bias = base.Bias; 769 let ZCompare = base.ZCompare; 770 let Gradients = base.Gradients; 771 let LodClampMip = base.LodClampMip; 772} 773 774class AMDGPUDimSampleProfile<string opmod, 775 AMDGPUDimProps dim, 776 AMDGPUSampleVariant sample> : AMDGPUDimProfile<opmod, dim> { 777 let IsSample = true; 778 let RetTypes = [llvm_any_ty]; 779 let ExtraAddrArgs = sample.ExtraAddrArgs; 780 let Offset = sample.Offset; 781 let Bias = sample.Bias; 782 let ZCompare = sample.ZCompare; 783 let Gradients = sample.Gradients; 784 let LodClampMip = sample.LodOrClamp; 785} 786 787class AMDGPUDimNoSampleProfile<string opmod, 788 AMDGPUDimProps dim, 789 list<LLVMType> retty, 790 list<AMDGPUArg> dataargs, 791 bit Mip = false> : AMDGPUDimProfile<opmod, dim> { 792 let RetTypes = retty; 793 let DataArgs = dataargs; 794 let LodClampMip = !if(Mip, "mip", ""); 795} 796 797class AMDGPUDimAtomicProfile<string opmod, 798 AMDGPUDimProps dim, 799 list<AMDGPUArg> dataargs> : AMDGPUDimProfile<opmod, dim> { 800 let RetTypes = [llvm_anyint_ty]; 801 let DataArgs = dataargs; 802 let IsAtomic = true; 803} 804 805class AMDGPUDimAtomicFloatProfile<string opmod, AMDGPUDimProps dim, 806 list<AMDGPUArg> dataargs> 807 : AMDGPUDimAtomicProfile<opmod, dim, dataargs> { 808 let RetTypes = [llvm_anyfloat_ty]; 809} 810 811class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim> 812 : AMDGPUDimProfile<"GET_RESINFO", dim> { 813 let RetTypes = [llvm_anyfloat_ty]; 814 let DataArgs = []; 815 let AddrArgs = [AMDGPUArg<llvm_anyint_ty, "mip">]; 816 let LodClampMip = "mip"; 817} 818 819// Helper class for figuring out image intrinsic argument indexes. 820class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> { 821 int NumDataArgs = !size(P_.DataArgs); 822 int NumDmaskArgs = !not(P_.IsAtomic); 823 int NumOffsetArgs = !if(P_.Offset, 1, 0); 824 int NumBiasArgs = !if(P_.Bias, 1, 0); 825 int NumZCompareArgs = !if(P_.ZCompare, 1, 0); 826 int NumExtraAddrArgs = !add(NumOffsetArgs, NumBiasArgs, NumZCompareArgs); 827 int NumVAddrArgs = !size(P_.AddrArgs); 828 int NumGradientArgs = !if(P_.Gradients, !size(P_.Dim.GradientArgs), 0); 829 int NumCoordArgs = !if(P_.IsSample, !size(P_.Dim.CoordSliceArgs), !size(P_.Dim.CoordSliceIntArgs)); 830 int NumRSrcArgs = 1; 831 int NumSampArgs = !if(P_.IsSample, 2, 0); 832 int DmaskArgIndex = NumDataArgs; 833 int VAddrArgIndex = !add(DmaskArgIndex, NumDmaskArgs); 834 int OffsetArgIndex = VAddrArgIndex; 835 int BiasArgIndex = !add(VAddrArgIndex, NumOffsetArgs); 836 int ZCompareArgIndex = !add(BiasArgIndex, NumBiasArgs); 837 int GradientArgIndex = !add(VAddrArgIndex, NumExtraAddrArgs); 838 int CoordArgIndex = !add(GradientArgIndex, NumGradientArgs); 839 int LodArgIndex = !add(VAddrArgIndex, NumVAddrArgs, -1); 840 int MipArgIndex = LodArgIndex; 841 int RsrcArgIndex = !add(VAddrArgIndex, NumVAddrArgs); 842 int SampArgIndex = !add(RsrcArgIndex, NumRSrcArgs); 843 int UnormArgIndex = !add(SampArgIndex, 1); 844 int TexFailCtrlArgIndex = !add(SampArgIndex, NumSampArgs); 845 int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1); 846} 847 848// All dimension-aware intrinsics are derived from this class. 849class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_, 850 list<IntrinsicProperty> props, 851 list<SDNodeProperty> sdnodeprops> : Intrinsic< 852 P_.RetTypes, // vdata(VGPR) -- for load/atomic-with-return 853 !listconcat( 854 !foreach(arg, P_.DataArgs, arg.Type), // vdata(VGPR) -- for store/atomic 855 !if(P_.IsAtomic, [], [llvm_i32_ty]), // dmask(imm) 856 P_.AddrTypes, // vaddr(VGPR) 857 [llvm_v8i32_ty], // rsrc(SGPR) 858 !if(P_.IsSample, [llvm_v4i32_ty, // samp(SGPR) 859 llvm_i1_ty], []), // unorm(imm) 860 [llvm_i32_ty, // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe) 861 llvm_i32_ty]), // auxiliary/cachepolicy(imm): 862 // bit 0 = glc, bit 1 = slc, 863 // bit 2 = dlc (gfx10/gfx11), 864 // bit 4 = scc (gfx90a) 865 // gfx940: bit 0 = sc0, bit 1 = nt, bit 4 = sc1 866 // gfx12+: bits [0-2] = th, bits [3-4] = scope 867 !listconcat(props, [IntrNoCallback, IntrNoFree, IntrWillReturn], 868 !if(P_.IsAtomic, [], [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>>]), 869 !if(P_.IsSample, [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>>], []), 870 [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.TexFailCtrlArgIndex>>, 871 ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>], 872 !if(P_.IsAtomic, [], [IntrNoSync])), 873 874 875 "", sdnodeprops>, 876 AMDGPURsrcIntrinsic<!add(!size(P_.DataArgs), !size(P_.AddrTypes), 877 !if(P_.IsAtomic, 0, 1)), 1> { 878 AMDGPUDimProfile P = P_; 879 880 AMDGPUImageDimIntrinsic Intr = !cast<AMDGPUImageDimIntrinsic>(NAME); 881 882 let TargetPrefix = "amdgcn"; 883} 884 885// Marker class for intrinsics with a DMask that determines the returned 886// channels. 887class AMDGPUImageDMaskIntrinsic; 888 889defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = { 890 891 ////////////////////////////////////////////////////////////////////////// 892 // Load and store intrinsics 893 ////////////////////////////////////////////////////////////////////////// 894 multiclass AMDGPUImageDimIntrinsicsNoMsaa<string opmod, 895 list<LLVMType> retty, 896 list<AMDGPUArg> dataargs, 897 list<IntrinsicProperty> props, 898 list<SDNodeProperty> sdnodeprops, 899 bit Mip = false> { 900 foreach dim = AMDGPUDims.NoMsaa in { 901 def !strconcat(NAME, "_", dim.Name) 902 : AMDGPUImageDimIntrinsic< 903 AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>, 904 props, sdnodeprops>; 905 } 906 } 907 908 multiclass AMDGPUImageDimIntrinsicsAll<string opmod, 909 list<LLVMType> retty, 910 list<AMDGPUArg> dataargs, 911 list<IntrinsicProperty> props, 912 list<SDNodeProperty> sdnodeprops, 913 bit Mip = false> { 914 foreach dim = AMDGPUDims.All in { 915 def !strconcat(NAME, "_", dim.Name) 916 : AMDGPUImageDimIntrinsic< 917 AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>, 918 props, sdnodeprops>; 919 } 920 } 921 922 defm int_amdgcn_image_load 923 : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem], 924 [SDNPMemOperand]>, 925 AMDGPUImageDMaskIntrinsic; 926 defm int_amdgcn_image_load_mip 927 : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [], 928 [IntrReadMem, IntrWillReturn], [SDNPMemOperand], 1>, 929 AMDGPUImageDMaskIntrinsic; 930 931 defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll< 932 "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">], 933 [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>, 934 AMDGPUImageDMaskIntrinsic; 935 defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa< 936 "STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">], 937 [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>, 938 AMDGPUImageDMaskIntrinsic; 939 940 ////////////////////////////////////////////////////////////////////////// 941 // MSAA intrinsics 942 ////////////////////////////////////////////////////////////////////////// 943 foreach dim = AMDGPUDims.Msaa in { 944 def int_amdgcn_image_msaa_load_x # _ # dim.Name: 945 AMDGPUImageDimIntrinsic< 946 AMDGPUDimNoSampleProfile<"MSAA_LOAD_X", dim, [llvm_any_ty], []>, 947 [IntrReadMem], [SDNPMemOperand]>; 948 } 949 950 foreach dim = AMDGPUDims.Msaa in { 951 def int_amdgcn_image_msaa_load # _ # dim.Name: 952 AMDGPUImageDimIntrinsic< 953 AMDGPUDimNoSampleProfile<"MSAA_LOAD", dim, [llvm_any_ty], []>, 954 [IntrReadMem], [SDNPMemOperand]>; 955 } 956 957 ////////////////////////////////////////////////////////////////////////// 958 // sample and getlod intrinsics 959 ////////////////////////////////////////////////////////////////////////// 960 multiclass AMDGPUImageDimSampleDims<string opmod, 961 AMDGPUSampleVariant sample, 962 bit NoMem = false> { 963 foreach dim = AMDGPUDims.NoMsaa in { 964 def !strconcat(NAME, "_", dim.Name) : AMDGPUImageDimIntrinsic< 965 AMDGPUDimSampleProfile<opmod, dim, sample>, 966 !if(NoMem, [IntrNoMem], [IntrReadMem]), 967 !if(NoMem, [], [SDNPMemOperand])>; 968 } 969 } 970 971 foreach sample = AMDGPUSampleVariants in { 972 defm int_amdgcn_image_sample # sample.LowerCaseMod 973 : AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>, 974 AMDGPUImageDMaskIntrinsic; 975 } 976 977 defm int_amdgcn_image_getlod 978 : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>, 979 AMDGPUImageDMaskIntrinsic; 980 981 ////////////////////////////////////////////////////////////////////////// 982 // getresinfo intrinsics 983 ////////////////////////////////////////////////////////////////////////// 984 foreach dim = AMDGPUDims.All in { 985 def !strconcat("int_amdgcn_image_getresinfo_", dim.Name) 986 : AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>, 987 AMDGPUImageDMaskIntrinsic; 988 } 989 990 ////////////////////////////////////////////////////////////////////////// 991 // gather4 intrinsics 992 ////////////////////////////////////////////////////////////////////////// 993 foreach sample = AMDGPUSampleVariantsNoGradients in { 994 foreach dim = [AMDGPUDim2D, AMDGPUDimCube, AMDGPUDim2DArray] in { 995 def int_amdgcn_image_gather4 # sample.LowerCaseMod # _ # dim.Name: 996 AMDGPUImageDimIntrinsic< 997 AMDGPUDimSampleProfile<"GATHER4" # sample.UpperCaseMod, dim, sample>, 998 [IntrReadMem], [SDNPMemOperand]>; 999 } 1000 } 1001} 1002 1003////////////////////////////////////////////////////////////////////////// 1004// atomic intrinsics 1005////////////////////////////////////////////////////////////////////////// 1006defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = { 1007 multiclass AMDGPUImageDimAtomicX<string opmod, list<AMDGPUArg> dataargs, 1008 int isFloat = 0> { 1009 foreach dim = AMDGPUDims.All in { 1010 def !strconcat(NAME, "_", dim.Name): AMDGPUImageDimIntrinsic< 1011 !if (isFloat, AMDGPUDimAtomicFloatProfile<opmod, dim, dataargs>, 1012 AMDGPUDimAtomicProfile<opmod, dim, dataargs>), 1013 [], [SDNPMemOperand]>; 1014 } 1015 } 1016 1017 multiclass AMDGPUImageDimAtomic<string opmod, int isFloat = 0> { 1018 defm "" 1019 : AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">], 1020 isFloat>; 1021 } 1022 1023 multiclass AMDGPUImageDimFloatAtomic<string opmod> { 1024 defm "" : AMDGPUImageDimAtomic<opmod, 1 /*isFloat*/>; 1025 } 1026 1027 defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAtomic<"ATOMIC_SWAP">; 1028 defm int_amdgcn_image_atomic_add : AMDGPUImageDimAtomic<"ATOMIC_ADD">; 1029 defm int_amdgcn_image_atomic_sub : AMDGPUImageDimAtomic<"ATOMIC_SUB">; 1030 defm int_amdgcn_image_atomic_smin : AMDGPUImageDimAtomic<"ATOMIC_SMIN">; 1031 defm int_amdgcn_image_atomic_umin : AMDGPUImageDimAtomic<"ATOMIC_UMIN">; 1032 defm int_amdgcn_image_atomic_fmin : AMDGPUImageDimFloatAtomic<"ATOMIC_FMIN">; 1033 defm int_amdgcn_image_atomic_smax : AMDGPUImageDimAtomic<"ATOMIC_SMAX">; 1034 defm int_amdgcn_image_atomic_umax : AMDGPUImageDimAtomic<"ATOMIC_UMAX">; 1035 defm int_amdgcn_image_atomic_fmax : AMDGPUImageDimFloatAtomic<"ATOMIC_FMAX">; 1036 defm int_amdgcn_image_atomic_and : AMDGPUImageDimAtomic<"ATOMIC_AND">; 1037 defm int_amdgcn_image_atomic_or : AMDGPUImageDimAtomic<"ATOMIC_OR">; 1038 defm int_amdgcn_image_atomic_xor : AMDGPUImageDimAtomic<"ATOMIC_XOR">; 1039 defm int_amdgcn_image_atomic_inc : AMDGPUImageDimAtomic<"ATOMIC_INC">; 1040 defm int_amdgcn_image_atomic_dec : AMDGPUImageDimAtomic<"ATOMIC_DEC">; 1041 defm int_amdgcn_image_atomic_add_flt : AMDGPUImageDimFloatAtomic<"ATOMIC_ADD_FLT">; 1042 defm int_amdgcn_image_atomic_min_flt : AMDGPUImageDimFloatAtomic<"ATOMIC_MIN_FLT">; 1043 defm int_amdgcn_image_atomic_max_flt : AMDGPUImageDimFloatAtomic<"ATOMIC_MAX_FLT">; 1044 1045 defm int_amdgcn_image_atomic_cmpswap : 1046 AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg<LLVMMatchType<0>, "src">, 1047 AMDGPUArg<LLVMMatchType<0>, "cmp">]>; 1048 1049 defm int_amdgcn_image_atomic_pk_add_f16 : AMDGPUImageDimFloatAtomic<"ATOMIC_PK_ADD_F16">; 1050 defm int_amdgcn_image_atomic_pk_add_bf16 : AMDGPUImageDimFloatAtomic<"ATOMIC_PK_ADD_BF16">; 1051} 1052 1053////////////////////////////////////////////////////////////////////////// 1054// Buffer intrinsics 1055////////////////////////////////////////////////////////////////////////// 1056 1057// Data type for buffer resources (V#). Maybe, in the future, we can create a 1058// similar one for textures (T#). 1059def AMDGPUBufferRsrcTy : LLVMQualPointerType<8>; 1060 1061let TargetPrefix = "amdgcn" in { 1062 1063def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic < 1064 [AMDGPUBufferRsrcTy], 1065 [llvm_anyptr_ty, // base 1066 llvm_i16_ty, // stride (and swizzle control) 1067 llvm_i32_ty, // NumRecords / extent 1068 llvm_i32_ty], // flags 1069 // Attributes lifted from ptrmask + some extra argument attributes. 1070 [IntrNoMem, ReadNone<ArgIndex<0>>, 1071 IntrSpeculatable, IntrWillReturn]>; 1072 1073defset list<AMDGPURsrcIntrinsic> AMDGPUBufferIntrinsics = { 1074 1075class AMDGPUBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1076 [data_ty], 1077 [llvm_v4i32_ty, // rsrc(SGPR) 1078 llvm_i32_ty, // vindex(VGPR) 1079 llvm_i32_ty, // offset(SGPR/VGPR/imm) 1080 llvm_i1_ty, // glc(imm) 1081 llvm_i1_ty], // slc(imm) 1082 [IntrReadMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, 1083 AMDGPURsrcIntrinsic<0>; 1084def int_amdgcn_buffer_load_format : AMDGPUBufferLoad<llvm_anyfloat_ty>; 1085def int_amdgcn_buffer_load : AMDGPUBufferLoad; 1086 1087// Generate a buffer_load instruction that may be optimized to s_buffer_load if 1088// the offset argument is uniform. 1089def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic < 1090 [llvm_any_ty], 1091 [llvm_v4i32_ty, // rsrc(SGPR) 1092 llvm_i32_ty, // byte offset 1093 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1094 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1095 // bit 3 = swz, bit 4 = scc (gfx90a) 1096 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1097 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1098 // bit 6 = swz 1099 // Note: volatile bit is **not** permitted here. 1100 [IntrNoMem, ImmArg<ArgIndex<2>>]>, 1101 AMDGPURsrcIntrinsic<0>; 1102 1103class AMDGPUBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1104 [], 1105 [data_ty, // vdata(VGPR) 1106 llvm_v4i32_ty, // rsrc(SGPR) 1107 llvm_i32_ty, // vindex(VGPR) 1108 llvm_i32_ty, // offset(SGPR/VGPR/imm) 1109 llvm_i1_ty, // glc(imm) 1110 llvm_i1_ty], // slc(imm) 1111 [IntrWriteMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, 1112 AMDGPURsrcIntrinsic<1>; 1113def int_amdgcn_buffer_store_format : AMDGPUBufferStore<llvm_anyfloat_ty>; 1114def int_amdgcn_buffer_store : AMDGPUBufferStore; 1115 1116// New buffer intrinsics with separate raw and struct variants. The raw 1117// variant never has an index. The struct variant always has an index, even if 1118// it is const 0. A struct intrinsic with constant 0 index is different to the 1119// corresponding raw intrinsic on gfx9+ because the behavior of bound checking 1120// and swizzling changes depending on whether idxen is set in the instruction. 1121// These new instrinsics also keep the offset and soffset arguments separate as 1122// they behave differently in bounds checking and swizzling. 1123 1124// The versions of these intrinsics that take <4 x i32> arguments are deprecated 1125// in favor of their .ptr.buffer variants that take ptr addrspace(8) arguments, 1126// which allow for improved reasoning about memory accesses. 1127// 1128// Note that in the cachepolicy for all these intrinsics, bit 31 is not preserved 1129// through to final assembly selection and is used to signal that the buffer 1130// operation is volatile. 1131class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1132 [data_ty], 1133 [llvm_v4i32_ty, // rsrc(SGPR) 1134 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1135 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1136 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1137 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1138 // bit 3 = swz, bit 4 = scc (gfx90a) 1139 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1140 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1141 // bit 6 = swz 1142 // all: volatile op (bit 31, stripped at lowering) 1143 [IntrReadMem, ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>, 1144 AMDGPURsrcIntrinsic<0>; 1145def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>; 1146def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad; 1147 1148class AMDGPURawPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1149 [data_ty], 1150 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1151 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1152 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1153 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1154 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1155 // bit 3 = swz, bit 4 = scc (gfx90a) 1156 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1157 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1158 // bit 6 = swz 1159 // all: volatile op (bit 31, stripped at lowering) 1160 [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, 1161 ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>, 1162 AMDGPURsrcIntrinsic<0>; 1163def int_amdgcn_raw_ptr_buffer_load_format : AMDGPURawPtrBufferLoad<llvm_anyfloat_ty>; 1164def int_amdgcn_raw_ptr_buffer_load : AMDGPURawPtrBufferLoad; 1165 1166class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1167 [data_ty], 1168 [llvm_v4i32_ty, // rsrc(SGPR) 1169 llvm_i32_ty, // vindex(VGPR) 1170 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1171 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1172 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1173 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1174 // bit 3 = swz, bit 4 = scc (gfx90a) 1175 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1176 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1177 // bit 6 = swz 1178 // all: volatile op (bit 31, stripped at lowering) 1179 [IntrReadMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, 1180 AMDGPURsrcIntrinsic<0>; 1181def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad; 1182def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad; 1183 1184class AMDGPUStructPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1185 [data_ty], 1186 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1187 llvm_i32_ty, // vindex(VGPR) 1188 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1189 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1190 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1191 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1192 // bit 3 = swz, bit 4 = scc (gfx90a) 1193 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1194 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1195 // bit 6 = swz 1196 // all: volatile op (bit 31, stripped at lowering) 1197 [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, 1198 ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, 1199 AMDGPURsrcIntrinsic<0>; 1200def int_amdgcn_struct_ptr_buffer_load_format : AMDGPUStructPtrBufferLoad; 1201def int_amdgcn_struct_ptr_buffer_load : AMDGPUStructPtrBufferLoad; 1202 1203class AMDGPURawBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1204 [], 1205 [data_ty, // vdata(VGPR) 1206 llvm_v4i32_ty, // rsrc(SGPR) 1207 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1208 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1209 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1210 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1211 // bit 3 = swz, bit 4 = scc (gfx90a) 1212 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1213 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1214 // bit 6 = swz 1215 // all: volatile op (bit 31, stripped at lowering) 1216 [IntrWriteMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, 1217 AMDGPURsrcIntrinsic<1>; 1218def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore<llvm_anyfloat_ty>; 1219def int_amdgcn_raw_buffer_store : AMDGPURawBufferStore; 1220 1221class AMDGPURawPtrBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1222 [], 1223 [data_ty, // vdata(VGPR) 1224 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1225 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1226 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1227 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1228 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1229 // bit 3 = swz, bit 4 = scc (gfx90a) 1230 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1231 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1232 // bit 6 = swz 1233 // all: volatile op (bit 31, stripped at lowering) 1234 [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>, 1235 ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, 1236 AMDGPURsrcIntrinsic<1>; 1237def int_amdgcn_raw_ptr_buffer_store_format : AMDGPURawPtrBufferStore<llvm_anyfloat_ty>; 1238def int_amdgcn_raw_ptr_buffer_store : AMDGPURawPtrBufferStore; 1239 1240class AMDGPUStructBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1241 [], 1242 [data_ty, // vdata(VGPR) 1243 llvm_v4i32_ty, // rsrc(SGPR) 1244 llvm_i32_ty, // vindex(VGPR) 1245 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1246 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1247 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1248 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1249 // bit 3 = swz, bit 4 = scc (gfx90a) 1250 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1251 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1252 // bit 6 = swz 1253 // all: volatile op (bit 31, stripped at lowering) 1254 [IntrWriteMem, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, 1255 AMDGPURsrcIntrinsic<1>; 1256def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore; 1257def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore; 1258 1259class AMDGPUStructPtrBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < 1260 [], 1261 [data_ty, // vdata(VGPR) 1262 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1263 llvm_i32_ty, // vindex(VGPR) 1264 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1265 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1266 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1267 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1268 // bit 3 = swz, bit 4 = scc (gfx90a) 1269 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1270 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1271 // bit 6 = swz 1272 // all: volatile op (bit 31, stripped at lowering) 1273 [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>, 1274 ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, 1275 AMDGPURsrcIntrinsic<1>; 1276def int_amdgcn_struct_ptr_buffer_store_format : AMDGPUStructPtrBufferStore; 1277def int_amdgcn_struct_ptr_buffer_store : AMDGPUStructPtrBufferStore; 1278 1279class AMDGPURawBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic < 1280 [data_ty], 1281 [LLVMMatchType<0>, // vdata(VGPR) 1282 llvm_v4i32_ty, // rsrc(SGPR) 1283 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1284 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1285 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1286 [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1287 AMDGPURsrcIntrinsic<1, 0>; 1288def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic; 1289def int_amdgcn_raw_buffer_atomic_add : AMDGPURawBufferAtomic; 1290def int_amdgcn_raw_buffer_atomic_sub : AMDGPURawBufferAtomic; 1291def int_amdgcn_raw_buffer_atomic_smin : AMDGPURawBufferAtomic; 1292def int_amdgcn_raw_buffer_atomic_umin : AMDGPURawBufferAtomic; 1293def int_amdgcn_raw_buffer_atomic_fmin : AMDGPURawBufferAtomic<llvm_anyfloat_ty>; 1294def int_amdgcn_raw_buffer_atomic_smax : AMDGPURawBufferAtomic; 1295def int_amdgcn_raw_buffer_atomic_umax : AMDGPURawBufferAtomic; 1296def int_amdgcn_raw_buffer_atomic_fmax : AMDGPURawBufferAtomic<llvm_anyfloat_ty>; 1297def int_amdgcn_raw_buffer_atomic_and : AMDGPURawBufferAtomic; 1298def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic; 1299def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic; 1300def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic; 1301def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic; 1302def int_amdgcn_raw_buffer_atomic_cond_sub_u32 : AMDGPURawBufferAtomic; 1303def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic< 1304 [llvm_anyint_ty], 1305 [LLVMMatchType<0>, // src(VGPR) 1306 LLVMMatchType<0>, // cmp(VGPR) 1307 llvm_v4i32_ty, // rsrc(SGPR) 1308 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1309 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1310 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1311 [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1312 AMDGPURsrcIntrinsic<2, 0>; 1313 1314class AMDGPURawPtrBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic < 1315 [data_ty], 1316 [LLVMMatchType<0>, // vdata(VGPR) 1317 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1318 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1319 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1320 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1321 [IntrArgMemOnly, NoCapture<ArgIndex<1>>, 1322 ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1323 AMDGPURsrcIntrinsic<1, 0>; 1324 1325def int_amdgcn_raw_ptr_buffer_atomic_swap : AMDGPURawPtrBufferAtomic; 1326def int_amdgcn_raw_ptr_buffer_atomic_add : AMDGPURawPtrBufferAtomic; 1327def int_amdgcn_raw_ptr_buffer_atomic_sub : AMDGPURawPtrBufferAtomic; 1328def int_amdgcn_raw_ptr_buffer_atomic_smin : AMDGPURawPtrBufferAtomic; 1329def int_amdgcn_raw_ptr_buffer_atomic_umin : AMDGPURawPtrBufferAtomic; 1330def int_amdgcn_raw_ptr_buffer_atomic_fmin : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>; 1331def int_amdgcn_raw_ptr_buffer_atomic_smax : AMDGPURawPtrBufferAtomic; 1332def int_amdgcn_raw_ptr_buffer_atomic_umax : AMDGPURawPtrBufferAtomic; 1333def int_amdgcn_raw_ptr_buffer_atomic_fmax : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>; 1334def int_amdgcn_raw_ptr_buffer_atomic_and : AMDGPURawPtrBufferAtomic; 1335def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic; 1336def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic; 1337def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic; 1338def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic; 1339def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic; 1340def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic< 1341 [llvm_anyint_ty], 1342 [LLVMMatchType<0>, // src(VGPR) 1343 LLVMMatchType<0>, // cmp(VGPR) 1344 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1345 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1346 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1347 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1348 [IntrArgMemOnly, NoCapture<ArgIndex<2>>, 1349 ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1350 AMDGPURsrcIntrinsic<2, 0>; 1351 1352// gfx908 intrinsic 1353def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>; 1354def int_amdgcn_raw_ptr_buffer_atomic_fadd : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>; 1355// gfx12+ intrinsic 1356def int_amdgcn_raw_buffer_atomic_fadd_v2bf16 : Intrinsic < 1357 [llvm_v2bf16_ty], 1358 [llvm_v2bf16_ty, 1359 llvm_v4i32_ty, 1360 llvm_i32_ty, 1361 llvm_i32_ty, 1362 llvm_i32_ty], 1363 [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1364 AMDGPURsrcIntrinsic<1, 0>; 1365def int_amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16 : Intrinsic < 1366 [llvm_v2bf16_ty], 1367 [llvm_v2bf16_ty, 1368 AMDGPUBufferRsrcTy, 1369 llvm_i32_ty, 1370 llvm_i32_ty, 1371 llvm_i32_ty], 1372 [IntrArgMemOnly, NoCapture<ArgIndex<1>>, 1373 ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1374 AMDGPURsrcIntrinsic<1, 0>; 1375 1376class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic < 1377 [data_ty], 1378 [LLVMMatchType<0>, // vdata(VGPR) 1379 llvm_v4i32_ty, // rsrc(SGPR) 1380 llvm_i32_ty, // vindex(VGPR) 1381 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1382 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1383 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1384 [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1385 AMDGPURsrcIntrinsic<1, 0>; 1386def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic; 1387def int_amdgcn_struct_buffer_atomic_add : AMDGPUStructBufferAtomic; 1388def int_amdgcn_struct_buffer_atomic_sub : AMDGPUStructBufferAtomic; 1389def int_amdgcn_struct_buffer_atomic_smin : AMDGPUStructBufferAtomic; 1390def int_amdgcn_struct_buffer_atomic_umin : AMDGPUStructBufferAtomic; 1391def int_amdgcn_struct_buffer_atomic_smax : AMDGPUStructBufferAtomic; 1392def int_amdgcn_struct_buffer_atomic_umax : AMDGPUStructBufferAtomic; 1393def int_amdgcn_struct_buffer_atomic_and : AMDGPUStructBufferAtomic; 1394def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic; 1395def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic; 1396def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic; 1397def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic; 1398def int_amdgcn_struct_buffer_atomic_cond_sub_u32 : AMDGPUStructBufferAtomic; 1399def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic< 1400 [llvm_anyint_ty], 1401 [LLVMMatchType<0>, // src(VGPR) 1402 LLVMMatchType<0>, // cmp(VGPR) 1403 llvm_v4i32_ty, // rsrc(SGPR) 1404 llvm_i32_ty, // vindex(VGPR) 1405 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1406 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1407 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1408 [ImmArg<ArgIndex<6>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1409 AMDGPURsrcIntrinsic<2, 0>; 1410 1411class AMDGPUStructPtrBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic < 1412 [data_ty], 1413 [LLVMMatchType<0>, // vdata(VGPR) 1414 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1415 llvm_i32_ty, // vindex(VGPR) 1416 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1417 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1418 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1419 [IntrArgMemOnly, NoCapture<ArgIndex<1>>, 1420 ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1421 AMDGPURsrcIntrinsic<1, 0>; 1422def int_amdgcn_struct_ptr_buffer_atomic_swap : AMDGPUStructPtrBufferAtomic; 1423def int_amdgcn_struct_ptr_buffer_atomic_add : AMDGPUStructPtrBufferAtomic; 1424def int_amdgcn_struct_ptr_buffer_atomic_sub : AMDGPUStructPtrBufferAtomic; 1425def int_amdgcn_struct_ptr_buffer_atomic_smin : AMDGPUStructPtrBufferAtomic; 1426def int_amdgcn_struct_ptr_buffer_atomic_umin : AMDGPUStructPtrBufferAtomic; 1427def int_amdgcn_struct_ptr_buffer_atomic_smax : AMDGPUStructPtrBufferAtomic; 1428def int_amdgcn_struct_ptr_buffer_atomic_umax : AMDGPUStructPtrBufferAtomic; 1429def int_amdgcn_struct_ptr_buffer_atomic_and : AMDGPUStructPtrBufferAtomic; 1430def int_amdgcn_struct_ptr_buffer_atomic_or : AMDGPUStructPtrBufferAtomic; 1431def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic; 1432def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic; 1433def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic; 1434def int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32 : AMDGPUStructPtrBufferAtomic; 1435def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic< 1436 [llvm_anyint_ty], 1437 [LLVMMatchType<0>, // src(VGPR) 1438 LLVMMatchType<0>, // cmp(VGPR) 1439 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1440 llvm_i32_ty, // vindex(VGPR) 1441 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1442 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1443 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile) 1444 [IntrArgMemOnly, NoCapture<ArgIndex<2>>, 1445 ImmArg<ArgIndex<6>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1446 AMDGPURsrcIntrinsic<2, 0>; 1447 1448// gfx908 intrinsic 1449def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>; 1450def int_amdgcn_struct_ptr_buffer_atomic_fadd : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>; 1451// gfx12 intrinsic 1452def int_amdgcn_struct_buffer_atomic_fadd_v2bf16 : Intrinsic < 1453 [llvm_v2bf16_ty], 1454 [llvm_v2bf16_ty, 1455 llvm_v4i32_ty, 1456 llvm_i32_ty, 1457 llvm_i32_ty, 1458 llvm_i32_ty, 1459 llvm_i32_ty], 1460 [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1461 AMDGPURsrcIntrinsic<1, 0>; 1462def int_amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16 : Intrinsic < 1463 [llvm_v2bf16_ty], 1464 [llvm_v2bf16_ty, 1465 AMDGPUBufferRsrcTy, 1466 llvm_i32_ty, 1467 llvm_i32_ty, 1468 llvm_i32_ty, 1469 llvm_i32_ty], 1470 [IntrArgMemOnly, NoCapture<ArgIndex<1>>, 1471 ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1472 AMDGPURsrcIntrinsic<1, 0>; 1473 1474// gfx90a intrinsics 1475def int_amdgcn_struct_buffer_atomic_fmin : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>; 1476def int_amdgcn_struct_buffer_atomic_fmax : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>; 1477 1478def int_amdgcn_struct_ptr_buffer_atomic_fmin : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>; 1479def int_amdgcn_struct_ptr_buffer_atomic_fmax : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>; 1480 1481// Obsolescent tbuffer intrinsics. 1482def int_amdgcn_tbuffer_load : DefaultAttrsIntrinsic < 1483 [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1484 [llvm_v4i32_ty, // rsrc(SGPR) 1485 llvm_i32_ty, // vindex(VGPR) 1486 llvm_i32_ty, // voffset(VGPR) 1487 llvm_i32_ty, // soffset(SGPR) 1488 llvm_i32_ty, // offset(imm) 1489 llvm_i32_ty, // dfmt(imm) 1490 llvm_i32_ty, // nfmt(imm) 1491 llvm_i1_ty, // glc(imm) 1492 llvm_i1_ty], // slc(imm) 1493 [IntrReadMem, 1494 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, 1495 ImmArg<ArgIndex<7>>, ImmArg<ArgIndex<8>>], "", [SDNPMemOperand]>, 1496 AMDGPURsrcIntrinsic<0>; 1497 1498def int_amdgcn_tbuffer_store : DefaultAttrsIntrinsic < 1499 [], 1500 [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1501 llvm_v4i32_ty, // rsrc(SGPR) 1502 llvm_i32_ty, // vindex(VGPR) 1503 llvm_i32_ty, // voffset(VGPR) 1504 llvm_i32_ty, // soffset(SGPR) 1505 llvm_i32_ty, // offset(imm) 1506 llvm_i32_ty, // dfmt(imm) 1507 llvm_i32_ty, // nfmt(imm) 1508 llvm_i1_ty, // glc(imm) 1509 llvm_i1_ty], // slc(imm) 1510 [IntrWriteMem, ImmArg<ArgIndex<5>>, 1511 ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>, 1512 ImmArg<ArgIndex<8>>, ImmArg<ArgIndex<9>>], "", [SDNPMemOperand]>, 1513 AMDGPURsrcIntrinsic<1>; 1514 1515// New tbuffer intrinsics, with: 1516// - raw and struct variants 1517// - joint format field 1518// - joint cachepolicy field 1519def int_amdgcn_raw_tbuffer_load : DefaultAttrsIntrinsic < 1520 [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1521 [llvm_v4i32_ty, // rsrc(SGPR) 1522 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1523 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1524 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1525 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1526 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1527 // bit 3 = swz, bit 4 = scc (gfx90a) 1528 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1529 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1530 // bit 6 = swz 1531 [IntrReadMem, 1532 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, 1533 AMDGPURsrcIntrinsic<0>; 1534 1535def int_amdgcn_raw_ptr_tbuffer_load : DefaultAttrsIntrinsic < 1536 [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1537 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1538 llvm_i32_ty, // offset(VGPR/imm, included in bounds` checking and swizzling) 1539 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1540 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1541 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1542 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1543 // bit 3 = swz, bit 4 = scc (gfx90a) 1544 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1545 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1546 // bit 6 = swz 1547 // all: volatile op (bit 31, stripped at lowering) 1548 [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, 1549 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, 1550 AMDGPURsrcIntrinsic<0>; 1551 1552def int_amdgcn_raw_tbuffer_store : DefaultAttrsIntrinsic < 1553 [], 1554 [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1555 llvm_v4i32_ty, // rsrc(SGPR) 1556 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1557 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1558 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1559 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1560 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1561 // bit 3 = swz, bit 4 = scc (gfx90a) 1562 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1563 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1564 // bit 6 = swz 1565 // all: volatile op (bit 31, stripped at lowering) 1566 [IntrWriteMem, 1567 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, 1568 AMDGPURsrcIntrinsic<1>; 1569 1570def int_amdgcn_raw_ptr_tbuffer_store : DefaultAttrsIntrinsic < 1571 [], 1572 [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1573 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1574 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1575 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1576 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1577 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1578 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1579 // bit 3 = swz, bit 4 = scc (gfx90a) 1580 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1581 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1582 // bit 6 = swz 1583 // all: volatile op (bit 31, stripped at lowering) 1584 [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>, 1585 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, 1586 AMDGPURsrcIntrinsic<1>; 1587 1588def int_amdgcn_struct_tbuffer_load : DefaultAttrsIntrinsic < 1589 [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1590 [llvm_v4i32_ty, // rsrc(SGPR) 1591 llvm_i32_ty, // vindex(VGPR) 1592 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1593 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1594 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1595 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1596 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1597 // bit 3 = swz, bit 4 = scc (gfx90a) 1598 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1599 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1600 // bit 6 = swz 1601 // all: volatile op (bit 31, stripped at lowering) 1602 [IntrReadMem, 1603 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, 1604 AMDGPURsrcIntrinsic<0>; 1605 1606def int_amdgcn_struct_ptr_tbuffer_load : DefaultAttrsIntrinsic < 1607 [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1608 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1609 llvm_i32_ty, // vindex(VGPR) 1610 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1611 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1612 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1613 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1614 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1615 // bit 3 = swz, bit 4 = scc (gfx90a) 1616 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1617 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1618 // bit 6 = swz 1619 // all: volatile op (bit 31, stripped at lowering) 1620 [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, 1621 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, 1622 AMDGPURsrcIntrinsic<0>; 1623 1624def int_amdgcn_struct_ptr_tbuffer_store : DefaultAttrsIntrinsic < 1625 [], 1626 [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1627 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1628 llvm_i32_ty, // vindex(VGPR) 1629 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1630 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1631 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1632 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1633 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1634 // bit 3 = swz, bit 4 = scc (gfx90a) 1635 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1636 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1637 // bit 6 = swz 1638 // all: volatile op (bit 31, stripped at lowering) 1639 [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>, 1640 ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>, 1641 AMDGPURsrcIntrinsic<1>; 1642 1643def int_amdgcn_struct_tbuffer_store : DefaultAttrsIntrinsic < 1644 [], 1645 [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1646 llvm_v4i32_ty, // rsrc(SGPR) 1647 llvm_i32_ty, // vindex(VGPR) 1648 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1649 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1650 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1651 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1652 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1653 // bit 3 = swz, bit 4 = scc (gfx90a) 1654 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1655 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1656 // bit 6 = swz 1657 // all: volatile op (bit 31, stripped at lowering) 1658 [IntrWriteMem, 1659 ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>, 1660 AMDGPURsrcIntrinsic<1>; 1661 1662class AMDGPUBufferAtomic : Intrinsic < 1663 [llvm_anyint_ty], 1664 [LLVMMatchType<0>, // vdata(VGPR) 1665 llvm_v4i32_ty, // rsrc(SGPR) 1666 llvm_i32_ty, // vindex(VGPR) 1667 llvm_i32_ty, // offset(SGPR/VGPR/imm) 1668 llvm_i1_ty], // slc(imm) 1669 [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1670 AMDGPURsrcIntrinsic<1, 0>; 1671def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic; 1672def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic; 1673def int_amdgcn_buffer_atomic_sub : AMDGPUBufferAtomic; 1674def int_amdgcn_buffer_atomic_smin : AMDGPUBufferAtomic; 1675def int_amdgcn_buffer_atomic_umin : AMDGPUBufferAtomic; 1676def int_amdgcn_buffer_atomic_smax : AMDGPUBufferAtomic; 1677def int_amdgcn_buffer_atomic_umax : AMDGPUBufferAtomic; 1678def int_amdgcn_buffer_atomic_and : AMDGPUBufferAtomic; 1679def int_amdgcn_buffer_atomic_or : AMDGPUBufferAtomic; 1680def int_amdgcn_buffer_atomic_xor : AMDGPUBufferAtomic; 1681def int_amdgcn_buffer_atomic_cmpswap : Intrinsic< 1682 [llvm_i32_ty], 1683 [llvm_i32_ty, // src(VGPR) 1684 llvm_i32_ty, // cmp(VGPR) 1685 llvm_v4i32_ty, // rsrc(SGPR) 1686 llvm_i32_ty, // vindex(VGPR) 1687 llvm_i32_ty, // offset(SGPR/VGPR/imm) 1688 llvm_i1_ty], // slc(imm) 1689 [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1690 AMDGPURsrcIntrinsic<2, 0>; 1691 1692def int_amdgcn_buffer_atomic_csub : AMDGPUBufferAtomic; 1693 1694class AMDGPUBufferAtomicFP : Intrinsic < 1695 [llvm_anyfloat_ty], 1696 [LLVMMatchType<0>, // vdata(VGPR) 1697 llvm_v4i32_ty, // rsrc(SGPR) 1698 llvm_i32_ty, // vindex(VGPR) 1699 llvm_i32_ty, // offset(SGPR/VGPR/imm) 1700 llvm_i1_ty], // slc(imm) 1701 [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1702 AMDGPURsrcIntrinsic<1, 0>; 1703 1704// Legacy form of the intrinsic. raw and struct forms should be preferred. 1705def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP; 1706 1707class AMDGPURawBufferLoadLDS : Intrinsic < 1708 [], 1709 [llvm_v4i32_ty, // rsrc(SGPR) 1710 LLVMQualPointerType<3>, // LDS base offset 1711 llvm_i32_ty, // Data byte size: 1/2/4 1712 llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) 1713 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1714 llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) 1715 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1716 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1717 // bit 3 = swz, bit 4 = scc (gfx90a) 1718 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1719 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1720 // bit 6 = swz 1721 // all: volatile op (bit 31, stripped at lowering) 1722 [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, 1723 ImmArg<ArgIndex<6>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; 1724def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS; 1725 1726class AMDGPURawPtrBufferLoadLDS : Intrinsic < 1727 [], 1728 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1729 LLVMQualPointerType<3>, // LDS base offset 1730 llvm_i32_ty, // Data byte size: 1/2/4 1731 llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) 1732 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1733 llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) 1734 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1735 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1736 // bit 3 = swz, bit 4 = scc (gfx90a) 1737 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1738 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1739 // bit 6 = swz 1740 // all: volatile op (bit 31, stripped at lowering) 1741 [IntrWillReturn, IntrArgMemOnly, 1742 ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, 1743 WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>, 1744 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, 1745 ImmArg<ArgIndex<6>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; 1746def int_amdgcn_raw_ptr_buffer_load_lds : AMDGPURawPtrBufferLoadLDS; 1747 1748class AMDGPUStructBufferLoadLDS : Intrinsic < 1749 [], 1750 [llvm_v4i32_ty, // rsrc(SGPR) 1751 LLVMQualPointerType<3>, // LDS base offset 1752 llvm_i32_ty, // Data byte size: 1/2/4 1753 llvm_i32_ty, // vindex(VGPR) 1754 llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) 1755 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1756 llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) 1757 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1758 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1759 // bit 3 = swz, bit 4 = scc (gfx90a) 1760 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1761 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1762 // bit 6 = swz 1763 // all: volatile op (bit 31, stripped at lowering) 1764 [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>, 1765 ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; 1766def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS; 1767 1768class AMDGPUStructPtrBufferLoadLDS : Intrinsic < 1769 [], 1770 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1771 LLVMQualPointerType<3>, // LDS base offset 1772 llvm_i32_ty, // Data byte size: 1/2/4 1773 llvm_i32_ty, // vindex(VGPR) 1774 llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) 1775 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1776 llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) 1777 llvm_i32_ty], // auxiliary/cachepolicy(imm): 1778 // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), 1779 // bit 3 = swz, bit 4 = scc (gfx90a) 1780 // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 1781 // gfx12+: bits [0-2] = th, bits [3-4] = scope, 1782 // bit 6 = swz 1783 // all: volatile op (bit 31, stripped at lowering) 1784 [IntrWillReturn, IntrArgMemOnly, 1785 ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, 1786 WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>, 1787 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>, 1788 ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; 1789def int_amdgcn_struct_ptr_buffer_load_lds : AMDGPUStructPtrBufferLoadLDS; 1790 1791} // defset AMDGPUBufferIntrinsics 1792 1793// Uses that do not set the done bit should set IntrWriteMem on the 1794// call site. 1795def int_amdgcn_exp : DefaultAttrsIntrinsic <[], [ 1796 llvm_i32_ty, // tgt, 1797 llvm_i32_ty, // en 1798 llvm_any_ty, // src0 (f32 or i32) 1799 LLVMMatchType<0>, // src1 1800 LLVMMatchType<0>, // src2 1801 LLVMMatchType<0>, // src3 1802 llvm_i1_ty, // done 1803 llvm_i1_ty // vm (ignored on GFX11+) 1804 ], 1805 [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>, 1806 ImmArg<ArgIndex<7>>, IntrWriteMem, IntrInaccessibleMemOnly] 1807>; 1808 1809// exp with row_en bit set. Only supported on GFX11+. 1810def int_amdgcn_exp_row : DefaultAttrsIntrinsic <[], [ 1811 llvm_i32_ty, // tgt, 1812 llvm_i32_ty, // en 1813 llvm_any_ty, // src0 (f32 or i32) 1814 LLVMMatchType<0>, // src1 1815 LLVMMatchType<0>, // src2 1816 LLVMMatchType<0>, // src3 1817 llvm_i1_ty, // done 1818 llvm_i32_ty], // row number 1819 [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>, 1820 IntrWriteMem, IntrInaccessibleMemOnly] 1821>; 1822 1823// exp with compr bit set. Not supported on GFX11+. 1824def int_amdgcn_exp_compr : DefaultAttrsIntrinsic <[], [ 1825 llvm_i32_ty, // tgt, 1826 llvm_i32_ty, // en 1827 llvm_anyvector_ty, // src0 (v2f16 or v2i16) 1828 LLVMMatchType<0>, // src1 1829 llvm_i1_ty, // done 1830 llvm_i1_ty], // vm 1831 [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>, 1832 ImmArg<ArgIndex<5>>, IntrWriteMem, IntrInaccessibleMemOnly] 1833>; 1834 1835def int_amdgcn_buffer_wbinvl1_sc : 1836 ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">, 1837 DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; 1838 1839def int_amdgcn_buffer_wbinvl1 : 1840 ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1">, 1841 DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; 1842 1843def int_amdgcn_s_dcache_inv : 1844 ClangBuiltin<"__builtin_amdgcn_s_dcache_inv">, 1845 DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; 1846 1847def int_amdgcn_s_memtime : 1848 ClangBuiltin<"__builtin_amdgcn_s_memtime">, 1849 DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects]>; 1850 1851def int_amdgcn_s_sleep : 1852 ClangBuiltin<"__builtin_amdgcn_s_sleep">, 1853 DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 1854 IntrHasSideEffects]> { 1855} 1856 1857def int_amdgcn_s_sleep_var 1858 : ClangBuiltin<"__builtin_amdgcn_s_sleep_var">, 1859 Intrinsic<[], [llvm_i32_ty], 1860 [IntrNoMem, IntrHasSideEffects, IntrWillReturn]> { 1861} 1862 1863def int_amdgcn_s_nop : 1864 DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 1865 IntrHasSideEffects]> { 1866} 1867 1868def int_amdgcn_s_incperflevel : 1869 ClangBuiltin<"__builtin_amdgcn_s_incperflevel">, 1870 DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 1871 IntrHasSideEffects]> { 1872} 1873 1874def int_amdgcn_s_decperflevel : 1875 ClangBuiltin<"__builtin_amdgcn_s_decperflevel">, 1876 DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 1877 IntrHasSideEffects]> { 1878} 1879 1880def int_amdgcn_s_sethalt : 1881 DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 1882 IntrHasSideEffects]>; 1883 1884def int_amdgcn_s_setprio : 1885 ClangBuiltin<"__builtin_amdgcn_s_setprio">, 1886 DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 1887 IntrHasSideEffects]>; 1888 1889def int_amdgcn_s_ttracedata : 1890 ClangBuiltin<"__builtin_amdgcn_s_ttracedata">, 1891 DefaultAttrsIntrinsic<[], [llvm_i32_ty], 1892 [IntrNoMem, IntrHasSideEffects]>; 1893 1894def int_amdgcn_s_ttracedata_imm : 1895 ClangBuiltin<"__builtin_amdgcn_s_ttracedata_imm">, 1896 DefaultAttrsIntrinsic<[], [llvm_i16_ty], 1897 [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>; 1898 1899// This is IntrHasSideEffects so it can be used to read cycle counters. 1900def int_amdgcn_s_getreg : 1901 ClangBuiltin<"__builtin_amdgcn_s_getreg">, 1902 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], 1903 [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>] 1904>; 1905 1906// Note this can be used to set FP environment properties that are 1907// unsafe to change in non-strictfp functions. The register properties 1908// available (and value required to access them) may differ per 1909// subtarget. llvm.amdgcn.s.setreg(hwmode, value) 1910def int_amdgcn_s_setreg : 1911 ClangBuiltin<"__builtin_amdgcn_s_setreg">, 1912 DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_i32_ty], 1913 [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>] 1914>; 1915 1916// int_amdgcn_s_getpc is provided to allow a specific style of position 1917// independent code to determine the high part of its address when it is 1918// known (through convention) that the code and any data of interest does 1919// not cross a 4Gb address boundary. Use for any other purpose may not 1920// produce the desired results as optimizations may cause code movement, 1921// especially as we explicitly use IntrNoMem to allow optimizations. 1922// This intrinsic always returns PC sign-extended from 48 bits even if the 1923// s_getpc_b64 instruction returns a zero-extended value. 1924def int_amdgcn_s_getpc : 1925 ClangBuiltin<"__builtin_amdgcn_s_getpc">, 1926 DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable, 1927 IntrWillReturn]>; 1928 1929// __builtin_amdgcn_interp_mov <param>, <attr_chan>, <attr>, <m0> 1930// param values: 0 = P10, 1 = P20, 2 = P0 1931def int_amdgcn_interp_mov : 1932 ClangBuiltin<"__builtin_amdgcn_interp_mov">, 1933 DefaultAttrsIntrinsic<[llvm_float_ty], 1934 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 1935 [IntrNoMem, IntrSpeculatable, 1936 ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>; 1937 1938// __builtin_amdgcn_interp_p1 <i>, <attr_chan>, <attr>, <m0> 1939// This intrinsic reads from lds, but the memory values are constant, 1940// so it behaves like IntrNoMem. 1941def int_amdgcn_interp_p1 : 1942 ClangBuiltin<"__builtin_amdgcn_interp_p1">, 1943 DefaultAttrsIntrinsic<[llvm_float_ty], 1944 [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 1945 [IntrNoMem, IntrSpeculatable, 1946 ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>; 1947 1948// __builtin_amdgcn_interp_p2 <p1>, <j>, <attr_chan>, <attr>, <m0> 1949def int_amdgcn_interp_p2 : 1950 ClangBuiltin<"__builtin_amdgcn_interp_p2">, 1951 DefaultAttrsIntrinsic<[llvm_float_ty], 1952 [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 1953 [IntrNoMem, IntrSpeculatable, 1954 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>; 1955 // See int_amdgcn_v_interp_p1 for why this is IntrNoMem. 1956 1957// __builtin_amdgcn_interp_p1_f16 <i>, <attr_chan>, <attr>, <high>, <m0> 1958// high selects whether high or low 16-bits are loaded from LDS 1959def int_amdgcn_interp_p1_f16 : 1960 ClangBuiltin<"__builtin_amdgcn_interp_p1_f16">, 1961 DefaultAttrsIntrinsic<[llvm_float_ty], 1962 [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty], 1963 [IntrNoMem, IntrSpeculatable, 1964 ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>; 1965 1966// __builtin_amdgcn_interp_p2_f16 <p1>, <j>, <attr_chan>, <attr>, <high>, <m0> 1967// high selects whether high or low 16-bits are loaded from LDS 1968def int_amdgcn_interp_p2_f16 : 1969 ClangBuiltin<"__builtin_amdgcn_interp_p2_f16">, 1970 DefaultAttrsIntrinsic<[llvm_half_ty], 1971 [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty], 1972 [IntrNoMem, IntrSpeculatable, 1973 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>; 1974 1975// llvm.amdgcn.lds.direct.load <m0> 1976// The input argument is m0, which contains a packed combination of address 1977// offset and flags describing the data type. 1978def int_amdgcn_lds_direct_load : 1979 DefaultAttrsIntrinsic<[llvm_any_ty], // overloaded for types u8, u16, i32/f32, i8, i16 1980 [llvm_i32_ty], 1981 [IntrReadMem, IntrSpeculatable]>; 1982 1983// llvm.amdgcn.lds.param.load <attr_chan>, <attr>, <m0> 1984// Like interp intrinsics, this reads from lds, but the memory values are constant, 1985// so it behaves like IntrNoMem. 1986def int_amdgcn_lds_param_load : 1987 DefaultAttrsIntrinsic<[llvm_float_ty], 1988 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 1989 [IntrNoMem, IntrSpeculatable, 1990 ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>; 1991 1992// llvm.amdgcn.interp.inreg.p10 <p>, <i>, <p0> 1993def int_amdgcn_interp_inreg_p10 : 1994 DefaultAttrsIntrinsic<[llvm_float_ty], 1995 [llvm_float_ty, llvm_float_ty, llvm_float_ty], 1996 [IntrNoMem, IntrSpeculatable]>; 1997 1998// llvm.amdgcn.interp.inreg.p2 <p>, <j>, <tmp> 1999def int_amdgcn_interp_inreg_p2 : 2000 DefaultAttrsIntrinsic<[llvm_float_ty], 2001 [llvm_float_ty, llvm_float_ty, llvm_float_ty], 2002 [IntrNoMem, IntrSpeculatable]>; 2003 2004// llvm.amdgcn.interp.inreg.p10.f16 <p>, <i>, <p0>, <high> 2005// high selects whether high or low 16-bits are used for p and p0 operands 2006def int_amdgcn_interp_inreg_p10_f16: 2007 DefaultAttrsIntrinsic<[llvm_float_ty], 2008 [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty], 2009 [IntrNoMem, IntrSpeculatable, 2010 ImmArg<ArgIndex<3>>]>; 2011 2012// llvm.amdgcn.interp.inreg.p2.f16 <p>, <j>, <tmp>, <high> 2013// high selects whether high or low 16-bits are used for p operand 2014def int_amdgcn_interp_inreg_p2_f16 : 2015 DefaultAttrsIntrinsic<[llvm_half_ty], 2016 [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty], 2017 [IntrNoMem, IntrSpeculatable, 2018 ImmArg<ArgIndex<3>>]>; 2019 2020// llvm.amdgcn.interp.p10.rtz.f16 <p>, <i>, <p0>, <high> 2021// gfx11+ fp16 interpolation intrinsic, with round-toward-zero rounding mode. 2022// high selects whether high or low 16-bits are used for p and p0 operands 2023def int_amdgcn_interp_p10_rtz_f16: 2024 DefaultAttrsIntrinsic<[llvm_float_ty], 2025 [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty], 2026 [IntrNoMem, IntrSpeculatable, 2027 ImmArg<ArgIndex<3>>]>; 2028 2029// llvm.amdgcn.interp.p2.rtz.f16 <p>, <j>, <tmp>, <high> 2030// gfx11+ fp16 interpolation intrinsic, with round-toward-zero rounding mode. 2031// high selects whether high or low 16-bits are used for p operand 2032def int_amdgcn_interp_p2_rtz_f16 : 2033 DefaultAttrsIntrinsic<[llvm_half_ty], 2034 [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty], 2035 [IntrNoMem, IntrSpeculatable, 2036 ImmArg<ArgIndex<3>>]>; 2037 2038// Deprecated: use llvm.amdgcn.live.mask instead. 2039def int_amdgcn_ps_live : DefaultAttrsIntrinsic < 2040 [llvm_i1_ty], 2041 [], 2042 [IntrNoMem]>; 2043 2044// Query currently live lanes. 2045// Returns true if lane is live (and not a helper lane). 2046def int_amdgcn_live_mask : DefaultAttrsIntrinsic <[llvm_i1_ty], 2047 [], [IntrReadMem, IntrInaccessibleMemOnly] 2048>; 2049 2050def int_amdgcn_mbcnt_lo : 2051 ClangBuiltin<"__builtin_amdgcn_mbcnt_lo">, 2052 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 2053 [IntrNoMem]>; 2054 2055def int_amdgcn_mbcnt_hi : 2056 ClangBuiltin<"__builtin_amdgcn_mbcnt_hi">, 2057 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 2058 [IntrNoMem]>; 2059 2060// llvm.amdgcn.ds.swizzle src offset 2061def int_amdgcn_ds_swizzle : 2062 ClangBuiltin<"__builtin_amdgcn_ds_swizzle">, 2063 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 2064 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, 2065 ImmArg<ArgIndex<1>>]>; 2066 2067def int_amdgcn_ubfe : DefaultAttrsIntrinsic<[llvm_anyint_ty], 2068 [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], 2069 [IntrNoMem, IntrSpeculatable] 2070>; 2071 2072def int_amdgcn_sbfe : DefaultAttrsIntrinsic<[llvm_anyint_ty], 2073 [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], 2074 [IntrNoMem, IntrSpeculatable] 2075>; 2076 2077def int_amdgcn_lerp : 2078 ClangBuiltin<"__builtin_amdgcn_lerp">, 2079 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2080 [IntrNoMem, IntrSpeculatable] 2081>; 2082 2083def int_amdgcn_sad_u8 : 2084 ClangBuiltin<"__builtin_amdgcn_sad_u8">, 2085 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2086 [IntrNoMem, IntrSpeculatable] 2087>; 2088 2089def int_amdgcn_msad_u8 : 2090 ClangBuiltin<"__builtin_amdgcn_msad_u8">, 2091 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2092 [IntrNoMem, IntrSpeculatable] 2093>; 2094 2095def int_amdgcn_sad_hi_u8 : 2096 ClangBuiltin<"__builtin_amdgcn_sad_hi_u8">, 2097 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2098 [IntrNoMem, IntrSpeculatable] 2099>; 2100 2101def int_amdgcn_sad_u16 : 2102 ClangBuiltin<"__builtin_amdgcn_sad_u16">, 2103 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2104 [IntrNoMem, IntrSpeculatable] 2105>; 2106 2107def int_amdgcn_qsad_pk_u16_u8 : 2108 ClangBuiltin<"__builtin_amdgcn_qsad_pk_u16_u8">, 2109 DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], 2110 [IntrNoMem, IntrSpeculatable] 2111>; 2112 2113def int_amdgcn_mqsad_pk_u16_u8 : 2114 ClangBuiltin<"__builtin_amdgcn_mqsad_pk_u16_u8">, 2115 DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], 2116 [IntrNoMem, IntrSpeculatable] 2117>; 2118 2119def int_amdgcn_mqsad_u32_u8 : 2120 ClangBuiltin<"__builtin_amdgcn_mqsad_u32_u8">, 2121 DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty], 2122 [IntrNoMem, IntrSpeculatable] 2123>; 2124 2125def int_amdgcn_cvt_pk_u8_f32 : 2126 ClangBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">, 2127 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], 2128 [IntrNoMem, IntrSpeculatable] 2129>; 2130 2131def int_amdgcn_icmp : 2132 Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>, llvm_i32_ty], 2133 [IntrNoMem, IntrConvergent, 2134 ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2135 2136def int_amdgcn_fcmp : 2137 Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>, llvm_i32_ty], 2138 [IntrNoMem, IntrConvergent, 2139 ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2140 2141def int_amdgcn_ballot : 2142 Intrinsic<[llvm_anyint_ty], [llvm_i1_ty], 2143 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2144 2145def int_amdgcn_inverse_ballot : 2146 Intrinsic<[llvm_i1_ty], [llvm_anyint_ty], 2147 [IntrNoMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2148 2149// Lowers to S_BITREPLICATE_B64_B32. 2150// The argument must be uniform; otherwise, the result is undefined. 2151def int_amdgcn_s_bitreplicate : 2152 DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>; 2153 2154// Lowers to S_QUADMASK_B{32,64} 2155// The argument must be uniform; otherwise, the result is undefined. 2156def int_amdgcn_s_quadmask : 2157 DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>; 2158 2159// Lowers to S_WQM_B{32,64} 2160// The argument must be uniform; otherwise, the result is undefined. 2161// Does not set WQM; merely calculates the bitmask. 2162def int_amdgcn_s_wqm : 2163 DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>; 2164 2165class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic< 2166 [data_ty], 2167 [ 2168 LLVMMatchType<0>, // llvm value to reduce (SGPR/VGPR) 2169 llvm_i32_ty // Reduction Strategy Switch for lowering ( 0: Default, 2170 // 1: Iterative strategy, and 2171 // 2. DPP) 2172 ], 2173 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>; 2174 2175def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce; 2176def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce; 2177 2178def int_amdgcn_readfirstlane : 2179 ClangBuiltin<"__builtin_amdgcn_readfirstlane">, 2180 Intrinsic<[llvm_i32_ty], [llvm_i32_ty], 2181 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2182 2183// The lane argument must be uniform across the currently active threads of the 2184// current wave. Otherwise, the result is undefined. 2185def int_amdgcn_readlane : 2186 ClangBuiltin<"__builtin_amdgcn_readlane">, 2187 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 2188 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2189 2190// The value to write and lane select arguments must be uniform across the 2191// currently active threads of the current wave. Otherwise, the result is 2192// undefined. 2193def int_amdgcn_writelane : 2194 ClangBuiltin<"__builtin_amdgcn_writelane">, 2195 Intrinsic<[llvm_i32_ty], [ 2196 llvm_i32_ty, // uniform value to write: returned by the selected lane 2197 llvm_i32_ty, // uniform lane select 2198 llvm_i32_ty // returned by all lanes other than the selected one 2199 ], 2200 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2201>; 2202 2203def int_amdgcn_alignbyte : ClangBuiltin<"__builtin_amdgcn_alignbyte">, 2204 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2205 [IntrNoMem, IntrSpeculatable] 2206>; 2207 2208// mul24 intrinsics can return i32 or i64. 2209// When returning i64, they're lowered to a mul24/mulhi24 pair. 2210def int_amdgcn_mul_i24 : DefaultAttrsIntrinsic<[llvm_anyint_ty], 2211 [llvm_i32_ty, llvm_i32_ty], 2212 [IntrNoMem, IntrSpeculatable] 2213>; 2214 2215def int_amdgcn_mul_u24 : DefaultAttrsIntrinsic<[llvm_anyint_ty], 2216 [llvm_i32_ty, llvm_i32_ty], 2217 [IntrNoMem, IntrSpeculatable] 2218>; 2219 2220def int_amdgcn_mulhi_i24 : DefaultAttrsIntrinsic<[llvm_i32_ty], 2221 [llvm_i32_ty, llvm_i32_ty], 2222 [IntrNoMem, IntrSpeculatable] 2223>; 2224 2225def int_amdgcn_mulhi_u24 : DefaultAttrsIntrinsic<[llvm_i32_ty], 2226 [llvm_i32_ty, llvm_i32_ty], 2227 [IntrNoMem, IntrSpeculatable] 2228>; 2229 2230// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id) 2231// 2232// bar_val is the total number of waves that will wait on this 2233// barrier, minus 1. 2234def int_amdgcn_ds_gws_init : 2235 ClangBuiltin<"__builtin_amdgcn_ds_gws_init">, 2236 Intrinsic<[], 2237 [llvm_i32_ty, llvm_i32_ty], 2238 [IntrConvergent, IntrWriteMem, 2239 IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2240 [SDNPMemOperand] 2241>; 2242 2243// llvm.amdgcn.ds.gws.barrier(i32 vsrc0, i32 resource_id) 2244// bar_val is the total number of waves that will wait on this 2245// barrier, minus 1. 2246def int_amdgcn_ds_gws_barrier : 2247 ClangBuiltin<"__builtin_amdgcn_ds_gws_barrier">, 2248 Intrinsic<[], 2249 [llvm_i32_ty, llvm_i32_ty], 2250 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2251 [SDNPMemOperand] 2252>; 2253 2254// llvm.amdgcn.ds.gws.sema.v(i32 resource_id) 2255def int_amdgcn_ds_gws_sema_v : 2256 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_v">, 2257 Intrinsic<[], 2258 [llvm_i32_ty], 2259 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2260 [SDNPMemOperand] 2261>; 2262 2263// llvm.amdgcn.ds.gws.sema.br(i32 vsrc, i32 resource_id) 2264def int_amdgcn_ds_gws_sema_br : 2265 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_br">, 2266 Intrinsic<[], 2267 [llvm_i32_ty, llvm_i32_ty], 2268 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2269 [SDNPMemOperand] 2270>; 2271 2272// llvm.amdgcn.ds.gws.sema.p(i32 resource_id) 2273def int_amdgcn_ds_gws_sema_p : 2274 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_p">, 2275 Intrinsic<[], 2276 [llvm_i32_ty], 2277 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2278 [SDNPMemOperand] 2279>; 2280 2281// llvm.amdgcn.ds.gws.sema.release.all(i32 resource_id) 2282def int_amdgcn_ds_gws_sema_release_all : 2283 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_release_all">, 2284 Intrinsic<[], 2285 [llvm_i32_ty], 2286 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2287 [SDNPMemOperand] 2288>; 2289 2290 2291// Copies the source value to the destination value, with the guarantee that 2292// the source value is computed as if the entire program were executed in WQM. 2293def int_amdgcn_wqm : Intrinsic<[llvm_any_ty], 2294 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree] 2295>; 2296 2297// Copies the source value to the destination value, such that the source 2298// is computed as if the entire program were executed in WQM if any other 2299// program code executes in WQM. 2300def int_amdgcn_softwqm : Intrinsic<[llvm_any_ty], 2301 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree] 2302>; 2303 2304// Return true if at least one thread within the pixel quad passes true into 2305// the function. 2306def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty], 2307 [llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2308>; 2309 2310// If false, set EXEC=0 for the current thread until the end of program. 2311// FIXME: Should this be IntrNoMem, IntrHasSideEffects, or IntrWillReturn? 2312def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], [IntrNoCallback, IntrNoFree]>; 2313 2314def int_amdgcn_endpgm : ClangBuiltin<"__builtin_amdgcn_endpgm">, 2315 Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrNoMem, IntrHasSideEffects, IntrConvergent, 2316 IntrNoCallback, IntrNoFree] 2317>; 2318 2319// If false, mark all active lanes as helper lanes until the end of program. 2320def int_amdgcn_wqm_demote : Intrinsic<[], 2321 [llvm_i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly, IntrNoCallback, IntrNoFree] 2322>; 2323 2324// Copies the active channels of the source value to the destination value, 2325// with the guarantee that the source value is computed as if the entire 2326// program were executed in Whole Wavefront Mode, i.e. with all channels 2327// enabled, with a few exceptions: - Phi nodes which require WWM return an 2328// undefined value. 2329def int_amdgcn_strict_wwm : Intrinsic<[llvm_any_ty], 2330 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, 2331 IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2332>; 2333// Deprecated. Use int_amdgcn_strict_wwm instead. 2334def int_amdgcn_wwm : Intrinsic<[llvm_any_ty], 2335 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, 2336 IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2337>; 2338def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty], 2339 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, 2340 IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2341>; 2342 2343// Given a value, copies it while setting all the inactive lanes to a given 2344// value. Note that OpenGL helper lanes are considered active, so if the 2345// program ever uses WQM, then the instruction and the first source will be 2346// computed in WQM. 2347def int_amdgcn_set_inactive : 2348 Intrinsic<[llvm_anyint_ty], 2349 [LLVMMatchType<0>, // value to be copied 2350 LLVMMatchType<0>], // value for the inactive lanes to take 2351 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2352 2353// Similar to int_amdgcn_set_inactive, but the value for the inactive lanes must 2354// be a VGPR function argument. 2355// Can only be used in functions with the `amdgpu_cs_chain` or 2356// `amdgpu_cs_chain_preserve` calling conventions, and only in uniform control 2357// flow. 2358def int_amdgcn_set_inactive_chain_arg : 2359 Intrinsic<[llvm_anyint_ty], 2360 [LLVMMatchType<0>, // value to be copied 2361 LLVMMatchType<0>], // value for the inactive lanes to take 2362 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2363 2364// Return if the given flat pointer points to a local memory address. 2365def int_amdgcn_is_shared : ClangBuiltin<"__builtin_amdgcn_is_shared">, 2366 DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], 2367 [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>] 2368>; 2369 2370// Return if the given flat pointer points to a prvate memory address. 2371def int_amdgcn_is_private : ClangBuiltin<"__builtin_amdgcn_is_private">, 2372 DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], 2373 [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>] 2374>; 2375 2376// A uniform tail call to a function with the `amdgpu_cs_chain` or 2377// `amdgpu_cs_chain_preserve` calling convention. It will populate the SGPRs 2378// starting at s0 and the VGPRs starting at v8, set EXEC and perform a jump to 2379// the given function. 2380// Can only be used in functions with the `amdgpu_cs`, `amdgpu_cs_chain` or 2381// `amdgpu_cs_chain_preserve` calling conventions, and only in uniform control 2382// flow. 2383def int_amdgcn_cs_chain: 2384 Intrinsic<[], 2385 [llvm_anyptr_ty, // The function to jump to. 2386 llvm_anyint_ty, // Value to put in EXEC (should be i32 or i64). 2387 llvm_any_ty, // Arguments that will be copied into SGPRs (s0+). 2388 // Must be uniform. 2389 llvm_any_ty, // Arguments that will be copied into VGPRs (v8+). 2390 // Need not be uniform. 2391 llvm_i32_ty, // Flags. 2392 llvm_vararg_ty // Additional arguments. Only present if Flags is 2393 // non-zero. 2394 ], 2395 [IntrConvergent, IntrNoReturn, ImmArg<ArgIndex<4>>]>; 2396 2397 2398//===----------------------------------------------------------------------===// 2399// CI+ Intrinsics 2400//===----------------------------------------------------------------------===// 2401 2402def int_amdgcn_s_dcache_inv_vol : 2403 ClangBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">, 2404 DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; 2405 2406def int_amdgcn_buffer_wbinvl1_vol : 2407 ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_vol">, 2408 DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; 2409 2410//===----------------------------------------------------------------------===// 2411// VI Intrinsics 2412//===----------------------------------------------------------------------===// 2413 2414// llvm.amdgcn.mov.dpp.i32 <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl> 2415def int_amdgcn_mov_dpp : 2416 Intrinsic<[llvm_anyint_ty], 2417 [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, 2418 llvm_i1_ty], 2419 [IntrNoMem, IntrConvergent, IntrWillReturn, 2420 ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, 2421 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>; 2422 2423// llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl> 2424// Should be equivalent to: 2425// v_mov_b32 <dest> <old> 2426// v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl> 2427def int_amdgcn_update_dpp : 2428 Intrinsic<[llvm_any_ty], 2429 [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, 2430 llvm_i32_ty, llvm_i32_ty, llvm_i1_ty], 2431 [IntrNoMem, IntrConvergent, IntrWillReturn, 2432 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, 2433 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; 2434 2435def int_amdgcn_s_dcache_wb : 2436 ClangBuiltin<"__builtin_amdgcn_s_dcache_wb">, 2437 Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2438 2439def int_amdgcn_s_dcache_wb_vol : 2440 ClangBuiltin<"__builtin_amdgcn_s_dcache_wb_vol">, 2441 Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2442 2443def int_amdgcn_s_memrealtime : 2444 ClangBuiltin<"__builtin_amdgcn_s_memrealtime">, 2445 Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2446 2447// llvm.amdgcn.ds.permute <index> <src> 2448def int_amdgcn_ds_permute : 2449 ClangBuiltin<"__builtin_amdgcn_ds_permute">, 2450 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 2451 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2452 2453// llvm.amdgcn.ds.bpermute <index> <src> 2454def int_amdgcn_ds_bpermute : 2455 ClangBuiltin<"__builtin_amdgcn_ds_bpermute">, 2456 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 2457 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2458 2459// llvm.amdgcn.perm <src0> <src1> <selector> 2460def int_amdgcn_perm : 2461 ClangBuiltin<"__builtin_amdgcn_perm">, 2462 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2463 [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2464 2465//===----------------------------------------------------------------------===// 2466// GFX9 Intrinsics 2467//===----------------------------------------------------------------------===// 2468 2469class AMDGPUGlobalLoadLDS : Intrinsic < 2470 [], 2471 [LLVMQualPointerType<1>, // Base global pointer to load from 2472 LLVMQualPointerType<3>, // LDS base pointer to store to 2473 llvm_i32_ty, // Data byte size: 1/2/4 2474 llvm_i32_ty, // imm offset (applied to both global and LDS address) 2475 llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0, 2476 // bit 1 = slc/sc1, 2477 // bit 2 = dlc on gfx10/gfx11)) 2478 // bit 4 = scc/nt on gfx90a+)) 2479 // gfx12+: 2480 // cachepolicy (bits [0-2] = th, 2481 // bits [3-4] = scope) 2482 // swizzled buffer (bit 6 = swz), 2483 [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, 2484 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree], 2485 "", [SDNPMemOperand]>; 2486def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS; 2487 2488//===----------------------------------------------------------------------===// 2489// GFX10 Intrinsics 2490//===----------------------------------------------------------------------===// 2491 2492// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control> 2493def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">, 2494 Intrinsic<[llvm_i32_ty], 2495 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], 2496 [IntrNoMem, IntrConvergent, IntrWillReturn, 2497 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; 2498 2499// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control> 2500def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">, 2501 Intrinsic<[llvm_i32_ty], 2502 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], 2503 [IntrNoMem, IntrConvergent, IntrWillReturn, 2504 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; 2505 2506// llvm.amdgcn.mov.dpp8.i32 <src> <sel> 2507// <sel> is a 32-bit constant whose high 8 bits must be zero which selects 2508// the lanes to read from. 2509def int_amdgcn_mov_dpp8 : 2510 Intrinsic<[llvm_anyint_ty], 2511 [LLVMMatchType<0>, llvm_i32_ty], 2512 [IntrNoMem, IntrConvergent, IntrWillReturn, 2513 ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree]>; 2514 2515def int_amdgcn_s_get_waveid_in_workgroup : 2516 ClangBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">, 2517 Intrinsic<[llvm_i32_ty], [], 2518 [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2519 2520class AMDGPUAtomicRtn<LLVMType vt, LLVMType pt = llvm_anyptr_ty> : Intrinsic < 2521 [vt], 2522 [pt, // vaddr 2523 vt], // vdata(VGPR) 2524 [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], "", 2525 [SDNPMemOperand]>; 2526 2527def int_amdgcn_global_atomic_csub : AMDGPUAtomicRtn<llvm_i32_ty>; 2528 2529// uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>, 2530// <ray_dir>, <ray_inv_dir>, <texture_descr> 2531// <node_ptr> is i32 or i64. 2532// <ray_dir> and <ray_inv_dir> are both v3f16 or both v3f32. 2533def int_amdgcn_image_bvh_intersect_ray : 2534 DefaultAttrsIntrinsic<[llvm_v4i32_ty], 2535 [llvm_anyint_ty, llvm_float_ty, llvm_v3f32_ty, llvm_anyvector_ty, 2536 LLVMMatchType<1>, llvm_v4i32_ty], 2537 [IntrReadMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2538 2539//===----------------------------------------------------------------------===// 2540// GFX11 Intrinsics 2541//===----------------------------------------------------------------------===// 2542 2543// llvm.amdgcn.permlane64 <src0> 2544def int_amdgcn_permlane64 : 2545 ClangBuiltin<"__builtin_amdgcn_permlane64">, 2546 Intrinsic<[llvm_i32_ty], [llvm_i32_ty], 2547 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2548 2549def int_amdgcn_ds_add_gs_reg_rtn : 2550 ClangBuiltin<"__builtin_amdgcn_ds_add_gs_reg_rtn">, 2551 Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty], 2552 [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree], 2553 "", [SDNPMemOperand]>; 2554 2555def int_amdgcn_ds_sub_gs_reg_rtn : 2556 ClangBuiltin<"__builtin_amdgcn_ds_sub_gs_reg_rtn">, 2557 Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty], 2558 [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree], 2559 "", [SDNPMemOperand]>; 2560 2561def int_amdgcn_ds_bvh_stack_rtn : 2562 Intrinsic< 2563 [llvm_i32_ty, llvm_i32_ty], // %vdst, %addr 2564 [ 2565 llvm_i32_ty, // %addr 2566 llvm_i32_ty, // %data0 2567 llvm_v4i32_ty, // %data1 2568 llvm_i32_ty, // %offset 2569 ], 2570 [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree] 2571 >; 2572 2573def int_amdgcn_s_wait_event_export_ready : 2574 ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">, 2575 Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn] 2576>; 2577 2578// WMMA (Wave Matrix Multiply-Accumulate) intrinsics 2579// 2580// These operations perform a matrix multiplication and accumulation of 2581// the form: D = A * B + C . 2582 2583class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> : 2584 Intrinsic< 2585 [CD], // %D 2586 [ 2587 AB, // %A 2588 LLVMMatchType<1>, // %B 2589 LLVMMatchType<0>, // %C 2590 ], 2591 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2592>; 2593 2594class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> : 2595 Intrinsic< 2596 [CD], // %D 2597 [ 2598 AB, // %A 2599 LLVMMatchType<1>, // %B 2600 LLVMMatchType<0>, // %C 2601 llvm_i1_ty, // %high (op_sel) for GFX11, 0 for GFX12 2602 ], 2603 [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree] 2604>; 2605 2606class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> : 2607 Intrinsic< 2608 [CD], // %D 2609 [ 2610 llvm_i1_ty, // %A_sign 2611 AB, // %A 2612 llvm_i1_ty, // %B_sign 2613 LLVMMatchType<1>, // %B 2614 LLVMMatchType<0>, // %C 2615 llvm_i1_ty, // %clamp 2616 ], 2617 [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree] 2618>; 2619 2620// WMMA GFX11Only 2621 2622// The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit. 2623// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix registers to the input accumulator registers. 2624// The content of the other 16-bit half is preserved from the input. 2625 2626defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX11 = { 2627def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>; 2628def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>; 2629 2630// WMMA GFX11Plus 2631 2632def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic<llvm_anyfloat_ty, llvm_anyfloat_ty>; 2633def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>; 2634def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>; 2635def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>; 2636 2637// GFX11: The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit. 2638// The content of the other 16-bit half is undefined. 2639// GFX12: The op_sel bit must be 0. 2640def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>; 2641def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>; 2642} 2643 2644//===----------------------------------------------------------------------===// 2645// GFX12 Intrinsics 2646//===----------------------------------------------------------------------===// 2647 2648// llvm.amdgcn.permlane16.var <old> <src0> <src1> <fi> <bound_control> 2649def int_amdgcn_permlane16_var : ClangBuiltin<"__builtin_amdgcn_permlane16_var">, 2650 Intrinsic<[llvm_i32_ty], 2651 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], 2652 [IntrNoMem, IntrConvergent, IntrWillReturn, 2653 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>; 2654 2655// llvm.amdgcn.permlanex16.var <old> <src0> <src1> <fi> <bound_control> 2656def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var">, 2657 Intrinsic<[llvm_i32_ty], 2658 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], 2659 [IntrNoMem, IntrConvergent, IntrWillReturn, 2660 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>; 2661 2662// SWMMAC (Wave Matrix(sparse) Multiply-Accumulate) intrinsics 2663// 2664// These operations perform a sparse matrix multiplication and accumulation of 2665// the form: D = A * B + C. 2666// A is sparse matrix, half the size of B, and is expanded using sparsity index. 2667 2668class AMDGPUSWmmacIntrinsicIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> : 2669 Intrinsic< 2670 [CD], // %D 2671 [ 2672 A, // %A 2673 B, // %B 2674 LLVMMatchType<0>, // %C 2675 Index // %Sparsity index for A 2676 ], 2677 [IntrNoMem, IntrConvergent, IntrWillReturn] 2678>; 2679 2680class AMDGPUSWmmacIntrinsicIUIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> : 2681 Intrinsic< 2682 [CD], // %D 2683 [ 2684 llvm_i1_ty, // %A_sign 2685 A, // %A 2686 llvm_i1_ty, // %B_sign 2687 B, // %B 2688 LLVMMatchType<0>, // %C 2689 Index, // %Sparsity index for A 2690 llvm_i1_ty, // %clamp 2691 ], 2692 [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>] 2693>; 2694 2695defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX12 = { 2696// WMMA (Wave Matrix Multiply-Accumulate) intrinsics 2697// 2698// These operations perform a matrix multiplication and accumulation of 2699// the form: D = A * B + C . 2700 2701// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>. 2702def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>; 2703def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>; 2704def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>; 2705def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>; 2706// A and B are <16 x iu4>. 2707def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>; 2708 2709def int_amdgcn_swmmac_f32_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2710def int_amdgcn_swmmac_f32_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2711def int_amdgcn_swmmac_f16_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2712def int_amdgcn_swmmac_bf16_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>; 2713def int_amdgcn_swmmac_i32_16x16x32_iu8 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>; 2714def int_amdgcn_swmmac_i32_16x16x32_iu4 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>; 2715def int_amdgcn_swmmac_i32_16x16x64_iu4 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>; 2716def int_amdgcn_swmmac_f32_16x16x32_fp8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2717def int_amdgcn_swmmac_f32_16x16x32_fp8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2718def int_amdgcn_swmmac_f32_16x16x32_bf8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2719def int_amdgcn_swmmac_f32_16x16x32_bf8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2720} 2721 2722def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn<llvm_i64_ty, global_ptr_ty>; 2723 2724def int_amdgcn_flat_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2725def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2726def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2727def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2728 2729def int_amdgcn_atomic_cond_sub_u32 : AMDGPUAtomicRtn<llvm_i32_ty>; 2730 2731class AMDGPULoadIntrinsic<LLVMType ptr_ty>: 2732 Intrinsic< 2733 [llvm_any_ty], 2734 [ptr_ty], 2735 [IntrReadMem, IntrWillReturn, IntrConvergent, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], 2736 "", 2737 [SDNPMemOperand] 2738 >; 2739 2740// Wave32 2741// <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1)) -> global_load_tr_b64 2742// <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1)) -> global_load_tr_b128 2743// Wave64 2744// i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1)) -> global_load_tr_b64 2745// <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1)) -> global_load_tr_b128 2746 2747def int_amdgcn_global_load_tr_b64 : AMDGPULoadIntrinsic<global_ptr_ty>; 2748def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic<global_ptr_ty>; 2749 2750// i32 @llvm.amdgcn.wave.id() 2751def int_amdgcn_wave_id : 2752 DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; 2753 2754//===----------------------------------------------------------------------===// 2755// Deep learning intrinsics. 2756//===----------------------------------------------------------------------===// 2757 2758// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp) 2759// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2760def int_amdgcn_fdot2 : 2761 ClangBuiltin<"__builtin_amdgcn_fdot2">, 2762 DefaultAttrsIntrinsic< 2763 [llvm_float_ty], // %r 2764 [ 2765 llvm_v2f16_ty, // %a 2766 llvm_v2f16_ty, // %b 2767 llvm_float_ty, // %c 2768 llvm_i1_ty // %clamp 2769 ], 2770 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2771 >; 2772 2773// f16 %r = llvm.amdgcn.fdot2.f16.f16(v2f16 %a, v2f16 %b, f16 %c) 2774// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2775def int_amdgcn_fdot2_f16_f16 : 2776 ClangBuiltin<"__builtin_amdgcn_fdot2_f16_f16">, 2777 DefaultAttrsIntrinsic< 2778 [llvm_half_ty], // %r 2779 [ 2780 llvm_v2f16_ty, // %a 2781 llvm_v2f16_ty, // %b 2782 llvm_half_ty // %c 2783 ], 2784 [IntrNoMem, IntrSpeculatable] 2785 >; 2786 2787// bf16 %r = llvm.amdgcn.fdot2.bf16.bf16(v2bf16 %a, v2bf16 %b, bf16 %c) 2788// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2789def int_amdgcn_fdot2_bf16_bf16 : 2790 ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, 2791 DefaultAttrsIntrinsic< 2792 [llvm_bfloat_ty], // %r 2793 [ 2794 llvm_v2bf16_ty, // %a 2795 llvm_v2bf16_ty, // %b 2796 llvm_bfloat_ty // %c 2797 ], 2798 [IntrNoMem, IntrSpeculatable] 2799 >; 2800 2801// f32 %r = llvm.amdgcn.fdot2.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp) 2802// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2803def int_amdgcn_fdot2_f32_bf16 : 2804 ClangBuiltin<"__builtin_amdgcn_fdot2_f32_bf16">, 2805 DefaultAttrsIntrinsic< 2806 [llvm_float_ty], // %r 2807 [ 2808 llvm_v2bf16_ty, // %a 2809 llvm_v2bf16_ty, // %b 2810 llvm_float_ty, // %c 2811 llvm_i1_ty // %clamp 2812 ], 2813 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2814 >; 2815 2816// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp) 2817// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2818def int_amdgcn_sdot2 : 2819 ClangBuiltin<"__builtin_amdgcn_sdot2">, 2820 DefaultAttrsIntrinsic< 2821 [llvm_i32_ty], // %r 2822 [ 2823 llvm_v2i16_ty, // %a 2824 llvm_v2i16_ty, // %b 2825 llvm_i32_ty, // %c 2826 llvm_i1_ty // %clamp 2827 ], 2828 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2829 >; 2830 2831// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp) 2832// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2833def int_amdgcn_udot2 : 2834 ClangBuiltin<"__builtin_amdgcn_udot2">, 2835 DefaultAttrsIntrinsic< 2836 [llvm_i32_ty], // %r 2837 [ 2838 llvm_v2i16_ty, // %a 2839 llvm_v2i16_ty, // %b 2840 llvm_i32_ty, // %c 2841 llvm_i1_ty // %clamp 2842 ], 2843 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2844 >; 2845 2846// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp) 2847// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c 2848def int_amdgcn_sdot4 : 2849 ClangBuiltin<"__builtin_amdgcn_sdot4">, 2850 DefaultAttrsIntrinsic< 2851 [llvm_i32_ty], // %r 2852 [ 2853 llvm_i32_ty, // %a 2854 llvm_i32_ty, // %b 2855 llvm_i32_ty, // %c 2856 llvm_i1_ty // %clamp 2857 ], 2858 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2859 >; 2860 2861// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp) 2862// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c 2863def int_amdgcn_udot4 : 2864 ClangBuiltin<"__builtin_amdgcn_udot4">, 2865 DefaultAttrsIntrinsic< 2866 [llvm_i32_ty], // %r 2867 [ 2868 llvm_i32_ty, // %a 2869 llvm_i32_ty, // %b 2870 llvm_i32_ty, // %c 2871 llvm_i1_ty // %clamp 2872 ], 2873 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2874 >; 2875 2876// i32 %r = llvm.amdgcn.sudot4(i1 %a_sign, v4i8 (as i32) %a, i1 %b_sign, v4i8 (as i32) %b, i32 %c, i1 %clamp) 2877// Treat input as signed (_sign = 1) or unsigned (_sign = 0). 2878// a[i in 0. . . 3] = (%a_sign ? a.i8[i] : promoteToSigned(a.u8[i])); 2879// b[i in 0. . . 3] = (%b_sign ? b.i8[i] : promoteToSigned(b.u8[i])); 2880// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c 2881def int_amdgcn_sudot4 : 2882 ClangBuiltin<"__builtin_amdgcn_sudot4">, 2883 DefaultAttrsIntrinsic< 2884 [llvm_i32_ty], // %r 2885 [ 2886 llvm_i1_ty, // %a_sign 2887 llvm_i32_ty, // %a 2888 llvm_i1_ty, // %b_sign 2889 llvm_i32_ty, // %b 2890 llvm_i32_ty, // %c 2891 llvm_i1_ty // %clamp 2892 ], 2893 [IntrNoMem, IntrSpeculatable, 2894 ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>] 2895 >; 2896 2897// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp) 2898// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + 2899// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c 2900def int_amdgcn_sdot8 : 2901 ClangBuiltin<"__builtin_amdgcn_sdot8">, 2902 DefaultAttrsIntrinsic< 2903 [llvm_i32_ty], // %r 2904 [ 2905 llvm_i32_ty, // %a 2906 llvm_i32_ty, // %b 2907 llvm_i32_ty, // %c 2908 llvm_i1_ty // %clamp 2909 ], 2910 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2911 >; 2912 2913// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c, i1 %clamp) 2914// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + 2915// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c 2916def int_amdgcn_udot8 : 2917 ClangBuiltin<"__builtin_amdgcn_udot8">, 2918 DefaultAttrsIntrinsic< 2919 [llvm_i32_ty], // %r 2920 [ 2921 llvm_i32_ty, // %a 2922 llvm_i32_ty, // %b 2923 llvm_i32_ty, // %c 2924 llvm_i1_ty // %clamp 2925 ], 2926 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2927 >; 2928 2929// i32 %r = llvm.amdgcn.sudot8(i1 %a_sign, v8i4 (as i32) %a, i1 %b_sign, v8i4 (as i32) %b, i32 %c, i1 %clamp) 2930// Treat input as signed (_sign = 1) or unsigned (_sign = 0). 2931// a[i in 0. . . 7] = (%a_sign ? a.i4[i] : promoteToSigned(a.u4[i])); 2932// b[i in 0. . . 7] = (%b_sign ? b.i4[i] : promoteToSigned(b.u4[i])); 2933// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + 2934// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c 2935 def int_amdgcn_sudot8 : 2936 ClangBuiltin<"__builtin_amdgcn_sudot8">, 2937 DefaultAttrsIntrinsic< 2938 [llvm_i32_ty], // %r 2939 [ 2940 llvm_i1_ty, // %a_sign 2941 llvm_i32_ty, // %a 2942 llvm_i1_ty, // %b_sign 2943 llvm_i32_ty, // %b 2944 llvm_i32_ty, // %c 2945 llvm_i1_ty // %clamp 2946 ], 2947 [IntrNoMem, IntrSpeculatable, 2948 ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>] 2949 >; 2950 2951// f32 %r = llvm.amdgcn.dot4.f32.type_a.type_b (v4type_a (as i32) %a, v4type_b (as i32) %b, f32 %c) 2952// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c 2953class AMDGPU8bitFloatDot4Intrinsic : 2954 ClangBuiltin<!subst("int", "__builtin", NAME)>, 2955 DefaultAttrsIntrinsic< 2956 [llvm_float_ty], // %r 2957 [ 2958 llvm_i32_ty, // %a 2959 llvm_i32_ty, // %b 2960 llvm_float_ty, // %c 2961 ], 2962 [IntrNoMem, IntrSpeculatable] 2963 >; 2964 2965def int_amdgcn_dot4_f32_fp8_bf8 : AMDGPU8bitFloatDot4Intrinsic; 2966def int_amdgcn_dot4_f32_bf8_fp8 : AMDGPU8bitFloatDot4Intrinsic; 2967def int_amdgcn_dot4_f32_fp8_fp8 : AMDGPU8bitFloatDot4Intrinsic; 2968def int_amdgcn_dot4_f32_bf8_bf8 : AMDGPU8bitFloatDot4Intrinsic; 2969 2970//===----------------------------------------------------------------------===// 2971// gfx908 intrinsics 2972// ===----------------------------------------------------------------------===// 2973 2974def int_amdgcn_global_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2975 2976// llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp 2977class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> : 2978 ClangBuiltin<!subst("int", "__builtin", NAME)>, 2979 DefaultAttrsIntrinsic<[DestTy], 2980 [SrcABTy, SrcABTy, DestTy, 2981 llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2982 [IntrConvergent, IntrNoMem, 2983 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>; 2984 2985defset list<Intrinsic> AMDGPUMFMAIntrinsics908 = { 2986def int_amdgcn_mfma_f32_32x32x1f32 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_float_ty>; 2987def int_amdgcn_mfma_f32_16x16x1f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>; 2988def int_amdgcn_mfma_f32_4x4x1f32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>; 2989def int_amdgcn_mfma_f32_32x32x2f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>; 2990def int_amdgcn_mfma_f32_16x16x4f32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>; 2991def int_amdgcn_mfma_f32_32x32x4f16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4f16_ty>; 2992def int_amdgcn_mfma_f32_16x16x4f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>; 2993def int_amdgcn_mfma_f32_4x4x4f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty>; 2994def int_amdgcn_mfma_f32_32x32x8f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>; 2995def int_amdgcn_mfma_f32_16x16x16f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty>; 2996def int_amdgcn_mfma_i32_32x32x4i8 : AMDGPUMfmaIntrinsic<llvm_v32i32_ty, llvm_i32_ty>; 2997def int_amdgcn_mfma_i32_16x16x4i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>; 2998def int_amdgcn_mfma_i32_4x4x4i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i32_ty>; 2999def int_amdgcn_mfma_i32_32x32x8i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>; 3000def int_amdgcn_mfma_i32_16x16x16i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i32_ty>; 3001def int_amdgcn_mfma_f32_32x32x2bf16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v2i16_ty>; 3002def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>; 3003def int_amdgcn_mfma_f32_4x4x2bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>; 3004def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>; 3005def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>; 3006} 3007 3008//===----------------------------------------------------------------------===// 3009// gfx90a intrinsics 3010// ===----------------------------------------------------------------------===// 3011 3012def int_amdgcn_global_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 3013def int_amdgcn_global_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 3014def int_amdgcn_flat_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 3015def int_amdgcn_flat_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 3016def int_amdgcn_flat_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 3017 3018defset list<Intrinsic> AMDGPUMFMAIntrinsics90A = { 3019def int_amdgcn_mfma_f32_32x32x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>; 3020def int_amdgcn_mfma_f32_16x16x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>; 3021def int_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>; 3022def int_amdgcn_mfma_f32_32x32x8bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>; 3023def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>; 3024 3025// Note: in gfx940 BLGP argument is replaced by NEG bitfield in the DGEMM MFMA. 3026// Three bits corresponding to the neg modifier applied to the respective 3027// source operand. 3028def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic<llvm_v4f64_ty, llvm_double_ty>; 3029def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic<llvm_double_ty, llvm_double_ty>; 3030} 3031 3032//===----------------------------------------------------------------------===// 3033// gfx940 intrinsics 3034// ===----------------------------------------------------------------------===// 3035 3036class AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> : 3037 AMDGPUMfmaIntrinsic<DestTy, llvm_i64_ty>; 3038 3039multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> { 3040 foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in 3041 def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>; 3042} 3043 3044// llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid 3045class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> : 3046 ClangBuiltin<!subst("int", "__builtin", NAME)>, 3047 DefaultAttrsIntrinsic<[DestTy], 3048 [SrcA, SrcB, DestTy, llvm_i32_ty, 3049 llvm_i32_ty, llvm_i32_ty], 3050 [IntrConvergent, IntrNoMem, 3051 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>; 3052 3053class AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> : 3054 AMDGPUMSmfmacIntrinsic<DestTy, llvm_v2i32_ty, llvm_v4i32_ty>; 3055 3056multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> { 3057 foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in 3058 def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic<DestTy>; 3059} 3060 3061// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm. 3062def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>; 3063def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>; 3064def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic< 3065 [llvm_v2i16_ty], 3066 [LLVMQualPointerType<3>, llvm_v2i16_ty], 3067 [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>, 3068 ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">; 3069 3070defset list<Intrinsic> AMDGPUMFMAIntrinsics940 = { 3071def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>; 3072def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>; 3073def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2f32_ty>; 3074def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>; 3075 3076defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>; 3077defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>; 3078 3079def int_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>; 3080def int_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>; 3081def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>; 3082def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>; 3083def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>; 3084def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>; 3085 3086defm int_amdgcn_smfmac_f32_16x16x64 : AMDGPUMFp8SmfmacIntrinsic<llvm_v4f32_ty>; 3087defm int_amdgcn_smfmac_f32_32x32x32 : AMDGPUMFp8SmfmacIntrinsic<llvm_v16f32_ty>; 3088} 3089 3090// llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3] 3091// byte_sel selects byte from srcA. 3092def int_amdgcn_cvt_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_bf8">, 3093 DefaultAttrsIntrinsic<[llvm_float_ty], 3094 [llvm_i32_ty, llvm_i32_ty], 3095 [IntrNoMem, ImmArg<ArgIndex<1>>]>; 3096 3097// llvm.amdgcn.cvt.f32.fp8 float vdst, int srcA, imm byte_sel [0..3] 3098def int_amdgcn_cvt_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_fp8">, 3099 DefaultAttrsIntrinsic<[llvm_float_ty], 3100 [llvm_i32_ty, llvm_i32_ty], 3101 [IntrNoMem, ImmArg<ArgIndex<1>>]>; 3102 3103// llvm.amdgcn.cvt.pk.f32.bf8 float2 vdst, int srcA, imm word_sel 3104// word_sel = 1 selects 2 high bytes, 0 selects 2 low bytes. 3105def int_amdgcn_cvt_pk_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_bf8">, 3106 DefaultAttrsIntrinsic<[llvm_v2f32_ty], 3107 [llvm_i32_ty, llvm_i1_ty], 3108 [IntrNoMem, ImmArg<ArgIndex<1>>]>; 3109 3110// llvm.amdgcn.cvt.pk.f32.fp8 float2 vdst, int srcA, imm word_sel. 3111def int_amdgcn_cvt_pk_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_fp8">, 3112 DefaultAttrsIntrinsic<[llvm_v2f32_ty], 3113 [llvm_i32_ty, llvm_i1_ty], 3114 [IntrNoMem, ImmArg<ArgIndex<1>>]>; 3115 3116// llvm.amdgcn.cvt.pk.bf8.f32 int vdst, float srcA, float srcB, int old, imm word_sel 3117// word_sel = 1 selects 2 high bytes in the vdst, 0 selects 2 low bytes. 3118def int_amdgcn_cvt_pk_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_bf8_f32">, 3119 DefaultAttrsIntrinsic<[llvm_i32_ty], 3120 [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty], 3121 [IntrNoMem, ImmArg<ArgIndex<3>>]>; 3122 3123// llvm.amdgcn.cvt.pk.fp8.f32 int vdst, float srcA, float srcB, int old, imm word_sel 3124def int_amdgcn_cvt_pk_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f32">, 3125 DefaultAttrsIntrinsic<[llvm_i32_ty], 3126 [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty], 3127 [IntrNoMem, ImmArg<ArgIndex<3>>]>; 3128 3129// llvm.amdgcn.cvt.sr.bf8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3] 3130// byte_sel selects byte to write into vdst. 3131def int_amdgcn_cvt_sr_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f32">, 3132 DefaultAttrsIntrinsic<[llvm_i32_ty], 3133 [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 3134 [IntrNoMem, ImmArg<ArgIndex<3>>]>; 3135 3136// llvm.amdgcn.cvt.sr.fp8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3] 3137def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">, 3138 DefaultAttrsIntrinsic<[llvm_i32_ty], 3139 [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 3140 [IntrNoMem, ImmArg<ArgIndex<3>>]>; 3141 3142//===----------------------------------------------------------------------===// 3143// Special Intrinsics for backend internal use only. No frontend 3144// should emit calls to these. 3145// ===----------------------------------------------------------------------===// 3146// 3147// Control-flow intrinsics in LLVM IR are convergent because they represent the 3148// wave CFG, i.e., sets of threads that are "converged" or "execute in 3149// lock-step". But they exist during a small window in the lowering process, 3150// inserted after the structurizer and then translated to equivalent MIR 3151// pseudos. So rather than create convergence tokens for these builtins, we 3152// simply mark them as not convergent. 3153// 3154// This is really a workaround to allow control flow lowering in the presence of 3155// convergence control tokens. The corresponding MIR pseudos are marked as 3156// having side effects, which is sufficient to prevent optimizations without 3157// having to mark them as convergent. 3158def int_amdgcn_if : Intrinsic<[llvm_i1_ty, llvm_anyint_ty], 3159 [llvm_i1_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree] 3160>; 3161 3162def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_anyint_ty], 3163 [llvm_anyint_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree] 3164>; 3165 3166def int_amdgcn_if_break : Intrinsic<[llvm_anyint_ty], 3167 [llvm_i1_ty, LLVMMatchType<0>], 3168 [IntrNoMem, IntrWillReturn, IntrNoCallback, IntrNoFree] 3169>; 3170 3171def int_amdgcn_loop : Intrinsic<[llvm_i1_ty], 3172 [llvm_anyint_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree] 3173>; 3174 3175def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty], 3176 [IntrWillReturn, IntrNoCallback, IntrNoFree]>; 3177 3178// Represent unreachable in a divergent region. 3179def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent, IntrNoCallback, IntrNoFree]>; 3180 3181// Emit 2.5 ulp, no denormal division. Should only be inserted by 3182// pass based on !fpmath metadata. 3183def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic< 3184 [llvm_float_ty], [llvm_float_ty, llvm_float_ty], 3185 [IntrNoMem, IntrSpeculatable] 3186>; 3187 3188/// Emit an addrspacecast without null pointer checking. 3189/// Should only be inserted by a pass based on analysis of an addrspacecast's src. 3190def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic< 3191 [llvm_anyptr_ty], [llvm_anyptr_ty], 3192 [IntrNoMem, IntrSpeculatable] 3193>; 3194} 3195