xref: /aosp_15_r20/external/llvm/test/CodeGen/AMDGPU/lds-alignment.ll (revision 9880d6810fe72a1726cb53787c6711e909410d58)
1*9880d681SAndroid Build Coastguard Worker; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=HSA -check-prefix=FUNC %s
2*9880d681SAndroid Build Coastguard Worker
; 38-byte i8 arrays in addrspace(3) with an explicit alignment; the two
; align16 globals are identical apart from their names.
3*9880d681SAndroid Build Coastguard Worker@lds.align16.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16
4*9880d681SAndroid Build Coastguard Worker@lds.align16.1 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16
5*9880d681SAndroid Build Coastguard Worker
; Same 38-byte size, but aligned to 8 and 32 bytes respectively.
6*9880d681SAndroid Build Coastguard Worker@lds.align8.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 8
7*9880d681SAndroid Build Coastguard Worker@lds.align32.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 32
8*9880d681SAndroid Build Coastguard Worker
; No explicit alignment is given on these two: [39 x i32] is 39 * 4 = 156
; bytes and [7 x i64] is 7 * 8 = 56 bytes.
9*9880d681SAndroid Build Coastguard Worker@lds.missing.align.0 = internal unnamed_addr addrspace(3) global [39 x i32] undef
10*9880d681SAndroid Build Coastguard Worker@lds.missing.align.1 = internal unnamed_addr addrspace(3) global [7 x i64] undef
11*9880d681SAndroid Build Coastguard Worker
; memcpy intrinsic declarations used by every test below. The first takes an
; addrspace(3) pointer followed by a readonly addrspace(1) pointer; the second
; takes them the other way round.
; NOTE(review): the trailing i32, i32, i1 operands are presumably length,
; alignment and the volatile flag - confirm against the LangRef for this
; LLVM revision.
12*9880d681SAndroid Build Coastguard Workerdeclare void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i32, i1) #0
13*9880d681SAndroid Build Coastguard Workerdeclare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #0
14*9880d681SAndroid Build Coastguard Worker
15*9880d681SAndroid Build Coastguard Worker
16*9880d681SAndroid Build Coastguard Worker; HSA-LABEL: {{^}}test_no_round_size_1:
17*9880d681SAndroid Build Coastguard Worker; HSA: workgroup_group_segment_byte_size = 38
18*9880d681SAndroid Build Coastguard Workerdefine void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  ; Only one global is referenced (@lds.align16.0, 38 bytes): 38 bytes are
  ; copied from %in into it and then from it to %out.
19*9880d681SAndroid Build Coastguard Worker  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
20*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
21*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false)
22*9880d681SAndroid Build Coastguard Worker  ret void
23*9880d681SAndroid Build Coastguard Worker}
24*9880d681SAndroid Build Coastguard Worker
25*9880d681SAndroid Build Coastguard Worker; There are two objects, so one requires padding to be correctly
26*9880d681SAndroid Build Coastguard Worker; aligned after the other.
27*9880d681SAndroid Build Coastguard Worker
28*9880d681SAndroid Build Coastguard Worker; (38 -> 48) + 38 = 86
29*9880d681SAndroid Build Coastguard Worker
30*9880d681SAndroid Build Coastguard Worker; I don't think it is necessary to add padding after since if there
31*9880d681SAndroid Build Coastguard Worker; were to be a dynamically sized LDS kernel arg, the runtime should
32*9880d681SAndroid Build Coastguard Worker; add the alignment padding if needed.
33*9880d681SAndroid Build Coastguard Worker
34*9880d681SAndroid Build Coastguard Worker; HSA-LABEL: {{^}}test_round_size_2:
35*9880d681SAndroid Build Coastguard Worker; HSA: workgroup_group_segment_byte_size = 86
36*9880d681SAndroid Build Coastguard Worker; HSA: group_segment_alignment = 4
37*9880d681SAndroid Build Coastguard Workerdefine void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  ; Two 38-byte globals, both align 16, are referenced: @lds.align16.0 first,
  ; then @lds.align16.1. Each has 38 bytes copied in from %in and back out
  ; to %out.
38*9880d681SAndroid Build Coastguard Worker  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
39*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
40*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false)
41*9880d681SAndroid Build Coastguard Worker
42*9880d681SAndroid Build Coastguard Worker  %lds.align16.1.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.1 to i8 addrspace(3)*
43*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.1.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
44*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.1.bc, i32 38, i32 4, i1 false)
45*9880d681SAndroid Build Coastguard Worker
46*9880d681SAndroid Build Coastguard Worker  ret void
47*9880d681SAndroid Build Coastguard Worker}
48*9880d681SAndroid Build Coastguard Worker
49*9880d681SAndroid Build Coastguard Worker; 38 + (10 pad) + 38
50*9880d681SAndroid Build Coastguard Worker; HSA-LABEL: {{^}}test_round_size_2_align_8:
51*9880d681SAndroid Build Coastguard Worker; HSA: workgroup_group_segment_byte_size = 86
52*9880d681SAndroid Build Coastguard Worker; HSA: group_segment_alignment = 4
53*9880d681SAndroid Build Coastguard Workerdefine void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  ; Two 38-byte globals with different alignments are referenced: the
  ; align 16 one first, then the align 8 one. All four copies pass 8 as the
  ; fourth memcpy operand.
54*9880d681SAndroid Build Coastguard Worker  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
55*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
56*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
57*9880d681SAndroid Build Coastguard Worker
58*9880d681SAndroid Build Coastguard Worker  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
59*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
60*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
61*9880d681SAndroid Build Coastguard Worker
62*9880d681SAndroid Build Coastguard Worker  ret void
63*9880d681SAndroid Build Coastguard Worker}
64*9880d681SAndroid Build Coastguard Worker
65*9880d681SAndroid Build Coastguard Worker; HSA-LABEL: {{^}}test_round_local_lds_and_arg:
66*9880d681SAndroid Build Coastguard Worker; HSA: workgroup_group_segment_byte_size = 38
67*9880d681SAndroid Build Coastguard Worker; HSA: group_segment_alignment = 4
68*9880d681SAndroid Build Coastguard Workerdefine void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
  ; Mixes one global (@lds.align16.0, 38 bytes) with the addrspace(3) pointer
  ; argument %lds.arg; 38 bytes are copied in and back out through each.
  ; The expected size checked above (38) equals the size of the global alone.
69*9880d681SAndroid Build Coastguard Worker  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
70*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
71*9880d681SAndroid Build Coastguard Worker
72*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false)
73*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
74*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 4, i1 false)
75*9880d681SAndroid Build Coastguard Worker  ret void
76*9880d681SAndroid Build Coastguard Worker}
77*9880d681SAndroid Build Coastguard Worker
78*9880d681SAndroid Build Coastguard Worker; HSA-LABEL: {{^}}test_round_lds_arg:
79*9880d681SAndroid Build Coastguard Worker; HSA: workgroup_group_segment_byte_size = 0
80*9880d681SAndroid Build Coastguard Worker; HSA: group_segment_alignment = 4
81*9880d681SAndroid Build Coastguard Workerdefine void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
  ; No global is referenced here; the only addrspace(3) pointer is the
  ; argument %lds.arg, and the expected size checked above is 0.
82*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
83*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 4, i1 false)
84*9880d681SAndroid Build Coastguard Worker  ret void
85*9880d681SAndroid Build Coastguard Worker}
86*9880d681SAndroid Build Coastguard Worker
87*9880d681SAndroid Build Coastguard Worker; FIXME: Parameter alignment not considered
88*9880d681SAndroid Build Coastguard Worker; HSA-LABEL: {{^}}test_high_align_lds_arg:
89*9880d681SAndroid Build Coastguard Worker; HSA: workgroup_group_segment_byte_size = 0
90*9880d681SAndroid Build Coastguard Worker; HSA: group_segment_alignment = 4
91*9880d681SAndroid Build Coastguard Workerdefine void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 {
  ; Same shape as @test_round_lds_arg, except that %lds.arg carries
  ; `align 64` and both copies pass 64 as the fourth memcpy operand. The
  ; checks above still expect an alignment of 4 (see the FIXME).
92*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 64, i1 false)
93*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 64, i1 false)
94*9880d681SAndroid Build Coastguard Worker  ret void
95*9880d681SAndroid Build Coastguard Worker}
96*9880d681SAndroid Build Coastguard Worker
97*9880d681SAndroid Build Coastguard Worker; (7 * 8) + (39 * 4) = 212
98*9880d681SAndroid Build Coastguard Worker; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0:
99*9880d681SAndroid Build Coastguard Worker; HSA: workgroup_group_segment_byte_size = 212
100*9880d681SAndroid Build Coastguard Worker; HSA: group_segment_alignment = 4
101*9880d681SAndroid Build Coastguard Workerdefine void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  ; First-use order: @lds.missing.align.0 ([39 x i32]), then
  ; @lds.missing.align.1 ([7 x i64]). Neither global declares an alignment.
  ; NOTE(review): the copies through @lds.missing.align.0 use a length of
  ; 160, but [39 x i32] is only 39 * 4 = 156 bytes - confirm this is intended.
102*9880d681SAndroid Build Coastguard Worker  %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
103*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.0.bc, i8 addrspace(1)* %in, i32 160, i32 4, i1 false)
104*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.0.bc, i32 160, i32 4, i1 false)
105*9880d681SAndroid Build Coastguard Worker
106*9880d681SAndroid Build Coastguard Worker  %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)*
107*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.1.bc, i8 addrspace(1)* %in, i32 56, i32 8, i1 false)
108*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.1.bc, i32 56, i32 8, i1 false)
109*9880d681SAndroid Build Coastguard Worker
110*9880d681SAndroid Build Coastguard Worker  ret void
111*9880d681SAndroid Build Coastguard Worker}
112*9880d681SAndroid Build Coastguard Worker
113*9880d681SAndroid Build Coastguard Worker; (39 * 4) + (4 pad) + (7 * 8) = 216
114*9880d681SAndroid Build Coastguard Worker; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1:
115*9880d681SAndroid Build Coastguard Worker; HSA: workgroup_group_segment_byte_size = 216
116*9880d681SAndroid Build Coastguard Worker; HSA: group_segment_alignment = 4
117*9880d681SAndroid Build Coastguard Workerdefine void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  ; Same two globals as the order0 variant, but with the first-use order
  ; swapped: @lds.missing.align.1 ([7 x i64]) first, then
  ; @lds.missing.align.0 ([39 x i32]).
  ; NOTE(review): the copies through @lds.missing.align.0 use a length of
  ; 160, but [39 x i32] is only 39 * 4 = 156 bytes - confirm this is intended.
118*9880d681SAndroid Build Coastguard Worker  %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)*
119*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.1.bc, i8 addrspace(1)* %in, i32 56, i32 8, i1 false)
120*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.1.bc, i32 56, i32 8, i1 false)
121*9880d681SAndroid Build Coastguard Worker
122*9880d681SAndroid Build Coastguard Worker  %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
123*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.0.bc, i8 addrspace(1)* %in, i32 160, i32 4, i1 false)
124*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.0.bc, i32 160, i32 4, i1 false)
125*9880d681SAndroid Build Coastguard Worker
126*9880d681SAndroid Build Coastguard Worker  ret void
127*9880d681SAndroid Build Coastguard Worker}
128*9880d681SAndroid Build Coastguard Worker; Test how the size needed for padding changes based on when the
129*9880d681SAndroid Build Coastguard Worker; global is encountered during lowering. There should be a consistent
130*9880d681SAndroid Build Coastguard Worker; order to minimize padding waste.
131*9880d681SAndroid Build Coastguard Worker;
132*9880d681SAndroid Build Coastguard Worker; The way global addresses are lowered now, the layout is the inverse
133*9880d681SAndroid Build Coastguard Worker; of first-use order, which isn't great.
134*9880d681SAndroid Build Coastguard Worker;
135*9880d681SAndroid Build Coastguard Worker; If these globals were sorted to minimize padding, the optimal layout
136*9880d681SAndroid Build Coastguard Worker; order would be: align 32, align 8, align 16, which gives the minimum
137*9880d681SAndroid Build Coastguard Worker; possible size of 38 + (2 pad) + 38 + (2 pad) + 38 = 118 bytes.
138*9880d681SAndroid Build Coastguard Worker
139*9880d681SAndroid Build Coastguard Worker
140*9880d681SAndroid Build Coastguard Worker; align 32, 16, 8
141*9880d681SAndroid Build Coastguard Worker; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
142*9880d681SAndroid Build Coastguard Worker; HSA-LABEL: {{^}}test_round_size_3_order0:
143*9880d681SAndroid Build Coastguard Worker; HSA: workgroup_group_segment_byte_size = 134
144*9880d681SAndroid Build Coastguard Worker; HSA: group_segment_alignment = 4
145*9880d681SAndroid Build Coastguard Workerdefine void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  ; First-use order of the three 38-byte globals: align 32, align 16,
  ; align 8. Each has 38 bytes copied in from %in and back out to %out.
146*9880d681SAndroid Build Coastguard Worker  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
147*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
148*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
149*9880d681SAndroid Build Coastguard Worker
150*9880d681SAndroid Build Coastguard Worker  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
151*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
152*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
153*9880d681SAndroid Build Coastguard Worker
154*9880d681SAndroid Build Coastguard Worker  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
155*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
156*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
157*9880d681SAndroid Build Coastguard Worker
158*9880d681SAndroid Build Coastguard Worker  ret void
159*9880d681SAndroid Build Coastguard Worker}
160*9880d681SAndroid Build Coastguard Worker
161*9880d681SAndroid Build Coastguard Worker; align 32, 8, 16
162*9880d681SAndroid Build Coastguard Worker; 38 + (2 pad) + 38 + (18 pad) + 38 = 134
163*9880d681SAndroid Build Coastguard Worker; HSA-LABEL: {{^}}test_round_size_3_order1:
164*9880d681SAndroid Build Coastguard Worker; HSA: workgroup_group_segment_byte_size = 134
165*9880d681SAndroid Build Coastguard Worker; HSA: group_segment_alignment = 4
166*9880d681SAndroid Build Coastguard Workerdefine void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  ; First-use order of the three 38-byte globals: align 32, align 8,
  ; align 16. Each has 38 bytes copied in from %in and back out to %out.
167*9880d681SAndroid Build Coastguard Worker  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
168*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
169*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
170*9880d681SAndroid Build Coastguard Worker
171*9880d681SAndroid Build Coastguard Worker  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
172*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
173*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
174*9880d681SAndroid Build Coastguard Worker
175*9880d681SAndroid Build Coastguard Worker  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
176*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
177*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
178*9880d681SAndroid Build Coastguard Worker
179*9880d681SAndroid Build Coastguard Worker  ret void
180*9880d681SAndroid Build Coastguard Worker}
181*9880d681SAndroid Build Coastguard Worker
182*9880d681SAndroid Build Coastguard Worker; align 16, 32, 8
183*9880d681SAndroid Build Coastguard Worker; 38 + (26 pad) + 38 + (10 pad) + 38 = 150
184*9880d681SAndroid Build Coastguard Worker; HSA-LABEL: {{^}}test_round_size_3_order2:
185*9880d681SAndroid Build Coastguard Worker; HSA: workgroup_group_segment_byte_size = 150
186*9880d681SAndroid Build Coastguard Worker; HSA: group_segment_alignment = 4
187*9880d681SAndroid Build Coastguard Workerdefine void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  ; First-use order of the three 38-byte globals: align 16, align 32,
  ; align 8. Each has 38 bytes copied in from %in and back out to %out.
188*9880d681SAndroid Build Coastguard Worker  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
189*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
190*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
191*9880d681SAndroid Build Coastguard Worker
192*9880d681SAndroid Build Coastguard Worker  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
193*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
194*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
195*9880d681SAndroid Build Coastguard Worker
196*9880d681SAndroid Build Coastguard Worker  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
197*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
198*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
199*9880d681SAndroid Build Coastguard Worker
200*9880d681SAndroid Build Coastguard Worker  ret void
201*9880d681SAndroid Build Coastguard Worker}
202*9880d681SAndroid Build Coastguard Worker
203*9880d681SAndroid Build Coastguard Worker; align 16, 8, 32
204*9880d681SAndroid Build Coastguard Worker; 38 + (2 pad) + 38 + (2 pad) + 38 = 118
205*9880d681SAndroid Build Coastguard Worker; HSA-LABEL: {{^}}test_round_size_3_order3:
206*9880d681SAndroid Build Coastguard Worker; HSA: workgroup_group_segment_byte_size = 118
207*9880d681SAndroid Build Coastguard Worker; HSA: group_segment_alignment = 4
208*9880d681SAndroid Build Coastguard Workerdefine void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  ; First-use order of the three 38-byte globals: align 16, align 8,
  ; align 32. Of the six order variants, this one has the smallest expected
  ; size in its check line (118).
209*9880d681SAndroid Build Coastguard Worker  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
210*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
211*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
212*9880d681SAndroid Build Coastguard Worker
213*9880d681SAndroid Build Coastguard Worker  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
214*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
215*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
216*9880d681SAndroid Build Coastguard Worker
217*9880d681SAndroid Build Coastguard Worker  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
218*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
219*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
220*9880d681SAndroid Build Coastguard Worker
221*9880d681SAndroid Build Coastguard Worker  ret void
222*9880d681SAndroid Build Coastguard Worker}
223*9880d681SAndroid Build Coastguard Worker
224*9880d681SAndroid Build Coastguard Worker; align 8, 32, 16
225*9880d681SAndroid Build Coastguard Worker; 38 + (26 pad) + 38 + (2 pad) + 38 = 142
226*9880d681SAndroid Build Coastguard Worker; HSA-LABEL: {{^}}test_round_size_3_order4:
227*9880d681SAndroid Build Coastguard Worker; HSA: workgroup_group_segment_byte_size = 142
228*9880d681SAndroid Build Coastguard Worker; HSA: group_segment_alignment = 4
229*9880d681SAndroid Build Coastguard Workerdefine void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  ; First-use order of the three 38-byte globals: align 8, align 32,
  ; align 16. Each has 38 bytes copied in from %in and back out to %out.
230*9880d681SAndroid Build Coastguard Worker  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
231*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
232*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
233*9880d681SAndroid Build Coastguard Worker
234*9880d681SAndroid Build Coastguard Worker  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
235*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
236*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
237*9880d681SAndroid Build Coastguard Worker
238*9880d681SAndroid Build Coastguard Worker  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
239*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
240*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
241*9880d681SAndroid Build Coastguard Worker
242*9880d681SAndroid Build Coastguard Worker  ret void
243*9880d681SAndroid Build Coastguard Worker}
244*9880d681SAndroid Build Coastguard Worker
245*9880d681SAndroid Build Coastguard Worker; align 8, 16, 32
246*9880d681SAndroid Build Coastguard Worker; 38 + (10 pad) + 38 + (2 pad) + 38 = 126
247*9880d681SAndroid Build Coastguard Worker; HSA-LABEL: {{^}}test_round_size_3_order5:
248*9880d681SAndroid Build Coastguard Worker; HSA: workgroup_group_segment_byte_size = 126
249*9880d681SAndroid Build Coastguard Worker; HSA: group_segment_alignment = 4
250*9880d681SAndroid Build Coastguard Workerdefine void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  ; First-use order of the three 38-byte globals: align 8, align 16,
  ; align 32. Each has 38 bytes copied in from %in and back out to %out.
251*9880d681SAndroid Build Coastguard Worker  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
252*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
253*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
254*9880d681SAndroid Build Coastguard Worker
255*9880d681SAndroid Build Coastguard Worker  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
256*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
257*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
258*9880d681SAndroid Build Coastguard Worker
259*9880d681SAndroid Build Coastguard Worker  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
260*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
261*9880d681SAndroid Build Coastguard Worker  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
262*9880d681SAndroid Build Coastguard Worker
263*9880d681SAndroid Build Coastguard Worker  ret void
264*9880d681SAndroid Build Coastguard Worker}
265*9880d681SAndroid Build Coastguard Worker
; Attribute groups: #0 is attached to the two memcpy declarations and #1 to
; every test function. #2 is not referenced anywhere in the visible part of
; this file.
266*9880d681SAndroid Build Coastguard Workerattributes #0 = { argmemonly nounwind }
267*9880d681SAndroid Build Coastguard Workerattributes #1 = { nounwind }
268*9880d681SAndroid Build Coastguard Workerattributes #2 = { convergent nounwind }
269