; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=HSA -check-prefix=FUNC %s

; Checks the LDS (addrspace(3)) group segment size and alignment reported
; for kernels that use statically sized LDS globals with different
; alignments, and for kernels that receive an LDS pointer argument.
;
; Each kernel copies data into and back out of the LDS objects with
; memcpy so that every global is actually referenced by the kernel.

; 38-byte objects with explicit alignments.
@lds.align16.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16
@lds.align16.1 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16

@lds.align8.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 8
@lds.align32.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 32

; Objects with no explicit alignment: 39 * 4 = 156 bytes and 7 * 8 = 56 bytes.
@lds.missing.align.0 = internal unnamed_addr addrspace(3) global [39 x i32] undef
@lds.missing.align.1 = internal unnamed_addr addrspace(3) global [7 x i64] undef

declare void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i32, i1) #0
declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #0


; A single 38-byte object: the size is not rounded up to its alignment.
; HSA-LABEL: {{^}}test_no_round_size_1:
; HSA: workgroup_group_segment_byte_size = 38
define void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false)
  ret void
}

; There are two objects, so one requires padding to be correctly
; aligned after the other.

; (38 -> 48) + 38 = 86

; I don't think it is necessary to add padding after since if there
; were to be a dynamically sized LDS kernel arg, the runtime should
; add the alignment padding if needed.

; HSA-LABEL: {{^}}test_round_size_2:
; HSA: workgroup_group_segment_byte_size = 86
; HSA: group_segment_alignment = 4
define void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false)

  %lds.align16.1.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.1 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.1.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.1.bc, i32 38, i32 4, i1 false)

  ret void
}

; 38 + (10 pad) + 38 = 86
; HSA-LABEL: {{^}}test_round_size_2_align_8:
; HSA: workgroup_group_segment_byte_size = 86
; HSA: group_segment_alignment = 4
define void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)

  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)

  ret void
}

; One static LDS object plus an LDS pointer argument: only the static
; object is counted in the expected size.
; HSA-LABEL: {{^}}test_round_local_lds_and_arg:
; HSA: workgroup_group_segment_byte_size = 38
; HSA: group_segment_alignment = 4
define void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)

  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false)
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 4, i1 false)
  ret void
}

; Only an LDS pointer argument, no static LDS objects.
; HSA-LABEL: {{^}}test_round_lds_arg:
; HSA: workgroup_group_segment_byte_size = 0
; HSA: group_segment_alignment = 4
define void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 4, i1 false)
  ret void
}

; FIXME: Parameter alignment not considered
; HSA-LABEL: {{^}}test_high_align_lds_arg:
; HSA: workgroup_group_segment_byte_size = 0
; HSA: group_segment_alignment = 4
define void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 {
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 64, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 64, i1 false)
  ret void
}

; (7 * 8) + (39 * 4) = 212
; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0:
; HSA: workgroup_group_segment_byte_size = 212
; HSA: group_segment_alignment = 4
define void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  ; Copy exactly 39 * 4 = 156 bytes so the access stays inside the object.
  %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.0.bc, i8 addrspace(1)* %in, i32 156, i32 4, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.0.bc, i32 156, i32 4, i1 false)

  %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.1.bc, i8 addrspace(1)* %in, i32 56, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.1.bc, i32 56, i32 8, i1 false)

  ret void
}

; (39 * 4) + (4 pad) + (7 * 8) = 216
; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1:
; HSA: workgroup_group_segment_byte_size = 216
; HSA: group_segment_alignment = 4
define void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.1.bc, i8 addrspace(1)* %in, i32 56, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.1.bc, i32 56, i32 8, i1 false)

  ; Copy exactly 39 * 4 = 156 bytes so the access stays inside the object.
  %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.0.bc, i8 addrspace(1)* %in, i32 156, i32 4, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.0.bc, i32 156, i32 4, i1 false)

  ret void
}

; Test how the size needed for padding changes based on when the
; global is encountered during lowering. There should be a consistent
; order to minimize padding waste.
;
; The way global addresses are lowered now, the objects are laid out in
; the inverse of first-use order, which isn't great.
;
; If sorted to minimize padding, the optimal order for these globals is
; align 32, align 8, align 16, which gives the minimum possible size of
; 38 + (2 pad) + 38 + (2 pad) + 38 = 118.
;
; In the tests below, the first comment line lists the alignments in
; first-use order and the second shows the resulting layout.


; align 32, 16, 8
; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
; HSA-LABEL: {{^}}test_round_size_3_order0:
; HSA: workgroup_group_segment_byte_size = 134
; HSA: group_segment_alignment = 4
define void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)

  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)

  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)

  ret void
}

; align 32, 8, 16
; 38 + (2 pad) + 38 + (18 pad) + 38 = 134
; HSA-LABEL: {{^}}test_round_size_3_order1:
; HSA: workgroup_group_segment_byte_size = 134
; HSA: group_segment_alignment = 4
define void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)

  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)

  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)

  ret void
}

; align 16, 32, 8
; 38 + (26 pad) + 38 + (10 pad) + 38 = 150
; HSA-LABEL: {{^}}test_round_size_3_order2:
; HSA: workgroup_group_segment_byte_size = 150
; HSA: group_segment_alignment = 4
define void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)

  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)

  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)

  ret void
}

; align 16, 8, 32
; 38 + (2 pad) + 38 + (2 pad) + 38 = 118
; HSA-LABEL: {{^}}test_round_size_3_order3:
; HSA: workgroup_group_segment_byte_size = 118
; HSA: group_segment_alignment = 4
define void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)

  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)

  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)

  ret void
}

; align 8, 32, 16
; 38 + (26 pad) + 38 + (2 pad) + 38 = 142
; HSA-LABEL: {{^}}test_round_size_3_order4:
; HSA: workgroup_group_segment_byte_size = 142
; HSA: group_segment_alignment = 4
define void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)

  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)

  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)

  ret void
}

; align 8, 16, 32
; 38 + (10 pad) + 38 + (2 pad) + 38 = 126
; HSA-LABEL: {{^}}test_round_size_3_order5:
; HSA: workgroup_group_segment_byte_size = 126
; HSA: group_segment_alignment = 4
define void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)

  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)

  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)

  ret void
}

; #0 is used by the memcpy declarations and #1 by every kernel above;
; #2 is not referenced by anything in this file.
attributes #0 = { argmemonly nounwind }
attributes #1 = { nounwind }
attributes #2 = { convergent nounwind }