; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx -combiner-stress-load-slicing < %s -o - | FileCheck %s --check-prefix=STRESS
; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx < %s -o - | FileCheck %s --check-prefix=REGULAR
;
; <rdar://problem/14477220>

%class.Complex = type { float, float }


; Check that independent slices lead to independent loads when the slices end
; up in a different register file.
;
; The layout is:
; LSB 0 1 2 3 | 4 5 6 7 MSB
;       Low       High
; The base address points to 0 and is 8-byte aligned.
; Low slice starts at 0 (base) and is 8-byte aligned.
; High slice starts at 4 (base + 4 bytes) and is 4-byte aligned.
;
; BASE captures everything between the parentheses of the first memory operand
; (the whole addressing expression), so the capture stops at the first ')'.
;
; STRESS-LABEL: t1:
; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
; STRESS: vmovss 64([[BASE:[^)]+]]), [[OUT_Real:%xmm[0-9]+]]
; Add low slice: out[out_start].real, this is base + 0.
; STRESS-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
; STRESS-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
; Add high slice: out[out_start].imm, this is base + 4.
; STRESS-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
; Pack both results into one vector: insert Imm into lane 1, next to Real in lane 0.
; STRESS-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
; Put the results back into out[out_start].
; STRESS-NEXT: vmovlps [[RES_Vec]], ([[BASE]])
;
; Same for the REGULAR run: slicing eliminates the register bank copy for each slice.
; REGULAR-LABEL: t1:
; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
; REGULAR: vmovss 64([[BASE:[^)]+]]), [[OUT_Real:%xmm[0-9]+]]
; Add low slice: out[out_start].real, this is base + 0.
; REGULAR-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
; REGULAR-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
; Add high slice: out[out_start].imm, this is base + 4.
; REGULAR-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
; Pack both results into one vector: insert Imm into lane 1, next to Real in lane 0.
; REGULAR-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
; Put the results back into out[out_start].
; REGULAR-NEXT: vmovlps [[RES_Vec]], ([[BASE]])
define void @t1(%class.Complex* nocapture %out, i64 %out_start) {
entry:
  ; Load out[out_start] as a single 64-bit chunk, then carve it into two
  ; 32-bit pieces (trunc for the low half, lshr 32 + trunc for the high half),
  ; each reinterpreted as a float.
  %arrayidx = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %out_start
  %tmp = bitcast %class.Complex* %arrayidx to i64*
  %tmp1 = load i64, i64* %tmp, align 8
  %t0.sroa.0.0.extract.trunc = trunc i64 %tmp1 to i32
  %tmp2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float
  %t0.sroa.2.0.extract.shift = lshr i64 %tmp1, 32
  %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32
  %tmp3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float
  ; Field-wise add of out[out_start + 8] to the two floats extracted above.
  %add = add i64 %out_start, 8
  %arrayidx2 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %add
  %i.i = getelementptr inbounds %class.Complex, %class.Complex* %arrayidx2, i64 0, i32 0
  %tmp4 = load float, float* %i.i, align 4
  %add.i = fadd float %tmp4, %tmp2
  %retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0
  %r.i = getelementptr inbounds %class.Complex, %class.Complex* %arrayidx2, i64 0, i32 1
  %tmp5 = load float, float* %r.i, align 4
  %add5.i = fadd float %tmp5, %tmp3
  %retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1
  ; Store both sums back to out[out_start] as one <2 x float>.
  %ref.tmp.sroa.0.0.cast = bitcast %class.Complex* %arrayidx to <2 x float>*
  store <2 x float> %retval.sroa.0.4.vec.insert.i, <2 x float>* %ref.tmp.sroa.0.0.cast, align 4
  ret void
}

; Function Attrs: nounwind
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1

; Function Attrs: nounwind
declare void @llvm.lifetime.start(i64, i8* nocapture)

; Function Attrs: nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture)

; Check that we do not read outside of the chunk of bits of the original loads.
;
; The 64-bit load should have been split into one 32-bit and one 16-bit slice.
; The 16-bit slice should be zero extended to match the final type.
;
; Bytes of the loaded 64-bit chunk, in memory order:
; LSB 0 1 2 3 | 4 5 | 6 7 MSB
;       Low           High
; The base address is byte 0 and is 8-byte aligned.
; The low slice begins at byte 0 (base), so it is 8-byte aligned.
; The high slice begins at byte 6 (base + 6 bytes), so it is 2-byte aligned.
;
; STRESS-LABEL: t2:
; STRESS: movzwl 6([[BASE:[^)]+]]), %eax
; STRESS-NEXT: addl ([[BASE]]), %eax
; STRESS-NEXT: ret
;
; Under the REGULAR heuristic it is not considered profitable to slice pieces
; that are not adjacent in memory; here bytes #4-5 form a hole between them.
; REGULAR-LABEL: t2:
; REGULAR: shrq $48
define i32 @t2(%class.Complex* nocapture %out, i64 %out_start) {
  ; Read out[out_start] as one 64-bit word.
  %elt.addr = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %out_start
  %word.addr = bitcast %class.Complex* %elt.addr to i64*
  %word = load i64, i64* %word.addr, align 8
  ; Low piece: bits 0-31 of the word.
  %lo32 = trunc i64 %word to i32
  ; High piece: bits 48-63 of the word, zero-filled above by the logical shift.
  %word.shr48 = lshr i64 %word, 48
  %top16 = trunc i64 %word.shr48 to i32
  %sum = add i32 %top16, %lo32
  ret i32 %sum
}

; Check that overlapping slices are left alone.
;
; The 64-bit load must NOT be split here, because the slices overlap:
; the first slice uses bytes 0 to 3,
; the second slice uses bytes 6 and 7,
; the third slice uses bytes 4 to 7.
;
; Both runs are expected to keep the shifts of the original 64-bit value.
; STRESS-LABEL: t3:
; STRESS: shrq $48
; STRESS: shrq $32
;
; REGULAR-LABEL: t3:
; REGULAR: shrq $48
; REGULAR: shrq $32
define i32 @t3(%class.Complex* nocapture %out, i64 %out_start) {
  ; Read out[out_start] as one 64-bit word.
  %elt.addr = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %out_start
  %word.addr = bitcast %class.Complex* %elt.addr to i64*
  %word = load i64, i64* %word.addr, align 8
  ; Piece 1: bits 0-31.
  %lo32 = trunc i64 %word to i32
  ; Piece 2: bits 48-63, obtained with a logical shift right by 48.
  %word.shr48 = lshr i64 %word, 48
  %top16 = trunc i64 %word.shr48 to i32
  ; Piece 3: bits 32-63, which overlaps piece 2.
  %word.shr32 = lshr i64 %word, 32
  %hi32 = trunc i64 %word.shr32 to i32
  ; Sum the three pieces.
  %partial = add i32 %top16, %lo32
  %total = add i32 %hi32, %partial
  ret i32 %total
}