xref: /aosp_15_r20/external/llvm/test/CodeGen/X86/load-slice.ll (revision 9880d6810fe72a1726cb53787c6711e909410d58)
1*9880d681SAndroid Build Coastguard Worker; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx -combiner-stress-load-slicing < %s -o - | FileCheck %s --check-prefix=STRESS
2*9880d681SAndroid Build Coastguard Worker; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx < %s -o - | FileCheck %s --check-prefix=REGULAR
3*9880d681SAndroid Build Coastguard Worker;
4*9880d681SAndroid Build Coastguard Worker; <rdar://problem/14477220>
5*9880d681SAndroid Build Coastguard Worker
; A pair of 32-bit floats (8 bytes per element). The functions in this file read a
; whole element through a single i64 load and then extract pieces of it.
6*9880d681SAndroid Build Coastguard Worker%class.Complex = type { float, float }
7*9880d681SAndroid Build Coastguard Worker
8*9880d681SAndroid Build Coastguard Worker
9*9880d681SAndroid Build Coastguard Worker; Check that independent slices lead to independent loads and that the slices
10*9880d681SAndroid Build Coastguard Worker; end up in a different register file.
11*9880d681SAndroid Build Coastguard Worker;
12*9880d681SAndroid Build Coastguard Worker; The layout is:
13*9880d681SAndroid Build Coastguard Worker; LSB 0 1 2 3 | 4 5 6 7 MSB
14*9880d681SAndroid Build Coastguard Worker;       Low      High
15*9880d681SAndroid Build Coastguard Worker; The base address points to 0 and is 8-bytes aligned.
16*9880d681SAndroid Build Coastguard Worker; Low slice starts at 0 (base) and is 8-bytes aligned.
17*9880d681SAndroid Build Coastguard Worker; High slice starts at 4 (base + 4-bytes) and is 4-bytes aligned.
18*9880d681SAndroid Build Coastguard Worker;
19*9880d681SAndroid Build Coastguard Worker; STRESS-LABEL: t1:
20*9880d681SAndroid Build Coastguard Worker; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
21*9880d681SAndroid Build Coastguard Worker; STRESS: vmovss 64([[BASE:[^(]+]]), [[OUT_Real:%xmm[0-9]+]]
22*9880d681SAndroid Build Coastguard Worker; Add low slice: out[out_start].real, this is base + 0.
23*9880d681SAndroid Build Coastguard Worker; STRESS-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
24*9880d681SAndroid Build Coastguard Worker; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
25*9880d681SAndroid Build Coastguard Worker; STRESS-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
26*9880d681SAndroid Build Coastguard Worker; Add high slice: out[out_start].imm, this is base + 4.
27*9880d681SAndroid Build Coastguard Worker; STRESS-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
28*9880d681SAndroid Build Coastguard Worker; Swap Imm and Real.
29*9880d681SAndroid Build Coastguard Worker; STRESS-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
30*9880d681SAndroid Build Coastguard Worker; Put the results back into out[out_start].
31*9880d681SAndroid Build Coastguard Worker; STRESS-NEXT: vmovlps [[RES_Vec]], ([[BASE]])
32*9880d681SAndroid Build Coastguard Worker;
33*9880d681SAndroid Build Coastguard Worker; Same for REGULAR, where slicing eliminates the register bank copy for each slice.
34*9880d681SAndroid Build Coastguard Worker; REGULAR-LABEL: t1:
35*9880d681SAndroid Build Coastguard Worker; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
36*9880d681SAndroid Build Coastguard Worker; REGULAR: vmovss 64([[BASE:[^)]+]]), [[OUT_Real:%xmm[0-9]+]]
37*9880d681SAndroid Build Coastguard Worker; Add low slice: out[out_start].real, this is base + 0.
38*9880d681SAndroid Build Coastguard Worker; REGULAR-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
39*9880d681SAndroid Build Coastguard Worker; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
40*9880d681SAndroid Build Coastguard Worker; REGULAR-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
41*9880d681SAndroid Build Coastguard Worker; Add high slice: out[out_start].imm, this is base + 4.
42*9880d681SAndroid Build Coastguard Worker; REGULAR-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
43*9880d681SAndroid Build Coastguard Worker; Swap Imm and Real.
44*9880d681SAndroid Build Coastguard Worker; REGULAR-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
45*9880d681SAndroid Build Coastguard Worker; Put the results back into out[out_start].
46*9880d681SAndroid Build Coastguard Worker; REGULAR-NEXT: vmovlps [[RES_Vec]], ([[BASE]])
; t1: out[out_start] = out[out_start] + out[out_start + 8], computed field by field.
; out[out_start] is read with one i64 load whose two 32-bit halves are then extracted
; (trunc for the low half, lshr 32 + trunc for the high half) and reinterpreted as floats.
47*9880d681SAndroid Build Coastguard Workerdefine void @t1(%class.Complex* nocapture %out, i64 %out_start) {
48*9880d681SAndroid Build Coastguard Workerentry:
  ; Address of out[out_start], then load the whole 8-byte element as a single i64.
49*9880d681SAndroid Build Coastguard Worker  %arrayidx = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %out_start
50*9880d681SAndroid Build Coastguard Worker  %tmp = bitcast %class.Complex* %arrayidx to i64*
51*9880d681SAndroid Build Coastguard Worker  %tmp1 = load i64, i64* %tmp, align 8
  ; Low slice: bits 0..31 of the loaded value, reinterpreted as a float.
52*9880d681SAndroid Build Coastguard Worker  %t0.sroa.0.0.extract.trunc = trunc i64 %tmp1 to i32
53*9880d681SAndroid Build Coastguard Worker  %tmp2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float
  ; High slice: bits 32..63 of the loaded value, reinterpreted as a float.
54*9880d681SAndroid Build Coastguard Worker  %t0.sroa.2.0.extract.shift = lshr i64 %tmp1, 32
55*9880d681SAndroid Build Coastguard Worker  %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32
56*9880d681SAndroid Build Coastguard Worker  %tmp3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float
  ; Address of out[out_start + 8]; its two fields are loaded separately as floats.
57*9880d681SAndroid Build Coastguard Worker  %add = add i64 %out_start, 8
58*9880d681SAndroid Build Coastguard Worker  %arrayidx2 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %add
  ; Field 0 of out[out_start + 8] plus the low slice -> lane 0 of the result vector.
59*9880d681SAndroid Build Coastguard Worker  %i.i = getelementptr inbounds %class.Complex, %class.Complex* %arrayidx2, i64 0, i32 0
60*9880d681SAndroid Build Coastguard Worker  %tmp4 = load float, float* %i.i, align 4
61*9880d681SAndroid Build Coastguard Worker  %add.i = fadd float %tmp4, %tmp2
62*9880d681SAndroid Build Coastguard Worker  %retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0
  ; Field 1 of out[out_start + 8] plus the high slice -> lane 1 of the result vector.
63*9880d681SAndroid Build Coastguard Worker  %r.i = getelementptr inbounds %class.Complex, %class.Complex* %arrayidx2, i64 0, i32 1
64*9880d681SAndroid Build Coastguard Worker  %tmp5 = load float, float* %r.i, align 4
65*9880d681SAndroid Build Coastguard Worker  %add5.i = fadd float %tmp5, %tmp3
66*9880d681SAndroid Build Coastguard Worker  %retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1
  ; Write both sums back over out[out_start] with one <2 x float> store (align 4).
67*9880d681SAndroid Build Coastguard Worker  %ref.tmp.sroa.0.0.cast = bitcast %class.Complex* %arrayidx to <2 x float>*
68*9880d681SAndroid Build Coastguard Worker  store <2 x float> %retval.sroa.0.4.vec.insert.i, <2 x float>* %ref.tmp.sroa.0.0.cast, align 4
69*9880d681SAndroid Build Coastguard Worker  ret void
70*9880d681SAndroid Build Coastguard Worker}
71*9880d681SAndroid Build Coastguard Worker
; NOTE(review): none of t1, t2, or t3 in this file calls the three intrinsics declared
; below; presumably they are leftovers from reducing the original test case -- confirm
; before removing them.
72*9880d681SAndroid Build Coastguard Worker; Function Attrs: nounwind
73*9880d681SAndroid Build Coastguard Workerdeclare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
74*9880d681SAndroid Build Coastguard Worker
75*9880d681SAndroid Build Coastguard Worker; Function Attrs: nounwind
76*9880d681SAndroid Build Coastguard Workerdeclare void @llvm.lifetime.start(i64, i8* nocapture)
77*9880d681SAndroid Build Coastguard Worker
78*9880d681SAndroid Build Coastguard Worker; Function Attrs: nounwind
79*9880d681SAndroid Build Coastguard Workerdeclare void @llvm.lifetime.end(i64, i8* nocapture)
80*9880d681SAndroid Build Coastguard Worker
81*9880d681SAndroid Build Coastguard Worker; Check that we do not read outside of the chunk of bits of the original loads.
82*9880d681SAndroid Build Coastguard Worker;
83*9880d681SAndroid Build Coastguard Worker; The 64-bit load should have been split into one 32-bit and one 16-bit slice.
84*9880d681SAndroid Build Coastguard Worker; The 16-bit slice should be zero-extended to match the final type.
85*9880d681SAndroid Build Coastguard Worker;
86*9880d681SAndroid Build Coastguard Worker; The memory layout is:
87*9880d681SAndroid Build Coastguard Worker; LSB 0 1 2 3 | 4 5 | 6 7 MSB
88*9880d681SAndroid Build Coastguard Worker;      Low            High
89*9880d681SAndroid Build Coastguard Worker; The base address points to 0 and is 8-bytes aligned.
90*9880d681SAndroid Build Coastguard Worker; Low slice starts at 0 (base) and is 8-bytes aligned.
91*9880d681SAndroid Build Coastguard Worker; High slice starts at 6 (base + 6-bytes) and is 2-bytes aligned.
92*9880d681SAndroid Build Coastguard Worker;
93*9880d681SAndroid Build Coastguard Worker; STRESS-LABEL: t2:
94*9880d681SAndroid Build Coastguard Worker; STRESS: movzwl 6([[BASE:[^)]+]]), %eax
95*9880d681SAndroid Build Coastguard Worker; STRESS-NEXT: addl ([[BASE]]), %eax
96*9880d681SAndroid Build Coastguard Worker; STRESS-NEXT: ret
97*9880d681SAndroid Build Coastguard Worker;
98*9880d681SAndroid Build Coastguard Worker; For the REGULAR heuristic, it is not profitable to slice things that are not
99*9880d681SAndroid Build Coastguard Worker; next to each other in memory. Here we have a hole at bytes #4-5.
100*9880d681SAndroid Build Coastguard Worker; REGULAR-LABEL: t2:
101*9880d681SAndroid Build Coastguard Worker; REGULAR: shrq $48
; t2: loads out[out_start] as one i64 and returns the sum of two pieces of it:
; bits 0..31, and bits 48..63 zero-extended to 32 bits. Bits 32..47 are never used.
102*9880d681SAndroid Build Coastguard Workerdefine i32 @t2(%class.Complex* nocapture %out, i64 %out_start) {
  ; Address of out[out_start], then load the whole 8-byte element as a single i64.
103*9880d681SAndroid Build Coastguard Worker  %arrayidx = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %out_start
104*9880d681SAndroid Build Coastguard Worker  %bitcast = bitcast %class.Complex* %arrayidx to i64*
105*9880d681SAndroid Build Coastguard Worker  %chunk64 = load i64, i64* %bitcast, align 8
  ; Low slice: bits 0..31.
106*9880d681SAndroid Build Coastguard Worker  %slice32_low = trunc i64 %chunk64 to i32
  ; High slice: the logical shift leaves only bits 48..63, so the i32 result has its
  ; upper 16 bits clear (a zero-extended 16-bit value).
107*9880d681SAndroid Build Coastguard Worker  %shift48 = lshr i64 %chunk64, 48
108*9880d681SAndroid Build Coastguard Worker  %slice32_high = trunc i64 %shift48 to i32
109*9880d681SAndroid Build Coastguard Worker  %res = add i32 %slice32_high, %slice32_low
110*9880d681SAndroid Build Coastguard Worker  ret i32 %res
111*9880d681SAndroid Build Coastguard Worker}
112*9880d681SAndroid Build Coastguard Worker
113*9880d681SAndroid Build Coastguard Worker; Check that we do not optimize overlapping slices.
114*9880d681SAndroid Build Coastguard Worker;
115*9880d681SAndroid Build Coastguard Worker; The 64-bit load should NOT have been split, as the slices overlap.
116*9880d681SAndroid Build Coastguard Worker; First slice uses bytes numbered 0 to 3.
117*9880d681SAndroid Build Coastguard Worker; Second slice uses bytes numbered 6 and 7.
118*9880d681SAndroid Build Coastguard Worker; Third slice uses bytes numbered 4 to 7.
119*9880d681SAndroid Build Coastguard Worker;
120*9880d681SAndroid Build Coastguard Worker; STRESS-LABEL: t3:
121*9880d681SAndroid Build Coastguard Worker; STRESS: shrq $48
122*9880d681SAndroid Build Coastguard Worker; STRESS: shrq $32
123*9880d681SAndroid Build Coastguard Worker;
124*9880d681SAndroid Build Coastguard Worker; REGULAR-LABEL: t3:
125*9880d681SAndroid Build Coastguard Worker; REGULAR: shrq $48
126*9880d681SAndroid Build Coastguard Worker; REGULAR: shrq $32
; t3: loads out[out_start] as one i64 and returns the sum of three pieces of it:
; bits 0..31, bits 48..63 (zero-extended), and bits 32..63. The last two pieces
; share bits 48..63, i.e. they overlap.
127*9880d681SAndroid Build Coastguard Workerdefine i32 @t3(%class.Complex* nocapture %out, i64 %out_start) {
  ; Address of out[out_start], then load the whole 8-byte element as a single i64.
128*9880d681SAndroid Build Coastguard Worker  %arrayidx = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %out_start
129*9880d681SAndroid Build Coastguard Worker  %bitcast = bitcast %class.Complex* %arrayidx to i64*
130*9880d681SAndroid Build Coastguard Worker  %chunk64 = load i64, i64* %bitcast, align 8
  ; First piece: bits 0..31.
131*9880d681SAndroid Build Coastguard Worker  %slice32_low = trunc i64 %chunk64 to i32
  ; Second piece: bits 48..63, zero-extended by the logical shift.
132*9880d681SAndroid Build Coastguard Worker  %shift48 = lshr i64 %chunk64, 48
133*9880d681SAndroid Build Coastguard Worker  %slice32_high = trunc i64 %shift48 to i32
  ; Third piece: bits 32..63, which includes the bits used by the second piece.
134*9880d681SAndroid Build Coastguard Worker  %shift32 = lshr i64 %chunk64, 32
135*9880d681SAndroid Build Coastguard Worker  %slice32_lowhigh = trunc i64 %shift32 to i32
136*9880d681SAndroid Build Coastguard Worker  %tmpres = add i32 %slice32_high, %slice32_low
137*9880d681SAndroid Build Coastguard Worker  %res = add i32 %slice32_lowhigh, %tmpres
138*9880d681SAndroid Build Coastguard Worker  ret i32 %res
139*9880d681SAndroid Build Coastguard Worker}
140