xref: /aosp_15_r20/external/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll (revision 9880d6810fe72a1726cb53787c6711e909410d58)
1*9880d681SAndroid Build Coastguard Worker; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2*9880d681SAndroid Build Coastguard Worker; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,+slow-unaligned-mem-32 | FileCheck %s --check-prefix=AVXSLOW
3*9880d681SAndroid Build Coastguard Worker; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,-slow-unaligned-mem-32 | FileCheck %s --check-prefix=AVXFAST
4*9880d681SAndroid Build Coastguard Worker; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=AVX2
5*9880d681SAndroid Build Coastguard Worker
6*9880d681SAndroid Build Coastguard Worker; Don't generate an unaligned 32-byte load on this test if that is slower than two 16-byte loads.
7*9880d681SAndroid Build Coastguard Worker
8*9880d681SAndroid Build Coastguard Workerdefine <8 x float> @load32bytes(<8 x float>* %Ap) {
9*9880d681SAndroid Build Coastguard Worker; AVXSLOW-LABEL: load32bytes:
10*9880d681SAndroid Build Coastguard Worker; AVXSLOW:       # BB#0:
11*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vmovaps (%rdi), %xmm0
12*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
13*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    retq
14*9880d681SAndroid Build Coastguard Worker;
15*9880d681SAndroid Build Coastguard Worker; AVXFAST-LABEL: load32bytes:
16*9880d681SAndroid Build Coastguard Worker; AVXFAST:       # BB#0:
17*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    vmovups (%rdi), %ymm0
18*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    retq
19*9880d681SAndroid Build Coastguard Worker;
20*9880d681SAndroid Build Coastguard Worker; AVX2-LABEL: load32bytes:
21*9880d681SAndroid Build Coastguard Worker; AVX2:       # BB#0:
22*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vmovups (%rdi), %ymm0
23*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    retq
; Test input - a single <8 x float> (32-byte) load that is only declared
; 16-byte aligned. The slow-unaligned run expects it to be split into a 16-byte
; vmovaps plus a vinsertf128 from offset 16; the other two runs expect one
; 32-byte vmovups.
24*9880d681SAndroid Build Coastguard Worker  %A = load <8 x float>, <8 x float>* %Ap, align 16
25*9880d681SAndroid Build Coastguard Worker  ret <8 x float> %A
26*9880d681SAndroid Build Coastguard Worker}
27*9880d681SAndroid Build Coastguard Worker
28*9880d681SAndroid Build Coastguard Worker; Don't generate an unaligned 32-byte store on this test if that is slower than two 16-byte stores.
29*9880d681SAndroid Build Coastguard Worker
30*9880d681SAndroid Build Coastguard Workerdefine void @store32bytes(<8 x float> %A, <8 x float>* %P) {
31*9880d681SAndroid Build Coastguard Worker; AVXSLOW-LABEL: store32bytes:
32*9880d681SAndroid Build Coastguard Worker; AVXSLOW:       # BB#0:
33*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vextractf128 $1, %ymm0, 16(%rdi)
34*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vmovaps %xmm0, (%rdi)
35*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vzeroupper
36*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    retq
37*9880d681SAndroid Build Coastguard Worker;
38*9880d681SAndroid Build Coastguard Worker; AVXFAST-LABEL: store32bytes:
39*9880d681SAndroid Build Coastguard Worker; AVXFAST:       # BB#0:
40*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    vmovups %ymm0, (%rdi)
41*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    vzeroupper
42*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    retq
43*9880d681SAndroid Build Coastguard Worker;
44*9880d681SAndroid Build Coastguard Worker; AVX2-LABEL: store32bytes:
45*9880d681SAndroid Build Coastguard Worker; AVX2:       # BB#0:
46*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vmovups %ymm0, (%rdi)
47*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vzeroupper
48*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    retq
; Test input - a single <8 x float> (32-byte) store that is only declared
; 16-byte aligned. The slow-unaligned run expects the upper half to be written
; with vextractf128 to offset 16 and the lower half with a 16-byte vmovaps; the
; other two runs expect one 32-byte vmovups.
49*9880d681SAndroid Build Coastguard Worker  store <8 x float> %A, <8 x float>* %P, align 16
50*9880d681SAndroid Build Coastguard Worker  ret void
51*9880d681SAndroid Build Coastguard Worker}
52*9880d681SAndroid Build Coastguard Worker
53*9880d681SAndroid Build Coastguard Worker; Merge two consecutive 16-byte subvector loads into a single 32-byte load if it's faster.
54*9880d681SAndroid Build Coastguard Worker
55*9880d681SAndroid Build Coastguard Workerdefine <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
56*9880d681SAndroid Build Coastguard Worker; AVXSLOW-LABEL: combine_16_byte_loads_no_intrinsic:
57*9880d681SAndroid Build Coastguard Worker; AVXSLOW:       # BB#0:
58*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vmovups 48(%rdi), %xmm0
59*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
60*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    retq
61*9880d681SAndroid Build Coastguard Worker;
62*9880d681SAndroid Build Coastguard Worker; AVXFAST-LABEL: combine_16_byte_loads_no_intrinsic:
63*9880d681SAndroid Build Coastguard Worker; AVXFAST:       # BB#0:
64*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    vmovups 48(%rdi), %ymm0
65*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    retq
66*9880d681SAndroid Build Coastguard Worker;
67*9880d681SAndroid Build Coastguard Worker; AVX2-LABEL: combine_16_byte_loads_no_intrinsic:
68*9880d681SAndroid Build Coastguard Worker; AVX2:       # BB#0:
69*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vmovups 48(%rdi), %ymm0
70*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    retq
; Test input - two adjacent <4 x float> loads (elements 3 and 4 of %ptr, i.e.
; byte offsets 48 and 64 in the expected assembly), both align 1, concatenated
; by a shufflevector with the identity mask 0..7. The slow-unaligned run expects
; them to stay as two 16-byte accesses; the other runs expect one 32-byte vmovups.
71*9880d681SAndroid Build Coastguard Worker  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
72*9880d681SAndroid Build Coastguard Worker  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
73*9880d681SAndroid Build Coastguard Worker  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
74*9880d681SAndroid Build Coastguard Worker  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
75*9880d681SAndroid Build Coastguard Worker  %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
76*9880d681SAndroid Build Coastguard Worker  ret <8 x float> %v3
77*9880d681SAndroid Build Coastguard Worker}
78*9880d681SAndroid Build Coastguard Worker
79*9880d681SAndroid Build Coastguard Worker; If the first load is 32-byte aligned, then the loads should be merged in all cases.
80*9880d681SAndroid Build Coastguard Worker
81*9880d681SAndroid Build Coastguard Workerdefine <8 x float> @combine_16_byte_loads_aligned(<4 x float>* %ptr) {
82*9880d681SAndroid Build Coastguard Worker; AVXSLOW-LABEL: combine_16_byte_loads_aligned:
83*9880d681SAndroid Build Coastguard Worker; AVXSLOW:       # BB#0:
84*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vmovaps 48(%rdi), %ymm0
85*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    retq
86*9880d681SAndroid Build Coastguard Worker;
87*9880d681SAndroid Build Coastguard Worker; AVXFAST-LABEL: combine_16_byte_loads_aligned:
88*9880d681SAndroid Build Coastguard Worker; AVXFAST:       # BB#0:
89*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    vmovaps 48(%rdi), %ymm0
90*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    retq
91*9880d681SAndroid Build Coastguard Worker;
92*9880d681SAndroid Build Coastguard Worker; AVX2-LABEL: combine_16_byte_loads_aligned:
93*9880d681SAndroid Build Coastguard Worker; AVX2:       # BB#0:
94*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vmovaps 48(%rdi), %ymm0
95*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    retq
; Same two-load concatenation as the unaligned variant, except that the first
; load is marked align 32 (the second stays align 1). All three runs expect a
; single aligned 32-byte vmovaps from offset 48, including the slow-unaligned run.
96*9880d681SAndroid Build Coastguard Worker  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
97*9880d681SAndroid Build Coastguard Worker  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
98*9880d681SAndroid Build Coastguard Worker  %v1 = load <4 x float>, <4 x float>* %ptr1, align 32
99*9880d681SAndroid Build Coastguard Worker  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
100*9880d681SAndroid Build Coastguard Worker  %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
101*9880d681SAndroid Build Coastguard Worker  ret <8 x float> %v3
102*9880d681SAndroid Build Coastguard Worker}
103*9880d681SAndroid Build Coastguard Worker
104*9880d681SAndroid Build Coastguard Worker; Swap the order of the shufflevector operands to ensure that the pattern still matches.
105*9880d681SAndroid Build Coastguard Worker
106*9880d681SAndroid Build Coastguard Workerdefine <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) {
107*9880d681SAndroid Build Coastguard Worker; AVXSLOW-LABEL: combine_16_byte_loads_no_intrinsic_swap:
108*9880d681SAndroid Build Coastguard Worker; AVXSLOW:       # BB#0:
109*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vmovups 64(%rdi), %xmm0
110*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vinsertf128 $1, 80(%rdi), %ymm0, %ymm0
111*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    retq
112*9880d681SAndroid Build Coastguard Worker;
113*9880d681SAndroid Build Coastguard Worker; AVXFAST-LABEL: combine_16_byte_loads_no_intrinsic_swap:
114*9880d681SAndroid Build Coastguard Worker; AVXFAST:       # BB#0:
115*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    vmovups 64(%rdi), %ymm0
116*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    retq
117*9880d681SAndroid Build Coastguard Worker;
118*9880d681SAndroid Build Coastguard Worker; AVX2-LABEL: combine_16_byte_loads_no_intrinsic_swap:
119*9880d681SAndroid Build Coastguard Worker; AVX2:       # BB#0:
120*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vmovups 64(%rdi), %ymm0
121*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    retq
; Test input - two adjacent <4 x float> loads (elements 4 and 5 of %ptr, i.e.
; byte offsets 64 and 80 in the expected assembly), both align 1. The
; shufflevector takes its operands in the order (%v2, %v1) and its mask selects
; lanes 4..7 before lanes 0..3, so the result is still %v1 followed by %v2.
122*9880d681SAndroid Build Coastguard Worker  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
123*9880d681SAndroid Build Coastguard Worker  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
124*9880d681SAndroid Build Coastguard Worker  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
125*9880d681SAndroid Build Coastguard Worker  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
126*9880d681SAndroid Build Coastguard Worker  %v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
127*9880d681SAndroid Build Coastguard Worker  ret <8 x float> %v3
128*9880d681SAndroid Build Coastguard Worker}
129*9880d681SAndroid Build Coastguard Worker
130*9880d681SAndroid Build Coastguard Worker; Check each element type other than float to make sure it is handled correctly.
131*9880d681SAndroid Build Coastguard Worker; Use the loaded values with an 'add' to make sure we're using the correct load type.
132*9880d681SAndroid Build Coastguard Worker; Don't generate 32-byte loads for integer ops unless we have AVX2.
133*9880d681SAndroid Build Coastguard Worker
134*9880d681SAndroid Build Coastguard Workerdefine <4 x i64> @combine_16_byte_loads_i64(<2 x i64>* %ptr, <4 x i64> %x) {
135*9880d681SAndroid Build Coastguard Worker; AVXSLOW-LABEL: combine_16_byte_loads_i64:
136*9880d681SAndroid Build Coastguard Worker; AVXSLOW:       # BB#0:
137*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
138*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vpaddq 96(%rdi), %xmm1, %xmm1
139*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vpaddq 80(%rdi), %xmm0, %xmm0
140*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
141*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    retq
142*9880d681SAndroid Build Coastguard Worker;
143*9880d681SAndroid Build Coastguard Worker; AVXFAST-LABEL: combine_16_byte_loads_i64:
144*9880d681SAndroid Build Coastguard Worker; AVXFAST:       # BB#0:
145*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
146*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    vpaddq 96(%rdi), %xmm1, %xmm1
147*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    vpaddq 80(%rdi), %xmm0, %xmm0
148*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
149*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    retq
150*9880d681SAndroid Build Coastguard Worker;
151*9880d681SAndroid Build Coastguard Worker; AVX2-LABEL: combine_16_byte_loads_i64:
152*9880d681SAndroid Build Coastguard Worker; AVX2:       # BB#0:
153*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddq 80(%rdi), %ymm0, %ymm0
154*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    retq
; Test input - two adjacent <2 x i64> loads (elements 5 and 6 of %ptr, i.e.
; byte offsets 80 and 96 in the expected assembly), both align 1, concatenated
; and then added to %x. Both runs without AVX2 expect two 128-bit vpaddq
; instructions with memory operands; the AVX2 run expects one 256-bit vpaddq
; that folds a 32-byte load from offset 80.
155*9880d681SAndroid Build Coastguard Worker  %ptr1 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 5
156*9880d681SAndroid Build Coastguard Worker  %ptr2 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 6
157*9880d681SAndroid Build Coastguard Worker  %v1 = load <2 x i64>, <2 x i64>* %ptr1, align 1
158*9880d681SAndroid Build Coastguard Worker  %v2 = load <2 x i64>, <2 x i64>* %ptr2, align 1
159*9880d681SAndroid Build Coastguard Worker  %v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
160*9880d681SAndroid Build Coastguard Worker  %v4 = add <4 x i64> %v3, %x
161*9880d681SAndroid Build Coastguard Worker  ret <4 x i64> %v4
162*9880d681SAndroid Build Coastguard Worker}
163*9880d681SAndroid Build Coastguard Worker
164*9880d681SAndroid Build Coastguard Workerdefine <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) {
165*9880d681SAndroid Build Coastguard Worker; AVXSLOW-LABEL: combine_16_byte_loads_i32:
166*9880d681SAndroid Build Coastguard Worker; AVXSLOW:       # BB#0:
167*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
168*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vpaddd 112(%rdi), %xmm1, %xmm1
169*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vpaddd 96(%rdi), %xmm0, %xmm0
170*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
171*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    retq
172*9880d681SAndroid Build Coastguard Worker;
173*9880d681SAndroid Build Coastguard Worker; AVXFAST-LABEL: combine_16_byte_loads_i32:
174*9880d681SAndroid Build Coastguard Worker; AVXFAST:       # BB#0:
175*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
176*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    vpaddd 112(%rdi), %xmm1, %xmm1
177*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    vpaddd 96(%rdi), %xmm0, %xmm0
178*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
179*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    retq
180*9880d681SAndroid Build Coastguard Worker;
181*9880d681SAndroid Build Coastguard Worker; AVX2-LABEL: combine_16_byte_loads_i32:
182*9880d681SAndroid Build Coastguard Worker; AVX2:       # BB#0:
183*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddd 96(%rdi), %ymm0, %ymm0
184*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    retq
; Test input - two adjacent <4 x i32> loads (elements 6 and 7 of %ptr, i.e.
; byte offsets 96 and 112 in the expected assembly), both align 1, concatenated
; and then added to %x. Both runs without AVX2 expect two 128-bit vpaddd
; instructions with memory operands; the AVX2 run expects one 256-bit vpaddd
; that folds a 32-byte load from offset 96.
185*9880d681SAndroid Build Coastguard Worker  %ptr1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 6
186*9880d681SAndroid Build Coastguard Worker  %ptr2 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 7
187*9880d681SAndroid Build Coastguard Worker  %v1 = load <4 x i32>, <4 x i32>* %ptr1, align 1
188*9880d681SAndroid Build Coastguard Worker  %v2 = load <4 x i32>, <4 x i32>* %ptr2, align 1
189*9880d681SAndroid Build Coastguard Worker  %v3 = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
190*9880d681SAndroid Build Coastguard Worker  %v4 = add <8 x i32> %v3, %x
191*9880d681SAndroid Build Coastguard Worker  ret <8 x i32> %v4
192*9880d681SAndroid Build Coastguard Worker}
193*9880d681SAndroid Build Coastguard Worker
194*9880d681SAndroid Build Coastguard Workerdefine <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) {
195*9880d681SAndroid Build Coastguard Worker; AVXSLOW-LABEL: combine_16_byte_loads_i16:
196*9880d681SAndroid Build Coastguard Worker; AVXSLOW:       # BB#0:
197*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
198*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vpaddw 128(%rdi), %xmm1, %xmm1
199*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vpaddw 112(%rdi), %xmm0, %xmm0
200*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
201*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    retq
202*9880d681SAndroid Build Coastguard Worker;
203*9880d681SAndroid Build Coastguard Worker; AVXFAST-LABEL: combine_16_byte_loads_i16:
204*9880d681SAndroid Build Coastguard Worker; AVXFAST:       # BB#0:
205*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
206*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    vpaddw 128(%rdi), %xmm1, %xmm1
207*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    vpaddw 112(%rdi), %xmm0, %xmm0
208*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
209*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    retq
210*9880d681SAndroid Build Coastguard Worker;
211*9880d681SAndroid Build Coastguard Worker; AVX2-LABEL: combine_16_byte_loads_i16:
212*9880d681SAndroid Build Coastguard Worker; AVX2:       # BB#0:
213*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddw 112(%rdi), %ymm0, %ymm0
214*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    retq
; Test input - two adjacent <8 x i16> loads (elements 7 and 8 of %ptr, i.e.
; byte offsets 112 and 128 in the expected assembly), both align 1, concatenated
; and then added to %x. Both runs without AVX2 expect two 128-bit vpaddw
; instructions with memory operands; the AVX2 run expects one 256-bit vpaddw
; that folds a 32-byte load from offset 112.
215*9880d681SAndroid Build Coastguard Worker  %ptr1 = getelementptr inbounds <8 x i16>, <8 x i16>* %ptr, i64 7
216*9880d681SAndroid Build Coastguard Worker  %ptr2 = getelementptr inbounds <8 x i16>, <8 x i16>* %ptr, i64 8
217*9880d681SAndroid Build Coastguard Worker  %v1 = load <8 x i16>, <8 x i16>* %ptr1, align 1
218*9880d681SAndroid Build Coastguard Worker  %v2 = load <8 x i16>, <8 x i16>* %ptr2, align 1
219*9880d681SAndroid Build Coastguard Worker  %v3 = shufflevector <8 x i16> %v1, <8 x i16> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
220*9880d681SAndroid Build Coastguard Worker  %v4 = add <16 x i16> %v3, %x
221*9880d681SAndroid Build Coastguard Worker  ret <16 x i16> %v4
222*9880d681SAndroid Build Coastguard Worker}
223*9880d681SAndroid Build Coastguard Worker
224*9880d681SAndroid Build Coastguard Workerdefine <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) {
225*9880d681SAndroid Build Coastguard Worker; AVXSLOW-LABEL: combine_16_byte_loads_i8:
226*9880d681SAndroid Build Coastguard Worker; AVXSLOW:       # BB#0:
227*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
228*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vpaddb 144(%rdi), %xmm1, %xmm1
229*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vpaddb 128(%rdi), %xmm0, %xmm0
230*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
231*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    retq
232*9880d681SAndroid Build Coastguard Worker;
233*9880d681SAndroid Build Coastguard Worker; AVXFAST-LABEL: combine_16_byte_loads_i8:
234*9880d681SAndroid Build Coastguard Worker; AVXFAST:       # BB#0:
235*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
236*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    vpaddb 144(%rdi), %xmm1, %xmm1
237*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    vpaddb 128(%rdi), %xmm0, %xmm0
238*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
239*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    retq
240*9880d681SAndroid Build Coastguard Worker;
241*9880d681SAndroid Build Coastguard Worker; AVX2-LABEL: combine_16_byte_loads_i8:
242*9880d681SAndroid Build Coastguard Worker; AVX2:       # BB#0:
243*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vpaddb 128(%rdi), %ymm0, %ymm0
244*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    retq
; Test input - two adjacent <16 x i8> loads (elements 8 and 9 of %ptr, i.e.
; byte offsets 128 and 144 in the expected assembly), both align 1, concatenated
; and then added to %x. Both runs without AVX2 expect two 128-bit vpaddb
; instructions with memory operands; the AVX2 run expects one 256-bit vpaddb
; that folds a 32-byte load from offset 128.
245*9880d681SAndroid Build Coastguard Worker  %ptr1 = getelementptr inbounds <16 x i8>, <16 x i8>* %ptr, i64 8
246*9880d681SAndroid Build Coastguard Worker  %ptr2 = getelementptr inbounds <16 x i8>, <16 x i8>* %ptr, i64 9
247*9880d681SAndroid Build Coastguard Worker  %v1 = load <16 x i8>, <16 x i8>* %ptr1, align 1
248*9880d681SAndroid Build Coastguard Worker  %v2 = load <16 x i8>, <16 x i8>* %ptr2, align 1
249*9880d681SAndroid Build Coastguard Worker  %v3 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
250*9880d681SAndroid Build Coastguard Worker  %v4 = add <32 x i8> %v3, %x
251*9880d681SAndroid Build Coastguard Worker  ret <32 x i8> %v4
252*9880d681SAndroid Build Coastguard Worker}
253*9880d681SAndroid Build Coastguard Worker
254*9880d681SAndroid Build Coastguard Workerdefine <4 x double> @combine_16_byte_loads_double(<2 x double>* %ptr, <4 x double> %x) {
255*9880d681SAndroid Build Coastguard Worker; AVXSLOW-LABEL: combine_16_byte_loads_double:
256*9880d681SAndroid Build Coastguard Worker; AVXSLOW:       # BB#0:
257*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vmovupd 144(%rdi), %xmm1
258*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vinsertf128 $1, 160(%rdi), %ymm1, %ymm1
259*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
260*9880d681SAndroid Build Coastguard Worker; AVXSLOW-NEXT:    retq
261*9880d681SAndroid Build Coastguard Worker;
262*9880d681SAndroid Build Coastguard Worker; AVXFAST-LABEL: combine_16_byte_loads_double:
263*9880d681SAndroid Build Coastguard Worker; AVXFAST:       # BB#0:
264*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    vaddpd 144(%rdi), %ymm0, %ymm0
265*9880d681SAndroid Build Coastguard Worker; AVXFAST-NEXT:    retq
266*9880d681SAndroid Build Coastguard Worker;
267*9880d681SAndroid Build Coastguard Worker; AVX2-LABEL: combine_16_byte_loads_double:
268*9880d681SAndroid Build Coastguard Worker; AVX2:       # BB#0:
269*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    vaddpd 144(%rdi), %ymm0, %ymm0
270*9880d681SAndroid Build Coastguard Worker; AVX2-NEXT:    retq
; Test input - two adjacent <2 x double> loads (elements 9 and 10 of %ptr, i.e.
; byte offsets 144 and 160 in the expected assembly), both align 1, concatenated
; and then combined with %x by fadd. The slow-unaligned run expects a 16-byte
; vmovupd plus vinsertf128 followed by a register-register vaddpd; the other
; runs expect a single vaddpd that folds a 32-byte load from offset 144.
271*9880d681SAndroid Build Coastguard Worker  %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 9
272*9880d681SAndroid Build Coastguard Worker  %ptr2 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 10
273*9880d681SAndroid Build Coastguard Worker  %v1 = load <2 x double>, <2 x double>* %ptr1, align 1
274*9880d681SAndroid Build Coastguard Worker  %v2 = load <2 x double>, <2 x double>* %ptr2, align 1
275*9880d681SAndroid Build Coastguard Worker  %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
276*9880d681SAndroid Build Coastguard Worker  %v4 = fadd <4 x double> %v3, %x
277*9880d681SAndroid Build Coastguard Worker  ret <4 x double> %v4
278*9880d681SAndroid Build Coastguard Worker}
279*9880d681SAndroid Build Coastguard Worker
280