; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,+slow-unaligned-mem-32 | FileCheck %s --check-prefix=AVXSLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,-slow-unaligned-mem-32 | FileCheck %s --check-prefix=AVXFAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=AVX2

; This file checks how 32-byte vector loads and stores that are not known to be
; 32-byte aligned are lowered under three configurations: AVX with the
; slow-unaligned-mem-32 feature enabled, AVX with that feature disabled, and AVX2.

; Don't generate an unaligned 32-byte load on this test if that is slower than two 16-byte loads.
; The load below is only 16-byte aligned, so the slow configuration is expected
; to use a 16-byte aligned load plus a 128-bit insert from memory.

define <8 x float> @load32bytes(<8 x float>* %Ap) {
; AVXSLOW-LABEL: load32bytes:
; AVXSLOW:       # BB#0:
; AVXSLOW-NEXT:    vmovaps (%rdi), %xmm0
; AVXSLOW-NEXT:    vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
; AVXSLOW-NEXT:    retq
;
; AVXFAST-LABEL: load32bytes:
; AVXFAST:       # BB#0:
; AVXFAST-NEXT:    vmovups (%rdi), %ymm0
; AVXFAST-NEXT:    retq
;
; AVX2-LABEL: load32bytes:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovups (%rdi), %ymm0
; AVX2-NEXT:    retq
  %A = load <8 x float>, <8 x float>* %Ap, align 16
  ret <8 x float> %A
}

; Don't generate an unaligned 32-byte store on this test if that is slower than two 16-byte stores.
; The store below is only 16-byte aligned, so the slow configuration is expected
; to extract the upper half to memory and store the lower half separately.

define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
; AVXSLOW-LABEL: store32bytes:
; AVXSLOW:       # BB#0:
; AVXSLOW-NEXT:    vextractf128 $1, %ymm0, 16(%rdi)
; AVXSLOW-NEXT:    vmovaps %xmm0, (%rdi)
; AVXSLOW-NEXT:    vzeroupper
; AVXSLOW-NEXT:    retq
;
; AVXFAST-LABEL: store32bytes:
; AVXFAST:       # BB#0:
; AVXFAST-NEXT:    vmovups %ymm0, (%rdi)
; AVXFAST-NEXT:    vzeroupper
; AVXFAST-NEXT:    retq
;
; AVX2-LABEL: store32bytes:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovups %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  store <8 x float> %A, <8 x float>* %P, align 16
  ret void
}

; Merge two consecutive 16-byte subvector loads into a single 32-byte load if it's faster.
; Elements 3 and 4 of a <4 x float> array are at byte offsets 48 and 64.

define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
; AVXSLOW-LABEL: combine_16_byte_loads_no_intrinsic:
; AVXSLOW:       # BB#0:
; AVXSLOW-NEXT:    vmovups 48(%rdi), %xmm0
; AVXSLOW-NEXT:    vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
; AVXSLOW-NEXT:    retq
;
; AVXFAST-LABEL: combine_16_byte_loads_no_intrinsic:
; AVXFAST:       # BB#0:
; AVXFAST-NEXT:    vmovups 48(%rdi), %ymm0
; AVXFAST-NEXT:    retq
;
; AVX2-LABEL: combine_16_byte_loads_no_intrinsic:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovups 48(%rdi), %ymm0
; AVX2-NEXT:    retq
  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
  %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %v3
}

; If the first load is 32-byte aligned, then the loads should be merged in all cases.
; Only the first load carries the 32-byte alignment; the second is still align 1.

define <8 x float> @combine_16_byte_loads_aligned(<4 x float>* %ptr) {
; AVXSLOW-LABEL: combine_16_byte_loads_aligned:
; AVXSLOW:       # BB#0:
; AVXSLOW-NEXT:    vmovaps 48(%rdi), %ymm0
; AVXSLOW-NEXT:    retq
;
; AVXFAST-LABEL: combine_16_byte_loads_aligned:
; AVXFAST:       # BB#0:
; AVXFAST-NEXT:    vmovaps 48(%rdi), %ymm0
; AVXFAST-NEXT:    retq
;
; AVX2-LABEL: combine_16_byte_loads_aligned:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps 48(%rdi), %ymm0
; AVX2-NEXT:    retq
  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
  %v1 = load <4 x float>, <4 x float>* %ptr1, align 32
  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
  %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %v3
}

; Swap the order of the shufflevector operands to ensure that the pattern still matches.
; With operands (%v2, %v1), mask indices 4-7 select %v1 and 0-3 select %v2, so
; the result is still %v1 followed by %v2.

define <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) {
; AVXSLOW-LABEL: combine_16_byte_loads_no_intrinsic_swap:
; AVXSLOW:       # BB#0:
; AVXSLOW-NEXT:    vmovups 64(%rdi), %xmm0
; AVXSLOW-NEXT:    vinsertf128 $1, 80(%rdi), %ymm0, %ymm0
; AVXSLOW-NEXT:    retq
;
; AVXFAST-LABEL: combine_16_byte_loads_no_intrinsic_swap:
; AVXFAST:       # BB#0:
; AVXFAST-NEXT:    vmovups 64(%rdi), %ymm0
; AVXFAST-NEXT:    retq
;
; AVX2-LABEL: combine_16_byte_loads_no_intrinsic_swap:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovups 64(%rdi), %ymm0
; AVX2-NEXT:    retq
  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
  %v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %v3
}

; Check each element type other than float to make sure it is handled correctly.
; Use the loaded values with an 'add' to make sure we're using the correct load type.
; Don't generate 32-byte loads for integer ops unless we have AVX2.

define <4 x i64> @combine_16_byte_loads_i64(<2 x i64>* %ptr, <4 x i64> %x) {
; AVXSLOW-LABEL: combine_16_byte_loads_i64:
; AVXSLOW:       # BB#0:
; AVXSLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVXSLOW-NEXT:    vpaddq 96(%rdi), %xmm1, %xmm1
; AVXSLOW-NEXT:    vpaddq 80(%rdi), %xmm0, %xmm0
; AVXSLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXSLOW-NEXT:    retq
;
; AVXFAST-LABEL: combine_16_byte_loads_i64:
; AVXFAST:       # BB#0:
; AVXFAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVXFAST-NEXT:    vpaddq 96(%rdi), %xmm1, %xmm1
; AVXFAST-NEXT:    vpaddq 80(%rdi), %xmm0, %xmm0
; AVXFAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXFAST-NEXT:    retq
;
; AVX2-LABEL: combine_16_byte_loads_i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpaddq 80(%rdi), %ymm0, %ymm0
; AVX2-NEXT:    retq
  %ptr1 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 5
  %ptr2 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 6
  %v1 = load <2 x i64>, <2 x i64>* %ptr1, align 1
  %v2 = load <2 x i64>, <2 x i64>* %ptr2, align 1
  %v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v4 = add <4 x i64> %v3, %x
  ret <4 x i64> %v4
}

define <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) {
; AVXSLOW-LABEL: combine_16_byte_loads_i32:
; AVXSLOW:       # BB#0:
; AVXSLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVXSLOW-NEXT:    vpaddd 112(%rdi), %xmm1, %xmm1
; AVXSLOW-NEXT:    vpaddd 96(%rdi), %xmm0, %xmm0
; AVXSLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXSLOW-NEXT:    retq
;
; AVXFAST-LABEL: combine_16_byte_loads_i32:
; AVXFAST:       # BB#0:
; AVXFAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVXFAST-NEXT:    vpaddd 112(%rdi), %xmm1, %xmm1
; AVXFAST-NEXT:    vpaddd 96(%rdi), %xmm0, %xmm0
; AVXFAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXFAST-NEXT:    retq
;
; AVX2-LABEL: combine_16_byte_loads_i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpaddd 96(%rdi), %ymm0, %ymm0
; AVX2-NEXT:    retq
  %ptr1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 6
  %ptr2 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 7
  %v1 = load <4 x i32>, <4 x i32>* %ptr1, align 1
  %v2 = load <4 x i32>, <4 x i32>* %ptr2, align 1
  %v3 = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %v4 = add <8 x i32> %v3, %x
  ret <8 x i32> %v4
}

define <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) {
; AVXSLOW-LABEL: combine_16_byte_loads_i16:
; AVXSLOW:       # BB#0:
; AVXSLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVXSLOW-NEXT:    vpaddw 128(%rdi), %xmm1, %xmm1
; AVXSLOW-NEXT:    vpaddw 112(%rdi), %xmm0, %xmm0
; AVXSLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXSLOW-NEXT:    retq
;
; AVXFAST-LABEL: combine_16_byte_loads_i16:
; AVXFAST:       # BB#0:
; AVXFAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVXFAST-NEXT:    vpaddw 128(%rdi), %xmm1, %xmm1
; AVXFAST-NEXT:    vpaddw 112(%rdi), %xmm0, %xmm0
; AVXFAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXFAST-NEXT:    retq
;
; AVX2-LABEL: combine_16_byte_loads_i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpaddw 112(%rdi), %ymm0, %ymm0
; AVX2-NEXT:    retq
  %ptr1 = getelementptr inbounds <8 x i16>, <8 x i16>* %ptr, i64 7
  %ptr2 = getelementptr inbounds <8 x i16>, <8 x i16>* %ptr, i64 8
  %v1 = load <8 x i16>, <8 x i16>* %ptr1, align 1
  %v2 = load <8 x i16>, <8 x i16>* %ptr2, align 1
  %v3 = shufflevector <8 x i16> %v1, <8 x i16> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %v4 = add <16 x i16> %v3, %x
  ret <16 x i16> %v4
}

define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) {
; AVXSLOW-LABEL: combine_16_byte_loads_i8:
; AVXSLOW:       # BB#0:
; AVXSLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVXSLOW-NEXT:    vpaddb 144(%rdi), %xmm1, %xmm1
; AVXSLOW-NEXT:    vpaddb 128(%rdi), %xmm0, %xmm0
; AVXSLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXSLOW-NEXT:    retq
;
; AVXFAST-LABEL: combine_16_byte_loads_i8:
; AVXFAST:       # BB#0:
; AVXFAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVXFAST-NEXT:    vpaddb 144(%rdi), %xmm1, %xmm1
; AVXFAST-NEXT:    vpaddb 128(%rdi), %xmm0, %xmm0
; AVXFAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXFAST-NEXT:    retq
;
; AVX2-LABEL: combine_16_byte_loads_i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpaddb 128(%rdi), %ymm0, %ymm0
; AVX2-NEXT:    retq
  %ptr1 = getelementptr inbounds <16 x i8>, <16 x i8>* %ptr, i64 8
  %ptr2 = getelementptr inbounds <16 x i8>, <16 x i8>* %ptr, i64 9
  %v1 = load <16 x i8>, <16 x i8>* %ptr1, align 1
  %v2 = load <16 x i8>, <16 x i8>* %ptr2, align 1
  %v3 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %v4 = add <32 x i8> %v3, %x
  ret <32 x i8> %v4
}

; Double-precision variant, using 'fadd' as the consumer of the merged value.
; Unlike the integer cases above, the fast-unaligned AVX configuration is
; expected to fold a single 32-byte load into the add.

define <4 x double> @combine_16_byte_loads_double(<2 x double>* %ptr, <4 x double> %x) {
; AVXSLOW-LABEL: combine_16_byte_loads_double:
; AVXSLOW:       # BB#0:
; AVXSLOW-NEXT:    vmovupd 144(%rdi), %xmm1
; AVXSLOW-NEXT:    vinsertf128 $1, 160(%rdi), %ymm1, %ymm1
; AVXSLOW-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
; AVXSLOW-NEXT:    retq
;
; AVXFAST-LABEL: combine_16_byte_loads_double:
; AVXFAST:       # BB#0:
; AVXFAST-NEXT:    vaddpd 144(%rdi), %ymm0, %ymm0
; AVXFAST-NEXT:    retq
;
; AVX2-LABEL: combine_16_byte_loads_double:
; AVX2:       # BB#0:
; AVX2-NEXT:    vaddpd 144(%rdi), %ymm0, %ymm0
; AVX2-NEXT:    retq
  %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 9
  %ptr2 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 10
  %v1 = load <2 x double>, <2 x double>* %ptr1, align 1
  %v2 = load <2 x double>, <2 x double>* %ptr2, align 1
  %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v4 = fadd <4 x double> %v3, %x
  ret <4 x double> %v4
}