; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=SSE --check-prefix=SSE4A
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=VLX

; Make sure that we generate non-temporal stores for the test cases below.
; We use xorps for zeroing, so domain information isn't available anymore.

; Scalar versions (zeroing means we can do this even for fp types).

define void @test_zero_f32(float* %dst) {
; SSE-LABEL: test_zero_f32:
; SSE: # BB#0:
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: movntil %eax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_f32:
; AVX: # BB#0:
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: movntil %eax, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_f32:
; VLX: # BB#0:
; VLX-NEXT: xorl %eax, %eax
; VLX-NEXT: movntil %eax, (%rdi)
; VLX-NEXT: retq
  store float zeroinitializer, float* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_i32(i32* %dst) {
; SSE-LABEL: test_zero_i32:
; SSE: # BB#0:
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: movntil %eax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_i32:
; AVX: # BB#0:
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: movntil %eax, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_i32:
; VLX: # BB#0:
; VLX-NEXT: xorl %eax, %eax
; VLX-NEXT: movntil %eax, (%rdi)
; VLX-NEXT: retq
  store i32 zeroinitializer, i32* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_f64(double* %dst) {
; SSE-LABEL: test_zero_f64:
; SSE: # BB#0:
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_f64:
; AVX: # BB#0:
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_f64:
; VLX: # BB#0:
; VLX-NEXT: xorl %eax, %eax
; VLX-NEXT: movntiq %rax, (%rdi)
; VLX-NEXT: retq
  store double zeroinitializer, double* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_i64(i64* %dst) {
; SSE-LABEL: test_zero_i64:
; SSE: # BB#0:
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_i64:
; AVX: # BB#0:
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_i64:
; VLX: # BB#0:
; VLX-NEXT: xorl %eax, %eax
; VLX-NEXT: movntiq %rax, (%rdi)
; VLX-NEXT: retq
  store i64 zeroinitializer, i64* %dst, align 1, !nontemporal !1
  ret void
}

; And now XMM versions.

define void @test_zero_v4f32(<4 x float>* %dst) {
; SSE-LABEL: test_zero_v4f32:
; SSE: # BB#0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v4f32:
; AVX: # BB#0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v4f32:
; VLX: # BB#0:
; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
  store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v4i32(<4 x i32>* %dst) {
; SSE-LABEL: test_zero_v4i32:
; SSE: # BB#0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v4i32:
; AVX: # BB#0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v4i32:
; VLX: # BB#0:
; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
  store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
  store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v2f64(<2 x double>* %dst) {
; SSE-LABEL: test_zero_v2f64:
; SSE: # BB#0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v2f64:
; AVX: # BB#0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v2f64:
; VLX: # BB#0:
; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
  store <2 x double> zeroinitializer, <2 x double>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v2i64(<2 x i64>* %dst) {
; SSE-LABEL: test_zero_v2i64:
; SSE: # BB#0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v2i64:
; AVX: # BB#0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v2i64:
; VLX: # BB#0:
; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
  store <2 x i64> zeroinitializer, <2 x i64>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v8i16(<8 x i16>* %dst) {
; SSE-LABEL: test_zero_v8i16:
; SSE: # BB#0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v8i16:
; AVX: # BB#0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v8i16:
; VLX: # BB#0:
; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
  store <8 x i16> zeroinitializer, <8 x i16>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v16i8(<16 x i8>* %dst) {
; SSE-LABEL: test_zero_v16i8:
; SSE: # BB#0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v16i8:
; AVX: # BB#0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v16i8:
; VLX: # BB#0:
; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
  store <16 x i8> zeroinitializer, <16 x i8>* %dst, align 16, !nontemporal !1
  ret void
}

; And now YMM versions.

define void @test_zero_v8f32(<8 x float>* %dst) {
; SSE-LABEL: test_zero_v8f32:
; SSE: # BB#0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v8f32:
; AVX: # BB#0:
; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v8f32:
; VLX: # BB#0:
; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: retq
  store <8 x float> zeroinitializer, <8 x float>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v8i32(<8 x i32>* %dst) {
; SSE-LABEL: test_zero_v8i32:
; SSE: # BB#0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v8i32:
; AVX: # BB#0:
; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v8i32:
; VLX: # BB#0:
; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: retq
  store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v4f64(<4 x double>* %dst) {
; SSE-LABEL: test_zero_v4f64:
; SSE: # BB#0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v4f64:
; AVX: # BB#0:
; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v4f64:
; VLX: # BB#0:
; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: retq
  store <4 x double> zeroinitializer, <4 x double>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v4i64(<4 x i64>* %dst) {
; SSE-LABEL: test_zero_v4i64:
; SSE: # BB#0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v4i64:
; AVX: # BB#0:
; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v4i64:
; VLX: # BB#0:
; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: retq
  store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v16i16(<16 x i16>* %dst) {
; SSE-LABEL: test_zero_v16i16:
; SSE: # BB#0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v16i16:
; AVX: # BB#0:
; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v16i16:
; VLX: # BB#0:
; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: retq
  store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v32i8(<32 x i8>* %dst) {
; SSE-LABEL: test_zero_v32i8:
; SSE: # BB#0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v32i8:
; AVX: # BB#0:
; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v32i8:
; VLX: # BB#0:
; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: retq
  store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 32, !nontemporal !1
  ret void
}


; Check that we also handle arguments. Here the type survives longer.

; Scalar versions.

define void @test_arg_f32(float %arg, float* %dst) {
; SSE2-LABEL: test_arg_f32:
; SSE2: # BB#0:
; SSE2-NEXT: movss %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_arg_f32:
; SSE4A: # BB#0:
; SSE4A-NEXT: movntss %xmm0, (%rdi)
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_arg_f32:
; SSE41: # BB#0:
; SSE41-NEXT: movss %xmm0, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: test_arg_f32:
; AVX: # BB#0:
; AVX-NEXT: vmovss %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_f32:
; VLX: # BB#0:
; VLX-NEXT: vmovss %xmm0, (%rdi)
; VLX-NEXT: retq
  store float %arg, float* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_arg_i32(i32 %arg, i32* %dst) {
; SSE-LABEL: test_arg_i32:
; SSE: # BB#0:
; SSE-NEXT: movntil %edi, (%rsi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_i32:
; AVX: # BB#0:
; AVX-NEXT: movntil %edi, (%rsi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_i32:
; VLX: # BB#0:
; VLX-NEXT: movntil %edi, (%rsi)
; VLX-NEXT: retq
  store i32 %arg, i32* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_arg_f64(double %arg, double* %dst) {
; SSE2-LABEL: test_arg_f64:
; SSE2: # BB#0:
; SSE2-NEXT: movsd %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_arg_f64:
; SSE4A: # BB#0:
; SSE4A-NEXT: movntsd %xmm0, (%rdi)
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_arg_f64:
; SSE41: # BB#0:
; SSE41-NEXT: movsd %xmm0, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: test_arg_f64:
; AVX: # BB#0:
; AVX-NEXT: vmovsd %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_f64:
; VLX: # BB#0:
; VLX-NEXT: vmovsd %xmm0, (%rdi)
; VLX-NEXT: retq
  store double %arg, double* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_arg_i64(i64 %arg, i64* %dst) {
; SSE-LABEL: test_arg_i64:
; SSE: # BB#0:
; SSE-NEXT: movntiq %rdi, (%rsi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_i64:
; AVX: # BB#0:
; AVX-NEXT: movntiq %rdi, (%rsi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_i64:
; VLX: # BB#0:
; VLX-NEXT: movntiq %rdi, (%rsi)
; VLX-NEXT: retq
  store i64 %arg, i64* %dst, align 1, !nontemporal !1
  ret void
}

; Extract versions

define void @test_extract_f32(<4 x float> %arg, float* %dst) {
; SSE2-LABEL: test_extract_f32:
; SSE2: # BB#0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-NEXT: movss %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_extract_f32:
; SSE4A: # BB#0:
; SSE4A-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE4A-NEXT: movntss %xmm0, (%rdi)
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_extract_f32:
; SSE41: # BB#0:
; SSE41-NEXT: extractps $1, %xmm0, %eax
; SSE41-NEXT: movntil %eax, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: test_extract_f32:
; AVX: # BB#0:
; AVX-NEXT: vextractps $1, %xmm0, %eax
; AVX-NEXT: movntil %eax, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_extract_f32:
; VLX: # BB#0:
; VLX-NEXT: vextractps $1, %xmm0, %eax
; VLX-NEXT: movntil %eax, (%rdi)
; VLX-NEXT: retq
  %1 = extractelement <4 x float> %arg, i32 1
  store float %1, float* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_extract_i32(<4 x i32> %arg, i32* %dst) {
; SSE2-LABEL: test_extract_i32:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movntil %eax, (%rdi)
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_extract_i32:
; SSE4A: # BB#0:
; SSE4A-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE4A-NEXT: movd %xmm0, %eax
; SSE4A-NEXT: movntil %eax, (%rdi)
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_extract_i32:
; SSE41: # BB#0:
; SSE41-NEXT: pextrd $1, %xmm0, %eax
; SSE41-NEXT: movntil %eax, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: test_extract_i32:
; AVX: # BB#0:
; AVX-NEXT: vpextrd $1, %xmm0, %eax
; AVX-NEXT: movntil %eax, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_extract_i32:
; VLX: # BB#0:
; VLX-NEXT: vpextrd $1, %xmm0, %eax
; VLX-NEXT: movntil %eax, (%rdi)
; VLX-NEXT: retq
  %1 = extractelement <4 x i32> %arg, i32 1
  store i32 %1, i32* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_extract_f64(<2 x double> %arg, double* %dst) {
; SSE2-LABEL: test_extract_f64:
; SSE2: # BB#0:
; SSE2-NEXT: movhpd %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_extract_f64:
; SSE4A: # BB#0:
; SSE4A-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
; SSE4A-NEXT: movntsd %xmm0, (%rdi)
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_extract_f64:
; SSE41: # BB#0:
; SSE41-NEXT: movhpd %xmm0, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: test_extract_f64:
; AVX: # BB#0:
; AVX-NEXT: vmovhpd %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_extract_f64:
; VLX: # BB#0:
; VLX-NEXT: vmovhpd %xmm0, (%rdi)
; VLX-NEXT: retq
  %1 = extractelement <2 x double> %arg, i32 1
  store double %1, double* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_extract_i64(<2 x i64> %arg, i64* %dst) {
; SSE2-LABEL: test_extract_i64:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: movd %xmm0, %rax
; SSE2-NEXT: movntiq %rax, (%rdi)
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_extract_i64:
; SSE4A: # BB#0:
; SSE4A-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE4A-NEXT: movd %xmm0, %rax
; SSE4A-NEXT: movntiq %rax, (%rdi)
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_extract_i64:
; SSE41: # BB#0:
; SSE41-NEXT: pextrq $1, %xmm0, %rax
; SSE41-NEXT: movntiq %rax, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: test_extract_i64:
; AVX: # BB#0:
; AVX-NEXT: vpextrq $1, %xmm0, %rax
; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_extract_i64:
; VLX: # BB#0:
; VLX-NEXT: vpextrq $1, %xmm0, %rax
; VLX-NEXT: movntiq %rax, (%rdi)
; VLX-NEXT: retq
  %1 = extractelement <2 x i64> %arg, i32 1
  store i64 %1, i64* %dst, align 1, !nontemporal !1
  ret void
}

; And now XMM versions.

define void @test_arg_v4f32(<4 x float> %arg, <4 x float>* %dst) {
; SSE-LABEL: test_arg_v4f32:
; SSE: # BB#0:
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v4f32:
; AVX: # BB#0:
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v4f32:
; VLX: # BB#0:
; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
  store <4 x float> %arg, <4 x float>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_arg_v4i32(<4 x i32> %arg, <4 x i32>* %dst) {
; SSE-LABEL: test_arg_v4i32:
; SSE: # BB#0:
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v4i32:
; AVX: # BB#0:
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v4i32:
; VLX: # BB#0:
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
  store <4 x i32> %arg, <4 x i32>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_arg_v2f64(<2 x double> %arg, <2 x double>* %dst) {
; SSE-LABEL: test_arg_v2f64:
; SSE: # BB#0:
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v2f64:
; AVX: # BB#0:
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v2f64:
; VLX: # BB#0:
; VLX-NEXT: vmovntpd %xmm0, (%rdi)
; VLX-NEXT: retq
  store <2 x double> %arg, <2 x double>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_arg_v2i64(<2 x i64> %arg, <2 x i64>* %dst) {
; SSE-LABEL: test_arg_v2i64:
; SSE: # BB#0:
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v2i64:
; AVX: # BB#0:
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v2i64:
; VLX: # BB#0:
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
  store <2 x i64> %arg, <2 x i64>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_arg_v8i16(<8 x i16> %arg, <8 x i16>* %dst) {
; SSE-LABEL: test_arg_v8i16:
; SSE: # BB#0:
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v8i16:
; AVX: # BB#0:
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v8i16:
; VLX: # BB#0:
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
  store <8 x i16> %arg, <8 x i16>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_arg_v16i8(<16 x i8> %arg, <16 x i8>* %dst) {
; SSE-LABEL: test_arg_v16i8:
; SSE: # BB#0:
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v16i8:
; AVX: # BB#0:
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v16i8:
; VLX: # BB#0:
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
  store <16 x i8> %arg, <16 x i8>* %dst, align 16, !nontemporal !1
  ret void
}

; And now YMM versions.

define void @test_arg_v8f32(<8 x float> %arg, <8 x float>* %dst) {
; SSE-LABEL: test_arg_v8f32:
; SSE: # BB#0:
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v8f32:
; AVX: # BB#0:
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v8f32:
; VLX: # BB#0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
; VLX-NEXT: retq
  store <8 x float> %arg, <8 x float>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %dst) {
; SSE-LABEL: test_arg_v8i32:
; SSE: # BB#0:
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v8i32:
; AVX: # BB#0:
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v8i32:
; VLX: # BB#0:
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: retq
  store <8 x i32> %arg, <8 x i32>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_arg_v4f64(<4 x double> %arg, <4 x double>* %dst) {
; SSE-LABEL: test_arg_v4f64:
; SSE: # BB#0:
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v4f64:
; AVX: # BB#0:
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v4f64:
; VLX: # BB#0:
; VLX-NEXT: vmovntpd %ymm0, (%rdi)
; VLX-NEXT: retq
  store <4 x double> %arg, <4 x double>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_arg_v4i64(<4 x i64> %arg, <4 x i64>* %dst) {
; SSE-LABEL: test_arg_v4i64:
; SSE: # BB#0:
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v4i64:
; AVX: # BB#0:
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v4i64:
; VLX: # BB#0:
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: retq
  store <4 x i64> %arg, <4 x i64>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_arg_v16i16(<16 x i16> %arg, <16 x i16>* %dst) {
; SSE-LABEL: test_arg_v16i16:
; SSE: # BB#0:
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v16i16:
; AVX: # BB#0:
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v16i16:
; VLX: # BB#0:
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: retq
  store <16 x i16> %arg, <16 x i16>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %dst) {
; SSE-LABEL: test_arg_v32i8:
; SSE: # BB#0:
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v32i8:
; AVX: # BB#0:
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v32i8:
; VLX: # BB#0:
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: retq
  store <32 x i8> %arg, <32 x i8>* %dst, align 32, !nontemporal !1
  ret void
}


; Now check that if the execution domain is trivially visible, we use it.
; We use an add to make the type survive all the way to the MOVNT.

define void @test_op_v4f32(<4 x float> %a, <4 x float> %b, <4 x float>* %dst) {
; SSE-LABEL: test_op_v4f32:
; SSE: # BB#0:
; SSE-NEXT: addps %xmm1, %xmm0
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v4f32:
; AVX: # BB#0:
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v4f32:
; VLX: # BB#0:
; VLX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
  %r = fadd <4 x float> %a, %b
  store <4 x float> %r, <4 x float>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_op_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32>* %dst) {
; SSE-LABEL: test_op_v4i32:
; SSE: # BB#0:
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v4i32:
; AVX: # BB#0:
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovntdq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v4i32:
; VLX: # BB#0:
; VLX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
  %r = add <4 x i32> %a, %b
  store <4 x i32> %r, <4 x i32>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_op_v2f64(<2 x double> %a, <2 x double> %b, <2 x double>* %dst) {
; SSE-LABEL: test_op_v2f64:
; SSE: # BB#0:
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movntpd %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v2f64:
; AVX: # BB#0:
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovntpd %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v2f64:
; VLX: # BB#0:
; VLX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; VLX-NEXT: vmovntpd %xmm0, (%rdi)
; VLX-NEXT: retq
  %r = fadd <2 x double> %a, %b
  store <2 x double> %r, <2 x double>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_op_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64>* %dst) {
; SSE-LABEL: test_op_v2i64:
; SSE: # BB#0:
; SSE-NEXT: paddq %xmm1, %xmm0
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v2i64:
; AVX: # BB#0:
; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovntdq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v2i64:
; VLX: # BB#0:
; VLX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
  %r = add <2 x i64> %a, %b
  store <2 x i64> %r, <2 x i64>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_op_v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16>* %dst) {
; SSE-LABEL: test_op_v8i16:
; SSE: # BB#0:
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v8i16:
; AVX: # BB#0:
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovntdq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v8i16:
; VLX: # BB#0:
; VLX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
  %r = add <8 x i16> %a, %b
  store <8 x i16> %r, <8 x i16>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_op_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8>* %dst) {
; SSE-LABEL: test_op_v16i8:
; SSE: # BB#0:
; SSE-NEXT: paddb %xmm1, %xmm0
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v16i8:
; AVX: # BB#0:
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovntdq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v16i8:
; VLX: # BB#0:
; VLX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
  %r = add <16 x i8> %a, %b
  store <16 x i8> %r, <16 x i8>* %dst, align 16, !nontemporal !1
  ret void
}

; And now YMM versions.

define void @test_op_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
; SSE-LABEL: test_op_v8f32:
; SSE: # BB#0:
; SSE-NEXT: addps %xmm2, %xmm0
; SSE-NEXT: addps %xmm3, %xmm1
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v8f32:
; AVX: # BB#0:
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v8f32:
; VLX: # BB#0:
; VLX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntps %ymm0, (%rdi)
; VLX-NEXT: retq
  %r = fadd <8 x float> %a, %b
  store <8 x float> %r, <8 x float>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_op_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %dst) {
; SSE-LABEL: test_op_v8i32:
; SSE: # BB#0:
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: paddd %xmm3, %xmm1
; SSE-NEXT: movntdq %xmm1, 16(%rdi)
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test_op_v8i32:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovntps %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_op_v8i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; VLX-LABEL: test_op_v8i32:
; VLX: # BB#0:
; VLX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: retq
  %r = add <8 x i32> %a, %b
  store <8 x i32> %r, <8 x i32>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_op_v4f64(<4 x double> %a, <4 x double> %b, <4 x double>* %dst) {
; SSE-LABEL: test_op_v4f64:
; SSE: # BB#0:
; SSE-NEXT: addpd %xmm2, %xmm0
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: movntpd %xmm1, 16(%rdi)
; SSE-NEXT: movntpd %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v4f64:
; AVX: # BB#0:
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovntpd %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v4f64:
; VLX: # BB#0:
; VLX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntpd %ymm0, (%rdi)
; VLX-NEXT: retq
  %r = fadd <4 x double> %a, %b
  store <4 x double> %r, <4 x double>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_op_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %dst) {
; SSE-LABEL: test_op_v4i64:
; SSE: # BB#0:
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: movntdq %xmm1, 16(%rdi)
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test_op_v4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovntps %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_op_v4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; VLX-LABEL: test_op_v4i64:
; VLX: # BB#0:
; VLX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: retq
  %r = add <4 x i64> %a, %b
  store <4 x i64> %r, <4 x i64>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_op_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16>* %dst) {
; SSE-LABEL: test_op_v16i16:
; SSE: # BB#0:
; SSE-NEXT: paddw %xmm2, %xmm0
; SSE-NEXT: paddw %xmm3, %xmm1
; SSE-NEXT: movntdq %xmm1, 16(%rdi)
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test_op_v16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovntps %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_op_v16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; VLX-LABEL: test_op_v16i16:
; VLX: # BB#0:
; VLX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: retq
  %r = add <16 x i16> %a, %b
  store <16 x i16> %r, <16 x i16>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
; SSE-LABEL: test_op_v32i8:
; SSE: # BB#0:
; SSE-NEXT: paddb %xmm2, %xmm0
; SSE-NEXT: paddb %xmm3, %xmm1
; SSE-NEXT: movntdq %xmm1, 16(%rdi)
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test_op_v32i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vmovntps %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_op_v32i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; VLX-LABEL: test_op_v32i8:
; VLX: # BB#0:
; VLX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: retq
  %r = add <32 x i8> %a, %b
  store <32 x i8> %r, <32 x i8>* %dst, align 32, !nontemporal !1
  ret void
}

; 256-bit NT stores require 256-bit alignment.
; FIXME: For AVX, we could lower this to 2x movntps %xmm. Taken further, we
; could even scalarize to movnti when we have 1-alignment: nontemporal is
; probably always worth even some 20 instruction scalarization.
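; As a rough sketch of the 2x movntps lowering the FIXME above suggests for
; the 16-byte-aligned AVX case (this is not what we currently emit; the
; checks below document the vmovups fallback), the sequence could look like:
;   vaddps %ymm1, %ymm0, %ymm0
;   vextractf128 $1, %ymm0, %xmm1
;   vmovntps %xmm0, (%rdi)
;   vmovntps %xmm1, 16(%rdi)
;   vzeroupper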
define void @test_unaligned_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
; SSE-LABEL: test_unaligned_v8f32:
; SSE: # BB#0:
; SSE-NEXT: addps %xmm2, %xmm0
; SSE-NEXT: addps %xmm3, %xmm1
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_unaligned_v8f32:
; AVX: # BB#0:
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_unaligned_v8f32:
; VLX: # BB#0:
; VLX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovups %ymm0, (%rdi)
; VLX-NEXT: retq
  %r = fadd <8 x float> %a, %b
  store <8 x float> %r, <8 x float>* %dst, align 16, !nontemporal !1
  ret void
}

!1 = !{i32 1}