; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=SSE --check-prefix=SSE4A
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=VLX

; Make sure that we generate non-temporal stores for the test cases below.
; We use xorps for zeroing, so domain information isn't available anymore.
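;
; Every store below carries the !nontemporal !1 metadata hint (the node is
; defined at the end of this file as a single i32 1, which is the form the IR
; requires), e.g.:
;   store float zeroinitializer, float* %p, align 1, !nontemporal !1
; The codegen under test turns such stores into MOVNTI / MOVNTPS / MOVNTDQ /
; VMOVNT* instructions rather than ordinary stores.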

; Scalar versions (zeroing means we can do this even for fp types).
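; Because the stored value is zero, the f32/f64 cases below can materialize the
; zero in a GPR (xorl) and store it with the integer MOVNTI, even though plain
; SSE has no non-temporal store for scalar FP values (only SSE4A's MOVNTSS/
; MOVNTSD provide that).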

define void @test_zero_f32(float* %dst) {
; SSE-LABEL: test_zero_f32:
; SSE:       # BB#0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    movntil %eax, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_f32:
; AVX:       # BB#0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntil %eax, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_zero_f32:
; VLX:       # BB#0:
; VLX-NEXT:    xorl %eax, %eax
; VLX-NEXT:    movntil %eax, (%rdi)
; VLX-NEXT:    retq
  store float zeroinitializer, float* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_i32(i32* %dst) {
; SSE-LABEL: test_zero_i32:
; SSE:       # BB#0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    movntil %eax, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_i32:
; AVX:       # BB#0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntil %eax, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_zero_i32:
; VLX:       # BB#0:
; VLX-NEXT:    xorl %eax, %eax
; VLX-NEXT:    movntil %eax, (%rdi)
; VLX-NEXT:    retq
  store i32 zeroinitializer, i32* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_f64(double* %dst) {
; SSE-LABEL: test_zero_f64:
; SSE:       # BB#0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    movntiq %rax, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_f64:
; AVX:       # BB#0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_zero_f64:
; VLX:       # BB#0:
; VLX-NEXT:    xorl %eax, %eax
; VLX-NEXT:    movntiq %rax, (%rdi)
; VLX-NEXT:    retq
  store double zeroinitializer, double* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_i64(i64* %dst) {
; SSE-LABEL: test_zero_i64:
; SSE:       # BB#0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    movntiq %rax, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_i64:
; AVX:       # BB#0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_zero_i64:
; VLX:       # BB#0:
; VLX-NEXT:    xorl %eax, %eax
; VLX-NEXT:    movntiq %rax, (%rdi)
; VLX-NEXT:    retq
  store i64 zeroinitializer, i64* %dst, align 1, !nontemporal !1
  ret void
}

; And now XMM versions.
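; The 128-bit zero is materialized with xorps (SSE), vxorps (AVX) or vpxord
; (AVX-512VL); since the zeroing idiom carries no domain information, within
; each subtarget the FP-typed and integer-typed cases all end up using the same
; non-temporal store (movntps, vmovntps or vmovntdq).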

define void @test_zero_v4f32(<4 x float>* %dst) {
; SSE-LABEL: test_zero_v4f32:
; SSE:       # BB#0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_zero_v4f32:
; VLX:       # BB#0:
; VLX-NEXT:    vpxord %xmm0, %xmm0, %xmm0
; VLX-NEXT:    vmovntdq %xmm0, (%rdi)
; VLX-NEXT:    retq
  store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v4i32(<4 x i32>* %dst) {
; SSE-LABEL: test_zero_v4i32:
; SSE:       # BB#0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v4i32:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_zero_v4i32:
; VLX:       # BB#0:
; VLX-NEXT:    vpxord %xmm0, %xmm0, %xmm0
; VLX-NEXT:    vmovntdq %xmm0, (%rdi)
; VLX-NEXT:    retq
  store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
  store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v2f64(<2 x double>* %dst) {
; SSE-LABEL: test_zero_v2f64:
; SSE:       # BB#0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_zero_v2f64:
; VLX:       # BB#0:
; VLX-NEXT:    vpxord %xmm0, %xmm0, %xmm0
; VLX-NEXT:    vmovntdq %xmm0, (%rdi)
; VLX-NEXT:    retq
  store <2 x double> zeroinitializer, <2 x double>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v2i64(<2 x i64>* %dst) {
; SSE-LABEL: test_zero_v2i64:
; SSE:       # BB#0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_zero_v2i64:
; VLX:       # BB#0:
; VLX-NEXT:    vpxord %xmm0, %xmm0, %xmm0
; VLX-NEXT:    vmovntdq %xmm0, (%rdi)
; VLX-NEXT:    retq
  store <2 x i64> zeroinitializer, <2 x i64>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v8i16(<8 x i16>* %dst) {
; SSE-LABEL: test_zero_v8i16:
; SSE:       # BB#0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_zero_v8i16:
; VLX:       # BB#0:
; VLX-NEXT:    vpxord %xmm0, %xmm0, %xmm0
; VLX-NEXT:    vmovntdq %xmm0, (%rdi)
; VLX-NEXT:    retq
  store <8 x i16> zeroinitializer, <8 x i16>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v16i8(<16 x i8>* %dst) {
; SSE-LABEL: test_zero_v16i8:
; SSE:       # BB#0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_zero_v16i8:
; VLX:       # BB#0:
; VLX-NEXT:    vpxord %xmm0, %xmm0, %xmm0
; VLX-NEXT:    vmovntdq %xmm0, (%rdi)
; VLX-NEXT:    retq
  store <16 x i8> zeroinitializer, <16 x i8>* %dst, align 16, !nontemporal !1
  ret void
}

; And now YMM versions.
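; Without AVX, the 32-byte store is split into two 16-byte movntps stores.
; With AVX the whole vector is stored with a single vmovntps/vmovntdq of a ymm
; register (the AVX paths also emit vzeroupper before returning).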

define void @test_zero_v8f32(<8 x float>* %dst) {
; SSE-LABEL: test_zero_v8f32:
; SSE:       # BB#0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8f32:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; VLX-LABEL: test_zero_v8f32:
; VLX:       # BB#0:
; VLX-NEXT:    vpxord %ymm0, %ymm0, %ymm0
; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
; VLX-NEXT:    retq
  store <8 x float> zeroinitializer, <8 x float>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v8i32(<8 x i32>* %dst) {
; SSE-LABEL: test_zero_v8i32:
; SSE:       # BB#0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8i32:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; VLX-LABEL: test_zero_v8i32:
; VLX:       # BB#0:
; VLX-NEXT:    vpxord %ymm0, %ymm0, %ymm0
; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
; VLX-NEXT:    retq
  store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v4f64(<4 x double>* %dst) {
; SSE-LABEL: test_zero_v4f64:
; SSE:       # BB#0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v4f64:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; VLX-LABEL: test_zero_v4f64:
; VLX:       # BB#0:
; VLX-NEXT:    vpxord %ymm0, %ymm0, %ymm0
; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
; VLX-NEXT:    retq
  store <4 x double> zeroinitializer, <4 x double>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v4i64(<4 x i64>* %dst) {
; SSE-LABEL: test_zero_v4i64:
; SSE:       # BB#0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v4i64:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; VLX-LABEL: test_zero_v4i64:
; VLX:       # BB#0:
; VLX-NEXT:    vpxord %ymm0, %ymm0, %ymm0
; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
; VLX-NEXT:    retq
  store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v16i16(<16 x i16>* %dst) {
; SSE-LABEL: test_zero_v16i16:
; SSE:       # BB#0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v16i16:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; VLX-LABEL: test_zero_v16i16:
; VLX:       # BB#0:
; VLX-NEXT:    vpxord %ymm0, %ymm0, %ymm0
; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
; VLX-NEXT:    retq
  store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v32i8(<32 x i8>* %dst) {
; SSE-LABEL: test_zero_v32i8:
; SSE:       # BB#0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v32i8:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; VLX-LABEL: test_zero_v32i8:
; VLX:       # BB#0:
; VLX-NEXT:    vpxord %ymm0, %ymm0, %ymm0
; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
; VLX-NEXT:    retq
  store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 32, !nontemporal !1
  ret void
}


; Check that we also handle arguments.  Here the type survives longer.
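; The value now arrives in a register with its original type, so the scalar FP
; cases can use MOVNTSS/MOVNTSD where SSE4A provides them and otherwise fall
; back to a regular movss/movsd, while the integer cases still use MOVNTI.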

; Scalar versions.

define void @test_arg_f32(float %arg, float* %dst) {
; SSE2-LABEL: test_arg_f32:
; SSE2:       # BB#0:
; SSE2-NEXT:    movss %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_arg_f32:
; SSE4A:       # BB#0:
; SSE4A-NEXT:    movntss %xmm0, (%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_arg_f32:
; SSE41:       # BB#0:
; SSE41-NEXT:    movss %xmm0, (%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_arg_f32:
; AVX:       # BB#0:
; AVX-NEXT:    vmovss %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_arg_f32:
; VLX:       # BB#0:
; VLX-NEXT:    vmovss %xmm0, (%rdi)
; VLX-NEXT:    retq
  store float %arg, float* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_arg_i32(i32 %arg, i32* %dst) {
; SSE-LABEL: test_arg_i32:
; SSE:       # BB#0:
; SSE-NEXT:    movntil %edi, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_arg_i32:
; AVX:       # BB#0:
; AVX-NEXT:    movntil %edi, (%rsi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_arg_i32:
; VLX:       # BB#0:
; VLX-NEXT:    movntil %edi, (%rsi)
; VLX-NEXT:    retq
  store i32 %arg, i32* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_arg_f64(double %arg, double* %dst) {
; SSE2-LABEL: test_arg_f64:
; SSE2:       # BB#0:
; SSE2-NEXT:    movsd %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_arg_f64:
; SSE4A:       # BB#0:
; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_arg_f64:
; SSE41:       # BB#0:
; SSE41-NEXT:    movsd %xmm0, (%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_arg_f64:
; AVX:       # BB#0:
; AVX-NEXT:    vmovsd %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_arg_f64:
; VLX:       # BB#0:
; VLX-NEXT:    vmovsd %xmm0, (%rdi)
; VLX-NEXT:    retq
  store double %arg, double* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_arg_i64(i64 %arg, i64* %dst) {
; SSE-LABEL: test_arg_i64:
; SSE:       # BB#0:
; SSE-NEXT:    movntiq %rdi, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_arg_i64:
; AVX:       # BB#0:
; AVX-NEXT:    movntiq %rdi, (%rsi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_arg_i64:
; VLX:       # BB#0:
; VLX-NEXT:    movntiq %rdi, (%rsi)
; VLX-NEXT:    retq
  store i64 %arg, i64* %dst, align 1, !nontemporal !1
  ret void
}

; Extract versions
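; A single element is extracted from a vector and stored non-temporally. With
; SSE4.1/AVX the element can be moved straight to a GPR (extractps/pextrd/
; pextrq) and stored with MOVNTI; SSE2/SSE4A instead go through a shuffle or a
; high-half store such as movhpd.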

define void @test_extract_f32(<4 x float> %arg, float* %dst) {
; SSE2-LABEL: test_extract_f32:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-NEXT:    movss %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_extract_f32:
; SSE4A:       # BB#0:
; SSE4A-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE4A-NEXT:    movntss %xmm0, (%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_extract_f32:
; SSE41:       # BB#0:
; SSE41-NEXT:    extractps $1, %xmm0, %eax
; SSE41-NEXT:    movntil %eax, (%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_extract_f32:
; AVX:       # BB#0:
; AVX-NEXT:    vextractps $1, %xmm0, %eax
; AVX-NEXT:    movntil %eax, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_extract_f32:
; VLX:       # BB#0:
; VLX-NEXT:    vextractps $1, %xmm0, %eax
; VLX-NEXT:    movntil %eax, (%rdi)
; VLX-NEXT:    retq
  %1 = extractelement <4 x float> %arg, i32 1
  store float %1, float* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_extract_i32(<4 x i32> %arg, i32* %dst) {
; SSE2-LABEL: test_extract_i32:
; SSE2:       # BB#0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movntil %eax, (%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_extract_i32:
; SSE4A:       # BB#0:
; SSE4A-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE4A-NEXT:    movd %xmm0, %eax
; SSE4A-NEXT:    movntil %eax, (%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_extract_i32:
; SSE41:       # BB#0:
; SSE41-NEXT:    pextrd $1, %xmm0, %eax
; SSE41-NEXT:    movntil %eax, (%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_extract_i32:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrd $1, %xmm0, %eax
; AVX-NEXT:    movntil %eax, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_extract_i32:
; VLX:       # BB#0:
; VLX-NEXT:    vpextrd $1, %xmm0, %eax
; VLX-NEXT:    movntil %eax, (%rdi)
; VLX-NEXT:    retq
  %1 = extractelement <4 x i32> %arg, i32 1
  store i32 %1, i32* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_extract_f64(<2 x double> %arg, double* %dst) {
; SSE2-LABEL: test_extract_f64:
; SSE2:       # BB#0:
; SSE2-NEXT:    movhpd %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_extract_f64:
; SSE4A:       # BB#0:
; SSE4A-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_extract_f64:
; SSE41:       # BB#0:
; SSE41-NEXT:    movhpd %xmm0, (%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_extract_f64:
; AVX:       # BB#0:
; AVX-NEXT:    vmovhpd %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_extract_f64:
; VLX:       # BB#0:
; VLX-NEXT:    vmovhpd %xmm0, (%rdi)
; VLX-NEXT:    retq
  %1 = extractelement <2 x double> %arg, i32 1
  store double %1, double* %dst, align 1, !nontemporal !1
  ret void
}

define void @test_extract_i64(<2 x i64> %arg, i64* %dst) {
; SSE2-LABEL: test_extract_i64:
; SSE2:       # BB#0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT:    movd %xmm0, %rax
; SSE2-NEXT:    movntiq %rax, (%rdi)
; SSE2-NEXT:    retq
;
; SSE4A-LABEL: test_extract_i64:
; SSE4A:       # BB#0:
; SSE4A-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE4A-NEXT:    movd %xmm0, %rax
; SSE4A-NEXT:    movntiq %rax, (%rdi)
; SSE4A-NEXT:    retq
;
; SSE41-LABEL: test_extract_i64:
; SSE41:       # BB#0:
; SSE41-NEXT:    pextrq $1, %xmm0, %rax
; SSE41-NEXT:    movntiq %rax, (%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_extract_i64:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    movntiq %rax, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_extract_i64:
; VLX:       # BB#0:
; VLX-NEXT:    vpextrq $1, %xmm0, %rax
; VLX-NEXT:    movntiq %rax, (%rdi)
; VLX-NEXT:    retq
  %1 = extractelement <2 x i64> %arg, i32 1
  store i64 %1, i64* %dst, align 1, !nontemporal !1
  ret void
}

; And now XMM versions.

define void @test_arg_v4f32(<4 x float> %arg, <4 x float>* %dst) {
; SSE-LABEL: test_arg_v4f32:
; SSE:       # BB#0:
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_arg_v4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_arg_v4f32:
; VLX:       # BB#0:
; VLX-NEXT:    vmovntps %xmm0, (%rdi)
; VLX-NEXT:    retq
  store <4 x float> %arg, <4 x float>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_arg_v4i32(<4 x i32> %arg, <4 x i32>* %dst) {
; SSE-LABEL: test_arg_v4i32:
; SSE:       # BB#0:
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_arg_v4i32:
; AVX:       # BB#0:
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_arg_v4i32:
; VLX:       # BB#0:
; VLX-NEXT:    vmovntdq %xmm0, (%rdi)
; VLX-NEXT:    retq
  store <4 x i32> %arg, <4 x i32>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_arg_v2f64(<2 x double> %arg, <2 x double>* %dst) {
; SSE-LABEL: test_arg_v2f64:
; SSE:       # BB#0:
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_arg_v2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_arg_v2f64:
; VLX:       # BB#0:
; VLX-NEXT:    vmovntpd %xmm0, (%rdi)
; VLX-NEXT:    retq
  store <2 x double> %arg, <2 x double>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_arg_v2i64(<2 x i64> %arg, <2 x i64>* %dst) {
; SSE-LABEL: test_arg_v2i64:
; SSE:       # BB#0:
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_arg_v2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_arg_v2i64:
; VLX:       # BB#0:
; VLX-NEXT:    vmovntdq %xmm0, (%rdi)
; VLX-NEXT:    retq
  store <2 x i64> %arg, <2 x i64>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_arg_v8i16(<8 x i16> %arg, <8 x i16>* %dst) {
; SSE-LABEL: test_arg_v8i16:
; SSE:       # BB#0:
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_arg_v8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_arg_v8i16:
; VLX:       # BB#0:
; VLX-NEXT:    vmovntdq %xmm0, (%rdi)
; VLX-NEXT:    retq
  store <8 x i16> %arg, <8 x i16>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_arg_v16i8(<16 x i8> %arg, <16 x i8>* %dst) {
; SSE-LABEL: test_arg_v16i8:
; SSE:       # BB#0:
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_arg_v16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_arg_v16i8:
; VLX:       # BB#0:
; VLX-NEXT:    vmovntdq %xmm0, (%rdi)
; VLX-NEXT:    retq
  store <16 x i8> %arg, <16 x i8>* %dst, align 16, !nontemporal !1
  ret void
}

; And now YMM versions.

define void @test_arg_v8f32(<8 x float> %arg, <8 x float>* %dst) {
; SSE-LABEL: test_arg_v8f32:
; SSE:       # BB#0:
; SSE-NEXT:    movntps %xmm1, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_arg_v8f32:
; AVX:       # BB#0:
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; VLX-LABEL: test_arg_v8f32:
; VLX:       # BB#0:
; VLX-NEXT:    vmovntps %ymm0, (%rdi)
; VLX-NEXT:    retq
  store <8 x float> %arg, <8 x float>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %dst) {
; SSE-LABEL: test_arg_v8i32:
; SSE:       # BB#0:
; SSE-NEXT:    movntps %xmm1, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_arg_v8i32:
; AVX:       # BB#0:
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; VLX-LABEL: test_arg_v8i32:
; VLX:       # BB#0:
; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
; VLX-NEXT:    retq
  store <8 x i32> %arg, <8 x i32>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_arg_v4f64(<4 x double> %arg, <4 x double>* %dst) {
; SSE-LABEL: test_arg_v4f64:
; SSE:       # BB#0:
; SSE-NEXT:    movntps %xmm1, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_arg_v4f64:
; AVX:       # BB#0:
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; VLX-LABEL: test_arg_v4f64:
; VLX:       # BB#0:
; VLX-NEXT:    vmovntpd %ymm0, (%rdi)
; VLX-NEXT:    retq
  store <4 x double> %arg, <4 x double>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_arg_v4i64(<4 x i64> %arg, <4 x i64>* %dst) {
; SSE-LABEL: test_arg_v4i64:
; SSE:       # BB#0:
; SSE-NEXT:    movntps %xmm1, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_arg_v4i64:
; AVX:       # BB#0:
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; VLX-LABEL: test_arg_v4i64:
; VLX:       # BB#0:
; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
; VLX-NEXT:    retq
  store <4 x i64> %arg, <4 x i64>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_arg_v16i16(<16 x i16> %arg, <16 x i16>* %dst) {
; SSE-LABEL: test_arg_v16i16:
; SSE:       # BB#0:
; SSE-NEXT:    movntps %xmm1, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_arg_v16i16:
; AVX:       # BB#0:
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; VLX-LABEL: test_arg_v16i16:
; VLX:       # BB#0:
; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
; VLX-NEXT:    retq
  store <16 x i16> %arg, <16 x i16>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %dst) {
; SSE-LABEL: test_arg_v32i8:
; SSE:       # BB#0:
; SSE-NEXT:    movntps %xmm1, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_arg_v32i8:
; AVX:       # BB#0:
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; VLX-LABEL: test_arg_v32i8:
; VLX:       # BB#0:
; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
; VLX-NEXT:    retq
  store <32 x i8> %arg, <32 x i8>* %dst, align 32, !nontemporal !1
  ret void
}


; Now check that if the execution domain is trivially visible, we use it.
; We use an add to make the type survive all the way to the MOVNT.
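; For example, the fadd below keeps the value in the FP domain, so the store is
; selected as MOVNTPS/MOVNTPD, while an integer add keeps it in the integer
; domain and the store becomes MOVNTDQ.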

define void @test_op_v4f32(<4 x float> %a, <4 x float> %b, <4 x float>* %dst) {
; SSE-LABEL: test_op_v4f32:
; SSE:       # BB#0:
; SSE-NEXT:    addps %xmm1, %xmm0
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_op_v4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_op_v4f32:
; VLX:       # BB#0:
; VLX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; VLX-NEXT:    vmovntps %xmm0, (%rdi)
; VLX-NEXT:    retq
  %r = fadd <4 x float> %a, %b
  store <4 x float> %r, <4 x float>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_op_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32>* %dst) {
; SSE-LABEL: test_op_v4i32:
; SSE:       # BB#0:
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    movntdq %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_op_v4i32:
; AVX:       # BB#0:
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovntdq %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_op_v4i32:
; VLX:       # BB#0:
; VLX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; VLX-NEXT:    vmovntdq %xmm0, (%rdi)
; VLX-NEXT:    retq
  %r = add <4 x i32> %a, %b
  store <4 x i32> %r, <4 x i32>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_op_v2f64(<2 x double> %a, <2 x double> %b, <2 x double>* %dst) {
; SSE-LABEL: test_op_v2f64:
; SSE:       # BB#0:
; SSE-NEXT:    addpd %xmm1, %xmm0
; SSE-NEXT:    movntpd %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_op_v2f64:
; AVX:       # BB#0:
; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovntpd %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_op_v2f64:
; VLX:       # BB#0:
; VLX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; VLX-NEXT:    vmovntpd %xmm0, (%rdi)
; VLX-NEXT:    retq
  %r = fadd <2 x double> %a, %b
  store <2 x double> %r, <2 x double>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_op_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64>* %dst) {
; SSE-LABEL: test_op_v2i64:
; SSE:       # BB#0:
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    movntdq %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_op_v2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovntdq %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_op_v2i64:
; VLX:       # BB#0:
; VLX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; VLX-NEXT:    vmovntdq %xmm0, (%rdi)
; VLX-NEXT:    retq
  %r = add <2 x i64> %a, %b
  store <2 x i64> %r, <2 x i64>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_op_v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16>* %dst) {
; SSE-LABEL: test_op_v8i16:
; SSE:       # BB#0:
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    movntdq %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_op_v8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovntdq %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_op_v8i16:
; VLX:       # BB#0:
; VLX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; VLX-NEXT:    vmovntdq %xmm0, (%rdi)
; VLX-NEXT:    retq
  %r = add <8 x i16> %a, %b
  store <8 x i16> %r, <8 x i16>* %dst, align 16, !nontemporal !1
  ret void
}

define void @test_op_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8>* %dst) {
; SSE-LABEL: test_op_v16i8:
; SSE:       # BB#0:
; SSE-NEXT:    paddb %xmm1, %xmm0
; SSE-NEXT:    movntdq %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_op_v16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovntdq %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; VLX-LABEL: test_op_v16i8:
; VLX:       # BB#0:
; VLX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; VLX-NEXT:    vmovntdq %xmm0, (%rdi)
; VLX-NEXT:    retq
  %r = add <16 x i8> %a, %b
  store <16 x i8> %r, <16 x i8>* %dst, align 16, !nontemporal !1
  ret void
}

; And now YMM versions.

define void @test_op_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
; SSE-LABEL: test_op_v8f32:
; SSE:       # BB#0:
; SSE-NEXT:    addps %xmm2, %xmm0
; SSE-NEXT:    addps %xmm3, %xmm1
; SSE-NEXT:    movntps %xmm1, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_op_v8f32:
; AVX:       # BB#0:
; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; VLX-LABEL: test_op_v8f32:
; VLX:       # BB#0:
; VLX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; VLX-NEXT:    vmovntps %ymm0, (%rdi)
; VLX-NEXT:    retq
  %r = fadd <8 x float> %a, %b
  store <8 x float> %r, <8 x float>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_op_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %dst) {
; SSE-LABEL: test_op_v8i32:
; SSE:       # BB#0:
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    movntdq %xmm1, 16(%rdi)
; SSE-NEXT:    movntdq %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_op_v8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vmovntps %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_op_v8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovntdq %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; VLX-LABEL: test_op_v8i32:
; VLX:       # BB#0:
; VLX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
; VLX-NEXT:    retq
  %r = add <8 x i32> %a, %b
  store <8 x i32> %r, <8 x i32>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_op_v4f64(<4 x double> %a, <4 x double> %b, <4 x double>* %dst) {
; SSE-LABEL: test_op_v4f64:
; SSE:       # BB#0:
; SSE-NEXT:    addpd %xmm2, %xmm0
; SSE-NEXT:    addpd %xmm3, %xmm1
; SSE-NEXT:    movntpd %xmm1, 16(%rdi)
; SSE-NEXT:    movntpd %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_op_v4f64:
; AVX:       # BB#0:
; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vmovntpd %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; VLX-LABEL: test_op_v4f64:
; VLX:       # BB#0:
; VLX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; VLX-NEXT:    vmovntpd %ymm0, (%rdi)
; VLX-NEXT:    retq
  %r = fadd <4 x double> %a, %b
  store <4 x double> %r, <4 x double>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_op_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %dst) {
; SSE-LABEL: test_op_v4i64:
; SSE:       # BB#0:
; SSE-NEXT:    paddq %xmm2, %xmm0
; SSE-NEXT:    paddq %xmm3, %xmm1
; SSE-NEXT:    movntdq %xmm1, 16(%rdi)
; SSE-NEXT:    movntdq %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_op_v4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vmovntps %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_op_v4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovntdq %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; VLX-LABEL: test_op_v4i64:
; VLX:       # BB#0:
; VLX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
; VLX-NEXT:    retq
  %r = add <4 x i64> %a, %b
  store <4 x i64> %r, <4 x i64>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_op_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16>* %dst) {
; SSE-LABEL: test_op_v16i16:
; SSE:       # BB#0:
; SSE-NEXT:    paddw %xmm2, %xmm0
; SSE-NEXT:    paddw %xmm3, %xmm1
; SSE-NEXT:    movntdq %xmm1, 16(%rdi)
; SSE-NEXT:    movntdq %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_op_v16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vmovntps %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_op_v16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovntdq %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; VLX-LABEL: test_op_v16i16:
; VLX:       # BB#0:
; VLX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
; VLX-NEXT:    retq
  %r = add <16 x i16> %a, %b
  store <16 x i16> %r, <16 x i16>* %dst, align 32, !nontemporal !1
  ret void
}

define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
; SSE-LABEL: test_op_v32i8:
; SSE:       # BB#0:
; SSE-NEXT:    paddb %xmm2, %xmm0
; SSE-NEXT:    paddb %xmm3, %xmm1
; SSE-NEXT:    movntdq %xmm1, 16(%rdi)
; SSE-NEXT:    movntdq %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_op_v32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vmovntps %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_op_v32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovntdq %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; VLX-LABEL: test_op_v32i8:
; VLX:       # BB#0:
; VLX-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
; VLX-NEXT:    retq
  %r = add <32 x i8> %a, %b
  store <32 x i8> %r, <32 x i8>* %dst, align 32, !nontemporal !1
  ret void
}

; 256-bit NT stores require 256-bit alignment.
; FIXME: For AVX, we could lower this to 2x movntps %xmm. Taken further, we
; could even scalarize to movnti when we have 1-alignment: nontemporal is
; probably always worth even some 20 instruction scalarization.
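; As a rough sketch (not what llc currently emits), the 2x movntps lowering
; suggested above for the AVX case with the 16-byte alignment below could look
; like:
;   vaddps %ymm1, %ymm0, %ymm0
;   vmovntps %xmm0, (%rdi)
;   vextractf128 $1, %ymm0, %xmm0
;   vmovntps %xmm0, 16(%rdi)
;   vzeroupper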
define void @test_unaligned_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
; SSE-LABEL: test_unaligned_v8f32:
; SSE:       # BB#0:
; SSE-NEXT:    addps %xmm2, %xmm0
; SSE-NEXT:    addps %xmm3, %xmm1
; SSE-NEXT:    movntps %xmm1, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_unaligned_v8f32:
; AVX:       # BB#0:
; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vmovups %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; VLX-LABEL: test_unaligned_v8f32:
; VLX:       # BB#0:
; VLX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; VLX-NEXT:    vmovups %ymm0, (%rdi)
; VLX-NEXT:    retq
  %r = fadd <8 x float> %a, %b
  store <8 x float> %r, <8 x float>* %dst, align 16, !nontemporal !1
  ret void
}

!1 = !{i32 1}