/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __EMMINTRIN_H
#define __EMMINTRIN_H

#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

#include <xmmintrin.h>

typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));

typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __aligned__(1)));

/* Type defines. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned types */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));

#ifdef __SSE2__
/* Both _Float16 and __bf16 require SSE2 to be enabled. */
typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));

typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
#endif

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                              \
                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__,                              \
                 __target__("mmx,sse2,no-evex512"), __min_vector_width__(64)))
58
59 /// Adds lower double-precision values in both operands and returns the
60 /// sum in the lower 64 bits of the result. The upper 64 bits of the result
61 /// are copied from the upper double-precision value of the first operand.
62 ///
63 /// \headerfile <x86intrin.h>
64 ///
65 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
66 ///
67 /// \param __a
68 /// A 128-bit vector of [2 x double] containing one of the source operands.
69 /// \param __b
70 /// A 128-bit vector of [2 x double] containing one of the source operands.
71 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
72 /// sum of the lower 64 bits of both operands. The upper 64 bits are copied
73 /// from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_add_sd(__m128d __a, __m128d __b) {
  __a[0] += __b[0];
  return __a;
}
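
/* Illustrative usage sketch (not part of the original header): _mm_add_sd adds
 * only element 0 and passes element 1 of the first argument through unchanged.
 *
 *   __m128d a = _mm_set_pd(10.0, 1.0);   // a = { 1.0, 10.0 } (element 0 is 1.0)
 *   __m128d b = _mm_set_pd(20.0, 2.0);   // b = { 2.0, 20.0 }
 *   __m128d r = _mm_add_sd(a, b);        // r = { 3.0, 10.0 }
 *   double lo = _mm_cvtsd_f64(r);        // lo == 3.0
 */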
79
80 /// Adds two 128-bit vectors of [2 x double].
81 ///
82 /// \headerfile <x86intrin.h>
83 ///
84 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
85 ///
86 /// \param __a
87 /// A 128-bit vector of [2 x double] containing one of the source operands.
88 /// \param __b
89 /// A 128-bit vector of [2 x double] containing one of the source operands.
90 /// \returns A 128-bit vector of [2 x double] containing the sums of both
91 /// operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_add_pd(__m128d __a, __m128d __b) {
  return (__m128d)((__v2df)__a + (__v2df)__b);
}
96
97 /// Subtracts the lower double-precision value of the second operand
98 /// from the lower double-precision value of the first operand and returns
99 /// the difference in the lower 64 bits of the result. The upper 64 bits of
100 /// the result are copied from the upper double-precision value of the first
101 /// operand.
102 ///
103 /// \headerfile <x86intrin.h>
104 ///
105 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
106 ///
107 /// \param __a
108 /// A 128-bit vector of [2 x double] containing the minuend.
109 /// \param __b
110 /// A 128-bit vector of [2 x double] containing the subtrahend.
111 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
112 /// difference of the lower 64 bits of both operands. The upper 64 bits are
113 /// copied from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sub_sd(__m128d __a, __m128d __b) {
  __a[0] -= __b[0];
  return __a;
}
119
120 /// Subtracts two 128-bit vectors of [2 x double].
121 ///
122 /// \headerfile <x86intrin.h>
123 ///
124 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
125 ///
126 /// \param __a
127 /// A 128-bit vector of [2 x double] containing the minuend.
128 /// \param __b
129 /// A 128-bit vector of [2 x double] containing the subtrahend.
130 /// \returns A 128-bit vector of [2 x double] containing the differences between
131 /// both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sub_pd(__m128d __a, __m128d __b) {
  return (__m128d)((__v2df)__a - (__v2df)__b);
}
136
137 /// Multiplies lower double-precision values in both operands and returns
138 /// the product in the lower 64 bits of the result. The upper 64 bits of the
139 /// result are copied from the upper double-precision value of the first
140 /// operand.
141 ///
142 /// \headerfile <x86intrin.h>
143 ///
144 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
145 ///
146 /// \param __a
147 /// A 128-bit vector of [2 x double] containing one of the source operands.
148 /// \param __b
149 /// A 128-bit vector of [2 x double] containing one of the source operands.
150 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
151 /// product of the lower 64 bits of both operands. The upper 64 bits are
152 /// copied from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mul_sd(__m128d __a, __m128d __b) {
  __a[0] *= __b[0];
  return __a;
}
158
159 /// Multiplies two 128-bit vectors of [2 x double].
160 ///
161 /// \headerfile <x86intrin.h>
162 ///
163 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
164 ///
165 /// \param __a
166 /// A 128-bit vector of [2 x double] containing one of the operands.
167 /// \param __b
168 /// A 128-bit vector of [2 x double] containing one of the operands.
169 /// \returns A 128-bit vector of [2 x double] containing the products of both
170 /// operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mul_pd(__m128d __a, __m128d __b) {
  return (__m128d)((__v2df)__a * (__v2df)__b);
}
175
176 /// Divides the lower double-precision value of the first operand by the
177 /// lower double-precision value of the second operand and returns the
178 /// quotient in the lower 64 bits of the result. The upper 64 bits of the
179 /// result are copied from the upper double-precision value of the first
180 /// operand.
181 ///
182 /// \headerfile <x86intrin.h>
183 ///
184 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
185 ///
186 /// \param __a
187 /// A 128-bit vector of [2 x double] containing the dividend.
188 /// \param __b
/// A 128-bit vector of [2 x double] containing the divisor.
190 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
191 /// quotient of the lower 64 bits of both operands. The upper 64 bits are
192 /// copied from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_div_sd(__m128d __a, __m128d __b) {
  __a[0] /= __b[0];
  return __a;
}
198
199 /// Performs an element-by-element division of two 128-bit vectors of
200 /// [2 x double].
201 ///
202 /// \headerfile <x86intrin.h>
203 ///
204 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
205 ///
206 /// \param __a
207 /// A 128-bit vector of [2 x double] containing the dividend.
208 /// \param __b
209 /// A 128-bit vector of [2 x double] containing the divisor.
210 /// \returns A 128-bit vector of [2 x double] containing the quotients of both
211 /// operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_div_pd(__m128d __a, __m128d __b) {
  return (__m128d)((__v2df)__a / (__v2df)__b);
}
216
217 /// Calculates the square root of the lower double-precision value of
218 /// the second operand and returns it in the lower 64 bits of the result.
219 /// The upper 64 bits of the result are copied from the upper
220 /// double-precision value of the first operand.
221 ///
222 /// \headerfile <x86intrin.h>
223 ///
224 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
225 ///
226 /// \param __a
227 /// A 128-bit vector of [2 x double] containing one of the operands. The
228 /// upper 64 bits of this operand are copied to the upper 64 bits of the
229 /// result.
230 /// \param __b
231 /// A 128-bit vector of [2 x double] containing one of the operands. The
232 /// square root is calculated using the lower 64 bits of this operand.
233 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
234 /// square root of the lower 64 bits of operand \a __b, and whose upper 64
235 /// bits are copied from the upper 64 bits of operand \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sqrt_sd(__m128d __a, __m128d __b) {
  __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
  return __extension__(__m128d){__c[0], __a[1]};
}
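
/* Illustrative usage sketch (not part of the original header): the square root
 * is taken from element 0 of the second argument, while element 1 of the
 * result comes from the first argument.
 *
 *   __m128d a = _mm_set_pd(7.0, 1.0);    // { 1.0, 7.0 }
 *   __m128d b = _mm_set_pd(99.0, 9.0);   // { 9.0, 99.0 }
 *   __m128d r = _mm_sqrt_sd(a, b);       // { 3.0, 7.0 }
 */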
241
/// Calculates the square root of each of the two values stored in a
/// 128-bit vector of [2 x double].
244 ///
245 /// \headerfile <x86intrin.h>
246 ///
247 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
248 ///
249 /// \param __a
250 /// A 128-bit vector of [2 x double].
251 /// \returns A 128-bit vector of [2 x double] containing the square roots of the
252 /// values in the operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
  return __builtin_ia32_sqrtpd((__v2df)__a);
}
256
257 /// Compares lower 64-bit double-precision values of both operands, and
258 /// returns the lesser of the pair of values in the lower 64-bits of the
259 /// result. The upper 64 bits of the result are copied from the upper
260 /// double-precision value of the first operand.
261 ///
262 /// \headerfile <x86intrin.h>
263 ///
264 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
265 ///
266 /// \param __a
267 /// A 128-bit vector of [2 x double] containing one of the operands. The
268 /// lower 64 bits of this operand are used in the comparison.
269 /// \param __b
270 /// A 128-bit vector of [2 x double] containing one of the operands. The
271 /// lower 64 bits of this operand are used in the comparison.
272 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
273 /// minimum value between both operands. The upper 64 bits are copied from
274 /// the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_min_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
}
279
280 /// Performs element-by-element comparison of the two 128-bit vectors of
281 /// [2 x double] and returns the vector containing the lesser of each pair of
282 /// values.
283 ///
284 /// \headerfile <x86intrin.h>
285 ///
286 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
287 ///
288 /// \param __a
289 /// A 128-bit vector of [2 x double] containing one of the operands.
290 /// \param __b
291 /// A 128-bit vector of [2 x double] containing one of the operands.
292 /// \returns A 128-bit vector of [2 x double] containing the minimum values
293 /// between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_min_pd(__m128d __a, __m128d __b) {
  return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
}
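
/* Illustrative note (not part of the original header): like the MINSD/MINPD
 * instructions they map to, these intrinsics return the second operand when a
 * pair of values is unordered, so they are not symmetric in the presence of
 * NaN.
 *
 *   __m128d x = _mm_set_pd(__builtin_nan(""), 1.0);
 *   __m128d y = _mm_set1_pd(2.0);
 *   __m128d r = _mm_min_pd(x, y);   // element 1 is 2.0, not NaN
 *   __m128d s = _mm_min_pd(y, x);   // element 1 is NaN
 */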
298
299 /// Compares lower 64-bit double-precision values of both operands, and
300 /// returns the greater of the pair of values in the lower 64-bits of the
301 /// result. The upper 64 bits of the result are copied from the upper
302 /// double-precision value of the first operand.
303 ///
304 /// \headerfile <x86intrin.h>
305 ///
306 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
307 ///
308 /// \param __a
309 /// A 128-bit vector of [2 x double] containing one of the operands. The
310 /// lower 64 bits of this operand are used in the comparison.
311 /// \param __b
312 /// A 128-bit vector of [2 x double] containing one of the operands. The
313 /// lower 64 bits of this operand are used in the comparison.
314 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
315 /// maximum value between both operands. The upper 64 bits are copied from
316 /// the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_max_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
}
321
322 /// Performs element-by-element comparison of the two 128-bit vectors of
323 /// [2 x double] and returns the vector containing the greater of each pair
324 /// of values.
325 ///
326 /// \headerfile <x86intrin.h>
327 ///
328 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
329 ///
330 /// \param __a
331 /// A 128-bit vector of [2 x double] containing one of the operands.
332 /// \param __b
333 /// A 128-bit vector of [2 x double] containing one of the operands.
334 /// \returns A 128-bit vector of [2 x double] containing the maximum values
335 /// between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_max_pd(__m128d __a, __m128d __b) {
  return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
}
340
341 /// Performs a bitwise AND of two 128-bit vectors of [2 x double].
342 ///
343 /// \headerfile <x86intrin.h>
344 ///
345 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
346 ///
347 /// \param __a
348 /// A 128-bit vector of [2 x double] containing one of the source operands.
349 /// \param __b
350 /// A 128-bit vector of [2 x double] containing one of the source operands.
351 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
352 /// values between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_and_pd(__m128d __a, __m128d __b) {
  return (__m128d)((__v2du)__a & (__v2du)__b);
}
357
358 /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
359 /// the one's complement of the values contained in the first source operand.
360 ///
361 /// \headerfile <x86intrin.h>
362 ///
363 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
364 ///
365 /// \param __a
366 /// A 128-bit vector of [2 x double] containing the left source operand. The
367 /// one's complement of this value is used in the bitwise AND.
368 /// \param __b
369 /// A 128-bit vector of [2 x double] containing the right source operand.
370 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
371 /// values in the second operand and the one's complement of the first
372 /// operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_andnot_pd(__m128d __a, __m128d __b) {
  return (__m128d)(~(__v2du)__a & (__v2du)__b);
}
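
/* Illustrative usage sketch (not part of the original header): because the
 * first operand is complemented, _mm_andnot_pd with a sign-bit mask as the
 * first argument clears the sign bits of the second argument.
 *
 *   __m128d signmask = _mm_set1_pd(-0.0);        // only the sign bits set
 *   __m128d v = _mm_set_pd(-2.5, 3.0);
 *   __m128d absv = _mm_andnot_pd(signmask, v);   // { 3.0, 2.5 }
 */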
377
378 /// Performs a bitwise OR of two 128-bit vectors of [2 x double].
379 ///
380 /// \headerfile <x86intrin.h>
381 ///
382 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
383 ///
384 /// \param __a
385 /// A 128-bit vector of [2 x double] containing one of the source operands.
386 /// \param __b
387 /// A 128-bit vector of [2 x double] containing one of the source operands.
388 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
389 /// values between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_or_pd(__m128d __a, __m128d __b) {
  return (__m128d)((__v2du)__a | (__v2du)__b);
}
394
395 /// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
396 ///
397 /// \headerfile <x86intrin.h>
398 ///
399 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
400 ///
401 /// \param __a
402 /// A 128-bit vector of [2 x double] containing one of the source operands.
403 /// \param __b
404 /// A 128-bit vector of [2 x double] containing one of the source operands.
405 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
406 /// values between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_xor_pd(__m128d __a, __m128d __b) {
  return (__m128d)((__v2du)__a ^ (__v2du)__b);
}
411
412 /// Compares each of the corresponding double-precision values of the
413 /// 128-bit vectors of [2 x double] for equality.
414 ///
415 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
416 ///
417 /// \headerfile <x86intrin.h>
418 ///
419 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
420 ///
421 /// \param __a
422 /// A 128-bit vector of [2 x double].
423 /// \param __b
424 /// A 128-bit vector of [2 x double].
425 /// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpeq_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
}
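
/* Illustrative usage sketch (not part of the original header): the packed
 * comparisons produce an all-ones or all-zeros mask per element, which is
 * commonly reduced with _mm_movemask_pd (defined later in this header) or
 * combined with the bitwise intrinsics above to blend values.
 *
 *   __m128d a = _mm_set_pd(1.0, 2.0);      // { 2.0, 1.0 }
 *   __m128d b = _mm_set_pd(1.0, 5.0);      // { 5.0, 1.0 }
 *   __m128d m = _mm_cmpeq_pd(a, b);        // { 0x0, all-ones }
 *   int bits  = _mm_movemask_pd(m);        // 2 (only element 1 matched)
 */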
430
431 /// Compares each of the corresponding double-precision values of the
432 /// 128-bit vectors of [2 x double] to determine if the values in the first
433 /// operand are less than those in the second operand.
434 ///
435 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
436 ///
437 /// \headerfile <x86intrin.h>
438 ///
439 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
440 ///
441 /// \param __a
442 /// A 128-bit vector of [2 x double].
443 /// \param __b
444 /// A 128-bit vector of [2 x double].
445 /// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmplt_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
}
450
451 /// Compares each of the corresponding double-precision values of the
452 /// 128-bit vectors of [2 x double] to determine if the values in the first
453 /// operand are less than or equal to those in the second operand.
454 ///
455 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
456 ///
457 /// \headerfile <x86intrin.h>
458 ///
459 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
460 ///
461 /// \param __a
462 /// A 128-bit vector of [2 x double].
463 /// \param __b
464 /// A 128-bit vector of [2 x double].
465 /// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmple_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
}
470
471 /// Compares each of the corresponding double-precision values of the
472 /// 128-bit vectors of [2 x double] to determine if the values in the first
473 /// operand are greater than those in the second operand.
474 ///
475 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
476 ///
477 /// \headerfile <x86intrin.h>
478 ///
479 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
480 ///
481 /// \param __a
482 /// A 128-bit vector of [2 x double].
483 /// \param __b
484 /// A 128-bit vector of [2 x double].
485 /// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpgt_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
}
490
491 /// Compares each of the corresponding double-precision values of the
492 /// 128-bit vectors of [2 x double] to determine if the values in the first
493 /// operand are greater than or equal to those in the second operand.
494 ///
495 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
496 ///
497 /// \headerfile <x86intrin.h>
498 ///
499 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
500 ///
501 /// \param __a
502 /// A 128-bit vector of [2 x double].
503 /// \param __b
504 /// A 128-bit vector of [2 x double].
505 /// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpge_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
}
510
511 /// Compares each of the corresponding double-precision values of the
512 /// 128-bit vectors of [2 x double] to determine if the values in the first
513 /// operand are ordered with respect to those in the second operand.
514 ///
515 /// A pair of double-precision values are "ordered" with respect to each
516 /// other if neither value is a NaN. Each comparison yields 0x0 for false,
517 /// 0xFFFFFFFFFFFFFFFF for true.
518 ///
519 /// \headerfile <x86intrin.h>
520 ///
521 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
522 ///
523 /// \param __a
524 /// A 128-bit vector of [2 x double].
525 /// \param __b
526 /// A 128-bit vector of [2 x double].
527 /// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpord_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
}
532
533 /// Compares each of the corresponding double-precision values of the
534 /// 128-bit vectors of [2 x double] to determine if the values in the first
535 /// operand are unordered with respect to those in the second operand.
536 ///
537 /// A pair of double-precision values are "unordered" with respect to each
538 /// other if one or both values are NaN. Each comparison yields 0x0 for
539 /// false, 0xFFFFFFFFFFFFFFFF for true.
540 ///
541 /// \headerfile <x86intrin.h>
542 ///
543 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
544 /// instruction.
545 ///
546 /// \param __a
547 /// A 128-bit vector of [2 x double].
548 /// \param __b
549 /// A 128-bit vector of [2 x double].
550 /// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpunord_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
}
555
556 /// Compares each of the corresponding double-precision values of the
557 /// 128-bit vectors of [2 x double] to determine if the values in the first
558 /// operand are unequal to those in the second operand.
559 ///
560 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
561 ///
562 /// \headerfile <x86intrin.h>
563 ///
564 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
565 ///
566 /// \param __a
567 /// A 128-bit vector of [2 x double].
568 /// \param __b
569 /// A 128-bit vector of [2 x double].
570 /// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpneq_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
}
575
576 /// Compares each of the corresponding double-precision values of the
577 /// 128-bit vectors of [2 x double] to determine if the values in the first
578 /// operand are not less than those in the second operand.
579 ///
580 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
581 ///
582 /// \headerfile <x86intrin.h>
583 ///
584 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
585 ///
586 /// \param __a
587 /// A 128-bit vector of [2 x double].
588 /// \param __b
589 /// A 128-bit vector of [2 x double].
590 /// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnlt_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
}
595
596 /// Compares each of the corresponding double-precision values of the
597 /// 128-bit vectors of [2 x double] to determine if the values in the first
598 /// operand are not less than or equal to those in the second operand.
599 ///
600 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
601 ///
602 /// \headerfile <x86intrin.h>
603 ///
604 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
605 ///
606 /// \param __a
607 /// A 128-bit vector of [2 x double].
608 /// \param __b
609 /// A 128-bit vector of [2 x double].
610 /// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnle_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
}
615
616 /// Compares each of the corresponding double-precision values of the
617 /// 128-bit vectors of [2 x double] to determine if the values in the first
618 /// operand are not greater than those in the second operand.
619 ///
620 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
621 ///
622 /// \headerfile <x86intrin.h>
623 ///
624 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
625 ///
626 /// \param __a
627 /// A 128-bit vector of [2 x double].
628 /// \param __b
629 /// A 128-bit vector of [2 x double].
630 /// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpngt_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
}
635
636 /// Compares each of the corresponding double-precision values of the
637 /// 128-bit vectors of [2 x double] to determine if the values in the first
638 /// operand are not greater than or equal to those in the second operand.
639 ///
640 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
641 ///
642 /// \headerfile <x86intrin.h>
643 ///
644 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
645 ///
646 /// \param __a
647 /// A 128-bit vector of [2 x double].
648 /// \param __b
649 /// A 128-bit vector of [2 x double].
650 /// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnge_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
}
655
656 /// Compares the lower double-precision floating-point values in each of
657 /// the two 128-bit floating-point vectors of [2 x double] for equality.
658 ///
659 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
660 ///
661 /// \headerfile <x86intrin.h>
662 ///
663 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
664 ///
665 /// \param __a
666 /// A 128-bit vector of [2 x double]. The lower double-precision value is
667 /// compared to the lower double-precision value of \a __b.
668 /// \param __b
669 /// A 128-bit vector of [2 x double]. The lower double-precision value is
670 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
672 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpeq_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
}
677
678 /// Compares the lower double-precision floating-point values in each of
679 /// the two 128-bit floating-point vectors of [2 x double] to determine if
680 /// the value in the first parameter is less than the corresponding value in
681 /// the second parameter.
682 ///
683 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
684 ///
685 /// \headerfile <x86intrin.h>
686 ///
687 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
688 ///
689 /// \param __a
690 /// A 128-bit vector of [2 x double]. The lower double-precision value is
691 /// compared to the lower double-precision value of \a __b.
692 /// \param __b
693 /// A 128-bit vector of [2 x double]. The lower double-precision value is
694 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
696 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmplt_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
}
701
702 /// Compares the lower double-precision floating-point values in each of
703 /// the two 128-bit floating-point vectors of [2 x double] to determine if
704 /// the value in the first parameter is less than or equal to the
705 /// corresponding value in the second parameter.
706 ///
707 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
708 ///
709 /// \headerfile <x86intrin.h>
710 ///
711 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
712 ///
713 /// \param __a
714 /// A 128-bit vector of [2 x double]. The lower double-precision value is
715 /// compared to the lower double-precision value of \a __b.
716 /// \param __b
717 /// A 128-bit vector of [2 x double]. The lower double-precision value is
718 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
720 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmple_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
}
725
726 /// Compares the lower double-precision floating-point values in each of
727 /// the two 128-bit floating-point vectors of [2 x double] to determine if
728 /// the value in the first parameter is greater than the corresponding value
729 /// in the second parameter.
730 ///
731 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
732 ///
733 /// \headerfile <x86intrin.h>
734 ///
735 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
736 ///
737 /// \param __a
738 /// A 128-bit vector of [2 x double]. The lower double-precision value is
739 /// compared to the lower double-precision value of \a __b.
740 /// \param __b
741 /// A 128-bit vector of [2 x double]. The lower double-precision value is
742 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
744 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpgt_sd(__m128d __a, __m128d __b) {
  __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
  return __extension__(__m128d){__c[0], __a[1]};
}
750
751 /// Compares the lower double-precision floating-point values in each of
752 /// the two 128-bit floating-point vectors of [2 x double] to determine if
753 /// the value in the first parameter is greater than or equal to the
754 /// corresponding value in the second parameter.
755 ///
756 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
757 ///
758 /// \headerfile <x86intrin.h>
759 ///
760 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
761 ///
762 /// \param __a
763 /// A 128-bit vector of [2 x double]. The lower double-precision value is
764 /// compared to the lower double-precision value of \a __b.
765 /// \param __b
766 /// A 128-bit vector of [2 x double]. The lower double-precision value is
767 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
769 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpge_sd(__m128d __a, __m128d __b) {
  __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
  return __extension__(__m128d){__c[0], __a[1]};
}
775
776 /// Compares the lower double-precision floating-point values in each of
777 /// the two 128-bit floating-point vectors of [2 x double] to determine if
778 /// the value in the first parameter is "ordered" with respect to the
779 /// corresponding value in the second parameter.
780 ///
781 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
782 /// of double-precision values are "ordered" with respect to each other if
783 /// neither value is a NaN.
784 ///
785 /// \headerfile <x86intrin.h>
786 ///
787 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
788 ///
789 /// \param __a
790 /// A 128-bit vector of [2 x double]. The lower double-precision value is
791 /// compared to the lower double-precision value of \a __b.
792 /// \param __b
793 /// A 128-bit vector of [2 x double]. The lower double-precision value is
794 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
796 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpord_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
}
801
802 /// Compares the lower double-precision floating-point values in each of
803 /// the two 128-bit floating-point vectors of [2 x double] to determine if
804 /// the value in the first parameter is "unordered" with respect to the
805 /// corresponding value in the second parameter.
806 ///
807 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
808 /// of double-precision values are "unordered" with respect to each other if
809 /// one or both values are NaN.
810 ///
811 /// \headerfile <x86intrin.h>
812 ///
813 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
814 /// instruction.
815 ///
816 /// \param __a
817 /// A 128-bit vector of [2 x double]. The lower double-precision value is
818 /// compared to the lower double-precision value of \a __b.
819 /// \param __b
820 /// A 128-bit vector of [2 x double]. The lower double-precision value is
821 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
823 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpunord_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
}
828
829 /// Compares the lower double-precision floating-point values in each of
830 /// the two 128-bit floating-point vectors of [2 x double] to determine if
831 /// the value in the first parameter is unequal to the corresponding value in
832 /// the second parameter.
833 ///
834 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
835 ///
836 /// \headerfile <x86intrin.h>
837 ///
838 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
839 ///
840 /// \param __a
841 /// A 128-bit vector of [2 x double]. The lower double-precision value is
842 /// compared to the lower double-precision value of \a __b.
843 /// \param __b
844 /// A 128-bit vector of [2 x double]. The lower double-precision value is
845 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
847 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpneq_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
}
852
853 /// Compares the lower double-precision floating-point values in each of
854 /// the two 128-bit floating-point vectors of [2 x double] to determine if
855 /// the value in the first parameter is not less than the corresponding
856 /// value in the second parameter.
857 ///
858 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
859 ///
860 /// \headerfile <x86intrin.h>
861 ///
862 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
863 ///
864 /// \param __a
865 /// A 128-bit vector of [2 x double]. The lower double-precision value is
866 /// compared to the lower double-precision value of \a __b.
867 /// \param __b
868 /// A 128-bit vector of [2 x double]. The lower double-precision value is
869 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
871 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnlt_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
}
876
877 /// Compares the lower double-precision floating-point values in each of
878 /// the two 128-bit floating-point vectors of [2 x double] to determine if
879 /// the value in the first parameter is not less than or equal to the
880 /// corresponding value in the second parameter.
881 ///
882 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
883 ///
884 /// \headerfile <x86intrin.h>
885 ///
886 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
887 ///
888 /// \param __a
889 /// A 128-bit vector of [2 x double]. The lower double-precision value is
890 /// compared to the lower double-precision value of \a __b.
891 /// \param __b
892 /// A 128-bit vector of [2 x double]. The lower double-precision value is
893 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
895 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnle_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
}
900
901 /// Compares the lower double-precision floating-point values in each of
902 /// the two 128-bit floating-point vectors of [2 x double] to determine if
903 /// the value in the first parameter is not greater than the corresponding
904 /// value in the second parameter.
905 ///
906 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
907 ///
908 /// \headerfile <x86intrin.h>
909 ///
910 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
911 ///
912 /// \param __a
913 /// A 128-bit vector of [2 x double]. The lower double-precision value is
914 /// compared to the lower double-precision value of \a __b.
915 /// \param __b
916 /// A 128-bit vector of [2 x double]. The lower double-precision value is
917 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
919 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpngt_sd(__m128d __a, __m128d __b) {
  __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
  return __extension__(__m128d){__c[0], __a[1]};
}
925
926 /// Compares the lower double-precision floating-point values in each of
927 /// the two 128-bit floating-point vectors of [2 x double] to determine if
928 /// the value in the first parameter is not greater than or equal to the
929 /// corresponding value in the second parameter.
930 ///
931 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
932 ///
933 /// \headerfile <x86intrin.h>
934 ///
935 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
936 ///
937 /// \param __a
938 /// A 128-bit vector of [2 x double]. The lower double-precision value is
939 /// compared to the lower double-precision value of \a __b.
940 /// \param __b
941 /// A 128-bit vector of [2 x double]. The lower double-precision value is
942 /// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
944 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnge_sd(__m128d __a, __m128d __b) {
  __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
  return __extension__(__m128d){__c[0], __a[1]};
}
950
951 /// Compares the lower double-precision floating-point values in each of
952 /// the two 128-bit floating-point vectors of [2 x double] for equality.
953 ///
954 /// The comparison returns 0 for false, 1 for true. If either of the two
955 /// lower double-precision values is NaN, returns 0.
956 ///
957 /// \headerfile <x86intrin.h>
958 ///
959 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
960 ///
961 /// \param __a
962 /// A 128-bit vector of [2 x double]. The lower double-precision value is
963 /// compared to the lower double-precision value of \a __b.
964 /// \param __b
965 /// A 128-bit vector of [2 x double]. The lower double-precision value is
966 /// compared to the lower double-precision value of \a __a.
967 /// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comieq_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
}
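
/* Illustrative usage sketch (not part of the original header): the comi/ucomi
 * family returns a plain int, which makes it convenient in scalar control
 * flow. With a NaN operand the "equal" test reports 0, as documented above.
 *
 *   __m128d a = _mm_set_sd(1.0);
 *   __m128d b = _mm_set_sd(__builtin_nan(""));
 *   int eq = _mm_comieq_sd(a, b);   // 0: the unordered pair compares as not equal
 */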
972
973 /// Compares the lower double-precision floating-point values in each of
974 /// the two 128-bit floating-point vectors of [2 x double] to determine if
975 /// the value in the first parameter is less than the corresponding value in
976 /// the second parameter.
977 ///
978 /// The comparison returns 0 for false, 1 for true. If either of the two
979 /// lower double-precision values is NaN, returns 0.
980 ///
981 /// \headerfile <x86intrin.h>
982 ///
983 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
984 ///
985 /// \param __a
986 /// A 128-bit vector of [2 x double]. The lower double-precision value is
987 /// compared to the lower double-precision value of \a __b.
988 /// \param __b
989 /// A 128-bit vector of [2 x double]. The lower double-precision value is
990 /// compared to the lower double-precision value of \a __a.
991 /// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comilt_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
}
996
997 /// Compares the lower double-precision floating-point values in each of
998 /// the two 128-bit floating-point vectors of [2 x double] to determine if
999 /// the value in the first parameter is less than or equal to the
1000 /// corresponding value in the second parameter.
1001 ///
1002 /// The comparison returns 0 for false, 1 for true. If either of the two
1003 /// lower double-precision values is NaN, returns 0.
1004 ///
1005 /// \headerfile <x86intrin.h>
1006 ///
1007 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1008 ///
1009 /// \param __a
1010 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1011 /// compared to the lower double-precision value of \a __b.
1012 /// \param __b
1013 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1014 /// compared to the lower double-precision value of \a __a.
1015 /// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comile_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
}
1020
1021 /// Compares the lower double-precision floating-point values in each of
1022 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1023 /// the value in the first parameter is greater than the corresponding value
1024 /// in the second parameter.
1025 ///
1026 /// The comparison returns 0 for false, 1 for true. If either of the two
1027 /// lower double-precision values is NaN, returns 0.
1028 ///
1029 /// \headerfile <x86intrin.h>
1030 ///
1031 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1032 ///
1033 /// \param __a
1034 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1035 /// compared to the lower double-precision value of \a __b.
1036 /// \param __b
1037 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1038 /// compared to the lower double-precision value of \a __a.
1039 /// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comigt_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
}
1044
1045 /// Compares the lower double-precision floating-point values in each of
1046 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1047 /// the value in the first parameter is greater than or equal to the
1048 /// corresponding value in the second parameter.
1049 ///
1050 /// The comparison returns 0 for false, 1 for true. If either of the two
1051 /// lower double-precision values is NaN, returns 0.
1052 ///
1053 /// \headerfile <x86intrin.h>
1054 ///
1055 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1056 ///
1057 /// \param __a
1058 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1059 /// compared to the lower double-precision value of \a __b.
1060 /// \param __b
1061 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1062 /// compared to the lower double-precision value of \a __a.
1063 /// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comige_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
}
1068
1069 /// Compares the lower double-precision floating-point values in each of
1070 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1071 /// the value in the first parameter is unequal to the corresponding value in
1072 /// the second parameter.
1073 ///
1074 /// The comparison returns 0 for false, 1 for true. If either of the two
1075 /// lower double-precision values is NaN, 1 is returned.
1076 ///
1077 /// \headerfile <x86intrin.h>
1078 ///
1079 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1080 ///
1081 /// \param __a
1082 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1083 /// compared to the lower double-precision value of \a __b.
1084 /// \param __b
1085 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1086 /// compared to the lower double-precision value of \a __a.
1087 /// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comineq_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
}
1092
1093 /// Compares the lower double-precision floating-point values in each of
1094 /// the two 128-bit floating-point vectors of [2 x double] for equality.
1095 ///
1096 /// The comparison returns 0 for false, 1 for true. If either of the two
1097 /// lower double-precision values is NaN, returns 0.
1098 ///
1099 /// \headerfile <x86intrin.h>
1100 ///
1101 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1102 ///
1103 /// \param __a
1104 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1105 /// compared to the lower double-precision value of \a __b.
1106 /// \param __b
1107 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1108 /// compared to the lower double-precision value of \a __a.
1109 /// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomieq_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
}
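
/* Illustrative note (not part of the original header): for ordinary values the
 * ucomi* intrinsics return the same results as the comi* family above; the
 * underlying UCOMISD instruction differs in that it raises the invalid
 * floating-point exception only for signaling NaNs, while COMISD also raises
 * it for quiet NaNs.
 *
 *   __m128d qnan = _mm_set_sd(__builtin_nan(""));
 *   int r = _mm_ucomieq_sd(qnan, qnan);   // 0, without raising the invalid exception
 */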
1114
1115 /// Compares the lower double-precision floating-point values in each of
1116 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1117 /// the value in the first parameter is less than the corresponding value in
1118 /// the second parameter.
1119 ///
1120 /// The comparison returns 0 for false, 1 for true. If either of the two
1121 /// lower double-precision values is NaN, returns 0.
1122 ///
1123 /// \headerfile <x86intrin.h>
1124 ///
1125 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1126 ///
1127 /// \param __a
1128 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1129 /// compared to the lower double-precision value of \a __b.
1130 /// \param __b
1131 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1132 /// compared to the lower double-precision value of \a __a.
1133 /// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomilt_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
}
1138
1139 /// Compares the lower double-precision floating-point values in each of
1140 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1141 /// the value in the first parameter is less than or equal to the
1142 /// corresponding value in the second parameter.
1143 ///
1144 /// The comparison returns 0 for false, 1 for true. If either of the two
1145 /// lower double-precision values is NaN, returns 0.
1146 ///
1147 /// \headerfile <x86intrin.h>
1148 ///
1149 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1150 ///
1151 /// \param __a
1152 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1153 /// compared to the lower double-precision value of \a __b.
1154 /// \param __b
1155 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1156 /// compared to the lower double-precision value of \a __a.
1157 /// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomile_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
}
1162
1163 /// Compares the lower double-precision floating-point values in each of
1164 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1165 /// the value in the first parameter is greater than the corresponding value
1166 /// in the second parameter.
1167 ///
1168 /// The comparison returns 0 for false, 1 for true. If either of the two
1169 /// lower double-precision values is NaN, returns 0.
1170 ///
1171 /// \headerfile <x86intrin.h>
1172 ///
1173 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1174 ///
1175 /// \param __a
1176 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1177 /// compared to the lower double-precision value of \a __b.
1178 /// \param __b
1179 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1180 /// compared to the lower double-precision value of \a __a.
1181 /// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomigt_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
}
1186
1187 /// Compares the lower double-precision floating-point values in each of
1188 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1189 /// the value in the first parameter is greater than or equal to the
1190 /// corresponding value in the second parameter.
1191 ///
1192 /// The comparison returns 0 for false, 1 for true. If either of the two
1193 /// lower double-precision values is NaN, returns 0.
1194 ///
1195 /// \headerfile <x86intrin.h>
1196 ///
1197 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1198 ///
1199 /// \param __a
1200 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1201 /// compared to the lower double-precision value of \a __b.
1202 /// \param __b
1203 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1204 /// compared to the lower double-precision value of \a __a.
1205 /// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomige_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
}
1210
1211 /// Compares the lower double-precision floating-point values in each of
1212 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1213 /// the value in the first parameter is unequal to the corresponding value in
1214 /// the second parameter.
1215 ///
1216 /// The comparison returns 0 for false, 1 for true. If either of the two
1217 /// lower double-precision values is NaN, 1 is returned.
1218 ///
1219 /// \headerfile <x86intrin.h>
1220 ///
1221 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1222 ///
1223 /// \param __a
1224 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1225 /// compared to the lower double-precision value of \a __b.
1226 /// \param __b
1227 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1228 /// compared to the lower double-precision value of \a __a.
1229 /// \returns An integer containing the comparison result.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomineq_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
}
1234
1235 /// Converts the two double-precision floating-point elements of a
1236 /// 128-bit vector of [2 x double] into two single-precision floating-point
1237 /// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1238 /// The upper 64 bits of the result vector are set to zero.
1239 ///
1240 /// \headerfile <x86intrin.h>
1241 ///
1242 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1243 ///
1244 /// \param __a
1245 /// A 128-bit vector of [2 x double].
1246 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1247 /// converted values. The upper 64 bits are set to zero.
_mm_cvtpd_ps(__m128d __a)1248 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
1249 return __builtin_ia32_cvtpd2ps((__v2df)__a);
1250 }
1251
1252 /// Converts the lower two single-precision floating-point elements of a
1253 /// 128-bit vector of [4 x float] into two double-precision floating-point
1254 /// values, returned in a 128-bit vector of [2 x double]. The upper two
1255 /// elements of the input vector are unused.
1256 ///
1257 /// \headerfile <x86intrin.h>
1258 ///
1259 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1260 ///
1261 /// \param __a
1262 /// A 128-bit vector of [4 x float]. The lower two single-precision
1263 /// floating-point elements are converted to double-precision values. The
1264 /// upper two elements are unused.
1265 /// \returns A 128-bit vector of [2 x double] containing the converted values.
_mm_cvtps_pd(__m128 __a)1266 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
1267 return (__m128d) __builtin_convertvector(
1268 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1269 }
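/* Illustrative usage sketch (not part of the original header): converting
 * between [2 x double] and the lower half of [4 x float]. Values and names
 * are examples only.
 *
 *   __m128d __d  = _mm_set_pd(2.5, 1.5);   // { 1.5, 2.5 }
 *   __m128  __f  = _mm_cvtpd_ps(__d);      // { 1.5f, 2.5f, 0.0f, 0.0f }
 *   __m128d __d2 = _mm_cvtps_pd(__f);      // { 1.5, 2.5 } again
 */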
1270
1271 /// Converts the lower two integer elements of a 128-bit vector of
1272 /// [4 x i32] into two double-precision floating-point values, returned in a
1273 /// 128-bit vector of [2 x double].
1274 ///
1275 /// The upper two elements of the input vector are unused.
1276 ///
1277 /// \headerfile <x86intrin.h>
1278 ///
1279 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1280 ///
1281 /// \param __a
1282 /// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1283 /// converted to double-precision values.
1284 ///
1285 /// The upper two elements are unused.
1286 /// \returns A 128-bit vector of [2 x double] containing the converted values.
_mm_cvtepi32_pd(__m128i __a)1287 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
1288 return (__m128d) __builtin_convertvector(
1289 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1290 }
1291
1292 /// Converts the two double-precision floating-point elements of a
1293 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1294 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1295 /// 64 bits of the result vector are set to zero.
1296 ///
1297 /// \headerfile <x86intrin.h>
1298 ///
1299 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1300 ///
1301 /// \param __a
1302 /// A 128-bit vector of [2 x double].
1303 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1304 /// converted values. The upper 64 bits are set to zero.
_mm_cvtpd_epi32(__m128d __a)1305 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
1306 return __builtin_ia32_cvtpd2dq((__v2df)__a);
1307 }
1308
1309 /// Converts the low-order element of a 128-bit vector of [2 x double]
1310 /// into a 32-bit signed integer value.
1311 ///
1312 /// \headerfile <x86intrin.h>
1313 ///
1314 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1315 ///
1316 /// \param __a
1317 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1318 /// conversion.
1319 /// \returns A 32-bit signed integer containing the converted value.
_mm_cvtsd_si32(__m128d __a)1320 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
1321 return __builtin_ia32_cvtsd2si((__v2df)__a);
1322 }
1323
1324 /// Converts the lower double-precision floating-point element of a
1325 /// 128-bit vector of [2 x double], in the second parameter, into a
1326 /// single-precision floating-point value, returned in the lower 32 bits of a
1327 /// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1328 /// copied from the upper 96 bits of the first parameter.
1329 ///
1330 /// \headerfile <x86intrin.h>
1331 ///
1332 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1333 ///
1334 /// \param __a
1335 /// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1336 /// copied to the upper 96 bits of the result.
1337 /// \param __b
1338 /// A 128-bit vector of [2 x double]. The lower double-precision
1339 /// floating-point element is used in the conversion.
1340 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1341 /// converted value from the second parameter. The upper 96 bits are copied
1342 /// from the upper 96 bits of the first parameter.
_mm_cvtsd_ss(__m128 __a,__m128d __b)1343 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
1344 __m128d __b) {
1345 return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1346 }
1347
1348 /// Converts a 32-bit signed integer value, in the second parameter, into
1349 /// a double-precision floating-point value, returned in the lower 64 bits of
1350 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1351 /// are copied from the upper 64 bits of the first parameter.
1352 ///
1353 /// \headerfile <x86intrin.h>
1354 ///
1355 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1356 ///
1357 /// \param __a
1358 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1359 /// copied to the upper 64 bits of the result.
1360 /// \param __b
1361 /// A 32-bit signed integer containing the value to be converted.
1362 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1363 /// converted value from the second parameter. The upper 64 bits are copied
1364 /// from the upper 64 bits of the first parameter.
_mm_cvtsi32_sd(__m128d __a,int __b)1365 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
1366 int __b) {
1367 __a[0] = __b;
1368 return __a;
1369 }
1370
1371 /// Converts the lower single-precision floating-point element of a
1372 /// 128-bit vector of [4 x float], in the second parameter, into a
1373 /// double-precision floating-point value, returned in the lower 64 bits of
1374 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1375 /// are copied from the upper 64 bits of the first parameter.
1376 ///
1377 /// \headerfile <x86intrin.h>
1378 ///
1379 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1380 ///
1381 /// \param __a
1382 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1383 /// copied to the upper 64 bits of the result.
1384 /// \param __b
1385 /// A 128-bit vector of [4 x float]. The lower single-precision
1386 /// floating-point element is used in the conversion.
1387 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1388 /// converted value from the second parameter. The upper 64 bits are copied
1389 /// from the upper 64 bits of the first parameter.
_mm_cvtss_sd(__m128d __a,__m128 __b)1390 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
1391 __m128 __b) {
1392 __a[0] = __b[0];
1393 return __a;
1394 }
1395
1396 /// Converts the two double-precision floating-point elements of a
1397 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1398 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32].
1399 ///
1400 /// If the result of either conversion is inexact, the result is truncated
1401 /// (rounded towards zero) regardless of the current MXCSR setting. The upper
1402 /// 64 bits of the result vector are set to zero.
1403 ///
1404 /// \headerfile <x86intrin.h>
1405 ///
1406 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1407 /// instruction.
1408 ///
1409 /// \param __a
1410 /// A 128-bit vector of [2 x double].
1411 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1412 /// converted values. The upper 64 bits are set to zero.
_mm_cvttpd_epi32(__m128d __a)1413 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
1414 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1415 }
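/* Illustrative usage sketch (not part of the original header):
 * _mm_cvtpd_epi32 rounds according to the current MXCSR rounding mode
 * (round-to-nearest-even by default), while _mm_cvttpd_epi32 always
 * truncates toward zero.
 *
 *   __m128d __v = _mm_set_pd(-1.5, 2.5);   // { 2.5, -1.5 }
 *   __m128i __r = _mm_cvtpd_epi32(__v);    // { 2, -2, 0, 0 } with default rounding
 *   __m128i __t = _mm_cvttpd_epi32(__v);   // { 2, -1, 0, 0 }
 */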
1416
1417 /// Converts the low-order element of a [2 x double] vector into a 32-bit
1418 /// signed integer value, truncating the result when it is inexact.
1419 ///
1420 /// \headerfile <x86intrin.h>
1421 ///
1422 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1423 /// instruction.
1424 ///
1425 /// \param __a
1426 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1427 /// conversion.
1428 /// \returns A 32-bit signed integer containing the converted value.
_mm_cvttsd_si32(__m128d __a)1429 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
1430 return __builtin_ia32_cvttsd2si((__v2df)__a);
1431 }
1432
1433 /// Converts the two double-precision floating-point elements of a
1434 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1435 /// returned in a 64-bit vector of [2 x i32].
1436 ///
1437 /// \headerfile <x86intrin.h>
1438 ///
1439 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1440 ///
1441 /// \param __a
1442 /// A 128-bit vector of [2 x double].
1443 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
_mm_cvtpd_pi32(__m128d __a)1444 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
1445 return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
1446 }
1447
1448 /// Converts the two double-precision floating-point elements of a
1449 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1450 /// returned in a 64-bit vector of [2 x i32].
1451 ///
1452 /// If the result of either conversion is inexact, the result is truncated
1453 /// (rounded towards zero) regardless of the current MXCSR setting.
1454 ///
1455 /// \headerfile <x86intrin.h>
1456 ///
1457 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1458 ///
1459 /// \param __a
1460 /// A 128-bit vector of [2 x double].
1461 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
_mm_cvttpd_pi32(__m128d __a)1462 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
1463 return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
1464 }
1465
1466 /// Converts the two signed 32-bit integer elements of a 64-bit vector of
1467 /// [2 x i32] into two double-precision floating-point values, returned in a
1468 /// 128-bit vector of [2 x double].
1469 ///
1470 /// \headerfile <x86intrin.h>
1471 ///
1472 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1473 ///
1474 /// \param __a
1475 /// A 64-bit vector of [2 x i32].
1476 /// \returns A 128-bit vector of [2 x double] containing the converted values.
_mm_cvtpi32_pd(__m64 __a)1477 static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
1478 return __builtin_ia32_cvtpi2pd((__v2si)__a);
1479 }
1480
1481 /// Returns the low-order element of a 128-bit vector of [2 x double] as
1482 /// a double-precision floating-point value.
1483 ///
1484 /// \headerfile <x86intrin.h>
1485 ///
1486 /// This intrinsic has no corresponding instruction.
1487 ///
1488 /// \param __a
1489 /// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1490 /// \returns A double-precision floating-point value copied from the lower 64
1491 /// bits of \a __a.
_mm_cvtsd_f64(__m128d __a)1492 static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) {
1493 return __a[0];
1494 }
1495
1496 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1497 /// memory location.
1498 ///
1499 /// \headerfile <x86intrin.h>
1500 ///
1501 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1502 ///
1503 /// \param __dp
1504 /// A pointer to a 128-bit memory location. The address of the memory
1505 /// location has to be 16-byte aligned.
1506 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
_mm_load_pd(double const * __dp)1507 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
1508 return *(const __m128d *)__dp;
1509 }
1510
1511 /// Loads a double-precision floating-point value from a specified memory
1512 /// location and duplicates it to both vector elements of a 128-bit vector of
1513 /// [2 x double].
1514 ///
1515 /// \headerfile <x86intrin.h>
1516 ///
1517 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1518 ///
1519 /// \param __dp
1520 /// A pointer to a memory location containing a double-precision value.
1521 /// \returns A 128-bit vector of [2 x double] containing the loaded and
1522 /// duplicated values.
_mm_load1_pd(double const * __dp)1523 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
1524 struct __mm_load1_pd_struct {
1525 double __u;
1526 } __attribute__((__packed__, __may_alias__));
1527 double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u;
1528 return __extension__(__m128d){__u, __u};
1529 }
1530
1531 #define _mm_load_pd1(dp) _mm_load1_pd(dp)
1532
1533 /// Loads two double-precision values, in reverse order, from an aligned
1534 /// memory location into a 128-bit vector of [2 x double].
1535 ///
1536 /// \headerfile <x86intrin.h>
1537 ///
1538 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1539 /// needed shuffling instructions. In AVX mode, the shuffling may be combined
1540 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1541 ///
1542 /// \param __dp
1543 /// A 16-byte aligned pointer to an array of double-precision values to be
1544 /// loaded in reverse order.
1545 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1546 /// values.
_mm_loadr_pd(double const * __dp)1547 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
1548 __m128d __u = *(const __m128d *)__dp;
1549 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1550 }
1551
1552 /// Loads a 128-bit floating-point vector of [2 x double] from an
1553 /// unaligned memory location.
1554 ///
1555 /// \headerfile <x86intrin.h>
1556 ///
1557 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1558 ///
1559 /// \param __dp
1560 /// A pointer to a 128-bit memory location. The address of the memory
1561 /// location does not have to be aligned.
1562 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
_mm_loadu_pd(double const * __dp)1563 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
1564 struct __loadu_pd {
1565 __m128d_u __v;
1566 } __attribute__((__packed__, __may_alias__));
1567 return ((const struct __loadu_pd *)__dp)->__v;
1568 }
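/* Illustrative usage sketch (not part of the original header): _mm_load_pd
 * requires a 16-byte aligned address, while _mm_loadu_pd accepts any
 * address. The buffer below is an example only.
 *
 *   double __buf[4] __attribute__((aligned(16))) = {1.0, 2.0, 3.0, 4.0};
 *   __m128d __lo  = _mm_load_pd(__buf);       // aligned load of { 1.0, 2.0 }
 *   __m128d __mid = _mm_loadu_pd(__buf + 1);  // unaligned load of { 2.0, 3.0 }
 */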
1569
1570 /// Loads a 64-bit integer value to the low element of a 128-bit integer
1571 /// vector and clears the upper element.
1572 ///
1573 /// \headerfile <x86intrin.h>
1574 ///
1575 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1576 ///
1577 /// \param __a
1578 /// A pointer to a 64-bit memory location. The address of the memory
1579 /// location does not have to be aligned.
1580 /// \returns A 128-bit vector of [2 x i64] containing the loaded value.
_mm_loadu_si64(void const * __a)1581 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
1582 struct __loadu_si64 {
1583 long long __v;
1584 } __attribute__((__packed__, __may_alias__));
1585 long long __u = ((const struct __loadu_si64 *)__a)->__v;
1586 return __extension__(__m128i)(__v2di){__u, 0LL};
1587 }
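/* Illustrative usage sketch (not part of the original header): loading one
 * 64-bit integer from memory into the low half of a vector. The source
 * value is an example only.
 *
 *   long long __x = 0x0123456789abcdefLL;
 *   __m128i __v = _mm_loadu_si64(&__x);   // low 64 bits = __x, high 64 bits = 0
 */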
1588
1589 /// Loads a 32-bit integer value to the low element of a 128-bit integer
1590 /// vector and clears the upper elements.
1591 ///
1592 /// \headerfile <x86intrin.h>
1593 ///
1594 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1595 ///
1596 /// \param __a
1597 /// A pointer to a 32-bit memory location. The address of the memory
1598 /// location does not have to be aligned.
1599 /// \returns A 128-bit vector of [4 x i32] containing the loaded value.
_mm_loadu_si32(void const * __a)1600 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
1601 struct __loadu_si32 {
1602 int __v;
1603 } __attribute__((__packed__, __may_alias__));
1604 int __u = ((const struct __loadu_si32 *)__a)->__v;
1605 return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
1606 }
1607
1608 /// Loads a 16-bit integer value to the low element of a 128-bit integer
1609 /// vector and clears the upper elements.
1610 ///
1611 /// \headerfile <x86intrin.h>
1612 ///
1613 /// This intrinsic does not correspond to a specific instruction.
1614 ///
1615 /// \param __a
1616 /// A pointer to a 16-bit memory location. The address of the memory
1617 /// location does not have to be aligned.
1618 /// \returns A 128-bit vector of [8 x i16] containing the loaded value.
_mm_loadu_si16(void const * __a)1619 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
1620 struct __loadu_si16 {
1621 short __v;
1622 } __attribute__((__packed__, __may_alias__));
1623 short __u = ((const struct __loadu_si16 *)__a)->__v;
1624 return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1625 }
1626
1627 /// Loads a double-precision floating-point value into the low-order element
1628 /// of a 128-bit vector of [2 x double] and clears the upper element.
1629 ///
1630 /// \headerfile <x86intrin.h>
1631 ///
1632 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1633 ///
1634 /// \param __dp
1635 /// A pointer to a memory location containing a double-precision value.
1636 /// The address of the memory location does not have to be aligned.
1637 /// \returns A 128-bit vector of [2 x double] containing the loaded value.
_mm_load_sd(double const * __dp)1638 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
1639 struct __mm_load_sd_struct {
1640 double __u;
1641 } __attribute__((__packed__, __may_alias__));
1642 double __u = ((const struct __mm_load_sd_struct *)__dp)->__u;
1643 return __extension__(__m128d){__u, 0};
1644 }
1645
1646 /// Loads a double-precision value into the high-order bits of a 128-bit
1647 /// vector of [2 x double]. The low-order bits are copied from the low-order
1648 /// bits of the first operand.
1649 ///
1650 /// \headerfile <x86intrin.h>
1651 ///
1652 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1653 ///
1654 /// \param __a
1655 /// A 128-bit vector of [2 x double]. \n
1656 /// Bits [63:0] are written to bits [63:0] of the result.
1657 /// \param __dp
1658 /// A pointer to a 64-bit memory location containing a double-precision
1659 /// floating-point value that is loaded. The loaded value is written to bits
1660 /// [127:64] of the result. The address of the memory location does not have
1661 /// to be aligned.
1662 /// \returns A 128-bit vector of [2 x double] containing the moved values.
_mm_loadh_pd(__m128d __a,double const * __dp)1663 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
1664 double const *__dp) {
1665 struct __mm_loadh_pd_struct {
1666 double __u;
1667 } __attribute__((__packed__, __may_alias__));
1668 double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u;
1669 return __extension__(__m128d){__a[0], __u};
1670 }
1671
1672 /// Loads a double-precision value into the low-order bits of a 128-bit
1673 /// vector of [2 x double]. The high-order bits are copied from the
1674 /// high-order bits of the first operand.
1675 ///
1676 /// \headerfile <x86intrin.h>
1677 ///
1678 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1679 ///
1680 /// \param __a
1681 /// A 128-bit vector of [2 x double]. \n
1682 /// Bits [127:64] are written to bits [127:64] of the result.
1683 /// \param __dp
1684 /// A pointer to a 64-bit memory location containing a double-precision
1685 /// floating-point value that is loaded. The loaded value is written to bits
1686 /// [63:0] of the result. The address of the memory location does not have to
1687 /// be aligned.
1688 /// \returns A 128-bit vector of [2 x double] containing the moved values.
_mm_loadl_pd(__m128d __a,double const * __dp)1689 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
1690 double const *__dp) {
1691 struct __mm_loadl_pd_struct {
1692 double __u;
1693 } __attribute__((__packed__, __may_alias__));
1694 double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u;
1695 return __extension__(__m128d){__u, __a[1]};
1696 }
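/* Illustrative usage sketch (not part of the original header): assembling a
 * [2 x double] from two separate memory locations with _mm_loadl_pd and
 * _mm_loadh_pd. The variables below are examples only.
 *
 *   double __lo = 1.0, __hi = 2.0;
 *   __m128d __v = _mm_setzero_pd();
 *   __v = _mm_loadl_pd(__v, &__lo);   // { 1.0, 0.0 }
 *   __v = _mm_loadh_pd(__v, &__hi);   // { 1.0, 2.0 }
 */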
1697
1698 /// Constructs a 128-bit floating-point vector of [2 x double] with
1699 /// unspecified content. This could be used as an argument to another
1700 /// intrinsic function where the argument is required but the value is not
1701 /// actually used.
1702 ///
1703 /// \headerfile <x86intrin.h>
1704 ///
1705 /// This intrinsic has no corresponding instruction.
1706 ///
1707 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1708 /// content.
_mm_undefined_pd(void)1709 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
1710 return (__m128d)__builtin_ia32_undef128();
1711 }
1712
1713 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1714 /// 64 bits of the vector are initialized with the specified double-precision
1715 /// floating-point value. The upper 64 bits are set to zero.
1716 ///
1717 /// \headerfile <x86intrin.h>
1718 ///
1719 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1720 ///
1721 /// \param __w
1722 /// A double-precision floating-point value used to initialize the lower 64
1723 /// bits of the result.
1724 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1725 /// lower 64 bits contain the value of the parameter. The upper 64 bits are
1726 /// set to zero.
_mm_set_sd(double __w)1727 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
1728 return __extension__(__m128d){__w, 0};
1729 }
1730
1731 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1732 /// of the two double-precision floating-point vector elements set to the
1733 /// specified double-precision floating-point value.
1734 ///
1735 /// \headerfile <x86intrin.h>
1736 ///
1737 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1738 ///
1739 /// \param __w
1740 /// A double-precision floating-point value used to initialize each vector
1741 /// element of the result.
1742 /// \returns An initialized 128-bit floating-point vector of [2 x double].
_mm_set1_pd(double __w)1743 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
1744 return __extension__(__m128d){__w, __w};
1745 }
1746
1747 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1748 /// of the two double-precision floating-point vector elements set to the
1749 /// specified double-precision floating-point value.
1750 ///
1751 /// \headerfile <x86intrin.h>
1752 ///
1753 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1754 ///
1755 /// \param __w
1756 /// A double-precision floating-point value used to initialize each vector
1757 /// element of the result.
1758 /// \returns An initialized 128-bit floating-point vector of [2 x double].
_mm_set_pd1(double __w)1759 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
1760 return _mm_set1_pd(__w);
1761 }
1762
1763 /// Constructs a 128-bit floating-point vector of [2 x double]
1764 /// initialized with the specified double-precision floating-point values.
1765 ///
1766 /// \headerfile <x86intrin.h>
1767 ///
1768 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1769 ///
1770 /// \param __w
1771 /// A double-precision floating-point value used to initialize the upper 64
1772 /// bits of the result.
1773 /// \param __x
1774 /// A double-precision floating-point value used to initialize the lower 64
1775 /// bits of the result.
1776 /// \returns An initialized 128-bit floating-point vector of [2 x double].
_mm_set_pd(double __w,double __x)1777 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
1778 double __x) {
1779 return __extension__(__m128d){__x, __w};
1780 }
1781
1782 /// Constructs a 128-bit floating-point vector of [2 x double],
1783 /// initialized in reverse order with the specified double-precision
1784 /// floating-point values.
1785 ///
1786 /// \headerfile <x86intrin.h>
1787 ///
1788 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1789 ///
1790 /// \param __w
1791 /// A double-precision floating-point value used to initialize the lower 64
1792 /// bits of the result.
1793 /// \param __x
1794 /// A double-precision floating-point value used to initialize the upper 64
1795 /// bits of the result.
1796 /// \returns An initialized 128-bit floating-point vector of [2 x double].
_mm_setr_pd(double __w,double __x)1797 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
1798 double __x) {
1799 return __extension__(__m128d){__w, __x};
1800 }
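/* Illustrative usage sketch (not part of the original header): _mm_set_pd
 * lists elements from highest to lowest, _mm_setr_pd from lowest to highest.
 *
 *   __m128d __a = _mm_set_pd(2.0, 1.0);   // element 0 = 1.0, element 1 = 2.0
 *   __m128d __b = _mm_setr_pd(1.0, 2.0);  // same layout as __a
 *   double __low = _mm_cvtsd_f64(__a);    // 1.0
 */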
1801
1802 /// Constructs a 128-bit floating-point vector of [2 x double]
1803 /// initialized to zero.
1804 ///
1805 /// \headerfile <x86intrin.h>
1806 ///
1807 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1808 ///
1809 /// \returns An initialized 128-bit floating-point vector of [2 x double] with
1810 /// all elements set to zero.
_mm_setzero_pd(void)1811 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
1812 return __extension__(__m128d){0.0, 0.0};
1813 }
1814
1815 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1816 /// 64 bits are set to the lower 64 bits of the second parameter. The upper
1817 /// 64 bits are set to the upper 64 bits of the first parameter.
1818 ///
1819 /// \headerfile <x86intrin.h>
1820 ///
1821 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1822 ///
1823 /// \param __a
1824 /// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1825 /// upper 64 bits of the result.
1826 /// \param __b
1827 /// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1828 /// lower 64 bits of the result.
1829 /// \returns A 128-bit vector of [2 x double] containing the moved values.
_mm_move_sd(__m128d __a,__m128d __b)1830 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a,
1831 __m128d __b) {
1832 __a[0] = __b[0];
1833 return __a;
1834 }
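/* Illustrative usage sketch (not part of the original header): _mm_move_sd
 * merges the low element of the second operand with the high element of the
 * first.
 *
 *   __m128d __a = _mm_set_pd(4.0, 3.0);   // { 3.0, 4.0 }
 *   __m128d __b = _mm_set_pd(2.0, 1.0);   // { 1.0, 2.0 }
 *   __m128d __m = _mm_move_sd(__a, __b);  // { 1.0, 4.0 }
 */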
1835
1836 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1837 /// memory location.
1838 ///
1839 /// \headerfile <x86intrin.h>
1840 ///
1841 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1842 ///
1843 /// \param __dp
1844 /// A pointer to a 64-bit memory location.
1845 /// \param __a
1846 /// A 128-bit vector of [2 x double] containing the value to be stored.
_mm_store_sd(double * __dp,__m128d __a)1847 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
1848 __m128d __a) {
1849 struct __mm_store_sd_struct {
1850 double __u;
1851 } __attribute__((__packed__, __may_alias__));
1852 ((struct __mm_store_sd_struct *)__dp)->__u = __a[0];
1853 }
1854
1855 /// Moves packed double-precision values from a 128-bit vector of
1856 /// [2 x double] to a memory location.
1857 ///
1858 /// \headerfile <x86intrin.h>
1859 ///
1860 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1861 ///
1862 /// \param __dp
1863 ///    A pointer to a 16-byte aligned memory location that can store two
1864 ///    double-precision values.
1865 /// \param __a
1866 /// A packed 128-bit vector of [2 x double] containing the values to be
1867 /// moved.
_mm_store_pd(double * __dp,__m128d __a)1868 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
1869 __m128d __a) {
1870 *(__m128d *)__dp = __a;
1871 }
1872
1873 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] into both
1874 /// the upper and lower 64 bits of a 16-byte aligned memory location.
1875 ///
1876 /// \headerfile <x86intrin.h>
1877 ///
1878 /// This intrinsic corresponds to the
1879 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1880 ///
1881 /// \param __dp
1882 /// A pointer to a memory location that can store two double-precision
1883 /// values.
1884 /// \param __a
1885 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1886 /// of the values in \a __dp.
_mm_store1_pd(double * __dp,__m128d __a)1887 static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
1888 __m128d __a) {
1889 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1890 _mm_store_pd(__dp, __a);
1891 }
1892
1893 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] into both
1894 /// the upper and lower 64 bits of a 16-byte aligned memory location.
1895 ///
1896 /// \headerfile <x86intrin.h>
1897 ///
1898 /// This intrinsic corresponds to the
1899 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1900 ///
1901 /// \param __dp
1902 /// A pointer to a memory location that can store two double-precision
1903 /// values.
1904 /// \param __a
1905 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1906 /// of the values in \a __dp.
_mm_store_pd1(double * __dp,__m128d __a)1907 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
1908 __m128d __a) {
1909 _mm_store1_pd(__dp, __a);
1910 }
1911
1912 /// Stores a 128-bit vector of [2 x double] into an unaligned memory
1913 /// location.
1914 ///
1915 /// \headerfile <x86intrin.h>
1916 ///
1917 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1918 ///
1919 /// \param __dp
1920 /// A pointer to a 128-bit memory location. The address of the memory
1921 /// location does not have to be aligned.
1922 /// \param __a
1923 /// A 128-bit vector of [2 x double] containing the values to be stored.
_mm_storeu_pd(double * __dp,__m128d __a)1924 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
1925 __m128d __a) {
1926 struct __storeu_pd {
1927 __m128d_u __v;
1928 } __attribute__((__packed__, __may_alias__));
1929 ((struct __storeu_pd *)__dp)->__v = __a;
1930 }
1931
1932 /// Stores two double-precision values, in reverse order, from a 128-bit
1933 /// vector of [2 x double] to a 16-byte aligned memory location.
1934 ///
1935 /// \headerfile <x86intrin.h>
1936 ///
1937 /// This intrinsic corresponds to a shuffling instruction followed by a
1938 /// <c> VMOVAPD / MOVAPD </c> instruction.
1939 ///
1940 /// \param __dp
1941 /// A pointer to a 16-byte aligned memory location that can store two
1942 /// double-precision values.
1943 /// \param __a
1944 /// A 128-bit vector of [2 x double] containing the values to be reversed and
1945 /// stored.
_mm_storer_pd(double * __dp,__m128d __a)1946 static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
1947 __m128d __a) {
1948 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
1949 *(__m128d *)__dp = __a;
1950 }
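/* Illustrative usage sketch (not part of the original header): storing a
 * [2 x double] in reversed element order. The destination buffer must be
 * 16-byte aligned; it is an example only.
 *
 *   double __out[2] __attribute__((aligned(16)));
 *   __m128d __v = _mm_setr_pd(1.0, 2.0);  // { 1.0, 2.0 }
 *   _mm_storer_pd(__out, __v);            // __out[0] == 2.0, __out[1] == 1.0
 */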
1951
1952 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
1953 /// memory location.
1954 ///
1955 /// \headerfile <x86intrin.h>
1956 ///
1957 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1958 ///
1959 /// \param __dp
1960 /// A pointer to a 64-bit memory location.
1961 /// \param __a
1962 /// A 128-bit vector of [2 x double] containing the value to be stored.
_mm_storeh_pd(double * __dp,__m128d __a)1963 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
1964 __m128d __a) {
1965 struct __mm_storeh_pd_struct {
1966 double __u;
1967 } __attribute__((__packed__, __may_alias__));
1968 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1];
1969 }
1970
1971 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1972 /// memory location.
1973 ///
1974 /// \headerfile <x86intrin.h>
1975 ///
1976 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1977 ///
1978 /// \param __dp
1979 /// A pointer to a 64-bit memory location.
1980 /// \param __a
1981 /// A 128-bit vector of [2 x double] containing the value to be stored.
_mm_storel_pd(double * __dp,__m128d __a)1982 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
1983 __m128d __a) {
1984 struct __mm_storeh_pd_struct {
1985 double __u;
1986 } __attribute__((__packed__, __may_alias__));
1987 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0];
1988 }
1989
1990 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
1991 /// saving the lower 8 bits of each sum in the corresponding element of a
1992 /// 128-bit result vector of [16 x i8].
1993 ///
1994 /// The integer elements of both parameters can be either signed or unsigned.
1995 ///
1996 /// \headerfile <x86intrin.h>
1997 ///
1998 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
1999 ///
2000 /// \param __a
2001 /// A 128-bit vector of [16 x i8].
2002 /// \param __b
2003 /// A 128-bit vector of [16 x i8].
2004 /// \returns A 128-bit vector of [16 x i8] containing the sums of both
2005 /// parameters.
_mm_add_epi8(__m128i __a,__m128i __b)2006 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a,
2007 __m128i __b) {
2008 return (__m128i)((__v16qu)__a + (__v16qu)__b);
2009 }
2010
2011 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2012 /// saving the lower 16 bits of each sum in the corresponding element of a
2013 /// 128-bit result vector of [8 x i16].
2014 ///
2015 /// The integer elements of both parameters can be either signed or unsigned.
2016 ///
2017 /// \headerfile <x86intrin.h>
2018 ///
2019 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2020 ///
2021 /// \param __a
2022 /// A 128-bit vector of [8 x i16].
2023 /// \param __b
2024 /// A 128-bit vector of [8 x i16].
2025 /// \returns A 128-bit vector of [8 x i16] containing the sums of both
2026 /// parameters.
_mm_add_epi16(__m128i __a,__m128i __b)2027 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
2028 __m128i __b) {
2029 return (__m128i)((__v8hu)__a + (__v8hu)__b);
2030 }
2031
2032 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2033 /// saving the lower 32 bits of each sum in the corresponding element of a
2034 /// 128-bit result vector of [4 x i32].
2035 ///
2036 /// The integer elements of both parameters can be either signed or unsigned.
2037 ///
2038 /// \headerfile <x86intrin.h>
2039 ///
2040 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2041 ///
2042 /// \param __a
2043 /// A 128-bit vector of [4 x i32].
2044 /// \param __b
2045 /// A 128-bit vector of [4 x i32].
2046 /// \returns A 128-bit vector of [4 x i32] containing the sums of both
2047 /// parameters.
_mm_add_epi32(__m128i __a,__m128i __b)2048 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
2049 __m128i __b) {
2050 return (__m128i)((__v4su)__a + (__v4su)__b);
2051 }
2052
2053 /// Adds two signed or unsigned 64-bit integer values, returning the
2054 /// lower 64 bits of the sum.
2055 ///
2056 /// \headerfile <x86intrin.h>
2057 ///
2058 /// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2059 ///
2060 /// \param __a
2061 /// A 64-bit integer.
2062 /// \param __b
2063 /// A 64-bit integer.
2064 /// \returns A 64-bit integer containing the sum of both parameters.
_mm_add_si64(__m64 __a,__m64 __b)2065 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
2066 __m64 __b) {
2067 return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
2068 }
2069
2070 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2071 /// saving the lower 64 bits of each sum in the corresponding element of a
2072 /// 128-bit result vector of [2 x i64].
2073 ///
2074 /// The integer elements of both parameters can be either signed or unsigned.
2075 ///
2076 /// \headerfile <x86intrin.h>
2077 ///
2078 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2079 ///
2080 /// \param __a
2081 /// A 128-bit vector of [2 x i64].
2082 /// \param __b
2083 /// A 128-bit vector of [2 x i64].
2084 /// \returns A 128-bit vector of [2 x i64] containing the sums of both
2085 /// parameters.
_mm_add_epi64(__m128i __a,__m128i __b)2086 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
2087 __m128i __b) {
2088 return (__m128i)((__v2du)__a + (__v2du)__b);
2089 }
2090
2091 /// Adds, with saturation, the corresponding elements of two 128-bit
2092 /// signed [16 x i8] vectors, saving each sum in the corresponding element
2093 /// of a 128-bit result vector of [16 x i8].
2094 ///
2095 /// Positive sums greater than 0x7F (127) are saturated to 0x7F. Negative sums
2096 /// less than 0x80 (-128) are saturated to 0x80.
2097 ///
2098 /// \headerfile <x86intrin.h>
2099 ///
2100 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2101 ///
2102 /// \param __a
2103 /// A 128-bit signed [16 x i8] vector.
2104 /// \param __b
2105 /// A 128-bit signed [16 x i8] vector.
2106 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2107 /// both parameters.
_mm_adds_epi8(__m128i __a,__m128i __b)2108 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
2109 __m128i __b) {
2110 return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
2111 }
2112
2113 /// Adds, with saturation, the corresponding elements of two 128-bit
2114 /// signed [8 x i16] vectors, saving each sum in the corresponding element
2115 /// of a 128-bit result vector of [8 x i16].
2116 ///
2117 /// Positive sums greater than 0x7FFF (32767) are saturated to 0x7FFF. Negative
2118 /// sums less than 0x8000 (-32768) are saturated to 0x8000.
2119 ///
2120 /// \headerfile <x86intrin.h>
2121 ///
2122 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2123 ///
2124 /// \param __a
2125 /// A 128-bit signed [8 x i16] vector.
2126 /// \param __b
2127 /// A 128-bit signed [8 x i16] vector.
2128 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2129 /// both parameters.
_mm_adds_epi16(__m128i __a,__m128i __b)2130 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
2131 __m128i __b) {
2132 return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
2133 }
2134
2135 /// Adds, with saturation, the corresponding elements of two 128-bit
2136 /// unsigned [16 x i8] vectors, saving each sum in the corresponding element
2137 /// of a 128-bit result vector of [16 x i8].
2138 ///
2139 /// Sums greater than 0xFF are saturated to 0xFF. Because both operands are
2140 /// unsigned, the sums cannot be negative.
2141 ///
2142 /// \headerfile <x86intrin.h>
2143 ///
2144 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2145 ///
2146 /// \param __a
2147 /// A 128-bit unsigned [16 x i8] vector.
2148 /// \param __b
2149 /// A 128-bit unsigned [16 x i8] vector.
2150 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2151 /// of both parameters.
_mm_adds_epu8(__m128i __a,__m128i __b)2152 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
2153 __m128i __b) {
2154 return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
2155 }
2156
2157 /// Adds, with saturation, the corresponding elements of two 128-bit
2158 /// unsigned [8 x i16] vectors, saving each sum in the corresponding element
2159 /// of a 128-bit result vector of [8 x i16].
2160 ///
2161 /// Sums greater than 0xFFFF are saturated to 0xFFFF. Because both operands are
2162 /// unsigned, the sums cannot be negative.
2163 ///
2164 /// \headerfile <x86intrin.h>
2165 ///
2166 /// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
2167 ///
2168 /// \param __a
2169 /// A 128-bit unsigned [8 x i16] vector.
2170 /// \param __b
2171 /// A 128-bit unsigned [8 x i16] vector.
2172 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2173 /// of both parameters.
_mm_adds_epu16(__m128i __a,__m128i __b)2174 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a,
2175 __m128i __b) {
2176 return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
2177 }
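/* Illustrative usage sketch (not part of the original header): wrapping
 * addition (_mm_add_epi16) versus saturating addition (_mm_adds_epi16) on
 * values at the representable limit. Values are examples only.
 *
 *   __m128i __a = _mm_set1_epi16(0x7FFF);       // 32767 in every element
 *   __m128i __b = _mm_set1_epi16(1);
 *   __m128i __wrap = _mm_add_epi16(__a, __b);   // every element wraps to -32768
 *   __m128i __sat  = _mm_adds_epi16(__a, __b);  // every element stays 32767
 */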
2178
2179 /// Computes the rounded averages of corresponding elements of two
2180 /// 128-bit unsigned [16 x i8] vectors, saving each result in the
2181 /// corresponding element of a 128-bit result vector of [16 x i8].
2182 ///
2183 /// \headerfile <x86intrin.h>
2184 ///
2185 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2186 ///
2187 /// \param __a
2188 /// A 128-bit unsigned [16 x i8] vector.
2189 /// \param __b
2190 /// A 128-bit unsigned [16 x i8] vector.
2191 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2192 /// averages of both parameters.
_mm_avg_epu8(__m128i __a,__m128i __b)2193 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a,
2194 __m128i __b) {
2195 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
2196 }
2197
2198 /// Computes the rounded averages of corresponding elements of two
2199 /// 128-bit unsigned [8 x i16] vectors, saving each result in the
2200 /// corresponding element of a 128-bit result vector of [8 x i16].
2201 ///
2202 /// \headerfile <x86intrin.h>
2203 ///
2204 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2205 ///
2206 /// \param __a
2207 /// A 128-bit unsigned [8 x i16] vector.
2208 /// \param __b
2209 /// A 128-bit unsigned [8 x i16] vector.
2210 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2211 /// averages of both parameters.
_mm_avg_epu16(__m128i __a,__m128i __b)2212 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a,
2213 __m128i __b) {
2214 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2215 }
2216
2217 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2218 /// vectors, producing eight intermediate 32-bit signed integer products, and
2219 /// adds the consecutive pairs of 32-bit products to form a 128-bit signed
2220 /// [4 x i32] vector.
2221 ///
2222 /// For example, bits [15:0] of both parameters are multiplied producing a
2223 /// 32-bit product, bits [31:16] of both parameters are multiplied producing
2224 /// a 32-bit product, and the sum of those two products becomes bits [31:0]
2225 /// of the result.
2226 ///
2227 /// \headerfile <x86intrin.h>
2228 ///
2229 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2230 ///
2231 /// \param __a
2232 /// A 128-bit signed [8 x i16] vector.
2233 /// \param __b
2234 /// A 128-bit signed [8 x i16] vector.
2235 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2236 /// of both parameters.
_mm_madd_epi16(__m128i __a,__m128i __b)2237 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
2238 __m128i __b) {
2239 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2240 }
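/* Illustrative usage sketch (not part of the original header): a worked
 * _mm_madd_epi16 example. Each pair of adjacent 16-bit products collapses
 * into one 32-bit sum.
 *
 *   __m128i __a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
 *   __m128i __b = _mm_setr_epi16(1, 1, 1, 1, 1, 1, 1, 1);
 *   __m128i __r = _mm_madd_epi16(__a, __b);
 *   // __r viewed as [4 x i32] is { 1+2, 3+4, 5+6, 7+8 } = { 3, 7, 11, 15 }
 */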
2241
2242 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2243 /// vectors, saving the greater value from each comparison in the
2244 /// corresponding element of a 128-bit result vector of [8 x i16].
2245 ///
2246 /// \headerfile <x86intrin.h>
2247 ///
2248 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2249 ///
2250 /// \param __a
2251 /// A 128-bit signed [8 x i16] vector.
2252 /// \param __b
2253 /// A 128-bit signed [8 x i16] vector.
2254 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2255 /// each comparison.
_mm_max_epi16(__m128i __a,__m128i __b)2256 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a,
2257 __m128i __b) {
2258 return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
2259 }
2260
2261 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2262 /// vectors, saving the greater value from each comparison in the
2263 /// corresponding element of a 128-bit result vector of [16 x i8].
2264 ///
2265 /// \headerfile <x86intrin.h>
2266 ///
2267 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2268 ///
2269 /// \param __a
2270 /// A 128-bit unsigned [16 x i8] vector.
2271 /// \param __b
2272 /// A 128-bit unsigned [16 x i8] vector.
2273 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2274 /// each comparison.
_mm_max_epu8(__m128i __a,__m128i __b)2275 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a,
2276 __m128i __b) {
2277 return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
2278 }
2279
2280 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2281 /// vectors, saving the smaller value from each comparison in the
2282 /// corresponding element of a 128-bit result vector of [8 x i16].
2283 ///
2284 /// \headerfile <x86intrin.h>
2285 ///
2286 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2287 ///
2288 /// \param __a
2289 /// A 128-bit signed [8 x i16] vector.
2290 /// \param __b
2291 /// A 128-bit signed [8 x i16] vector.
2292 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2293 /// each comparison.
_mm_min_epi16(__m128i __a,__m128i __b)2294 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a,
2295 __m128i __b) {
2296 return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
2297 }
2298
2299 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2300 /// vectors, saving the smaller value from each comparison in the
2301 /// corresponding element of a 128-bit result vector of [16 x i8].
2302 ///
2303 /// \headerfile <x86intrin.h>
2304 ///
2305 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2306 ///
2307 /// \param __a
2308 /// A 128-bit unsigned [16 x i8] vector.
2309 /// \param __b
2310 /// A 128-bit unsigned [16 x i8] vector.
2311 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2312 /// each comparison.
_mm_min_epu8(__m128i __a,__m128i __b)2313 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
2314 __m128i __b) {
2315 return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
2316 }
2317
2318 /// Multiplies the corresponding elements of two signed [8 x i16]
2319 /// vectors, saving the upper 16 bits of each 32-bit product in the
2320 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2321 ///
2322 /// \headerfile <x86intrin.h>
2323 ///
2324 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2325 ///
2326 /// \param __a
2327 /// A 128-bit signed [8 x i16] vector.
2328 /// \param __b
2329 /// A 128-bit signed [8 x i16] vector.
2330 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2331 /// each of the eight 32-bit products.
_mm_mulhi_epi16(__m128i __a,__m128i __b)2332 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
2333 __m128i __b) {
2334 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2335 }
2336
2337 /// Multiplies the corresponding elements of two unsigned [8 x i16]
2338 /// vectors, saving the upper 16 bits of each 32-bit product in the
2339 /// corresponding element of a 128-bit unsigned [8 x i16] result vector.
2340 ///
2341 /// \headerfile <x86intrin.h>
2342 ///
2343 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2344 ///
2345 /// \param __a
2346 /// A 128-bit unsigned [8 x i16] vector.
2347 /// \param __b
2348 /// A 128-bit unsigned [8 x i16] vector.
2349 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2350 /// of each of the eight 32-bit products.
_mm_mulhi_epu16(__m128i __a,__m128i __b)2351 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
2352 __m128i __b) {
2353 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2354 }
2355
2356 /// Multiplies the corresponding elements of two signed [8 x i16]
2357 /// vectors, saving the lower 16 bits of each 32-bit product in the
2358 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2359 ///
2360 /// \headerfile <x86intrin.h>
2361 ///
2362 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2363 ///
2364 /// \param __a
2365 /// A 128-bit signed [8 x i16] vector.
2366 /// \param __b
2367 /// A 128-bit signed [8 x i16] vector.
2368 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2369 /// each of the eight 32-bit products.
_mm_mullo_epi16(__m128i __a,__m128i __b)2370 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
2371 __m128i __b) {
2372 return (__m128i)((__v8hu)__a * (__v8hu)__b);
2373 }
2374
2375 /// Multiplies 32-bit unsigned integer values contained in the lower bits
2376 /// of the two 64-bit integer vectors and returns the 64-bit unsigned
2377 /// product.
2378 ///
2379 /// \headerfile <x86intrin.h>
2380 ///
2381 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2382 ///
2383 /// \param __a
2384 /// A 64-bit integer containing one of the source operands.
2385 /// \param __b
2386 /// A 64-bit integer containing one of the source operands.
2387 /// \returns A 64-bit integer vector containing the product of both operands.
_mm_mul_su32(__m64 __a,__m64 __b)2388 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a,
2389 __m64 __b) {
2390 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
2391 }
2392
2393 /// Multiplies 32-bit unsigned integer values contained in the lower
2394 /// bits of the corresponding elements of two [2 x i64] vectors, and returns
2395 /// the 64-bit products in the corresponding elements of a [2 x i64] vector.
2396 ///
2397 /// \headerfile <x86intrin.h>
2398 ///
2399 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2400 ///
2401 /// \param __a
2402 /// A [2 x i64] vector containing one of the source operands.
2403 /// \param __b
2404 /// A [2 x i64] vector containing one of the source operands.
2405 /// \returns A [2 x i64] vector containing the product of both operands.
_mm_mul_epu32(__m128i __a,__m128i __b)2406 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
2407 __m128i __b) {
2408 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2409 }
2410
2411 /// Computes the absolute differences of corresponding 8-bit integer
2412 /// values in two 128-bit vectors. Sums the first 8 absolute differences, and
2413 /// separately sums the second 8 absolute differences. Packs these two
2414 /// unsigned 16-bit integer sums into the upper and lower elements of a
2415 /// [2 x i64] vector.
2416 ///
2417 /// \headerfile <x86intrin.h>
2418 ///
2419 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2420 ///
2421 /// \param __a
2422 /// A 128-bit integer vector containing one of the source operands.
2423 /// \param __b
2424 /// A 128-bit integer vector containing one of the source operands.
2425 /// \returns A [2 x i64] vector containing the sums of the sets of absolute
2426 /// differences between both operands.
_mm_sad_epu8(__m128i __a,__m128i __b)2427 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
2428 __m128i __b) {
2429 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2430 }
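/* Illustrative usage sketch (not part of the original header): _mm_sad_epu8
 * as the core of a sum-of-absolute-differences kernel. Values are examples
 * only.
 *
 *   __m128i __x = _mm_set1_epi8(10);
 *   __m128i __y = _mm_set1_epi8(7);
 *   __m128i __sad = _mm_sad_epu8(__x, __y);
 *   // Each 64-bit half holds 8 * |10 - 7| = 24; bits [15:0] and [79:64]
 *   // contain 24, and all other bits are zero.
 */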
2431
2432 /// Subtracts the corresponding 8-bit integer values in the operands.
2433 ///
2434 /// \headerfile <x86intrin.h>
2435 ///
2436 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2437 ///
2438 /// \param __a
2439 /// A 128-bit integer vector containing the minuends.
2440 /// \param __b
2441 /// A 128-bit integer vector containing the subtrahends.
2442 /// \returns A 128-bit integer vector containing the differences of the values
2443 /// in the operands.
_mm_sub_epi8(__m128i __a,__m128i __b)2444 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a,
2445 __m128i __b) {
2446 return (__m128i)((__v16qu)__a - (__v16qu)__b);
2447 }
2448
2449 /// Subtracts the corresponding 16-bit integer values in the operands.
2450 ///
2451 /// \headerfile <x86intrin.h>
2452 ///
2453 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2454 ///
2455 /// \param __a
2456 /// A 128-bit integer vector containing the minuends.
2457 /// \param __b
2458 /// A 128-bit integer vector containing the subtrahends.
2459 /// \returns A 128-bit integer vector containing the differences of the values
2460 /// in the operands.
_mm_sub_epi16(__m128i __a,__m128i __b)2461 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
2462 __m128i __b) {
2463 return (__m128i)((__v8hu)__a - (__v8hu)__b);
2464 }
2465
2466 /// Subtracts the corresponding 32-bit integer values in the operands.
2467 ///
2468 /// \headerfile <x86intrin.h>
2469 ///
2470 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2471 ///
2472 /// \param __a
2473 /// A 128-bit integer vector containing the minuends.
2474 /// \param __b
2475 /// A 128-bit integer vector containing the subtrahends.
2476 /// \returns A 128-bit integer vector containing the differences of the values
2477 /// in the operands.
_mm_sub_epi32(__m128i __a,__m128i __b)2478 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
2479 __m128i __b) {
2480 return (__m128i)((__v4su)__a - (__v4su)__b);
2481 }
2482
2483 /// Subtracts signed or unsigned 64-bit integer values and writes the
2484 /// difference to the corresponding bits in the destination.
2485 ///
2486 /// \headerfile <x86intrin.h>
2487 ///
2488 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2489 ///
2490 /// \param __a
2491 /// A 64-bit integer vector containing the minuend.
2492 /// \param __b
2493 /// A 64-bit integer vector containing the subtrahend.
2494 /// \returns A 64-bit integer vector containing the difference of the values in
2495 /// the operands.
_mm_sub_si64(__m64 __a,__m64 __b)2496 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
2497 __m64 __b) {
2498 return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
2499 }
2500
2501 /// Subtracts the corresponding elements of two [2 x i64] vectors.
2502 ///
2503 /// \headerfile <x86intrin.h>
2504 ///
2505 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2506 ///
2507 /// \param __a
2508 /// A 128-bit integer vector containing the minuends.
2509 /// \param __b
2510 /// A 128-bit integer vector containing the subtrahends.
2511 /// \returns A 128-bit integer vector containing the differences of the values
2512 /// in the operands.
_mm_sub_epi64(__m128i __a,__m128i __b)2513 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
2514 __m128i __b) {
2515 return (__m128i)((__v2du)__a - (__v2du)__b);
2516 }
2517
2518 /// Subtracts, with saturation, corresponding 8-bit signed integer values in
2519 /// the input and returns the differences in the corresponding bytes in the
2520 /// destination.
2521 ///
2522 /// Differences greater than 0x7F (127) are saturated to 0x7F, and differences
2523 /// less than 0x80 (-128) are saturated to 0x80.
2524 ///
2525 /// \headerfile <x86intrin.h>
2526 ///
2527 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2528 ///
2529 /// \param __a
2530 /// A 128-bit integer vector containing the minuends.
2531 /// \param __b
2532 /// A 128-bit integer vector containing the subtrahends.
2533 /// \returns A 128-bit integer vector containing the differences of the values
2534 /// in the operands.
_mm_subs_epi8(__m128i __a,__m128i __b)2535 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
2536 __m128i __b) {
2537 return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
2538 }
2539
2540 /// Subtracts, with saturation, corresponding 16-bit signed integer values in
2541 /// the input and returns the differences in the corresponding elements of the
2542 /// destination.
2543 ///
2544 /// Differences greater than 0x7FFF (32767) are saturated to 0x7FFF, and
2545 /// differences less than 0x8000 (-32768) are saturated to 0x8000.
2546 ///
2547 /// \headerfile <x86intrin.h>
2548 ///
2549 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2550 ///
2551 /// \param __a
2552 /// A 128-bit integer vector containing the minuends.
2553 /// \param __b
2554 /// A 128-bit integer vector containing the subtrahends.
2555 /// \returns A 128-bit integer vector containing the differences of the values
2556 /// in the operands.
2557 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
2558 __m128i __b) {
2559 return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
2560 }
2561
2562 /// Subtracts, with saturation, corresponding 8-bit unsigned integer values in
2563 /// the input and returns the differences in the corresponding bytes in the
2564 /// destination.
2565 ///
2566 /// Differences less than 0x00 are saturated to 0x00.
2567 ///
2568 /// \headerfile <x86intrin.h>
2569 ///
2570 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2571 ///
2572 /// \param __a
2573 /// A 128-bit integer vector containing the minuends.
2574 /// \param __b
2575 /// A 128-bit integer vector containing the subtrahends.
2576 /// \returns A 128-bit integer vector containing the unsigned integer
2577 /// differences of the values in the operands.
2578 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
2579 __m128i __b) {
2580 return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
2581 }
2582
2583 /// Subtracts, with saturation, corresponding 16-bit unsigned integer values in
2584 /// the input and returns the differences in the corresponding elements of the
2585 /// destination.
2586 ///
2587 /// Differences less than 0x0000 are saturated to 0x0000.
2588 ///
2589 /// \headerfile <x86intrin.h>
2590 ///
2591 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2592 ///
2593 /// \param __a
2594 /// A 128-bit integer vector containing the minuends.
2595 /// \param __b
2596 /// A 128-bit integer vector containing the subtrahends.
2597 /// \returns A 128-bit integer vector containing the unsigned integer
2598 /// differences of the values in the operands.
2599 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a,
2600 __m128i __b) {
2601 return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
2602 }
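
/* A minimal usage sketch (not part of the header, guarded out with #if 0): it
 * contrasts the wrapping subtract with the saturating forms documented above.
 * The helper name below is hypothetical. */
#if 0
static __inline__ void __sse2_subs_example(void) {
  __m128i __zero = _mm_setzero_si128();
  __m128i __one = _mm_set1_epi8(1);
  __m128i __wrap = _mm_sub_epi8(__zero, __one);  /* each byte wraps to 0xFF */
  __m128i __sat = _mm_subs_epu8(__zero, __one);  /* each byte clamps to 0x00 */
  (void)__wrap;
  (void)__sat;
}
#endif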
2603
2604 /// Performs a bitwise AND of two 128-bit integer vectors.
2605 ///
2606 /// \headerfile <x86intrin.h>
2607 ///
2608 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2609 ///
2610 /// \param __a
2611 /// A 128-bit integer vector containing one of the source operands.
2612 /// \param __b
2613 /// A 128-bit integer vector containing one of the source operands.
2614 /// \returns A 128-bit integer vector containing the bitwise AND of the values
2615 /// in both operands.
2616 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
2617 __m128i __b) {
2618 return (__m128i)((__v2du)__a & (__v2du)__b);
2619 }
2620
2621 /// Performs a bitwise AND of two 128-bit integer vectors, using the
2622 /// one's complement of the values contained in the first source operand.
2623 ///
2624 /// \headerfile <x86intrin.h>
2625 ///
2626 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2627 ///
2628 /// \param __a
2629 /// A 128-bit vector containing the left source operand. The one's complement
2630 /// of this value is used in the bitwise AND.
2631 /// \param __b
2632 /// A 128-bit vector containing the right source operand.
2633 /// \returns A 128-bit integer vector containing the bitwise AND of the one's
2634 /// complement of the first operand and the values in the second operand.
2635 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
2636 __m128i __b) {
2637 return (__m128i)(~(__v2du)__a & (__v2du)__b);
2638 }

2639 /// Performs a bitwise OR of two 128-bit integer vectors.
2640 ///
2641 /// \headerfile <x86intrin.h>
2642 ///
2643 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2644 ///
2645 /// \param __a
2646 /// A 128-bit integer vector containing one of the source operands.
2647 /// \param __b
2648 /// A 128-bit integer vector containing one of the source operands.
2649 /// \returns A 128-bit integer vector containing the bitwise OR of the values
2650 /// in both operands.
2651 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
2652 __m128i __b) {
2653 return (__m128i)((__v2du)__a | (__v2du)__b);
2654 }
2655
2656 /// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2657 ///
2658 /// \headerfile <x86intrin.h>
2659 ///
2660 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2661 ///
2662 /// \param __a
2663 /// A 128-bit integer vector containing one of the source operands.
2664 /// \param __b
2665 /// A 128-bit integer vector containing one of the source operands.
2666 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2667 /// values in both operands.
2668 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a,
2669 __m128i __b) {
2670 return (__m128i)((__v2du)__a ^ (__v2du)__b);
2671 }
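
/* Illustrative sketch (guarded out): the bitwise intrinsics above operate on
 * all 128 bits at once; a common use is building a branch-free select from a
 * comparison mask. The helper name is hypothetical. */
#if 0
static __inline__ __m128i __sse2_select_example(__m128i __mask, __m128i __x,
                                                __m128i __y) {
  /* Returns __x where __mask bits are 1 and __y where they are 0. */
  return _mm_or_si128(_mm_and_si128(__mask, __x),
                      _mm_andnot_si128(__mask, __y));
}
#endif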
2672
2673 /// Left-shifts the 128-bit integer vector operand by the specified
2674 /// number of bytes. Low-order bits are cleared.
2675 ///
2676 /// \headerfile <x86intrin.h>
2677 ///
2678 /// \code
2679 /// __m128i _mm_slli_si128(__m128i a, const int imm);
2680 /// \endcode
2681 ///
2682 /// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
2683 ///
2684 /// \param a
2685 /// A 128-bit integer vector containing the source operand.
2686 /// \param imm
2687 /// An immediate value specifying the number of bytes to left-shift operand
2688 /// \a a.
2689 /// \returns A 128-bit integer vector containing the left-shifted value.
2690 #define _mm_slli_si128(a, imm) \
2691 ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
2692 (int)(imm)))
2693
2694 #define _mm_bslli_si128(a, imm) \
2695 ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
2696 (int)(imm)))
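
/* Usage sketch (guarded out): the byte-shift macros require the shift count to
 * be an immediate (compile-time constant). The helper name is hypothetical. */
#if 0
static __inline__ __m128i __sse2_byteshift_example(__m128i __v) {
  /* Shift the whole 128-bit value left by 4 bytes; the vacated low-order
     bytes are filled with zeros. */
  return _mm_slli_si128(__v, 4);
}
#endif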
2697
2698 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2699 /// by the specified number of bits. Low-order bits are cleared.
2700 ///
2701 /// \headerfile <x86intrin.h>
2702 ///
2703 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2704 ///
2705 /// \param __a
2706 /// A 128-bit integer vector containing the source operand.
2707 /// \param __count
2708 /// An integer value specifying the number of bits to left-shift each value
2709 /// in operand \a __a.
2710 /// \returns A 128-bit integer vector containing the left-shifted values.
2711 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a,
2712 int __count) {
2713 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2714 }
2715
2716 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2717 /// by the specified number of bits. Low-order bits are cleared.
2718 ///
2719 /// \headerfile <x86intrin.h>
2720 ///
2721 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2722 ///
2723 /// \param __a
2724 /// A 128-bit integer vector containing the source operand.
2725 /// \param __count
2726 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2727 /// to left-shift each value in operand \a __a.
2728 /// \returns A 128-bit integer vector containing the left-shifted values.
2729 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a,
2730 __m128i __count) {
2731 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2732 }
2733
2734 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2735 /// by the specified number of bits. Low-order bits are cleared.
2736 ///
2737 /// \headerfile <x86intrin.h>
2738 ///
2739 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2740 ///
2741 /// \param __a
2742 /// A 128-bit integer vector containing the source operand.
2743 /// \param __count
2744 /// An integer value specifying the number of bits to left-shift each value
2745 /// in operand \a __a.
2746 /// \returns A 128-bit integer vector containing the left-shifted values.
2747 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a,
2748 int __count) {
2749 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2750 }
2751
2752 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2753 /// by the specified number of bits. Low-order bits are cleared.
2754 ///
2755 /// \headerfile <x86intrin.h>
2756 ///
2757 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2758 ///
2759 /// \param __a
2760 /// A 128-bit integer vector containing the source operand.
2761 /// \param __count
2762 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2763 /// to left-shift each value in operand \a __a.
2764 /// \returns A 128-bit integer vector containing the left-shifted values.
2765 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a,
2766 __m128i __count) {
2767 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2768 }
2769
2770 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2771 /// by the specified number of bits. Low-order bits are cleared.
2772 ///
2773 /// \headerfile <x86intrin.h>
2774 ///
2775 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2776 ///
2777 /// \param __a
2778 /// A 128-bit integer vector containing the source operand.
2779 /// \param __count
2780 /// An integer value specifying the number of bits to left-shift each value
2781 /// in operand \a __a.
2782 /// \returns A 128-bit integer vector containing the left-shifted values.
2783 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a,
2784 int __count) {
2785 return __builtin_ia32_psllqi128((__v2di)__a, __count);
2786 }
2787
2788 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2789 /// by the specified number of bits. Low-order bits are cleared.
2790 ///
2791 /// \headerfile <x86intrin.h>
2792 ///
2793 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2794 ///
2795 /// \param __a
2796 /// A 128-bit integer vector containing the source operand.
2797 /// \param __count
2798 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2799 /// to left-shift each value in operand \a __a.
2800 /// \returns A 128-bit integer vector containing the left-shifted values.
2801 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a,
2802 __m128i __count) {
2803 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2804 }
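
/* Sketch (guarded out): the _mm_slli_* forms take the per-element shift count
 * as an ordinary integer, while the _mm_sll_* forms read it from bits [63:0]
 * of a vector; both apply the same count to every element. The helper name is
 * hypothetical. */
#if 0
static __inline__ __m128i __sse2_shift_left_example(__m128i __v) {
  __m128i __count = _mm_cvtsi32_si128(3);      /* count = 3 in bits [63:0] */
  __m128i __imm = _mm_slli_epi32(__v, 3);      /* integer count */
  __m128i __vec = _mm_sll_epi32(__v, __count); /* vector count, same result */
  return _mm_xor_si128(__imm, __vec);          /* all zeros when they agree */
}
#endif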
2805
2806 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2807 /// by the specified number of bits. High-order bits are filled with the sign
2808 /// bit of the initial value.
2809 ///
2810 /// \headerfile <x86intrin.h>
2811 ///
2812 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2813 ///
2814 /// \param __a
2815 /// A 128-bit integer vector containing the source operand.
2816 /// \param __count
2817 /// An integer value specifying the number of bits to right-shift each value
2818 /// in operand \a __a.
2819 /// \returns A 128-bit integer vector containing the right-shifted values.
2820 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a,
2821 int __count) {
2822 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2823 }
2824
2825 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2826 /// by the specified number of bits. High-order bits are filled with the sign
2827 /// bit of the initial value.
2828 ///
2829 /// \headerfile <x86intrin.h>
2830 ///
2831 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2832 ///
2833 /// \param __a
2834 /// A 128-bit integer vector containing the source operand.
2835 /// \param __count
2836 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2837 /// to right-shift each value in operand \a __a.
2838 /// \returns A 128-bit integer vector containing the right-shifted values.
2839 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a,
2840 __m128i __count) {
2841 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2842 }
2843
2844 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2845 /// by the specified number of bits. High-order bits are filled with the sign
2846 /// bit of the initial value.
2847 ///
2848 /// \headerfile <x86intrin.h>
2849 ///
2850 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2851 ///
2852 /// \param __a
2853 /// A 128-bit integer vector containing the source operand.
2854 /// \param __count
2855 /// An integer value specifying the number of bits to right-shift each value
2856 /// in operand \a __a.
2857 /// \returns A 128-bit integer vector containing the right-shifted values.
2858 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a,
2859 int __count) {
2860 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2861 }
2862
2863 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2864 /// by the specified number of bits. High-order bits are filled with the sign
2865 /// bit of the initial value.
2866 ///
2867 /// \headerfile <x86intrin.h>
2868 ///
2869 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2870 ///
2871 /// \param __a
2872 /// A 128-bit integer vector containing the source operand.
2873 /// \param __count
2874 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2875 /// to right-shift each value in operand \a __a.
2876 /// \returns A 128-bit integer vector containing the right-shifted values.
2877 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
2878 __m128i __count) {
2879 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
2880 }
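
/* Sketch (guarded out): arithmetic right shifts replicate the sign bit, so
 * negative elements remain negative. The helper name is hypothetical. */
#if 0
static __inline__ __m128i __sse2_sra_example(void) {
  __m128i __neg = _mm_set1_epi32(-8);
  return _mm_srai_epi32(__neg, 2);   /* each element becomes -2 */
}
#endif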
2881
2882 /// Right-shifts the 128-bit integer vector operand by the specified
2883 /// number of bytes. High-order bits are cleared.
2884 ///
2885 /// \headerfile <x86intrin.h>
2886 ///
2887 /// \code
2888 /// __m128i _mm_srli_si128(__m128i a, const int imm);
2889 /// \endcode
2890 ///
2891 /// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
2892 ///
2893 /// \param a
2894 /// A 128-bit integer vector containing the source operand.
2895 /// \param imm
2896 /// An immediate value specifying the number of bytes to right-shift operand
2897 /// \a a.
2898 /// \returns A 128-bit integer vector containing the right-shifted value.
2899 #define _mm_srli_si128(a, imm) \
2900 ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
2901 (int)(imm)))
2902
2903 #define _mm_bsrli_si128(a, imm) \
2904 ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
2905 (int)(imm)))
2906
2907 /// Right-shifts each of the 16-bit values in the 128-bit integer vector
2908 /// operand by the specified number of bits. High-order bits are cleared.
2909 ///
2910 /// \headerfile <x86intrin.h>
2911 ///
2912 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2913 ///
2914 /// \param __a
2915 /// A 128-bit integer vector containing the source operand.
2916 /// \param __count
2917 /// An integer value specifying the number of bits to right-shift each value
2918 /// in operand \a __a.
2919 /// \returns A 128-bit integer vector containing the right-shifted values.
2920 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a,
2921 int __count) {
2922 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
2923 }
2924
2925 /// Right-shifts each of the 16-bit values in the 128-bit integer vector
2926 /// operand by the specified number of bits. High-order bits are cleared.
2927 ///
2928 /// \headerfile <x86intrin.h>
2929 ///
2930 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2931 ///
2932 /// \param __a
2933 /// A 128-bit integer vector containing the source operand.
2934 /// \param __count
2935 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2936 /// to right-shift each value in operand \a __a.
2937 /// \returns A 128-bit integer vector containing the right-shifted values.
2938 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a,
2939 __m128i __count) {
2940 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
2941 }
2942
2943 /// Right-shifts each of the 32-bit values in the 128-bit integer vector
2944 /// operand by the specified number of bits. High-order bits are cleared.
2945 ///
2946 /// \headerfile <x86intrin.h>
2947 ///
2948 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2949 ///
2950 /// \param __a
2951 /// A 128-bit integer vector containing the source operand.
2952 /// \param __count
2953 /// An integer value specifying the number of bits to right-shift each value
2954 /// in operand \a __a.
2955 /// \returns A 128-bit integer vector containing the right-shifted values.
2956 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a,
2957 int __count) {
2958 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
2959 }
2960
2961 /// Right-shifts each of the 32-bit values in the 128-bit integer vector
2962 /// operand by the specified number of bits. High-order bits are cleared.
2963 ///
2964 /// \headerfile <x86intrin.h>
2965 ///
2966 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2967 ///
2968 /// \param __a
2969 /// A 128-bit integer vector containing the source operand.
2970 /// \param __count
2971 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2972 /// to right-shift each value in operand \a __a.
2973 /// \returns A 128-bit integer vector containing the right-shifted values.
2974 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a,
2975 __m128i __count) {
2976 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
2977 }
2978
2979 /// Right-shifts each of the 64-bit values in the 128-bit integer vector
2980 /// operand by the specified number of bits. High-order bits are cleared.
2981 ///
2982 /// \headerfile <x86intrin.h>
2983 ///
2984 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
2985 ///
2986 /// \param __a
2987 /// A 128-bit integer vector containing the source operand.
2988 /// \param __count
2989 /// An integer value specifying the number of bits to right-shift each value
2990 /// in operand \a __a.
2991 /// \returns A 128-bit integer vector containing the right-shifted values.
2992 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a,
2993 int __count) {
2994 return __builtin_ia32_psrlqi128((__v2di)__a, __count);
2995 }
2996
2997 /// Right-shifts each of the 64-bit values in the 128-bit integer vector
2998 /// operand by the specified number of bits. High-order bits are cleared.
2999 ///
3000 /// \headerfile <x86intrin.h>
3001 ///
3002 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3003 ///
3004 /// \param __a
3005 /// A 128-bit integer vector containing the source operand.
3006 /// \param __count
3007 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
3008 /// to right-shift each value in operand \a __a.
3009 /// \returns A 128-bit integer vector containing the right-shifted values.
3010 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
3011 __m128i __count) {
3012 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3013 }
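
/* Sketch (guarded out): logical right shifts fill the vacated high-order bits
 * with zeros regardless of the element's sign, unlike the arithmetic shifts
 * above. The helper name is hypothetical. */
#if 0
static __inline__ __m128i __sse2_srl_example(void) {
  __m128i __ones = _mm_set1_epi32(-1);   /* 0xFFFFFFFF in each element */
  return _mm_srli_epi32(__ones, 28);     /* each element becomes 0xF */
}
#endif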
3014
3015 /// Compares each of the corresponding 8-bit values of the 128-bit
3016 /// integer vectors for equality.
3017 ///
3018 /// Each comparison yields 0x0 for false, 0xFF for true.
3019 ///
3020 /// \headerfile <x86intrin.h>
3021 ///
3022 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3023 ///
3024 /// \param __a
3025 /// A 128-bit integer vector.
3026 /// \param __b
3027 /// A 128-bit integer vector.
3028 /// \returns A 128-bit integer vector containing the comparison results.
3029 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
3030 __m128i __b) {
3031 return (__m128i)((__v16qi)__a == (__v16qi)__b);
3032 }
3033
3034 /// Compares each of the corresponding 16-bit values of the 128-bit
3035 /// integer vectors for equality.
3036 ///
3037 /// Each comparison yields 0x0 for false, 0xFFFF for true.
3038 ///
3039 /// \headerfile <x86intrin.h>
3040 ///
3041 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3042 ///
3043 /// \param __a
3044 /// A 128-bit integer vector.
3045 /// \param __b
3046 /// A 128-bit integer vector.
3047 /// \returns A 128-bit integer vector containing the comparison results.
3048 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
3049 __m128i __b) {
3050 return (__m128i)((__v8hi)__a == (__v8hi)__b);
3051 }
3052
3053 /// Compares each of the corresponding 32-bit values of the 128-bit
3054 /// integer vectors for equality.
3055 ///
3056 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3057 ///
3058 /// \headerfile <x86intrin.h>
3059 ///
3060 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3061 ///
3062 /// \param __a
3063 /// A 128-bit integer vector.
3064 /// \param __b
3065 /// A 128-bit integer vector.
3066 /// \returns A 128-bit integer vector containing the comparison results.
3067 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
3068 __m128i __b) {
3069 return (__m128i)((__v4si)__a == (__v4si)__b);
3070 }
3071
3072 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3073 /// integer vectors to determine if the values in the first operand are
3074 /// greater than those in the second operand.
3075 ///
3076 /// Each comparison yields 0x0 for false, 0xFF for true.
3077 ///
3078 /// \headerfile <x86intrin.h>
3079 ///
3080 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3081 ///
3082 /// \param __a
3083 /// A 128-bit integer vector.
3084 /// \param __b
3085 /// A 128-bit integer vector.
3086 /// \returns A 128-bit integer vector containing the comparison results.
3087 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
3088 __m128i __b) {
3089 /* This function always performs a signed comparison, but __v16qi is a char
3090 which may be signed or unsigned, so use __v16qs. */
3091 return (__m128i)((__v16qs)__a > (__v16qs)__b);
3092 }
3093
3094 /// Compares each of the corresponding signed 16-bit values of the
3095 /// 128-bit integer vectors to determine if the values in the first operand
3096 /// are greater than those in the second operand.
3097 ///
3098 /// Each comparison yields 0x0 for false, 0xFFFF for true.
3099 ///
3100 /// \headerfile <x86intrin.h>
3101 ///
3102 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3103 ///
3104 /// \param __a
3105 /// A 128-bit integer vector.
3106 /// \param __b
3107 /// A 128-bit integer vector.
3108 /// \returns A 128-bit integer vector containing the comparison results.
3109 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
3110 __m128i __b) {
3111 return (__m128i)((__v8hi)__a > (__v8hi)__b);
3112 }
3113
3114 /// Compares each of the corresponding signed 32-bit values of the
3115 /// 128-bit integer vectors to determine if the values in the first operand
3116 /// are greater than those in the second operand.
3117 ///
3118 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3119 ///
3120 /// \headerfile <x86intrin.h>
3121 ///
3122 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3123 ///
3124 /// \param __a
3125 /// A 128-bit integer vector.
3126 /// \param __b
3127 /// A 128-bit integer vector.
3128 /// \returns A 128-bit integer vector containing the comparison results.
3129 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
3130 __m128i __b) {
3131 return (__m128i)((__v4si)__a > (__v4si)__b);
3132 }
3133
3134 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3135 /// integer vectors to determine if the values in the first operand are less
3136 /// than those in the second operand.
3137 ///
3138 /// Each comparison yields 0x0 for false, 0xFF for true.
3139 ///
3140 /// \headerfile <x86intrin.h>
3141 ///
3142 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3143 ///
3144 /// \param __a
3145 /// A 128-bit integer vector.
3146 /// \param __b
3147 /// A 128-bit integer vector.
3148 /// \returns A 128-bit integer vector containing the comparison results.
3149 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
3150 __m128i __b) {
3151 return _mm_cmpgt_epi8(__b, __a);
3152 }
3153
3154 /// Compares each of the corresponding signed 16-bit values of the
3155 /// 128-bit integer vectors to determine if the values in the first operand
3156 /// are less than those in the second operand.
3157 ///
3158 /// Each comparison yields 0x0 for false, 0xFFFF for true.
3159 ///
3160 /// \headerfile <x86intrin.h>
3161 ///
3162 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3163 ///
3164 /// \param __a
3165 /// A 128-bit integer vector.
3166 /// \param __b
3167 /// A 128-bit integer vector.
3168 /// \returns A 128-bit integer vector containing the comparison results.
3169 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
3170 __m128i __b) {
3171 return _mm_cmpgt_epi16(__b, __a);
3172 }
3173
3174 /// Compares each of the corresponding signed 32-bit values of the
3175 /// 128-bit integer vectors to determine if the values in the first operand
3176 /// are less than those in the second operand.
3177 ///
3178 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3179 ///
3180 /// \headerfile <x86intrin.h>
3181 ///
3182 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3183 ///
3184 /// \param __a
3185 /// A 128-bit integer vector.
3186 /// \param __b
3187 /// A 128-bit integer vector.
3188 /// \returns A 128-bit integer vector containing the comparison results.
3189 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
3190 __m128i __b) {
3191 return _mm_cmpgt_epi32(__b, __a);
3192 }
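
/* Sketch (guarded out): comparison intrinsics produce per-element masks of all
 * ones or all zeros, which can be reduced with _mm_movemask_epi8 (defined
 * later in this header) to test for matches without a per-element branch. The
 * helper name is hypothetical. */
#if 0
static __inline__ int __sse2_contains_byte_example(__m128i __data, char __c) {
  __m128i __eq = _mm_cmpeq_epi8(__data, _mm_set1_epi8(__c));
  return _mm_movemask_epi8(__eq) != 0;   /* nonzero if any byte matched */
}
#endif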
3193
3194 #ifdef __x86_64__
3195 /// Converts a 64-bit signed integer value from the second operand into a
3196 /// double-precision value and returns it in the lower element of a [2 x
3197 /// double] vector; the upper element of the returned vector is copied from
3198 /// the upper element of the first operand.
3199 ///
3200 /// \headerfile <x86intrin.h>
3201 ///
3202 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3203 ///
3204 /// \param __a
3205 /// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3206 /// copied to the upper 64 bits of the destination.
3207 /// \param __b
3208 /// A 64-bit signed integer operand containing the value to be converted.
3209 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3210 /// converted value of the second operand. The upper 64 bits are copied from
3211 /// the upper 64 bits of the first operand.
3212 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
3213 long long __b) {
3214 __a[0] = __b;
3215 return __a;
3216 }
3217
3218 /// Converts the first (lower) element of a vector of [2 x double] into a
3219 /// 64-bit signed integer value, according to the current rounding mode.
3220 ///
3221 /// \headerfile <x86intrin.h>
3222 ///
3223 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3224 ///
3225 /// \param __a
3226 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3227 /// conversion.
3228 /// \returns A 64-bit signed integer containing the converted value.
3229 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
3230 return __builtin_ia32_cvtsd2si64((__v2df)__a);
3231 }
3232
3233 /// Converts the first (lower) element of a vector of [2 x double] into a
3234 /// 64-bit signed integer value, truncating the result when it is inexact.
3235 ///
3236 /// \headerfile <x86intrin.h>
3237 ///
3238 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3239 /// instruction.
3240 ///
3241 /// \param __a
3242 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3243 /// conversion.
3244 /// \returns A 64-bit signed integer containing the converted value.
3245 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
3246 return __builtin_ia32_cvttsd2si64((__v2df)__a);
3247 }
3248 #endif
3249
3250 /// Converts a vector of [4 x i32] into a vector of [4 x float].
3251 ///
3252 /// \headerfile <x86intrin.h>
3253 ///
3254 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3255 ///
3256 /// \param __a
3257 /// A 128-bit integer vector.
3258 /// \returns A 128-bit vector of [4 x float] containing the converted values.
3259 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
3260 return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
3261 }
3262
3263 /// Converts a vector of [4 x float] into a vector of [4 x i32].
3264 ///
3265 /// \headerfile <x86intrin.h>
3266 ///
3267 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3268 ///
3269 /// \param __a
3270 /// A 128-bit vector of [4 x float].
3271 /// \returns A 128-bit integer vector of [4 x i32] containing the converted
3272 /// values.
3273 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
3274 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3275 }
3276
3277 /// Converts a vector of [4 x float] into a vector of [4 x i32],
3278 /// truncating the result when it is inexact.
3279 ///
3280 /// \headerfile <x86intrin.h>
3281 ///
3282 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3283 /// instruction.
3284 ///
3285 /// \param __a
3286 /// A 128-bit vector of [4 x float].
3287 /// \returns A 128-bit vector of [4 x i32] containing the converted values.
3288 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
3289 return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3290 }
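
/* Sketch (guarded out): _mm_cvtps_epi32 rounds according to the current MXCSR
 * rounding mode (round-to-nearest-even by default), while _mm_cvttps_epi32
 * always truncates toward zero. The helper name is hypothetical. */
#if 0
static __inline__ void __sse2_cvt_rounding_example(void) {
  __m128 __f = _mm_set_ps(2.5f, -1.5f, 1.5f, 0.5f);
  __m128i __rounded = _mm_cvtps_epi32(__f);    /* {0, 2, -2, 2}, low to high,
                                                  with default rounding */
  __m128i __truncated = _mm_cvttps_epi32(__f); /* {0, 1, -1, 2}, low to high */
  (void)__rounded;
  (void)__truncated;
}
#endif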
3291
3292 /// Returns a vector of [4 x i32] where the lowest element is the input
3293 /// operand and the remaining elements are zero.
3294 ///
3295 /// \headerfile <x86intrin.h>
3296 ///
3297 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3298 ///
3299 /// \param __a
3300 /// A 32-bit signed integer operand.
3301 /// \returns A 128-bit vector of [4 x i32].
3302 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
3303 return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
3304 }
3305
3306 /// Returns a vector of [2 x i64] where the lower element is the input
3307 /// operand and the upper element is zero.
3308 ///
3309 /// \headerfile <x86intrin.h>
3310 ///
3311 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
3312 /// in 64-bit mode.
3313 ///
3314 /// \param __a
3315 /// A 64-bit signed integer operand containing the value to be converted.
3316 /// \returns A 128-bit vector of [2 x i64] containing the converted value.
3317 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
3318 return __extension__(__m128i)(__v2di){__a, 0};
3319 }
3320
3321 /// Moves the least significant 32 bits of a vector of [4 x i32] to a
3322 /// 32-bit signed integer value.
3323 ///
3324 /// \headerfile <x86intrin.h>
3325 ///
3326 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3327 ///
3328 /// \param __a
3329 /// A vector of [4 x i32]. The least significant 32 bits are moved to the
3330 /// destination.
3331 /// \returns A 32-bit signed integer containing the moved value.
3332 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
3333 __v4si __b = (__v4si)__a;
3334 return __b[0];
3335 }
3336
3337 /// Moves the least significant 64 bits of a vector of [2 x i64] to a
3338 /// 64-bit signed integer value.
3339 ///
3340 /// \headerfile <x86intrin.h>
3341 ///
3342 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3343 ///
3344 /// \param __a
3345 /// A vector of [2 x i64]. The least significant 64 bits are moved to the
3346 /// destination.
3347 /// \returns A 64-bit signed integer containing the moved value.
3348 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
3349 return __a[0];
3350 }
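
/* Sketch (guarded out): moving a scalar into the low element of a vector and
 * back again. The helper name is hypothetical. */
#if 0
static __inline__ int __sse2_scalar_roundtrip_example(void) {
  __m128i __v = _mm_cvtsi32_si128(42);   /* vector is {42, 0, 0, 0} */
  return _mm_cvtsi128_si32(__v);         /* returns 42 */
}
#endif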
3351
3352 /// Moves packed integer values from an aligned 128-bit memory location
3353 /// to elements in a 128-bit integer vector.
3354 ///
3355 /// \headerfile <x86intrin.h>
3356 ///
3357 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3358 ///
3359 /// \param __p
3360 /// An aligned pointer to a memory location containing integer values.
3361 /// \returns A 128-bit integer vector containing the moved values.
3362 static __inline__ __m128i __DEFAULT_FN_ATTRS
3363 _mm_load_si128(__m128i const *__p) {
3364 return *__p;
3365 }
3366
3367 /// Moves packed integer values from an unaligned 128-bit memory location
3368 /// to elements in a 128-bit integer vector.
3369 ///
3370 /// \headerfile <x86intrin.h>
3371 ///
3372 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3373 ///
3374 /// \param __p
3375 /// A pointer to a memory location containing integer values.
3376 /// \returns A 128-bit integer vector containing the moved values.
3377 static __inline__ __m128i __DEFAULT_FN_ATTRS
3378 _mm_loadu_si128(__m128i_u const *__p) {
3379 struct __loadu_si128 {
3380 __m128i_u __v;
3381 } __attribute__((__packed__, __may_alias__));
3382 return ((const struct __loadu_si128 *)__p)->__v;
3383 }
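
/* Sketch (guarded out): _mm_load_si128 requires a 16-byte-aligned address,
 * while _mm_loadu_si128 accepts any address. The helper name and the __src
 * parameter are hypothetical. */
#if 0
static __inline__ __m128i __sse2_unaligned_load_example(const int *__src) {
  /* __src does not need to be 16-byte aligned here. */
  return _mm_loadu_si128((const __m128i_u *)__src);
}
#endif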
3384
3385 /// Returns a vector of [2 x i64] where the lower element is taken from
3386 /// the lower element of the operand, and the upper element is zero.
3387 ///
3388 /// \headerfile <x86intrin.h>
3389 ///
3390 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3391 ///
3392 /// \param __p
3393 /// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
3394 /// the destination.
3395 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3396 /// moved value. The higher order bits are cleared.
3397 static __inline__ __m128i __DEFAULT_FN_ATTRS
3398 _mm_loadl_epi64(__m128i_u const *__p) {
3399 struct __mm_loadl_epi64_struct {
3400 long long __u;
3401 } __attribute__((__packed__, __may_alias__));
3402 return __extension__(__m128i){
3403 ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0};
3404 }
3405
3406 /// Generates a 128-bit vector of [4 x i32] with unspecified content.
3407 /// This could be used as an argument to another intrinsic function where the
3408 /// argument is required but the value is not actually used.
3409 ///
3410 /// \headerfile <x86intrin.h>
3411 ///
3412 /// This intrinsic has no corresponding instruction.
3413 ///
3414 /// \returns A 128-bit vector of [4 x i32] with unspecified content.
3415 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
3416 return (__m128i)__builtin_ia32_undef128();
3417 }
3418
3419 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3420 /// the specified 64-bit integer values.
3421 ///
3422 /// \headerfile <x86intrin.h>
3423 ///
3424 /// This intrinsic is a utility function and does not correspond to a specific
3425 /// instruction.
3426 ///
3427 /// \param __q1
3428 /// A 64-bit integer value used to initialize the upper 64 bits of the
3429 /// destination vector of [2 x i64].
3430 /// \param __q0
3431 /// A 64-bit integer value used to initialize the lower 64 bits of the
3432 /// destination vector of [2 x i64].
3433 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3434 /// provided in the operands.
3435 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
3436 long long __q0) {
3437 return __extension__(__m128i)(__v2di){__q0, __q1};
3438 }
3439
3440 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3441 /// the specified 64-bit integer values.
3442 ///
3443 /// \headerfile <x86intrin.h>
3444 ///
3445 /// This intrinsic is a utility function and does not correspond to a specific
3446 /// instruction.
3447 ///
3448 /// \param __q1
3449 /// A 64-bit integer value used to initialize the upper 64 bits of the
3450 /// destination vector of [2 x i64].
3451 /// \param __q0
3452 /// A 64-bit integer value used to initialize the lower 64 bits of the
3453 /// destination vector of [2 x i64].
3454 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3455 /// provided in the operands.
3456 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
3457 __m64 __q0) {
3458 return _mm_set_epi64x((long long)__q1, (long long)__q0);
3459 }
3460
3461 /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3462 /// the specified 32-bit integer values.
3463 ///
3464 /// \headerfile <x86intrin.h>
3465 ///
3466 /// This intrinsic is a utility function and does not correspond to a specific
3467 /// instruction.
3468 ///
3469 /// \param __i3
3470 /// A 32-bit integer value used to initialize bits [127:96] of the
3471 /// destination vector.
3472 /// \param __i2
3473 /// A 32-bit integer value used to initialize bits [95:64] of the destination
3474 /// vector.
3475 /// \param __i1
3476 /// A 32-bit integer value used to initialize bits [63:32] of the destination
3477 /// vector.
3478 /// \param __i0
3479 /// A 32-bit integer value used to initialize bits [31:0] of the destination
3480 /// vector.
3481 /// \returns An initialized 128-bit vector of [4 x i32] containing the values
3482 /// provided in the operands.
3483 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
3484 int __i1, int __i0) {
3485 return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
3486 }
3487
3488 /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3489 /// the specified 16-bit integer values.
3490 ///
3491 /// \headerfile <x86intrin.h>
3492 ///
3493 /// This intrinsic is a utility function and does not correspond to a specific
3494 /// instruction.
3495 ///
3496 /// \param __w7
3497 /// A 16-bit integer value used to initialize bits [127:112] of the
3498 /// destination vector.
3499 /// \param __w6
3500 /// A 16-bit integer value used to initialize bits [111:96] of the
3501 /// destination vector.
3502 /// \param __w5
3503 /// A 16-bit integer value used to initialize bits [95:80] of the destination
3504 /// vector.
3505 /// \param __w4
3506 /// A 16-bit integer value used to initialize bits [79:64] of the destination
3507 /// vector.
3508 /// \param __w3
3509 /// A 16-bit integer value used to initialize bits [63:48] of the destination
3510 /// vector.
3511 /// \param __w2
3512 /// A 16-bit integer value used to initialize bits [47:32] of the destination
3513 /// vector.
3514 /// \param __w1
3515 /// A 16-bit integer value used to initialize bits [31:16] of the destination
3516 /// vector.
3517 /// \param __w0
3518 /// A 16-bit integer value used to initialize bits [15:0] of the destination
3519 /// vector.
3520 /// \returns An initialized 128-bit vector of [8 x i16] containing the values
3521 /// provided in the operands.
3522 static __inline__ __m128i __DEFAULT_FN_ATTRS
3523 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
3524 short __w2, short __w1, short __w0) {
3525 return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
3526 __w4, __w5, __w6, __w7};
3527 }
3528
3529 /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3530 /// the specified 8-bit integer values.
3531 ///
3532 /// \headerfile <x86intrin.h>
3533 ///
3534 /// This intrinsic is a utility function and does not correspond to a specific
3535 /// instruction.
3536 ///
3537 /// \param __b15
3538 /// Initializes bits [127:120] of the destination vector.
3539 /// \param __b14
3540 /// Initializes bits [119:112] of the destination vector.
3541 /// \param __b13
3542 /// Initializes bits [111:104] of the destination vector.
3543 /// \param __b12
3544 /// Initializes bits [103:96] of the destination vector.
3545 /// \param __b11
3546 /// Initializes bits [95:88] of the destination vector.
3547 /// \param __b10
3548 /// Initializes bits [87:80] of the destination vector.
3549 /// \param __b9
3550 /// Initializes bits [79:72] of the destination vector.
3551 /// \param __b8
3552 /// Initializes bits [71:64] of the destination vector.
3553 /// \param __b7
3554 /// Initializes bits [63:56] of the destination vector.
3555 /// \param __b6
3556 /// Initializes bits [55:48] of the destination vector.
3557 /// \param __b5
3558 /// Initializes bits [47:40] of the destination vector.
3559 /// \param __b4
3560 /// Initializes bits [39:32] of the destination vector.
3561 /// \param __b3
3562 /// Initializes bits [31:24] of the destination vector.
3563 /// \param __b2
3564 /// Initializes bits [23:16] of the destination vector.
3565 /// \param __b1
3566 /// Initializes bits [15:8] of the destination vector.
3567 /// \param __b0
3568 /// Initializes bits [7:0] of the destination vector.
3569 /// \returns An initialized 128-bit vector of [16 x i8] containing the values
3570 /// provided in the operands.
3571 static __inline__ __m128i __DEFAULT_FN_ATTRS
3572 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
3573 char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
3574 char __b4, char __b3, char __b2, char __b1, char __b0) {
3575 return __extension__(__m128i)(__v16qi){
3576 __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7,
3577 __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
3578 }
3579
3580 /// Initializes both values in a 128-bit integer vector with the
3581 /// specified 64-bit integer value.
3582 ///
3583 /// \headerfile <x86intrin.h>
3584 ///
3585 /// This intrinsic is a utility function and does not correspond to a specific
3586 /// instruction.
3587 ///
3588 /// \param __q
3589 /// Integer value used to initialize the elements of the destination integer
3590 /// vector.
3591 /// \returns An initialized 128-bit integer vector of [2 x i64] with both
3592 /// elements containing the value provided in the operand.
3593 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
3594 return _mm_set_epi64x(__q, __q);
3595 }
3596
3597 /// Initializes both values in a 128-bit vector of [2 x i64] with the
3598 /// specified 64-bit value.
3599 ///
3600 /// \headerfile <x86intrin.h>
3601 ///
3602 /// This intrinsic is a utility function and does not correspond to a specific
3603 /// instruction.
3604 ///
3605 /// \param __q
3606 /// A 64-bit value used to initialize the elements of the destination integer
3607 /// vector.
3608 /// \returns An initialized 128-bit vector of [2 x i64] with all elements
3609 /// containing the value provided in the operand.
3610 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
3611 return _mm_set_epi64(__q, __q);
3612 }
3613
3614 /// Initializes all values in a 128-bit vector of [4 x i32] with the
3615 /// specified 32-bit value.
3616 ///
3617 /// \headerfile <x86intrin.h>
3618 ///
3619 /// This intrinsic is a utility function and does not correspond to a specific
3620 /// instruction.
3621 ///
3622 /// \param __i
3623 /// A 32-bit value used to initialize the elements of the destination integer
3624 /// vector.
3625 /// \returns An initialized 128-bit vector of [4 x i32] with all elements
3626 /// containing the value provided in the operand.
3627 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
3628 return _mm_set_epi32(__i, __i, __i, __i);
3629 }
3630
3631 /// Initializes all values in a 128-bit vector of [8 x i16] with the
3632 /// specified 16-bit value.
3633 ///
3634 /// \headerfile <x86intrin.h>
3635 ///
3636 /// This intrinsic is a utility function and does not correspond to a specific
3637 /// instruction.
3638 ///
3639 /// \param __w
3640 /// A 16-bit value used to initialize the elements of the destination integer
3641 /// vector.
3642 /// \returns An initialized 128-bit vector of [8 x i16] with all elements
3643 /// containing the value provided in the operand.
3644 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
3645 return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3646 }
3647
3648 /// Initializes all values in a 128-bit vector of [16 x i8] with the
3649 /// specified 8-bit value.
3650 ///
3651 /// \headerfile <x86intrin.h>
3652 ///
3653 /// This intrinsic is a utility function and does not correspond to a specific
3654 /// instruction.
3655 ///
3656 /// \param __b
3657 /// An 8-bit value used to initialize the elements of the destination integer
3658 /// vector.
3659 /// \returns An initialized 128-bit vector of [16 x i8] with all elements
3660 /// containing the value provided in the operand.
3661 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
3662 return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
3663 __b, __b, __b, __b, __b);
3664 }
3665
3666 /// Constructs a 128-bit integer vector, initialized in reverse order
3667 /// with the specified 64-bit integral values.
3668 ///
3669 /// \headerfile <x86intrin.h>
3670 ///
3671 /// This intrinsic does not correspond to a specific instruction.
3672 ///
3673 /// \param __q0
3674 /// A 64-bit integral value used to initialize the lower 64 bits of the
3675 /// result.
3676 /// \param __q1
3677 /// A 64-bit integral value used to initialize the upper 64 bits of the
3678 /// result.
3679 /// \returns An initialized 128-bit integer vector.
3680 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0,
3681 __m64 __q1) {
3682 return _mm_set_epi64(__q1, __q0);
3683 }
3684
3685 /// Constructs a 128-bit integer vector, initialized in reverse order
3686 /// with the specified 32-bit integral values.
3687 ///
3688 /// \headerfile <x86intrin.h>
3689 ///
3690 /// This intrinsic is a utility function and does not correspond to a specific
3691 /// instruction.
3692 ///
3693 /// \param __i0
3694 /// A 32-bit integral value used to initialize bits [31:0] of the result.
3695 /// \param __i1
3696 /// A 32-bit integral value used to initialize bits [63:32] of the result.
3697 /// \param __i2
3698 /// A 32-bit integral value used to initialize bits [95:64] of the result.
3699 /// \param __i3
3700 /// A 32-bit integral value used to initialize bits [127:96] of the result.
3701 /// \returns An initialized 128-bit integer vector.
3702 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1,
3703 int __i2,
3704 int __i3) {
3705 return _mm_set_epi32(__i3, __i2, __i1, __i0);
3706 }
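
/* Sketch (guarded out): _mm_set_epi32 lists elements from highest to lowest,
 * while _mm_setr_epi32 lists them in low-to-high (memory) order; the two calls
 * below construct the same vector. The helper name is hypothetical. */
#if 0
static __inline__ void __sse2_set_order_example(void) {
  __m128i __a = _mm_set_epi32(3, 2, 1, 0);   /* bits [31:0] hold 0 */
  __m128i __b = _mm_setr_epi32(0, 1, 2, 3);  /* identical to __a */
  (void)__a;
  (void)__b;
}
#endif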
3707
3708 /// Constructs a 128-bit integer vector, initialized in reverse order
3709 /// with the specified 16-bit integral values.
3710 ///
3711 /// \headerfile <x86intrin.h>
3712 ///
3713 /// This intrinsic is a utility function and does not correspond to a specific
3714 /// instruction.
3715 ///
3716 /// \param __w0
3717 /// A 16-bit integral value used to initialize bits [15:0] of the result.
3718 /// \param __w1
3719 /// A 16-bit integral value used to initialize bits [31:16] of the result.
3720 /// \param __w2
3721 /// A 16-bit integral value used to initialize bits [47:32] of the result.
3722 /// \param __w3
3723 /// A 16-bit integral value used to initialize bits [63:48] of the result.
3724 /// \param __w4
3725 /// A 16-bit integral value used to initialize bits [79:64] of the result.
3726 /// \param __w5
3727 /// A 16-bit integral value used to initialize bits [95:80] of the result.
3728 /// \param __w6
3729 /// A 16-bit integral value used to initialize bits [111:96] of the result.
3730 /// \param __w7
3731 /// A 16-bit integral value used to initialize bits [127:112] of the result.
3732 /// \returns An initialized 128-bit integer vector.
3733 static __inline__ __m128i __DEFAULT_FN_ATTRS
3734 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
3735 short __w5, short __w6, short __w7) {
3736 return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3737 }
3738
3739 /// Constructs a 128-bit integer vector, initialized in reverse order
3740 /// with the specified 8-bit integral values.
3741 ///
3742 /// \headerfile <x86intrin.h>
3743 ///
3744 /// This intrinsic is a utility function and does not correspond to a specific
3745 /// instruction.
3746 ///
3747 /// \param __b0
3748 /// An 8-bit integral value used to initialize bits [7:0] of the result.
3749 /// \param __b1
3750 /// An 8-bit integral value used to initialize bits [15:8] of the result.
3751 /// \param __b2
3752 /// An 8-bit integral value used to initialize bits [23:16] of the result.
3753 /// \param __b3
3754 /// An 8-bit integral value used to initialize bits [31:24] of the result.
3755 /// \param __b4
3756 /// An 8-bit integral value used to initialize bits [39:32] of the result.
3757 /// \param __b5
3758 /// An 8-bit integral value used to initialize bits [47:40] of the result.
3759 /// \param __b6
3760 /// An 8-bit integral value used to initialize bits [55:48] of the result.
3761 /// \param __b7
3762 /// An 8-bit integral value used to initialize bits [63:56] of the result.
3763 /// \param __b8
3764 /// An 8-bit integral value used to initialize bits [71:64] of the result.
3765 /// \param __b9
3766 /// An 8-bit integral value used to initialize bits [79:72] of the result.
3767 /// \param __b10
3768 /// An 8-bit integral value used to initialize bits [87:80] of the result.
3769 /// \param __b11
3770 /// An 8-bit integral value used to initialize bits [95:88] of the result.
3771 /// \param __b12
3772 /// An 8-bit integral value used to initialize bits [103:96] of the result.
3773 /// \param __b13
3774 /// An 8-bit integral value used to initialize bits [111:104] of the result.
3775 /// \param __b14
3776 /// An 8-bit integral value used to initialize bits [119:112] of the result.
3777 /// \param __b15
3778 /// An 8-bit integral value used to initialize bits [127:120] of the result.
3779 /// \returns An initialized 128-bit integer vector.
3780 static __inline__ __m128i __DEFAULT_FN_ATTRS
3781 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
3782 char __b6, char __b7, char __b8, char __b9, char __b10,
3783 char __b11, char __b12, char __b13, char __b14, char __b15) {
3784 return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8,
3785 __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3786 }
3787
3788 /// Creates a 128-bit integer vector initialized to zero.
3789 ///
3790 /// \headerfile <x86intrin.h>
3791 ///
3792 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3793 ///
3794 /// \returns An initialized 128-bit integer vector with all elements set to
3795 /// zero.
3796 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) {
3797 return __extension__(__m128i)(__v2di){0LL, 0LL};
3798 }
3799
3800 /// Stores a 128-bit integer vector to a memory location aligned on a
3801 /// 128-bit boundary.
3802 ///
3803 /// \headerfile <x86intrin.h>
3804 ///
3805 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3806 ///
3807 /// \param __p
3808 /// A pointer to an aligned memory location that will receive the integer
3809 /// values.
3810 /// \param __b
3811 /// A 128-bit integer vector containing the values to be moved.
3812 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p,
3813 __m128i __b) {
3814 *__p = __b;
3815 }
3816
3817 /// Stores a 128-bit integer vector to an unaligned memory location.
3818 ///
3819 /// \headerfile <x86intrin.h>
3820 ///
3821 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3822 ///
3823 /// \param __p
3824 /// A pointer to a memory location that will receive the integer values.
3825 /// \param __b
3826 /// A 128-bit integer vector containing the values to be moved.
3827 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p,
3828 __m128i __b) {
3829 struct __storeu_si128 {
3830 __m128i_u __v;
3831 } __attribute__((__packed__, __may_alias__));
3832 ((struct __storeu_si128 *)__p)->__v = __b;
3833 }
3834
3835 /// Stores a 64-bit integer value from the low element of a 128-bit integer
3836 /// vector.
3837 ///
3838 /// \headerfile <x86intrin.h>
3839 ///
3840 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3841 ///
3842 /// \param __p
3843 /// A pointer to a 64-bit memory location. The address of the memory
3844 /// location does not have to be aligned.
3845 /// \param __b
3846 /// A 128-bit integer vector containing the value to be stored.
3847 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p,
3848 __m128i __b) {
3849 struct __storeu_si64 {
3850 long long __v;
3851 } __attribute__((__packed__, __may_alias__));
3852 ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
3853 }
3854
3855 /// Stores a 32-bit integer value from the low element of a 128-bit integer
3856 /// vector.
3857 ///
3858 /// \headerfile <x86intrin.h>
3859 ///
3860 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3861 ///
3862 /// \param __p
3863 /// A pointer to a 32-bit memory location. The address of the memory
3864 /// location does not have to be aligned.
3865 /// \param __b
3866 /// A 128-bit integer vector containing the value to be stored.
3867 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p,
3868 __m128i __b) {
3869 struct __storeu_si32 {
3870 int __v;
3871 } __attribute__((__packed__, __may_alias__));
3872 ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0];
3873 }
3874
3875 /// Stores a 16-bit integer value from the low element of a 128-bit integer
3876 /// vector.
3877 ///
3878 /// \headerfile <x86intrin.h>
3879 ///
3880 /// This intrinsic does not correspond to a specific instruction.
3881 ///
3882 /// \param __p
3883 /// A pointer to a 16-bit memory location. The address of the memory
3884 /// location does not have to be aligned.
3885 /// \param __b
3886 /// A 128-bit integer vector containing the value to be stored.
3887 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p,
3888 __m128i __b) {
3889 struct __storeu_si16 {
3890 short __v;
3891 } __attribute__((__packed__, __may_alias__));
3892 ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0];
3893 }
3894
3895 /// Moves bytes selected by the mask from the first operand to the
3896 /// specified unaligned memory location. When a mask bit is 1, the
3897 /// corresponding byte is written, otherwise it is not written.
3898 ///
3899 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3900 /// used again soon). Exception and trap behavior for elements not selected
3901 /// for storage to memory is implementation dependent.
3902 ///
3903 /// \headerfile <x86intrin.h>
3904 ///
3905 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3906 /// instruction.
3907 ///
3908 /// \param __d
3909 /// A 128-bit integer vector containing the values to be moved.
3910 /// \param __n
3911 ///    A 128-bit integer vector containing the mask. The most significant bit of
3912 ///    each byte is the mask bit for the corresponding byte of \a __d.
3913 /// \param __p
3914 /// A pointer to an unaligned 128-bit memory location where the specified
3915 /// values are moved.
3916 static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d,
3917 __m128i __n,
3918 char *__p) {
3919 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
3920 }
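
/* Illustrative usage sketch (not part of the original header): conditionally
 * overwriting bytes of a buffer. Only bytes whose mask byte has its most
 * significant bit set are written; here bytes 0-7 are overwritten and bytes
 * 8-15 are left untouched. Buffer and variable names are hypothetical.
 *
 *   char __buf[16] = {0};
 *   __m128i __data = _mm_set1_epi8(0x55);
 *   __m128i __mask = _mm_set_epi64x(0, -1LL);  // low 8 mask bytes = 0xFF, high 8 = 0x00
 *   _mm_maskmoveu_si128(__data, __mask, __buf);
 */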
3921
3922 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
3923 /// a memory location.
3924 ///
3925 /// \headerfile <x86intrin.h>
3926 ///
3927 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
3928 ///
3929 /// \param __p
3930 /// A pointer to a 64-bit memory location that will receive the lower 64 bits
3931 /// of the integer vector parameter.
3932 /// \param __a
3933 /// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
3934 /// value to be stored.
3935 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
3936 __m128i __a) {
3937 struct __mm_storel_epi64_struct {
3938 long long __u;
3939 } __attribute__((__packed__, __may_alias__));
3940 ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0];
3941 }
3942
3943 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
3944 /// aligned memory location.
3945 ///
3946 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3947 /// used again soon).
3948 ///
3949 /// \headerfile <x86intrin.h>
3950 ///
3951 /// This intrinsic corresponds to the <c> VMOVNTPD / MOVNTPD </c> instruction.
3952 ///
3953 /// \param __p
3954 /// A pointer to the 128-bit aligned memory location used to store the value.
3955 /// \param __a
3956 /// A vector of [2 x double] containing the 64-bit values to be stored.
3957 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p,
3958 __m128d __a) {
3959 __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
3960 }
3961
3962 /// Stores a 128-bit integer vector to a 128-bit aligned memory location.
3963 ///
3964 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3965 /// used again soon).
3966 ///
3967 /// \headerfile <x86intrin.h>
3968 ///
3969 /// This intrinsic corresponds to the <c> VMOVNTDQ / MOVNTDQ </c> instruction.
3970 ///
3971 /// \param __p
3972 /// A pointer to the 128-bit aligned memory location used to store the value.
3973 /// \param __a
3974 /// A 128-bit integer vector containing the values to be stored.
3975 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p,
3976 __m128i __a) {
3977 __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
3978 }
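
/* Illustrative sketch (not part of the original header): filling a large
 * buffer with non-temporal stores so the written data does not displace
 * useful cache contents. A store fence (_mm_sfence(), from <xmmintrin.h>) is
 * typically issued before another agent reads the data. The function and
 * parameter names are hypothetical; __dst is assumed 16-byte aligned and
 * __n a multiple of 4.
 *
 *   void fill_stream(int *__dst, int __value, unsigned __n) {
 *     __m128i __v = _mm_set1_epi32(__value);
 *     for (unsigned __i = 0; __i < __n; __i += 4)
 *       _mm_stream_si128(__dst + __i, __v);
 *     _mm_sfence();
 *   }
 */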
3979
3980 /// Stores a 32-bit integer value in the specified memory location.
3981 ///
3982 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3983 /// used again soon).
3984 ///
3985 /// \headerfile <x86intrin.h>
3986 ///
3987 /// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
3988 ///
3989 /// \param __p
3990 /// A pointer to the 32-bit memory location used to store the value.
3991 /// \param __a
3992 /// A 32-bit integer containing the value to be stored.
3993 static __inline__ void
3994 __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
3995 _mm_stream_si32(void *__p, int __a) {
3996 __builtin_ia32_movnti((int *)__p, __a);
3997 }
3998
3999 #ifdef __x86_64__
4000 /// Stores a 64-bit integer value in the specified memory location.
4001 ///
4002 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
4003 /// used again soon).
4004 ///
4005 /// \headerfile <x86intrin.h>
4006 ///
4007 /// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
4008 ///
4009 /// \param __p
4010 /// A pointer to the 64-bit memory location used to store the value.
4011 /// \param __a
4012 /// A 64-bit integer containing the value to be stored.
4013 static __inline__ void
4014 __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
4015 _mm_stream_si64(void *__p, long long __a) {
4016 __builtin_ia32_movnti64((long long *)__p, __a);
4017 }
4018 #endif
4019
4020 #if defined(__cplusplus)
4021 extern "C" {
4022 #endif
4023
4024 /// The cache line containing \a __p is flushed and invalidated from all
4025 /// caches in the coherency domain.
4026 ///
4027 /// \headerfile <x86intrin.h>
4028 ///
4029 /// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
4030 ///
4031 /// \param __p
4032 /// A pointer to the memory location used to identify the cache line to be
4033 /// flushed.
4034 void _mm_clflush(void const *__p);
4035
4036 /// Forces strong memory ordering (serialization) between load
4037 /// instructions preceding this instruction and load instructions following
4038 /// this instruction, ensuring the system completes all previous loads before
4039 /// executing subsequent loads.
4040 ///
4041 /// \headerfile <x86intrin.h>
4042 ///
4043 /// This intrinsic corresponds to the <c> LFENCE </c> instruction.
4044 ///
4045 void _mm_lfence(void);
4046
4047 /// Forces strong memory ordering (serialization) between load and store
4048 /// instructions preceding this instruction and load and store instructions
4049 /// following this instruction, ensuring that the system completes all
4050 /// previous memory accesses before executing subsequent memory accesses.
4051 ///
4052 /// \headerfile <x86intrin.h>
4053 ///
4054 /// This intrinsic corresponds to the <c> MFENCE </c> instruction.
4055 ///
4056 void _mm_mfence(void);
4057
4058 #if defined(__cplusplus)
4059 } // extern "C"
4060 #endif
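
/* Illustrative sketch (not part of the original header): flushing a buffer
 * out of the cache hierarchy, e.g. before handing it to a non-coherent
 * device, then serializing with MFENCE. The function name and the 64-byte
 * cache-line size are assumptions made for the example.
 *
 *   void flush_buffer(const char *__buf, unsigned __len) {
 *     for (unsigned __i = 0; __i < __len; __i += 64)  // assume 64-byte cache lines
 *       _mm_clflush(__buf + __i);
 *     _mm_mfence();
 *   }
 */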
4061
4062 /// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4063 /// vector operands into 8-bit signed integers, and packs the results into
4064 /// the destination.
4065 ///
4066 /// Positive values greater than 0x7F are saturated to 0x7F. Negative values
4067 /// less than 0x80 are saturated to 0x80.
4068 ///
4069 /// \headerfile <x86intrin.h>
4070 ///
4071 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4072 ///
4073 /// \param __a
4074 /// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4075 /// written to the lower 64 bits of the result.
4076 /// \param __b
4077 /// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4078 /// written to the higher 64 bits of the result.
4079 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4080 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
4081 __m128i __b) {
4082 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4083 }
4084
4085 /// Converts, with saturation, 32-bit signed integers from both 128-bit integer
4086 /// vector operands into 16-bit signed integers, and packs the results into
4087 /// the destination.
4088 ///
4089 /// Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative
4090 /// values less than 0x8000 are saturated to 0x8000.
4091 ///
4092 /// \headerfile <x86intrin.h>
4093 ///
4094 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4095 ///
4096 /// \param __a
4097 /// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4098 /// are written to the lower 64 bits of the result.
4099 /// \param __b
4100 /// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4101 /// are written to the higher 64 bits of the result.
4102 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
4103 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
4104 __m128i __b) {
4105 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4106 }
4107
4108 /// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4109 /// vector operands into 8-bit unsigned integers, and packs the results into
4110 /// the destination.
4111 ///
4112 /// Values greater than 0xFF are saturated to 0xFF. Values less than 0x00
4113 /// are saturated to 0x00.
4114 ///
4115 /// \headerfile <x86intrin.h>
4116 ///
4117 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4118 ///
4119 /// \param __a
4120 /// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4121 /// written to the lower 64 bits of the result.
4122 /// \param __b
4123 /// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4124 /// written to the higher 64 bits of the result.
4125 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4126 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
4127 __m128i __b) {
4128 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4129 }
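
/* Illustrative sketch (not part of the original header): how the signed and
 * unsigned packs saturate the same 16-bit inputs differently.
 *
 *   __m128i __w = _mm_set_epi16(300, -5, 128, 127, 1, 0, -128, -200);
 *   __m128i __s = _mm_packs_epi16(__w, __w);  // signed:   300->127, -5->-5,  128->127, -200->-128
 *   __m128i __u = _mm_packus_epi16(__w, __w); // unsigned: 300->255, -5->0,   128->128, -200->0
 */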
4130
4131 /// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
4132 /// the immediate-value parameter as a selector.
4133 ///
4134 /// \headerfile <x86intrin.h>
4135 ///
4136 /// \code
4137 /// __m128i _mm_extract_epi16(__m128i a, const int imm);
4138 /// \endcode
4139 ///
4140 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
4141 ///
4142 /// \param a
4143 /// A 128-bit integer vector.
4144 /// \param imm
4145 ///    An immediate value. Bits [2:0] select the value from \a a to be assigned
4146 ///    to bits [15:0] of the result. \n
4147 /// 000: assign values from bits [15:0] of \a a. \n
4148 /// 001: assign values from bits [31:16] of \a a. \n
4149 /// 010: assign values from bits [47:32] of \a a. \n
4150 /// 011: assign values from bits [63:48] of \a a. \n
4151 /// 100: assign values from bits [79:64] of \a a. \n
4152 /// 101: assign values from bits [95:80] of \a a. \n
4153 /// 110: assign values from bits [111:96] of \a a. \n
4154 /// 111: assign values from bits [127:112] of \a a.
4155 /// \returns An integer, whose lower 16 bits are selected from the 128-bit
4156 /// integer vector parameter and the remaining bits are assigned zeros.
4157 #define _mm_extract_epi16(a, imm) \
4158 ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
4159 (int)(imm)))
4160
4161 /// Constructs a 128-bit integer vector by first making a copy of the
4162 /// 128-bit integer vector parameter, and then inserting the lower 16 bits
4163 /// of an integer parameter into an offset specified by the immediate-value
4164 /// parameter.
4165 ///
4166 /// \headerfile <x86intrin.h>
4167 ///
4168 /// \code
4169 /// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
4170 /// \endcode
4171 ///
4172 /// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
4173 ///
4174 /// \param a
4175 /// A 128-bit integer vector of [8 x i16]. This vector is copied to the
4176 /// result and then one of the eight elements in the result is replaced by
4177 /// the lower 16 bits of \a b.
4178 /// \param b
4179 /// An integer. The lower 16 bits of this parameter are written to the
4180 /// result beginning at an offset specified by \a imm.
4181 /// \param imm
4182 ///    An immediate value specifying which of the eight elements of the result
4183 ///    is replaced by the lower 16 bits of \a b.
4184 /// \returns A 128-bit integer vector containing the constructed values.
4185 #define _mm_insert_epi16(a, b, imm) \
4186 ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
4187 (int)(imm)))
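
/* Illustrative sketch (not part of the original header): reading and
 * replacing a single 16-bit lane. The selector must be a compile-time
 * constant.
 *
 *   __m128i __v = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
 *   int __lane3 = _mm_extract_epi16(__v, 3);     // __lane3 == 3, zero-extended to int
 *   __m128i __w = _mm_insert_epi16(__v, 99, 3);  // lane 3 of __w now holds 99
 */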
4188
4189 /// Copies the values of the most significant bits from each 8-bit
4190 /// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4191 /// value, zero-extends the value, and writes it to the destination.
4192 ///
4193 /// \headerfile <x86intrin.h>
4194 ///
4195 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4196 ///
4197 /// \param __a
4198 /// A 128-bit integer vector containing the values with bits to be extracted.
4199 /// \returns The most significant bits from each 8-bit element in \a __a,
4200 /// written to bits [15:0]. The other bits are assigned zeros.
4201 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
4202 return __builtin_ia32_pmovmskb128((__v16qi)__a);
4203 }
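
/* Illustrative sketch (not part of the original header): using the byte mask
 * to test a 16-byte comparison result, e.g. to find a zero byte in a chunk
 * (a common strlen-style idiom). __p is a hypothetical pointer.
 *
 *   __m128i __chunk = _mm_loadu_si128((const __m128i_u *)__p);
 *   __m128i __eq = _mm_cmpeq_epi8(__chunk, _mm_setzero_si128());
 *   int __mask = _mm_movemask_epi8(__eq);  // bit i set <=> byte i of __chunk is 0
 *   if (__mask != 0) {
 *     // a zero byte exists; its index is the position of the lowest set bit
 *   }
 */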
4204
4205 /// Constructs a 128-bit integer vector by shuffling four 32-bit
4206 /// elements of a 128-bit integer vector parameter, using the immediate-value
4207 /// parameter as a specifier.
4208 ///
4209 /// \headerfile <x86intrin.h>
4210 ///
4211 /// \code
4212 /// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
4213 /// \endcode
4214 ///
4215 /// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
4216 ///
4217 /// \param a
4218 /// A 128-bit integer vector containing the values to be copied.
4219 /// \param imm
4220 /// An immediate value containing an 8-bit value specifying which elements to
4221 /// copy from a. The destinations within the 128-bit destination are assigned
4222 /// values as follows: \n
4223 /// Bits [1:0] are used to assign values to bits [31:0] of the result. \n
4224 /// Bits [3:2] are used to assign values to bits [63:32] of the result. \n
4225 /// Bits [5:4] are used to assign values to bits [95:64] of the result. \n
4226 /// Bits [7:6] are used to assign values to bits [127:96] of the result. \n
4227 /// Bit value assignments: \n
4228 /// 00: assign values from bits [31:0] of \a a. \n
4229 /// 01: assign values from bits [63:32] of \a a. \n
4230 /// 10: assign values from bits [95:64] of \a a. \n
4231 /// 11: assign values from bits [127:96] of \a a. \n
4232 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4233 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4234 /// <c>[b6, b4, b2, b0]</c>.
4235 /// \returns A 128-bit integer vector containing the shuffled values.
4236 #define _mm_shuffle_epi32(a, imm) \
4237 ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
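
/* Illustrative sketch (not part of the original header): broadcasting and
 * reversing 32-bit lanes with the _MM_SHUFFLE macro from <xmmintrin.h>.
 *
 *   __m128i __v = _mm_set_epi32(3, 2, 1, 0);                          // lanes 0..3 = 0,1,2,3
 *   __m128i __bcast = _mm_shuffle_epi32(__v, _MM_SHUFFLE(0, 0, 0, 0)); // every lane = lane 0
 *   __m128i __rev   = _mm_shuffle_epi32(__v, _MM_SHUFFLE(0, 1, 2, 3)); // lanes reversed: 3,2,1,0
 */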
4238
4239 /// Constructs a 128-bit integer vector by shuffling four lower 16-bit
4240 /// elements of a 128-bit integer vector of [8 x i16], using the immediate
4241 /// value parameter as a specifier.
4242 ///
4243 /// \headerfile <x86intrin.h>
4244 ///
4245 /// \code
4246 /// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
4247 /// \endcode
4248 ///
4249 /// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
4250 ///
4251 /// \param a
4252 /// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
4253 /// [127:64] of the result.
4254 /// \param imm
4255 /// An 8-bit immediate value specifying which elements to copy from \a a. \n
4256 /// Bits[1:0] are used to assign values to bits [15:0] of the result. \n
4257 /// Bits[3:2] are used to assign values to bits [31:16] of the result. \n
4258 /// Bits[5:4] are used to assign values to bits [47:32] of the result. \n
4259 /// Bits[7:6] are used to assign values to bits [63:48] of the result. \n
4260 /// Bit value assignments: \n
4261 /// 00: assign values from bits [15:0] of \a a. \n
4262 /// 01: assign values from bits [31:16] of \a a. \n
4263 /// 10: assign values from bits [47:32] of \a a. \n
4264 /// 11: assign values from bits [63:48] of \a a. \n
4265 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4266 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4267 /// <c>[b6, b4, b2, b0]</c>.
4268 /// \returns A 128-bit integer vector containing the shuffled values.
4269 #define _mm_shufflelo_epi16(a, imm) \
4270 ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
4271
4272 /// Constructs a 128-bit integer vector by shuffling four upper 16-bit
4273 /// elements of a 128-bit integer vector of [8 x i16], using the immediate
4274 /// value parameter as a specifier.
4275 ///
4276 /// \headerfile <x86intrin.h>
4277 ///
4278 /// \code
4279 /// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
4280 /// \endcode
4281 ///
4282 /// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
4283 ///
4284 /// \param a
4285 /// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
4286 /// [63:0] of the result.
4287 /// \param imm
4288 /// An 8-bit immediate value specifying which elements to copy from \a a. \n
4289 /// Bits[1:0] are used to assign values to bits [79:64] of the result. \n
4290 /// Bits[3:2] are used to assign values to bits [95:80] of the result. \n
4291 /// Bits[5:4] are used to assign values to bits [111:96] of the result. \n
4292 /// Bits[7:6] are used to assign values to bits [127:112] of the result. \n
4293 /// Bit value assignments: \n
4294 /// 00: assign values from bits [79:64] of \a a. \n
4295 /// 01: assign values from bits [95:80] of \a a. \n
4296 /// 10: assign values from bits [111:96] of \a a. \n
4297 /// 11: assign values from bits [127:112] of \a a. \n
4298 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4299 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4300 /// <c>[b6, b4, b2, b0]</c>.
4301 /// \returns A 128-bit integer vector containing the shuffled values.
4302 #define _mm_shufflehi_epi16(a, imm) \
4303 ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
4304
4305 /// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4306 /// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4307 ///
4308 /// \headerfile <x86intrin.h>
4309 ///
4310 /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4311 /// instruction.
4312 ///
4313 /// \param __a
4314 /// A 128-bit vector of [16 x i8].
4315 /// Bits [71:64] are written to bits [7:0] of the result. \n
4316 /// Bits [79:72] are written to bits [23:16] of the result. \n
4317 /// Bits [87:80] are written to bits [39:32] of the result. \n
4318 /// Bits [95:88] are written to bits [55:48] of the result. \n
4319 /// Bits [103:96] are written to bits [71:64] of the result. \n
4320 /// Bits [111:104] are written to bits [87:80] of the result. \n
4321 /// Bits [119:112] are written to bits [103:96] of the result. \n
4322 /// Bits [127:120] are written to bits [119:112] of the result.
4323 /// \param __b
4324 /// A 128-bit vector of [16 x i8]. \n
4325 /// Bits [71:64] are written to bits [15:8] of the result. \n
4326 /// Bits [79:72] are written to bits [31:24] of the result. \n
4327 /// Bits [87:80] are written to bits [47:40] of the result. \n
4328 /// Bits [95:88] are written to bits [63:56] of the result. \n
4329 /// Bits [103:96] are written to bits [79:72] of the result. \n
4330 /// Bits [111:104] are written to bits [95:88] of the result. \n
4331 /// Bits [119:112] are written to bits [111:104] of the result. \n
4332 /// Bits [127:120] are written to bits [127:120] of the result.
4333 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4334 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
4335 __m128i __b) {
4336 return (__m128i)__builtin_shufflevector(
4337 (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
4338 16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
4339 }
4340
4341 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4342 /// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4343 ///
4344 /// \headerfile <x86intrin.h>
4345 ///
4346 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4347 /// instruction.
4348 ///
4349 /// \param __a
4350 /// A 128-bit vector of [8 x i16].
4351 /// Bits [79:64] are written to bits [15:0] of the result. \n
4352 /// Bits [95:80] are written to bits [47:32] of the result. \n
4353 /// Bits [111:96] are written to bits [79:64] of the result. \n
4354 /// Bits [127:112] are written to bits [111:96] of the result.
4355 /// \param __b
4356 /// A 128-bit vector of [8 x i16].
4357 /// Bits [79:64] are written to bits [31:16] of the result. \n
4358 /// Bits [95:80] are written to bits [63:48] of the result. \n
4359 /// Bits [111:96] are written to bits [95:80] of the result. \n
4360 /// Bits [127:112] are written to bits [127:112] of the result.
4361 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4362 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
4363 __m128i __b) {
4364 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
4365 8 + 5, 6, 8 + 6, 7, 8 + 7);
4366 }
4367
4368 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4369 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4370 ///
4371 /// \headerfile <x86intrin.h>
4372 ///
4373 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4374 /// instruction.
4375 ///
4376 /// \param __a
4377 /// A 128-bit vector of [4 x i32]. \n
4378 /// Bits [95:64] are written to bits [31:0] of the destination. \n
4379 /// Bits [127:96] are written to bits [95:64] of the destination.
4380 /// \param __b
4381 /// A 128-bit vector of [4 x i32]. \n
4382 ///    Bits [95:64] are written to bits [63:32] of the destination. \n
4383 /// Bits [127:96] are written to bits [127:96] of the destination.
4384 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4385 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
4386 __m128i __b) {
4387 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
4388 4 + 3);
4389 }
4390
4391 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4392 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4393 ///
4394 /// \headerfile <x86intrin.h>
4395 ///
4396 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4397 /// instruction.
4398 ///
4399 /// \param __a
4400 /// A 128-bit vector of [2 x i64]. \n
4401 /// Bits [127:64] are written to bits [63:0] of the destination.
4402 /// \param __b
4403 /// A 128-bit vector of [2 x i64]. \n
4404 /// Bits [127:64] are written to bits [127:64] of the destination.
4405 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4406 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
4407 __m128i __b) {
4408 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1);
4409 }
4410
4411 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4412 /// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4413 ///
4414 /// \headerfile <x86intrin.h>
4415 ///
4416 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4417 /// instruction.
4418 ///
4419 /// \param __a
4420 /// A 128-bit vector of [16 x i8]. \n
4421 /// Bits [7:0] are written to bits [7:0] of the result. \n
4422 /// Bits [15:8] are written to bits [23:16] of the result. \n
4423 /// Bits [23:16] are written to bits [39:32] of the result. \n
4424 /// Bits [31:24] are written to bits [55:48] of the result. \n
4425 /// Bits [39:32] are written to bits [71:64] of the result. \n
4426 /// Bits [47:40] are written to bits [87:80] of the result. \n
4427 /// Bits [55:48] are written to bits [103:96] of the result. \n
4428 /// Bits [63:56] are written to bits [119:112] of the result.
4429 /// \param __b
4430 /// A 128-bit vector of [16 x i8].
4431 /// Bits [7:0] are written to bits [15:8] of the result. \n
4432 /// Bits [15:8] are written to bits [31:24] of the result. \n
4433 /// Bits [23:16] are written to bits [47:40] of the result. \n
4434 /// Bits [31:24] are written to bits [63:56] of the result. \n
4435 /// Bits [39:32] are written to bits [79:72] of the result. \n
4436 /// Bits [47:40] are written to bits [95:88] of the result. \n
4437 /// Bits [55:48] are written to bits [111:104] of the result. \n
4438 /// Bits [63:56] are written to bits [127:120] of the result.
4439 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4440 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
4441 __m128i __b) {
4442 return (__m128i)__builtin_shufflevector(
4443 (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
4444 16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
4445 }
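
/* Illustrative sketch (not part of the original header): a common use of the
 * byte unpacks is widening. Interleaving with a zero vector zero-extends the
 * low and high eight unsigned bytes of __v (a hypothetical [16 x u8] vector)
 * into eight 16-bit lanes each.
 *
 *   __m128i __zero = _mm_setzero_si128();
 *   __m128i __lo16 = _mm_unpacklo_epi8(__v, __zero);
 *   __m128i __hi16 = _mm_unpackhi_epi8(__v, __zero);
 */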
4446
4447 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4448 /// vectors of [8 x i16] and interleaves them into a 128-bit vector of
4449 /// [8 x i16].
4450 ///
4451 /// \headerfile <x86intrin.h>
4452 ///
4453 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4454 /// instruction.
4455 ///
4456 /// \param __a
4457 /// A 128-bit vector of [8 x i16].
4458 /// Bits [15:0] are written to bits [15:0] of the result. \n
4459 /// Bits [31:16] are written to bits [47:32] of the result. \n
4460 /// Bits [47:32] are written to bits [79:64] of the result. \n
4461 /// Bits [63:48] are written to bits [111:96] of the result.
4462 /// \param __b
4463 /// A 128-bit vector of [8 x i16].
4464 /// Bits [15:0] are written to bits [31:16] of the result. \n
4465 /// Bits [31:16] are written to bits [63:48] of the result. \n
4466 /// Bits [47:32] are written to bits [95:80] of the result. \n
4467 /// Bits [63:48] are written to bits [127:112] of the result.
4468 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4469 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
4470 __m128i __b) {
4471 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
4472 8 + 1, 2, 8 + 2, 3, 8 + 3);
4473 }
4474
4475 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4476 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4477 ///
4478 /// \headerfile <x86intrin.h>
4479 ///
4480 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4481 /// instruction.
4482 ///
4483 /// \param __a
4484 /// A 128-bit vector of [4 x i32]. \n
4485 /// Bits [31:0] are written to bits [31:0] of the destination. \n
4486 /// Bits [63:32] are written to bits [95:64] of the destination.
4487 /// \param __b
4488 /// A 128-bit vector of [4 x i32]. \n
4489 ///    Bits [31:0] are written to bits [63:32] of the destination. \n
4490 /// Bits [63:32] are written to bits [127:96] of the destination.
4491 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4492 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
4493 __m128i __b) {
4494 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
4495 4 + 1);
4496 }
4497
4498 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4499 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4500 ///
4501 /// \headerfile <x86intrin.h>
4502 ///
4503 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4504 /// instruction.
4505 ///
4506 /// \param __a
4507 /// A 128-bit vector of [2 x i64]. \n
4508 /// Bits [63:0] are written to bits [63:0] of the destination. \n
4509 /// \param __b
4510 /// A 128-bit vector of [2 x i64]. \n
4511 /// Bits [63:0] are written to bits [127:64] of the destination. \n
4512 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4513 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
4514 __m128i __b) {
4515 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0);
4516 }
4517
4518 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4519 /// integer.
4520 ///
4521 /// \headerfile <x86intrin.h>
4522 ///
4523 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4524 ///
4525 /// \param __a
4526 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4527 /// destination.
4528 /// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4529 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
4530 return (__m64)__a[0];
4531 }
4532
4533 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4534 /// upper bits.
4535 ///
4536 /// \headerfile <x86intrin.h>
4537 ///
4538 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4539 ///
4540 /// \param __a
4541 /// A 64-bit value.
4542 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4543 /// the operand. The upper 64 bits are assigned zeros.
4544 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
4545 return __extension__(__m128i)(__v2di){(long long)__a, 0};
4546 }
4547
4548 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4549 /// integer vector, zeroing the upper bits.
4550 ///
4551 /// \headerfile <x86intrin.h>
4552 ///
4553 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4554 ///
4555 /// \param __a
4556 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4557 /// destination.
4558 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4559 /// the operand. The upper 64 bits are assigned zeros.
4560 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
4561 return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4562 }
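
/* Illustrative sketch (not part of the original header): clearing the upper
 * 64 bits of a vector before an operation that must not observe them.
 *
 *   __m128i __v = _mm_set_epi64x(-1LL, 7LL);  // high = all-ones, low = 7
 *   __m128i __q = _mm_move_epi64(__v);        // __q = {7, 0}
 */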
4563
4564 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4565 /// [2 x double] and interleaves them into a 128-bit vector of [2 x
4566 /// double].
4567 ///
4568 /// \headerfile <x86intrin.h>
4569 ///
4570 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4571 ///
4572 /// \param __a
4573 /// A 128-bit vector of [2 x double]. \n
4574 /// Bits [127:64] are written to bits [63:0] of the destination.
4575 /// \param __b
4576 /// A 128-bit vector of [2 x double]. \n
4577 /// Bits [127:64] are written to bits [127:64] of the destination.
4578 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4579 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
4580 __m128d __b) {
4581 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1);
4582 }
4583
4584 /// Unpacks the low-order 64-bit elements from two 128-bit vectors
4585 /// of [2 x double] and interleaves them into a 128-bit vector of [2 x
4586 /// double].
4587 ///
4588 /// \headerfile <x86intrin.h>
4589 ///
4590 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4591 ///
4592 /// \param __a
4593 /// A 128-bit vector of [2 x double]. \n
4594 /// Bits [63:0] are written to bits [63:0] of the destination.
4595 /// \param __b
4596 /// A 128-bit vector of [2 x double]. \n
4597 /// Bits [63:0] are written to bits [127:64] of the destination.
4598 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4599 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a,
4600 __m128d __b) {
4601 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0);
4602 }
4603
4604 /// Extracts the sign bits of the double-precision values in the 128-bit
4605 /// vector of [2 x double], zero-extends the value, and writes it to the
4606 /// low-order bits of the destination.
4607 ///
4608 /// \headerfile <x86intrin.h>
4609 ///
4610 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4611 ///
4612 /// \param __a
4613 /// A 128-bit vector of [2 x double] containing the values with sign bits to
4614 /// be extracted.
4615 /// \returns The sign bits from each of the double-precision elements in \a __a,
4616 /// written to bits [1:0]. The remaining bits are assigned values of zero.
4617 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
4618 return __builtin_ia32_movmskpd((__v2df)__a);
4619 }
4620
4621 /// Constructs a 128-bit floating-point vector of [2 x double] from two
4622 /// 128-bit vector parameters of [2 x double], using the immediate-value
4623 /// parameter as a specifier.
4624 ///
4625 /// \headerfile <x86intrin.h>
4626 ///
4627 /// \code
4628 /// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
4629 /// \endcode
4630 ///
4631 /// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
4632 ///
4633 /// \param a
4634 /// A 128-bit vector of [2 x double].
4635 /// \param b
4636 /// A 128-bit vector of [2 x double].
4637 /// \param i
4638 /// An 8-bit immediate value. The least significant two bits specify which
4639 /// elements to copy from \a a and \a b: \n
4640 /// Bit[0] = 0: lower element of \a a copied to lower element of result. \n
4641 /// Bit[0] = 1: upper element of \a a copied to lower element of result. \n
4642 /// Bit[1] = 0: lower element of \a b copied to upper element of result. \n
4643 /// Bit[1] = 1: upper element of \a b copied to upper element of result. \n
4644 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
4645 /// <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form
4646 /// <c>[b1, b0]</c>.
4647 /// \returns A 128-bit vector of [2 x double] containing the shuffled values.
4648 #define _mm_shuffle_pd(a, b, i) \
4649 ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
4650 (int)(i)))
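
/* Illustrative sketch (not part of the original header): selecting one
 * element from each source with _MM_SHUFFLE2 (defined later in this header).
 *
 *   __m128d __a = _mm_set_pd(2.0, 1.0);  // __a = {1.0, 2.0} (low, high)
 *   __m128d __b = _mm_set_pd(4.0, 3.0);  // __b = {3.0, 4.0}
 *   __m128d __r = _mm_shuffle_pd(__a, __b, _MM_SHUFFLE2(0, 1));
 *   // bit 0 = 1: upper of __a -> lower of __r; bit 1 = 0: lower of __b -> upper of __r
 *   // __r = {2.0, 3.0}
 */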
4651
4652 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4653 /// floating-point vector of [4 x float].
4654 ///
4655 /// \headerfile <x86intrin.h>
4656 ///
4657 /// This intrinsic has no corresponding instruction.
4658 ///
4659 /// \param __a
4660 /// A 128-bit floating-point vector of [2 x double].
4661 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4662 /// bitwise pattern as the parameter.
4663 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
4664 return (__m128)__a;
4665 }
4666
4667 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4668 /// integer vector.
4669 ///
4670 /// \headerfile <x86intrin.h>
4671 ///
4672 /// This intrinsic has no corresponding instruction.
4673 ///
4674 /// \param __a
4675 /// A 128-bit floating-point vector of [2 x double].
4676 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4677 /// parameter.
4678 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
4679 return (__m128i)__a;
4680 }
4681
4682 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4683 /// floating-point vector of [2 x double].
4684 ///
4685 /// \headerfile <x86intrin.h>
4686 ///
4687 /// This intrinsic has no corresponding instruction.
4688 ///
4689 /// \param __a
4690 /// A 128-bit floating-point vector of [4 x float].
4691 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4692 /// bitwise pattern as the parameter.
4693 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
4694 return (__m128d)__a;
4695 }
4696
4697 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4698 /// integer vector.
4699 ///
4700 /// \headerfile <x86intrin.h>
4701 ///
4702 /// This intrinsic has no corresponding instruction.
4703 ///
4704 /// \param __a
4705 /// A 128-bit floating-point vector of [4 x float].
4706 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4707 /// parameter.
4708 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
4709 return (__m128i)__a;
4710 }
4711
4712 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4713 /// of [4 x float].
4714 ///
4715 /// \headerfile <x86intrin.h>
4716 ///
4717 /// This intrinsic has no corresponding instruction.
4718 ///
4719 /// \param __a
4720 /// A 128-bit integer vector.
4721 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4722 /// bitwise pattern as the parameter.
4723 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
4724 return (__m128)__a;
4725 }
4726
4727 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4728 /// of [2 x double].
4729 ///
4730 /// \headerfile <x86intrin.h>
4731 ///
4732 /// This intrinsic has no corresponding instruction.
4733 ///
4734 /// \param __a
4735 /// A 128-bit integer vector.
4736 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4737 /// bitwise pattern as the parameter.
4738 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
4739 return (__m128d)__a;
4740 }
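
/* Illustrative sketch (not part of the original header): the casts are free
 * reinterpretations used to mix integer and floating-point operations on the
 * same register, e.g. clearing the sign bits of a [2 x double] vector with an
 * integer AND.
 *
 *   __m128d __x = _mm_set_pd(-2.0, -1.0);
 *   __m128i __sign_off = _mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL);
 *   __m128d __abs = _mm_castsi128_pd(
 *       _mm_and_si128(_mm_castpd_si128(__x), __sign_off));  // __abs = {1.0, 2.0}
 */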
4741
4742 /// Compares each of the corresponding double-precision values of two
4743 /// 128-bit vectors of [2 x double], using the operation specified by the
4744 /// immediate integer operand.
4745 ///
4746 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
4747 ///
4748 /// \headerfile <x86intrin.h>
4749 ///
4750 /// \code
4751 /// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
4752 /// \endcode
4753 ///
4754 /// This intrinsic corresponds to the <c> (V)CMPPD </c> instruction.
4755 ///
4756 /// \param a
4757 /// A 128-bit vector of [2 x double].
4758 /// \param b
4759 /// A 128-bit vector of [2 x double].
4760 /// \param c
4761 /// An immediate integer operand, with bits [4:0] specifying which comparison
4762 /// operation to use: \n
4763 /// 0x00: Equal (ordered, non-signaling) \n
4764 /// 0x01: Less-than (ordered, signaling) \n
4765 /// 0x02: Less-than-or-equal (ordered, signaling) \n
4766 /// 0x03: Unordered (non-signaling) \n
4767 /// 0x04: Not-equal (unordered, non-signaling) \n
4768 /// 0x05: Not-less-than (unordered, signaling) \n
4769 /// 0x06: Not-less-than-or-equal (unordered, signaling) \n
4770 /// 0x07: Ordered (non-signaling) \n
4771 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
4772 #define _mm_cmp_pd(a, b, c) \
4773 ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
4774 (c)))
4775
4776 /// Compares each of the corresponding scalar double-precision values of
4777 /// two 128-bit vectors of [2 x double], using the operation specified by the
4778 /// immediate integer operand.
4779 ///
4780 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
4781 ///
4782 /// \headerfile <x86intrin.h>
4783 ///
4784 /// \code
4785 /// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
4786 /// \endcode
4787 ///
4788 /// This intrinsic corresponds to the <c> (V)CMPSD </c> instruction.
4789 ///
4790 /// \param a
4791 /// A 128-bit vector of [2 x double].
4792 /// \param b
4793 /// A 128-bit vector of [2 x double].
4794 /// \param c
4795 /// An immediate integer operand, with bits [4:0] specifying which comparison
4796 /// operation to use: \n
4797 /// 0x00: Equal (ordered, non-signaling) \n
4798 /// 0x01: Less-than (ordered, signaling) \n
4799 /// 0x02: Less-than-or-equal (ordered, signaling) \n
4800 /// 0x03: Unordered (non-signaling) \n
4801 /// 0x04: Not-equal (unordered, non-signaling) \n
4802 /// 0x05: Not-less-than (unordered, signaling) \n
4803 /// 0x06: Not-less-than-or-equal (unordered, signaling) \n
4804 /// 0x07: Ordered (non-signaling) \n
4805 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
4806 #define _mm_cmp_sd(a, b, c) \
4807 ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
4808 (c)))
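
/* Illustrative sketch (not part of the original header): an ordered
 * less-than comparison producing an all-ones/all-zeros mask per element,
 * using predicate 0x01 from the table above (equivalent to _mm_cmplt_pd).
 *
 *   __m128d __a = _mm_set_pd(4.0, 1.0);
 *   __m128d __b = _mm_set_pd(3.0, 2.0);
 *   __m128d __m = _mm_cmp_pd(__a, __b, 0x01);  // low: 1.0 < 2.0 -> all-ones; high: 4.0 < 3.0 -> zeros
 *   int __bits = _mm_movemask_pd(__m);         // __bits == 0x1
 */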
4809
4810 #if defined(__cplusplus)
4811 extern "C" {
4812 #endif
4813
4814 /// Indicates that a spin loop is being executed for the purposes of
4815 /// optimizing power consumption during the loop.
4816 ///
4817 /// \headerfile <x86intrin.h>
4818 ///
4819 /// This intrinsic corresponds to the <c> PAUSE </c> instruction.
4820 ///
4821 void _mm_pause(void);
4822
4823 #if defined(__cplusplus)
4824 } // extern "C"
4825 #endif
4826 #undef __DEFAULT_FN_ATTRS
4827 #undef __DEFAULT_FN_ATTRS_MMX
4828
4829 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
4830
4831 #define _MM_DENORMALS_ZERO_ON (0x0040U)
4832 #define _MM_DENORMALS_ZERO_OFF (0x0000U)
4833
4834 #define _MM_DENORMALS_ZERO_MASK (0x0040U)
4835
4836 #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
4837 #define _MM_SET_DENORMALS_ZERO_MODE(x) \
4838 (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
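
/* Illustrative sketch (not part of the original header): enabling
 * denormals-are-zero for the current thread, doing some work, then restoring
 * the previous mode.
 *
 *   unsigned int __old = _MM_GET_DENORMALS_ZERO_MODE();
 *   _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
 *   // ... SSE computations that may consume denormal inputs ...
 *   _MM_SET_DENORMALS_ZERO_MODE(__old);
 */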
4839
4840 #endif /* __EMMINTRIN_H */
4841