xref: /aosp_15_r20/prebuilts/clang-tools/linux-x86/lib64/clang/19/include/emmintrin.h (revision bed243d3d9cd544cfb038bfa7be843dedc6e6bf7)
1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __EMMINTRIN_H
11 #define __EMMINTRIN_H
12 
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
16 
17 #include <xmmintrin.h>
18 
/* Public 128-bit vector types: 16 bytes wide and 16-byte aligned. */
19 typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
20 typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
21 
/* The "_u" variants are the same 16-byte vectors with the alignment
 * requirement lowered to 1 byte. NOTE(review): presumably for unaligned
 * memory access; no use is visible in this part of the file. */
22 typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
23 typedef long long __m128i_u
24     __attribute__((__vector_size__(16), __aligned__(1)));
25 
26 /* Element-typed 16-byte vector types. The intrinsics below cast their
 * operands to these (e.g. __v2df, __v2du) before doing lane-wise work.  */
27 typedef double __v2df __attribute__((__vector_size__(16)));
28 typedef long long __v2di __attribute__((__vector_size__(16)));
29 typedef short __v8hi __attribute__((__vector_size__(16)));
30 typedef char __v16qi __attribute__((__vector_size__(16)));
31 
32 /* Unsigned counterparts of the element-typed vectors above. */
33 typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
34 typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
35 typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
36 
37 /* We need an explicitly signed variant for char. Note that this shouldn't
38  * appear in the interface though. */
39 typedef signed char __v16qs __attribute__((__vector_size__(16)));
40 
41 #ifdef __SSE2__
42 /* 16-byte vector types with _Float16 and __bf16 elements. Both element types
 * require SSE2 to be enabled, which is why they sit behind this guard. */
43 typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
44 typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
45 typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
46 
47 typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
48 typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
49 #endif
50 
51 /* Default attributes applied to the intrinsics defined in this file:
 * always inlined, no debug info, built with the "sse2,no-evex512" target
 * features, and a declared minimum vector width of 128 bits. */
52 #define __DEFAULT_FN_ATTRS                                                     \
53   __attribute__((__always_inline__, __nodebug__,                               \
54                  __target__("sse2,no-evex512"), __min_vector_width__(128)))
/* Variant that additionally requests the "mmx" target feature and declares a
 * 64-bit minimum vector width. No use of it is visible in this part of the
 * file. */
55 #define __DEFAULT_FN_ATTRS_MMX                                                 \
56   __attribute__((__always_inline__, __nodebug__,                               \
57                  __target__("mmx,sse2,no-evex512"), __min_vector_width__(64)))
58 
59 /// Adds lower double-precision values in both operands and returns the
60 ///    sum in the lower 64 bits of the result. The upper 64 bits of the result
61 ///    are copied from the upper double-precision value of the first operand.
62 ///
63 /// \headerfile <x86intrin.h>
64 ///
65 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
66 ///
67 /// \param __a
68 ///    A 128-bit vector of [2 x double] containing one of the source operands.
69 /// \param __b
70 ///    A 128-bit vector of [2 x double] containing one of the source operands.
71 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
72 ///    sum of the lower 64 bits of both operands. The upper 64 bits are copied
73 ///    from the upper 64 bits of the first source operand.
_mm_add_sd(__m128d __a,__m128d __b)74 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a,
75                                                         __m128d __b) {
76   __a[0] += __b[0];
77   return __a;
78 }
79 
80 /// Adds two 128-bit vectors of [2 x double].
81 ///
82 /// \headerfile <x86intrin.h>
83 ///
84 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
85 ///
86 /// \param __a
87 ///    A 128-bit vector of [2 x double] containing one of the source operands.
88 /// \param __b
89 ///    A 128-bit vector of [2 x double] containing one of the source operands.
90 /// \returns A 128-bit vector of [2 x double] containing the sums of both
91 ///    operands.
_mm_add_pd(__m128d __a,__m128d __b)92 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a,
93                                                         __m128d __b) {
94   return (__m128d)((__v2df)__a + (__v2df)__b);
95 }
96 
97 /// Subtracts the lower double-precision value of the second operand
98 ///    from the lower double-precision value of the first operand and returns
99 ///    the difference in the lower 64 bits of the result. The upper 64 bits of
100 ///    the result are copied from the upper double-precision value of the first
101 ///    operand.
102 ///
103 /// \headerfile <x86intrin.h>
104 ///
105 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
106 ///
107 /// \param __a
108 ///    A 128-bit vector of [2 x double] containing the minuend.
109 /// \param __b
110 ///    A 128-bit vector of [2 x double] containing the subtrahend.
111 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
112 ///    difference of the lower 64 bits of both operands. The upper 64 bits are
113 ///    copied from the upper 64 bits of the first source operand.
_mm_sub_sd(__m128d __a,__m128d __b)114 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a,
115                                                         __m128d __b) {
116   __a[0] -= __b[0];
117   return __a;
118 }
119 
120 /// Subtracts two 128-bit vectors of [2 x double].
121 ///
122 /// \headerfile <x86intrin.h>
123 ///
124 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
125 ///
126 /// \param __a
127 ///    A 128-bit vector of [2 x double] containing the minuend.
128 /// \param __b
129 ///    A 128-bit vector of [2 x double] containing the subtrahend.
130 /// \returns A 128-bit vector of [2 x double] containing the differences between
131 ///    both operands.
_mm_sub_pd(__m128d __a,__m128d __b)132 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a,
133                                                         __m128d __b) {
134   return (__m128d)((__v2df)__a - (__v2df)__b);
135 }
136 
137 /// Multiplies lower double-precision values in both operands and returns
138 ///    the product in the lower 64 bits of the result. The upper 64 bits of the
139 ///    result are copied from the upper double-precision value of the first
140 ///    operand.
141 ///
142 /// \headerfile <x86intrin.h>
143 ///
144 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
145 ///
146 /// \param __a
147 ///    A 128-bit vector of [2 x double] containing one of the source operands.
148 /// \param __b
149 ///    A 128-bit vector of [2 x double] containing one of the source operands.
150 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
151 ///    product of the lower 64 bits of both operands. The upper 64 bits are
152 ///    copied from the upper 64 bits of the first source operand.
_mm_mul_sd(__m128d __a,__m128d __b)153 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a,
154                                                         __m128d __b) {
155   __a[0] *= __b[0];
156   return __a;
157 }
158 
159 /// Multiplies two 128-bit vectors of [2 x double].
160 ///
161 /// \headerfile <x86intrin.h>
162 ///
163 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
164 ///
165 /// \param __a
166 ///    A 128-bit vector of [2 x double] containing one of the operands.
167 /// \param __b
168 ///    A 128-bit vector of [2 x double] containing one of the operands.
169 /// \returns A 128-bit vector of [2 x double] containing the products of both
170 ///    operands.
_mm_mul_pd(__m128d __a,__m128d __b)171 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a,
172                                                         __m128d __b) {
173   return (__m128d)((__v2df)__a * (__v2df)__b);
174 }
175 
176 /// Divides the lower double-precision value of the first operand by the
177 ///    lower double-precision value of the second operand and returns the
178 ///    quotient in the lower 64 bits of the result. The upper 64 bits of the
179 ///    result are copied from the upper double-precision value of the first
180 ///    operand.
181 ///
182 /// \headerfile <x86intrin.h>
183 ///
184 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
185 ///
186 /// \param __a
187 ///    A 128-bit vector of [2 x double] containing the dividend.
188 /// \param __b
189 ///    A 128-bit vector of [2 x double] containing the divisor.
190 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
191 ///    quotient of the lower 64 bits of both operands. The upper 64 bits are
192 ///    copied from the upper 64 bits of the first source operand.
_mm_div_sd(__m128d __a,__m128d __b)193 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a,
194                                                         __m128d __b) {
195   __a[0] /= __b[0];
196   return __a;
197 }
198 
199 /// Performs an element-by-element division of two 128-bit vectors of
200 ///    [2 x double].
201 ///
202 /// \headerfile <x86intrin.h>
203 ///
204 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
205 ///
206 /// \param __a
207 ///    A 128-bit vector of [2 x double] containing the dividend.
208 /// \param __b
209 ///    A 128-bit vector of [2 x double] containing the divisor.
210 /// \returns A 128-bit vector of [2 x double] containing the quotients of both
211 ///    operands.
_mm_div_pd(__m128d __a,__m128d __b)212 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a,
213                                                         __m128d __b) {
214   return (__m128d)((__v2df)__a / (__v2df)__b);
215 }
216 
217 /// Calculates the square root of the lower double-precision value of
218 ///    the second operand and returns it in the lower 64 bits of the result.
219 ///    The upper 64 bits of the result are copied from the upper
220 ///    double-precision value of the first operand.
221 ///
222 /// \headerfile <x86intrin.h>
223 ///
224 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
225 ///
226 /// \param __a
227 ///    A 128-bit vector of [2 x double] containing one of the operands. The
228 ///    upper 64 bits of this operand are copied to the upper 64 bits of the
229 ///    result.
230 /// \param __b
231 ///    A 128-bit vector of [2 x double] containing one of the operands. The
232 ///    square root is calculated using the lower 64 bits of this operand.
233 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
234 ///    square root of the lower 64 bits of operand \a __b, and whose upper 64
235 ///    bits are copied from the upper 64 bits of operand \a __a.
_mm_sqrt_sd(__m128d __a,__m128d __b)236 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
237                                                          __m128d __b) {
238   __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
239   return __extension__(__m128d){__c[0], __a[1]};
240 }
241 
242 /// Calculates the square root of each of the two values stored in a
243 ///    128-bit vector of [2 x double].
244 ///
245 /// \headerfile <x86intrin.h>
246 ///
247 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
248 ///
249 /// \param __a
250 ///    A 128-bit vector of [2 x double].
251 /// \returns A 128-bit vector of [2 x double] containing the square roots of the
252 ///    values in the operand.
_mm_sqrt_pd(__m128d __a)253 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
254   return __builtin_ia32_sqrtpd((__v2df)__a);
255 }
256 
257 /// Compares lower 64-bit double-precision values of both operands, and
258 ///    returns the lesser of the pair of values in the lower 64-bits of the
259 ///    result. The upper 64 bits of the result are copied from the upper
260 ///    double-precision value of the first operand.
261 ///
262 /// \headerfile <x86intrin.h>
263 ///
264 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
265 ///
266 /// \param __a
267 ///    A 128-bit vector of [2 x double] containing one of the operands. The
268 ///    lower 64 bits of this operand are used in the comparison.
269 /// \param __b
270 ///    A 128-bit vector of [2 x double] containing one of the operands. The
271 ///    lower 64 bits of this operand are used in the comparison.
272 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
273 ///    minimum value between both operands. The upper 64 bits are copied from
274 ///    the upper 64 bits of the first source operand.
_mm_min_sd(__m128d __a,__m128d __b)275 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a,
276                                                         __m128d __b) {
277   return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
278 }
279 
280 /// Performs element-by-element comparison of the two 128-bit vectors of
281 ///    [2 x double] and returns the vector containing the lesser of each pair of
282 ///    values.
283 ///
284 /// \headerfile <x86intrin.h>
285 ///
286 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
287 ///
288 /// \param __a
289 ///    A 128-bit vector of [2 x double] containing one of the operands.
290 /// \param __b
291 ///    A 128-bit vector of [2 x double] containing one of the operands.
292 /// \returns A 128-bit vector of [2 x double] containing the minimum values
293 ///    between both operands.
_mm_min_pd(__m128d __a,__m128d __b)294 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a,
295                                                         __m128d __b) {
296   return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
297 }
298 
299 /// Compares lower 64-bit double-precision values of both operands, and
300 ///    returns the greater of the pair of values in the lower 64-bits of the
301 ///    result. The upper 64 bits of the result are copied from the upper
302 ///    double-precision value of the first operand.
303 ///
304 /// \headerfile <x86intrin.h>
305 ///
306 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
307 ///
308 /// \param __a
309 ///    A 128-bit vector of [2 x double] containing one of the operands. The
310 ///    lower 64 bits of this operand are used in the comparison.
311 /// \param __b
312 ///    A 128-bit vector of [2 x double] containing one of the operands. The
313 ///    lower 64 bits of this operand are used in the comparison.
314 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
315 ///    maximum value between both operands. The upper 64 bits are copied from
316 ///    the upper 64 bits of the first source operand.
_mm_max_sd(__m128d __a,__m128d __b)317 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a,
318                                                         __m128d __b) {
319   return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
320 }
321 
322 /// Performs element-by-element comparison of the two 128-bit vectors of
323 ///    [2 x double] and returns the vector containing the greater of each pair
324 ///    of values.
325 ///
326 /// \headerfile <x86intrin.h>
327 ///
328 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
329 ///
330 /// \param __a
331 ///    A 128-bit vector of [2 x double] containing one of the operands.
332 /// \param __b
333 ///    A 128-bit vector of [2 x double] containing one of the operands.
334 /// \returns A 128-bit vector of [2 x double] containing the maximum values
335 ///    between both operands.
_mm_max_pd(__m128d __a,__m128d __b)336 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a,
337                                                         __m128d __b) {
338   return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
339 }
340 
341 /// Performs a bitwise AND of two 128-bit vectors of [2 x double].
342 ///
343 /// \headerfile <x86intrin.h>
344 ///
345 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
346 ///
347 /// \param __a
348 ///    A 128-bit vector of [2 x double] containing one of the source operands.
349 /// \param __b
350 ///    A 128-bit vector of [2 x double] containing one of the source operands.
351 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
352 ///    values between both operands.
_mm_and_pd(__m128d __a,__m128d __b)353 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a,
354                                                         __m128d __b) {
355   return (__m128d)((__v2du)__a & (__v2du)__b);
356 }
357 
358 /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
359 ///    the one's complement of the values contained in the first source operand.
360 ///
361 /// \headerfile <x86intrin.h>
362 ///
363 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
364 ///
365 /// \param __a
366 ///    A 128-bit vector of [2 x double] containing the left source operand. The
367 ///    one's complement of this value is used in the bitwise AND.
368 /// \param __b
369 ///    A 128-bit vector of [2 x double] containing the right source operand.
370 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
371 ///    values in the second operand and the one's complement of the first
372 ///    operand.
_mm_andnot_pd(__m128d __a,__m128d __b)373 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a,
374                                                            __m128d __b) {
375   return (__m128d)(~(__v2du)__a & (__v2du)__b);
376 }
377 
378 /// Performs a bitwise OR of two 128-bit vectors of [2 x double].
379 ///
380 /// \headerfile <x86intrin.h>
381 ///
382 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
383 ///
384 /// \param __a
385 ///    A 128-bit vector of [2 x double] containing one of the source operands.
386 /// \param __b
387 ///    A 128-bit vector of [2 x double] containing one of the source operands.
388 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
389 ///    values between both operands.
_mm_or_pd(__m128d __a,__m128d __b)390 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a,
391                                                        __m128d __b) {
392   return (__m128d)((__v2du)__a | (__v2du)__b);
393 }
394 
395 /// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
396 ///
397 /// \headerfile <x86intrin.h>
398 ///
399 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
400 ///
401 /// \param __a
402 ///    A 128-bit vector of [2 x double] containing one of the source operands.
403 /// \param __b
404 ///    A 128-bit vector of [2 x double] containing one of the source operands.
405 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
406 ///    values between both operands.
_mm_xor_pd(__m128d __a,__m128d __b)407 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a,
408                                                         __m128d __b) {
409   return (__m128d)((__v2du)__a ^ (__v2du)__b);
410 }
411 
412 /// Compares each of the corresponding double-precision values of the
413 ///    128-bit vectors of [2 x double] for equality.
414 ///
415 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
416 ///
417 /// \headerfile <x86intrin.h>
418 ///
419 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
420 ///
421 /// \param __a
422 ///    A 128-bit vector of [2 x double].
423 /// \param __b
424 ///    A 128-bit vector of [2 x double].
425 /// \returns A 128-bit vector containing the comparison results.
_mm_cmpeq_pd(__m128d __a,__m128d __b)426 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
427                                                           __m128d __b) {
428   return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
429 }
430 
431 /// Compares each of the corresponding double-precision values of the
432 ///    128-bit vectors of [2 x double] to determine if the values in the first
433 ///    operand are less than those in the second operand.
434 ///
435 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
436 ///
437 /// \headerfile <x86intrin.h>
438 ///
439 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
440 ///
441 /// \param __a
442 ///    A 128-bit vector of [2 x double].
443 /// \param __b
444 ///    A 128-bit vector of [2 x double].
445 /// \returns A 128-bit vector containing the comparison results.
_mm_cmplt_pd(__m128d __a,__m128d __b)446 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a,
447                                                           __m128d __b) {
448   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
449 }
450 
451 /// Compares each of the corresponding double-precision values of the
452 ///    128-bit vectors of [2 x double] to determine if the values in the first
453 ///    operand are less than or equal to those in the second operand.
454 ///
455 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
456 ///
457 /// \headerfile <x86intrin.h>
458 ///
459 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
460 ///
461 /// \param __a
462 ///    A 128-bit vector of [2 x double].
463 /// \param __b
464 ///    A 128-bit vector of [2 x double].
465 /// \returns A 128-bit vector containing the comparison results.
_mm_cmple_pd(__m128d __a,__m128d __b)466 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a,
467                                                           __m128d __b) {
468   return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
469 }
470 
471 /// Compares each of the corresponding double-precision values of the
472 ///    128-bit vectors of [2 x double] to determine if the values in the first
473 ///    operand are greater than those in the second operand.
474 ///
475 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
476 ///
477 /// \headerfile <x86intrin.h>
478 ///
479 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
480 ///
481 /// \param __a
482 ///    A 128-bit vector of [2 x double].
483 /// \param __b
484 ///    A 128-bit vector of [2 x double].
485 /// \returns A 128-bit vector containing the comparison results.
_mm_cmpgt_pd(__m128d __a,__m128d __b)486 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a,
487                                                           __m128d __b) {
488   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
489 }
490 
491 /// Compares each of the corresponding double-precision values of the
492 ///    128-bit vectors of [2 x double] to determine if the values in the first
493 ///    operand are greater than or equal to those in the second operand.
494 ///
495 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
496 ///
497 /// \headerfile <x86intrin.h>
498 ///
499 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
500 ///
501 /// \param __a
502 ///    A 128-bit vector of [2 x double].
503 /// \param __b
504 ///    A 128-bit vector of [2 x double].
505 /// \returns A 128-bit vector containing the comparison results.
_mm_cmpge_pd(__m128d __a,__m128d __b)506 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a,
507                                                           __m128d __b) {
508   return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
509 }
510 
511 /// Compares each of the corresponding double-precision values of the
512 ///    128-bit vectors of [2 x double] to determine if the values in the first
513 ///    operand are ordered with respect to those in the second operand.
514 ///
515 ///    A pair of double-precision values are "ordered" with respect to each
516 ///    other if neither value is a NaN. Each comparison yields 0x0 for false,
517 ///    0xFFFFFFFFFFFFFFFF for true.
518 ///
519 /// \headerfile <x86intrin.h>
520 ///
521 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
522 ///
523 /// \param __a
524 ///    A 128-bit vector of [2 x double].
525 /// \param __b
526 ///    A 128-bit vector of [2 x double].
527 /// \returns A 128-bit vector containing the comparison results.
_mm_cmpord_pd(__m128d __a,__m128d __b)528 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a,
529                                                            __m128d __b) {
530   return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
531 }
532 
533 /// Compares each of the corresponding double-precision values of the
534 ///    128-bit vectors of [2 x double] to determine if the values in the first
535 ///    operand are unordered with respect to those in the second operand.
536 ///
537 ///    A pair of double-precision values are "unordered" with respect to each
538 ///    other if one or both values are NaN. Each comparison yields 0x0 for
539 ///    false, 0xFFFFFFFFFFFFFFFF for true.
540 ///
541 /// \headerfile <x86intrin.h>
542 ///
543 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
544 ///   instruction.
545 ///
546 /// \param __a
547 ///    A 128-bit vector of [2 x double].
548 /// \param __b
549 ///    A 128-bit vector of [2 x double].
550 /// \returns A 128-bit vector containing the comparison results.
_mm_cmpunord_pd(__m128d __a,__m128d __b)551 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a,
552                                                              __m128d __b) {
553   return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
554 }
555 
556 /// Compares each of the corresponding double-precision values of the
557 ///    128-bit vectors of [2 x double] to determine if the values in the first
558 ///    operand are unequal to those in the second operand.
559 ///
560 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
561 ///
562 /// \headerfile <x86intrin.h>
563 ///
564 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
565 ///
566 /// \param __a
567 ///    A 128-bit vector of [2 x double].
568 /// \param __b
569 ///    A 128-bit vector of [2 x double].
570 /// \returns A 128-bit vector containing the comparison results.
_mm_cmpneq_pd(__m128d __a,__m128d __b)571 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a,
572                                                            __m128d __b) {
573   return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
574 }
575 
576 /// Compares each of the corresponding double-precision values of the
577 ///    128-bit vectors of [2 x double] to determine if the values in the first
578 ///    operand are not less than those in the second operand.
579 ///
580 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
581 ///
582 /// \headerfile <x86intrin.h>
583 ///
584 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
585 ///
586 /// \param __a
587 ///    A 128-bit vector of [2 x double].
588 /// \param __b
589 ///    A 128-bit vector of [2 x double].
590 /// \returns A 128-bit vector containing the comparison results.
_mm_cmpnlt_pd(__m128d __a,__m128d __b)591 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a,
592                                                            __m128d __b) {
593   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
594 }
595 
596 /// Compares each of the corresponding double-precision values of the
597 ///    128-bit vectors of [2 x double] to determine if the values in the first
598 ///    operand are not less than or equal to those in the second operand.
599 ///
600 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
601 ///
602 /// \headerfile <x86intrin.h>
603 ///
604 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
605 ///
606 /// \param __a
607 ///    A 128-bit vector of [2 x double].
608 /// \param __b
609 ///    A 128-bit vector of [2 x double].
610 /// \returns A 128-bit vector containing the comparison results.
_mm_cmpnle_pd(__m128d __a,__m128d __b)611 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a,
612                                                            __m128d __b) {
613   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
614 }
615 
616 /// Compares each of the corresponding double-precision values of the
617 ///    128-bit vectors of [2 x double] to determine if the values in the first
618 ///    operand are not greater than those in the second operand.
619 ///
620 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
621 ///
622 /// \headerfile <x86intrin.h>
623 ///
624 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
625 ///
626 /// \param __a
627 ///    A 128-bit vector of [2 x double].
628 /// \param __b
629 ///    A 128-bit vector of [2 x double].
630 /// \returns A 128-bit vector containing the comparison results.
_mm_cmpngt_pd(__m128d __a,__m128d __b)631 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a,
632                                                            __m128d __b) {
633   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
634 }
635 
636 /// Compares each of the corresponding double-precision values of the
637 ///    128-bit vectors of [2 x double] to determine if the values in the first
638 ///    operand are not greater than or equal to those in the second operand.
639 ///
640 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
641 ///
642 /// \headerfile <x86intrin.h>
643 ///
644 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
645 ///
646 /// \param __a
647 ///    A 128-bit vector of [2 x double].
648 /// \param __b
649 ///    A 128-bit vector of [2 x double].
650 /// \returns A 128-bit vector containing the comparison results.
_mm_cmpnge_pd(__m128d __a,__m128d __b)651 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a,
652                                                            __m128d __b) {
653   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
654 }
655 
656 /// Compares the lower double-precision floating-point values in each of
657 ///    the two 128-bit floating-point vectors of [2 x double] for equality.
658 ///
659 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
660 ///
661 /// \headerfile <x86intrin.h>
662 ///
663 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
664 ///
665 /// \param __a
666 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
667 ///    compared to the lower double-precision value of \a __b.
668 /// \param __b
669 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
670 ///    compared to the lower double-precision value of \a __a.
671 /// \returns A 128-bit vector. The lower 64 bits contain the comparison
672 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmpeq_sd(__m128d __a,__m128d __b)673 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a,
674                                                           __m128d __b) {
675   return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
676 }
677 
678 /// Compares the lower double-precision floating-point values in each of
679 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
680 ///    the value in the first parameter is less than the corresponding value in
681 ///    the second parameter.
682 ///
683 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
684 ///
685 /// \headerfile <x86intrin.h>
686 ///
687 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
688 ///
689 /// \param __a
690 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
691 ///    compared to the lower double-precision value of \a __b.
692 /// \param __b
693 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
694 ///    compared to the lower double-precision value of \a __a.
695 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
696 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmplt_sd(__m128d __a,__m128d __b)697 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a,
698                                                           __m128d __b) {
699   return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
700 }
701 
702 /// Compares the lower double-precision floating-point values in each of
703 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
704 ///    the value in the first parameter is less than or equal to the
705 ///    corresponding value in the second parameter.
706 ///
707 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
708 ///
709 /// \headerfile <x86intrin.h>
710 ///
711 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
712 ///
713 /// \param __a
714 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
715 ///    compared to the lower double-precision value of \a __b.
716 /// \param __b
717 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
718 ///    compared to the lower double-precision value of \a __a.
719 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
720 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmple_sd(__m128d __a,__m128d __b)721 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a,
722                                                           __m128d __b) {
723   return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
724 }
725 
726 /// Compares the lower double-precision floating-point values in each of
727 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
728 ///    the value in the first parameter is greater than the corresponding value
729 ///    in the second parameter.
730 ///
731 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
732 ///
733 /// \headerfile <x86intrin.h>
734 ///
735 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
736 ///
737 /// \param __a
738 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
739 ///     compared to the lower double-precision value of \a __b.
740 /// \param __b
741 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
742 ///     compared to the lower double-precision value of \a __a.
743 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
744 ///     results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmpgt_sd(__m128d __a,__m128d __b)745 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a,
746                                                           __m128d __b) {
747   __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
748   return __extension__(__m128d){__c[0], __a[1]};
749 }
750 
751 /// Compares the lower double-precision floating-point values in each of
752 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
753 ///    the value in the first parameter is greater than or equal to the
754 ///    corresponding value in the second parameter.
755 ///
756 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
757 ///
758 /// \headerfile <x86intrin.h>
759 ///
760 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
761 ///
762 /// \param __a
763 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
764 ///    compared to the lower double-precision value of \a __b.
765 /// \param __b
766 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
767 ///    compared to the lower double-precision value of \a __a.
768 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
769 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmpge_sd(__m128d __a,__m128d __b)770 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a,
771                                                           __m128d __b) {
772   __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
773   return __extension__(__m128d){__c[0], __a[1]};
774 }
775 
776 /// Compares the lower double-precision floating-point values in each of
777 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
778 ///    the value in the first parameter is "ordered" with respect to the
779 ///    corresponding value in the second parameter.
780 ///
781 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
782 ///    of double-precision values are "ordered" with respect to each other if
783 ///    neither value is a NaN.
784 ///
785 /// \headerfile <x86intrin.h>
786 ///
787 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
788 ///
789 /// \param __a
790 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
791 ///    compared to the lower double-precision value of \a __b.
792 /// \param __b
793 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
794 ///    compared to the lower double-precision value of \a __a.
795 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
796 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmpord_sd(__m128d __a,__m128d __b)797 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a,
798                                                            __m128d __b) {
799   return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
800 }
801 
802 /// Compares the lower double-precision floating-point values in each of
803 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
804 ///    the value in the first parameter is "unordered" with respect to the
805 ///    corresponding value in the second parameter.
806 ///
807 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
808 ///    of double-precision values are "unordered" with respect to each other if
809 ///    one or both values are NaN.
810 ///
811 /// \headerfile <x86intrin.h>
812 ///
813 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
814 ///   instruction.
815 ///
816 /// \param __a
817 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
818 ///    compared to the lower double-precision value of \a __b.
819 /// \param __b
820 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
821 ///    compared to the lower double-precision value of \a __a.
822 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
823 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmpunord_sd(__m128d __a,__m128d __b)824 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a,
825                                                              __m128d __b) {
826   return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
827 }
828 
829 /// Compares the lower double-precision floating-point values in each of
830 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
831 ///    the value in the first parameter is unequal to the corresponding value in
832 ///    the second parameter.
833 ///
834 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
835 ///
836 /// \headerfile <x86intrin.h>
837 ///
838 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
839 ///
840 /// \param __a
841 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
842 ///    compared to the lower double-precision value of \a __b.
843 /// \param __b
844 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
845 ///    compared to the lower double-precision value of \a __a.
846 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
847 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmpneq_sd(__m128d __a,__m128d __b)848 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a,
849                                                            __m128d __b) {
850   return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
851 }
852 
853 /// Compares the lower double-precision floating-point values in each of
854 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
855 ///    the value in the first parameter is not less than the corresponding
856 ///    value in the second parameter.
857 ///
858 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
859 ///
860 /// \headerfile <x86intrin.h>
861 ///
862 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
863 ///
864 /// \param __a
865 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
866 ///    compared to the lower double-precision value of \a __b.
867 /// \param __b
868 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
869 ///    compared to the lower double-precision value of \a __a.
870 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
871 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmpnlt_sd(__m128d __a,__m128d __b)872 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a,
873                                                            __m128d __b) {
874   return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
875 }
876 
877 /// Compares the lower double-precision floating-point values in each of
878 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
879 ///    the value in the first parameter is not less than or equal to the
880 ///    corresponding value in the second parameter.
881 ///
882 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
883 ///
884 /// \headerfile <x86intrin.h>
885 ///
886 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
887 ///
888 /// \param __a
889 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
890 ///    compared to the lower double-precision value of \a __b.
891 /// \param __b
892 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
893 ///    compared to the lower double-precision value of \a __a.
894 /// \returns  A 128-bit vector. The lower 64 bits contains the comparison
895 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmpnle_sd(__m128d __a,__m128d __b)896 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a,
897                                                            __m128d __b) {
898   return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
899 }
900 
901 /// Compares the lower double-precision floating-point values in each of
902 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
903 ///    the value in the first parameter is not greater than the corresponding
904 ///    value in the second parameter.
905 ///
906 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
907 ///
908 /// \headerfile <x86intrin.h>
909 ///
910 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
911 ///
912 /// \param __a
913 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
914 ///    compared to the lower double-precision value of \a __b.
915 /// \param __b
916 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
917 ///    compared to the lower double-precision value of \a __a.
918 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
919 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmpngt_sd(__m128d __a,__m128d __b)920 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a,
921                                                            __m128d __b) {
922   __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
923   return __extension__(__m128d){__c[0], __a[1]};
924 }
925 
926 /// Compares the lower double-precision floating-point values in each of
927 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
928 ///    the value in the first parameter is not greater than or equal to the
929 ///    corresponding value in the second parameter.
930 ///
931 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
932 ///
933 /// \headerfile <x86intrin.h>
934 ///
935 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
936 ///
937 /// \param __a
938 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
939 ///    compared to the lower double-precision value of \a __b.
940 /// \param __b
941 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
942 ///    compared to the lower double-precision value of \a __a.
943 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
944 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
_mm_cmpnge_sd(__m128d __a,__m128d __b)945 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
946                                                            __m128d __b) {
947   __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
948   return __extension__(__m128d){__c[0], __a[1]};
949 }
950 
951 /// Compares the lower double-precision floating-point values in each of
952 ///    the two 128-bit floating-point vectors of [2 x double] for equality.
953 ///
954 ///    The comparison returns 0 for false, 1 for true. If either of the two
955 ///    lower double-precision values is NaN, returns 0.
956 ///
957 /// \headerfile <x86intrin.h>
958 ///
959 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
960 ///
961 /// \param __a
962 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
963 ///    compared to the lower double-precision value of \a __b.
964 /// \param __b
965 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
966 ///    compared to the lower double-precision value of \a __a.
967 /// \returns An integer containing the comparison results.
_mm_comieq_sd(__m128d __a,__m128d __b)968 static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
969                                                        __m128d __b) {
970   return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
971 }
972 
973 /// Compares the lower double-precision floating-point values in each of
974 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
975 ///    the value in the first parameter is less than the corresponding value in
976 ///    the second parameter.
977 ///
978 ///    The comparison returns 0 for false, 1 for true. If either of the two
979 ///    lower double-precision values is NaN, returns 0.
980 ///
981 /// \headerfile <x86intrin.h>
982 ///
983 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
984 ///
985 /// \param __a
986 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
987 ///    compared to the lower double-precision value of \a __b.
988 /// \param __b
989 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
990 ///    compared to the lower double-precision value of \a __a.
991 /// \returns An integer containing the comparison results.
_mm_comilt_sd(__m128d __a,__m128d __b)992 static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
993                                                        __m128d __b) {
994   return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
995 }
996 
997 /// Compares the lower double-precision floating-point values in each of
998 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
999 ///    the value in the first parameter is less than or equal to the
1000 ///    corresponding value in the second parameter.
1001 ///
1002 ///    The comparison returns 0 for false, 1 for true. If either of the two
1003 ///    lower double-precision values is NaN, returns 0.
1004 ///
1005 /// \headerfile <x86intrin.h>
1006 ///
1007 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1008 ///
1009 /// \param __a
1010 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1011 ///    compared to the lower double-precision value of \a __b.
1012 /// \param __b
1013 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
1014 ///     compared to the lower double-precision value of \a __a.
1015 /// \returns An integer containing the comparison results.
_mm_comile_sd(__m128d __a,__m128d __b)1016 static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
1017                                                        __m128d __b) {
1018   return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1019 }
1020 
1021 /// Compares the lower double-precision floating-point values in each of
1022 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1023 ///    the value in the first parameter is greater than the corresponding value
1024 ///    in the second parameter.
1025 ///
1026 ///    The comparison returns 0 for false, 1 for true. If either of the two
1027 ///    lower double-precision values is NaN, returns 0.
1028 ///
1029 /// \headerfile <x86intrin.h>
1030 ///
1031 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1032 ///
1033 /// \param __a
1034 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1035 ///    compared to the lower double-precision value of \a __b.
1036 /// \param __b
1037 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1038 ///    compared to the lower double-precision value of \a __a.
1039 /// \returns An integer containing the comparison results.
_mm_comigt_sd(__m128d __a,__m128d __b)1040 static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
1041                                                        __m128d __b) {
1042   return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1043 }
1044 
1045 /// Compares the lower double-precision floating-point values in each of
1046 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1047 ///    the value in the first parameter is greater than or equal to the
1048 ///    corresponding value in the second parameter.
1049 ///
1050 ///    The comparison returns 0 for false, 1 for true. If either of the two
1051 ///    lower double-precision values is NaN, returns 0.
1052 ///
1053 /// \headerfile <x86intrin.h>
1054 ///
1055 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1056 ///
1057 /// \param __a
1058 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1059 ///    compared to the lower double-precision value of \a __b.
1060 /// \param __b
1061 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1062 ///    compared to the lower double-precision value of \a __a.
1063 /// \returns An integer containing the comparison results.
_mm_comige_sd(__m128d __a,__m128d __b)1064 static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
1065                                                        __m128d __b) {
1066   return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1067 }
1068 
1069 /// Compares the lower double-precision floating-point values in each of
1070 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1071 ///    the value in the first parameter is unequal to the corresponding value in
1072 ///    the second parameter.
1073 ///
1074 ///    The comparison returns 0 for false, 1 for true. If either of the two
1075 ///    lower double-precision values is NaN, 1 is returned.
1076 ///
1077 /// \headerfile <x86intrin.h>
1078 ///
1079 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1080 ///
1081 /// \param __a
1082 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1083 ///    compared to the lower double-precision value of \a __b.
1084 /// \param __b
1085 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1086 ///    compared to the lower double-precision value of \a __a.
1087 /// \returns An integer containing the comparison results.
_mm_comineq_sd(__m128d __a,__m128d __b)1088 static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
1089                                                         __m128d __b) {
1090   return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1091 }
1092 
1093 /// Compares the lower double-precision floating-point values in each of
1094 ///    the two 128-bit floating-point vectors of [2 x double] for equality.
1095 ///
1096 ///    The comparison returns 0 for false, 1 for true. If either of the two
1097 ///    lower double-precision values is NaN, returns 0.
1098 ///
1099 /// \headerfile <x86intrin.h>
1100 ///
1101 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1102 ///
1103 /// \param __a
1104 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1105 ///    compared to the lower double-precision value of \a __b.
1106 /// \param __b
1107 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1108 ///    compared to the lower double-precision value of \a __a.
1109 /// \returns An integer containing the comparison results.
_mm_ucomieq_sd(__m128d __a,__m128d __b)1110 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
1111                                                         __m128d __b) {
1112   return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1113 }
1114 
1115 /// Compares the lower double-precision floating-point values in each of
1116 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1117 ///    the value in the first parameter is less than the corresponding value in
1118 ///    the second parameter.
1119 ///
1120 ///    The comparison returns 0 for false, 1 for true. If either of the two
1121 ///    lower double-precision values is NaN, returns 0.
1122 ///
1123 /// \headerfile <x86intrin.h>
1124 ///
1125 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1126 ///
1127 /// \param __a
1128 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1129 ///    compared to the lower double-precision value of \a __b.
1130 /// \param __b
1131 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1132 ///    compared to the lower double-precision value of \a __a.
1133 /// \returns An integer containing the comparison results.
_mm_ucomilt_sd(__m128d __a,__m128d __b)1134 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
1135                                                         __m128d __b) {
1136   return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1137 }
1138 
1139 /// Compares the lower double-precision floating-point values in each of
1140 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1141 ///    the value in the first parameter is less than or equal to the
1142 ///    corresponding value in the second parameter.
1143 ///
1144 ///    The comparison returns 0 for false, 1 for true. If either of the two
1145 ///    lower double-precision values is NaN, returns 0.
1146 ///
1147 /// \headerfile <x86intrin.h>
1148 ///
1149 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1150 ///
1151 /// \param __a
1152 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1153 ///    compared to the lower double-precision value of \a __b.
1154 /// \param __b
1155 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
1156 ///     compared to the lower double-precision value of \a __a.
1157 /// \returns An integer containing the comparison results.
_mm_ucomile_sd(__m128d __a,__m128d __b)1158 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
1159                                                         __m128d __b) {
1160   return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1161 }
1162 
1163 /// Compares the lower double-precision floating-point values in each of
1164 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1165 ///    the value in the first parameter is greater than the corresponding value
1166 ///    in the second parameter.
1167 ///
1168 ///    The comparison returns 0 for false, 1 for true. If either of the two
1169 ///    lower double-precision values is NaN, returns 0.
1170 ///
1171 /// \headerfile <x86intrin.h>
1172 ///
1173 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1174 ///
1175 /// \param __a
1176 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1177 ///    compared to the lower double-precision value of \a __b.
1178 /// \param __b
1179 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
1180 ///     compared to the lower double-precision value of \a __a.
1181 /// \returns An integer containing the comparison results.
_mm_ucomigt_sd(__m128d __a,__m128d __b)1182 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
1183                                                         __m128d __b) {
1184   return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1185 }
1186 
1187 /// Compares the lower double-precision floating-point values in each of
1188 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1189 ///    the value in the first parameter is greater than or equal to the
1190 ///    corresponding value in the second parameter.
1191 ///
1192 ///    The comparison returns 0 for false, 1 for true.  If either of the two
1193 ///    lower double-precision values is NaN, returns 0.
1194 ///
1195 /// \headerfile <x86intrin.h>
1196 ///
1197 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1198 ///
1199 /// \param __a
1200 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1201 ///    compared to the lower double-precision value of \a __b.
1202 /// \param __b
1203 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1204 ///    compared to the lower double-precision value of \a __a.
1205 /// \returns An integer containing the comparison results.
_mm_ucomige_sd(__m128d __a,__m128d __b)1206 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
1207                                                         __m128d __b) {
1208   return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1209 }
1210 
1211 /// Compares the lower double-precision floating-point values in each of
1212 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1213 ///    the value in the first parameter is unequal to the corresponding value in
1214 ///    the second parameter.
1215 ///
1216 ///    The comparison returns 0 for false, 1 for true. If either of the two
1217 ///    lower double-precision values is NaN, 1 is returned.
1218 ///
1219 /// \headerfile <x86intrin.h>
1220 ///
1221 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1222 ///
1223 /// \param __a
1224 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1225 ///    compared to the lower double-precision value of \a __b.
1226 /// \param __b
1227 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1228 ///    compared to the lower double-precision value of \a __a.
1229 /// \returns An integer containing the comparison result.
_mm_ucomineq_sd(__m128d __a,__m128d __b)1230 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
1231                                                          __m128d __b) {
1232   return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1233 }
1234 
1235 /// Converts the two double-precision floating-point elements of a
1236 ///    128-bit vector of [2 x double] into two single-precision floating-point
1237 ///    values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1238 ///    The upper 64 bits of the result vector are set to zero.
1239 ///
1240 /// \headerfile <x86intrin.h>
1241 ///
1242 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1243 ///
1244 /// \param __a
1245 ///    A 128-bit vector of [2 x double].
1246 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1247 ///    converted values. The upper 64 bits are set to zero.
_mm_cvtpd_ps(__m128d __a)1248 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
1249   return __builtin_ia32_cvtpd2ps((__v2df)__a);
1250 }
1251 
1252 /// Converts the lower two single-precision floating-point elements of a
1253 ///    128-bit vector of [4 x float] into two double-precision floating-point
1254 ///    values, returned in a 128-bit vector of [2 x double]. The upper two
1255 ///    elements of the input vector are unused.
1256 ///
1257 /// \headerfile <x86intrin.h>
1258 ///
1259 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1260 ///
1261 /// \param __a
1262 ///    A 128-bit vector of [4 x float]. The lower two single-precision
1263 ///    floating-point elements are converted to double-precision values. The
1264 ///    upper two elements are unused.
1265 /// \returns A 128-bit vector of [2 x double] containing the converted values.
_mm_cvtps_pd(__m128 __a)1266 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
1267   return (__m128d) __builtin_convertvector(
1268       __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1269 }
1270 
1271 /// Converts the lower two integer elements of a 128-bit vector of
1272 ///    [4 x i32] into two double-precision floating-point values, returned in a
1273 ///    128-bit vector of [2 x double].
1274 ///
1275 ///    The upper two elements of the input vector are unused.
1276 ///
1277 /// \headerfile <x86intrin.h>
1278 ///
1279 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1280 ///
1281 /// \param __a
1282 ///    A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1283 ///    converted to double-precision values.
1284 ///
1285 ///    The upper two elements are unused.
1286 /// \returns A 128-bit vector of [2 x double] containing the converted values.
_mm_cvtepi32_pd(__m128i __a)1287 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
1288   return (__m128d) __builtin_convertvector(
1289       __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1290 }
1291 
1292 /// Converts the two double-precision floating-point elements of a
1293 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1294 ///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1295 ///    64 bits of the result vector are set to zero.
1296 ///
1297 /// \headerfile <x86intrin.h>
1298 ///
1299 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1300 ///
1301 /// \param __a
1302 ///    A 128-bit vector of [2 x double].
1303 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1304 ///    converted values. The upper 64 bits are set to zero.
_mm_cvtpd_epi32(__m128d __a)1305 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
1306   return __builtin_ia32_cvtpd2dq((__v2df)__a);
1307 }
1308 
1309 /// Converts the low-order element of a 128-bit vector of [2 x double]
1310 ///    into a 32-bit signed integer value.
1311 ///
1312 /// \headerfile <x86intrin.h>
1313 ///
1314 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1315 ///
1316 /// \param __a
1317 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1318 ///    conversion.
1319 /// \returns A 32-bit signed integer containing the converted value.
_mm_cvtsd_si32(__m128d __a)1320 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
1321   return __builtin_ia32_cvtsd2si((__v2df)__a);
1322 }
1323 
1324 /// Converts the lower double-precision floating-point element of a
1325 ///    128-bit vector of [2 x double], in the second parameter, into a
1326 ///    single-precision floating-point value, returned in the lower 32 bits of a
1327 ///    128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1328 ///    copied from the upper 96 bits of the first parameter.
1329 ///
1330 /// \headerfile <x86intrin.h>
1331 ///
1332 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1333 ///
1334 /// \param __a
1335 ///    A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1336 ///    copied to the upper 96 bits of the result.
1337 /// \param __b
1338 ///    A 128-bit vector of [2 x double]. The lower double-precision
1339 ///    floating-point element is used in the conversion.
1340 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1341 ///    converted value from the second parameter. The upper 96 bits are copied
1342 ///    from the upper 96 bits of the first parameter.
_mm_cvtsd_ss(__m128 __a,__m128d __b)1343 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
1344                                                          __m128d __b) {
1345   return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1346 }
1347 
1348 /// Converts a 32-bit signed integer value, in the second parameter, into
1349 ///    a double-precision floating-point value, returned in the lower 64 bits of
1350 ///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1351 ///    are copied from the upper 64 bits of the first parameter.
1352 ///
1353 /// \headerfile <x86intrin.h>
1354 ///
1355 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1356 ///
1357 /// \param __a
1358 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1359 ///    copied to the upper 64 bits of the result.
1360 /// \param __b
1361 ///    A 32-bit signed integer containing the value to be converted.
1362 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1363 ///    converted value from the second parameter. The upper 64 bits are copied
1364 ///    from the upper 64 bits of the first parameter.
_mm_cvtsi32_sd(__m128d __a,int __b)1365 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
1366                                                             int __b) {
1367   __a[0] = __b;
1368   return __a;
1369 }
1370 
1371 /// Converts the lower single-precision floating-point element of a
1372 ///    128-bit vector of [4 x float], in the second parameter, into a
1373 ///    double-precision floating-point value, returned in the lower 64 bits of
1374 ///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1375 ///    are copied from the upper 64 bits of the first parameter.
1376 ///
1377 /// \headerfile <x86intrin.h>
1378 ///
1379 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1380 ///
1381 /// \param __a
1382 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1383 ///    copied to the upper 64 bits of the result.
1384 /// \param __b
1385 ///    A 128-bit vector of [4 x float]. The lower single-precision
1386 ///    floating-point element is used in the conversion.
1387 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1388 ///    converted value from the second parameter. The upper 64 bits are copied
1389 ///    from the upper 64 bits of the first parameter.
_mm_cvtss_sd(__m128d __a,__m128 __b)1390 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
1391                                                           __m128 __b) {
1392   __a[0] = __b[0];
1393   return __a;
1394 }
1395 
1396 /// Converts the two double-precision floating-point elements of a
1397 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1398 ///    returned in the lower 64 bits of a 128-bit vector of [4 x i32].
1399 ///
1400 ///    If the result of either conversion is inexact, the result is truncated
1401 ///    (rounded towards zero) regardless of the current MXCSR setting. The upper
1402 ///    64 bits of the result vector are set to zero.
1403 ///
1404 /// \headerfile <x86intrin.h>
1405 ///
1406 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1407 ///   instruction.
1408 ///
1409 /// \param __a
1410 ///    A 128-bit vector of [2 x double].
1411 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1412 ///    converted values. The upper 64 bits are set to zero.
_mm_cvttpd_epi32(__m128d __a)1413 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
1414   return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1415 }
1416 
1417 /// Converts the low-order element of a [2 x double] vector into a 32-bit
1418 ///    signed integer value, truncating the result when it is inexact.
1419 ///
1420 /// \headerfile <x86intrin.h>
1421 ///
1422 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1423 ///   instruction.
1424 ///
1425 /// \param __a
1426 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1427 ///    conversion.
1428 /// \returns A 32-bit signed integer containing the converted value.
_mm_cvttsd_si32(__m128d __a)1429 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
1430   return __builtin_ia32_cvttsd2si((__v2df)__a);
1431 }
1432 
1433 /// Converts the two double-precision floating-point elements of a
1434 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1435 ///    returned in a 64-bit vector of [2 x i32].
1436 ///
1437 /// \headerfile <x86intrin.h>
1438 ///
1439 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1440 ///
1441 /// \param __a
1442 ///    A 128-bit vector of [2 x double].
1443 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
_mm_cvtpd_pi32(__m128d __a)1444 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
1445   return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
1446 }
1447 
1448 /// Converts the two double-precision floating-point elements of a
1449 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1450 ///    returned in a 64-bit vector of [2 x i32].
1451 ///
1452 ///    If the result of either conversion is inexact, the result is truncated
1453 ///    (rounded towards zero) regardless of the current MXCSR setting.
1454 ///
1455 /// \headerfile <x86intrin.h>
1456 ///
1457 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1458 ///
1459 /// \param __a
1460 ///    A 128-bit vector of [2 x double].
1461 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
_mm_cvttpd_pi32(__m128d __a)1462 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
1463   return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
1464 }
1465 
1466 /// Converts the two signed 32-bit integer elements of a 64-bit vector of
1467 ///    [2 x i32] into two double-precision floating-point values, returned in a
1468 ///    128-bit vector of [2 x double].
1469 ///
1470 /// \headerfile <x86intrin.h>
1471 ///
1472 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1473 ///
1474 /// \param __a
1475 ///    A 64-bit vector of [2 x i32].
1476 /// \returns A 128-bit vector of [2 x double] containing the converted values.
_mm_cvtpi32_pd(__m64 __a)1477 static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
1478   return __builtin_ia32_cvtpi2pd((__v2si)__a);
1479 }
1480 
1481 /// Returns the low-order element of a 128-bit vector of [2 x double] as
1482 ///    a double-precision floating-point value.
1483 ///
1484 /// \headerfile <x86intrin.h>
1485 ///
1486 /// This intrinsic has no corresponding instruction.
1487 ///
1488 /// \param __a
1489 ///    A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1490 /// \returns A double-precision floating-point value copied from the lower 64
1491 ///    bits of \a __a.
_mm_cvtsd_f64(__m128d __a)1492 static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) {
1493   return __a[0];
1494 }
1495 
1496 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1497 ///    memory location.
1498 ///
1499 /// \headerfile <x86intrin.h>
1500 ///
1501 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1502 ///
1503 /// \param __dp
1504 ///    A pointer to a 128-bit memory location. The address of the memory
1505 ///    location has to be 16-byte aligned.
1506 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
_mm_load_pd(double const * __dp)1507 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
1508   return *(const __m128d *)__dp;
1509 }
1510 
1511 /// Loads a double-precision floating-point value from a specified memory
1512 ///    location and duplicates it to both vector elements of a 128-bit vector of
1513 ///    [2 x double].
1514 ///
1515 /// \headerfile <x86intrin.h>
1516 ///
1517 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1518 ///
1519 /// \param __dp
1520 ///    A pointer to a memory location containing a double-precision value.
1521 /// \returns A 128-bit vector of [2 x double] containing the loaded and
1522 ///    duplicated values.
_mm_load1_pd(double const * __dp)1523 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
1524   struct __mm_load1_pd_struct {
1525     double __u;
1526   } __attribute__((__packed__, __may_alias__));
1527   double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u;
1528   return __extension__(__m128d){__u, __u};
1529 }
1530 
/* Alternate spelling of _mm_load1_pd; forwards its argument unchanged. */
#define _mm_load_pd1(__dp) _mm_load1_pd(__dp)
1532 
1533 /// Loads two double-precision values, in reverse order, from an aligned
1534 ///    memory location into a 128-bit vector of [2 x double].
1535 ///
1536 /// \headerfile <x86intrin.h>
1537 ///
1538 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1539 /// needed shuffling instructions. In AVX mode, the shuffling may be combined
1540 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1541 ///
1542 /// \param __dp
1543 ///    A 16-byte aligned pointer to an array of double-precision values to be
1544 ///    loaded in reverse order.
1545 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1546 ///    values.
_mm_loadr_pd(double const * __dp)1547 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
1548   __m128d __u = *(const __m128d *)__dp;
1549   return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1550 }
1551 
1552 /// Loads a 128-bit floating-point vector of [2 x double] from an
1553 ///    unaligned memory location.
1554 ///
1555 /// \headerfile <x86intrin.h>
1556 ///
1557 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1558 ///
1559 /// \param __dp
1560 ///    A pointer to a 128-bit memory location. The address of the memory
1561 ///    location does not have to be aligned.
1562 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
_mm_loadu_pd(double const * __dp)1563 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
1564   struct __loadu_pd {
1565     __m128d_u __v;
1566   } __attribute__((__packed__, __may_alias__));
1567   return ((const struct __loadu_pd *)__dp)->__v;
1568 }
1569 
1570 /// Loads a 64-bit integer value to the low element of a 128-bit integer
1571 ///    vector and clears the upper element.
1572 ///
1573 /// \headerfile <x86intrin.h>
1574 ///
1575 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1576 ///
1577 /// \param __a
1578 ///    A pointer to a 64-bit memory location. The address of the memory
1579 ///    location does not have to be aligned.
1580 /// \returns A 128-bit vector of [2 x i64] containing the loaded value.
_mm_loadu_si64(void const * __a)1581 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
1582   struct __loadu_si64 {
1583     long long __v;
1584   } __attribute__((__packed__, __may_alias__));
1585   long long __u = ((const struct __loadu_si64 *)__a)->__v;
1586   return __extension__(__m128i)(__v2di){__u, 0LL};
1587 }
1588 
1589 /// Loads a 32-bit integer value to the low element of a 128-bit integer
///    vector and clears the upper elements.
1591 ///
1592 /// \headerfile <x86intrin.h>
1593 ///
1594 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1595 ///
1596 /// \param __a
1597 ///    A pointer to a 32-bit memory location. The address of the memory
1598 ///    location does not have to be aligned.
1599 /// \returns A 128-bit vector of [4 x i32] containing the loaded value.
_mm_loadu_si32(void const * __a)1600 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
1601   struct __loadu_si32 {
1602     int __v;
1603   } __attribute__((__packed__, __may_alias__));
1604   int __u = ((const struct __loadu_si32 *)__a)->__v;
1605   return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
1606 }
1607 
1608 /// Loads a 16-bit integer value to the low element of a 128-bit integer
///    vector and clears the upper elements.
1610 ///
1611 /// \headerfile <x86intrin.h>
1612 ///
1613 /// This intrinsic does not correspond to a specific instruction.
1614 ///
1615 /// \param __a
1616 ///    A pointer to a 16-bit memory location. The address of the memory
1617 ///    location does not have to be aligned.
1618 /// \returns A 128-bit vector of [8 x i16] containing the loaded value.
_mm_loadu_si16(void const * __a)1619 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
1620   struct __loadu_si16 {
1621     short __v;
1622   } __attribute__((__packed__, __may_alias__));
1623   short __u = ((const struct __loadu_si16 *)__a)->__v;
1624   return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1625 }
1626 
1627 /// Loads a 64-bit double-precision value to the low element of a
///    128-bit vector of [2 x double] and clears the upper element.
1629 ///
1630 /// \headerfile <x86intrin.h>
1631 ///
1632 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1633 ///
1634 /// \param __dp
1635 ///    A pointer to a memory location containing a double-precision value.
1636 ///    The address of the memory location does not have to be aligned.
1637 /// \returns A 128-bit vector of [2 x double] containing the loaded value.
_mm_load_sd(double const * __dp)1638 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
1639   struct __mm_load_sd_struct {
1640     double __u;
1641   } __attribute__((__packed__, __may_alias__));
1642   double __u = ((const struct __mm_load_sd_struct *)__dp)->__u;
1643   return __extension__(__m128d){__u, 0};
1644 }
1645 
1646 /// Loads a double-precision value into the high-order bits of a 128-bit
1647 ///    vector of [2 x double]. The low-order bits are copied from the low-order
1648 ///    bits of the first operand.
1649 ///
1650 /// \headerfile <x86intrin.h>
1651 ///
1652 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1653 ///
1654 /// \param __a
1655 ///    A 128-bit vector of [2 x double]. \n
1656 ///    Bits [63:0] are written to bits [63:0] of the result.
1657 /// \param __dp
1658 ///    A pointer to a 64-bit memory location containing a double-precision
1659 ///    floating-point value that is loaded. The loaded value is written to bits
1660 ///    [127:64] of the result. The address of the memory location does not have
1661 ///    to be aligned.
1662 /// \returns A 128-bit vector of [2 x double] containing the moved values.
_mm_loadh_pd(__m128d __a,double const * __dp)1663 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
1664                                                           double const *__dp) {
1665   struct __mm_loadh_pd_struct {
1666     double __u;
1667   } __attribute__((__packed__, __may_alias__));
1668   double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u;
1669   return __extension__(__m128d){__a[0], __u};
1670 }
1671 
1672 /// Loads a double-precision value into the low-order bits of a 128-bit
1673 ///    vector of [2 x double]. The high-order bits are copied from the
1674 ///    high-order bits of the first operand.
1675 ///
1676 /// \headerfile <x86intrin.h>
1677 ///
1678 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1679 ///
1680 /// \param __a
1681 ///    A 128-bit vector of [2 x double]. \n
1682 ///    Bits [127:64] are written to bits [127:64] of the result.
1683 /// \param __dp
1684 ///    A pointer to a 64-bit memory location containing a double-precision
1685 ///    floating-point value that is loaded. The loaded value is written to bits
1686 ///    [63:0] of the result. The address of the memory location does not have to
1687 ///    be aligned.
1688 /// \returns A 128-bit vector of [2 x double] containing the moved values.
_mm_loadl_pd(__m128d __a,double const * __dp)1689 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
1690                                                           double const *__dp) {
1691   struct __mm_loadl_pd_struct {
1692     double __u;
1693   } __attribute__((__packed__, __may_alias__));
1694   double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u;
1695   return __extension__(__m128d){__u, __a[1]};
1696 }
1697 
1698 /// Constructs a 128-bit floating-point vector of [2 x double] with
1699 ///    unspecified content. This could be used as an argument to another
1700 ///    intrinsic function where the argument is required but the value is not
1701 ///    actually used.
1702 ///
1703 /// \headerfile <x86intrin.h>
1704 ///
1705 /// This intrinsic has no corresponding instruction.
1706 ///
1707 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1708 ///    content.
_mm_undefined_pd(void)1709 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
1710   return (__m128d)__builtin_ia32_undef128();
1711 }
1712 
1713 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1714 ///    64 bits of the vector are initialized with the specified double-precision
1715 ///    floating-point value. The upper 64 bits are set to zero.
1716 ///
1717 /// \headerfile <x86intrin.h>
1718 ///
1719 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1720 ///
1721 /// \param __w
1722 ///    A double-precision floating-point value used to initialize the lower 64
1723 ///    bits of the result.
1724 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1725 ///    lower 64 bits contain the value of the parameter. The upper 64 bits are
1726 ///    set to zero.
_mm_set_sd(double __w)1727 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
1728   return __extension__(__m128d){__w, 0};
1729 }
1730 
1731 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1732 ///    of the two double-precision floating-point vector elements set to the
1733 ///    specified double-precision floating-point value.
1734 ///
1735 /// \headerfile <x86intrin.h>
1736 ///
1737 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1738 ///
1739 /// \param __w
1740 ///    A double-precision floating-point value used to initialize each vector
1741 ///    element of the result.
1742 /// \returns An initialized 128-bit floating-point vector of [2 x double].
_mm_set1_pd(double __w)1743 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
1744   return __extension__(__m128d){__w, __w};
1745 }
1746 
1747 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1748 ///    of the two double-precision floating-point vector elements set to the
1749 ///    specified double-precision floating-point value.
1750 ///
1751 /// \headerfile <x86intrin.h>
1752 ///
1753 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1754 ///
1755 /// \param __w
1756 ///    A double-precision floating-point value used to initialize each vector
1757 ///    element of the result.
1758 /// \returns An initialized 128-bit floating-point vector of [2 x double].
_mm_set_pd1(double __w)1759 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
1760   return _mm_set1_pd(__w);
1761 }
1762 
1763 /// Constructs a 128-bit floating-point vector of [2 x double]
1764 ///    initialized with the specified double-precision floating-point values.
1765 ///
1766 /// \headerfile <x86intrin.h>
1767 ///
1768 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1769 ///
1770 /// \param __w
1771 ///    A double-precision floating-point value used to initialize the upper 64
1772 ///    bits of the result.
1773 /// \param __x
1774 ///    A double-precision floating-point value used to initialize the lower 64
1775 ///    bits of the result.
1776 /// \returns An initialized 128-bit floating-point vector of [2 x double].
_mm_set_pd(double __w,double __x)1777 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
1778                                                         double __x) {
1779   return __extension__(__m128d){__x, __w};
1780 }
1781 
1782 /// Constructs a 128-bit floating-point vector of [2 x double],
1783 ///    initialized in reverse order with the specified double-precision
1784 ///    floating-point values.
1785 ///
1786 /// \headerfile <x86intrin.h>
1787 ///
1788 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1789 ///
1790 /// \param __w
1791 ///    A double-precision floating-point value used to initialize the lower 64
1792 ///    bits of the result.
1793 /// \param __x
1794 ///    A double-precision floating-point value used to initialize the upper 64
1795 ///    bits of the result.
1796 /// \returns An initialized 128-bit floating-point vector of [2 x double].
_mm_setr_pd(double __w,double __x)1797 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
1798                                                          double __x) {
1799   return __extension__(__m128d){__w, __x};
1800 }
1801 
1802 /// Constructs a 128-bit floating-point vector of [2 x double]
1803 ///    initialized to zero.
1804 ///
1805 /// \headerfile <x86intrin.h>
1806 ///
1807 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1808 ///
1809 /// \returns An initialized 128-bit floating-point vector of [2 x double] with
1810 ///    all elements set to zero.
_mm_setzero_pd(void)1811 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
1812   return __extension__(__m128d){0.0, 0.0};
1813 }
1814 
1815 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1816 ///    64 bits are set to the lower 64 bits of the second parameter. The upper
1817 ///    64 bits are set to the upper 64 bits of the first parameter.
1818 ///
1819 /// \headerfile <x86intrin.h>
1820 ///
1821 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1822 ///
1823 /// \param __a
1824 ///    A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1825 ///    upper 64 bits of the result.
1826 /// \param __b
1827 ///    A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1828 ///    lower 64 bits of the result.
1829 /// \returns A 128-bit vector of [2 x double] containing the moved values.
_mm_move_sd(__m128d __a,__m128d __b)1830 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a,
1831                                                          __m128d __b) {
1832   __a[0] = __b[0];
1833   return __a;
1834 }
1835 
1836 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1837 ///    memory location.
1838 ///
1839 /// \headerfile <x86intrin.h>
1840 ///
1841 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1842 ///
1843 /// \param __dp
1844 ///    A pointer to a 64-bit memory location.
1845 /// \param __a
1846 ///    A 128-bit vector of [2 x double] containing the value to be stored.
_mm_store_sd(double * __dp,__m128d __a)1847 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
1848                                                        __m128d __a) {
1849   struct __mm_store_sd_struct {
1850     double __u;
1851   } __attribute__((__packed__, __may_alias__));
1852   ((struct __mm_store_sd_struct *)__dp)->__u = __a[0];
1853 }
1854 
1855 /// Moves packed double-precision values from a 128-bit vector of
1856 ///    [2 x double] to a memory location.
1857 ///
1858 /// \headerfile <x86intrin.h>
1859 ///
1860 /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1861 ///
1862 /// \param __dp
1863 ///    A pointer to an aligned memory location that can store two
1864 ///    double-precision values.
1865 /// \param __a
1866 ///    A packed 128-bit vector of [2 x double] containing the values to be
1867 ///    moved.
_mm_store_pd(double * __dp,__m128d __a)1868 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
1869                                                        __m128d __a) {
1870   *(__m128d *)__dp = __a;
1871 }
1872 
1873 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1874 ///    the upper and lower 64 bits of a memory location.
1875 ///
1876 /// \headerfile <x86intrin.h>
1877 ///
1878 /// This intrinsic corresponds to the
1879 ///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1880 ///
1881 /// \param __dp
1882 ///    A pointer to a memory location that can store two double-precision
1883 ///    values.
1884 /// \param __a
1885 ///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1886 ///    of the values in \a __dp.
_mm_store1_pd(double * __dp,__m128d __a)1887 static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
1888                                                         __m128d __a) {
1889   __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1890   _mm_store_pd(__dp, __a);
1891 }
1892 
1893 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1894 ///    the upper and lower 64 bits of a memory location.
1895 ///
1896 /// \headerfile <x86intrin.h>
1897 ///
1898 /// This intrinsic corresponds to the
1899 ///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1900 ///
1901 /// \param __dp
1902 ///    A pointer to a memory location that can store two double-precision
1903 ///    values.
1904 /// \param __a
1905 ///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1906 ///    of the values in \a __dp.
_mm_store_pd1(double * __dp,__m128d __a)1907 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
1908                                                         __m128d __a) {
1909   _mm_store1_pd(__dp, __a);
1910 }
1911 
1912 /// Stores a 128-bit vector of [2 x double] into an unaligned memory
1913 ///    location.
1914 ///
1915 /// \headerfile <x86intrin.h>
1916 ///
1917 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1918 ///
1919 /// \param __dp
1920 ///    A pointer to a 128-bit memory location. The address of the memory
1921 ///    location does not have to be aligned.
1922 /// \param __a
1923 ///    A 128-bit vector of [2 x double] containing the values to be stored.
_mm_storeu_pd(double * __dp,__m128d __a)1924 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
1925                                                         __m128d __a) {
1926   struct __storeu_pd {
1927     __m128d_u __v;
1928   } __attribute__((__packed__, __may_alias__));
1929   ((struct __storeu_pd *)__dp)->__v = __a;
1930 }
1931 
1932 /// Stores two double-precision values, in reverse order, from a 128-bit
1933 ///    vector of [2 x double] to a 16-byte aligned memory location.
1934 ///
1935 /// \headerfile <x86intrin.h>
1936 ///
1937 /// This intrinsic corresponds to a shuffling instruction followed by a
1938 /// <c> VMOVAPD / MOVAPD </c> instruction.
1939 ///
1940 /// \param __dp
1941 ///    A pointer to a 16-byte aligned memory location that can store two
1942 ///    double-precision values.
1943 /// \param __a
1944 ///    A 128-bit vector of [2 x double] containing the values to be reversed and
1945 ///    stored.
_mm_storer_pd(double * __dp,__m128d __a)1946 static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
1947                                                         __m128d __a) {
1948   __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
1949   *(__m128d *)__dp = __a;
1950 }
1951 
1952 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
1953 ///    memory location.
1954 ///
1955 /// \headerfile <x86intrin.h>
1956 ///
1957 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1958 ///
1959 /// \param __dp
1960 ///    A pointer to a 64-bit memory location.
1961 /// \param __a
1962 ///    A 128-bit vector of [2 x double] containing the value to be stored.
_mm_storeh_pd(double * __dp,__m128d __a)1963 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
1964                                                         __m128d __a) {
1965   struct __mm_storeh_pd_struct {
1966     double __u;
1967   } __attribute__((__packed__, __may_alias__));
1968   ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1];
1969 }
1970 
1971 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1972 ///    memory location.
1973 ///
1974 /// \headerfile <x86intrin.h>
1975 ///
1976 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1977 ///
1978 /// \param __dp
1979 ///    A pointer to a 64-bit memory location.
1980 /// \param __a
1981 ///    A 128-bit vector of [2 x double] containing the value to be stored.
_mm_storel_pd(double * __dp,__m128d __a)1982 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
1983                                                         __m128d __a) {
1984   struct __mm_storeh_pd_struct {
1985     double __u;
1986   } __attribute__((__packed__, __may_alias__));
1987   ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0];
1988 }
1989 
1990 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
1991 ///    saving the lower 8 bits of each sum in the corresponding element of a
1992 ///    128-bit result vector of [16 x i8].
1993 ///
1994 ///    The integer elements of both parameters can be either signed or unsigned.
1995 ///
1996 /// \headerfile <x86intrin.h>
1997 ///
1998 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
1999 ///
2000 /// \param __a
2001 ///    A 128-bit vector of [16 x i8].
2002 /// \param __b
2003 ///    A 128-bit vector of [16 x i8].
2004 /// \returns A 128-bit vector of [16 x i8] containing the sums of both
2005 ///    parameters.
_mm_add_epi8(__m128i __a,__m128i __b)2006 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a,
2007                                                           __m128i __b) {
2008   return (__m128i)((__v16qu)__a + (__v16qu)__b);
2009 }
2010 
2011 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2012 ///    saving the lower 16 bits of each sum in the corresponding element of a
2013 ///    128-bit result vector of [8 x i16].
2014 ///
2015 ///    The integer elements of both parameters can be either signed or unsigned.
2016 ///
2017 /// \headerfile <x86intrin.h>
2018 ///
2019 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2020 ///
2021 /// \param __a
2022 ///    A 128-bit vector of [8 x i16].
2023 /// \param __b
2024 ///    A 128-bit vector of [8 x i16].
2025 /// \returns A 128-bit vector of [8 x i16] containing the sums of both
2026 ///    parameters.
_mm_add_epi16(__m128i __a,__m128i __b)2027 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
2028                                                            __m128i __b) {
2029   return (__m128i)((__v8hu)__a + (__v8hu)__b);
2030 }
2031 
2032 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2033 ///    saving the lower 32 bits of each sum in the corresponding element of a
2034 ///    128-bit result vector of [4 x i32].
2035 ///
2036 ///    The integer elements of both parameters can be either signed or unsigned.
2037 ///
2038 /// \headerfile <x86intrin.h>
2039 ///
2040 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2041 ///
2042 /// \param __a
2043 ///    A 128-bit vector of [4 x i32].
2044 /// \param __b
2045 ///    A 128-bit vector of [4 x i32].
2046 /// \returns A 128-bit vector of [4 x i32] containing the sums of both
2047 ///    parameters.
_mm_add_epi32(__m128i __a,__m128i __b)2048 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
2049                                                            __m128i __b) {
2050   return (__m128i)((__v4su)__a + (__v4su)__b);
2051 }
2052 
2053 /// Adds two signed or unsigned 64-bit integer values, returning the
2054 ///    lower 64 bits of the sum.
2055 ///
2056 /// \headerfile <x86intrin.h>
2057 ///
2058 /// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2059 ///
2060 /// \param __a
2061 ///    A 64-bit integer.
2062 /// \param __b
2063 ///    A 64-bit integer.
2064 /// \returns A 64-bit integer containing the sum of both parameters.
_mm_add_si64(__m64 __a,__m64 __b)2065 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
2066                                                             __m64 __b) {
2067   return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
2068 }
2069 
2070 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2071 ///    saving the lower 64 bits of each sum in the corresponding element of a
2072 ///    128-bit result vector of [2 x i64].
2073 ///
2074 ///    The integer elements of both parameters can be either signed or unsigned.
2075 ///
2076 /// \headerfile <x86intrin.h>
2077 ///
2078 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2079 ///
2080 /// \param __a
2081 ///    A 128-bit vector of [2 x i64].
2082 /// \param __b
2083 ///    A 128-bit vector of [2 x i64].
2084 /// \returns A 128-bit vector of [2 x i64] containing the sums of both
2085 ///    parameters.
_mm_add_epi64(__m128i __a,__m128i __b)2086 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
2087                                                            __m128i __b) {
2088   return (__m128i)((__v2du)__a + (__v2du)__b);
2089 }
2090 
2091 /// Adds, with saturation, the corresponding elements of two 128-bit
2092 ///    signed [16 x i8] vectors, saving each sum in the corresponding element
2093 ///    of a 128-bit result vector of [16 x i8].
2094 ///
2095 ///    Positive sums greater than 0x7F are saturated to 0x7F. Negative sums
2096 ///    less than 0x80 are saturated to 0x80.
2097 ///
2098 /// \headerfile <x86intrin.h>
2099 ///
2100 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2101 ///
2102 /// \param __a
2103 ///    A 128-bit signed [16 x i8] vector.
2104 /// \param __b
2105 ///    A 128-bit signed [16 x i8] vector.
2106 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2107 ///    both parameters.
_mm_adds_epi8(__m128i __a,__m128i __b)2108 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
2109                                                            __m128i __b) {
2110   return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
2111 }
2112 
2113 /// Adds, with saturation, the corresponding elements of two 128-bit
2114 ///    signed [8 x i16] vectors, saving each sum in the corresponding element
2115 ///    of a 128-bit result vector of [8 x i16].
2116 ///
2117 ///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
2118 ///    less than 0x8000 are saturated to 0x8000.
2119 ///
2120 /// \headerfile <x86intrin.h>
2121 ///
2122 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2123 ///
2124 /// \param __a
2125 ///    A 128-bit signed [8 x i16] vector.
2126 /// \param __b
2127 ///    A 128-bit signed [8 x i16] vector.
2128 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2129 ///    both parameters.
_mm_adds_epi16(__m128i __a,__m128i __b)2130 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
2131                                                             __m128i __b) {
2132   return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
2133 }
2134 
2135 /// Adds, with saturation, the corresponding elements of two 128-bit
2136 ///    unsigned [16 x i8] vectors, saving each sum in the corresponding element
2137 ///    of a 128-bit result vector of [16 x i8].
2138 ///
2139 ///    Positive sums greater than 0xFF are saturated to 0xFF. Negative sums are
2140 ///    saturated to 0x00.
2141 ///
2142 /// \headerfile <x86intrin.h>
2143 ///
2144 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2145 ///
2146 /// \param __a
2147 ///    A 128-bit unsigned [16 x i8] vector.
2148 /// \param __b
2149 ///    A 128-bit unsigned [16 x i8] vector.
2150 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2151 ///    of both parameters.
_mm_adds_epu8(__m128i __a,__m128i __b)2152 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
2153                                                            __m128i __b) {
2154   return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
2155 }
2156 
2157 /// Adds, with saturation, the corresponding elements of two 128-bit
2158 ///    unsigned [8 x i16] vectors, saving each sum in the corresponding element
2159 ///    of a 128-bit result vector of [8 x i16].
2160 ///
2161 ///    Positive sums greater than 0xFFFF are saturated to 0xFFFF. Negative sums
2162 ///    are saturated to 0x0000.
2163 ///
2164 /// \headerfile <x86intrin.h>
2165 ///
/// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
2167 ///
2168 /// \param __a
2169 ///    A 128-bit unsigned [8 x i16] vector.
2170 /// \param __b
2171 ///    A 128-bit unsigned [8 x i16] vector.
2172 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2173 ///    of both parameters.
_mm_adds_epu16(__m128i __a,__m128i __b)2174 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a,
2175                                                             __m128i __b) {
2176   return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
2177 }
2178 
2179 /// Computes the rounded averages of corresponding elements of two
2180 ///    128-bit unsigned [16 x i8] vectors, saving each result in the
2181 ///    corresponding element of a 128-bit result vector of [16 x i8].
2182 ///
2183 /// \headerfile <x86intrin.h>
2184 ///
2185 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2186 ///
2187 /// \param __a
2188 ///    A 128-bit unsigned [16 x i8] vector.
2189 /// \param __b
2190 ///    A 128-bit unsigned [16 x i8] vector.
2191 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2192 ///    averages of both parameters.
_mm_avg_epu8(__m128i __a,__m128i __b)2193 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a,
2194                                                           __m128i __b) {
2195   return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
2196 }
2197 
2198 /// Computes the rounded averages of corresponding elements of two
2199 ///    128-bit unsigned [8 x i16] vectors, saving each result in the
2200 ///    corresponding element of a 128-bit result vector of [8 x i16].
2201 ///
2202 /// \headerfile <x86intrin.h>
2203 ///
2204 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2205 ///
2206 /// \param __a
2207 ///    A 128-bit unsigned [8 x i16] vector.
2208 /// \param __b
2209 ///    A 128-bit unsigned [8 x i16] vector.
2210 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2211 ///    averages of both parameters.
_mm_avg_epu16(__m128i __a,__m128i __b)2212 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a,
2213                                                            __m128i __b) {
2214   return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2215 }
2216 
2217 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2218 ///    vectors, producing eight intermediate 32-bit signed integer products, and
2219 ///    adds the consecutive pairs of 32-bit products to form a 128-bit signed
2220 ///    [4 x i32] vector.
2221 ///
2222 ///    For example, bits [15:0] of both parameters are multiplied producing a
2223 ///    32-bit product, bits [31:16] of both parameters are multiplied producing
2224 ///    a 32-bit product, and the sum of those two products becomes bits [31:0]
2225 ///    of the result.
2226 ///
2227 /// \headerfile <x86intrin.h>
2228 ///
2229 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2230 ///
2231 /// \param __a
2232 ///    A 128-bit signed [8 x i16] vector.
2233 /// \param __b
2234 ///    A 128-bit signed [8 x i16] vector.
2235 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2236 ///    of both parameters.
_mm_madd_epi16(__m128i __a,__m128i __b)2237 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
2238                                                             __m128i __b) {
2239   return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2240 }
2241 
2242 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2243 ///    vectors, saving the greater value from each comparison in the
2244 ///    corresponding element of a 128-bit result vector of [8 x i16].
2245 ///
2246 /// \headerfile <x86intrin.h>
2247 ///
2248 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2249 ///
2250 /// \param __a
2251 ///    A 128-bit signed [8 x i16] vector.
2252 /// \param __b
2253 ///    A 128-bit signed [8 x i16] vector.
2254 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2255 ///    each comparison.
_mm_max_epi16(__m128i __a,__m128i __b)2256 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a,
2257                                                            __m128i __b) {
2258   return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
2259 }
2260 
2261 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2262 ///    vectors, saving the greater value from each comparison in the
2263 ///    corresponding element of a 128-bit result vector of [16 x i8].
2264 ///
2265 /// \headerfile <x86intrin.h>
2266 ///
2267 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2268 ///
2269 /// \param __a
2270 ///    A 128-bit unsigned [16 x i8] vector.
2271 /// \param __b
2272 ///    A 128-bit unsigned [16 x i8] vector.
2273 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2274 ///    each comparison.
_mm_max_epu8(__m128i __a,__m128i __b)2275 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a,
2276                                                           __m128i __b) {
2277   return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
2278 }
2279 
2280 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2281 ///    vectors, saving the smaller value from each comparison in the
2282 ///    corresponding element of a 128-bit result vector of [8 x i16].
2283 ///
2284 /// \headerfile <x86intrin.h>
2285 ///
2286 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2287 ///
2288 /// \param __a
2289 ///    A 128-bit signed [8 x i16] vector.
2290 /// \param __b
2291 ///    A 128-bit signed [8 x i16] vector.
2292 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2293 ///    each comparison.
_mm_min_epi16(__m128i __a,__m128i __b)2294 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a,
2295                                                            __m128i __b) {
2296   return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
2297 }
2298 
2299 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2300 ///    vectors, saving the smaller value from each comparison in the
2301 ///    corresponding element of a 128-bit result vector of [16 x i8].
2302 ///
2303 /// \headerfile <x86intrin.h>
2304 ///
2305 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2306 ///
2307 /// \param __a
2308 ///    A 128-bit unsigned [16 x i8] vector.
2309 /// \param __b
2310 ///    A 128-bit unsigned [16 x i8] vector.
2311 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2312 ///    each comparison.
_mm_min_epu8(__m128i __a,__m128i __b)2313 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
2314                                                           __m128i __b) {
2315   return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
2316 }
2317 
2318 /// Multiplies the corresponding elements of two signed [8 x i16]
2319 ///    vectors, saving the upper 16 bits of each 32-bit product in the
2320 ///    corresponding element of a 128-bit signed [8 x i16] result vector.
2321 ///
2322 /// \headerfile <x86intrin.h>
2323 ///
2324 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2325 ///
2326 /// \param __a
2327 ///    A 128-bit signed [8 x i16] vector.
2328 /// \param __b
2329 ///    A 128-bit signed [8 x i16] vector.
2330 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2331 ///    each of the eight 32-bit products.
_mm_mulhi_epi16(__m128i __a,__m128i __b)2332 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
2333                                                              __m128i __b) {
2334   return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2335 }
2336 
2337 /// Multiplies the corresponding elements of two unsigned [8 x i16]
2338 ///    vectors, saving the upper 16 bits of each 32-bit product in the
2339 ///    corresponding element of a 128-bit unsigned [8 x i16] result vector.
2340 ///
2341 /// \headerfile <x86intrin.h>
2342 ///
2343 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2344 ///
2345 /// \param __a
2346 ///    A 128-bit unsigned [8 x i16] vector.
2347 /// \param __b
2348 ///    A 128-bit unsigned [8 x i16] vector.
2349 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2350 ///    of each of the eight 32-bit products.
_mm_mulhi_epu16(__m128i __a,__m128i __b)2351 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
2352                                                              __m128i __b) {
2353   return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2354 }
2355 
2356 /// Multiplies the corresponding elements of two signed [8 x i16]
2357 ///    vectors, saving the lower 16 bits of each 32-bit product in the
2358 ///    corresponding element of a 128-bit signed [8 x i16] result vector.
2359 ///
2360 /// \headerfile <x86intrin.h>
2361 ///
2362 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2363 ///
2364 /// \param __a
2365 ///    A 128-bit signed [8 x i16] vector.
2366 /// \param __b
2367 ///    A 128-bit signed [8 x i16] vector.
2368 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2369 ///    each of the eight 32-bit products.
_mm_mullo_epi16(__m128i __a,__m128i __b)2370 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
2371                                                              __m128i __b) {
2372   return (__m128i)((__v8hu)__a * (__v8hu)__b);
2373 }
2374 
2375 /// Multiplies 32-bit unsigned integer values contained in the lower bits
2376 ///    of the two 64-bit integer vectors and returns the 64-bit unsigned
2377 ///    product.
2378 ///
2379 /// \headerfile <x86intrin.h>
2380 ///
2381 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2382 ///
2383 /// \param __a
2384 ///    A 64-bit integer containing one of the source operands.
2385 /// \param __b
2386 ///    A 64-bit integer containing one of the source operands.
2387 /// \returns A 64-bit integer vector containing the product of both operands.
_mm_mul_su32(__m64 __a,__m64 __b)2388 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a,
2389                                                             __m64 __b) {
2390   return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
2391 }
2392 
2393 /// Multiplies 32-bit unsigned integer values contained in the lower
2394 ///    bits of the corresponding elements of two [2 x i64] vectors, and returns
2395 ///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
2396 ///
2397 /// \headerfile <x86intrin.h>
2398 ///
2399 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2400 ///
2401 /// \param __a
2402 ///    A [2 x i64] vector containing one of the source operands.
2403 /// \param __b
2404 ///    A [2 x i64] vector containing one of the source operands.
2405 /// \returns A [2 x i64] vector containing the product of both operands.
_mm_mul_epu32(__m128i __a,__m128i __b)2406 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
2407                                                            __m128i __b) {
2408   return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2409 }
2410 
2411 /// Computes the absolute differences of corresponding 8-bit integer
2412 ///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
2413 ///    separately sums the second 8 absolute differences. Packs these two
2414 ///    unsigned 16-bit integer sums into the upper and lower elements of a
2415 ///    [2 x i64] vector.
2416 ///
2417 /// \headerfile <x86intrin.h>
2418 ///
2419 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2420 ///
2421 /// \param __a
2422 ///    A 128-bit integer vector containing one of the source operands.
2423 /// \param __b
2424 ///    A 128-bit integer vector containing one of the source operands.
2425 /// \returns A [2 x i64] vector containing the sums of the sets of absolute
2426 ///    differences between both operands.
_mm_sad_epu8(__m128i __a,__m128i __b)2427 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
2428                                                           __m128i __b) {
2429   return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2430 }
2431 
2432 /// Subtracts the corresponding 8-bit integer values in the operands.
2433 ///
2434 /// \headerfile <x86intrin.h>
2435 ///
2436 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2437 ///
2438 /// \param __a
2439 ///    A 128-bit integer vector containing the minuends.
2440 /// \param __b
2441 ///    A 128-bit integer vector containing the subtrahends.
2442 /// \returns A 128-bit integer vector containing the differences of the values
2443 ///    in the operands.
_mm_sub_epi8(__m128i __a,__m128i __b)2444 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a,
2445                                                           __m128i __b) {
2446   return (__m128i)((__v16qu)__a - (__v16qu)__b);
2447 }
2448 
2449 /// Subtracts the corresponding 16-bit integer values in the operands.
2450 ///
2451 /// \headerfile <x86intrin.h>
2452 ///
2453 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2454 ///
2455 /// \param __a
2456 ///    A 128-bit integer vector containing the minuends.
2457 /// \param __b
2458 ///    A 128-bit integer vector containing the subtrahends.
2459 /// \returns A 128-bit integer vector containing the differences of the values
2460 ///    in the operands.
_mm_sub_epi16(__m128i __a,__m128i __b)2461 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
2462                                                            __m128i __b) {
2463   return (__m128i)((__v8hu)__a - (__v8hu)__b);
2464 }
2465 
2466 /// Subtracts the corresponding 32-bit integer values in the operands.
2467 ///
2468 /// \headerfile <x86intrin.h>
2469 ///
2470 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2471 ///
2472 /// \param __a
2473 ///    A 128-bit integer vector containing the minuends.
2474 /// \param __b
2475 ///    A 128-bit integer vector containing the subtrahends.
2476 /// \returns A 128-bit integer vector containing the differences of the values
2477 ///    in the operands.
_mm_sub_epi32(__m128i __a,__m128i __b)2478 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
2479                                                            __m128i __b) {
2480   return (__m128i)((__v4su)__a - (__v4su)__b);
2481 }
2482 
2483 /// Subtracts signed or unsigned 64-bit integer values and writes the
2484 ///    difference to the corresponding bits in the destination.
2485 ///
2486 /// \headerfile <x86intrin.h>
2487 ///
2488 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2489 ///
2490 /// \param __a
2491 ///    A 64-bit integer vector containing the minuend.
2492 /// \param __b
2493 ///    A 64-bit integer vector containing the subtrahend.
2494 /// \returns A 64-bit integer vector containing the difference of the values in
2495 ///    the operands.
_mm_sub_si64(__m64 __a,__m64 __b)2496 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
2497                                                             __m64 __b) {
2498   return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
2499 }
2500 
2501 /// Subtracts the corresponding elements of two [2 x i64] vectors.
2502 ///
2503 /// \headerfile <x86intrin.h>
2504 ///
2505 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2506 ///
2507 /// \param __a
2508 ///    A 128-bit integer vector containing the minuends.
2509 /// \param __b
2510 ///    A 128-bit integer vector containing the subtrahends.
2511 /// \returns A 128-bit integer vector containing the differences of the values
2512 ///    in the operands.
_mm_sub_epi64(__m128i __a,__m128i __b)2513 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
2514                                                            __m128i __b) {
2515   return (__m128i)((__v2du)__a - (__v2du)__b);
2516 }
2517 
2518 /// Subtracts, with saturation, corresponding 8-bit signed integer values in
2519 ///    the input and returns the differences in the corresponding bytes in the
2520 ///    destination.
2521 ///
2522 ///    Differences greater than 0x7F are saturated to 0x7F, and differences
2523 ///    less than 0x80 are saturated to 0x80.
2524 ///
2525 /// \headerfile <x86intrin.h>
2526 ///
2527 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2528 ///
2529 /// \param __a
2530 ///    A 128-bit integer vector containing the minuends.
2531 /// \param __b
2532 ///    A 128-bit integer vector containing the subtrahends.
2533 /// \returns A 128-bit integer vector containing the differences of the values
2534 ///    in the operands.
_mm_subs_epi8(__m128i __a,__m128i __b)2535 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
2536                                                            __m128i __b) {
2537   return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
2538 }
2539 
2540 /// Subtracts, with saturation, corresponding 16-bit signed integer values in
///    the input and returns the differences in the corresponding 16-bit
///    elements of the destination.
2543 ///
2544 ///    Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
2545 ///    than 0x8000 are saturated to 0x8000.
2546 ///
2547 /// \headerfile <x86intrin.h>
2548 ///
2549 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2550 ///
2551 /// \param __a
2552 ///    A 128-bit integer vector containing the minuends.
2553 /// \param __b
2554 ///    A 128-bit integer vector containing the subtrahends.
2555 /// \returns A 128-bit integer vector containing the differences of the values
2556 ///    in the operands.
_mm_subs_epi16(__m128i __a,__m128i __b)2557 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
2558                                                             __m128i __b) {
2559   return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
2560 }
2561 
2562 /// Subtracts, with saturation, corresponding 8-bit unsigned integer values in
2563 ///    the input and returns the differences in the corresponding bytes in the
2564 ///    destination.
2565 ///
2566 ///    Differences less than 0x00 are saturated to 0x00.
2567 ///
2568 /// \headerfile <x86intrin.h>
2569 ///
2570 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2571 ///
2572 /// \param __a
2573 ///    A 128-bit integer vector containing the minuends.
2574 /// \param __b
2575 ///    A 128-bit integer vector containing the subtrahends.
2576 /// \returns A 128-bit integer vector containing the unsigned integer
2577 ///    differences of the values in the operands.
_mm_subs_epu8(__m128i __a,__m128i __b)2578 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
2579                                                            __m128i __b) {
2580   return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
2581 }
2582 
2583 /// Subtracts, with saturation, corresponding 16-bit unsigned integer values in
///    the input and returns the differences in the corresponding 16-bit
///    elements of the destination.
2586 ///
2587 ///    Differences less than 0x0000 are saturated to 0x0000.
2588 ///
2589 /// \headerfile <x86intrin.h>
2590 ///
2591 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2592 ///
2593 /// \param __a
2594 ///    A 128-bit integer vector containing the minuends.
2595 /// \param __b
2596 ///    A 128-bit integer vector containing the subtrahends.
2597 /// \returns A 128-bit integer vector containing the unsigned integer
2598 ///    differences of the values in the operands.
_mm_subs_epu16(__m128i __a,__m128i __b)2599 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a,
2600                                                             __m128i __b) {
2601   return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
2602 }
2603 
2604 /// Performs a bitwise AND of two 128-bit integer vectors.
2605 ///
2606 /// \headerfile <x86intrin.h>
2607 ///
2608 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2609 ///
2610 /// \param __a
2611 ///    A 128-bit integer vector containing one of the source operands.
2612 /// \param __b
2613 ///    A 128-bit integer vector containing one of the source operands.
2614 /// \returns A 128-bit integer vector containing the bitwise AND of the values
2615 ///    in both operands.
_mm_and_si128(__m128i __a,__m128i __b)2616 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
2617                                                            __m128i __b) {
2618   return (__m128i)((__v2du)__a & (__v2du)__b);
2619 }
2620 
2621 /// Performs a bitwise AND of two 128-bit integer vectors, using the
2622 ///    one's complement of the values contained in the first source operand.
2623 ///
2624 /// \headerfile <x86intrin.h>
2625 ///
2626 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2627 ///
2628 /// \param __a
2629 ///    A 128-bit vector containing the left source operand. The one's complement
2630 ///    of this value is used in the bitwise AND.
2631 /// \param __b
2632 ///    A 128-bit vector containing the right source operand.
2633 /// \returns A 128-bit integer vector containing the bitwise AND of the one's
2634 ///    complement of the first operand and the values in the second operand.
_mm_andnot_si128(__m128i __a,__m128i __b)2635 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
2636                                                               __m128i __b) {
2637   return (__m128i)(~(__v2du)__a & (__v2du)__b);
2638 }
2639 /// Performs a bitwise OR of two 128-bit integer vectors.
2640 ///
2641 /// \headerfile <x86intrin.h>
2642 ///
2643 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2644 ///
2645 /// \param __a
2646 ///    A 128-bit integer vector containing one of the source operands.
2647 /// \param __b
2648 ///    A 128-bit integer vector containing one of the source operands.
2649 /// \returns A 128-bit integer vector containing the bitwise OR of the values
2650 ///    in both operands.
_mm_or_si128(__m128i __a,__m128i __b)2651 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
2652                                                           __m128i __b) {
2653   return (__m128i)((__v2du)__a | (__v2du)__b);
2654 }
2655 
2656 /// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2657 ///
2658 /// \headerfile <x86intrin.h>
2659 ///
2660 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2661 ///
2662 /// \param __a
2663 ///    A 128-bit integer vector containing one of the source operands.
2664 /// \param __b
2665 ///    A 128-bit integer vector containing one of the source operands.
2666 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2667 ///    values in both operands.
_mm_xor_si128(__m128i __a,__m128i __b)2668 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a,
2669                                                            __m128i __b) {
2670   return (__m128i)((__v2du)__a ^ (__v2du)__b);
2671 }
2672 
/// Left-shifts the 128-bit integer vector operand by the specified
///    number of bytes. Low-order bits are cleared.
///
///    This is a macro rather than a function: \a a is converted to
///    <c>__m128i</c> and \a imm to <c>int</c> before both are handed to the
///    underlying byte-shift builtin.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to left-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))
2693 
/// Left-shifts the 128-bit integer vector \a a by \a imm bytes.
///
///    The expansion is the same builtin call that _mm_slli_si128 expands to:
///    \a a is converted to <c>__m128i</c> and \a imm to <c>int</c> before
///    being passed on.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    A value specifying the number of bytes to left-shift operand \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_bslli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))
2697 
2698 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2699 ///    by the specified number of bits. Low-order bits are cleared.
2700 ///
2701 /// \headerfile <x86intrin.h>
2702 ///
2703 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2704 ///
2705 /// \param __a
2706 ///    A 128-bit integer vector containing the source operand.
2707 /// \param __count
2708 ///    An integer value specifying the number of bits to left-shift each value
2709 ///    in operand \a __a.
2710 /// \returns A 128-bit integer vector containing the left-shifted values.
_mm_slli_epi16(__m128i __a,int __count)2711 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a,
2712                                                             int __count) {
2713   return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2714 }
2715 
2716 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2717 ///    by the specified number of bits. Low-order bits are cleared.
2718 ///
2719 /// \headerfile <x86intrin.h>
2720 ///
2721 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2722 ///
2723 /// \param __a
2724 ///    A 128-bit integer vector containing the source operand.
2725 /// \param __count
2726 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2727 ///    to left-shift each value in operand \a __a.
2728 /// \returns A 128-bit integer vector containing the left-shifted values.
_mm_sll_epi16(__m128i __a,__m128i __count)2729 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a,
2730                                                            __m128i __count) {
2731   return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2732 }
2733 
2734 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2735 ///    by the specified number of bits. Low-order bits are cleared.
2736 ///
2737 /// \headerfile <x86intrin.h>
2738 ///
2739 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2740 ///
2741 /// \param __a
2742 ///    A 128-bit integer vector containing the source operand.
2743 /// \param __count
2744 ///    An integer value specifying the number of bits to left-shift each value
2745 ///    in operand \a __a.
2746 /// \returns A 128-bit integer vector containing the left-shifted values.
_mm_slli_epi32(__m128i __a,int __count)2747 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a,
2748                                                             int __count) {
2749   return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2750 }
2751 
2752 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2753 ///    by the specified number of bits. Low-order bits are cleared.
2754 ///
2755 /// \headerfile <x86intrin.h>
2756 ///
2757 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2758 ///
2759 /// \param __a
2760 ///    A 128-bit integer vector containing the source operand.
2761 /// \param __count
2762 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2763 ///    to left-shift each value in operand \a __a.
2764 /// \returns A 128-bit integer vector containing the left-shifted values.
_mm_sll_epi32(__m128i __a,__m128i __count)2765 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a,
2766                                                            __m128i __count) {
2767   return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2768 }
2769 
2770 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2771 ///    by the specified number of bits. Low-order bits are cleared.
2772 ///
2773 /// \headerfile <x86intrin.h>
2774 ///
2775 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2776 ///
2777 /// \param __a
2778 ///    A 128-bit integer vector containing the source operand.
2779 /// \param __count
2780 ///    An integer value specifying the number of bits to left-shift each value
2781 ///    in operand \a __a.
2782 /// \returns A 128-bit integer vector containing the left-shifted values.
_mm_slli_epi64(__m128i __a,int __count)2783 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a,
2784                                                             int __count) {
2785   return __builtin_ia32_psllqi128((__v2di)__a, __count);
2786 }
2787 
2788 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2789 ///    by the specified number of bits. Low-order bits are cleared.
2790 ///
2791 /// \headerfile <x86intrin.h>
2792 ///
2793 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2794 ///
2795 /// \param __a
2796 ///    A 128-bit integer vector containing the source operand.
2797 /// \param __count
2798 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2799 ///    to left-shift each value in operand \a __a.
2800 /// \returns A 128-bit integer vector containing the left-shifted values.
_mm_sll_epi64(__m128i __a,__m128i __count)2801 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a,
2802                                                            __m128i __count) {
2803   return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2804 }
2805 
2806 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2807 ///    by the specified number of bits. High-order bits are filled with the sign
2808 ///    bit of the initial value.
2809 ///
2810 /// \headerfile <x86intrin.h>
2811 ///
2812 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2813 ///
2814 /// \param __a
2815 ///    A 128-bit integer vector containing the source operand.
2816 /// \param __count
2817 ///    An integer value specifying the number of bits to right-shift each value
2818 ///    in operand \a __a.
2819 /// \returns A 128-bit integer vector containing the right-shifted values.
_mm_srai_epi16(__m128i __a,int __count)2820 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a,
2821                                                             int __count) {
2822   return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2823 }
2824 
2825 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2826 ///    by the specified number of bits. High-order bits are filled with the sign
2827 ///    bit of the initial value.
2828 ///
2829 /// \headerfile <x86intrin.h>
2830 ///
2831 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2832 ///
2833 /// \param __a
2834 ///    A 128-bit integer vector containing the source operand.
2835 /// \param __count
2836 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2837 ///    to right-shift each value in operand \a __a.
2838 /// \returns A 128-bit integer vector containing the right-shifted values.
_mm_sra_epi16(__m128i __a,__m128i __count)2839 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a,
2840                                                            __m128i __count) {
2841   return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2842 }
2843 
2844 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2845 ///    by the specified number of bits. High-order bits are filled with the sign
2846 ///    bit of the initial value.
2847 ///
2848 /// \headerfile <x86intrin.h>
2849 ///
2850 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2851 ///
2852 /// \param __a
2853 ///    A 128-bit integer vector containing the source operand.
2854 /// \param __count
2855 ///    An integer value specifying the number of bits to right-shift each value
2856 ///    in operand \a __a.
2857 /// \returns A 128-bit integer vector containing the right-shifted values.
_mm_srai_epi32(__m128i __a,int __count)2858 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a,
2859                                                             int __count) {
2860   return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2861 }
2862 
2863 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2864 ///    by the specified number of bits. High-order bits are filled with the sign
2865 ///    bit of the initial value.
2866 ///
2867 /// \headerfile <x86intrin.h>
2868 ///
2869 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2870 ///
2871 /// \param __a
2872 ///    A 128-bit integer vector containing the source operand.
2873 /// \param __count
2874 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2875 ///    to right-shift each value in operand \a __a.
2876 /// \returns A 128-bit integer vector containing the right-shifted values.
_mm_sra_epi32(__m128i __a,__m128i __count)2877 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
2878                                                            __m128i __count) {
2879   return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
2880 }
2881 
/// Right-shifts the 128-bit integer vector operand by the specified
///    number of bytes. High-order bits are cleared.
///
///    This is a macro rather than a function: \a a is converted to
///    <c>__m128i</c> and \a imm to <c>int</c> before both are handed to the
///    underlying byte-shift builtin.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to right-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))
2902 
/// Right-shifts the 128-bit integer vector \a a by \a imm bytes.
///
///    The expansion is the same builtin call that _mm_srli_si128 expands to:
///    \a a is converted to <c>__m128i</c> and \a imm to <c>int</c> before
///    being passed on.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    A value specifying the number of bytes to right-shift operand \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_bsrli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))
2906 
2907 /// Right-shifts each of 16-bit values in the 128-bit integer vector
2908 ///    operand by the specified number of bits. High-order bits are cleared.
2909 ///
2910 /// \headerfile <x86intrin.h>
2911 ///
2912 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2913 ///
2914 /// \param __a
2915 ///    A 128-bit integer vector containing the source operand.
2916 /// \param __count
2917 ///    An integer value specifying the number of bits to right-shift each value
2918 ///    in operand \a __a.
2919 /// \returns A 128-bit integer vector containing the right-shifted values.
_mm_srli_epi16(__m128i __a,int __count)2920 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a,
2921                                                             int __count) {
2922   return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
2923 }
2924 
2925 /// Right-shifts each of 16-bit values in the 128-bit integer vector
2926 ///    operand by the specified number of bits. High-order bits are cleared.
2927 ///
2928 /// \headerfile <x86intrin.h>
2929 ///
2930 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2931 ///
2932 /// \param __a
2933 ///    A 128-bit integer vector containing the source operand.
2934 /// \param __count
2935 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2936 ///    to right-shift each value in operand \a __a.
2937 /// \returns A 128-bit integer vector containing the right-shifted values.
_mm_srl_epi16(__m128i __a,__m128i __count)2938 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a,
2939                                                            __m128i __count) {
2940   return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
2941 }
2942 
2943 /// Right-shifts each of 32-bit values in the 128-bit integer vector
2944 ///    operand by the specified number of bits. High-order bits are cleared.
2945 ///
2946 /// \headerfile <x86intrin.h>
2947 ///
2948 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2949 ///
2950 /// \param __a
2951 ///    A 128-bit integer vector containing the source operand.
2952 /// \param __count
2953 ///    An integer value specifying the number of bits to right-shift each value
2954 ///    in operand \a __a.
2955 /// \returns A 128-bit integer vector containing the right-shifted values.
_mm_srli_epi32(__m128i __a,int __count)2956 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a,
2957                                                             int __count) {
2958   return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
2959 }
2960 
2961 /// Right-shifts each of 32-bit values in the 128-bit integer vector
2962 ///    operand by the specified number of bits. High-order bits are cleared.
2963 ///
2964 /// \headerfile <x86intrin.h>
2965 ///
2966 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2967 ///
2968 /// \param __a
2969 ///    A 128-bit integer vector containing the source operand.
2970 /// \param __count
2971 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2972 ///    to right-shift each value in operand \a __a.
2973 /// \returns A 128-bit integer vector containing the right-shifted values.
_mm_srl_epi32(__m128i __a,__m128i __count)2974 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a,
2975                                                            __m128i __count) {
2976   return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
2977 }
2978 
2979 /// Right-shifts each of 64-bit values in the 128-bit integer vector
2980 ///    operand by the specified number of bits. High-order bits are cleared.
2981 ///
2982 /// \headerfile <x86intrin.h>
2983 ///
2984 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
2985 ///
2986 /// \param __a
2987 ///    A 128-bit integer vector containing the source operand.
2988 /// \param __count
2989 ///    An integer value specifying the number of bits to right-shift each value
2990 ///    in operand \a __a.
2991 /// \returns A 128-bit integer vector containing the right-shifted values.
_mm_srli_epi64(__m128i __a,int __count)2992 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a,
2993                                                             int __count) {
2994   return __builtin_ia32_psrlqi128((__v2di)__a, __count);
2995 }
2996 
2997 /// Right-shifts each of 64-bit values in the 128-bit integer vector
2998 ///    operand by the specified number of bits. High-order bits are cleared.
2999 ///
3000 /// \headerfile <x86intrin.h>
3001 ///
3002 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3003 ///
3004 /// \param __a
3005 ///    A 128-bit integer vector containing the source operand.
3006 /// \param __count
3007 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
3008 ///    to right-shift each value in operand \a __a.
3009 /// \returns A 128-bit integer vector containing the right-shifted values.
_mm_srl_epi64(__m128i __a,__m128i __count)3010 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
3011                                                            __m128i __count) {
3012   return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3013 }
3014 
3015 /// Compares each of the corresponding 8-bit values of the 128-bit
3016 ///    integer vectors for equality.
3017 ///
3018 ///    Each comparison yields 0x0 for false, 0xFF for true.
3019 ///
3020 /// \headerfile <x86intrin.h>
3021 ///
3022 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3023 ///
3024 /// \param __a
3025 ///    A 128-bit integer vector.
3026 /// \param __b
3027 ///    A 128-bit integer vector.
3028 /// \returns A 128-bit integer vector containing the comparison results.
_mm_cmpeq_epi8(__m128i __a,__m128i __b)3029 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
3030                                                             __m128i __b) {
3031   return (__m128i)((__v16qi)__a == (__v16qi)__b);
3032 }
3033 
3034 /// Compares each of the corresponding 16-bit values of the 128-bit
3035 ///    integer vectors for equality.
3036 ///
3037 ///    Each comparison yields 0x0 for false, 0xFFFF for true.
3038 ///
3039 /// \headerfile <x86intrin.h>
3040 ///
3041 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3042 ///
3043 /// \param __a
3044 ///    A 128-bit integer vector.
3045 /// \param __b
3046 ///    A 128-bit integer vector.
3047 /// \returns A 128-bit integer vector containing the comparison results.
_mm_cmpeq_epi16(__m128i __a,__m128i __b)3048 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
3049                                                              __m128i __b) {
3050   return (__m128i)((__v8hi)__a == (__v8hi)__b);
3051 }
3052 
3053 /// Compares each of the corresponding 32-bit values of the 128-bit
3054 ///    integer vectors for equality.
3055 ///
3056 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3057 ///
3058 /// \headerfile <x86intrin.h>
3059 ///
3060 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3061 ///
3062 /// \param __a
3063 ///    A 128-bit integer vector.
3064 /// \param __b
3065 ///    A 128-bit integer vector.
3066 /// \returns A 128-bit integer vector containing the comparison results.
_mm_cmpeq_epi32(__m128i __a,__m128i __b)3067 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
3068                                                              __m128i __b) {
3069   return (__m128i)((__v4si)__a == (__v4si)__b);
3070 }
3071 
3072 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3073 ///    integer vectors to determine if the values in the first operand are
3074 ///    greater than those in the second operand.
3075 ///
3076 ///    Each comparison yields 0x0 for false, 0xFF for true.
3077 ///
3078 /// \headerfile <x86intrin.h>
3079 ///
3080 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3081 ///
3082 /// \param __a
3083 ///    A 128-bit integer vector.
3084 /// \param __b
3085 ///    A 128-bit integer vector.
3086 /// \returns A 128-bit integer vector containing the comparison results.
_mm_cmpgt_epi8(__m128i __a,__m128i __b)3087 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
3088                                                             __m128i __b) {
3089   /* This function always performs a signed comparison, but __v16qi is a char
3090      which may be signed or unsigned, so use __v16qs. */
3091   return (__m128i)((__v16qs)__a > (__v16qs)__b);
3092 }
3093 
3094 /// Compares each of the corresponding signed 16-bit values of the
3095 ///    128-bit integer vectors to determine if the values in the first operand
3096 ///    are greater than those in the second operand.
3097 ///
3098 ///    Each comparison yields 0x0 for false, 0xFFFF for true.
3099 ///
3100 /// \headerfile <x86intrin.h>
3101 ///
3102 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3103 ///
3104 /// \param __a
3105 ///    A 128-bit integer vector.
3106 /// \param __b
3107 ///    A 128-bit integer vector.
3108 /// \returns A 128-bit integer vector containing the comparison results.
_mm_cmpgt_epi16(__m128i __a,__m128i __b)3109 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
3110                                                              __m128i __b) {
3111   return (__m128i)((__v8hi)__a > (__v8hi)__b);
3112 }
3113 
3114 /// Compares each of the corresponding signed 32-bit values of the
3115 ///    128-bit integer vectors to determine if the values in the first operand
3116 ///    are greater than those in the second operand.
3117 ///
3118 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3119 ///
3120 /// \headerfile <x86intrin.h>
3121 ///
3122 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3123 ///
3124 /// \param __a
3125 ///    A 128-bit integer vector.
3126 /// \param __b
3127 ///    A 128-bit integer vector.
3128 /// \returns A 128-bit integer vector containing the comparison results.
_mm_cmpgt_epi32(__m128i __a,__m128i __b)3129 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
3130                                                              __m128i __b) {
3131   return (__m128i)((__v4si)__a > (__v4si)__b);
3132 }
3133 
3134 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3135 ///    integer vectors to determine if the values in the first operand are less
3136 ///    than those in the second operand.
3137 ///
3138 ///    Each comparison yields 0x0 for false, 0xFF for true.
3139 ///
3140 /// \headerfile <x86intrin.h>
3141 ///
3142 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3143 ///
3144 /// \param __a
3145 ///    A 128-bit integer vector.
3146 /// \param __b
3147 ///    A 128-bit integer vector.
3148 /// \returns A 128-bit integer vector containing the comparison results.
_mm_cmplt_epi8(__m128i __a,__m128i __b)3149 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
3150                                                             __m128i __b) {
3151   return _mm_cmpgt_epi8(__b, __a);
3152 }
3153 
3154 /// Compares each of the corresponding signed 16-bit values of the
3155 ///    128-bit integer vectors to determine if the values in the first operand
3156 ///    are less than those in the second operand.
3157 ///
3158 ///    Each comparison yields 0x0 for false, 0xFFFF for true.
3159 ///
3160 /// \headerfile <x86intrin.h>
3161 ///
3162 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3163 ///
3164 /// \param __a
3165 ///    A 128-bit integer vector.
3166 /// \param __b
3167 ///    A 128-bit integer vector.
3168 /// \returns A 128-bit integer vector containing the comparison results.
_mm_cmplt_epi16(__m128i __a,__m128i __b)3169 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
3170                                                              __m128i __b) {
3171   return _mm_cmpgt_epi16(__b, __a);
3172 }
3173 
3174 /// Compares each of the corresponding signed 32-bit values of the
3175 ///    128-bit integer vectors to determine if the values in the first operand
3176 ///    are less than those in the second operand.
3177 ///
3178 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3179 ///
3180 /// \headerfile <x86intrin.h>
3181 ///
3182 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3183 ///
3184 /// \param __a
3185 ///    A 128-bit integer vector.
3186 /// \param __b
3187 ///    A 128-bit integer vector.
3188 /// \returns A 128-bit integer vector containing the comparison results.
_mm_cmplt_epi32(__m128i __a,__m128i __b)3189 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
3190                                                              __m128i __b) {
3191   return _mm_cmpgt_epi32(__b, __a);
3192 }
3193 
3194 #ifdef __x86_64__
3195 /// Converts a 64-bit signed integer value from the second operand into a
3196 ///    double-precision value and returns it in the lower element of a [2 x
3197 ///    double] vector; the upper element of the returned vector is copied from
3198 ///    the upper element of the first operand.
3199 ///
3200 /// \headerfile <x86intrin.h>
3201 ///
3202 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3203 ///
3204 /// \param __a
3205 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3206 ///    copied to the upper 64 bits of the destination.
3207 /// \param __b
3208 ///    A 64-bit signed integer operand containing the value to be converted.
3209 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3210 ///    converted value of the second operand. The upper 64 bits are copied from
3211 ///    the upper 64 bits of the first operand.
_mm_cvtsi64_sd(__m128d __a,long long __b)3212 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
3213                                                             long long __b) {
3214   __a[0] = __b;
3215   return __a;
3216 }
3217 
3218 /// Converts the first (lower) element of a vector of [2 x double] into a
3219 ///    64-bit signed integer value, according to the current rounding mode.
3220 ///
3221 /// \headerfile <x86intrin.h>
3222 ///
3223 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3224 ///
3225 /// \param __a
3226 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3227 ///    conversion.
3228 /// \returns A 64-bit signed integer containing the converted value.
_mm_cvtsd_si64(__m128d __a)3229 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
3230   return __builtin_ia32_cvtsd2si64((__v2df)__a);
3231 }
3232 
3233 /// Converts the first (lower) element of a vector of [2 x double] into a
3234 ///    64-bit signed integer value, truncating the result when it is inexact.
3235 ///
3236 /// \headerfile <x86intrin.h>
3237 ///
3238 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3239 ///   instruction.
3240 ///
3241 /// \param __a
3242 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3243 ///    conversion.
3244 /// \returns A 64-bit signed integer containing the converted value.
_mm_cvttsd_si64(__m128d __a)3245 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
3246   return __builtin_ia32_cvttsd2si64((__v2df)__a);
3247 }
3248 #endif
3249 
3250 /// Converts a vector of [4 x i32] into a vector of [4 x float].
3251 ///
3252 /// \headerfile <x86intrin.h>
3253 ///
3254 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3255 ///
3256 /// \param __a
3257 ///    A 128-bit integer vector.
3258 /// \returns A 128-bit vector of [4 x float] containing the converted values.
_mm_cvtepi32_ps(__m128i __a)3259 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
3260   return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
3261 }
3262 
3263 /// Converts a vector of [4 x float] into a vector of [4 x i32].
3264 ///
3265 /// \headerfile <x86intrin.h>
3266 ///
3267 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3268 ///
3269 /// \param __a
3270 ///    A 128-bit vector of [4 x float].
3271 /// \returns A 128-bit integer vector of [4 x i32] containing the converted
3272 ///    values.
_mm_cvtps_epi32(__m128 __a)3273 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
3274   return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3275 }
3276 
3277 /// Converts a vector of [4 x float] into a vector of [4 x i32],
3278 ///    truncating the result when it is inexact.
3279 ///
3280 /// \headerfile <x86intrin.h>
3281 ///
3282 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3283 ///   instruction.
3284 ///
3285 /// \param __a
3286 ///    A 128-bit vector of [4 x float].
3287 /// \returns A 128-bit vector of [4 x i32] containing the converted values.
_mm_cvttps_epi32(__m128 __a)3288 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
3289   return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3290 }
3291 
3292 /// Returns a vector of [4 x i32] where the lowest element is the input
3293 ///    operand and the remaining elements are zero.
3294 ///
3295 /// \headerfile <x86intrin.h>
3296 ///
3297 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3298 ///
3299 /// \param __a
3300 ///    A 32-bit signed integer operand.
3301 /// \returns A 128-bit vector of [4 x i32].
_mm_cvtsi32_si128(int __a)3302 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
3303   return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
3304 }
3305 
3306 /// Returns a vector of [2 x i64] where the lower element is the input
3307 ///    operand and the upper element is zero.
3308 ///
3309 /// \headerfile <x86intrin.h>
3310 ///
3311 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
3312 /// in 64-bit mode.
3313 ///
3314 /// \param __a
3315 ///    A 64-bit signed integer operand containing the value to be converted.
3316 /// \returns A 128-bit vector of [2 x i64] containing the converted value.
_mm_cvtsi64_si128(long long __a)3317 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
3318   return __extension__(__m128i)(__v2di){__a, 0};
3319 }
3320 
3321 /// Moves the least significant 32 bits of a vector of [4 x i32] to a
3322 ///    32-bit signed integer value.
3323 ///
3324 /// \headerfile <x86intrin.h>
3325 ///
3326 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3327 ///
3328 /// \param __a
3329 ///    A vector of [4 x i32]. The least significant 32 bits are moved to the
3330 ///    destination.
3331 /// \returns A 32-bit signed integer containing the moved value.
_mm_cvtsi128_si32(__m128i __a)3332 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
3333   __v4si __b = (__v4si)__a;
3334   return __b[0];
3335 }
3336 
3337 /// Moves the least significant 64 bits of a vector of [2 x i64] to a
3338 ///    64-bit signed integer value.
3339 ///
3340 /// \headerfile <x86intrin.h>
3341 ///
3342 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3343 ///
3344 /// \param __a
3345 ///    A vector of [2 x i64]. The least significant 64 bits are moved to the
3346 ///    destination.
3347 /// \returns A 64-bit signed integer containing the moved value.
_mm_cvtsi128_si64(__m128i __a)3348 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
3349   return __a[0];
3350 }
3351 
3352 /// Moves packed integer values from an aligned 128-bit memory location
3353 ///    to elements in a 128-bit integer vector.
3354 ///
3355 /// \headerfile <x86intrin.h>
3356 ///
3357 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3358 ///
3359 /// \param __p
3360 ///    An aligned pointer to a memory location containing integer values.
3361 /// \returns A 128-bit integer vector containing the moved values.
3362 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_load_si128(__m128i const * __p)3363 _mm_load_si128(__m128i const *__p) {
3364   return *__p;
3365 }
3366 
3367 /// Moves packed integer values from an unaligned 128-bit memory location
3368 ///    to elements in a 128-bit integer vector.
3369 ///
3370 /// \headerfile <x86intrin.h>
3371 ///
3372 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3373 ///
3374 /// \param __p
3375 ///    A pointer to a memory location containing integer values.
3376 /// \returns A 128-bit integer vector containing the moved values.
3377 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si128(__m128i_u const * __p)3378 _mm_loadu_si128(__m128i_u const *__p) {
3379   struct __loadu_si128 {
3380     __m128i_u __v;
3381   } __attribute__((__packed__, __may_alias__));
3382   return ((const struct __loadu_si128 *)__p)->__v;
3383 }
3384 
3385 /// Returns a vector of [2 x i64] where the lower element is taken from
3386 ///    the lower element of the operand, and the upper element is zero.
3387 ///
3388 /// \headerfile <x86intrin.h>
3389 ///
3390 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3391 ///
3392 /// \param __p
3393 ///    A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
3394 ///    the destination.
3395 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3396 ///    moved value. The higher order bits are cleared.
3397 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadl_epi64(__m128i_u const * __p)3398 _mm_loadl_epi64(__m128i_u const *__p) {
3399   struct __mm_loadl_epi64_struct {
3400     long long __u;
3401   } __attribute__((__packed__, __may_alias__));
3402   return __extension__(__m128i){
3403       ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0};
3404 }
3405 
3406 /// Generates a 128-bit vector of [4 x i32] with unspecified content.
3407 ///    This could be used as an argument to another intrinsic function where the
3408 ///    argument is required but the value is not actually used.
3409 ///
3410 /// \headerfile <x86intrin.h>
3411 ///
3412 /// This intrinsic has no corresponding instruction.
3413 ///
3414 /// \returns A 128-bit vector of [4 x i32] with unspecified content.
_mm_undefined_si128(void)3415 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
3416   return (__m128i)__builtin_ia32_undef128();
3417 }
3418 
3419 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3420 ///    the specified 64-bit integer values.
3421 ///
3422 /// \headerfile <x86intrin.h>
3423 ///
3424 /// This intrinsic is a utility function and does not correspond to a specific
3425 ///    instruction.
3426 ///
3427 /// \param __q1
3428 ///    A 64-bit integer value used to initialize the upper 64 bits of the
3429 ///    destination vector of [2 x i64].
3430 /// \param __q0
3431 ///    A 64-bit integer value used to initialize the lower 64 bits of the
3432 ///    destination vector of [2 x i64].
3433 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3434 ///    provided in the operands.
_mm_set_epi64x(long long __q1,long long __q0)3435 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
3436                                                             long long __q0) {
3437   return __extension__(__m128i)(__v2di){__q0, __q1};
3438 }
3439 
3440 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3441 ///    the specified 64-bit integer values.
3442 ///
3443 /// \headerfile <x86intrin.h>
3444 ///
3445 /// This intrinsic is a utility function and does not correspond to a specific
3446 ///    instruction.
3447 ///
3448 /// \param __q1
3449 ///    A 64-bit integer value used to initialize the upper 64 bits of the
3450 ///    destination vector of [2 x i64].
3451 /// \param __q0
3452 ///    A 64-bit integer value used to initialize the lower 64 bits of the
3453 ///    destination vector of [2 x i64].
3454 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3455 ///    provided in the operands.
_mm_set_epi64(__m64 __q1,__m64 __q0)3456 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
3457                                                            __m64 __q0) {
3458   return _mm_set_epi64x((long long)__q1, (long long)__q0);
3459 }
3460 
3461 /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3462 ///    the specified 32-bit integer values.
3463 ///
3464 /// \headerfile <x86intrin.h>
3465 ///
3466 /// This intrinsic is a utility function and does not correspond to a specific
3467 ///    instruction.
3468 ///
3469 /// \param __i3
3470 ///    A 32-bit integer value used to initialize bits [127:96] of the
3471 ///    destination vector.
3472 /// \param __i2
3473 ///    A 32-bit integer value used to initialize bits [95:64] of the destination
3474 ///    vector.
3475 /// \param __i1
3476 ///    A 32-bit integer value used to initialize bits [63:32] of the destination
3477 ///    vector.
3478 /// \param __i0
3479 ///    A 32-bit integer value used to initialize bits [31:0] of the destination
3480 ///    vector.
3481 /// \returns An initialized 128-bit vector of [4 x i32] containing the values
3482 ///    provided in the operands.
_mm_set_epi32(int __i3,int __i2,int __i1,int __i0)3483 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
3484                                                            int __i1, int __i0) {
3485   return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
3486 }
3487 
3488 /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3489 ///    the specified 16-bit integer values.
3490 ///
3491 /// \headerfile <x86intrin.h>
3492 ///
3493 /// This intrinsic is a utility function and does not correspond to a specific
3494 ///    instruction.
3495 ///
3496 /// \param __w7
3497 ///    A 16-bit integer value used to initialize bits [127:112] of the
3498 ///    destination vector.
3499 /// \param __w6
3500 ///    A 16-bit integer value used to initialize bits [111:96] of the
3501 ///    destination vector.
3502 /// \param __w5
3503 ///    A 16-bit integer value used to initialize bits [95:80] of the destination
3504 ///    vector.
3505 /// \param __w4
3506 ///    A 16-bit integer value used to initialize bits [79:64] of the destination
3507 ///    vector.
3508 /// \param __w3
3509 ///    A 16-bit integer value used to initialize bits [63:48] of the destination
3510 ///    vector.
3511 /// \param __w2
3512 ///    A 16-bit integer value used to initialize bits [47:32] of the destination
3513 ///    vector.
3514 /// \param __w1
3515 ///    A 16-bit integer value used to initialize bits [31:16] of the destination
3516 ///    vector.
3517 /// \param __w0
3518 ///    A 16-bit integer value used to initialize bits [15:0] of the destination
3519 ///    vector.
3520 /// \returns An initialized 128-bit vector of [8 x i16] containing the values
3521 ///    provided in the operands.
3522 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi16(short __w7,short __w6,short __w5,short __w4,short __w3,short __w2,short __w1,short __w0)3523 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
3524               short __w2, short __w1, short __w0) {
3525   return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
3526                                         __w4, __w5, __w6, __w7};
3527 }
3528 
3529 /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3530 ///    the specified 8-bit integer values.
3531 ///
3532 /// \headerfile <x86intrin.h>
3533 ///
3534 /// This intrinsic is a utility function and does not correspond to a specific
3535 ///    instruction.
3536 ///
3537 /// \param __b15
3538 ///    Initializes bits [127:120] of the destination vector.
3539 /// \param __b14
3540 ///    Initializes bits [119:112] of the destination vector.
3541 /// \param __b13
3542 ///    Initializes bits [111:104] of the destination vector.
3543 /// \param __b12
3544 ///    Initializes bits [103:96] of the destination vector.
3545 /// \param __b11
3546 ///    Initializes bits [95:88] of the destination vector.
3547 /// \param __b10
3548 ///    Initializes bits [87:80] of the destination vector.
3549 /// \param __b9
3550 ///    Initializes bits [79:72] of the destination vector.
3551 /// \param __b8
3552 ///    Initializes bits [71:64] of the destination vector.
3553 /// \param __b7
3554 ///    Initializes bits [63:56] of the destination vector.
3555 /// \param __b6
3556 ///    Initializes bits [55:48] of the destination vector.
3557 /// \param __b5
3558 ///    Initializes bits [47:40] of the destination vector.
3559 /// \param __b4
3560 ///    Initializes bits [39:32] of the destination vector.
3561 /// \param __b3
3562 ///    Initializes bits [31:24] of the destination vector.
3563 /// \param __b2
3564 ///    Initializes bits [23:16] of the destination vector.
3565 /// \param __b1
3566 ///    Initializes bits [15:8] of the destination vector.
3567 /// \param __b0
3568 ///    Initializes bits [7:0] of the destination vector.
3569 /// \returns An initialized 128-bit vector of [16 x i8] containing the values
3570 ///    provided in the operands.
3571 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi8(char __b15,char __b14,char __b13,char __b12,char __b11,char __b10,char __b9,char __b8,char __b7,char __b6,char __b5,char __b4,char __b3,char __b2,char __b1,char __b0)3572 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
3573              char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
3574              char __b4, char __b3, char __b2, char __b1, char __b0) {
3575   return __extension__(__m128i)(__v16qi){
3576       __b0, __b1, __b2,  __b3,  __b4,  __b5,  __b6,  __b7,
3577       __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
3578 }
3579 
3580 /// Initializes both values in a 128-bit integer vector with the
3581 ///    specified 64-bit integer value.
3582 ///
3583 /// \headerfile <x86intrin.h>
3584 ///
3585 /// This intrinsic is a utility function and does not correspond to a specific
3586 ///    instruction.
3587 ///
3588 /// \param __q
3589 ///    Integer value used to initialize the elements of the destination integer
3590 ///    vector.
3591 /// \returns An initialized 128-bit integer vector of [2 x i64] with both
3592 ///    elements containing the value provided in the operand.
_mm_set1_epi64x(long long __q)3593 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
3594   return _mm_set_epi64x(__q, __q);
3595 }
3596 
3597 /// Initializes both values in a 128-bit vector of [2 x i64] with the
3598 ///    specified 64-bit value.
3599 ///
3600 /// \headerfile <x86intrin.h>
3601 ///
3602 /// This intrinsic is a utility function and does not correspond to a specific
3603 ///    instruction.
3604 ///
3605 /// \param __q
3606 ///    A 64-bit value used to initialize the elements of the destination integer
3607 ///    vector.
3608 /// \returns An initialized 128-bit vector of [2 x i64] with all elements
3609 ///    containing the value provided in the operand.
_mm_set1_epi64(__m64 __q)3610 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
3611   return _mm_set_epi64(__q, __q);
3612 }
3613 
3614 /// Initializes all values in a 128-bit vector of [4 x i32] with the
3615 ///    specified 32-bit value.
3616 ///
3617 /// \headerfile <x86intrin.h>
3618 ///
3619 /// This intrinsic is a utility function and does not correspond to a specific
3620 ///    instruction.
3621 ///
3622 /// \param __i
3623 ///    A 32-bit value used to initialize the elements of the destination integer
3624 ///    vector.
3625 /// \returns An initialized 128-bit vector of [4 x i32] with all elements
3626 ///    containing the value provided in the operand.
_mm_set1_epi32(int __i)3627 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
3628   return _mm_set_epi32(__i, __i, __i, __i);
3629 }
3630 
3631 /// Initializes all values in a 128-bit vector of [8 x i16] with the
3632 ///    specified 16-bit value.
3633 ///
3634 /// \headerfile <x86intrin.h>
3635 ///
3636 /// This intrinsic is a utility function and does not correspond to a specific
3637 ///    instruction.
3638 ///
3639 /// \param __w
3640 ///    A 16-bit value used to initialize the elements of the destination integer
3641 ///    vector.
3642 /// \returns An initialized 128-bit vector of [8 x i16] with all elements
3643 ///    containing the value provided in the operand.
_mm_set1_epi16(short __w)3644 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
3645   return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3646 }
3647 
3648 /// Initializes all values in a 128-bit vector of [16 x i8] with the
3649 ///    specified 8-bit value.
3650 ///
3651 /// \headerfile <x86intrin.h>
3652 ///
3653 /// This intrinsic is a utility function and does not correspond to a specific
3654 ///    instruction.
3655 ///
3656 /// \param __b
3657 ///    An 8-bit value used to initialize the elements of the destination integer
3658 ///    vector.
3659 /// \returns An initialized 128-bit vector of [16 x i8] with all elements
3660 ///    containing the value provided in the operand.
_mm_set1_epi8(char __b)3661 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
3662   return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
3663                       __b, __b, __b, __b, __b);
3664 }
3665 
3666 /// Constructs a 128-bit integer vector, initialized in reverse order
3667 ///     with the specified 64-bit integral values.
3668 ///
3669 /// \headerfile <x86intrin.h>
3670 ///
3671 /// This intrinsic does not correspond to a specific instruction.
3672 ///
3673 /// \param __q0
3674 ///    A 64-bit integral value used to initialize the lower 64 bits of the
3675 ///    result.
3676 /// \param __q1
3677 ///    A 64-bit integral value used to initialize the upper 64 bits of the
3678 ///    result.
3679 /// \returns An initialized 128-bit integer vector.
_mm_setr_epi64(__m64 __q0,__m64 __q1)3680 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0,
3681                                                             __m64 __q1) {
3682   return _mm_set_epi64(__q1, __q0);
3683 }
3684 
3685 /// Constructs a 128-bit integer vector, initialized in reverse order
3686 ///     with the specified 32-bit integral values.
3687 ///
3688 /// \headerfile <x86intrin.h>
3689 ///
3690 /// This intrinsic is a utility function and does not correspond to a specific
3691 ///    instruction.
3692 ///
3693 /// \param __i0
3694 ///    A 32-bit integral value used to initialize bits [31:0] of the result.
3695 /// \param __i1
3696 ///    A 32-bit integral value used to initialize bits [63:32] of the result.
3697 /// \param __i2
3698 ///    A 32-bit integral value used to initialize bits [95:64] of the result.
3699 /// \param __i3
3700 ///    A 32-bit integral value used to initialize bits [127:96] of the result.
3701 /// \returns An initialized 128-bit integer vector.
_mm_setr_epi32(int __i0,int __i1,int __i2,int __i3)3702 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1,
3703                                                             int __i2,
3704                                                             int __i3) {
3705   return _mm_set_epi32(__i3, __i2, __i1, __i0);
3706 }
3707 
3708 /// Constructs a 128-bit integer vector, initialized in reverse order
3709 ///     with the specified 16-bit integral values.
3710 ///
3711 /// \headerfile <x86intrin.h>
3712 ///
3713 /// This intrinsic is a utility function and does not correspond to a specific
3714 ///    instruction.
3715 ///
3716 /// \param __w0
3717 ///    A 16-bit integral value used to initialize bits [15:0] of the result.
3718 /// \param __w1
3719 ///    A 16-bit integral value used to initialize bits [31:16] of the result.
3720 /// \param __w2
3721 ///    A 16-bit integral value used to initialize bits [47:32] of the result.
3722 /// \param __w3
3723 ///    A 16-bit integral value used to initialize bits [63:48] of the result.
3724 /// \param __w4
3725 ///    A 16-bit integral value used to initialize bits [79:64] of the result.
3726 /// \param __w5
3727 ///    A 16-bit integral value used to initialize bits [95:80] of the result.
3728 /// \param __w6
3729 ///    A 16-bit integral value used to initialize bits [111:96] of the result.
3730 /// \param __w7
3731 ///    A 16-bit integral value used to initialize bits [127:112] of the result.
3732 /// \returns An initialized 128-bit integer vector.
3733 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi16(short __w0,short __w1,short __w2,short __w3,short __w4,short __w5,short __w6,short __w7)3734 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
3735                short __w5, short __w6, short __w7) {
3736   return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3737 }
3738 
3739 /// Constructs a 128-bit integer vector, initialized in reverse order
3740 ///     with the specified 8-bit integral values.
3741 ///
3742 /// \headerfile <x86intrin.h>
3743 ///
3744 /// This intrinsic is a utility function and does not correspond to a specific
3745 ///    instruction.
3746 ///
3747 /// \param __b0
3748 ///    An 8-bit integral value used to initialize bits [7:0] of the result.
3749 /// \param __b1
3750 ///    An 8-bit integral value used to initialize bits [15:8] of the result.
3751 /// \param __b2
3752 ///    An 8-bit integral value used to initialize bits [23:16] of the result.
3753 /// \param __b3
3754 ///    An 8-bit integral value used to initialize bits [31:24] of the result.
3755 /// \param __b4
3756 ///    An 8-bit integral value used to initialize bits [39:32] of the result.
3757 /// \param __b5
3758 ///    An 8-bit integral value used to initialize bits [47:40] of the result.
3759 /// \param __b6
3760 ///    An 8-bit integral value used to initialize bits [55:48] of the result.
3761 /// \param __b7
3762 ///    An 8-bit integral value used to initialize bits [63:56] of the result.
3763 /// \param __b8
3764 ///    An 8-bit integral value used to initialize bits [71:64] of the result.
3765 /// \param __b9
3766 ///    An 8-bit integral value used to initialize bits [79:72] of the result.
3767 /// \param __b10
3768 ///    An 8-bit integral value used to initialize bits [87:80] of the result.
3769 /// \param __b11
3770 ///    An 8-bit integral value used to initialize bits [95:88] of the result.
3771 /// \param __b12
3772 ///    An 8-bit integral value used to initialize bits [103:96] of the result.
3773 /// \param __b13
3774 ///    An 8-bit integral value used to initialize bits [111:104] of the result.
3775 /// \param __b14
3776 ///    An 8-bit integral value used to initialize bits [119:112] of the result.
3777 /// \param __b15
3778 ///    An 8-bit integral value used to initialize bits [127:120] of the result.
3779 /// \returns An initialized 128-bit integer vector.
3780 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi8(char __b0,char __b1,char __b2,char __b3,char __b4,char __b5,char __b6,char __b7,char __b8,char __b9,char __b10,char __b11,char __b12,char __b13,char __b14,char __b15)3781 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
3782               char __b6, char __b7, char __b8, char __b9, char __b10,
3783               char __b11, char __b12, char __b13, char __b14, char __b15) {
3784   return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8,
3785                       __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3786 }
3787 
3788 /// Creates a 128-bit integer vector initialized to zero.
3789 ///
3790 /// \headerfile <x86intrin.h>
3791 ///
3792 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3793 ///
3794 /// \returns An initialized 128-bit integer vector with all elements set to
3795 ///    zero.
_mm_setzero_si128(void)3796 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) {
3797   return __extension__(__m128i)(__v2di){0LL, 0LL};
3798 }
3799 
3800 /// Stores a 128-bit integer vector to a memory location aligned on a
3801 ///    128-bit boundary.
3802 ///
3803 /// \headerfile <x86intrin.h>
3804 ///
3805 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3806 ///
3807 /// \param __p
3808 ///    A pointer to an aligned memory location that will receive the integer
3809 ///    values.
3810 /// \param __b
3811 ///    A 128-bit integer vector containing the values to be moved.
_mm_store_si128(__m128i * __p,__m128i __b)3812 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p,
3813                                                           __m128i __b) {
3814   *__p = __b;
3815 }
3816 
3817 /// Stores a 128-bit integer vector to an unaligned memory location.
3818 ///
3819 /// \headerfile <x86intrin.h>
3820 ///
3821 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3822 ///
3823 /// \param __p
3824 ///    A pointer to a memory location that will receive the integer values.
3825 /// \param __b
3826 ///    A 128-bit integer vector containing the values to be moved.
_mm_storeu_si128(__m128i_u * __p,__m128i __b)3827 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p,
3828                                                            __m128i __b) {
3829   struct __storeu_si128 {
3830     __m128i_u __v;
3831   } __attribute__((__packed__, __may_alias__));
3832   ((struct __storeu_si128 *)__p)->__v = __b;
3833 }
3834 
3835 /// Stores a 64-bit integer value from the low element of a 128-bit integer
3836 ///    vector.
3837 ///
3838 /// \headerfile <x86intrin.h>
3839 ///
3840 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3841 ///
3842 /// \param __p
3843 ///    A pointer to a 64-bit memory location. The address of the memory
3844 ///    location does not have to be aligned.
3845 /// \param __b
3846 ///    A 128-bit integer vector containing the value to be stored.
_mm_storeu_si64(void * __p,__m128i __b)3847 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p,
3848                                                           __m128i __b) {
3849   struct __storeu_si64 {
3850     long long __v;
3851   } __attribute__((__packed__, __may_alias__));
3852   ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
3853 }
3854 
3855 /// Stores a 32-bit integer value from the low element of a 128-bit integer
3856 ///    vector.
3857 ///
3858 /// \headerfile <x86intrin.h>
3859 ///
3860 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3861 ///
3862 /// \param __p
3863 ///    A pointer to a 32-bit memory location. The address of the memory
3864 ///    location does not have to be aligned.
3865 /// \param __b
3866 ///    A 128-bit integer vector containing the value to be stored.
_mm_storeu_si32(void * __p,__m128i __b)3867 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p,
3868                                                           __m128i __b) {
3869   struct __storeu_si32 {
3870     int __v;
3871   } __attribute__((__packed__, __may_alias__));
3872   ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0];
3873 }
3874 
3875 /// Stores a 16-bit integer value from the low element of a 128-bit integer
3876 ///    vector.
3877 ///
3878 /// \headerfile <x86intrin.h>
3879 ///
3880 /// This intrinsic does not correspond to a specific instruction.
3881 ///
3882 /// \param __p
3883 ///    A pointer to a 16-bit memory location. The address of the memory
3884 ///    location does not have to be aligned.
3885 /// \param __b
3886 ///    A 128-bit integer vector containing the value to be stored.
_mm_storeu_si16(void * __p,__m128i __b)3887 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p,
3888                                                           __m128i __b) {
3889   struct __storeu_si16 {
3890     short __v;
3891   } __attribute__((__packed__, __may_alias__));
3892   ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0];
3893 }
3894 
3895 /// Moves bytes selected by the mask from the first operand to the
3896 ///    specified unaligned memory location. When a mask bit is 1, the
3897 ///    corresponding byte is written, otherwise it is not written.
3898 ///
3899 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
3900 ///    used again soon). Exception and trap behavior for elements not selected
3901 ///    for storage to memory are implementation dependent.
3902 ///
3903 /// \headerfile <x86intrin.h>
3904 ///
3905 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3906 ///   instruction.
3907 ///
3908 /// \param __d
3909 ///    A 128-bit integer vector containing the values to be moved.
3910 /// \param __n
3911 ///    A 128-bit integer vector containing the mask. The most significant bit of
3912 ///    each byte represents the mask bits.
3913 /// \param __p
3914 ///    A pointer to an unaligned 128-bit memory location where the specified
3915 ///    values are moved.
_mm_maskmoveu_si128(__m128i __d,__m128i __n,char * __p)3916 static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d,
3917                                                               __m128i __n,
3918                                                               char *__p) {
3919   __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
3920 }
3921 
3922 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
3923 ///    a memory location.
3924 ///
3925 /// \headerfile <x86intrin.h>
3926 ///
3927 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
3928 ///
3929 /// \param __p
3930 ///    A pointer to a 64-bit memory location that will receive the lower 64 bits
3931 ///    of the integer vector parameter.
3932 /// \param __a
3933 ///    A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
3934 ///    value to be stored.
_mm_storel_epi64(__m128i_u * __p,__m128i __a)3935 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
3936                                                            __m128i __a) {
3937   struct __mm_storel_epi64_struct {
3938     long long __u;
3939   } __attribute__((__packed__, __may_alias__));
3940   ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0];
3941 }
3942 
3943 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
3944 ///    aligned memory location.
3945 ///
3946 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
3947 ///    used again soon).
3948 ///
3949 /// \headerfile <x86intrin.h>
3950 ///
3951 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3952 ///
3953 /// \param __p
3954 ///    A pointer to the 128-bit aligned memory location used to store the value.
3955 /// \param __a
3956 ///    A vector of [2 x double] containing the 64-bit values to be stored.
_mm_stream_pd(void * __p,__m128d __a)3957 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p,
3958                                                         __m128d __a) {
3959   __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
3960 }
3961 
3962 /// Stores a 128-bit integer vector to a 128-bit aligned memory location.
3963 ///
3964 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
3965 ///    used again soon).
3966 ///
3967 /// \headerfile <x86intrin.h>
3968 ///
3969 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3970 ///
3971 /// \param __p
3972 ///    A pointer to the 128-bit aligned memory location used to store the value.
3973 /// \param __a
3974 ///    A 128-bit integer vector containing the values to be stored.
_mm_stream_si128(void * __p,__m128i __a)3975 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p,
3976                                                            __m128i __a) {
3977   __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
3978 }
3979 
/// Writes a 32-bit integer value to the specified memory location.
///
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
///
/// \param __p
///    A pointer to the 32-bit memory location that receives the value.
/// \param __a
///    A 32-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si32(void *__p, int __a) {
  int *__dst = (int *)__p;
  __builtin_ia32_movnti(__dst, __a);
}
3998 
#if defined(__x86_64__)
/// Writes a 64-bit integer value to the specified memory location.
///
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
///
/// \param __p
///    A pointer to the 64-bit memory location that receives the value.
/// \param __a
///    A 64-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si64(void *__p, long long __a) {
  long long *__dst = (long long *)__p;
  __builtin_ia32_movnti64(__dst, __a);
}
#endif
4019 
#ifdef __cplusplus
extern "C" {
#endif

/// Flushes and invalidates the cache line containing \a __p from all
///    caches in the coherency domain.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
///
/// \param __p
///    A pointer to the memory location that identifies the cache line to be
///    flushed.
void _mm_clflush(const void *__p);

/// Enforces strong memory ordering (serialization) between the load
///    instructions that precede this instruction and the load instructions
///    that follow it, so that all previous loads complete before any
///    subsequent load executes.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// Enforces strong memory ordering (serialization) between the load and
///    store instructions that precede this instruction and the load and store
///    instructions that follow it, so that all previous memory accesses
///    complete before any subsequent memory access executes.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
4061 
4062 /// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4063 ///    vector operands into 8-bit signed integers, and packs the results into
4064 ///    the destination.
4065 ///
4066 ///    Positive values greater than 0x7F are saturated to 0x7F. Negative values
4067 ///    less than 0x80 are saturated to 0x80.
4068 ///
4069 /// \headerfile <x86intrin.h>
4070 ///
4071 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4072 ///
4073 /// \param __a
4074 ///   A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4075 ///   written to the lower 64 bits of the result.
4076 /// \param __b
4077 ///   A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4078 ///   written to the higher 64 bits of the result.
4079 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
_mm_packs_epi16(__m128i __a,__m128i __b)4080 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
4081                                                              __m128i __b) {
4082   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4083 }
4084 
4085 /// Converts, with saturation, 32-bit signed integers from both 128-bit integer
4086 ///    vector operands into 16-bit signed integers, and packs the results into
4087 ///    the destination.
4088 ///
4089 ///    Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative
4090 ///    values less than 0x8000 are saturated to 0x8000.
4091 ///
4092 /// \headerfile <x86intrin.h>
4093 ///
4094 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4095 ///
4096 /// \param __a
4097 ///    A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4098 ///    are written to the lower 64 bits of the result.
4099 /// \param __b
4100 ///    A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4101 ///    are written to the higher 64 bits of the result.
4102 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
_mm_packs_epi32(__m128i __a,__m128i __b)4103 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
4104                                                              __m128i __b) {
4105   return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4106 }
4107 
4108 /// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4109 ///    vector operands into 8-bit unsigned integers, and packs the results into
4110 ///    the destination.
4111 ///
4112 ///    Values greater than 0xFF are saturated to 0xFF. Values less than 0x00
4113 ///    are saturated to 0x00.
4114 ///
4115 /// \headerfile <x86intrin.h>
4116 ///
4117 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4118 ///
4119 /// \param __a
4120 ///    A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4121 ///    written to the lower 64 bits of the result.
4122 /// \param __b
4123 ///    A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4124 ///    written to the higher 64 bits of the result.
4125 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
_mm_packus_epi16(__m128i __a,__m128i __b)4126 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
4127                                                               __m128i __b) {
4128   return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4129 }
4130 
/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
///    the immediate-value parameter as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector.
/// \param imm
///    An immediate value. Bits [2:0] select the value from \a a to be assigned
///    to bits [15:0] of the result. \n
///    000: assign values from bits [15:0] of \a a. \n
///    001: assign values from bits [31:16] of \a a. \n
///    010: assign values from bits [47:32] of \a a. \n
///    011: assign values from bits [63:48] of \a a. \n
///    100: assign values from bits [79:64] of \a a. \n
///    101: assign values from bits [95:80] of \a a. \n
///    110: assign values from bits [111:96] of \a a. \n
///    111: assign values from bits [127:112] of \a a.
/// \returns An integer, whose lower 16 bits are selected from the 128-bit
///    integer vector parameter and the remaining bits are assigned zeros.
#define _mm_extract_epi16(a, imm)                                              \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a),      \
                                                    (int)(imm)))
4160 
/// Constructs a 128-bit integer vector by first making a copy of the
///    128-bit integer vector parameter, and then replacing one of its eight
///    16-bit elements, selected by the immediate-value parameter, with the
///    lower 16 bits of an integer parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
///    result and then one of the eight elements in the result is replaced by
///    the lower 16 bits of \a b.
/// \param b
///    An integer. The lower 16 bits of this parameter are written to the
///    element of the result selected by \a imm.
/// \param imm
///    An immediate value. Bits [2:0] select which 16-bit element of the
///    result receives the lower 16 bits of \a b (element 0 occupies bits
///    [15:0], element 7 occupies bits [127:112]).
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi16(a, b, imm)                                            \
  ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b),        \
                                        (int)(imm)))
4188 
4189 /// Copies the values of the most significant bits from each 8-bit
4190 ///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4191 ///    value, zero-extends the value, and writes it to the destination.
4192 ///
4193 /// \headerfile <x86intrin.h>
4194 ///
4195 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4196 ///
4197 /// \param __a
4198 ///    A 128-bit integer vector containing the values with bits to be extracted.
4199 /// \returns The most significant bits from each 8-bit element in \a __a,
4200 ///    written to bits [15:0]. The other bits are assigned zeros.
_mm_movemask_epi8(__m128i __a)4201 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
4202   return __builtin_ia32_pmovmskb128((__v16qi)__a);
4203 }
4204 
/// Constructs a 128-bit integer vector by shuffling four 32-bit
///    elements of a 128-bit integer vector parameter, using the immediate-value
///    parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the values to be copied.
/// \param imm
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. The destinations within the 128-bit destination are
///    assigned values as follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [31:0] of \a a. \n
///    01: assign values from bits [63:32] of \a a. \n
///    10: assign values from bits [95:64] of \a a. \n
///    11: assign values from bits [127:96] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm)                                              \
  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
4238 
/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
///    [127:64] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits [1:0] are used to assign values to bits [15:0] of the result. \n
///    Bits [3:2] are used to assign values to bits [31:16] of the result. \n
///    Bits [5:4] are used to assign values to bits [47:32] of the result. \n
///    Bits [7:6] are used to assign values to bits [63:48] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [15:0] of \a a. \n
///    01: assign values from bits [31:16] of \a a. \n
///    10: assign values from bits [47:32] of \a a. \n
///    11: assign values from bits [63:48] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
4271 
/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
///    [63:0] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits [1:0] are used to assign values to bits [79:64] of the result. \n
///    Bits [3:2] are used to assign values to bits [95:80] of the result. \n
///    Bits [5:4] are used to assign values to bits [111:96] of the result. \n
///    Bits [7:6] are used to assign values to bits [127:112] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [79:64] of \a a. \n
///    01: assign values from bits [95:80] of \a a. \n
///    10: assign values from bits [111:96] of \a a. \n
///    11: assign values from bits [127:112] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
4304 
4305 /// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4306 ///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4307 ///
4308 /// \headerfile <x86intrin.h>
4309 ///
4310 /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4311 ///   instruction.
4312 ///
4313 /// \param __a
4314 ///    A 128-bit vector of [16 x i8].
4315 ///    Bits [71:64] are written to bits [7:0] of the result. \n
4316 ///    Bits [79:72] are written to bits [23:16] of the result. \n
4317 ///    Bits [87:80] are written to bits [39:32] of the result. \n
4318 ///    Bits [95:88] are written to bits [55:48] of the result. \n
4319 ///    Bits [103:96] are written to bits [71:64] of the result. \n
4320 ///    Bits [111:104] are written to bits [87:80] of the result. \n
4321 ///    Bits [119:112] are written to bits [103:96] of the result. \n
4322 ///    Bits [127:120] are written to bits [119:112] of the result.
4323 /// \param __b
4324 ///    A 128-bit vector of [16 x i8]. \n
4325 ///    Bits [71:64] are written to bits [15:8] of the result. \n
4326 ///    Bits [79:72] are written to bits [31:24] of the result. \n
4327 ///    Bits [87:80] are written to bits [47:40] of the result. \n
4328 ///    Bits [95:88] are written to bits [63:56] of the result. \n
4329 ///    Bits [103:96] are written to bits [79:72] of the result. \n
4330 ///    Bits [111:104] are written to bits [95:88] of the result. \n
4331 ///    Bits [119:112] are written to bits [111:104] of the result. \n
4332 ///    Bits [127:120] are written to bits [127:120] of the result.
4333 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
_mm_unpackhi_epi8(__m128i __a,__m128i __b)4334 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
4335                                                                __m128i __b) {
4336   return (__m128i)__builtin_shufflevector(
4337       (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
4338       16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
4339 }
4340 
4341 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4342 ///    [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4343 ///
4344 /// \headerfile <x86intrin.h>
4345 ///
4346 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4347 ///   instruction.
4348 ///
4349 /// \param __a
4350 ///    A 128-bit vector of [8 x i16].
4351 ///    Bits [79:64] are written to bits [15:0] of the result. \n
4352 ///    Bits [95:80] are written to bits [47:32] of the result. \n
4353 ///    Bits [111:96] are written to bits [79:64] of the result. \n
4354 ///    Bits [127:112] are written to bits [111:96] of the result.
4355 /// \param __b
4356 ///    A 128-bit vector of [8 x i16].
4357 ///    Bits [79:64] are written to bits [31:16] of the result. \n
4358 ///    Bits [95:80] are written to bits [63:48] of the result. \n
4359 ///    Bits [111:96] are written to bits [95:80] of the result. \n
4360 ///    Bits [127:112] are written to bits [127:112] of the result.
4361 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
_mm_unpackhi_epi16(__m128i __a,__m128i __b)4362 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
4363                                                                 __m128i __b) {
4364   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
4365                                           8 + 5, 6, 8 + 6, 7, 8 + 7);
4366 }
4367 
4368 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4369 ///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4370 ///
4371 /// \headerfile <x86intrin.h>
4372 ///
4373 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4374 ///   instruction.
4375 ///
4376 /// \param __a
4377 ///    A 128-bit vector of [4 x i32]. \n
4378 ///    Bits [95:64] are written to bits [31:0] of the destination. \n
4379 ///    Bits [127:96] are written to bits [95:64] of the destination.
4380 /// \param __b
4381 ///    A 128-bit vector of [4 x i32]. \n
4382 ///    Bits [95:64] are written to bits [64:32] of the destination. \n
4383 ///    Bits [127:96] are written to bits [127:96] of the destination.
4384 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
_mm_unpackhi_epi32(__m128i __a,__m128i __b)4385 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
4386                                                                 __m128i __b) {
4387   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
4388                                           4 + 3);
4389 }
4390 
4391 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4392 ///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4393 ///
4394 /// \headerfile <x86intrin.h>
4395 ///
4396 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4397 ///   instruction.
4398 ///
4399 /// \param __a
4400 ///    A 128-bit vector of [2 x i64]. \n
4401 ///    Bits [127:64] are written to bits [63:0] of the destination.
4402 /// \param __b
4403 ///    A 128-bit vector of [2 x i64]. \n
4404 ///    Bits [127:64] are written to bits [127:64] of the destination.
4405 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
_mm_unpackhi_epi64(__m128i __a,__m128i __b)4406 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
4407                                                                 __m128i __b) {
4408   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1);
4409 }
4410 
4411 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4412 ///    [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4413 ///
4414 /// \headerfile <x86intrin.h>
4415 ///
4416 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4417 ///   instruction.
4418 ///
4419 /// \param __a
4420 ///    A 128-bit vector of [16 x i8]. \n
4421 ///    Bits [7:0] are written to bits [7:0] of the result. \n
4422 ///    Bits [15:8] are written to bits [23:16] of the result. \n
4423 ///    Bits [23:16] are written to bits [39:32] of the result. \n
4424 ///    Bits [31:24] are written to bits [55:48] of the result. \n
4425 ///    Bits [39:32] are written to bits [71:64] of the result. \n
4426 ///    Bits [47:40] are written to bits [87:80] of the result. \n
4427 ///    Bits [55:48] are written to bits [103:96] of the result. \n
4428 ///    Bits [63:56] are written to bits [119:112] of the result.
4429 /// \param __b
4430 ///    A 128-bit vector of [16 x i8].
4431 ///    Bits [7:0] are written to bits [15:8] of the result. \n
4432 ///    Bits [15:8] are written to bits [31:24] of the result. \n
4433 ///    Bits [23:16] are written to bits [47:40] of the result. \n
4434 ///    Bits [31:24] are written to bits [63:56] of the result. \n
4435 ///    Bits [39:32] are written to bits [79:72] of the result. \n
4436 ///    Bits [47:40] are written to bits [95:88] of the result. \n
4437 ///    Bits [55:48] are written to bits [111:104] of the result. \n
4438 ///    Bits [63:56] are written to bits [127:120] of the result.
4439 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
_mm_unpacklo_epi8(__m128i __a,__m128i __b)4440 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
4441                                                                __m128i __b) {
4442   return (__m128i)__builtin_shufflevector(
4443       (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
4444       16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
4445 }
4446 
4447 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4448 ///    vectors of [8 x i16] and interleaves them into a 128-bit vector of
4449 ///    [8 x i16].
4450 ///
4451 /// \headerfile <x86intrin.h>
4452 ///
4453 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4454 ///   instruction.
4455 ///
4456 /// \param __a
4457 ///    A 128-bit vector of [8 x i16].
4458 ///    Bits [15:0] are written to bits [15:0] of the result. \n
4459 ///    Bits [31:16] are written to bits [47:32] of the result. \n
4460 ///    Bits [47:32] are written to bits [79:64] of the result. \n
4461 ///    Bits [63:48] are written to bits [111:96] of the result.
4462 /// \param __b
4463 ///    A 128-bit vector of [8 x i16].
4464 ///    Bits [15:0] are written to bits [31:16] of the result. \n
4465 ///    Bits [31:16] are written to bits [63:48] of the result. \n
4466 ///    Bits [47:32] are written to bits [95:80] of the result. \n
4467 ///    Bits [63:48] are written to bits [127:112] of the result.
4468 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
_mm_unpacklo_epi16(__m128i __a,__m128i __b)4469 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
4470                                                                 __m128i __b) {
4471   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
4472                                           8 + 1, 2, 8 + 2, 3, 8 + 3);
4473 }
4474 
4475 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4476 ///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4477 ///
4478 /// \headerfile <x86intrin.h>
4479 ///
4480 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4481 ///   instruction.
4482 ///
4483 /// \param __a
4484 ///    A 128-bit vector of [4 x i32]. \n
4485 ///    Bits [31:0] are written to bits [31:0] of the destination. \n
4486 ///    Bits [63:32] are written to bits [95:64] of the destination.
4487 /// \param __b
4488 ///    A 128-bit vector of [4 x i32]. \n
4489 ///    Bits [31:0] are written to bits [64:32] of the destination. \n
4490 ///    Bits [63:32] are written to bits [127:96] of the destination.
4491 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
_mm_unpacklo_epi32(__m128i __a,__m128i __b)4492 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
4493                                                                 __m128i __b) {
4494   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
4495                                           4 + 1);
4496 }
4497 
4498 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4499 ///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4500 ///
4501 /// \headerfile <x86intrin.h>
4502 ///
4503 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4504 ///   instruction.
4505 ///
4506 /// \param __a
4507 ///    A 128-bit vector of [2 x i64]. \n
4508 ///    Bits [63:0] are written to bits [63:0] of the destination. \n
4509 /// \param __b
4510 ///    A 128-bit vector of [2 x i64]. \n
4511 ///    Bits [63:0] are written to bits [127:64] of the destination. \n
4512 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
_mm_unpacklo_epi64(__m128i __a,__m128i __b)4513 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
4514                                                                 __m128i __b) {
4515   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0);
4516 }
4517 
4518 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4519 ///    integer.
4520 ///
4521 /// \headerfile <x86intrin.h>
4522 ///
4523 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4524 ///
4525 /// \param __a
4526 ///    A 128-bit integer vector operand. The lower 64 bits are moved to the
4527 ///    destination.
4528 /// \returns A 64-bit integer containing the lower 64 bits of the parameter.
_mm_movepi64_pi64(__m128i __a)4529 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
4530   return (__m64)__a[0];
4531 }
4532 
4533 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4534 ///    upper bits.
4535 ///
4536 /// \headerfile <x86intrin.h>
4537 ///
4538 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4539 ///
4540 /// \param __a
4541 ///    A 64-bit value.
4542 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4543 ///    the operand. The upper 64 bits are assigned zeros.
_mm_movpi64_epi64(__m64 __a)4544 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
4545   return __extension__(__m128i)(__v2di){(long long)__a, 0};
4546 }
4547 
4548 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4549 ///    integer vector, zeroing the upper bits.
4550 ///
4551 /// \headerfile <x86intrin.h>
4552 ///
4553 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4554 ///
4555 /// \param __a
4556 ///    A 128-bit integer vector operand. The lower 64 bits are moved to the
4557 ///    destination.
4558 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4559 ///    the operand. The upper 64 bits are assigned zeros.
_mm_move_epi64(__m128i __a)4560 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
4561   return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4562 }
4563 
4564 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4565 ///    [2 x double] and interleaves them into a 128-bit vector of [2 x
4566 ///    double].
4567 ///
4568 /// \headerfile <x86intrin.h>
4569 ///
4570 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4571 ///
4572 /// \param __a
4573 ///    A 128-bit vector of [2 x double]. \n
4574 ///    Bits [127:64] are written to bits [63:0] of the destination.
4575 /// \param __b
4576 ///    A 128-bit vector of [2 x double]. \n
4577 ///    Bits [127:64] are written to bits [127:64] of the destination.
4578 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
_mm_unpackhi_pd(__m128d __a,__m128d __b)4579 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
4580                                                              __m128d __b) {
4581   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1);
4582 }
4583 
4584 /// Unpacks the low-order 64-bit elements from two 128-bit vectors
4585 ///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
4586 ///    double].
4587 ///
4588 /// \headerfile <x86intrin.h>
4589 ///
4590 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4591 ///
4592 /// \param __a
4593 ///    A 128-bit vector of [2 x double]. \n
4594 ///    Bits [63:0] are written to bits [63:0] of the destination.
4595 /// \param __b
4596 ///    A 128-bit vector of [2 x double]. \n
4597 ///    Bits [63:0] are written to bits [127:64] of the destination.
4598 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
_mm_unpacklo_pd(__m128d __a,__m128d __b)4599 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a,
4600                                                              __m128d __b) {
4601   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0);
4602 }
4603 
4604 /// Extracts the sign bits of the double-precision values in the 128-bit
4605 ///    vector of [2 x double], zero-extends the value, and writes it to the
4606 ///    low-order bits of the destination.
4607 ///
4608 /// \headerfile <x86intrin.h>
4609 ///
4610 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4611 ///
4612 /// \param __a
4613 ///    A 128-bit vector of [2 x double] containing the values with sign bits to
4614 ///    be extracted.
4615 /// \returns The sign bits from each of the double-precision elements in \a __a,
4616 ///    written to bits [1:0]. The remaining bits are assigned values of zero.
_mm_movemask_pd(__m128d __a)4617 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
4618   return __builtin_ia32_movmskpd((__v2df)__a);
4619 }
4620 
/// Constructs a 128-bit floating-point vector of [2 x double] from two
///    128-bit vector parameters of [2 x double], using the immediate-value
///    parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param i
///    An 8-bit immediate value. The least significant two bits specify which
///    elements to copy from \a a and \a b: \n
///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
///    Bit[1] = 1: upper element of \a b copied to upper element of result. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
///    <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form
///    <c>[b1, b0]</c>.
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i)                                                \
  ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),  \
                                  (int)(i)))
4651 
4652 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4653 ///    floating-point vector of [4 x float].
4654 ///
4655 /// \headerfile <x86intrin.h>
4656 ///
4657 /// This intrinsic has no corresponding instruction.
4658 ///
4659 /// \param __a
4660 ///    A 128-bit floating-point vector of [2 x double].
4661 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4662 ///    bitwise pattern as the parameter.
_mm_castpd_ps(__m128d __a)4663 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
4664   return (__m128)__a;
4665 }
4666 
4667 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4668 ///    integer vector.
4669 ///
4670 /// \headerfile <x86intrin.h>
4671 ///
4672 /// This intrinsic has no corresponding instruction.
4673 ///
4674 /// \param __a
4675 ///    A 128-bit floating-point vector of [2 x double].
4676 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4677 ///    parameter.
_mm_castpd_si128(__m128d __a)4678 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
4679   return (__m128i)__a;
4680 }
4681 
4682 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4683 ///    floating-point vector of [2 x double].
4684 ///
4685 /// \headerfile <x86intrin.h>
4686 ///
4687 /// This intrinsic has no corresponding instruction.
4688 ///
4689 /// \param __a
4690 ///    A 128-bit floating-point vector of [4 x float].
4691 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4692 ///    bitwise pattern as the parameter.
_mm_castps_pd(__m128 __a)4693 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
4694   return (__m128d)__a;
4695 }
4696 
4697 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4698 ///    integer vector.
4699 ///
4700 /// \headerfile <x86intrin.h>
4701 ///
4702 /// This intrinsic has no corresponding instruction.
4703 ///
4704 /// \param __a
4705 ///    A 128-bit floating-point vector of [4 x float].
4706 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4707 ///    parameter.
_mm_castps_si128(__m128 __a)4708 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
4709   return (__m128i)__a;
4710 }
4711 
4712 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4713 ///    of [4 x float].
4714 ///
4715 /// \headerfile <x86intrin.h>
4716 ///
4717 /// This intrinsic has no corresponding instruction.
4718 ///
4719 /// \param __a
4720 ///    A 128-bit integer vector.
4721 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4722 ///    bitwise pattern as the parameter.
_mm_castsi128_ps(__m128i __a)4723 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
4724   return (__m128)__a;
4725 }
4726 
4727 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4728 ///    of [2 x double].
4729 ///
4730 /// \headerfile <x86intrin.h>
4731 ///
4732 /// This intrinsic has no corresponding instruction.
4733 ///
4734 /// \param __a
4735 ///    A 128-bit integer vector.
4736 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4737 ///    bitwise pattern as the parameter.
_mm_castsi128_pd(__m128i __a)4738 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
4739   return (__m128d)__a;
4740 }
4741 
/// Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c)                                                    \
  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),   \
                                 (c)))
4775 
/// Compares the lower double-precision values of two 128-bit vectors of
///    [2 x double], using the operation specified by the immediate integer
///    operand.
///
///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double]. The lower double-precision value is
///    compared with the lower double-precision value of \a b.
/// \param b
///    A 128-bit vector of [2 x double]. The lower double-precision value is
///    compared with the lower double-precision value of \a a.
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
///    comparison result. The upper 64 bits are copied from the upper 64 bits
///    of \a a.
#define _mm_cmp_sd(a, b, c)                                                    \
  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),   \
                                 (c)))
4809 
#if defined(__cplusplus)
extern "C" {
#endif

/// Indicates that a spin loop is being executed for the purposes of
///    optimizing power consumption during the loop.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
///
void _mm_pause(void);

#if defined(__cplusplus)
} // extern "C"
#endif
/* Drop the header-private attribute macros so they do not leak to includers. */
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX
4828 
/* Packs two 1-bit selectors into a 2-bit immediate: x goes to bit 1 and y to
 * bit 0. Typically used to build the control operand of _mm_shuffle_pd(). */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
4830 
/* Denormals-are-zero (DAZ) control: bit 6 (0x0040) of the control/status word
 * accessed through _mm_getcsr() / _mm_setcsr(). */
#define _MM_DENORMALS_ZERO_ON (0x0040U)
#define _MM_DENORMALS_ZERO_OFF (0x0000U)

/* Mask isolating the DAZ bit. */
#define _MM_DENORMALS_ZERO_MASK (0x0040U)

/* Yields the current DAZ setting: _MM_DENORMALS_ZERO_ON or
 * _MM_DENORMALS_ZERO_OFF. */
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
/* Clears the DAZ bit and ORs in x, leaving all other bits untouched. x is not
 * masked, so pass only _MM_DENORMALS_ZERO_ON or _MM_DENORMALS_ZERO_OFF. */
#define _MM_SET_DENORMALS_ZERO_MODE(x)                                         \
  (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
4839 
4840 #endif /* __EMMINTRIN_H */
4841