//Created by Victoria Zhislina, Senior Application Engineer, Intel Corporation, [email protected]
2
3 //*** Copyright (C) 2012-2018 Intel Corporation. All rights reserved.
4
5 //IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
6
7 //By downloading, copying, installing or using the software you agree to this license.
8 //If you do not agree to this license, do not download, install, copy or use the software.
9
10 // License Agreement
11 //Redistribution and use in source and binary forms, with or without modification,
12 //are permitted provided that the following conditions are met:
13
14 // * Redistributions of source code must retain the above copyright notice,
15 // this list of conditions and the following disclaimer.
16
17 // * The name of the copyright holders may not be used to endorse or promote products
18 // derived from this software without specific prior written permission.
19
20 //This software is provided by the copyright holders and contributors "as is" and
21 //any express or implied warranties, including, but not limited to, the implied
22 //warranties of merchantability and fitness for a particular purpose are disclaimed.
23 //In no event shall the Intel Corporation or contributors be liable for any direct,
24 //indirect, incidental, special, exemplary, or consequential damages
25 //(including, but not limited to, procurement of substitute goods or services;
26 //loss of use, data, or profits; or business interruption) however caused
27 //and on any theory of liability, whether in contract, strict liability,
28 //or tort (including negligence or otherwise) arising in any way out of
29 //the use of this software, even if advised of the possibility of such damage.
30
31 //*****************************************************************************************
// This file is intended to simplify ARM->IA32 porting
// It makes the correspondence (mapping) between ARM NEON intrinsics (as defined in "arm_neon.h")
// and x86 SSE (up to SSE4.2) intrinsic functions as defined in the header files below
//The MMX instruction set is not used because it is unavailable on x64 systems,
//adds performance overhead and requires the EMMS instruction (_mm_empty()) for MMX-x87 floating point switching
37 //*****************************************************************************************
38
//!!!!!!!!!!!!!! To use this file, just include it (instead of "arm_neon.h") in your project that uses ARM NEON intrinsics and compile as usual,
//!!!!!!!!!!!!!! but please pay attention to the #define USE_SSE4 below - you might need to define it manually for the newest Intel Atom or any Intel Core platform for greater performance.
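//!!!!!!!!!!!!!! Illustrative usage sketch (kept entirely in comments so nothing here is compiled);
//!!!!!!!!!!!!!! the file name, function name and command lines below are examples only, not part of this header:
//
//    #include "NEON2SSE.h"                    // instead of #include <arm_neon.h>
//
//    int32x4_t add4(int32x4_t a, int32x4_t b)
//    {
//        return vaddq_s32(a, b);              // the same NEON intrinsic calls as in the original ARM code
//    }
//
//    // possible gcc command lines (adapt to your compiler):
//    //   gcc -O2 -msse4.2 -DUSE_SSE4 -c kernel.c     (SSE4-capable CPU)
//    //   gcc -O2 -mssse3             -c kernel.c     (older CPU, SSSE3 only)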
41
42 #ifndef NEON2SSE_H
43 #define NEON2SSE_H
44
45 /*********************************************************************************************************************/
46 //!!!!!!!!!!!!!!
//if USE_SSE4 is defined, some functions use SSE4 instructions instead of earlier SSE versions; when it is undefined, only SIMD instructions up to SSSE3 are used
//For older devices without SSE4 support it should be left undefined; for newer devices it should be defined, possibly manually if your compiler does not set the __SSE4_2__ predefined macro
49 #ifndef USE_SSE4
50 # if defined(__SSE4_2__)
51 # define USE_SSE4
52 # endif
53 #endif
54 /*********************************************************************************************************************/
55
56 #include <xmmintrin.h> //SSE
57 #include <emmintrin.h> //SSE2
58 #include <pmmintrin.h> //SSE3
59 #include <tmmintrin.h> //SSSE3
60 #ifdef USE_SSE4
61 # include <smmintrin.h> //SSE4.1
62 # include <nmmintrin.h> //SSE4.2
63 #endif
64
65 #include <math.h>
66
67 //*************** functions and data attributes, compiler dependent *********************************
68 //***********************************************************************************
69 #ifdef __GNUC__
70 # define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
71 # define _NEON2SSESTORAGE static
72 # define _NEON2SSE_ALIGN_16 __attribute__((aligned(16)))
73 # define _NEON2SSE_INLINE _NEON2SSESTORAGE inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
74 # ifndef NEON2SSE_DISABLE_PERFORMANCE_WARNING
75 # if _GCC_VERSION < 40500
76 # define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated)) function
77 # else
78 # define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated(explanation))) function
79 # endif
80 # else
81 # define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) function
82 # endif
83 # if defined(__x86_64__)
84 # define _NEON2SSE_64BIT __x86_64__
85 # endif
86 #else
87 # define _NEON2SSESTORAGE static
88 # define _NEON2SSE_ALIGN_16 __declspec(align(16))
89 # define _NEON2SSE_INLINE _NEON2SSESTORAGE __inline
90 # if (defined(_MSC_VER) || defined (__INTEL_COMPILER)) && !defined(NEON2SSE_DISABLE_PERFORMANCE_WARNING)
91 # define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function
92 # if defined(_M_X64)
93 # define _NEON2SSE_64BIT _M_X64
94 # endif
95 # else
96 # define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) function
97 # endif
98 #endif
99
100 #if defined (_NEON2SSE_64BIT) && defined (USE_SSE4)
101 # define _NEON2SSE_64BIT_SSE4
102 #endif
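//Illustrative sketch (comments only) of how the attribute macros above are applied; the functions and
//the constant below are hypothetical examples, not declarations from this header:
//
//    _NEON2SSE_ALIGN_16 static const int32_t c_ones[4] = {1, 1, 1, 1};    // 16-byte aligned constant
//
//    _NEON2SSE_INLINE int32x4_t fast_op(int32x4_t a)                      // static, forced-inline definition
//    {
//        return a;
//    }
//
//    _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t slow_op(int32x4_t a),
//        _NEON2SSE_REASON_SLOW_SERIAL)      // the reason strings are defined further below in this file
//    {
//        return a;                          // callers get a deprecation warning unless warnings are disabled
//    }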
103
104 /*********************************************************************************************************************/
105 // data types conversion
106 /*********************************************************************************************************************/
107 #if defined(_MSC_VER) && (_MSC_VER < 1300)
108 typedef signed char int8_t;
109 typedef unsigned char uint8_t;
110 typedef signed short int16_t;
111 typedef unsigned short uint16_t;
112 typedef signed int int32_t;
113 typedef unsigned int uint32_t;
114 typedef signed long long int64_t;
115 typedef unsigned long long uint64_t;
116 #elif defined(_MSC_VER)
117 typedef signed __int8 int8_t;
118 typedef unsigned __int8 uint8_t;
119 typedef signed __int16 int16_t;
120 typedef unsigned __int16 uint16_t;
121 typedef signed __int32 int32_t;
122 typedef unsigned __int32 uint32_t;
123
124 typedef signed long long int64_t;
125 typedef unsigned long long uint64_t;
126 #else
127 # include <stdint.h>
128 # include <limits.h>
129 #endif
130
131 typedef union __m64_128 {
132 uint64_t m64_u64[1];
133 float m64_f32[2];
134 int8_t m64_i8[8];
135 int16_t m64_i16[4];
136 int32_t m64_i32[2];
137 int64_t m64_i64[1];
138 uint8_t m64_u8[8];
139 uint16_t m64_u16[4];
140 uint32_t m64_u32[2];
141 } __m64_128;
142
143 typedef __m64_128 int8x8_t;
144 typedef __m64_128 uint8x8_t;
145 typedef __m64_128 int16x4_t;
146 typedef __m64_128 uint16x4_t;
147 typedef __m64_128 int32x2_t;
148 typedef __m64_128 uint32x2_t;
149 typedef __m64_128 int64x1_t;
150 typedef __m64_128 uint64x1_t;
151 typedef __m64_128 poly8x8_t;
152 typedef __m64_128 poly16x4_t;
153
154 typedef __m64_128 float32x2_t;
155 typedef __m128 float32x4_t;
156
typedef __m128 float16x4_t; //not supported by IA, provided for compatibility only
typedef __m128 float16x8_t; //not supported by IA, provided for compatibility only
159
160 typedef __m64_128 float64x1_t;
161 typedef __m128d float64x2_t;
162
163 typedef __m128i int8x16_t;
164 typedef __m128i int16x8_t;
165 typedef __m128i int32x4_t;
166 typedef __m128i int64x2_t;
167 typedef __m128i uint8x16_t;
168 typedef __m128i uint16x8_t;
169 typedef __m128i uint32x4_t;
170 typedef __m128i uint64x2_t;
171 typedef __m128i poly8x16_t;
172 typedef __m128i poly16x8_t;
173
174 #if defined(_MSC_VER)
175 # define SINT_MIN (-2147483647 - 1) /* min signed int value */
176 # define SINT_MAX 2147483647 /* max signed int value */
177 #else
178 # define SINT_MIN INT_MIN /* min signed int value */
179 # define SINT_MAX INT_MAX /* max signed int value */
180 #endif
181
182 typedef float float32_t;
183 #if !defined(__clang__)
184 typedef float __fp16;
185 #endif
186
187 typedef double float64_t;
188
189
190 typedef uint8_t poly8_t;
191 typedef uint16_t poly16_t;
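//Illustrative lane-access sketch (comments only). The 64-bit "d register" types above are emulated by the
//__m64_128 union, so individual lanes may be touched through the matching m64_* member; note that such direct
//member access is specific to this emulation and is not portable back to real NEON code:
//
//    int32x2_t d;
//    d.m64_i32[0] = 7;            // lane 0
//    d.m64_i32[1] = -3;           // lane 1
//
//    uint8x8_t b;
//    b.m64_u8[5] = 0xFF;          // one of the 8 unsigned byte lanes
//
//    float32x4_t q;               // 128-bit "q register" types are plain __m128/__m128i/__m128d
//    q = _mm_set1_ps(1.0f);       // and can be used like any other SSE variable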
192
193
//MSVC compilers (tested up to Visual Studio 2012) don't allow structures or arrays of __m128x types as function arguments, resulting in
//error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned. To avoid it, a special trick is needed for functions that use these types
196 struct int8x16x2_t {
197 int8x16_t val[2];
198 };
199 struct int16x8x2_t {
200 int16x8_t val[2];
201 };
202 struct int32x4x2_t {
203 int32x4_t val[2];
204 };
205 struct int64x2x2_t {
206 int64x2_t val[2];
207 };
//Unfortunately we cannot merge the two 64-bit halves into one 128-bit register because the user must be able to access the val[n] members explicitly!!!
209 struct int8x8x2_t {
210 int8x8_t val[2];
211 };
212 struct int16x4x2_t {
213 int16x4_t val[2];
214 };
215 struct int32x2x2_t {
216 int32x2_t val[2];
217 };
218 struct int64x1x2_t {
219 int64x1_t val[2];
220 };
221
222 typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy
223 typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy
224 typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy
225 typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy
226
227 typedef struct int8x8x2_t int8x8x2_t; //for C compilers to make them happy
228 typedef struct int16x4x2_t int16x4x2_t; //for C compilers to make them happy
229 typedef struct int32x2x2_t int32x2x2_t; //for C compilers to make them happy
230 typedef struct int64x1x2_t int64x1x2_t; //for C compilers to make them happy
231
/* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above */
233 typedef struct int8x16x2_t uint8x16x2_t;
234 typedef struct int16x8x2_t uint16x8x2_t;
235 typedef struct int32x4x2_t uint32x4x2_t;
236 typedef struct int64x2x2_t uint64x2x2_t;
237 typedef struct int8x16x2_t poly8x16x2_t;
238 typedef struct int16x8x2_t poly16x8x2_t;
239
240 typedef struct int8x8x2_t uint8x8x2_t;
241 typedef struct int16x4x2_t uint16x4x2_t;
242 typedef struct int32x2x2_t uint32x2x2_t;
243 typedef struct int64x1x2_t uint64x1x2_t;
244 typedef struct int8x8x2_t poly8x8x2_t;
245 typedef struct int16x4x2_t poly16x4x2_t;
246
247 //float
248 struct float32x4x2_t {
249 float32x4_t val[2];
250 };
251 struct float16x8x2_t {
252 float16x8_t val[2];
253 };
254 struct float32x2x2_t {
255 float32x2_t val[2];
256 };
257
258 typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy
259 typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy
260 typedef struct float32x2x2_t float32x2x2_t; //for C compilers to make them happy
261 typedef float16x8x2_t float16x4x2_t;
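//Illustrative sketch (comments only) of using the xN structures; vld2q_s16 is assumed to be declared
//further below in this header, and the variable names are examples:
//
//    int16x8x2_t pair = vld2q_s16(ptr);     // de-interleaving load of two vectors
//    int16x8_t even = pair.val[0];          // elements 0,2,4,...
//    int16x8_t odd  = pair.val[1];          // elements 1,3,5,...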
262
263 //4
264 struct int8x16x4_t {
265 int8x16_t val[4];
266 };
267 struct int16x8x4_t {
268 int16x8_t val[4];
269 };
270 struct int32x4x4_t {
271 int32x4_t val[4];
272 };
273 struct int64x2x4_t {
274 int64x2_t val[4];
275 };
276
277 struct int8x8x4_t {
278 int8x8_t val[4];
279 };
280 struct int16x4x4_t {
281 int16x4_t val[4];
282 };
283 struct int32x2x4_t {
284 int32x2_t val[4];
285 };
286 struct int64x1x4_t {
287 int64x1_t val[4];
288 };
289
290 typedef struct int8x16x4_t int8x16x4_t; //for C compilers to make them happy
291 typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy
292 typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy
293 typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy
294
295 typedef struct int8x8x4_t int8x8x4_t; //for C compilers to make them happy
296 typedef struct int16x4x4_t int16x4x4_t; //for C compilers to make them happy
297 typedef struct int32x2x4_t int32x2x4_t; //for C compilers to make them happy
298 typedef struct int64x1x4_t int64x1x4_t; //for C compilers to make them happy
299
/* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above: */
301 typedef struct int8x8x4_t uint8x8x4_t;
302 typedef struct int16x4x4_t uint16x4x4_t;
303 typedef struct int32x2x4_t uint32x2x4_t;
304 typedef struct int64x1x4_t uint64x1x4_t;
305 typedef struct int8x8x4_t poly8x8x4_t;
306 typedef struct int16x4x4_t poly16x4x4_t;
307
308 typedef struct int8x16x4_t uint8x16x4_t;
309 typedef struct int16x8x4_t uint16x8x4_t;
310 typedef struct int32x4x4_t uint32x4x4_t;
311 typedef struct int64x2x4_t uint64x2x4_t;
312 typedef struct int8x16x4_t poly8x16x4_t;
313 typedef struct int16x8x4_t poly16x8x4_t;
314
315 struct float32x4x4_t {
316 float32x4_t val[4];
317 };
318 struct float16x8x4_t {
319 float16x8_t val[4];
320 };
321 struct float32x2x4_t {
322 float32x2_t val[4];
323 };
324
325 typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy
326 typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy
327 typedef struct float32x2x4_t float32x2x4_t; //for C compilers to make them happy
328 typedef float16x8x4_t float16x4x4_t;
329
330 //3
331 struct int16x8x3_t {
332 int16x8_t val[3];
333 };
334 struct int32x4x3_t {
335 int32x4_t val[3];
336 };
337 struct int64x2x3_t {
338 int64x2_t val[3];
339 };
340 struct int8x16x3_t {
341 int8x16_t val[3];
342 };
343
344 struct int16x4x3_t {
345 int16x4_t val[3];
346 };
347 struct int32x2x3_t {
348 int32x2_t val[3];
349 };
350 struct int64x1x3_t {
351 int64x1_t val[3];
352 };
353 struct int8x8x3_t {
354 int8x8_t val[3];
355 };
356 typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy
357 typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy
358 typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy
359 typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy
360
361 typedef struct int8x8x3_t int8x8x3_t; //for C compilers to make them happy
362 typedef struct int16x4x3_t int16x4x3_t; //for C compilers to make them happy
363 typedef struct int32x2x3_t int32x2x3_t; //for C compilers to make them happy
364 typedef struct int64x1x3_t int64x1x3_t; //for C compilers to make them happy
365
366
/* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above: */
368 typedef struct int8x16x3_t uint8x16x3_t;
369 typedef struct int16x8x3_t uint16x8x3_t;
370 typedef struct int32x4x3_t uint32x4x3_t;
371 typedef struct int64x2x3_t uint64x2x3_t;
372 typedef struct int8x16x3_t poly8x16x3_t;
373 typedef struct int16x8x3_t poly16x8x3_t;
374 typedef struct int8x8x3_t uint8x8x3_t;
375 typedef struct int16x4x3_t uint16x4x3_t;
376 typedef struct int32x2x3_t uint32x2x3_t;
377 typedef struct int64x1x3_t uint64x1x3_t;
378 typedef struct int8x8x3_t poly8x8x3_t;
379 typedef struct int16x4x3_t poly16x4x3_t;
380
381 //float
382 struct float32x4x3_t {
383 float32x4_t val[3];
384 };
385 struct float32x2x3_t {
386 float32x2_t val[3];
387 };
388 struct float16x8x3_t {
389 float16x8_t val[3];
390 };
391
392 typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy
393 typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy
394 typedef struct float32x2x3_t float32x2x3_t; //for C compilers to make them happy
395 typedef float16x8x3_t float16x4x3_t;
396
397
398 //****************************************************************************
399 //****** Porting auxiliary macros ********************************************
400
401 //** floating point related macros **
402 #define _M128i(a) _mm_castps_si128(a)
403 #define _M128(a) _mm_castsi128_ps(a)
//here the most performance-effective implementation depends on the compiler and on whether the build is 32- or 64-bit
405 #if defined (_NEON2SSE_64BIT) || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500) )
406 # define _pM128i(a) _mm_cvtsi64_si128(*(int64_t*)(&(a)))
407 # define _M64(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (inp);
408 # define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (_M128i(inp));
409 #else
//for 32-bit builds with gcc and Microsoft compilers
411 # define _pM128i(a) _mm_loadl_epi64((__m128i*)&(a))
412 # define _M64(out, inp) _mm_storel_epi64 ((__m128i*)&(out), inp)
413 # define _M64f(out, inp) _mm_storel_epi64 ((__m128i*)&(out), _M128i(inp))
414 #endif
415 #define _pM128(a) _mm_castsi128_ps(_pM128i(a))
416
417 #define return64(a) _M64(res64,a); return res64;
418 #define return64f(a) _M64f(res64,a); return res64;
419
420 #define _Ui64(a) (*(uint64_t*)&(a))
421 #define _UNSIGNED_T(a) u ## a
422
423 #define _SIGNBIT64 ((uint64_t)1 << 63)
424 #define _SWAP_HI_LOW32 (2 | (3 << 2) | (0 << 4) | (1 << 6))
425 #define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4) )
426
427 #define _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it"
428 #define _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it"
429
430 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
431 #define __constrange(min,max) const
432 #define __transfersize(size)
433 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
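//Illustrative sketch (comments only) of how the macros above combine to implement a 64-bit ("d register")
//operation on top of a 128-bit SSE instruction; this is a possible pattern, not necessarily the exact
//code used later in this file:
//
//    _NEON2SSE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b)
//    {
//        int8x8_t res64;                                  // return64() stores into and returns "res64"
//        return64(_mm_add_epi8(_pM128i(a), _pM128i(b)));  // widen to __m128i, add, keep the low 64 bits
//    }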
434
435 //&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& mask constants used in porting &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
436 _NEON2SSE_ALIGN_16 static const int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
437 _NEON2SSE_ALIGN_16 static const int8_t mask8_32_even_odd[16] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 };
438 //&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
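//Illustrative sketch (comments only): both tables above are shuffle-control vectors for the SSSE3
//_mm_shuffle_epi8 instruction; e.g. mask8_16_even_odd gathers the even-numbered bytes of a vector into
//the low 64 bits and the odd-numbered bytes into the high 64 bits:
//
//    __m128i m = _mm_load_si128((__m128i*)mask8_16_even_odd);
//    __m128i deinterleaved = _mm_shuffle_epi8(v, m);    // bytes v0 v2 ... v14 | v1 v3 ... v15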
439
440 //*************************************************************************
441 //*************************************************************************
442 //********* Functions declarations as declared in original arm_neon.h *****
443 //*************************************************************************
444 //Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes.
445 _NEON2SSESTORAGE int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
446 _NEON2SSESTORAGE int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
447 _NEON2SSESTORAGE int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
448 _NEON2SSESTORAGE int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
449 _NEON2SSESTORAGE float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
450 _NEON2SSESTORAGE uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
451 _NEON2SSESTORAGE uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
452 _NEON2SSESTORAGE uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
453 _NEON2SSESTORAGE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
454 _NEON2SSESTORAGE int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
455 _NEON2SSESTORAGE int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
456 _NEON2SSESTORAGE int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
457 _NEON2SSESTORAGE int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
458 _NEON2SSESTORAGE float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
459 _NEON2SSESTORAGE uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
460 _NEON2SSESTORAGE uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
461 _NEON2SSESTORAGE uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
462 _NEON2SSESTORAGE uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
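//Illustrative mapping sketch (comments only): the 128-bit ("q register") variants have natural one-instruction
//SSE2 counterparts (_mm_add_epi8/16/32/64 for the integer types, _mm_add_ps for float32x4_t); this is the
//obvious mapping, not necessarily the exact code used later in this file:
//
//    _NEON2SSE_INLINE int32x4_t vaddq_s32(int32x4_t a, int32x4_t b)
//    {
//        return _mm_add_epi32(a, b);    // wrap-around add, identical for signed and unsigned lanes
//    }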
463 //Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
464 _NEON2SSESTORAGE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
465 _NEON2SSESTORAGE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
466 _NEON2SSESTORAGE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
467 _NEON2SSESTORAGE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
468 _NEON2SSESTORAGE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
469 _NEON2SSESTORAGE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
//Vector wide add: vaddw -> Vr[i]:=Va[i]+Vb[i]
471 _NEON2SSESTORAGE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
472 _NEON2SSESTORAGE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
473 _NEON2SSESTORAGE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
474 _NEON2SSESTORAGE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
475 _NEON2SSESTORAGE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
476 _NEON2SSESTORAGE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
477 //Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1
478 _NEON2SSESTORAGE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
479 _NEON2SSESTORAGE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
480 _NEON2SSESTORAGE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
481 _NEON2SSESTORAGE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
482 _NEON2SSESTORAGE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0
483 _NEON2SSESTORAGE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
484 _NEON2SSESTORAGE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
485 _NEON2SSESTORAGE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
486 _NEON2SSESTORAGE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
487 _NEON2SSESTORAGE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
488 _NEON2SSESTORAGE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
489 _NEON2SSESTORAGE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
490 //Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1
491 _NEON2SSESTORAGE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
492 _NEON2SSESTORAGE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
493 _NEON2SSESTORAGE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
494 _NEON2SSESTORAGE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
495 _NEON2SSESTORAGE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
496 _NEON2SSESTORAGE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
497 _NEON2SSESTORAGE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
498 _NEON2SSESTORAGE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
499 _NEON2SSESTORAGE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
500 _NEON2SSESTORAGE uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
501 _NEON2SSESTORAGE uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
502 _NEON2SSESTORAGE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
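//Illustrative sketch (comments only): for unsigned 8- and 16-bit lanes the rounding halving add matches the
//SSE2 "average" instructions exactly, since _mm_avg_epu8/_mm_avg_epu16 compute (a+b+1)>>1 without intermediate
//overflow; a possible mapping, not necessarily the exact code used later in this file:
//
//    _NEON2SSE_INLINE uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b)
//    {
//        return _mm_avg_epu8(a, b);    // (a + b + 1) >> 1 per byte lane
//    }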
503 //Vector saturating add: vqadd -> Vr[i]:=sat<size>(Va[i]+Vb[i])
504 _NEON2SSESTORAGE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
505 _NEON2SSESTORAGE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
506 _NEON2SSESTORAGE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
507 _NEON2SSESTORAGE int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
508 _NEON2SSESTORAGE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
509 _NEON2SSESTORAGE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
510 _NEON2SSESTORAGE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
511 _NEON2SSESTORAGE uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
512 _NEON2SSESTORAGE int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
513 _NEON2SSESTORAGE int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
514 _NEON2SSESTORAGE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
515 _NEON2SSESTORAGE int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
516 _NEON2SSESTORAGE uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
517 _NEON2SSESTORAGE uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
518 _NEON2SSESTORAGE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
519 _NEON2SSESTORAGE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
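//Illustrative sketch (comments only): the 8- and 16-bit saturating adds have direct SSE2 counterparts
//(_mm_adds_epi8/_mm_adds_epi16 for signed, _mm_adds_epu8/_mm_adds_epu16 for unsigned), while 32- and 64-bit
//lanes have no such instruction and need extra work; a possible mapping, not necessarily the exact code
//used later in this file:
//
//    _NEON2SSE_INLINE int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b)
//    {
//        return _mm_adds_epi16(a, b);    // saturates each lane to [-32768, 32767]
//    }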
//Vector add high half (narrowing): vaddhn -> Vr[i]:= high half of (Va[i]+Vb[i])
521 _NEON2SSESTORAGE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
522 _NEON2SSESTORAGE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
523 _NEON2SSESTORAGE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
524 _NEON2SSESTORAGE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
525 _NEON2SSESTORAGE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
526 _NEON2SSESTORAGE uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
527 //Vector rounding add high half: vraddhn
528 _NEON2SSESTORAGE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
529 _NEON2SSESTORAGE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
530 _NEON2SSESTORAGE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
531 _NEON2SSESTORAGE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
532 _NEON2SSESTORAGE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
533 _NEON2SSESTORAGE uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
534 //Multiplication
535 //Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
536 _NEON2SSESTORAGE int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
537 _NEON2SSESTORAGE int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
538 _NEON2SSESTORAGE int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
539 _NEON2SSESTORAGE float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
540 _NEON2SSESTORAGE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
541 _NEON2SSESTORAGE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
542 _NEON2SSESTORAGE uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
543 _NEON2SSESTORAGE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
544 _NEON2SSESTORAGE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
545 _NEON2SSESTORAGE int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
546 _NEON2SSESTORAGE int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
547 _NEON2SSESTORAGE float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
548 _NEON2SSESTORAGE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
549 _NEON2SSESTORAGE uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
550 _NEON2SSESTORAGE uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
551 _NEON2SSESTORAGE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
552 //multiply lane
553 _NEON2SSESTORAGE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
554 _NEON2SSESTORAGE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
555 _NEON2SSESTORAGE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
556 _NEON2SSESTORAGE uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
557 _NEON2SSESTORAGE uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
558 _NEON2SSESTORAGE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c);
559 _NEON2SSESTORAGE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
560 _NEON2SSESTORAGE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
561 _NEON2SSESTORAGE uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
562 _NEON2SSESTORAGE uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
563 //Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]
564 _NEON2SSESTORAGE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
565 _NEON2SSESTORAGE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
566 _NEON2SSESTORAGE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
567 _NEON2SSESTORAGE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
568 _NEON2SSESTORAGE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
569 _NEON2SSESTORAGE uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
570 _NEON2SSESTORAGE uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
571 _NEON2SSESTORAGE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
572 _NEON2SSESTORAGE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
573 _NEON2SSESTORAGE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
574 _NEON2SSESTORAGE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
575 _NEON2SSESTORAGE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
576 _NEON2SSESTORAGE uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
577 _NEON2SSESTORAGE uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
578 //Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]
579 _NEON2SSESTORAGE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
580 _NEON2SSESTORAGE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
581 _NEON2SSESTORAGE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
582 _NEON2SSESTORAGE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
583 _NEON2SSESTORAGE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
584 _NEON2SSESTORAGE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
585 //Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i]
586 _NEON2SSESTORAGE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
587 _NEON2SSESTORAGE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
588 _NEON2SSESTORAGE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
589 _NEON2SSESTORAGE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
590 _NEON2SSESTORAGE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
591 _NEON2SSESTORAGE uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
592 _NEON2SSESTORAGE uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
593 _NEON2SSESTORAGE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
594 _NEON2SSESTORAGE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
595 _NEON2SSESTORAGE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
596 _NEON2SSESTORAGE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
597 _NEON2SSESTORAGE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
598 _NEON2SSESTORAGE uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
599 _NEON2SSESTORAGE uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
600 //Vector multiply subtract long
601 _NEON2SSESTORAGE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
602 _NEON2SSESTORAGE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
603 _NEON2SSESTORAGE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
604 _NEON2SSESTORAGE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
605 _NEON2SSESTORAGE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0
606 _NEON2SSESTORAGE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
607 //Vector saturating doubling multiply high
608 _NEON2SSESTORAGE int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
609 _NEON2SSESTORAGE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
610 _NEON2SSESTORAGE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
611 _NEON2SSESTORAGE int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
612 //Vector saturating rounding doubling multiply high
613 _NEON2SSESTORAGE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
614 _NEON2SSESTORAGE int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
615 _NEON2SSESTORAGE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
616 _NEON2SSESTORAGE int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
617 //Vector saturating doubling multiply accumulate long
618 _NEON2SSESTORAGE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
619 _NEON2SSESTORAGE int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
620 //Vector saturating doubling multiply subtract long
621 _NEON2SSESTORAGE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
622 _NEON2SSESTORAGE int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
623 //Vector long multiply
624 _NEON2SSESTORAGE int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
625 _NEON2SSESTORAGE int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
626 _NEON2SSESTORAGE int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
627 _NEON2SSESTORAGE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
628 _NEON2SSESTORAGE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
629 _NEON2SSESTORAGE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
630 _NEON2SSESTORAGE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
631 //Vector saturating doubling long multiply
632 _NEON2SSESTORAGE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
633 _NEON2SSESTORAGE int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
634 //Subtraction
635 //Vector subtract
636 _NEON2SSESTORAGE int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
637 _NEON2SSESTORAGE int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
638 _NEON2SSESTORAGE int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
639 _NEON2SSESTORAGE int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
640 _NEON2SSESTORAGE float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
641 _NEON2SSESTORAGE uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
642 _NEON2SSESTORAGE uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
643 _NEON2SSESTORAGE uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
644 _NEON2SSESTORAGE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
645 _NEON2SSESTORAGE int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
646 _NEON2SSESTORAGE int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
647 _NEON2SSESTORAGE int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
648 _NEON2SSESTORAGE int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
649 _NEON2SSESTORAGE float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
650 _NEON2SSESTORAGE uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
651 _NEON2SSESTORAGE uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
652 _NEON2SSESTORAGE uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
653 _NEON2SSESTORAGE uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
//Vector long subtract: vsubl -> Vr[i]:=Va[i]-Vb[i]
655 _NEON2SSESTORAGE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
656 _NEON2SSESTORAGE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
657 _NEON2SSESTORAGE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
658 _NEON2SSESTORAGE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
659 _NEON2SSESTORAGE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.U16 q0,d0,d0
660 _NEON2SSESTORAGE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
//Vector wide subtract: vsubw -> Vr[i]:=Va[i]-Vb[i]
662 _NEON2SSESTORAGE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
663 _NEON2SSESTORAGE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
664 _NEON2SSESTORAGE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
665 _NEON2SSESTORAGE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
666 _NEON2SSESTORAGE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.U16 q0,q0,d0
667 _NEON2SSESTORAGE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
668 //Vector saturating subtract
669 _NEON2SSESTORAGE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
670 _NEON2SSESTORAGE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
671 _NEON2SSESTORAGE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
672 _NEON2SSESTORAGE int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
673 _NEON2SSESTORAGE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
674 _NEON2SSESTORAGE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.U16 d0,d0,d0
675 _NEON2SSESTORAGE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
676 _NEON2SSESTORAGE uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
677 _NEON2SSESTORAGE int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
678 _NEON2SSESTORAGE int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
679 _NEON2SSESTORAGE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
680 _NEON2SSESTORAGE int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
681 _NEON2SSESTORAGE uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
682 _NEON2SSESTORAGE uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0
683 _NEON2SSESTORAGE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
684 _NEON2SSESTORAGE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0
685 //Vector halving subtract
686 _NEON2SSESTORAGE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
687 _NEON2SSESTORAGE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
688 _NEON2SSESTORAGE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
689 _NEON2SSESTORAGE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
690 _NEON2SSESTORAGE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.U16 d0,d0,d0
691 _NEON2SSESTORAGE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
692 _NEON2SSESTORAGE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
693 _NEON2SSESTORAGE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
694 _NEON2SSESTORAGE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
695 _NEON2SSESTORAGE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
696 _NEON2SSESTORAGE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0
697 _NEON2SSESTORAGE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
698 //Vector subtract high half
699 _NEON2SSESTORAGE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
700 _NEON2SSESTORAGE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
701 _NEON2SSESTORAGE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
702 _NEON2SSESTORAGE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
703 _NEON2SSESTORAGE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
704 _NEON2SSESTORAGE uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
705 //Vector rounding subtract high half
706 _NEON2SSESTORAGE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
707 _NEON2SSESTORAGE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
708 _NEON2SSESTORAGE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
709 _NEON2SSESTORAGE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
710 _NEON2SSESTORAGE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
711 _NEON2SSESTORAGE uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
712 //Comparison
713 //Vector compare equal
714 _NEON2SSESTORAGE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
715 _NEON2SSESTORAGE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
716 _NEON2SSESTORAGE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
717 _NEON2SSESTORAGE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
718 _NEON2SSESTORAGE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
719 _NEON2SSESTORAGE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
720 _NEON2SSESTORAGE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
721 _NEON2SSESTORAGE uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
722 _NEON2SSESTORAGE uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
723 _NEON2SSESTORAGE uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
724 _NEON2SSESTORAGE uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
725 _NEON2SSESTORAGE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
726 _NEON2SSESTORAGE uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
727 _NEON2SSESTORAGE uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
728 _NEON2SSESTORAGE uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
729 _NEON2SSESTORAGE uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
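//Illustrative sketch (comments only): equality compares map naturally to the SSE compare instructions, which
//already produce all-ones / all-zeros lanes just like NEON; a possible mapping, not necessarily the exact
//code used later in this file:
//
//    _NEON2SSE_INLINE uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b)
//    {
//        return _mm_cmpeq_epi32(a, b);         // 0xFFFFFFFF where equal, 0 otherwise
//    }
//    _NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b)
//    {
//        return _M128i(_mm_cmpeq_ps(a, b));    // reinterpret the float compare mask as an integer vector
//    }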
730 //Vector compare greater-than or equal
731 _NEON2SSESTORAGE uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
732 _NEON2SSESTORAGE uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
733 _NEON2SSESTORAGE uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
734 _NEON2SSESTORAGE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
735 _NEON2SSESTORAGE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
736 _NEON2SSESTORAGE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
737 _NEON2SSESTORAGE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
738 _NEON2SSESTORAGE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
739 _NEON2SSESTORAGE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
740 _NEON2SSESTORAGE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
741 _NEON2SSESTORAGE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
742 _NEON2SSESTORAGE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
743 _NEON2SSESTORAGE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
744 _NEON2SSESTORAGE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
745 //Vector compare less-than or equal
746 _NEON2SSESTORAGE uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
747 _NEON2SSESTORAGE uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
748 _NEON2SSESTORAGE uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
749 _NEON2SSESTORAGE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
750 _NEON2SSESTORAGE uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
751 _NEON2SSESTORAGE uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
752 _NEON2SSESTORAGE uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
753 _NEON2SSESTORAGE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
754 _NEON2SSESTORAGE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
755 _NEON2SSESTORAGE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
756 _NEON2SSESTORAGE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
757 _NEON2SSESTORAGE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
758 _NEON2SSESTORAGE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
759 _NEON2SSESTORAGE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
760 //Vector compare greater-than
761 _NEON2SSESTORAGE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
762 _NEON2SSESTORAGE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
763 _NEON2SSESTORAGE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
764 _NEON2SSESTORAGE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
765 _NEON2SSESTORAGE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
766 _NEON2SSESTORAGE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
767 _NEON2SSESTORAGE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
768 _NEON2SSESTORAGE uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
769 _NEON2SSESTORAGE uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
770 _NEON2SSESTORAGE uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
771 _NEON2SSESTORAGE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
772 _NEON2SSESTORAGE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
773 _NEON2SSESTORAGE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
774 _NEON2SSESTORAGE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
775 //Vector compare less-than
776 _NEON2SSESTORAGE uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
777 _NEON2SSESTORAGE uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
778 _NEON2SSESTORAGE uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
779 _NEON2SSESTORAGE uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
780 _NEON2SSESTORAGE uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
781 _NEON2SSESTORAGE uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
782 _NEON2SSESTORAGE uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
783 _NEON2SSESTORAGE uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
784 _NEON2SSESTORAGE uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
785 _NEON2SSESTORAGE uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
786 _NEON2SSESTORAGE uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
787 _NEON2SSESTORAGE uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
788 _NEON2SSESTORAGE uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
789 _NEON2SSESTORAGE uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
790 //Vector compare absolute greater-than or equal
791 _NEON2SSESTORAGE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
792 _NEON2SSESTORAGE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
793 //Vector compare absolute less-than or equal
794 _NEON2SSESTORAGE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
795 _NEON2SSESTORAGE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
796 //Vector compare absolute greater-than
797 _NEON2SSESTORAGE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
798 _NEON2SSESTORAGE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
799 //Vector compare absolute less-than
800 _NEON2SSESTORAGE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
801 _NEON2SSESTORAGE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
802 //Vector test bits
803 _NEON2SSESTORAGE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
804 _NEON2SSESTORAGE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
805 _NEON2SSESTORAGE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
806 _NEON2SSESTORAGE uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
807 _NEON2SSESTORAGE uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
808 _NEON2SSESTORAGE uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
809 _NEON2SSESTORAGE uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
810 _NEON2SSESTORAGE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
811 _NEON2SSESTORAGE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
812 _NEON2SSESTORAGE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
813 _NEON2SSESTORAGE uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
814 _NEON2SSESTORAGE uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
815 _NEON2SSESTORAGE uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
816 _NEON2SSESTORAGE uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
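//Illustrative sketch (comments only): vtst sets a lane to all ones when (Va[i] & Vb[i]) != 0; SSE has no
//per-lane "test" instruction, so one possible way is AND, compare with zero, then invert the mask; this is
//just a sketch, not necessarily the code used later in this file:
//
//    _NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b)
//    {
//        __m128i and_ab  = _mm_and_si128(a, b);
//        __m128i is_zero = _mm_cmpeq_epi32(and_ab, _mm_setzero_si128());    // ones where (a & b) == 0
//        return _mm_xor_si128(is_zero, _mm_cmpeq_epi32(a, a));              // invert (cmpeq(a,a) is all ones)
//    }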
817 //Absolute difference
818 //Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |
819 _NEON2SSESTORAGE int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
820 _NEON2SSESTORAGE int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
821 _NEON2SSESTORAGE int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
822 _NEON2SSESTORAGE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
823 _NEON2SSESTORAGE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.U16 d0,d0,d0
824 _NEON2SSESTORAGE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
825 _NEON2SSESTORAGE float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
826 _NEON2SSESTORAGE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
827 _NEON2SSESTORAGE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
828 _NEON2SSESTORAGE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
829 _NEON2SSESTORAGE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
830 _NEON2SSESTORAGE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0
831 _NEON2SSESTORAGE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
832 _NEON2SSESTORAGE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
833 //Absolute difference - long
834 _NEON2SSESTORAGE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
835 _NEON2SSESTORAGE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
836 _NEON2SSESTORAGE int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
837 _NEON2SSESTORAGE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
838 _NEON2SSESTORAGE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.U16 q0,d0,d0
839 _NEON2SSESTORAGE uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
840 //Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] |
841 _NEON2SSESTORAGE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
842 _NEON2SSESTORAGE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
843 _NEON2SSESTORAGE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
844 _NEON2SSESTORAGE uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
845 _NEON2SSESTORAGE uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.U16 d0,d0,d0
846 _NEON2SSESTORAGE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
847 _NEON2SSESTORAGE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
848 _NEON2SSESTORAGE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
849 _NEON2SSESTORAGE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
850 _NEON2SSESTORAGE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
851 _NEON2SSESTORAGE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0
852 _NEON2SSESTORAGE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
853 //Absolute difference and accumulate - long
854 _NEON2SSESTORAGE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
855 _NEON2SSESTORAGE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
856 _NEON2SSESTORAGE int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
857 _NEON2SSESTORAGE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
858 _NEON2SSESTORAGE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.U16 q0,d0,d0
859 _NEON2SSESTORAGE uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
860 //Max/Min
861 //vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]
862 _NEON2SSESTORAGE int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
863 _NEON2SSESTORAGE int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
864 _NEON2SSESTORAGE int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
865 _NEON2SSESTORAGE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
866 _NEON2SSESTORAGE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.U16 d0,d0,d0
867 _NEON2SSESTORAGE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
868 _NEON2SSESTORAGE float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
869 _NEON2SSESTORAGE int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
870 _NEON2SSESTORAGE int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
871 _NEON2SSESTORAGE int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
872 _NEON2SSESTORAGE uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
873 _NEON2SSESTORAGE uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0
874 _NEON2SSESTORAGE uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
875 _NEON2SSESTORAGE float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
876
877 _NEON2SSESTORAGE float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0
878
879 //vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i]
880 _NEON2SSESTORAGE int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
881 _NEON2SSESTORAGE int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
882 _NEON2SSESTORAGE int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
883 _NEON2SSESTORAGE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
884 _NEON2SSESTORAGE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.U16 d0,d0,d0
885 _NEON2SSESTORAGE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
886 _NEON2SSESTORAGE float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
887 _NEON2SSESTORAGE int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
888 _NEON2SSESTORAGE int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
889 _NEON2SSESTORAGE int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
890 _NEON2SSESTORAGE uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
891 _NEON2SSESTORAGE uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0
892 _NEON2SSESTORAGE uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
893 _NEON2SSESTORAGE float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
894
895 _NEON2SSESTORAGE float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0
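//A common use of vmin/vmax is clamping every lane to a [lo, hi] range. Sketch only, not compiled;
//vdupq_n_f32 is the standard NEON broadcast intrinsic, declared elsewhere in this header.
#if 0
static float32x4_t example_clamp_f32(float32x4_t x, float lo, float hi)
{
    return vminq_f32(vmaxq_f32(x, vdupq_n_f32(lo)), vdupq_n_f32(hi));
}
#endif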
896
897 //Pairwise addition
898 //Pairwise add
899 _NEON2SSESTORAGE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
900 _NEON2SSESTORAGE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
901 _NEON2SSESTORAGE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
902 _NEON2SSESTORAGE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
903 _NEON2SSESTORAGE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
904 _NEON2SSESTORAGE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
905 _NEON2SSESTORAGE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
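//A classic vpadd use is a horizontal sum: pairwise-add the low and high halves of a q register, then
//pairwise-add once more. Sketch only, not compiled; vget_low/vget_high/vget_lane are the standard NEON
//intrinsics, also declared in this header.
#if 0
static float example_hsum_f32(float32x4_t v)
{
    float32x2_t s2 = vpadd_f32(vget_low_f32(v), vget_high_f32(v)); //{v0+v1, v2+v3}
    float32x2_t s1 = vpadd_f32(s2, s2);                            //{total, total}
    return vget_lane_f32(s1, 0);
}
#endif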
906 //Long pairwise add
907 _NEON2SSESTORAGE int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
908 _NEON2SSESTORAGE int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
909 _NEON2SSESTORAGE int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
910 _NEON2SSESTORAGE uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
911 _NEON2SSESTORAGE uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0
912 _NEON2SSESTORAGE uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
913 _NEON2SSESTORAGE int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
914 _NEON2SSESTORAGE int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
915 _NEON2SSESTORAGE int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
916 _NEON2SSESTORAGE uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
917 _NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0
918 _NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
919 //Long pairwise add and accumulate
920 _NEON2SSESTORAGE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
921 _NEON2SSESTORAGE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
922 _NEON2SSESTORAGE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
923 _NEON2SSESTORAGE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
924 _NEON2SSESTORAGE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.U16 d0,d0
925 _NEON2SSESTORAGE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
926 _NEON2SSESTORAGE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
927 _NEON2SSESTORAGE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
928 _NEON2SSESTORAGE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
929 _NEON2SSESTORAGE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
930 _NEON2SSESTORAGE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0
931 _NEON2SSESTORAGE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
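//vpaddl widens while adding adjacent pairs and vpadal also accumulates into the wider lanes - the usual
//way to sum many 8-bit elements while deferring overflow. Sketch only, not compiled; the loop shape and
//names are hypothetical.
#if 0
static uint16x8_t example_sum_bytes(uint16x8_t acc, const uint8_t * p, int blocks)
{
    int i;
    for (i = 0; i < blocks; i++, p += 16)
        acc = vpadalq_u8(acc, vld1q_u8(p)); //acc[j] += p[2j] + p[2j+1], widened to 16 bits
    return acc;
}
#endif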
932 //Folding maximum vpmax -> takes maximum of adjacent pairs
933 _NEON2SSESTORAGE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
934 _NEON2SSESTORAGE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
935 _NEON2SSESTORAGE int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
936 _NEON2SSESTORAGE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
937 _NEON2SSESTORAGE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 d0,d0,d0
938 _NEON2SSESTORAGE uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
939 _NEON2SSESTORAGE float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
940 //Folding minimum vpmin -> takes minimum of adjacent pairs
941 _NEON2SSESTORAGE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
942 _NEON2SSESTORAGE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
943 _NEON2SSESTORAGE int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
944 _NEON2SSESTORAGE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
945 _NEON2SSESTORAGE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0
946 _NEON2SSESTORAGE uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
947 _NEON2SSESTORAGE float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
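//Folding max/min reduce a vector the same way pairwise add does: repeat until one lane is left.
//Sketch only, not compiled; vget_lane_u8 is the standard lane-extraction intrinsic.
#if 0
static uint8_t example_hmax_u8(uint8x8_t v)
{
    v = vpmax_u8(v, v); //8 -> 4 meaningful lanes
    v = vpmax_u8(v, v); //4 -> 2
    v = vpmax_u8(v, v); //2 -> 1
    return vget_lane_u8(v, 0);
}
#endif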
//Reciprocal / reciprocal square root step functions (Newton-Raphson refinement steps)
949 _NEON2SSESTORAGE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
950 _NEON2SSESTORAGE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
951 _NEON2SSESTORAGE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
952 _NEON2SSESTORAGE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
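//vrecps/vrsqrts are Newton-Raphson step functions: they refine the rough estimates produced by
//vrecpe/vrsqrte (declared further down in this header). Sketch only, not compiled; two refinement
//steps are shown, the accuracy actually needed is application dependent.
#if 0
static float32x4_t example_reciprocal_f32(float32x4_t x)
{
    float32x4_t r = vrecpeq_f32(x);      //initial estimate of 1/x
    r = vmulq_f32(r, vrecpsq_f32(x, r)); //r = r * (2 - x*r)
    r = vmulq_f32(r, vrecpsq_f32(x, r)); //second refinement step
    return r;
}
#endif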
953 //Shifts by signed variable
954 //Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right)
955 _NEON2SSESTORAGE int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
956 _NEON2SSESTORAGE int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
957 _NEON2SSESTORAGE int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
958 _NEON2SSESTORAGE int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
959 _NEON2SSESTORAGE uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
960 _NEON2SSESTORAGE uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0
961 _NEON2SSESTORAGE uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
962 _NEON2SSESTORAGE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
963 _NEON2SSESTORAGE int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
964 _NEON2SSESTORAGE int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
965 _NEON2SSESTORAGE int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
966 _NEON2SSESTORAGE int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
967 _NEON2SSESTORAGE uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
968 _NEON2SSESTORAGE uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0
969 _NEON2SSESTORAGE uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
970 _NEON2SSESTORAGE uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
971 //Vector saturating shift left: (negative values shift right)
972 _NEON2SSESTORAGE int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
973 _NEON2SSESTORAGE int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
974 _NEON2SSESTORAGE int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
975 _NEON2SSESTORAGE int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
976 _NEON2SSESTORAGE uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
977 _NEON2SSESTORAGE uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0
978 _NEON2SSESTORAGE uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
979 _NEON2SSESTORAGE uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
980 _NEON2SSESTORAGE int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
981 _NEON2SSESTORAGE int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
982 _NEON2SSESTORAGE int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
983 _NEON2SSESTORAGE int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
984 _NEON2SSESTORAGE uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
985 _NEON2SSESTORAGE uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0
986 _NEON2SSESTORAGE uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
987 _NEON2SSESTORAGE uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
988 //Vector rounding shift left: (negative values shift right)
989 _NEON2SSESTORAGE int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
990 _NEON2SSESTORAGE int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
991 _NEON2SSESTORAGE int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
992 _NEON2SSESTORAGE int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
993 _NEON2SSESTORAGE uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
994 _NEON2SSESTORAGE uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0
995 _NEON2SSESTORAGE uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
996 _NEON2SSESTORAGE uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
997 _NEON2SSESTORAGE int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
998 _NEON2SSESTORAGE int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
999 _NEON2SSESTORAGE int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
1000 _NEON2SSESTORAGE int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
1001 _NEON2SSESTORAGE uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
1002 _NEON2SSESTORAGE uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0
1003 _NEON2SSESTORAGE uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
1004 _NEON2SSESTORAGE uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
1005 //Vector saturating rounding shift left: (negative values shift right)
1006 _NEON2SSESTORAGE int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
1007 _NEON2SSESTORAGE int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
1008 _NEON2SSESTORAGE int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
1009 _NEON2SSESTORAGE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
1010 _NEON2SSESTORAGE uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
1011 _NEON2SSESTORAGE uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0
1012 _NEON2SSESTORAGE uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
1013 _NEON2SSESTORAGE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
1014 _NEON2SSESTORAGE int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
1015 _NEON2SSESTORAGE int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
1016 _NEON2SSESTORAGE int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
1017 _NEON2SSESTORAGE int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
1018 _NEON2SSESTORAGE uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
1019 _NEON2SSESTORAGE uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0
1020 _NEON2SSESTORAGE uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
1021 _NEON2SSESTORAGE uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
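//The per-lane shift count for these functions is signed: positive counts shift left, negative counts
//shift right, so one count vector can scale different lanes in different directions. Sketch only,
//not compiled; the count values are hypothetical.
#if 0
static int16x8_t example_variable_shift(int16x8_t v)
{
    const int16_t counts[8] = { 1, 1, 1, 1, -2, -2, -2, -2 }; //lanes 0-3: <<1, lanes 4-7: >>2
    return vshlq_s16(v, vld1q_s16(counts));
}
#endif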
1022 //Shifts by a constant
1023 //Vector shift right by constant
1024 _NEON2SSESTORAGE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
1025 _NEON2SSESTORAGE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
1026 _NEON2SSESTORAGE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
1027 _NEON2SSESTORAGE int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
1028 _NEON2SSESTORAGE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
1029 _NEON2SSESTORAGE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16
1030 _NEON2SSESTORAGE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
1031 _NEON2SSESTORAGE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
1032 _NEON2SSESTORAGE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
1033 _NEON2SSESTORAGE int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
1034 _NEON2SSESTORAGE int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
1035 _NEON2SSESTORAGE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
1036 _NEON2SSESTORAGE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
1037 _NEON2SSESTORAGE uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16
1038 _NEON2SSESTORAGE uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
1039 _NEON2SSESTORAGE uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
1040 //Vector shift left by constant
1041 _NEON2SSESTORAGE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
1042 _NEON2SSESTORAGE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
1043 _NEON2SSESTORAGE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
1044 _NEON2SSESTORAGE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
1045 _NEON2SSESTORAGE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
1046 _NEON2SSESTORAGE uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
1047 _NEON2SSESTORAGE uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
1048 _NEON2SSESTORAGE uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
1049 _NEON2SSESTORAGE int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
1050 _NEON2SSESTORAGE int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
1051 _NEON2SSESTORAGE int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
1052 _NEON2SSESTORAGE int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
1053 _NEON2SSESTORAGE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
1054 _NEON2SSESTORAGE uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
1055 _NEON2SSESTORAGE uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
1056 _NEON2SSESTORAGE uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
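//Note the different immediate ranges: right shifts accept 1..element_size, left shifts accept
//0..element_size-1. A small fixed-point sketch (not compiled): multiply by 3/4 with truncation.
#if 0
static uint16x8_t example_three_quarters_u16(uint16x8_t v)
{
    return vsubq_u16(v, vshrq_n_u16(v, 2)); //v - v/4
}
#endif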
1057 //Vector rounding shift right by constant
1058 _NEON2SSESTORAGE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
1059 _NEON2SSESTORAGE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
1060 _NEON2SSESTORAGE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
1061 _NEON2SSESTORAGE int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
1062 _NEON2SSESTORAGE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
1063 _NEON2SSESTORAGE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16
1064 _NEON2SSESTORAGE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
1065 _NEON2SSESTORAGE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
1066 _NEON2SSESTORAGE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
1067 _NEON2SSESTORAGE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
1068 _NEON2SSESTORAGE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
1069 _NEON2SSESTORAGE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
1070 _NEON2SSESTORAGE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
1071 _NEON2SSESTORAGE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16
1072 _NEON2SSESTORAGE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
1073 _NEON2SSESTORAGE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
1074 //Vector shift right by constant and accumulate
1075 _NEON2SSESTORAGE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
1076 _NEON2SSESTORAGE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
1077 _NEON2SSESTORAGE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
1078 _NEON2SSESTORAGE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
1079 _NEON2SSESTORAGE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
1080 _NEON2SSESTORAGE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16
1081 _NEON2SSESTORAGE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
1082 _NEON2SSESTORAGE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
1083 _NEON2SSESTORAGE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
1084 _NEON2SSESTORAGE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
1085 _NEON2SSESTORAGE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
1086 _NEON2SSESTORAGE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
1087 _NEON2SSESTORAGE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
1088 _NEON2SSESTORAGE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16
1089 _NEON2SSESTORAGE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
1090 _NEON2SSESTORAGE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
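//Shift-right-and-accumulate fuses "scale down, then add to a running sum" into one operation.
//Sketch only, not compiled; the scaling factor of 2^4 is hypothetical.
#if 0
static uint32x4_t example_vsraq(uint32x4_t acc, uint32x4_t sample)
{
    return vsraq_n_u32(acc, sample, 4); //acc[i] += sample[i] >> 4
}
#endif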
1091 //Vector rounding shift right by constant and accumulate
1092 _NEON2SSESTORAGE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
1093 _NEON2SSESTORAGE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
1094 _NEON2SSESTORAGE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
1095 _NEON2SSESTORAGE int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
1096 _NEON2SSESTORAGE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
1097 _NEON2SSESTORAGE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16
1098 _NEON2SSESTORAGE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
1099 _NEON2SSESTORAGE uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
1100 _NEON2SSESTORAGE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
1101 _NEON2SSESTORAGE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
1102 _NEON2SSESTORAGE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
1103 _NEON2SSESTORAGE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
1104 _NEON2SSESTORAGE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
1105 _NEON2SSESTORAGE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16
1106 _NEON2SSESTORAGE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
1107 _NEON2SSESTORAGE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
1108 //Vector saturating shift left by constant
1109 _NEON2SSESTORAGE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
1110 _NEON2SSESTORAGE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
1111 _NEON2SSESTORAGE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
1112 _NEON2SSESTORAGE int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
1113 _NEON2SSESTORAGE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
1114 _NEON2SSESTORAGE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0
1115 _NEON2SSESTORAGE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
1116 _NEON2SSESTORAGE uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
1117 _NEON2SSESTORAGE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
1118 _NEON2SSESTORAGE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
1119 _NEON2SSESTORAGE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
1120 _NEON2SSESTORAGE int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
1121 _NEON2SSESTORAGE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
1122 _NEON2SSESTORAGE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0
1123 _NEON2SSESTORAGE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
1124 _NEON2SSESTORAGE uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
1125 //Vector signed->unsigned saturating shift left by constant
1126 _NEON2SSESTORAGE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
1127 _NEON2SSESTORAGE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
1128 _NEON2SSESTORAGE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
1129 _NEON2SSESTORAGE uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
1130 _NEON2SSESTORAGE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
1131 _NEON2SSESTORAGE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
1132 _NEON2SSESTORAGE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
1133 _NEON2SSESTORAGE uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
1134 //Vector narrowing shift right by constant
1135 _NEON2SSESTORAGE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
1136 _NEON2SSESTORAGE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
1137 _NEON2SSESTORAGE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
1138 _NEON2SSESTORAGE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
1139 _NEON2SSESTORAGE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
1140 _NEON2SSESTORAGE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
1141 //Vector signed->unsigned narrowing saturating shift right by constant
1142 _NEON2SSESTORAGE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
1143 _NEON2SSESTORAGE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
1144 _NEON2SSESTORAGE uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
1145 //Vector signed->unsigned rounding narrowing saturating shift right by constant
1146 _NEON2SSESTORAGE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
1147 _NEON2SSESTORAGE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
1148 _NEON2SSESTORAGE uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
1149 //Vector narrowing saturating shift right by constant
1150 _NEON2SSESTORAGE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
1151 _NEON2SSESTORAGE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
1152 _NEON2SSESTORAGE int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
1153 _NEON2SSESTORAGE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8
1154 _NEON2SSESTORAGE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
1155 _NEON2SSESTORAGE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
1156 //Vector rounding narrowing shift right by constant
1157 _NEON2SSESTORAGE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
1158 _NEON2SSESTORAGE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
1159 _NEON2SSESTORAGE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
1160 _NEON2SSESTORAGE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
1161 _NEON2SSESTORAGE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
1162 _NEON2SSESTORAGE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
1163 //Vector rounding narrowing saturating shift right by constant
1164 _NEON2SSESTORAGE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
1165 _NEON2SSESTORAGE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
1166 _NEON2SSESTORAGE int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
1167 _NEON2SSESTORAGE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8
1168 _NEON2SSESTORAGE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
1169 _NEON2SSESTORAGE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
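//The narrowing shifts are the usual way back from a widened fixed-point intermediate, e.g. a Q15
//multiply gives 32-bit products that are rounded, shifted and saturated back to 16 bits.
//Sketch only, not compiled; the Q-format choice is an assumption.
#if 0
static int16x4_t example_q15_narrow(int32x4_t product_q30)
{
    return vqrshrn_n_s32(product_q30, 15); //round, shift right by 15, saturate to int16
}
#endif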
1170 //Vector widening shift left by constant
1171 _NEON2SSESTORAGE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
1172 _NEON2SSESTORAGE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
1173 _NEON2SSESTORAGE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
1174 _NEON2SSESTORAGE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
1175 _NEON2SSESTORAGE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0
1176 _NEON2SSESTORAGE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
1177 //Shifts with insert
1178 //Vector shift right and insert
1179 _NEON2SSESTORAGE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
1180 _NEON2SSESTORAGE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
1181 _NEON2SSESTORAGE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
1182 _NEON2SSESTORAGE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
1183 _NEON2SSESTORAGE uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
1184 _NEON2SSESTORAGE uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
1185 _NEON2SSESTORAGE uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
1186 _NEON2SSESTORAGE uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
1187 _NEON2SSESTORAGE poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
1188 _NEON2SSESTORAGE poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
1189 _NEON2SSESTORAGE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
1190 _NEON2SSESTORAGE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
1191 _NEON2SSESTORAGE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
1192 _NEON2SSESTORAGE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
1193 _NEON2SSESTORAGE uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
1194 _NEON2SSESTORAGE uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
1195 _NEON2SSESTORAGE uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
1196 _NEON2SSESTORAGE uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
1197 _NEON2SSESTORAGE poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
1198 _NEON2SSESTORAGE poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
1199 //Vector shift left and insert
1200 _NEON2SSESTORAGE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
1201 _NEON2SSESTORAGE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
1202 _NEON2SSESTORAGE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
1203 _NEON2SSESTORAGE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
1204 _NEON2SSESTORAGE uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
1205 _NEON2SSESTORAGE uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
1206 _NEON2SSESTORAGE uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
1207 _NEON2SSESTORAGE uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
1208 _NEON2SSESTORAGE poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
1209 _NEON2SSESTORAGE poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
1210 _NEON2SSESTORAGE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
1211 _NEON2SSESTORAGE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
1212 _NEON2SSESTORAGE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
1213 _NEON2SSESTORAGE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
1214 _NEON2SSESTORAGE uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
1215 _NEON2SSESTORAGE uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
1216 _NEON2SSESTORAGE uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
1217 _NEON2SSESTORAGE uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
1218 _NEON2SSESTORAGE poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
1219 _NEON2SSESTORAGE poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
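//Shift-with-insert preserves the destination bits that the shifted value does not cover, so bit fields
//can be packed without separate mask/or steps. Sketch only, not compiled: packing RGB565 pixels,
//assuming each channel vector already holds only 5/6/5 significant bits.
#if 0
static uint16x8_t example_pack_rgb565(uint16x8_t r5, uint16x8_t g6, uint16x8_t b5)
{
    uint16x8_t px = b5;           //blue in bits 4..0
    px = vsliq_n_u16(px, g6, 5);  //insert green into bits 10..5, keep bits 4..0
    px = vsliq_n_u16(px, r5, 11); //insert red into bits 15..11, keep bits 10..0
    return px;
}
#endif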
//Loads of a single vector or lane. These functions load a single vector, or a single lane, of some type; the matching stores follow below.
1221 //Load a single vector from memory
1222 _NEON2SSESTORAGE uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
1223 _NEON2SSESTORAGE uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
1224 _NEON2SSESTORAGE uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
1225 _NEON2SSESTORAGE uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1226 _NEON2SSESTORAGE int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
1227 _NEON2SSESTORAGE int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
1228 _NEON2SSESTORAGE int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
1229 _NEON2SSESTORAGE int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1230 _NEON2SSESTORAGE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
1231 _NEON2SSESTORAGE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
1232 _NEON2SSESTORAGE poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
1233 _NEON2SSESTORAGE poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
1234 _NEON2SSESTORAGE uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
1235 _NEON2SSESTORAGE uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
1236 _NEON2SSESTORAGE uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
1237 _NEON2SSESTORAGE uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
1238 _NEON2SSESTORAGE int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
1239 _NEON2SSESTORAGE int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
1240 _NEON2SSESTORAGE int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
1241 _NEON2SSESTORAGE int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
1242 _NEON2SSESTORAGE float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
1243 _NEON2SSESTORAGE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
1244 _NEON2SSESTORAGE poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
1245 _NEON2SSESTORAGE poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
1246
_NEON2SSESTORAGE float64x2_t vld1q_f64(__transfersize(2) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1248
1249 //Load a single lane from memory
1250 _NEON2SSESTORAGE uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
1251 _NEON2SSESTORAGE uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
1252 _NEON2SSESTORAGE uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
1253 _NEON2SSESTORAGE uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
1254 _NEON2SSESTORAGE int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
1255 _NEON2SSESTORAGE int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
1256 _NEON2SSESTORAGE int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0]
1257 _NEON2SSESTORAGE float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
1258 _NEON2SSESTORAGE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
1259 _NEON2SSESTORAGE int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0]
1260 _NEON2SSESTORAGE poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
1261 _NEON2SSESTORAGE poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
1262 _NEON2SSESTORAGE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
1263 _NEON2SSESTORAGE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
1264 _NEON2SSESTORAGE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
1265 _NEON2SSESTORAGE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
1266 _NEON2SSESTORAGE int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8{d0[0]}, [r0]
1267 _NEON2SSESTORAGE int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
1268 _NEON2SSESTORAGE int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
_NEON2SSESTORAGE float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
1270 _NEON2SSESTORAGE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
1271 _NEON2SSESTORAGE int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
1272 _NEON2SSESTORAGE poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
1273 _NEON2SSESTORAGE poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
1274 //Load all lanes of vector with same value from memory
1275 _NEON2SSESTORAGE uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1276 _NEON2SSESTORAGE uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1277 _NEON2SSESTORAGE uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1278 _NEON2SSESTORAGE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
1279 _NEON2SSESTORAGE int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1280 _NEON2SSESTORAGE int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1281 _NEON2SSESTORAGE int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1282 _NEON2SSESTORAGE int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
1283 _NEON2SSESTORAGE float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
1284 _NEON2SSESTORAGE float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1285 _NEON2SSESTORAGE poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1286 _NEON2SSESTORAGE poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1287 _NEON2SSESTORAGE uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1288 _NEON2SSESTORAGE uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1289 _NEON2SSESTORAGE uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1290 _NEON2SSESTORAGE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
1291 _NEON2SSESTORAGE int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1292 _NEON2SSESTORAGE int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
1293 _NEON2SSESTORAGE int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1294 _NEON2SSESTORAGE int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
1295 _NEON2SSESTORAGE float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
1296 _NEON2SSESTORAGE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
1297 _NEON2SSESTORAGE poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
1298 _NEON2SSESTORAGE poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
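//vld1 loads a whole vector, vld1_dup broadcasts one scalar to every lane and vld1_lane replaces a
//single lane of an existing vector. Sketch only, not compiled; the buffer names are hypothetical and
//vmulq_f32 is the standard NEON multiply, declared elsewhere in this header.
#if 0
static float32x4_t example_loads(const float32_t * coeffs, const float32_t * gain)
{
    float32x4_t c = vld1q_f32(coeffs);   //4 consecutive floats
    float32x4_t g = vld1q_dup_f32(gain); //*gain replicated into all 4 lanes
    return vmulq_f32(c, g);
}
#endif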
1299 //Store a single vector or lane. Stores all lanes or a single lane of a vector.
1300 //Store a single vector into memory
1301 _NEON2SSESTORAGE void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
1302 _NEON2SSESTORAGE void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
1303 _NEON2SSESTORAGE void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
1304 _NEON2SSESTORAGE void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
1305 _NEON2SSESTORAGE void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
1306 _NEON2SSESTORAGE void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
1307 _NEON2SSESTORAGE void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
1308 _NEON2SSESTORAGE void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
1309 _NEON2SSESTORAGE void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
1310 _NEON2SSESTORAGE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
1311 _NEON2SSESTORAGE void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
1312 _NEON2SSESTORAGE void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
1313 _NEON2SSESTORAGE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
1314 _NEON2SSESTORAGE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
1315 _NEON2SSESTORAGE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
1316 _NEON2SSESTORAGE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
1317 _NEON2SSESTORAGE void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
1318 _NEON2SSESTORAGE void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
1319 _NEON2SSESTORAGE void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
1320 _NEON2SSESTORAGE void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
1321 _NEON2SSESTORAGE void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
1322 _NEON2SSESTORAGE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
1323 _NEON2SSESTORAGE void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
1324 _NEON2SSESTORAGE void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
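//Stores mirror the loads; a typical kernel loads a full vector, works in registers and stores the
//result back. Sketch only, not compiled; len is assumed to be a multiple of 4.
#if 0
static void example_scale_buffer(float32_t * dst, const float32_t * src, int len, float32_t k)
{
    int i;
    float32x4_t vk = vdupq_n_f32(k);
    for (i = 0; i < len; i += 4)
        vst1q_f32(dst + i, vmulq_f32(vld1q_f32(src + i), vk));
}
#endif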
1325 //Store a lane of a vector into memory
1326 //Loads of an N-element structure
1327 //Load N-element structure from memory
1328 _NEON2SSESTORAGE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
1329 _NEON2SSESTORAGE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
1330 _NEON2SSESTORAGE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
1331 _NEON2SSESTORAGE int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
1332 _NEON2SSESTORAGE int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
1333 _NEON2SSESTORAGE int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
1334 _NEON2SSESTORAGE float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
1335 _NEON2SSESTORAGE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
1336 _NEON2SSESTORAGE poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
1337 _NEON2SSESTORAGE poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
1338 _NEON2SSESTORAGE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
1339 _NEON2SSESTORAGE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
1340 _NEON2SSESTORAGE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
1341 _NEON2SSESTORAGE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1342 _NEON2SSESTORAGE int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
1343 _NEON2SSESTORAGE int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
1344 _NEON2SSESTORAGE int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
1345 _NEON2SSESTORAGE int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1346 //float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
1347 _NEON2SSESTORAGE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
1348 _NEON2SSESTORAGE poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
1349 _NEON2SSESTORAGE poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
1350 _NEON2SSESTORAGE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
1351 _NEON2SSESTORAGE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
1352 _NEON2SSESTORAGE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
1353 _NEON2SSESTORAGE int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
1354 _NEON2SSESTORAGE int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
1355 _NEON2SSESTORAGE int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
1356 _NEON2SSESTORAGE float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
1357 _NEON2SSESTORAGE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
_NEON2SSESTORAGE poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
1359 _NEON2SSESTORAGE poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
1360 _NEON2SSESTORAGE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
1361 _NEON2SSESTORAGE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
1362 _NEON2SSESTORAGE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
1363 _NEON2SSESTORAGE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
1364 _NEON2SSESTORAGE int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
1365 _NEON2SSESTORAGE int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
1366 _NEON2SSESTORAGE int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
_NEON2SSESTORAGE int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
1368 _NEON2SSESTORAGE float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
1369 _NEON2SSESTORAGE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
1370 _NEON2SSESTORAGE poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
1371 _NEON2SSESTORAGE poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
1372 _NEON2SSESTORAGE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
1373 _NEON2SSESTORAGE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
1374 _NEON2SSESTORAGE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
1375 _NEON2SSESTORAGE int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
1376 _NEON2SSESTORAGE int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
1377 _NEON2SSESTORAGE int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
1378 _NEON2SSESTORAGE float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
1379 _NEON2SSESTORAGE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
1380 _NEON2SSESTORAGE poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
1381 _NEON2SSESTORAGE poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
1382 _NEON2SSESTORAGE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
1383 _NEON2SSESTORAGE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
1384 _NEON2SSESTORAGE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
1385 _NEON2SSESTORAGE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
1386 _NEON2SSESTORAGE int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
1387 _NEON2SSESTORAGE int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
1388 _NEON2SSESTORAGE int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
_NEON2SSESTORAGE int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
1390 _NEON2SSESTORAGE float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
1391 _NEON2SSESTORAGE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
1392 _NEON2SSESTORAGE poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
1393 _NEON2SSESTORAGE poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
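//The vldN families deinterleave on load: vld3 of packed RGB bytes yields one vector per channel.
//Sketch only, not compiled; the 8-bit RGB pixel layout and the channel order are assumptions.
#if 0
static uint8x16_t example_extract_green(const uint8_t * rgb16px) //48 interleaved RGB bytes
{
    uint8x16x3_t planes = vld3q_u8(rgb16px); //planes.val[0]=R, val[1]=G, val[2]=B
    return planes.val[1];
}
#endif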
1394 //Load all lanes of N-element structure with same value from memory
1395 _NEON2SSESTORAGE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
1396 _NEON2SSESTORAGE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
1397 _NEON2SSESTORAGE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
1398 _NEON2SSESTORAGE uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1399 _NEON2SSESTORAGE int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
1400 _NEON2SSESTORAGE int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
1401 _NEON2SSESTORAGE int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
1402 _NEON2SSESTORAGE int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
1403 //float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
1404 _NEON2SSESTORAGE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
1405 _NEON2SSESTORAGE poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
1406 _NEON2SSESTORAGE poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
1407 _NEON2SSESTORAGE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
1408 _NEON2SSESTORAGE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
1409 _NEON2SSESTORAGE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
1410 _NEON2SSESTORAGE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
1411 _NEON2SSESTORAGE int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
1412 _NEON2SSESTORAGE int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
1413 _NEON2SSESTORAGE int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
_NEON2SSESTORAGE int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
1415 _NEON2SSESTORAGE float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
1416 _NEON2SSESTORAGE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
1417 _NEON2SSESTORAGE poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
1418 _NEON2SSESTORAGE poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
1419 _NEON2SSESTORAGE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
1420 _NEON2SSESTORAGE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
1421 _NEON2SSESTORAGE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
1422 _NEON2SSESTORAGE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
1423 _NEON2SSESTORAGE int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
1424 _NEON2SSESTORAGE int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
1425 _NEON2SSESTORAGE int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSESTORAGE int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
1427 _NEON2SSESTORAGE float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
1428 _NEON2SSESTORAGE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
1429 _NEON2SSESTORAGE poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
1430 _NEON2SSESTORAGE poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
1431 //Load a single lane of N-element structure from memory
1432 //the functions below are modified to deal with the error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned
1433 _NEON2SSESTORAGE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
1434 _NEON2SSESTORAGE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
1435 _NEON2SSESTORAGE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
1436 _NEON2SSESTORAGE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
1437 _NEON2SSESTORAGE float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
1438 _NEON2SSESTORAGE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
1439 _NEON2SSESTORAGE poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
1440 _NEON2SSESTORAGE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
1441 _NEON2SSESTORAGE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
1442 _NEON2SSESTORAGE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
1443 _NEON2SSESTORAGE int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
1444 _NEON2SSESTORAGE int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0]
1445 _NEON2SSESTORAGE int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0]
1446 //float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
1447 _NEON2SSESTORAGE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
1448 _NEON2SSESTORAGE poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
1449 _NEON2SSESTORAGE poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
1450 _NEON2SSESTORAGE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
1451 _NEON2SSESTORAGE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
1452 _NEON2SSESTORAGE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
1453 _NEON2SSESTORAGE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
1454 _NEON2SSESTORAGE float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
1455 _NEON2SSESTORAGE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
1456 _NEON2SSESTORAGE poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
1457 _NEON2SSESTORAGE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
1458 _NEON2SSESTORAGE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
1459 _NEON2SSESTORAGE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
1460 _NEON2SSESTORAGE int8x8x3_t vld3_lane_s8(__transfersize(3) int8_t const * ptr, int8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
1461 _NEON2SSESTORAGE int16x4x3_t vld3_lane_s16(__transfersize(3) int16_t const * ptr, int16x4x3_t src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
1462 _NEON2SSESTORAGE int32x2x3_t vld3_lane_s32(__transfersize(3) int32_t const * ptr, int32x2x3_t src, __constrange(0,1) int lane); //VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
1463 _NEON2SSESTORAGE float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
1464 _NEON2SSESTORAGE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
1465 _NEON2SSESTORAGE poly8x8x3_t vld3_lane_p8(__transfersize(3) poly8_t const * ptr, poly8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
1466 _NEON2SSESTORAGE poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
1467 _NEON2SSESTORAGE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1468 _NEON2SSESTORAGE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1469 _NEON2SSESTORAGE int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1470 _NEON2SSESTORAGE int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1471 _NEON2SSESTORAGE float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1472 _NEON2SSESTORAGE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1473 _NEON2SSESTORAGE poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1474 _NEON2SSESTORAGE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1475 _NEON2SSESTORAGE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1476 _NEON2SSESTORAGE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1477 _NEON2SSESTORAGE int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1478 _NEON2SSESTORAGE int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1479 _NEON2SSESTORAGE int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1480 _NEON2SSESTORAGE float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1481 _NEON2SSESTORAGE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1482 _NEON2SSESTORAGE poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
1483 _NEON2SSESTORAGE poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
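// Usage sketch (illustrative only; the variable names and values below are hypothetical):
// refresh a single lane of an already de-interleaved pair of vectors from two consecutive
// memory values, leaving every other lane untouched.
//    uint16_t two_values[2] = {10, 20};
//    uint16x4x2_t pair;
//    pair.val[0] = vdup_n_u16(0);
//    pair.val[1] = vdup_n_u16(0);
//    pair = vld2_lane_u16(two_values, pair, 2);   // val[0] lane 2 becomes 10, val[1] lane 2 becomes 20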
1484 //Store N-element structure to memory
1485 _NEON2SSESTORAGE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
1486 _NEON2SSESTORAGE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
1487 _NEON2SSESTORAGE void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
1488 _NEON2SSESTORAGE void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
1489 _NEON2SSESTORAGE void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
1490 _NEON2SSESTORAGE void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
1491 _NEON2SSESTORAGE void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
1492 _NEON2SSESTORAGE void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
1493 _NEON2SSESTORAGE void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
1494 _NEON2SSESTORAGE void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
1495 _NEON2SSESTORAGE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val); // VST2.8 {d0, d1}, [r0]
1496 _NEON2SSESTORAGE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val); // VST2.16 {d0, d1}, [r0]
1497 _NEON2SSESTORAGE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val); // VST2.32 {d0, d1}, [r0]
1498 _NEON2SSESTORAGE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val); // VST1.64 {d0, d1}, [r0]
1499 _NEON2SSESTORAGE void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val); // VST2.8 {d0, d1}, [r0]
1500 _NEON2SSESTORAGE void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
1501 _NEON2SSESTORAGE void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
1502 _NEON2SSESTORAGE void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val); // VST1.64 {d0, d1}, [r0]
1503 //void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
1504 _NEON2SSESTORAGE void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
1505 _NEON2SSESTORAGE void vst2_p8(__transfersize(16) poly8_t * ptr, poly8x8x2_t val); // VST2.8 {d0, d1}, [r0]
1506 _NEON2SSESTORAGE void vst2_p16(__transfersize(8) poly16_t * ptr, poly16x4x2_t val); // VST2.16 {d0, d1}, [r0]
1507 _NEON2SSESTORAGE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
1508 _NEON2SSESTORAGE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
1509 _NEON2SSESTORAGE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
1510 _NEON2SSESTORAGE void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
1511 _NEON2SSESTORAGE void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
1512 _NEON2SSESTORAGE void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
1513 _NEON2SSESTORAGE void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
1514 _NEON2SSESTORAGE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
1515 _NEON2SSESTORAGE void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
1516 _NEON2SSESTORAGE void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
1517 _NEON2SSESTORAGE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
1518 _NEON2SSESTORAGE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
1519 _NEON2SSESTORAGE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
1520 _NEON2SSESTORAGE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
1521 _NEON2SSESTORAGE void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
1522 _NEON2SSESTORAGE void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
1523 _NEON2SSESTORAGE void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
1524 _NEON2SSESTORAGE void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
1525 _NEON2SSESTORAGE void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
1526 _NEON2SSESTORAGE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
1527 _NEON2SSESTORAGE void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
1528 _NEON2SSESTORAGE void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
1529 _NEON2SSESTORAGE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
1530 _NEON2SSESTORAGE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
1531 _NEON2SSESTORAGE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
1532 _NEON2SSESTORAGE void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
1533 _NEON2SSESTORAGE void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
1534 _NEON2SSESTORAGE void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
1535 _NEON2SSESTORAGE void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
1536 _NEON2SSESTORAGE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
1537 _NEON2SSESTORAGE void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
1538 _NEON2SSESTORAGE void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
1539 _NEON2SSESTORAGE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
1540 _NEON2SSESTORAGE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
1541 _NEON2SSESTORAGE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
1542 _NEON2SSESTORAGE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
1543 _NEON2SSESTORAGE void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
1544 _NEON2SSESTORAGE void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
1545 _NEON2SSESTORAGE void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
1546 _NEON2SSESTORAGE void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
1547 _NEON2SSESTORAGE void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
1548 _NEON2SSESTORAGE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
1549 _NEON2SSESTORAGE void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
1550 _NEON2SSESTORAGE void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
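// Usage sketch (illustrative only; the buffer and plane names are hypothetical): interleave
// three planar uint8x8 vectors into memory as R,G,B triplets (24 bytes total).
//    uint8_t rgb[24];
//    uint8x8x3_t planes;
//    planes.val[0] = vdup_n_u8(0xFF);             // R plane
//    planes.val[1] = vdup_n_u8(0x80);             // G plane
//    planes.val[2] = vdup_n_u8(0x00);             // B plane
//    vst3_u8(rgb, planes);                        // rgb = {FF,80,00, FF,80,00, ...}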
1551 //Store a single lane of N-element structure to memory
1552 _NEON2SSESTORAGE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
1553 _NEON2SSESTORAGE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
1554 _NEON2SSESTORAGE void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
1555 _NEON2SSESTORAGE void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
1556 _NEON2SSESTORAGE void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
1557 _NEON2SSESTORAGE void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t * val, __constrange(0,3) int lane); //VST2.32 {d0[0], d2[0]}, [r0]
1558 _NEON2SSESTORAGE void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
1559 _NEON2SSESTORAGE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
1560 _NEON2SSESTORAGE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
1561 _NEON2SSESTORAGE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
1562 _NEON2SSESTORAGE void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane); // VST2.8 {d0[0],d1[0]}, [r0]
1563 _NEON2SSESTORAGE void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
1564 _NEON2SSESTORAGE void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
1565 _NEON2SSESTORAGE void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
1566 _NEON2SSESTORAGE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
1567 _NEON2SSESTORAGE void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
1568 _NEON2SSESTORAGE void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
1569 _NEON2SSESTORAGE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
1570 _NEON2SSESTORAGE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
1571 _NEON2SSESTORAGE void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
1572 _NEON2SSESTORAGE void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
1573 _NEON2SSESTORAGE void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
1574 _NEON2SSESTORAGE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t * val, __constrange(0,3) int lane); //VST3.32 {d0[0], d2[0], d4[0]}, [r0]
1575 _NEON2SSESTORAGE void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
1576 _NEON2SSESTORAGE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
1577 _NEON2SSESTORAGE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
1578 _NEON2SSESTORAGE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
1579 _NEON2SSESTORAGE void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane); // VST3.8 {d0[0],d1[0], d2[0]}, [r0]
1580 _NEON2SSESTORAGE void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
1581 _NEON2SSESTORAGE void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
1582 _NEON2SSESTORAGE void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
1583 _NEON2SSESTORAGE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
1584 _NEON2SSESTORAGE void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
1585 _NEON2SSESTORAGE void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
1586 _NEON2SSESTORAGE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
1587 _NEON2SSESTORAGE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
1588 _NEON2SSESTORAGE void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
1589 _NEON2SSESTORAGE void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
1590 _NEON2SSESTORAGE void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
1591 _NEON2SSESTORAGE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t * val, __constrange(0,3) int lane); //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1592 _NEON2SSESTORAGE void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
1593 _NEON2SSESTORAGE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
1594 _NEON2SSESTORAGE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
1595 _NEON2SSESTORAGE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
1596 _NEON2SSESTORAGE void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0]
1597 _NEON2SSESTORAGE void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
1598 _NEON2SSESTORAGE void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
1599 _NEON2SSESTORAGE void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
1600 _NEON2SSESTORAGE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
1601 _NEON2SSESTORAGE void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
1602 _NEON2SSESTORAGE void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
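// Usage sketch (illustrative only; names are hypothetical): store lane 5 of four planar
// uint8x8 vectors as one 4-byte pixel.
//    uint8_t pixel[4];
//    uint8x8x4_t rgba;                            // previously loaded, e.g. with vld4_u8
//    vst4_lane_u8(pixel, rgba, 5);                // pixel = {val[0][5], val[1][5], val[2][5], val[3][5]}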
1603 //Extract lanes from a vector and put them into a register. These intrinsics extract a single lane (element) from a vector.
1604 _NEON2SSESTORAGE uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
1605 _NEON2SSESTORAGE uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
1606 _NEON2SSESTORAGE uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
1607 _NEON2SSESTORAGE int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
1608 _NEON2SSESTORAGE int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
1609 _NEON2SSESTORAGE int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
1610 _NEON2SSESTORAGE poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
1611 _NEON2SSESTORAGE poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
1612 _NEON2SSESTORAGE float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
1613 _NEON2SSESTORAGE uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
1614 _NEON2SSESTORAGE uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
1615 _NEON2SSESTORAGE uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
1616 _NEON2SSESTORAGE int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
1617 _NEON2SSESTORAGE int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
1618 _NEON2SSESTORAGE int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
1619 _NEON2SSESTORAGE poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
1620 _NEON2SSESTORAGE poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
1621 _NEON2SSESTORAGE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
1622 _NEON2SSESTORAGE int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
1623 _NEON2SSESTORAGE uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
1624 _NEON2SSESTORAGE int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
1625 _NEON2SSESTORAGE uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
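// Usage sketch (illustrative only): read one element of a vector into a scalar; the lane
// index must be a compile-time constant.
//    int16x8_t v  = vdupq_n_s16(7);
//    int16_t   e3 = vgetq_lane_s16(v, 3);         // 7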
1626 //Set a single lane of a vector to a literal value. These intrinsics set a single lane (element) within a vector, leaving the other lanes unchanged.
1627 _NEON2SSESTORAGE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
1628 _NEON2SSESTORAGE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
1629 _NEON2SSESTORAGE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
1630 _NEON2SSESTORAGE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
1631 _NEON2SSESTORAGE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
1632 _NEON2SSESTORAGE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
1633 _NEON2SSESTORAGE poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
1634 _NEON2SSESTORAGE poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
1635 _NEON2SSESTORAGE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
1636 _NEON2SSESTORAGE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
1637 _NEON2SSESTORAGE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
1638 _NEON2SSESTORAGE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
1639 _NEON2SSESTORAGE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
1640 _NEON2SSESTORAGE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
1641 _NEON2SSESTORAGE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
1642 _NEON2SSESTORAGE poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
1643 _NEON2SSESTORAGE poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
1644 _NEON2SSESTORAGE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
1645 _NEON2SSESTORAGE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
1646 _NEON2SSESTORAGE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
1647 _NEON2SSESTORAGE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
1648 _NEON2SSESTORAGE uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
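// Usage sketch (illustrative only): overwrite a single element and keep the rest of the vector.
//    float32x4_t acc = vdupq_n_f32(0.0f);
//    acc = vsetq_lane_f32(3.5f, acc, 2);          // acc = {0.0f, 0.0f, 3.5f, 0.0f}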
1649 //Initialize a vector from a literal bit pattern.
1650 _NEON2SSESTORAGE int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
1651 _NEON2SSESTORAGE int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
1652 _NEON2SSESTORAGE int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
1653 _NEON2SSESTORAGE float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
1654 _NEON2SSESTORAGE float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
1655 _NEON2SSESTORAGE uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
1656 _NEON2SSESTORAGE uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
1657 _NEON2SSESTORAGE uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
1658 _NEON2SSESTORAGE uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
1659 _NEON2SSESTORAGE poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
1660 _NEON2SSESTORAGE poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
1661 _NEON2SSESTORAGE int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
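// Usage sketch (illustrative only): build a 64-bit vector from a bit pattern; the least
// significant byte of the literal becomes lane 0.
//    uint8x8_t ramp = vcreate_u8(0x0706050403020100ULL);   // {0,1,2,3,4,5,6,7}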
1662 //Set all lanes to the same value
1663 //Load all lanes of the vector with the same literal value
1664 _NEON2SSESTORAGE uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
1665 _NEON2SSESTORAGE uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
1666 _NEON2SSESTORAGE uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
1667 _NEON2SSESTORAGE int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
1668 _NEON2SSESTORAGE int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
1669 _NEON2SSESTORAGE int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
1670 _NEON2SSESTORAGE poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
1671 _NEON2SSESTORAGE poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
1672 _NEON2SSESTORAGE float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
1673 _NEON2SSESTORAGE uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
1674 _NEON2SSESTORAGE uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
1675 _NEON2SSESTORAGE uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
1676 _NEON2SSESTORAGE int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
1677 _NEON2SSESTORAGE int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
1678 _NEON2SSESTORAGE int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
1679 _NEON2SSESTORAGE poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
1680 _NEON2SSESTORAGE poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
1681 _NEON2SSESTORAGE float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
1682 _NEON2SSESTORAGE int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
1683 _NEON2SSESTORAGE uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
1684 _NEON2SSESTORAGE int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
1685 _NEON2SSESTORAGE uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
1686 _NEON2SSESTORAGE uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
1687 _NEON2SSESTORAGE uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
1688 _NEON2SSESTORAGE uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
1689 _NEON2SSESTORAGE int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
1690 _NEON2SSESTORAGE int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
1691 _NEON2SSESTORAGE int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
1692 _NEON2SSESTORAGE poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
1693 _NEON2SSESTORAGE poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
1694 _NEON2SSESTORAGE float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
1695 _NEON2SSESTORAGE uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
1696 _NEON2SSESTORAGE uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
1697 _NEON2SSESTORAGE uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
1698 _NEON2SSESTORAGE int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
1699 _NEON2SSESTORAGE int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
1700 _NEON2SSESTORAGE int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
1701 _NEON2SSESTORAGE poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
1702 _NEON2SSESTORAGE poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
1703 _NEON2SSESTORAGE float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
1704 _NEON2SSESTORAGE int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
1705 _NEON2SSESTORAGE uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
1706 _NEON2SSESTORAGE int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
1707 _NEON2SSESTORAGE uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
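// Usage sketch (illustrative only): broadcast a scalar into every lane; the vmov_n_* names
// are synonyms for the corresponding vdup_n_* intrinsics.
//    int32x4_t  fours = vdupq_n_s32(4);           // {4, 4, 4, 4}
//    uint16x4_t ones  = vmov_n_u16(1);            // {1, 1, 1, 1}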
1708 //Load all lanes of the vector with the value of a single lane of another vector
1709 _NEON2SSESTORAGE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
1710 _NEON2SSESTORAGE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
1711 _NEON2SSESTORAGE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
1712 _NEON2SSESTORAGE int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
1713 _NEON2SSESTORAGE int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
1714 _NEON2SSESTORAGE int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
1715 _NEON2SSESTORAGE poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
1716 _NEON2SSESTORAGE poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
1717 _NEON2SSESTORAGE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
1718 _NEON2SSESTORAGE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
1719 _NEON2SSESTORAGE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
1720 _NEON2SSESTORAGE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
1721 _NEON2SSESTORAGE int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
1722 _NEON2SSESTORAGE int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
1723 _NEON2SSESTORAGE int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
1724 _NEON2SSESTORAGE poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
1725 _NEON2SSESTORAGE poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
1726 _NEON2SSESTORAGE float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
1727 _NEON2SSESTORAGE int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
1728 _NEON2SSESTORAGE uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
1729 _NEON2SSESTORAGE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
1730 _NEON2SSESTORAGE uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
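// Usage sketch (illustrative only): broadcast one lane of an existing vector to all lanes of the result.
//    float32x2_t ab = vdup_n_f32(1.0f);
//    ab = vset_lane_f32(2.0f, ab, 1);             // ab = {1.0f, 2.0f}
//    float32x4_t all_b = vdupq_lane_f32(ab, 1);   // {2.0f, 2.0f, 2.0f, 2.0f}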
1731 //Combining vectors. These intrinsics join two 64-bit vectors into a single 128-bit vector.
1732 _NEON2SSESTORAGE int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
1733 _NEON2SSESTORAGE int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
1734 _NEON2SSESTORAGE int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
1735 _NEON2SSESTORAGE int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
1736 _NEON2SSESTORAGE float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
1737 _NEON2SSESTORAGE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
1738 _NEON2SSESTORAGE uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
1739 _NEON2SSESTORAGE uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
1740 _NEON2SSESTORAGE uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
1741 _NEON2SSESTORAGE uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
1742 _NEON2SSESTORAGE poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
1743 _NEON2SSESTORAGE poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
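// Usage sketch (illustrative only): join two 64-bit halves; 'low' supplies the low lanes of
// the 128-bit result and 'high' the upper lanes.
//    uint8x8_t  lo    = vdup_n_u8(1);
//    uint8x8_t  hi    = vdup_n_u8(2);
//    uint8x16_t whole = vcombine_u8(lo, hi);      // lanes 0..7 = 1, lanes 8..15 = 2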
1744 //Splitting vectors. These intrinsics split a 128-bit vector into two component 64-bit vectors.
1745 _NEON2SSESTORAGE int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
1746 _NEON2SSESTORAGE int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
1747 _NEON2SSESTORAGE int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
1748 _NEON2SSESTORAGE int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
1749 _NEON2SSESTORAGE float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
1750 _NEON2SSESTORAGE float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
1751 _NEON2SSESTORAGE uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
1752 _NEON2SSESTORAGE uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
1753 _NEON2SSESTORAGE uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
1754 _NEON2SSESTORAGE uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
1755 _NEON2SSESTORAGE poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
1756 _NEON2SSESTORAGE poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
1757 _NEON2SSESTORAGE int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
1758 _NEON2SSESTORAGE int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
1759 _NEON2SSESTORAGE int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
1760 _NEON2SSESTORAGE int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
1761 _NEON2SSESTORAGE float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
1762 _NEON2SSESTORAGE float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
1763 _NEON2SSESTORAGE uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
1764 _NEON2SSESTORAGE uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
1765 _NEON2SSESTORAGE uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
1766 _NEON2SSESTORAGE uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
1767 _NEON2SSESTORAGE poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
1768 _NEON2SSESTORAGE poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
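// Usage sketch (illustrative only): take either 64-bit half of a 128-bit vector.
//    int32x2_t ones = vdup_n_s32(1), twos = vdup_n_s32(2);
//    int32x4_t q  = vcombine_s32(ones, twos);     // {1, 1, 2, 2}
//    int32x2_t lo = vget_low_s32(q);              // {1, 1}
//    int32x2_t hi = vget_high_s32(q);             // {2, 2}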
1769 //Converting vectors. These intrinsics convert vectors between integer and floating-point element types and between floating-point formats.
1770 //Convert from float
1771 _NEON2SSESTORAGE int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
1772 _NEON2SSESTORAGE uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
1773 _NEON2SSESTORAGE int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
1774 _NEON2SSESTORAGE uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
1775 _NEON2SSESTORAGE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
1776 _NEON2SSESTORAGE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
1777 _NEON2SSESTORAGE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
1778 _NEON2SSESTORAGE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
1779 _NEON2SSESTORAGE int32x4_t vcvtnq_s32_f32(float32x4_t a); // VCVTN.S32.F32 q0, q0
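// Usage sketch (illustrative only): the plain conversions truncate toward zero, while the _n_
// variants treat the result as fixed point with 'b' fractional bits.
//    float32x4_t f = vdupq_n_f32(1.75f);
//    int32x4_t   t = vcvtq_s32_f32(f);            // {1, 1, 1, 1}
//    int32x4_t   q = vcvtq_n_s32_f32(f, 8);       // {448, 448, 448, 448} = 1.75 * 2^8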
1780 //Convert to float
1781 _NEON2SSESTORAGE float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
1782 _NEON2SSESTORAGE float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
1783 _NEON2SSESTORAGE float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
1784 _NEON2SSESTORAGE float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
1785 _NEON2SSESTORAGE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
1786 _NEON2SSESTORAGE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
1787 _NEON2SSESTORAGE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
1788 _NEON2SSESTORAGE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
1789 //Convert between floats
1790 _NEON2SSESTORAGE float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
1791 _NEON2SSESTORAGE float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
1792 //Vector narrow integer
1793 _NEON2SSESTORAGE int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
1794 _NEON2SSESTORAGE int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
1795 _NEON2SSESTORAGE int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
1796 _NEON2SSESTORAGE uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
1797 _NEON2SSESTORAGE uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
1798 _NEON2SSESTORAGE uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
1799 //Vector long move
1800 _NEON2SSESTORAGE int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
1801 _NEON2SSESTORAGE int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
1802 _NEON2SSESTORAGE int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
1803 _NEON2SSESTORAGE uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
1804 _NEON2SSESTORAGE uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.U16 q0,d0
1805 _NEON2SSESTORAGE uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
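// Usage sketch (illustrative only): widen to do 16-bit arithmetic on byte data, then narrow
// back; vmovn simply keeps the low half of each element, it does not saturate.
//    uint8x8_t  bytes  = vdup_n_u8(200);
//    uint16x8_t wide   = vmovl_u8(bytes);         // zero-extended: {200, ...}
//    uint8x8_t  narrow = vmovn_u16(wide);         // low 8 bits of each element: {200, ...}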
1806 //Vector saturating narrow integer
1807 _NEON2SSESTORAGE int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
1808 _NEON2SSESTORAGE int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
1809 _NEON2SSESTORAGE int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
1810 _NEON2SSESTORAGE uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.U16 d0,q0
1811 _NEON2SSESTORAGE uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
1812 _NEON2SSESTORAGE uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
1813 //Vector saturating narrow integer signed->unsigned
1814 _NEON2SSESTORAGE uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
1815 _NEON2SSESTORAGE uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
1816 _NEON2SSESTORAGE uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
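// Usage sketch (illustrative only): unlike vmovn, the saturating narrows clamp values that do
// not fit the destination element type.
//    int8x8_t  a = vqmovn_s16(vdupq_n_s16(300));  // {127, ...}  clamped to the int8_t maximum
//    uint8x8_t b = vqmovun_s16(vdupq_n_s16(-5));  // {0, ...}    negative input clamps to 0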
1817 //Table look up
1818 _NEON2SSESTORAGE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
1819 _NEON2SSESTORAGE int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
1820 _NEON2SSESTORAGE poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
1821 //Extended table look up intrinsics
1822 _NEON2SSESTORAGE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
1823 _NEON2SSESTORAGE int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
1824 _NEON2SSESTORAGE poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
1825 _NEON2SSESTORAGE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
1826 _NEON2SSESTORAGE int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
1827 _NEON2SSESTORAGE poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
1828 _NEON2SSESTORAGE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
1829 _NEON2SSESTORAGE int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
1830 _NEON2SSESTORAGE poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
1831 _NEON2SSESTORAGE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
1832 _NEON2SSESTORAGE int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
1833 _NEON2SSESTORAGE poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
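// Usage sketch (illustrative only): vtbl1 selects bytes of the table by the indices in the
// second argument; out-of-range indices give 0, whereas the vtbx forms keep the destination byte.
//    uint8x8_t table = vcreate_u8(0x0706050403020100ULL);  // {0,1,2,3,4,5,6,7}
//    uint8x8_t idx   = vcreate_u8(0x0001020304050607ULL);  // lane i holds index 7-i
//    uint8x8_t rev   = vtbl1_u8(table, idx);               // {7,6,5,4,3,2,1,0}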
1834 //Operations with a scalar value
1835 //Vector multiply accumulate with scalar
1836 _NEON2SSESTORAGE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
1837 _NEON2SSESTORAGE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
1838 _NEON2SSESTORAGE uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
1839 _NEON2SSESTORAGE uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
1840 _NEON2SSESTORAGE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0,d0, d0[0]
1841 _NEON2SSESTORAGE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0,d0[0]
1842 _NEON2SSESTORAGE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0,d0[0]
1843 _NEON2SSESTORAGE uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0,q0, d0[0]
1844 _NEON2SSESTORAGE uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0,q0, d0[0]
1845 _NEON2SSESTORAGE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0,q0, d0[0]
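// Usage sketch (illustrative only): multiply-accumulate using one lane of a vector as the
// scalar, i.e. a + b * v[l] computed element-wise.
//    float32x4_t acc    = vdupq_n_f32(1.0f);
//    float32x4_t b      = vdupq_n_f32(2.0f);
//    float32x2_t coeffs = vdup_n_f32(0.5f);
//    acc = vmlaq_lane_f32(acc, b, coeffs, 0);     // acc = 1.0f + 2.0f * 0.5f = {2.0f, ...}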
1846 //Vector widening multiply accumulate with scalar
1847 _NEON2SSESTORAGE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); //VMLAL.S16 q0, d0,d0[0]
1848 _NEON2SSESTORAGE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); //VMLAL.S32 q0, d0,d0[0]
1849 _NEON2SSESTORAGE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0,d0, d0[0]
1850 _NEON2SSESTORAGE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0,d0, d0[0]
1851 //Vector widening saturating doubling multiply accumulate with scalar
1852 _NEON2SSESTORAGE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0,d0, d0[0]
1853 _NEON2SSESTORAGE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0,d0, d0[0]
1854 //Vector multiply subtract with scalar
1855 _NEON2SSESTORAGE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
1856 _NEON2SSESTORAGE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
1857 _NEON2SSESTORAGE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
1858 _NEON2SSESTORAGE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
1859 _NEON2SSESTORAGE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0,d0, d0[0]
1860 _NEON2SSESTORAGE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0,d0[0]
1861 _NEON2SSESTORAGE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0,d0[0]
1862 _NEON2SSESTORAGE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0,q0, d0[0]
1863 _NEON2SSESTORAGE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0,q0, d0[0]
1864 _NEON2SSESTORAGE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0,q0, d0[0]
1865 //Vector widening multiply subtract with scalar
1866 _NEON2SSESTORAGE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0,d0[0]
1867 _NEON2SSESTORAGE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0,d0[0]
1868 _NEON2SSESTORAGE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0,d0, d0[0]
1869 _NEON2SSESTORAGE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0,d0, d0[0]
1870 //Vector widening saturating doubling multiply subtract with scalar
1871 _NEON2SSESTORAGE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0,d0, d0[0]
1872 _NEON2SSESTORAGE int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0,d0, d0[0]
1873 //Vector multiply by scalar
1874 _NEON2SSESTORAGE int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
1875 _NEON2SSESTORAGE int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
1876 _NEON2SSESTORAGE float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
1877 _NEON2SSESTORAGE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
1878 _NEON2SSESTORAGE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
1879 _NEON2SSESTORAGE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
1880 _NEON2SSESTORAGE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
1881 _NEON2SSESTORAGE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
1882 _NEON2SSESTORAGE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
1883 _NEON2SSESTORAGE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
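// Usage sketch (illustrative only): scale every element by the same scalar.
//    float32x4_t v      = vdupq_n_f32(3.0f);
//    float32x4_t scaled = vmulq_n_f32(v, 0.5f);   // {1.5f, 1.5f, 1.5f, 1.5f}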
1884 //Vector long multiply with scalar
1885 _NEON2SSESTORAGE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
1886 _NEON2SSESTORAGE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
1887 _NEON2SSESTORAGE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0]
1888 _NEON2SSESTORAGE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
1889 //Vector long multiply by scalar
1890 _NEON2SSESTORAGE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
1891 _NEON2SSESTORAGE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
1892 _NEON2SSESTORAGE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0]
1893 _NEON2SSESTORAGE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
1894 //Vector saturating doubling long multiply with scalar
1895 _NEON2SSESTORAGE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
1896 _NEON2SSESTORAGE int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
1897 //Vector saturating doubling long multiply by scalar
1898 _NEON2SSESTORAGE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
1899 _NEON2SSESTORAGE int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
1900 //Vector saturating doubling multiply high with scalar
1901 _NEON2SSESTORAGE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
1902 _NEON2SSESTORAGE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
1903 _NEON2SSESTORAGE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
1904 _NEON2SSESTORAGE int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
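// Usage sketch (illustrative only): vqdmulh returns the high half of the doubled product,
// i.e. the usual Q15/Q31 fixed-point multiply.
//    int16x8_t half = vdupq_n_s16(16384);         // 0.5 in Q15
//    int16x8_t q    = vqdmulhq_n_s16(half, 16384);// (2*16384*16384) >> 16 = 8192, i.e. 0.25 in Q15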
1905 //Vector saturating doubling multiply high by scalar
1906 _NEON2SSESTORAGE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
1907 _NEON2SSESTORAGE int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
1908 _NEON2SSESTORAGE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
1909 _NEON2SSESTORAGE int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
1910 //Vector saturating rounding doubling multiply high with scalar
1911 _NEON2SSESTORAGE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
1912 _NEON2SSESTORAGE int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
1913 _NEON2SSESTORAGE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
1914 _NEON2SSESTORAGE int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
1915 //Vector saturating rounding doubling multiply high by scalar
1916 _NEON2SSESTORAGE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
1917 _NEON2SSESTORAGE int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
1918 _NEON2SSESTORAGE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
1919 _NEON2SSESTORAGE int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
1920 //Vector multiply accumulate with scalar
1921 _NEON2SSESTORAGE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
1922 _NEON2SSESTORAGE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
1923 _NEON2SSESTORAGE uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
1924 _NEON2SSESTORAGE uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
1925 _NEON2SSESTORAGE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
1926 _NEON2SSESTORAGE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
1927 _NEON2SSESTORAGE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
1928 _NEON2SSESTORAGE uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
1929 _NEON2SSESTORAGE uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
1930 _NEON2SSESTORAGE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
1931 //Vector widening multiply accumulate with scalar
1932 _NEON2SSESTORAGE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
1933 _NEON2SSESTORAGE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
1934 _NEON2SSESTORAGE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0]
1935 _NEON2SSESTORAGE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
1936 //Vector widening saturating doubling multiply accumulate with scalar
1937 _NEON2SSESTORAGE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
1938 _NEON2SSESTORAGE int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
1939 //Vector multiply subtract with scalar
1940 _NEON2SSESTORAGE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
1941 _NEON2SSESTORAGE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
1942 _NEON2SSESTORAGE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
1943 _NEON2SSESTORAGE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
1944 _NEON2SSESTORAGE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
1945 _NEON2SSESTORAGE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
1946 _NEON2SSESTORAGE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
1947 _NEON2SSESTORAGE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
1948 _NEON2SSESTORAGE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
1949 _NEON2SSESTORAGE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
1950 //Vector widening multiply subtract with scalar
1951 _NEON2SSESTORAGE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
1952 _NEON2SSESTORAGE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
1953 _NEON2SSESTORAGE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0]
1954 _NEON2SSESTORAGE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
1955 //Vector widening saturating doubling multiply subtract with scalar
1956 _NEON2SSESTORAGE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
1957 _NEON2SSESTORAGE int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
1958 //Vector extract
1959 _NEON2SSESTORAGE int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
1960 _NEON2SSESTORAGE uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
1961 _NEON2SSESTORAGE poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
1962 _NEON2SSESTORAGE int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
1963 _NEON2SSESTORAGE uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
1964 _NEON2SSESTORAGE poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
1965 _NEON2SSESTORAGE int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
1966 _NEON2SSESTORAGE uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
1967 _NEON2SSESTORAGE int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
1968 _NEON2SSESTORAGE uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
1969 _NEON2SSESTORAGE float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
1970 _NEON2SSESTORAGE int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
1971 _NEON2SSESTORAGE uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
1972 _NEON2SSESTORAGE poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
1973 _NEON2SSESTORAGE int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
1974 _NEON2SSESTORAGE uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
1975 _NEON2SSESTORAGE poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
1976 _NEON2SSESTORAGE int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
1977 _NEON2SSESTORAGE uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
1978 _NEON2SSESTORAGE int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
1979 _NEON2SSESTORAGE uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
1980 _NEON2SSESTORAGE float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
1981 //Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
1982 _NEON2SSESTORAGE int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
1983 _NEON2SSESTORAGE int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
1984 _NEON2SSESTORAGE int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
1985 _NEON2SSESTORAGE uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
1986 _NEON2SSESTORAGE uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
1987 _NEON2SSESTORAGE uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
1988 _NEON2SSESTORAGE poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
1989 _NEON2SSESTORAGE poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
1990 _NEON2SSESTORAGE float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
1991 _NEON2SSESTORAGE int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
1992 _NEON2SSESTORAGE int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
1993 _NEON2SSESTORAGE int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
1994 _NEON2SSESTORAGE uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
1995 _NEON2SSESTORAGE uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
1996 _NEON2SSESTORAGE uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
1997 _NEON2SSESTORAGE poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
1998 _NEON2SSESTORAGE poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
1999 _NEON2SSESTORAGE float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
2000 _NEON2SSESTORAGE int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
2001 _NEON2SSESTORAGE int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
2002 _NEON2SSESTORAGE uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
2003 _NEON2SSESTORAGE uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
2004 _NEON2SSESTORAGE poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
2005 _NEON2SSESTORAGE poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
2006 _NEON2SSESTORAGE int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
2007 _NEON2SSESTORAGE int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
2008 _NEON2SSESTORAGE uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
2009 _NEON2SSESTORAGE uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
2010 _NEON2SSESTORAGE poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
2011 _NEON2SSESTORAGE poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
2012 _NEON2SSESTORAGE int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
2013 _NEON2SSESTORAGE uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
2014 _NEON2SSESTORAGE poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
2015 _NEON2SSESTORAGE int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
2016 _NEON2SSESTORAGE uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
2017 _NEON2SSESTORAGE poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
2018 //Other single operand arithmetic
2019 //Absolute: Vd[i] = |Va[i]|
2020 _NEON2SSESTORAGE int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
2021 _NEON2SSESTORAGE int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
2022 _NEON2SSESTORAGE int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
2023 _NEON2SSESTORAGE float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
2024 _NEON2SSESTORAGE int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
2025 _NEON2SSESTORAGE int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
2026 _NEON2SSESTORAGE int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
2027 _NEON2SSESTORAGE float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
2028
2029 _NEON2SSESTORAGE int64x2_t vabsq_s64(int64x2_t a); // VABS.S64 q0,q0
2030 _NEON2SSESTORAGE float64x2_t vabsq_f64(float64x2_t a); // VABS.F64 q0,q0
2031
2032 //Saturating absolute: Vd[i] = sat(|Va[i]|)
2033 _NEON2SSESTORAGE int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
2034 _NEON2SSESTORAGE int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
2035 _NEON2SSESTORAGE int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
2036 _NEON2SSESTORAGE int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
2037 _NEON2SSESTORAGE int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
2038 _NEON2SSESTORAGE int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
2039 //Negate: Vd[i] = - Va[i]
2040 _NEON2SSESTORAGE int8x8_t vneg_s8(int8x8_t a); // VNEG.S8 d0,d0
2041 _NEON2SSESTORAGE int16x4_t vneg_s16(int16x4_t a); // VNEG.S16 d0,d0
2042 _NEON2SSESTORAGE int32x2_t vneg_s32(int32x2_t a); // VNEG.S32 d0,d0
2043 _NEON2SSESTORAGE float32x2_t vneg_f32(float32x2_t a); // VNEG.F32 d0,d0
2044 _NEON2SSESTORAGE int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0
2045 _NEON2SSESTORAGE int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0
2046 _NEON2SSESTORAGE int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0
2047 _NEON2SSESTORAGE float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0
2048 //Saturating Negate: sat(Vd[i] = - Va[i])
2049 _NEON2SSESTORAGE int8x8_t vqneg_s8(int8x8_t a); // VQNEG.S8 d0,d0
2050 _NEON2SSESTORAGE int16x4_t vqneg_s16(int16x4_t a); // VQNEG.S16 d0,d0
2051 _NEON2SSESTORAGE int32x2_t vqneg_s32(int32x2_t a); // VQNEG.S32 d0,d0
2052 _NEON2SSESTORAGE int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0
2053 _NEON2SSESTORAGE int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0
2054 _NEON2SSESTORAGE int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0
2055 //Count leading sign bits
2056 _NEON2SSESTORAGE int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
2057 _NEON2SSESTORAGE int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
2058 _NEON2SSESTORAGE int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
2059 _NEON2SSESTORAGE int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
2060 _NEON2SSESTORAGE int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
2061 _NEON2SSESTORAGE int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
2062 //Count leading zeros
2063 _NEON2SSESTORAGE int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
2064 _NEON2SSESTORAGE int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
2065 _NEON2SSESTORAGE int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
2066 _NEON2SSESTORAGE uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
2067 _NEON2SSESTORAGE uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
2068 _NEON2SSESTORAGE uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
2069 _NEON2SSESTORAGE int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
2070 _NEON2SSESTORAGE int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
2071 _NEON2SSESTORAGE int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
2072 _NEON2SSESTORAGE uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
2073 _NEON2SSESTORAGE uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
2074 _NEON2SSESTORAGE uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
2075 //Count number of set bits
2076 _NEON2SSESTORAGE uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
2077 _NEON2SSESTORAGE int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
2078 _NEON2SSESTORAGE poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
2079 _NEON2SSESTORAGE uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
2080 _NEON2SSESTORAGE int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
2081 _NEON2SSESTORAGE poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
2082 //Reciprocal estimate
2083 _NEON2SSESTORAGE float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
2084 _NEON2SSESTORAGE uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
2085 _NEON2SSESTORAGE float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
2086 _NEON2SSESTORAGE uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
2087 //Reciprocal square root estimate
2088 _NEON2SSESTORAGE float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
2089 _NEON2SSESTORAGE uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
2090 _NEON2SSESTORAGE float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
2091 _NEON2SSESTORAGE uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
2092 //Logical operations
2093 //Bitwise not
2094 _NEON2SSESTORAGE int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
2095 _NEON2SSESTORAGE int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
2096 _NEON2SSESTORAGE int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
2097 _NEON2SSESTORAGE uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
2098 _NEON2SSESTORAGE uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
2099 _NEON2SSESTORAGE uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
2100 _NEON2SSESTORAGE poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
2101 _NEON2SSESTORAGE int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
2102 _NEON2SSESTORAGE int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
2103 _NEON2SSESTORAGE int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
2104 _NEON2SSESTORAGE uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
2105 _NEON2SSESTORAGE uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
2106 _NEON2SSESTORAGE uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
2107 _NEON2SSESTORAGE poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
2108 //Bitwise and
2109 _NEON2SSESTORAGE int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
2110 _NEON2SSESTORAGE int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
2111 _NEON2SSESTORAGE int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
2112 _NEON2SSESTORAGE int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
2113 _NEON2SSESTORAGE uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
2114 _NEON2SSESTORAGE uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
2115 _NEON2SSESTORAGE uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
2116 _NEON2SSESTORAGE uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
2117 _NEON2SSESTORAGE int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
2118 _NEON2SSESTORAGE int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
2119 _NEON2SSESTORAGE int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
2120 _NEON2SSESTORAGE int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
2121 _NEON2SSESTORAGE uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
2122 _NEON2SSESTORAGE uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
2123 _NEON2SSESTORAGE uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
2124 _NEON2SSESTORAGE uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
2125 //Bitwise or
2126 _NEON2SSESTORAGE int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
2127 _NEON2SSESTORAGE int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
2128 _NEON2SSESTORAGE int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
2129 _NEON2SSESTORAGE int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
2130 _NEON2SSESTORAGE uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
2131 _NEON2SSESTORAGE uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
2132 _NEON2SSESTORAGE uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
2133 _NEON2SSESTORAGE uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
2134 _NEON2SSESTORAGE int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
2135 _NEON2SSESTORAGE int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
2136 _NEON2SSESTORAGE int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
2137 _NEON2SSESTORAGE int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
2138 _NEON2SSESTORAGE uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
2139 _NEON2SSESTORAGE uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
2140 _NEON2SSESTORAGE uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
2141 _NEON2SSESTORAGE uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
2142 //Bitwise exclusive or (EOR or XOR)
2143 _NEON2SSESTORAGE int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
2144 _NEON2SSESTORAGE int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
2145 _NEON2SSESTORAGE int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
2146 _NEON2SSESTORAGE int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
2147 _NEON2SSESTORAGE uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
2148 _NEON2SSESTORAGE uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
2149 _NEON2SSESTORAGE uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
2150 _NEON2SSESTORAGE uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
2151 _NEON2SSESTORAGE int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
2152 _NEON2SSESTORAGE int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
2153 _NEON2SSESTORAGE int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
2154 _NEON2SSESTORAGE int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
2155 _NEON2SSESTORAGE uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
2156 _NEON2SSESTORAGE uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
2157 _NEON2SSESTORAGE uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
2158 _NEON2SSESTORAGE uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
2159 //Bit Clear
2160 _NEON2SSESTORAGE int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
2161 _NEON2SSESTORAGE int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
2162 _NEON2SSESTORAGE int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
2163 _NEON2SSESTORAGE int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
2164 _NEON2SSESTORAGE uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
2165 _NEON2SSESTORAGE uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
2166 _NEON2SSESTORAGE uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
2167 _NEON2SSESTORAGE uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
2168 _NEON2SSESTORAGE int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
2169 _NEON2SSESTORAGE int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
2170 _NEON2SSESTORAGE int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
2171 _NEON2SSESTORAGE int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
2172 _NEON2SSESTORAGE uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
2173 _NEON2SSESTORAGE uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
2174 _NEON2SSESTORAGE uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
2175 _NEON2SSESTORAGE uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
2176 //Bitwise OR complement
2177 _NEON2SSESTORAGE int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
2178 _NEON2SSESTORAGE int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
2179 _NEON2SSESTORAGE int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
2180 _NEON2SSESTORAGE int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
2181 _NEON2SSESTORAGE uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
2182 _NEON2SSESTORAGE uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
2183 _NEON2SSESTORAGE uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
2184 _NEON2SSESTORAGE uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
2185 _NEON2SSESTORAGE int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
2186 _NEON2SSESTORAGE int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
2187 _NEON2SSESTORAGE int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
2188 _NEON2SSESTORAGE int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
2189 _NEON2SSESTORAGE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
2190 _NEON2SSESTORAGE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
2191 _NEON2SSESTORAGE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
2192 _NEON2SSESTORAGE uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
2193 //Bitwise Select
2194 _NEON2SSESTORAGE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
2195 _NEON2SSESTORAGE int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
2196 _NEON2SSESTORAGE int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
2197 _NEON2SSESTORAGE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
2198 _NEON2SSESTORAGE uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
2199 _NEON2SSESTORAGE uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
2200 _NEON2SSESTORAGE uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
2201 _NEON2SSESTORAGE uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
2202 _NEON2SSESTORAGE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
2203 _NEON2SSESTORAGE poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
2204 _NEON2SSESTORAGE poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
2205 _NEON2SSESTORAGE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
2206 _NEON2SSESTORAGE int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
2207 _NEON2SSESTORAGE int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
2208 _NEON2SSESTORAGE int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
2209 _NEON2SSESTORAGE uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
2210 _NEON2SSESTORAGE uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
2211 _NEON2SSESTORAGE uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
2212 _NEON2SSESTORAGE uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
2213 _NEON2SSESTORAGE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
2214 _NEON2SSESTORAGE poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
2215 _NEON2SSESTORAGE poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
2216 //Transposition operations
2217 //Transpose elements
2218 _NEON2SSESTORAGE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
2219 _NEON2SSESTORAGE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
2220 _NEON2SSESTORAGE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
2221 _NEON2SSESTORAGE uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
2222 _NEON2SSESTORAGE uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
2223 _NEON2SSESTORAGE uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
2224 _NEON2SSESTORAGE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
2225 _NEON2SSESTORAGE poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
2226 _NEON2SSESTORAGE poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
2227 _NEON2SSESTORAGE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
2228 _NEON2SSESTORAGE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
2229 _NEON2SSESTORAGE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
2230 _NEON2SSESTORAGE uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
2231 _NEON2SSESTORAGE uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
2232 _NEON2SSESTORAGE uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
2233 _NEON2SSESTORAGE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
2234 _NEON2SSESTORAGE poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
2235 _NEON2SSESTORAGE poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
2236 //Interleave elements
2237 _NEON2SSESTORAGE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
2238 _NEON2SSESTORAGE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
2239 _NEON2SSESTORAGE int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
2240 _NEON2SSESTORAGE uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
2241 _NEON2SSESTORAGE uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
2242 _NEON2SSESTORAGE uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
2243 _NEON2SSESTORAGE float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
2244 _NEON2SSESTORAGE poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
2245 _NEON2SSESTORAGE poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
2246 _NEON2SSESTORAGE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
2247 _NEON2SSESTORAGE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
2248 _NEON2SSESTORAGE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
2249 _NEON2SSESTORAGE uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
2250 _NEON2SSESTORAGE uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
2251 _NEON2SSESTORAGE uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
2252 _NEON2SSESTORAGE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
2253 _NEON2SSESTORAGE poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
2254 _NEON2SSESTORAGE poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
2255 //De-Interleave elements
2256 _NEON2SSESTORAGE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
2257 _NEON2SSESTORAGE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
2258 _NEON2SSESTORAGE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
2259 _NEON2SSESTORAGE uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
2260 _NEON2SSESTORAGE uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
2261 _NEON2SSESTORAGE uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
2262 _NEON2SSESTORAGE float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
2263 _NEON2SSESTORAGE poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
2264 _NEON2SSESTORAGE poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
2265 _NEON2SSESTORAGE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
2266 _NEON2SSESTORAGE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
2267 _NEON2SSESTORAGE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
2268 _NEON2SSESTORAGE uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
2269 _NEON2SSESTORAGE uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
2270 _NEON2SSESTORAGE uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
2271 _NEON2SSESTORAGE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
2272 _NEON2SSESTORAGE poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
2273 _NEON2SSESTORAGE poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
2274
2275 _NEON2SSESTORAGE float32x4_t vrndnq_f32(float32x4_t a); // VRND.F32 q0,q0
2276
2277 _NEON2SSESTORAGE float64x2_t vrndnq_f64(float64x2_t a); // VRND.F64 q0,q0
2278
2279 //Sqrt
2280 _NEON2SSESTORAGE float32x4_t vsqrtq_f32(float32x4_t a); // VSQRT.F32 q0,q0
2281
2282 _NEON2SSESTORAGE float64x2_t vsqrtq_f64(float64x2_t a); // VSQRT.F64 q0,q0
2283
2284
2285
2286 //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
2287 // the following macros solve the problem of the "immediate parameters requirement" for some x86 intrinsics.
2288 // they are needed for the code to compile; without them the compiler stops with the "Intrinsic parameter must be an immediate value" error
2289 //
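// Illustrative sketch (not part of the original header): on compilers that take the switch-based path below,
// these wrappers let a lane index that is only known to lie in a constant range reach the intrinsic as a
// literal immediate, e.g. a hypothetical helper
//    __m128i set_lane_i16(__m128i v, int16_t x, int lane)  //lane assumed in [0..7]
//    {
//        return _MM_INSERT_EPI16(v, x, lane);               //expands to _mm_insert_epi16 with a literal 0..7
//    }
// compiles even though _mm_insert_epi16 itself requires an immediate lane argument.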
2290 #if ( defined (__INTEL_COMPILER) || defined (__GNUC__) && !defined(__llvm__) )
2291 # define _MM_ALIGNR_EPI8 _mm_alignr_epi8
2292 # define _MM_EXTRACT_EPI16 (int16_t) _mm_extract_epi16
2293 # define _MM_INSERT_EPI16 _mm_insert_epi16
2294 # ifdef USE_SSE4
2295 # define _MM_EXTRACT_EPI8 _mm_extract_epi8
2296 # define _MM_EXTRACT_EPI32 _mm_extract_epi32
2297 # define _MM_EXTRACT_PS _mm_extract_ps
2298 # define _MM_INSERT_EPI8 _mm_insert_epi8
2299 # define _MM_INSERT_EPI32 _mm_insert_epi32
2300 # define _MM_INSERT_PS _mm_insert_ps
2301 # ifdef _NEON2SSE_64BIT
2302 # define _MM_INSERT_EPI64 _mm_insert_epi64
2303 # define _MM_EXTRACT_EPI64 _mm_extract_epi64
2304 # endif
2305 # endif //SSE4
2306 #else
2307 # define _NEON2SSE_COMMA ,
2308 # define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \
2309 switch(LANE) \
2310 { \
2311 case 0: return NAME(a b, 0); \
2312 case 1: return NAME(a b, 1); \
2313 case 2: return NAME(a b, 2); \
2314 case 3: return NAME(a b, 3); \
2315 case 4: return NAME(a b, 4); \
2316 case 5: return NAME(a b, 5); \
2317 case 6: return NAME(a b, 6); \
2318 case 7: return NAME(a b, 7); \
2319 case 8: return NAME(a b, 8); \
2320 case 9: return NAME(a b, 9); \
2321 case 10: return NAME(a b, 10); \
2322 case 11: return NAME(a b, 11); \
2323 case 12: return NAME(a b, 12); \
2324 case 13: return NAME(a b, 13); \
2325 case 14: return NAME(a b, 14); \
2326 case 15: return NAME(a b, 15); \
2327 default: return NAME(a b, 0); \
2328 }
2329
2330 # define _NEON2SSE_SWITCH8(NAME, vec, LANE, p) \
2331 switch(LANE) \
2332 { \
2333 case 0: return NAME(vec p,0); \
2334 case 1: return NAME(vec p,1); \
2335 case 2: return NAME(vec p,2); \
2336 case 3: return NAME(vec p,3); \
2337 case 4: return NAME(vec p,4); \
2338 case 5: return NAME(vec p,5); \
2339 case 6: return NAME(vec p,6); \
2340 case 7: return NAME(vec p,7); \
2341 default: return NAME(vec p,0); \
2342 }
2343
2344 # define _NEON2SSE_SWITCH4(NAME, case0, case1, case2, case3, vec, LANE, p) \
2345 switch(LANE) \
2346 { \
2347 case case0: return NAME(vec p,case0); \
2348 case case1: return NAME(vec p,case1); \
2349 case case2: return NAME(vec p,case2); \
2350 case case3: return NAME(vec p,case3); \
2351 default: return NAME(vec p,case0); \
2352 }
2353
2354 _NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE)
2355 {
2356 _NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE)
2357 }
2358
2359 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI16(__m128i vec, int p, const int LANE)
2360 {
2361 _NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p)
2362 }
2363
2364 _NEON2SSE_INLINE int16_t _MM_EXTRACT_EPI16(__m128i vec, const int LANE)
2365 {
2366 _NEON2SSE_SWITCH8(_mm_extract_epi16, vec, LANE,)
2367 }
2368
2369 #ifdef USE_SSE4
2370 _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
2371 {
2372 _NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,)
2373 }
2374
2375 _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
2376 {
2377 _NEON2SSE_SWITCH4(_mm_extract_ps, 0,1,2,3, vec, LANE,)
2378 }
2379
2380 _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
2381 {
2382 _NEON2SSE_SWITCH16(_mm_extract_epi8, vec, , LANE)
2383 }
2384
2385 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
2386 {
2387 _NEON2SSE_SWITCH4(_mm_insert_epi32, 0, 1, 2, 3, vec, LANE, _NEON2SSE_COMMA p)
2388 }
2389
2390 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
2391 {
2392 _NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE)
2393 }
2394
2395 #ifdef _NEON2SSE_64BIT
2396 //the special case of functions available only for SSE4 and 64-bit build.
2397 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI64(__m128i vec, int p, const int LANE)
2398 {
2399 switch(LANE) {
2400 case 0:
2401 return _mm_insert_epi64(vec, p, 0);
2402 case 1:
2403 return _mm_insert_epi64(vec, p, 1);
2404 default:
2405 return _mm_insert_epi64(vec, p, 0);
2406 }
2407 }
2408
2409 _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE)
2410 {
2411 if (LANE ==0) return _mm_extract_epi64(val, 0);
2412 else return _mm_extract_epi64(val, 1);
2413 }
2414 #endif
2415
2416 _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
2417 {
2418 _NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p)
2419 }
2420
2421 #endif //USE_SSE4
2422
2423 #endif // ( defined (__INTEL_COMPILER) || defined (__GNUC__) && !defined(__llvm__) )
2424
2425 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2426 // Below are some helper functions used either to "emulate" SSE4 intrinsics on SSSE3-limited devices
2427 // or to implement some commonly used operations that are missing in SSE
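// Illustrative note (assumption about intended use): code further down always calls the upper-case helper,
// and the preprocessor picks either the genuine SSE4.1 intrinsic or the SSSE3-level fallback defined below,
// e.g. a hypothetical
//    __m128i widen_lo_u8(__m128i v)
//    {
//        return _MM_CVTEPU8_EPI16(v);  //_mm_cvtepu8_epi16 with USE_SSE4, unpack-with-zero otherwise
//    }
// produces the same result on both paths.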
2428 #ifdef USE_SSE4
2429 # define _MM_CVTEPU8_EPI16 _mm_cvtepu8_epi16
2430 # define _MM_CVTEPU16_EPI32 _mm_cvtepu16_epi32
2431 # define _MM_CVTEPU32_EPI64 _mm_cvtepu32_epi64
2432
2433 # define _MM_CVTEPI8_EPI16 _mm_cvtepi8_epi16
2434 # define _MM_CVTEPI16_EPI32 _mm_cvtepi16_epi32
2435 # define _MM_CVTEPI32_EPI64 _mm_cvtepi32_epi64
2436
2437 # define _MM_MAX_EPI8 _mm_max_epi8
2438 # define _MM_MAX_EPI32 _mm_max_epi32
2439 # define _MM_MAX_EPU16 _mm_max_epu16
2440 # define _MM_MAX_EPU32 _mm_max_epu32
2441
2442 # define _MM_MIN_EPI8 _mm_min_epi8
2443 # define _MM_MIN_EPI32 _mm_min_epi32
2444 # define _MM_MIN_EPU16 _mm_min_epu16
2445 # define _MM_MIN_EPU32 _mm_min_epu32
2446
2447 # define _MM_BLENDV_EPI8 _mm_blendv_epi8
2448 # define _MM_PACKUS_EPI32 _mm_packus_epi32
2449 # define _MM_PACKUS1_EPI32(a) _mm_packus_epi32(a, a)
2450
2451 # define _MM_MULLO_EPI32 _mm_mullo_epi32
2452 # define _MM_MUL_EPI32 _mm_mul_epi32
2453
2454 # define _MM_CMPEQ_EPI64 _mm_cmpeq_epi64
2455 #else //no SSE4 !!!!!!
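// The fallbacks below reproduce the SSE4.1 results with SSE2/SSSE3 only: zero extension is an unpack of the
// low half with a zero vector, sign extension is an unpack with a compare-generated sign mask, the extract/insert
// helpers go through an aligned temporary in memory, and the remaining helpers are built from compares,
// shifts and logical operations.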
2456 _NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a)
2457 {
2458 __m128i zero = _mm_setzero_si128();
2459 return _mm_unpacklo_epi8(a, zero);
2460 }
2461
2462 _NEON2SSE_INLINE __m128i _MM_CVTEPU16_EPI32(__m128i a)
2463 {
2464 __m128i zero = _mm_setzero_si128();
2465 return _mm_unpacklo_epi16(a, zero);
2466 }
2467
2468 _NEON2SSE_INLINE __m128i _MM_CVTEPU32_EPI64(__m128i a)
2469 {
2470 __m128i zero = _mm_setzero_si128();
2471 return _mm_unpacklo_epi32(a, zero);
2472 }
2473
2474 _NEON2SSE_INLINE __m128i _MM_CVTEPI8_EPI16(__m128i a)
2475 {
2476 __m128i zero = _mm_setzero_si128();
2477 __m128i sign = _mm_cmpgt_epi8(zero, a);
2478 return _mm_unpacklo_epi8(a, sign);
2479 }
2480
2481 _NEON2SSE_INLINE __m128i _MM_CVTEPI16_EPI32(__m128i a)
2482 {
2483 __m128i zero = _mm_setzero_si128();
2484 __m128i sign = _mm_cmpgt_epi16(zero, a);
2485 return _mm_unpacklo_epi16(a, sign);
2486 }
2487
2488 _NEON2SSE_INLINE __m128i _MM_CVTEPI32_EPI64(__m128i a)
2489 {
2490 __m128i zero = _mm_setzero_si128();
2491 __m128i sign = _mm_cmpgt_epi32(zero, a);
2492 return _mm_unpacklo_epi32(a, sign);
2493 }
2494
2495 _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
2496 {
2497 _NEON2SSE_ALIGN_16 int32_t tmp[4];
2498 _mm_store_si128((__m128i*)tmp, vec);
2499 return tmp[LANE];
2500 }
2501
2502 _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
2503 {
2504 _NEON2SSE_ALIGN_16 int8_t tmp[16];
2505 _mm_store_si128((__m128i*)tmp, vec);
2506 return (int)tmp[LANE];
2507 }
2508
2509 _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
2510 {
2511 _NEON2SSE_ALIGN_16 int32_t tmp[4];
2512 _mm_store_si128((__m128i*)tmp, _M128i(vec));
2513 return tmp[LANE];
2514 }
2515
2516 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
2517 {
2518 _NEON2SSE_ALIGN_16 int32_t pvec[4] = {0,0,0,0};
2519 _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
2520 __m128i vec_masked, p_masked;
2521 pvec[LANE] = p;
2522 mask[LANE] = 0x0;
2523 vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
2524 p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
2525 return _mm_or_si128(vec_masked, p_masked);
2526 }
2527
2528 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
2529 {
2530 _NEON2SSE_ALIGN_16 int8_t pvec[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
2531 _NEON2SSE_ALIGN_16 uint8_t mask[16] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};
2532 __m128i vec_masked, p_masked;
2533 pvec[LANE] = (int8_t)p;
2534 mask[LANE] = 0x0;
2535 vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
2536 p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
2537 return _mm_or_si128(vec_masked, p_masked);
2538 }
2539
2540 _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
2541 {
2542 _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
2543 __m128 tmp, vec_masked, p_masked;
2544 mask[LANE >> 4] = 0x0; //here LANE is the _mm_insert_ps immediate, not a plain lane index, so take the destination lane from bits 5:4
2545 vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p
2546 p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec
2547 tmp = _mm_or_ps(vec_masked, p_masked);
2548 return tmp;
2549 }
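// Note (clarification added here, not in the original header): the LANE passed to _MM_INSERT_PS is the
// _mm_insert_ps immediate rather than a plain lane index - bits 5:4 select the destination lane, which is
// why only the values 0, 16, 32 and 48 are used and why the emulation above indexes mask[LANE >> 4].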
2550
2551 _NEON2SSE_INLINE __m128i _MM_MAX_EPI8(__m128i a, __m128i b)
2552 {
2553 __m128i cmp, resa, resb;
2554 cmp = _mm_cmpgt_epi8 (a, b);
2555 resa = _mm_and_si128 (cmp, a);
2556 resb = _mm_andnot_si128 (cmp,b);
2557 return _mm_or_si128(resa, resb);
2558 }
2559
2560 _NEON2SSE_INLINE __m128i _MM_MAX_EPI32(__m128i a, __m128i b)
2561 {
2562 __m128i cmp, resa, resb;
2563 cmp = _mm_cmpgt_epi32(a, b);
2564 resa = _mm_and_si128 (cmp, a);
2565 resb = _mm_andnot_si128 (cmp,b);
2566 return _mm_or_si128(resa, resb);
2567 }
2568
2569 _NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b)
2570 {
2571 __m128i c8000, b_s, a_s, cmp;
2572 c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
2573 c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
2574 b_s = _mm_sub_epi16 (b, c8000);
2575 a_s = _mm_sub_epi16 (a, c8000);
2576 cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed
2577 a_s = _mm_and_si128 (cmp,a);
2578 b_s = _mm_andnot_si128 (cmp,b);
2579 return _mm_or_si128(a_s, b_s);
2580 }
2581
2582 _NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b)
2583 {
2584 __m128i c80000000, b_s, a_s, cmp;
2585 c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
2586 c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
2587 b_s = _mm_sub_epi32 (b, c80000000);
2588 a_s = _mm_sub_epi32 (a, c80000000);
2589 cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed
2590 a_s = _mm_and_si128 (cmp,a);
2591 b_s = _mm_andnot_si128 (cmp,b);
2592 return _mm_or_si128(a_s, b_s);
2593 }
2594
2595 _NEON2SSE_INLINE __m128i _MM_MIN_EPI8(__m128i a, __m128i b)
2596 {
2597 __m128i cmp, resa, resb;
2598 cmp = _mm_cmpgt_epi8 (b, a);
2599 resa = _mm_and_si128 (cmp, a);
2600 resb = _mm_andnot_si128 (cmp,b);
2601 return _mm_or_si128(resa, resb);
2602 }
2603
2604 _NEON2SSE_INLINE __m128i _MM_MIN_EPI32(__m128i a, __m128i b)
2605 {
2606 __m128i cmp, resa, resb;
2607 cmp = _mm_cmpgt_epi32(b, a);
2608 resa = _mm_and_si128 (cmp, a);
2609 resb = _mm_andnot_si128 (cmp,b);
2610 return _mm_or_si128(resa, resb);
2611 }
2612
2613 _NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b)
2614 {
2615 __m128i c8000, b_s, a_s, cmp;
2616 c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
2617 c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
2618 b_s = _mm_sub_epi16 (b, c8000);
2619 a_s = _mm_sub_epi16 (a, c8000);
2620 cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed
2621 a_s = _mm_and_si128 (cmp,a);
2622 b_s = _mm_andnot_si128 (cmp,b);
2623 return _mm_or_si128(a_s, b_s);
2624 }
2625
2626 _NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b)
2627 {
2628 __m128i c80000000, b_s, a_s, cmp;
2629 c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
2630 c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
2631 b_s = _mm_sub_epi32 (b, c80000000);
2632 a_s = _mm_sub_epi32 (a, c80000000);
2633 cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed
2634 a_s = _mm_and_si128 (cmp,a);
2635 b_s = _mm_andnot_si128 (cmp,b);
2636 return _mm_or_si128(a_s, b_s);
2637 }
2638
2639 _NEON2SSE_INLINE __m128i _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT an exact implementation of _mm_blendv_epi8 - please see the note below
2640 {
2641 //it assumes each mask byte is either 0xff or 0 (as in all use cases below), while for the original _mm_blendv_epi8 only the MSB of each mask byte matters.
2642 __m128i a_masked, b_masked;
2643 b_masked = _mm_and_si128 (mask,b); //use b if mask 0xff
2644 a_masked = _mm_andnot_si128 (mask,a);
2645 return _mm_or_si128(a_masked, b_masked);
2646 }
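// A minimal usage sketch (illustrative): with a compare-generated mask the helper matches the real intrinsic, e.g.
//    __m128i m = _mm_cmpgt_epi8(a, b);      //every byte of m is 0xff or 0x00
//    __m128i r = _MM_BLENDV_EPI8(a, b, m);  //takes b where the mask byte is 0xff, otherwise a
// A mask with only the most significant bit set in a byte would work with _mm_blendv_epi8 but not with this helper.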
2647
2648 _NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b)
2649 {
2650 __m128i a16, b16, res, reshi,cmp, zero;
2651 zero = _mm_setzero_si128();
2652 a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd);
2653 b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd);
2654 res = _mm_unpacklo_epi64(a16, b16); //result without saturation
2655 reshi = _mm_unpackhi_epi64(a16, b16); //hi part of result used for saturation
2656 cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
2657 res = _mm_andnot_si128(cmp,res); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0
2658 cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive
2659 return _mm_or_si128(res, cmp); //if cmp is positive we are out of 16 bits and need to saturate to 0xffff
2660 }
2661
2662 _NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a)
2663 {
2664 __m128i a16, res, reshi,cmp, zero;
2665 zero = _mm_setzero_si128();
2666 a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd);
2667 reshi = _mm_unpackhi_epi64(a16, a16); //hi part of result used for saturation
2668 cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
2669 res = _mm_andnot_si128(cmp, a16); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0
2670 cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive
2671 return _mm_or_si128(res, cmp); //if cmp is positive we are out of 16 bits and need to saturate to 0xffff
2672 }
2673
2674
2675 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(__m128i _MM_MULLO_EPI32(__m128i a, __m128i b), _NEON2SSE_REASON_SLOW_SERIAL)
2676 {
2677 _NEON2SSE_ALIGN_16 int32_t atmp[4], btmp[4], res[4];
2678 int64_t res64;
2679 int i;
2680 _mm_store_si128((__m128i*)atmp, a);
2681 _mm_store_si128((__m128i*)btmp, b);
2682 for (i = 0; i<4; i++) {
2683 res64 = (int64_t)atmp[i] * btmp[i]; //widen before multiplying to avoid 32-bit signed overflow
2684 res[i] = (int)(res64 & 0xffffffff);
2685 }
2686 return _mm_load_si128((__m128i*)res);
2687 }
2688
2689 _NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b)
2690 {
2691 __m128i sign, zero, mul_us, a_neg, b_neg, mul_us_neg;
2692 sign = _mm_xor_si128 (a, b);
2693 sign = _mm_srai_epi32 (sign, 31); //spread the sign bit over each lane: all ones if the product is negative, all zeros otherwise
2694 sign = _mm_shuffle_epi32(sign, _MM_SHUFFLE(2, 2, 0, 0)); //duplicate the signs of lanes 0 and 2 across the corresponding 64-bit results
2695 zero = _mm_setzero_si128();
2696 a_neg = _mm_abs_epi32 (a); //absolute value of a
2697 b_neg = _mm_abs_epi32 (b); //absolute value of b
2698 mul_us = _mm_mul_epu32 (a_neg, b_neg); //uses 0 and 2nd data lanes, (abs), the multiplication gives 64 bit result
2699 mul_us_neg = _mm_sub_epi64(zero, mul_us);
2700 mul_us_neg = _mm_and_si128(sign, mul_us_neg);
2701 mul_us = _mm_andnot_si128(sign, mul_us);
2702 return _mm_or_si128 (mul_us, mul_us_neg);
2703 }
2704
2705 _NEON2SSE_INLINE __m128i _MM_CMPEQ_EPI64(__m128i a, __m128i b)
2706 {
2707 __m128i res;
2708 res = _mm_cmpeq_epi32 (a, b);
2709 return _mm_shuffle_epi32 (res, 1 | (1 << 2) | (3 << 4) | (3 << 6)); //copy the information from hi to low part of the 64 bit data
2710 }
2711 #endif //SSE4
2712
2713 //fallbacks used unless both SSE4 and a 64-bit build are available
2714 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI64_32(__m128i vec, int p, const int LANE)
2715 {
2716 _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0};
2717 _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff, 0xffffffffffffffff};
2718 __m128i vec_masked, p_masked;
2719 pvec[LANE] = p;
2720 mask[LANE] = 0x0;
2721 vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
2722 p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
2723 return _mm_or_si128(vec_masked, p_masked);
2724 }
2725
2726 _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64_32(__m128i val, const int LANE)
2727 {
2728 _NEON2SSE_ALIGN_16 int64_t tmp[2];
2729 _mm_store_si128((__m128i*)tmp, val);
2730 return tmp[LANE];
2731 }
2732
2733 #ifndef _NEON2SSE_64BIT_SSE4
2734 # define _MM_INSERT_EPI64 _MM_INSERT_EPI64_32
2735 # define _MM_EXTRACT_EPI64 _MM_EXTRACT_EPI64_32
2736 #endif
2737
2738 _NEON2SSESTORAGE int32x4_t vqd_s32(int32x4_t a); //Doubling saturation for signed ints
2739 _NEON2SSE_INLINE int32x4_t vqd_s32(int32x4_t a)
2740 {
2741 //Overflow happens only if a and the doubled result have opposite signs
2742 __m128i c7fffffff, res, res_sat, res_xor_a;
2743 c7fffffff = _mm_set1_epi32(0x7fffffff);
2744 res = _mm_slli_epi32 (a, 1); // res = a*2
2745 res_sat = _mm_srli_epi32(a, 31);
2746 res_sat = _mm_add_epi32(res_sat, c7fffffff);
2747 res_xor_a = _mm_xor_si128(res, a);
2748 res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if the doubling overflowed, all zeros otherwise
2749 res_sat = _mm_and_si128(res_xor_a, res_sat);
2750 res = _mm_andnot_si128(res_xor_a, res);
2751 return _mm_or_si128(res, res_sat);
2752 }
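// Worked example (illustrative): vqd_s32 doubles each lane with saturation, so 0x20000000 becomes 0x40000000,
// 0x40000000 would overflow on doubling and is clamped to 0x7fffffff, and 0x80000000 (INT32_MIN) saturates to 0x80000000.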
2753
2754
2755 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
2756 //*************************************************************************
2757 //*************************************************************************
2758 //***************** Functions redefinition/implementation starts here *****
2759 //*************************************************************************
2760 //*************************************************************************
2761 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
2762
2763 /*If a unified intrinsics solution is necessary, define your SSE intrinsics wrappers here, as in the following sample:
2764 #ifdef ARM
2765 #define vector_addq_s32 _mm_add_epi32
2766 #else //if we have IA
2767 #define vector_addq_s32 vadd_s32
2768 #endif
2769
2770 ********************************************************************************************
2771 Functions below are organised in the following way:
2772
2773 Each NEON intrinsic function is implemented in one of the following ways:
2774 1. it has a full x86 SSE equivalent - in this case the x86 version simply follows the NEON one in the corresponding #define statement
2775 2. it is implemented with more than one x86 intrinsic - in this case it is shaped as an inlined C function with a return statement
2776 3. it refers to another NEON function that returns the same result and is implemented in x86 as above - in this case it is shaped as a matching NEON function definition
2777 4. for about 5% of the functions, because the corresponding x86 SIMD support is unavailable or too slow,
2778 a serial implementation is provided together with a compiler warning. If such functions are on your application's critical path,
2779 please consider removing them from your code.
2780 */
2781
2782 //***********************************************************************
2783 //************************ Vector add *****************************
2784 //***********************************************************************
2785 _NEON2SSESTORAGE int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
2786 _NEON2SSE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b)
2787 {
2788 int8x8_t res64;
2789 return64(_mm_add_epi8(_pM128i(a),_pM128i(b)));
2790 }
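// Illustrative note (assumption about the pattern used below): 64-bit "d register" operands are held in
// __m64_128 emulation types, so the usual shape is to widen the inputs with _pM128i(), run the 128-bit SSE
// intrinsic and hand back the low 64 bits through return64(), e.g. a hypothetical
//    int16x4_t sub4_s16(int16x4_t a, int16x4_t b)
//    {
//        int16x4_t res64;
//        return64(_mm_sub_epi16(_pM128i(a), _pM128i(b))); //only the low 64 bits of the result are meaningful
//    }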
2791
2792
2793 _NEON2SSESTORAGE int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
2794 _NEON2SSE_INLINE int16x4_t vadd_s16(int16x4_t a, int16x4_t b)
2795 {
2796 int16x4_t res64;
2797 return64(_mm_add_epi16(_pM128i(a),_pM128i(b)));
2798 }
2799
2800
2801 _NEON2SSESTORAGE int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
2802 _NEON2SSE_INLINE int32x2_t vadd_s32(int32x2_t a, int32x2_t b)
2803 {
2804 int32x2_t res64;
2805 return64(_mm_add_epi32(_pM128i(a),_pM128i(b)));
2806 }
2807
2808
2809 _NEON2SSESTORAGE int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
2810 _NEON2SSE_INLINE int64x1_t vadd_s64(int64x1_t a, int64x1_t b)
2811 {
2812 int64x1_t res64;
2813 res64.m64_i64[0] = a.m64_i64[0] + b.m64_i64[0];
2814 return res64;
2815 }
2816
2817
2818 _NEON2SSESTORAGE float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
2819 _NEON2SSE_INLINE float32x2_t vadd_f32(float32x2_t a, float32x2_t b)
2820 {
2821 __m128 res;
2822 __m64_128 res64;
2823 res = _mm_add_ps(_pM128(a),_pM128(b)); //SSE, use only low 64 bits
2824 _M64f(res64, res);
2825 return res64;
2826 }
2827
2828 _NEON2SSESTORAGE uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
2829 #define vadd_u8 vadd_s8
2830
2831 _NEON2SSESTORAGE uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
2832 #define vadd_u16 vadd_s16
2833
2834 _NEON2SSESTORAGE uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
2835 #define vadd_u32 vadd_s32
2836
2837 _NEON2SSESTORAGE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
2838 _NEON2SSE_INLINE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b)
2839 {
2840 uint64x1_t res64;
2841 res64.m64_u64[0] = a.m64_u64[0] + b.m64_u64[0];
2842 return res64;
2843 }
2844
2845
2846 _NEON2SSESTORAGE int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
2847 #define vaddq_s8 _mm_add_epi8
2848
2849 _NEON2SSESTORAGE int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
2850 #define vaddq_s16 _mm_add_epi16
2851
2852 _NEON2SSESTORAGE int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
2853 #define vaddq_s32 _mm_add_epi32
2854
2855 _NEON2SSESTORAGE int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
2856 #define vaddq_s64 _mm_add_epi64
2857
2858 _NEON2SSESTORAGE float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
2859 #define vaddq_f32 _mm_add_ps
2860
2861 _NEON2SSESTORAGE uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
2862 #define vaddq_u8 _mm_add_epi8
2863
2864 _NEON2SSESTORAGE uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
2865 #define vaddq_u16 _mm_add_epi16
2866
2867 _NEON2SSESTORAGE uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
2868 #define vaddq_u32 _mm_add_epi32
2869
2870 _NEON2SSESTORAGE uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
2871 #define vaddq_u64 _mm_add_epi64
2872
2873 //**************************** Vector long add *****************************:
2874 //***********************************************************************
2875 //Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
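// Worked example (illustrative): vaddl_u8 widens both operands before adding, so lanes holding 200 and 100
// produce 300 in the corresponding 16-bit result lane instead of wrapping around.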
2876 _NEON2SSESTORAGE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
2877 _NEON2SSE_INLINE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b) // VADDL.S8 q0,d0,d0
2878 {
2879 __m128i a16, b16;
2880 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
2881 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
2882 return _mm_add_epi16 (a16, b16);
2883 }
2884
2885 _NEON2SSESTORAGE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
2886 _NEON2SSE_INLINE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b) // VADDL.S16 q0,d0,d0
2887 {
2888 __m128i a32, b32;
2889 a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
2890 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1
2891 return _mm_add_epi32 (a32, b32);
2892 }
2893
2894 _NEON2SSESTORAGE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
2895 _NEON2SSE_INLINE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b) // VADDL.S32 q0,d0,d0
2896 {
2897 //may not be optimal
2898 __m128i a64, b64;
2899 a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
2900 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
2901 return _mm_add_epi64 ( a64, b64);
2902 }
2903
2904 _NEON2SSESTORAGE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
2905 _NEON2SSE_INLINE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b) // VADDL.U8 q0,d0,d0
2906 {
2907 __m128i a16, b16;
2908 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1
2909 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
2910 return _mm_add_epi16 (a16, b16);
2911 }
2912
2913 _NEON2SSESTORAGE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.s16 q0,d0,d0
2914 _NEON2SSE_INLINE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b) // VADDL.U16 q0,d0,d0
2915 {
2916 __m128i a32, b32;
2917 a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
2918 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
2919 return _mm_add_epi32 (a32, b32);
2920 }
2921
2922 _NEON2SSESTORAGE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
2923 _NEON2SSE_INLINE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b) // VADDL.U32 q0,d0,d0
2924 {
2925 //may not be optimal
2926 __m128i a64, b64;
2927 a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
2928 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
2929 return _mm_add_epi64 (a64, b64);
2930 }
2931
2932 //*************** Vector wide add: vaddw_<type>. Vr[i]:=Va[i]+Vb[i] ******************
2933 //*************** *********************************************************************
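// Worked example (illustrative): only the narrow operand b is widened here, so vaddw_u8 adds an 8-bit lane
// of b to an already 16-bit lane of a, e.g. the lane pair (1000, 200) yields 1200.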
2934 _NEON2SSESTORAGE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
2935 _NEON2SSE_INLINE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b) // VADDW.S8 q0,q0,d0
2936 {
2937 __m128i b16;
2938 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
2939 return _mm_add_epi16 (a, b16);
2940 }
2941
2942 _NEON2SSESTORAGE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
_NEON2SSE_INLINE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b) // VADDW.S16 q0,q0,d0
2944 {
2945 __m128i b32;
2946 b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1,
2947 return _mm_add_epi32 (a, b32);
2948 }
2949
2950 _NEON2SSESTORAGE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
_NEON2SSE_INLINE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b) // VADDW.S32 q0,q0,d0
2952 {
2953 __m128i b64;
2954 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
2955 return _mm_add_epi64 (a, b64);
2956 }
2957
2958 _NEON2SSESTORAGE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
_NEON2SSE_INLINE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b) // VADDW.U8 q0,q0,d0
2960 {
2961 __m128i b16;
2962 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
2963 return _mm_add_epi16 (a, b16);
2964 }
2965
2966 _NEON2SSESTORAGE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.s16 q0,q0,d0
_NEON2SSE_INLINE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b) // VADDW.s16 q0,q0,d0
2968 {
2969 __m128i b32;
2970 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
2971 return _mm_add_epi32 (a, b32);
2972 }
2973
2974 _NEON2SSESTORAGE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
_NEON2SSE_INLINE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b) // VADDW.U32 q0,q0,d0
2976 {
2977 __m128i b64;
2978 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
2979 return _mm_add_epi64 (a, b64);
2980 }
2981
2982 //******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 , result truncated *******************************
2983 //*************************************************************************************************************************
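//Implementation note: the signed and the 32-bit unsigned q-forms below rely on the identity
//  x + y == 2*(x & y) + (x ^ y),
//so (x + y) >> 1 == (x & y) + ((x ^ y) >> 1) is computed without ever forming the full-width
//sum, i.e. without internal overflow. Worked example (int8): x = 100, y = 50 gives
//(100 & 50) + ((100 ^ 50) >> 1) = 32 + 43 = 75, which equals (150 >> 1) although 100 + 50
//does not fit in int8.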
2984 _NEON2SSESTORAGE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b)
2986 {
2987 int8x8_t res64;
2988 return64(vhaddq_s8(_pM128i(a), _pM128i(b)));
2989 }
2990
2991
2992 _NEON2SSESTORAGE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b)
2994 {
2995 int16x4_t res64;
2996 return64( vhaddq_s16(_pM128i(a), _pM128i(b)));
2997 }
2998
2999
3000 _NEON2SSESTORAGE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b)
3002 {
3003 int32x2_t res64;
3004 return64( vhaddq_s32(_pM128i(a), _pM128i(b)));
3005 }
3006
3007
_NEON2SSESTORAGE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b)
3010 {
3011 uint8x8_t res64;
3012 return64( vhaddq_u8(_pM128i(a), _pM128i(b)));
3013 }
3014
3015
3016 _NEON2SSESTORAGE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.s16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b)
3018 {
3019 uint16x4_t res64;
3020 return64( vhaddq_u16(_pM128i(a), _pM128i(b)));
3021 }
3022
3023
3024 _NEON2SSESTORAGE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b)
3026 {
3027 uint32x2_t res64;
3028 return64( vhaddq_u32(_pM128i(a), _pM128i(b)));
3029 }
3030
3031
3032 _NEON2SSESTORAGE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b)
3034 {
3035 //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
3036 __m128i tmp1, tmp2;
3037 tmp1 = _mm_and_si128(a,b);
3038 tmp2 = _mm_xor_si128(a,b);
3039 tmp2 = vshrq_n_s8(tmp2,1);
3040 return _mm_add_epi8(tmp1,tmp2);
3041 }
3042
_NEON2SSESTORAGE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b)
3045 {
3046 //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
3047 __m128i tmp1, tmp2;
3048 tmp1 = _mm_and_si128(a,b);
3049 tmp2 = _mm_xor_si128(a,b);
3050 tmp2 = _mm_srai_epi16(tmp2,1);
3051 return _mm_add_epi16(tmp1,tmp2);
3052 }
3053
3054 _NEON2SSESTORAGE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0
3056 {
3057 //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
3058 __m128i tmp1, tmp2;
3059 tmp1 = _mm_and_si128(a,b);
3060 tmp2 = _mm_xor_si128(a,b);
3061 tmp2 = _mm_srai_epi32(tmp2,1);
3062 return _mm_add_epi32(tmp1,tmp2);
3063 }
3064
3065 _NEON2SSESTORAGE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
_NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0
3067 {
3068 __m128i c1, sum, res;
3069 c1 = _mm_set1_epi8(1);
3070 sum = _mm_avg_epu8(a, b); //result is rounded, need to compensate it
3071 res = _mm_xor_si128(a, b); //for rounding compensation
3072 res = _mm_and_si128(res,c1); //for rounding compensation
3073 return _mm_sub_epi8 (sum, res); //actual rounding compensation
3074 }
3075
3076 _NEON2SSESTORAGE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.s16 q0,q0,q0
_NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.s16 q0,q0,q0
3078 {
3079 __m128i sum, res;
3080 sum = _mm_avg_epu16(a, b); //result is rounded, need to compensate it
3081 res = _mm_xor_si128(a, b); //for rounding compensation
3082 res = _mm_slli_epi16 (res,15); //shift left then back right to
3083 res = _mm_srli_epi16 (res,15); //get 1 or zero
3084 return _mm_sub_epi16 (sum, res); //actual rounding compensation
3085 }
3086
3087 _NEON2SSESTORAGE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0
3089 {
3090 //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
3091 __m128i tmp1, tmp2;
3092 tmp1 = _mm_and_si128(a,b);
3093 tmp2 = _mm_xor_si128(a,b);
3094 tmp2 = _mm_srli_epi32(tmp2,1);
3095 return _mm_add_epi32(tmp1,tmp2);
3096 }
3097
3098 //************************Vector rounding halving add: vrhadd{q}_<type>. Vr[i]:=(Va[i]+Vb[i]+1)>>1 ***************************
3099 //*****************************************************************************************************************************
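//Implementation note: for unsigned 8/16-bit lanes _mm_avg_epu8/_mm_avg_epu16 already compute
//(a + b + 1) >> 1, which is exactly the NEON rounding halving add, so those map directly.
//The signed q-forms below reuse the same instructions after biasing both inputs by 0x80/0x8000
//(turning signed lanes into unsigned ones) and un-biasing the result afterwards.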
3100 _NEON2SSESTORAGE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b)
3102 {
3103 int8x8_t res64;
3104 return64(vrhaddq_s8(_pM128i(a), _pM128i(b)));
3105 }
3106
3107
3108 _NEON2SSESTORAGE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b)
3110 {
3111 int16x4_t res64;
3112 return64(vrhaddq_s16(_pM128i(a), _pM128i(b)));
3113 }
3114
3115
3116 _NEON2SSESTORAGE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b)
3118 {
3119 int32x2_t res64;
3120 return64(vrhaddq_s32(_pM128i(a), _pM128i(b)));
3121 }
3122
3123
3124 _NEON2SSESTORAGE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b)
3126 {
3127 uint8x8_t res64;
3128 return64(_mm_avg_epu8(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
3129 }
3130
3131
3132 _NEON2SSESTORAGE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.s16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b)
3134 {
3135 uint16x4_t res64;
3136 return64(_mm_avg_epu16(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
3137 }
3138
3139
3140 _NEON2SSESTORAGE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b)
3142 {
3143 uint32x2_t res64;
3144 return64(vrhaddq_u32(_pM128i(a), _pM128i(b)));
3145 }
3146
3147
3148 _NEON2SSESTORAGE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0
3150 {
3151 //no signed average in x86 SIMD, go to unsigned
3152 __m128i c128, au, bu, sum;
3153 c128 = _mm_set1_epi8((int8_t)0x80); //-128
3154 au = _mm_sub_epi8(a, c128); //add 128
3155 bu = _mm_sub_epi8(b, c128); //add 128
3156 sum = _mm_avg_epu8(au, bu);
3157 return _mm_add_epi8 (sum, c128); //sub 128
3158 }
3159
3160 _NEON2SSESTORAGE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0
3162 {
3163 //no signed average in x86 SIMD, go to unsigned
3164 __m128i cx8000, au, bu, sum;
3165 cx8000 = _mm_set1_epi16((int16_t)0x8000); // - 32768
3166 au = _mm_sub_epi16(a, cx8000); //add 32768
3167 bu = _mm_sub_epi16(b, cx8000); //add 32768
3168 sum = _mm_avg_epu16(au, bu);
3169 return _mm_add_epi16 (sum, cx8000); //sub 32768
3170 }
3171
3172 _NEON2SSESTORAGE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b)
3174 {
3175 //need to avoid overflow
3176 __m128i a2, b2, res, sum;
3177 a2 = _mm_srai_epi32(a,1); //a2=a/2;
3178 b2 = _mm_srai_epi32(b,1); // b2=b/2;
3179 res = _mm_or_si128(a,b); //for rounding
3180 res = _mm_slli_epi32 (res,31); //shift left then back right to
3181 res = _mm_srli_epi32 (res,31); //get 1 or zero
3182 sum = _mm_add_epi32(a2,b2);
3183 return _mm_add_epi32(sum,res);
3184 }
3185
3186 _NEON2SSESTORAGE uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
3187 #define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded
3188
3189 _NEON2SSESTORAGE uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.s16 q0,q0,q0
3190 #define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded
3191
3192
3193 _NEON2SSESTORAGE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0
3195 {
3196 //need to avoid overflow
3197 __m128i a2, b2, res, sum;
3198 a2 = _mm_srli_epi32(a,1); //a2=a/2;
3199 b2 = _mm_srli_epi32(b,1); // b2=b/2;
3200 res = _mm_or_si128(a,b); //for rounding
3201 res = _mm_slli_epi32 (res,31); //shift left then back right to
3202 res = _mm_srli_epi32 (res,31); //get 1 or zero
3203 sum = _mm_add_epi32(a2,b2);
3204 return _mm_add_epi32(sum,res);
3205 }
3206
3207 //****************** VQADD: Vector saturating add ************************
3208 //************************************************************************
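//Implementation note: 8- and 16-bit saturating adds map directly to the x86 saturating
//instructions (_mm_adds_epi8/epi16/epu8/epu16). SSE has no 32/64-bit equivalent, so those
//widths detect overflow explicitly (see vqaddq_s32 / vqaddq_u32 below) or, for 64-bit lanes
//without SSE4.2, fall back to a serial computation with a performance warning.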
3209 _NEON2SSESTORAGE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b)
3211 {
3212 int8x8_t res64;
3213 return64(_mm_adds_epi8(_pM128i(a),_pM128i(b)));
3214 }
3215
3216
3217 _NEON2SSESTORAGE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b)
3219 {
3220 int16x4_t res64;
3221 return64(_mm_adds_epi16(_pM128i(a),_pM128i(b)));
3222 }
3223
3224
3225 _NEON2SSESTORAGE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b)
3227 {
3228 int32x2_t res64;
3229 return64(vqaddq_s32(_pM128i(a), _pM128i(b)));
3230 }
3231
3232
3233 _NEON2SSESTORAGE int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqadd_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3235 {
3236 int64x1_t res;
3237 uint64_t a64, b64;
3238 a64 = a.m64_u64[0];
3239 b64 = b.m64_u64[0];
3240 res.m64_u64[0] = a64 + b64;
3241 a64 = (a64 >> 63) + (~_SIGNBIT64);
3242 if ((int64_t)((b64 ^ a64) | ~(res.m64_u64[0] ^ b64))>=0) {
3243 res.m64_u64[0] = a64;
3244 }
3245 return res;
3246 }
3247
3248 _NEON2SSESTORAGE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b)
3250 {
3251 uint8x8_t res64;
3252 return64(_mm_adds_epu8(_pM128i(a),_pM128i(b)));
3253 }
3254
3255
3256 _NEON2SSESTORAGE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.s16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b)
3258 {
3259 uint16x4_t res64;
3260 return64(_mm_adds_epu16(_pM128i(a),_pM128i(b)));
3261 }
3262
3263
3264 _NEON2SSESTORAGE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b)
3266 {
3267 uint32x2_t res64;
3268 return64(vqaddq_u32(_pM128i(a), _pM128i(b)));
3269 }
3270
3271
3272 _NEON2SSESTORAGE uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3274 {
3275 _NEON2SSE_ALIGN_16 uint64_t a64, b64;
3276 uint64x1_t res;
3277 a64 = a.m64_u64[0];
3278 b64 = b.m64_u64[0];
3279 res.m64_u64[0] = a64 + b64;
3280 if (res.m64_u64[0] < a64) {
3281 res.m64_u64[0] = ~(uint64_t)0;
3282 }
3283 return res;
3284 }
3285
3286 _NEON2SSESTORAGE int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
3287 #define vqaddq_s8 _mm_adds_epi8
3288
3289 _NEON2SSESTORAGE int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
3290 #define vqaddq_s16 _mm_adds_epi16
3291
3292 _NEON2SSESTORAGE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b)
{
//no corresponding x86 SIMD solution, special tricks are necessary. Overflow happens only if a and b have the same sign and the sum has the opposite sign
3296 __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_;
3297 c7fffffff = _mm_set1_epi32(0x7fffffff);
3298 res = _mm_add_epi32(a, b);
3299 res_sat = _mm_srli_epi32(a, 31);
3300 res_sat = _mm_add_epi32(res_sat, c7fffffff);
3301 res_xor_a = _mm_xor_si128(res, a);
3302 b_xor_a_ = _mm_xor_si128(b, a);
3303 res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a);
res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if <0, all zeros otherwise
3305 res_sat = _mm_and_si128(res_xor_a, res_sat);
3306 res = _mm_andnot_si128(res_xor_a, res);
3307 return _mm_or_si128(res, res_sat);
3308 }
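//How the saturation above works: signed overflow is possible only when a and b have the same
//sign and the raw sum's sign differs from it, i.e. when ~(b ^ a) & (res ^ a) has its sign bit
//set. res_sat pre-computes the saturation value from the sign of a alone: (a >> 31) + 0x7fffffff
//gives 0x7fffffff for a >= 0 and 0x80000000 for a < 0. The final and/andnot/or sequence is a
//branchless select between the raw sum and that saturated value.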
3309
3310 _NEON2SSESTORAGE int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3312 {
3313 _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
3314 _mm_store_si128((__m128i*)atmp, a);
3315 _mm_store_si128((__m128i*)btmp, b);
3316 res[0] = atmp[0] + btmp[0];
3317 res[1] = atmp[1] + btmp[1];
3318
3319 atmp[0] = (atmp[0] >> 63) + (~_SIGNBIT64);
3320 atmp[1] = (atmp[1] >> 63) + (~_SIGNBIT64);
3321
3322 if ((int64_t)((btmp[0] ^ atmp[0]) | ~(res[0] ^ btmp[0]))>=0) {
3323 res[0] = atmp[0];
3324 }
3325 if ((int64_t)((btmp[1] ^ atmp[1]) | ~(res[1] ^ btmp[1]))>=0) {
3326 res[1] = atmp[1];
3327 }
3328 return _mm_load_si128((__m128i*)res);
3329 }
3330
3331 _NEON2SSESTORAGE uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
3332 #define vqaddq_u8 _mm_adds_epu8
3333
3334 _NEON2SSESTORAGE uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.s16 q0,q0,q0
3335 #define vqaddq_u16 _mm_adds_epu16
3336
3337 _NEON2SSESTORAGE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b)
3339 {
3340 __m128i c80000000, cmp, subsum, suba, sum;
3341 c80000000 = _mm_set1_epi32 (0x80000000);
3342 sum = _mm_add_epi32 (a, b);
3343 subsum = _mm_sub_epi32 (sum, c80000000);
3344 suba = _mm_sub_epi32 (a, c80000000);
3345 cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed
3346 return _mm_or_si128 (sum, cmp); //saturation
3347 }
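//How the saturation above works: SSE has no unsigned 32-bit compare, so both the sum and a are
//biased by 0x80000000 and compared as signed values; a > sum can only hold if the addition
//wrapped around. The resulting all-ones mask is OR-ed into the sum, which is precisely the
//unsigned saturation value 0xffffffff.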
3348
3349 _NEON2SSESTORAGE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
3350 #ifdef USE_SSE4
_NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b)
3352 {
3353 __m128i c80000000, sum, cmp, suba, subsum;
3354 c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
3355 sum = _mm_add_epi64 (a, b);
3356 subsum = _mm_sub_epi64 (sum, c80000000);
3357 suba = _mm_sub_epi64 (a, c80000000);
3358 cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!!
3359 return _mm_or_si128 (sum, cmp); //saturation
3360 }
3361 #else
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3363 {
3364 _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
3365 _mm_store_si128((__m128i*)atmp, a);
3366 _mm_store_si128((__m128i*)btmp, b);
3367 res[0] = atmp[0] + btmp[0];
3368 res[1] = atmp[1] + btmp[1];
3369 if (res[0] < atmp[0]) res[0] = ~(uint64_t)0;
3370 if (res[1] < atmp[1]) res[1] = ~(uint64_t)0;
3371 return _mm_load_si128((__m128i*)(res));
3372 }
3373 #endif
3374
3375
3376 //******************* Vector add high half (truncated) ******************
3377 //************************************************************************
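//Implementation note: vaddhn keeps only the upper half of each full-width sum (bits 15:8,
//31:16 or 63:32), i.e. a truncating narrow. The sums below are therefore shifted right by half
//the lane width (or, for the 64-bit case, rearranged with a dword shuffle) and then packed back
//to the narrower element type.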
3378 _NEON2SSESTORAGE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
_NEON2SSE_INLINE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b) // VADDHN.I16 d0,q0,q0
3380 {
3381 int8x8_t res64;
3382 __m128i sum;
3383 sum = _mm_add_epi16 (a, b);
3384 sum = _mm_srai_epi16 (sum, 8);
3385 sum = _mm_packs_epi16 (sum, sum); //use 64 low bits only
3386 return64(sum);
3387 }
3388
3389 _NEON2SSESTORAGE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
_NEON2SSE_INLINE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b) // VADDHN.I32 d0,q0,q0
3391 {
3392 int16x4_t res64;
3393 __m128i sum;
3394 sum = _mm_add_epi32 (a, b);
3395 sum = _mm_srai_epi32(sum, 16);
3396 sum = _mm_packs_epi32 (sum, sum); //use 64 low bits only
3397 return64(sum);
3398 }
3399
3400 _NEON2SSESTORAGE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
_NEON2SSE_INLINE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b)
3402 {
3403 int32x2_t res64;
3404 __m128i sum;
3405 sum = _mm_add_epi64 (a, b);
3406 sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (0 << 4) | (2 << 6));
3407 return64(sum);
3408 }
3409
3410 _NEON2SSESTORAGE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
_NEON2SSE_INLINE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b) // VADDHN.I16 d0,q0,q0
3412 {
3413 uint8x8_t res64;
3414 __m128i sum;
3415 sum = _mm_add_epi16 (a, b);
3416 sum = _mm_srli_epi16 (sum, 8);
3417 sum = _mm_packus_epi16 (sum,sum); //use 64 low bits only
3418 return64(sum);
3419 }
3420
3421 _NEON2SSESTORAGE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
_NEON2SSE_INLINE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b) // VADDHN.I32 d0,q0,q0
3423 {
3424 uint16x4_t res64;
3425 __m128i sum;
3426 sum = _mm_add_epi32 (a, b);
3427 sum = _mm_srli_epi32 (sum, 16);
3428 #ifdef USE_SSE4
3429 sum = _MM_PACKUS1_EPI32 (sum); //use 64 low bits only
3430 #else
3431 sum = _mm_shuffle_epi8 (sum, *(__m128i*) mask8_32_even_odd); //go to 16 bits
3432 #endif
3433 return64(sum);
3434 }
3435
3436 _NEON2SSESTORAGE uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
3437 #define vaddhn_u64 vaddhn_s64
3438
3439 //*********** Vector rounding add high half: vraddhn_<type> ******************.
3440 //***************************************************************************
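//Implementation note: rounding here means adding 1 to the kept high half whenever the bit just
//below it (bit 7, 15 or 31 of the sum) is set. That bit is isolated with a shift-left /
//shift-right pair and added before the narrowing pack, so e.g. vraddhn_s16 effectively returns
//(a + b + 0x80) >> 8 per lane without forming the biased sum explicitly.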
3441 _NEON2SSESTORAGE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
_NEON2SSE_INLINE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b) // VRADDHN.I16 d0,q0,q0
3443 {
3444 int8x8_t res64;
3445 __m128i sum, mask1;
3446 sum = _mm_add_epi16 (a, b);
3447 mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
3448 mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
3449 sum = _mm_srai_epi16 (sum, 8); //get high half
3450 sum = _mm_add_epi16 (sum, mask1); //actual rounding
3451 sum = _mm_packs_epi16 (sum, sum);
3452 return64(sum);
3453 }
3454
3455 _NEON2SSESTORAGE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
_NEON2SSE_INLINE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b) // VRADDHN.I32 d0,q0,q0
{
//SIMD may not be optimal, serial may be faster
3459 int16x4_t res64;
3460 __m128i sum, mask1;
3461 sum = _mm_add_epi32 (a, b);
3462 mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
3463 mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
3464 sum = _mm_srai_epi32 (sum, 16); //get high half
3465 sum = _mm_add_epi32 (sum, mask1); //actual rounding
3466 sum = _mm_packs_epi32 (sum, sum);
3467 return64(sum);
3468 }
3469
3470 _NEON2SSESTORAGE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
_NEON2SSE_INLINE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b)
{
//SIMD may not be optimal, serial may be faster
3474 int32x2_t res64;
3475 __m128i sum, mask1;
3476 sum = _mm_add_epi64 (a, b);
3477 mask1 = _mm_slli_epi64(sum, 33); //shift left then back right to
3478 mask1 = _mm_srli_epi64(mask1,32); //get 31-th bit 1 or zero
3479 sum = _mm_add_epi64 (sum, mask1); //actual high half rounding
3480 sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (1 << 4) | (3 << 6));
3481 return64(sum);
3482 }
3483
3484 _NEON2SSESTORAGE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
_NEON2SSE_INLINE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b) // VRADDHN.I16 d0,q0,q0
3486 {
3487 uint8x8_t res64;
3488 __m128i sum, mask1;
3489 sum = _mm_add_epi16 (a, b);
3490 mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
3491 mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
3492 sum = _mm_srai_epi16 (sum, 8); //get high half
3493 sum = _mm_add_epi16 (sum, mask1); //actual rounding
3494 sum = _mm_packus_epi16 (sum, sum);
3495 return64(sum);
3496 }
3497
3498 _NEON2SSESTORAGE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
_NEON2SSE_INLINE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b)
{
//SIMD may not be optimal, serial may be faster
3502 uint16x4_t res64;
3503 __m128i sum, mask1;
3504 sum = _mm_add_epi32 (a, b);
3505 mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
3506 mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
3507 sum = _mm_srai_epi32 (sum, 16); //get high half
3508 sum = _mm_add_epi32 (sum, mask1); //actual rounding
3509 sum = _MM_PACKUS1_EPI32 (sum);
3510 return64(sum);
3511 }
3512
3513 _NEON2SSESTORAGE uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
3514 #define vraddhn_u64 vraddhn_s64
3515
3516 //**********************************************************************************
3517 //********* Multiplication *************************************
3518 //**************************************************************************************
3519
3520 //Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
//As we don't widen the result, these functions are equivalent to "multiply low" in x86
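//Implementation note: x86 has no 8-bit SIMD multiply at all, so the 8-bit variants widen to
//16 bits, use _mm_mullo_epi16 and then repack the low byte of each 16-bit product; the low
//8 bits of a product are the same for signed and unsigned inputs, so only the repacking step
//differs between vmul_s8 and vmul_u8.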
3522 _NEON2SSESTORAGE int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vmul_s8(int8x8_t a, int8x8_t b) // VMUL.I8 d0,d0,d0
3524 {
3525 // no 8 bit simd multiply, need to go to 16 bits in SSE
3526 int8x8_t res64;
3527 __m128i a128, b128, res;
3528 a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
3529 b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
3530 res = _mm_mullo_epi16 (a128, b128);
3531 res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit from 16, use 64 low bits only
3532 return64(res);
3533 }
3534
3535 _NEON2SSESTORAGE int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
3536 #define vmul_s16 vmul_u16
3537
3538 _NEON2SSESTORAGE int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
3539 #define vmul_s32 vmul_u32
3540
3541 _NEON2SSESTORAGE float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
_NEON2SSE_INLINE float32x2_t vmul_f32(float32x2_t a, float32x2_t b)
3543 {
3544 float32x4_t tmp;
3545 __m64_128 res64;
3546 tmp = _mm_mul_ps(_pM128(a),_pM128(b));
3547 _M64f(res64, tmp); //use low 64 bits
3548 return res64;
3549 }
3550
3551 _NEON2SSESTORAGE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b) // VMUL.I8 d0,d0,d0
3553 {
3554 // no 8 bit simd multiply, need to go to 16 bits in SSE
3555 uint8x8_t res64;
3556 __m128i mask, a128, b128, res;
3557 mask = _mm_set1_epi16(0xff);
3558 a128 = _MM_CVTEPU8_EPI16 (_pM128i(a));
3559 b128 = _MM_CVTEPU8_EPI16 (_pM128i(b));
3560 res = _mm_mullo_epi16 (a128, b128);
3561 res = _mm_and_si128(res, mask); //to avoid saturation
3562 res = _mm_packus_epi16 (res,res); //use only low 64 bits
3563 return64(res);
3564 }
3565
3566 _NEON2SSESTORAGE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b)
3568 {
3569 uint16x4_t res64;
3570 return64(_mm_mullo_epi16(_pM128i(a),_pM128i(b)));
3571 }
3572
3573 _NEON2SSESTORAGE uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
3575 {
3576 uint32x2_t res;
3577 res.m64_u32[0] = a.m64_u32[0] * b.m64_u32[0];
3578 res.m64_u32[1] = a.m64_u32[1] * b.m64_u32[1];
3579 return res;
3580 }
3581
3582 _NEON2SSESTORAGE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
_NEON2SSE_INLINE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b)
3584 {
3585 //may be optimized
3586 poly8x8_t res64;
3587 __m128i a64, b64, c1, res, tmp, bmasked;
3588 int i;
3589 a64 = _pM128i(a);
3590 b64 = _pM128i(b);
3591 c1 = _mm_cmpeq_epi8 (a64,a64); //all ones 0xff....
3592 c1 = vshrq_n_u8(c1,7); //0x1
3593 bmasked = _mm_and_si128(b64, c1); //0x1
3594 res = vmulq_u8(a64, bmasked);
3595 for(i = 1; i<8; i++) {
3596 c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
3597 bmasked = _mm_and_si128(b64, c1); //0x1
3598 tmp = vmulq_u8(a64, bmasked);
3599 res = _mm_xor_si128(res, tmp);
3600 }
3601 return64 (res);
3602 }
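//How the polynomial multiply above works: poly8 multiplication is carry-less (GF(2)), so the
//product is the XOR of a shifted left by every set bit position of b. Masking b down to a
//single bit i and doing an ordinary vmulq_u8 yields exactly (a << i) in the lanes where that
//bit is set (and 0 elsewhere, modulo 256), and the loop XOR-accumulates the eight partial
//products.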
3603
3604 _NEON2SSESTORAGE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0
{
// no 8 bit simd multiply, need to go to 16 bits
//solution may not be optimal
3609 __m128i a16, b16, r16_1, r16_2;
3610 a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
3611 b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
3612 r16_1 = _mm_mullo_epi16 (a16, b16);
3613 //swap hi and low part of a and b to process the remaining data
3614 a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3615 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3616 a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
3618
3619 r16_2 = _mm_mullo_epi16 (a16, b16);
3620 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit
3621 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit
3622
3623 return _mm_unpacklo_epi64(r16_1, r16_2);
3624 }
3625
3626 _NEON2SSESTORAGE int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
3627 #define vmulq_s16 _mm_mullo_epi16
3628
3629 _NEON2SSESTORAGE int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
3630 #define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1
3631
3632 _NEON2SSESTORAGE float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
3633 #define vmulq_f32 _mm_mul_ps
3634
3635 _NEON2SSESTORAGE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
_NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0
{
// no 8 bit simd multiply, need to go to 16 bits
//solution may not be optimal
3640 __m128i maskff, a16, b16, r16_1, r16_2;
3641 maskff = _mm_set1_epi16(0xff);
3642 a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1
3643 b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
3644 r16_1 = _mm_mullo_epi16 (a16, b16);
3645 r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation
3646 //swap hi and low part of a and b to process the remaining data
3647 a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3648 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3649 a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
3650 b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
3651
3652 r16_2 = _mm_mullo_epi16 (a16, b16);
3653 r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation
3654 return _mm_packus_epi16 (r16_1, r16_2);
3655 }
3656
3657 _NEON2SSESTORAGE uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
3658 #define vmulq_u16 _mm_mullo_epi16
3659
3660 _NEON2SSESTORAGE uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
3661 #define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1
3662
3663 _NEON2SSESTORAGE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
_NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b)
3665 {
3666 //may be optimized
3667 __m128i c1, res, tmp, bmasked;
3668 int i;
3669 c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
3670 c1 = vshrq_n_u8(c1,7); //0x1
3671 bmasked = _mm_and_si128(b, c1); //0x1
3672 res = vmulq_u8(a, bmasked);
3673 for(i = 1; i<8; i++) {
3674 c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
3675 bmasked = _mm_and_si128(b, c1); //0x1
3676 tmp = vmulq_u8(a, bmasked);
3677 res = _mm_xor_si128(res, tmp);
3678 }
3679 return res;
3680 }
3681
3682 //************************* Vector long multiply ***********************************
3683 //****************************************************************************
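//Implementation note: vmull produces the full double-width products. For 16-bit lanes the
//SSSE3 path interleaves _mm_mullo_epi16 with _mm_mulhi_epi16/_mm_mulhi_epu16 to rebuild the
//32-bit products; for 32-bit lanes _MM_MUL_EPI32/_mm_mul_epu32 only multiply lanes 0 and 2,
//which is why the inputs are first interleaved with _mm_unpacklo_epi32.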
3684 _NEON2SSESTORAGE int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
_NEON2SSE_INLINE int16x8_t vmull_s8(int8x8_t a, int8x8_t b) // VMULL.S8 q0,d0,d0
3686 {
3687 //no 8 bit simd multiply, need to go to 16 bits
3688 __m128i a16, b16;
3689 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
3690 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
3691 return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
3692 }
3693
3694 _NEON2SSESTORAGE int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vmull_s16(int16x4_t a, int16x4_t b) // VMULL.S16 q0,d0,d0
3696 {
3697 #ifdef USE_SSE4
3698 __m128i a16, b16;
3699 a16 = _MM_CVTEPI16_EPI32 (_pM128i(a)); // SSE 4.1
3700 b16 = _MM_CVTEPI16_EPI32 (_pM128i(b)); // SSE 4.1
3701 return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
3702 #else
3703 __m128i low, hi, a128,b128;
3704 a128 = _pM128i(a);
3705 b128 = _pM128i(b);
3706 low = _mm_mullo_epi16(a128,b128);
3707 hi = _mm_mulhi_epi16(a128,b128);
3708 return _mm_unpacklo_epi16(low,hi);
3709 #endif
3710 }
3711
3712 _NEON2SSESTORAGE int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
_NEON2SSE_INLINE int64x2_t vmull_s32(int32x2_t a, int32x2_t b) // VMULL.S32 q0,d0,d0
3714 {
3715 __m128i ab, ba, a128, b128;
3716 a128 = _pM128i(a);
3717 b128 = _pM128i(b);
3718 ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
3719 ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
return _MM_MUL_EPI32(ab, ba); //uses the first and third data lanes, the multiplication gives a 64 bit result
3721 }
3722
3723 _NEON2SSESTORAGE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
_NEON2SSE_INLINE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) // VMULL.U8 q0,d0,d0
3725 {
3726 //no 8 bit simd multiply, need to go to 16 bits
3727 __m128i a16, b16;
3728 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
3729 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
3730 return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
3731 }
3732
3733 _NEON2SSESTORAGE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.s16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) // VMULL.s16 q0,d0,d0
3735 {
3736 #ifdef USE_SSE4
3737 __m128i a16, b16;
3738 a16 = _MM_CVTEPU16_EPI32 (_pM128i(a)); // SSE 4.1
3739 b16 = _MM_CVTEPU16_EPI32 (_pM128i(b)); // SSE 4.1
3740 return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
3741 #else
3742 __m128i a128,b128,low, hi;
3743 a128 = _pM128i(a);
3744 b128 = _pM128i(b);
3745 low = _mm_mullo_epi16(a128,b128);
3746 hi = _mm_mulhi_epu16(a128,b128);
3747 return _mm_unpacklo_epi16(low,hi);
3748 #endif
3749 }
3750
3751 _NEON2SSESTORAGE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
_NEON2SSE_INLINE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) // VMULL.U32 q0,d0,d0
{
//may not be optimal compared with serial implementation
3755 __m128i ab, ba, a128, b128;
3756 a128 = _pM128i(a);
3757 b128 = _pM128i(b);
3758 ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
3759 ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
return _mm_mul_epu32 (ab, ba); //uses the first and third data lanes, the multiplication gives a 64 bit result
3761 }
3762
3763 _NEON2SSESTORAGE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
_NEON2SSE_INLINE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b)
3765 {
3766 //may be optimized
3767 __m128i a128,b128, c1, a128_16, bmasked_16, res, tmp, bmasked;
3768 int i;
3769 a128 = _pM128i(a);
3770 b128 = _pM128i(b);
3771 c1 = _mm_cmpeq_epi8 (a128,a128); //all ones 0xff....
3772 c1 = vshrq_n_u8(c1,7); //0x1
3773 bmasked = _mm_and_si128(b128, c1); //0x1
3774
3775 a128_16 = _MM_CVTEPU8_EPI16 (a128); // SSE 4.1
3776 bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
3777 res = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit
3778 for(i = 1; i<8; i++) {
3779 c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
3780 bmasked = _mm_and_si128(b128, c1); //0x1
3781 bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
3782 tmp = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit, vmull_u8(a, bmasked);
3783 res = _mm_xor_si128(res, tmp);
3784 }
3785 return res;
3786 }
3787
3788 //****************Vector saturating doubling long multiply **************************
3789 //*****************************************************************
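//Implementation note: vqdmull computes 2*a[i]*b[i] with saturation. For 16-bit inputs the only
//case that can saturate is (-32768)*(-32768), whose doubled product does not fit in int32, so
//the widening multiply is followed by a saturating doubling step (vqd_s32, defined earlier in
//this file). The 32-bit variant has to go through the serial saturating 64-bit add, hence the
//performance warning.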
3790 _NEON2SSESTORAGE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b)
{
//the serial solution may be faster due to saturation
3794 __m128i res;
3795 res = vmull_s16(a, b);
3796 return vqd_s32(res);
3797 }
3798
3799 _NEON2SSESTORAGE int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
//the serial solution may be faster due to saturation
3803 __m128i res;
3804 res = vmull_s32(a,b);
3805 return vqaddq_s64(res,res); //slow serial function!!!!
3806 }
3807
3808 //********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] ************************
3809 //******************************************************************************************
3810 _NEON2SSESTORAGE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLA.I8 d0,d0,d0
3812 {
3813 // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits
3814 int8x8_t res64;
3815 __m128i b128, c128, res;
3816 b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
3817 c128 = _MM_CVTEPI8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
3818 res = _mm_mullo_epi16 (c128, b128);
3819 res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd);
3820 res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
3821 return64(res);
3822 }
3823
3824 _NEON2SSESTORAGE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c)
3826 {
3827 int16x4_t res64;
3828 return64(vmlaq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
3829 }
3830
3831
3832 _NEON2SSESTORAGE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLA.I32 d0,d0,d0
3834 {
3835 int32x2_t res64;
3836 __m128i res;
3837 res = _MM_MULLO_EPI32 (_pM128i(b), _pM128i(c)); //SSE4.1
3838 res = _mm_add_epi32 (res, _pM128i(a)); //use the low 64 bits
3839 return64(res);
3840 }
3841
3842 _NEON2SSESTORAGE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
_NEON2SSE_INLINE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c)
3844 {
3845 //fma is coming soon, but right now:
3846 __m128 res;
3847 __m64_128 res64;
3848 res = _mm_mul_ps (_pM128(c), _pM128(b));
3849 res = _mm_add_ps (_pM128(a), res);
3850 _M64f(res64, res);
3851 return res64;
3852 }
3853
3854 _NEON2SSESTORAGE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) // VMLA.I8 d0,d0,d0
3856 {
3857 // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits
3858 uint8x8_t res64;
3859 __m128i mask, b128, c128, res;
3860 mask = _mm_set1_epi16(0xff);
3861 b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
3862 c128 = _MM_CVTEPU8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
3863 res = _mm_mullo_epi16 (c128, b128);
3864 res = _mm_and_si128(res, mask); //to avoid saturation
3865 res = _mm_packus_epi16 (res, res);
3866 res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
3867 return64(res);
3868 }
3869
3870 _NEON2SSESTORAGE uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
3871 #define vmla_u16 vmla_s16
3872
3873 _NEON2SSESTORAGE uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
3874 #define vmla_u32 vmla_s32
3875
3876 _NEON2SSESTORAGE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0
{
//solution may not be optimal
3880 // no 8 bit simd multiply, need to go to 16 bits
3881 __m128i b16, c16, r16_1, a_2,r16_2;
3882 b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
3883 c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
3884 r16_1 = _mm_mullo_epi16 (b16, c16);
3885 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
3886 r16_1 = _mm_add_epi8 (r16_1, a);
3887 //swap hi and low part of a, b and c to process the remaining data
3888 a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3889 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3890 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
3891 b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
3892 c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
3893
3894 r16_2 = _mm_mullo_epi16 (b16, c16);
3895 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
3896 r16_2 = _mm_add_epi8(r16_2, a_2);
3897 return _mm_unpacklo_epi64(r16_1,r16_2);
3898 }
3899
3900 _NEON2SSESTORAGE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0
3902 {
3903 __m128i res;
3904 res = _mm_mullo_epi16 (c, b);
3905 return _mm_add_epi16 (res, a);
3906 }
3907
3908 _NEON2SSESTORAGE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0
3910 {
3911 __m128i res;
3912 res = _MM_MULLO_EPI32 (c, b); //SSE4.1
3913 return _mm_add_epi32 (res, a);
3914 }
3915
3916 _NEON2SSESTORAGE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
_NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0
3918 {
3919 //fma is coming soon, but right now:
3920 __m128 res;
3921 res = _mm_mul_ps (c, b);
3922 return _mm_add_ps (a, res);
3923 }
3924
3925 _NEON2SSESTORAGE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
_NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0
{
//solution may not be optimal
3929 // no 8 bit simd multiply, need to go to 16 bits
3930 __m128i b16, c16, r16_1, a_2, r16_2;
3931 b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
3932 c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
3933 r16_1 = _mm_mullo_epi16 (b16, c16);
3934 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
3935 r16_1 = _mm_add_epi8 (r16_1, a);
3936 //swap hi and low part of a, b and c to process the remaining data
3937 a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
3938 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
3939 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
3940 b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
3941 c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
3942
3943 r16_2 = _mm_mullo_epi16 (b16, c16);
3944 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
3945 r16_2 = _mm_add_epi8(r16_2, a_2);
3946 return _mm_unpacklo_epi64(r16_1,r16_2);
3947 }
3948
3949 _NEON2SSESTORAGE uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
3950 #define vmlaq_u16 vmlaq_s16
3951
3952 _NEON2SSESTORAGE uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
3953 #define vmlaq_u32 vmlaq_s32
3954
3955 //********************** Vector widening multiply accumulate (long multiply accumulate):
3956 // vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] **************
3957 //********************************************************************************************
3958 _NEON2SSESTORAGE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
_NEON2SSE_INLINE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLAL.S8 q0,d0,d0
3960 {
3961 int16x8_t res;
3962 res = vmull_s8(b, c);
3963 return _mm_add_epi16 (res, a);
3964 }
3965
3966 _NEON2SSESTORAGE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLAL.S16 q0,d0,d0
{
//may not be optimal compared with serial implementation
3970 int32x4_t res;
3971 res = vmull_s16(b, c);
3972 return _mm_add_epi32 (res, a);
3973 }
3974
3975 _NEON2SSESTORAGE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
_NEON2SSE_INLINE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLAL.S32 q0,d0,d0
{
//may not be optimal compared with serial implementation
3979 int64x2_t res;
3980 res = vmull_s32( b, c);
3981 return _mm_add_epi64 (res, a);
3982 }
3983
3984 _NEON2SSESTORAGE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
_NEON2SSE_INLINE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLAL.U8 q0,d0,d0
3986 {
3987 uint16x8_t res;
3988 res = vmull_u8(b, c);
3989 return _mm_add_epi16 (res, a);
3990 }
3991
3992 _NEON2SSESTORAGE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.s16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLAL.s16 q0,d0,d0
{
//may not be optimal compared with serial implementation
3996 uint32x4_t res;
3997 res = vmull_u16(b, c);
3998 return _mm_add_epi32 (res, a);
3999 }
4000
4001 _NEON2SSESTORAGE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
_NEON2SSE_INLINE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLAL.U32 q0,d0,d0
{
//may not be optimal compared with serial implementation
4005 int64x2_t res;
4006 res = vmull_u32( b,c);
4007 return _mm_add_epi64 (res, a);
4008 }
4009
4010 //******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] ***************************************
4011 //********************************************************************************************
4012 _NEON2SSESTORAGE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLS.I8 d0,d0,d0
4014 {
4015 // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits
4016 int8x8_t res64;
4017 __m128i res;
4018 res64 = vmul_s8(b,c);
4019 res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
4020 return64(res);
4021 }
4022
4023 _NEON2SSESTORAGE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c)
4025 {
4026 int16x4_t res64;
4027 return64(vmlsq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
4028 }
4029
4030
4031 _NEON2SSESTORAGE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLS.I32 d0,d0,d0
4033 {
4034 int32x2_t res64;
4035 __m128i res;
4036 res = _MM_MULLO_EPI32 (_pM128i(c),_pM128i( b)); //SSE4.1
4037 res = _mm_sub_epi32 (_pM128i(a),res); //use low 64 bits only
4038 return64(res);
4039 }
4040
4041 _NEON2SSESTORAGE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
_NEON2SSE_INLINE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c)
4043 {
4044 __m128 res;
4045 __m64_128 res64;
4046 res = _mm_mul_ps (_pM128(c), _pM128(b));
4047 res = _mm_sub_ps (_pM128(a), res);
4048 _M64f(res64, res);
4049 return res64;
4050 }
4051
4052 _NEON2SSESTORAGE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
4054 {
4055 // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits
4056 uint8x8_t res64;
4057 __m128i res;
4058 res64 = vmul_u8(b,c);
4059 res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
4060 return64(res);
4061 }
4062
4063 _NEON2SSESTORAGE uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
4064 #define vmls_u16 vmls_s16
4065
4066 _NEON2SSESTORAGE uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
4067 #define vmls_u32 vmls_s32
4068
4069
4070 _NEON2SSESTORAGE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0
{
//solution may not be optimal
4074 // no 8 bit simd multiply, need to go to 16 bits
4075 __m128i b16, c16, r16_1, a_2, r16_2;
4076 b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
4077 c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
4078 r16_1 = _mm_mullo_epi16 (b16, c16);
4079 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);
4080 r16_1 = _mm_sub_epi8 (a, r16_1);
4081 //swap hi and low part of a, b, c to process the remaining data
4082 a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
4083 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
4084 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
4085 b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
4086 c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
4087
4088 r16_2 = _mm_mullo_epi16 (b16, c16);
4089 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
4090 r16_2 = _mm_sub_epi8 (a_2, r16_2);
4091 return _mm_unpacklo_epi64(r16_1,r16_2);
4092 }
4093
4094 _NEON2SSESTORAGE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0
4096 {
4097 __m128i res;
4098 res = _mm_mullo_epi16 (c, b);
4099 return _mm_sub_epi16 (a, res);
4100 }
4101
4102 _NEON2SSESTORAGE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0
4104 {
4105 __m128i res;
4106 res = _MM_MULLO_EPI32 (c, b); //SSE4.1
4107 return _mm_sub_epi32 (a, res);
4108 }
4109
4110 _NEON2SSESTORAGE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
_NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0
4112 {
4113 __m128 res;
4114 res = _mm_mul_ps (c, b);
4115 return _mm_sub_ps (a, res);
4116 }
4117
4118 _NEON2SSESTORAGE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
_NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0
{
//solution may not be optimal
4122 // no 8 bit simd multiply, need to go to 16 bits
4123 __m128i b16, c16, r16_1, a_2, r16_2;
4124 b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
4125 c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
4126 r16_1 = _mm_mullo_epi16 (b16, c16);
4127 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
4128 r16_1 = _mm_sub_epi8 (a, r16_1);
4129 //swap hi and low part of a, b and c to process the remaining data
4130 a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
4131 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
4132 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
4133 b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
4134 c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
4135
4136 r16_2 = _mm_mullo_epi16 (b16, c16);
4137 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
4138 r16_2 = _mm_sub_epi8(a_2, r16_2);
4139 return _mm_unpacklo_epi64(r16_1,r16_2);
4140 }
4141
4142 _NEON2SSESTORAGE uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
4143 #define vmlsq_u16 vmlsq_s16
4144
4145 _NEON2SSESTORAGE uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
4146 #define vmlsq_u32 vmlsq_s32
4147
4148 //******************** Vector multiply subtract long (widening multiply subtract) ************************************
4149 //*************************************************************************************************************
4150 _NEON2SSESTORAGE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
_NEON2SSE_INLINE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLSL.S8 q0,d0,d0
4152 {
4153 int16x8_t res;
4154 res = vmull_s8(b, c);
4155 return _mm_sub_epi16 (a, res);
4156 }
4157
4158 _NEON2SSESTORAGE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLSL.S16 q0,d0,d0
{
//may not be optimal compared with serial implementation
4162 int32x4_t res;
4163 res = vmull_s16(b, c);
4164 return _mm_sub_epi32 (a, res);
4165 }
4166
4167 _NEON2SSESTORAGE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
_NEON2SSE_INLINE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLSL.S32 q0,d0,d0
{
//may not be optimal compared with serial implementation
4171 int64x2_t res;
4172 res = vmull_s32( b,c);
4173 return _mm_sub_epi64 (a, res);
4174 }
4175
4176 _NEON2SSESTORAGE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
_NEON2SSE_INLINE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLSL.U8 q0,d0,d0
4178 {
4179 uint16x8_t res;
4180 res = vmull_u8(b, c);
4181 return _mm_sub_epi16 (a, res);
4182 }
4183
4184 _NEON2SSESTORAGE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.s16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLSL.s16 q0,d0,d0
{
//may not be optimal compared with serial implementation
4188 uint32x4_t res;
4189 res = vmull_u16(b, c);
4190 return _mm_sub_epi32 (a, res);
4191 }
4192
4193 _NEON2SSESTORAGE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
_NEON2SSE_INLINE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLSL.U32 q0,d0,d0
{
//may not be optimal compared with serial implementation
4197 int64x2_t res;
4198 res = vmull_u32( b,c);
4199 return _mm_sub_epi64 (a, res);
4200 }
4201
4202 //****** Vector saturating doubling multiply high **********************
4203 //*************************************************************************
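//Implementation note: vqdmulh returns the high half of 2*a[i]*b[i], i.e. (a*b) >> 15 for
//16-bit lanes, saturating the single overflow case (-32768 * -32768). vqdmulhq_s16 builds this
//from _mm_mulhi_epi16 shifted left by one plus bit 15 of _mm_mullo_epi16, then turns a 0x8000
//result into 0x7fff by XOR-ing with a compare mask. The 32-bit forms have no mulhi instruction
//to lean on, which is why they go through 64-bit multiplies or a serial loop.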
4204 _NEON2SSESTORAGE int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
4206 {
4207 int16x4_t res;
4208 int32_t a32, b32, i;
4209 for (i = 0; i<4; i++) {
4210 a32 = (int32_t) a.m64_i16[i];
4211 b32 = (int32_t) b.m64_i16[i];
4212 a32 = (a32 * b32) >> 15;
4213 res.m64_i16[i] = (a32 == 0x8000) ? 0x7fff : (int16_t) a32;
4214 }
4215 return res;
4216 }
4217
4218 _NEON2SSESTORAGE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b) // no multiply-high 32-bit SIMD in IA32, so some tricks are needed; a serial solution may be faster
4220 {
//may not be optimal compared with a serial solution
4222 int32x2_t res64;
4223 __m128i mask;
4224 _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4225 int64x2_t mul;
4226 mul = vmull_s32(a,b);
4227 mul = _mm_slli_epi64(mul,1); //double the result
4228 //at this point start treating 2 64-bit numbers as 4 32-bit
4229 mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
4230 mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4231 mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
4232 return64(mul);
4233 }
4234
4235 _NEON2SSESTORAGE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0
4237 {
4238 __m128i res, res_lo, mask;
4239 _NEON2SSE_ALIGN_16 static const uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
4240 res = _mm_mulhi_epi16 (a, b);
4241 res = _mm_slli_epi16 (res, 1); //double the result, don't care about saturation
4242 res_lo = _mm_mullo_epi16 (a, b);
4243 res_lo = _mm_srli_epi16(res_lo,15); //take the highest bit
4244 res = _mm_add_epi16(res, res_lo); //combine results
4245 mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
4246 return _mm_xor_si128 (res, mask); //res saturated for 0x8000
4247 }
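
//Illustrative sketch (hypothetical helper, assuming the caller keeps data in Q15 fixed point):
//vqdmulhq_s16 returns the saturated high half of 2*a*b, i.e. a Q15 x Q15 -> Q15 product, so a gain of 0.5 is encoded as 0x4000.
_NEON2SSE_INLINE int16x8_t example_q15_scale(int16x8_t samples, int16x8_t gain_q15)
{
    return vqdmulhq_s16(samples, gain_q15); //per lane: sat((2*samples*gain) >> 16)
}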
4248
4249 _NEON2SSESTORAGE int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
4251 {
// no multiply-high 32-bit SIMD in IA32, may not be optimal compared with a serial solution for the SSSE3 target
4253 __m128i ab, ba, mask, mul, mul1;
4254 _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4255 ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
4256 ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
mul = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64-bit result
4258 mul = _mm_slli_epi64(mul,1); //double the result
4259 ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
4260 ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
mul1 = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64-bit result
4262 mul1 = _mm_slli_epi64(mul1,1); //double the result
4263 mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
4264 mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
4265 mul = _mm_unpacklo_epi64(mul, mul1);
4266 mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4267 return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
4268 }
4269
4270 //********* Vector saturating rounding doubling multiply high ****************
4271 //****************************************************************************
//If the _mm_mulhrs_xx functions are used, the result may differ slightly from the NEON one due to different rounding rules and order
4273 _NEON2SSESTORAGE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b)
4275 {
4276 int16x4_t res64;
4277 return64(vqrdmulhq_s16(_pM128i(a), _pM128i(b)));
4278 }
4279
4280 _NEON2SSESTORAGE int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
4282 {
//may not be optimal compared with a serial solution
4284 int32x2_t res64;
4285 _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4286 __m128i res_sat, mask, mask1;
4287 int64x2_t mul;
4288 mul = vmull_s32(a,b);
4289 res_sat = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
4290 mask1 = _mm_slli_epi64(res_sat, 32); //shift left then back right to
4291 mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero
4292 mul = _mm_add_epi32 (res_sat, mask1); //actual rounding
4293 //at this point start treating 2 64-bit numbers as 4 32-bit
4294 mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
4295 mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4296 mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
4297 return64(mul);
4298 }
4299
4300 _NEON2SSESTORAGE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0
4302 {
4303 __m128i mask, res;
4304 _NEON2SSE_ALIGN_16 static const uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
4305 res = _mm_mulhrs_epi16 (a, b);
4306 mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
4307 return _mm_xor_si128 (res, mask); //res saturated for 0x8000
4308 }
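
//Scalar reference for one lane (a documentation sketch only, not used by the header):
//NEON defines VQRDMULH as sat((2*a*b + 0x8000) >> 16); _mm_mulhrs_epi16 appears to produce the same rounded value
//for every lane except a = b = -32768, where it wraps to 0x8000 while NEON saturates to 0x7fff - the cmask fixup above handles exactly that lane.
_NEON2SSE_INLINE int16_t example_vqrdmulh_lane_ref(int16_t a, int16_t b)
{
    int64_t p = ((int64_t)a * b * 2 + 0x8000) >> 16;
    if (p > 32767) p = 32767; //covers the -32768 * -32768 corner case
    return (int16_t)p;
}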
4309
4310 _NEON2SSESTORAGE int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
4312 {
// no multiply-high 32-bit SIMD in IA32, may not be optimal compared with a serial solution for the SSSE3 target
4314 __m128i ab, ba, mask, mul, mul1, mask1;
4315 _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4316 ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
4317 ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
mul = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64-bit result
4319 mul = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
4320 mask1 = _mm_slli_epi64(mul, 32); //shift left then back right to
4321 mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero
4322 mul = _mm_add_epi32 (mul, mask1); //actual rounding
4323
4324 ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
4325 ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
mul1 = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64-bit result
4327 mul1 = _mm_slli_epi64 (mul1, 1); //double the result, saturation not considered
4328 mask1 = _mm_slli_epi64(mul1, 32); //shift left then back right to
4329 mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero
4330 mul1 = _mm_add_epi32 (mul1, mask1); //actual rounding
4331 //at this point start treating 2 64-bit numbers as 4 32-bit
4332 mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
4333 mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
4334 mul = _mm_unpacklo_epi64(mul, mul1);
4335 mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
4336 return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
4337 }
4338
4339 //*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) *****
4340 //*************************************************************************************************************************
4341 _NEON2SSESTORAGE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VQDMLAL.S16 q0,d0,d0
4343 {
//not an optimal SIMD solution, serial may be faster
4345 __m128i res32;
4346 res32 = vmull_s16(b, c);
4347 res32 = vqd_s32(res32); //doubling & saturation ,if no saturation we could use _mm_slli_epi32 (res, 1);
4348 return vqaddq_s32(res32, a); //saturation
4349 }
4350
4351 _NEON2SSESTORAGE int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL)
4353 {
4354 __m128i res64;
4355 res64 = vmull_s32(b,c);
4356 res64 = vqaddq_s64(res64, res64); //doubling & saturation ,if no saturation we could use _mm_slli_epi64 (res, 1);
4357 return vqaddq_s64(res64, a); //saturation
4358 }
4359
4360 //************************************************************************************
4361 //****************** Vector subtract ***********************************************
4362 //************************************************************************************
4363 _NEON2SSESTORAGE int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vsub_s8(int8x8_t a, int8x8_t b)
4365 {
4366 int8x8_t res64;
4367 return64(_mm_sub_epi8(_pM128i(a),_pM128i(b)));
4368 }
4369
4370
4371 _NEON2SSESTORAGE int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vsub_s16(int16x4_t a, int16x4_t b)
4373 {
4374 int16x4_t res64;
4375 return64(_mm_sub_epi16(_pM128i(a),_pM128i(b)));
4376 }
4377
4378
4379 _NEON2SSESTORAGE int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vsub_s32(int32x2_t a, int32x2_t b)
4381 {
4382 int32x2_t res64;
4383 return64(_mm_sub_epi32(_pM128i(a),_pM128i(b)));
4384 }
4385
4386
4387 _NEON2SSESTORAGE int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
_NEON2SSE_INLINE int64x1_t vsub_s64(int64x1_t a, int64x1_t b)
4389 {
4390 int64x1_t res64;
4391 res64.m64_i64[0] = a.m64_i64[0] - b.m64_i64[0];
4392 return res64;
4393 }
4394
4395
4396 _NEON2SSESTORAGE float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
_NEON2SSE_INLINE float32x2_t vsub_f32(float32x2_t a, float32x2_t b)
4398 {
4399 float32x2_t res;
4400 res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0];
4401 res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1];
4402 return res;
4403 }
4404
4405 _NEON2SSESTORAGE uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
4406 #define vsub_u8 vsub_s8
4407
4408 _NEON2SSESTORAGE uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
4409 #define vsub_u16 vsub_s16
4410
4411 _NEON2SSESTORAGE uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
4412 #define vsub_u32 vsub_s32
4413
4414
4415 _NEON2SSESTORAGE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
_NEON2SSE_INLINE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b)
4417 {
4418 int64x1_t res64;
4419 res64.m64_u64[0] = a.m64_u64[0] - b.m64_u64[0];
4420 return res64;
4421 }
4422
4423
4424 _NEON2SSESTORAGE int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
4425 #define vsubq_s8 _mm_sub_epi8
4426
4427 _NEON2SSESTORAGE int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
4428 #define vsubq_s16 _mm_sub_epi16
4429
4430 _NEON2SSESTORAGE int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
4431 #define vsubq_s32 _mm_sub_epi32
4432
4433 _NEON2SSESTORAGE int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
4434 #define vsubq_s64 _mm_sub_epi64
4435
4436 _NEON2SSESTORAGE float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
4437 #define vsubq_f32 _mm_sub_ps
4438
4439 _NEON2SSESTORAGE uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
4440 #define vsubq_u8 _mm_sub_epi8
4441
4442 _NEON2SSESTORAGE uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
4443 #define vsubq_u16 _mm_sub_epi16
4444
4445 _NEON2SSESTORAGE uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
4446 #define vsubq_u32 _mm_sub_epi32
4447
4448 _NEON2SSESTORAGE uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
4449 #define vsubq_u64 _mm_sub_epi64
4450
4451 //***************Vector long subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ******************
4452 //***********************************************************************************
4453 //Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
4454 _NEON2SSESTORAGE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
_NEON2SSE_INLINE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b) // VSUBL.S8 q0,d0,d0
4456 {
4457 __m128i a16, b16;
4458 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
4459 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
4460 return _mm_sub_epi16 (a16, b16);
4461 }
4462
4463 _NEON2SSESTORAGE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b) // VSUBL.S16 q0,d0,d0
4465 {
4466 __m128i a32, b32;
4467 a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
4468 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
4469 return _mm_sub_epi32 (a32, b32);
4470 }
4471
4472 _NEON2SSESTORAGE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
_NEON2SSE_INLINE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b) // VSUBL.S32 q0,d0,d0
4474 {
//may not be optimal
4476 __m128i a64, b64;
4477 a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
4478 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1,
4479 return _mm_sub_epi64 (a64, b64);
4480 }
4481
4482 _NEON2SSESTORAGE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
_NEON2SSE_INLINE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b) // VSUBL.U8 q0,d0,d0
4484 {
4485 __m128i a16, b16;
4486 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1,
4487 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
4488 return _mm_sub_epi16 (a16, b16);
4489 }
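
//Usage sketch (hypothetical helper): the widening form zero-extends both operands to 16-bit lanes before subtracting,
//so byte differences (e.g. pixel deltas) are exact; a negative delta simply appears as its 16-bit two's complement.
_NEON2SSE_INLINE uint16x8_t example_pixel_delta_u8(uint8x8_t cur, uint8x8_t ref)
{
    return vsubl_u8(cur, ref); //(uint16_t)cur[i] - (uint16_t)ref[i], 8 lanes
}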
4490
4491 _NEON2SSESTORAGE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.s16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b) // VSUBL.s16 q0,d0,d0
4493 {
4494 __m128i a32, b32;
4495 a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
4496 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
4497 return _mm_sub_epi32 (a32, b32);
4498 }
4499
4500 _NEON2SSESTORAGE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
_NEON2SSE_INLINE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b) // VSUBL.U32 q0,d0,d0
4502 {
//may not be optimal
4504 __m128i a64, b64;
4505 a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
4506 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1,
4507 return _mm_sub_epi64 (a64, b64);
4508 }
4509
4510 //***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] **********************************
4511 //*****************************************************************************************************
4512 _NEON2SSESTORAGE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
_NEON2SSE_INLINE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b) // VSUBW.S8 q0,q0,d0
4514 {
4515 __m128i b16;
4516 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
4517 return _mm_sub_epi16 (a, b16);
4518 }
4519
4520 _NEON2SSESTORAGE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
_NEON2SSE_INLINE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b) // VSUBW.S16 q0,q0,d0
4522 {
4523 __m128i b32;
4524 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
4525 return _mm_sub_epi32 (a, b32);
4526 }
4527
4528 _NEON2SSESTORAGE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
_NEON2SSE_INLINE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b) // VSUBW.S32 q0,q0,d0
4530 {
4531 __m128i b64;
4532 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
4533 return _mm_sub_epi64 (a, b64);
4534 }
4535
4536 _NEON2SSESTORAGE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
_NEON2SSE_INLINE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b) // VSUBW.U8 q0,q0,d0
4538 {
4539 __m128i b16;
4540 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
4541 return _mm_sub_epi16 (a, b16);
4542 }
4543
4544 _NEON2SSESTORAGE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.s16 q0,q0,d0
_NEON2SSE_INLINE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b) // VSUBW.s16 q0,q0,d0
4546 {
4547 __m128i b32;
4548 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
4549 return _mm_sub_epi32 (a, b32);
4550 }
4551
4552 _NEON2SSESTORAGE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
_NEON2SSE_INLINE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b) // VSUBW.U32 q0,q0,d0
4554 {
4555 __m128i b64;
4556 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
4557 return _mm_sub_epi64 (a, b64);
4558 }
4559
4560 //************************Vector saturating subtract *********************************
4561 //*************************************************************************************
4562 _NEON2SSESTORAGE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b)
4564 {
4565 int8x8_t res64;
4566 return64(_mm_subs_epi8(_pM128i(a),_pM128i(b)));
4567 }
4568
4569
4570 _NEON2SSESTORAGE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b)
4572 {
4573 int16x4_t res64;
4574 return64(_mm_subs_epi16(_pM128i(a),_pM128i(b)));
4575 }
4576
4577
4578 _NEON2SSESTORAGE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b)
4580 {
4581 int32x2_t res64;
4582 return64(vqsubq_s32(_pM128i(a), _pM128i(b)));
4583 }
4584
4585
4586 _NEON2SSESTORAGE int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqsub_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
4588 {
4589 uint64x1_t res;
4590 uint64_t a64,b64;
4591 a64 = a.m64_u64[0];
4592 b64 = b.m64_u64[0];
4593 res.m64_u64[0] = a64 - b64;
4594
4595 a64 = (a64 >> 63) + (~_SIGNBIT64);
4596 if ((int64_t)((a64 ^ b64) & (a64 ^ res.m64_u64[0])) < 0) {
4597 res.m64_u64[0] = a64;
4598 }
4599 return res;
4600 }
4601
4602 _NEON2SSESTORAGE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b)
4604 {
4605 uint8x8_t res64;
4606 return64(_mm_subs_epu8(_pM128i(a),_pM128i(b)));
4607 }
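
//Usage sketch (hypothetical helper): unsigned saturating subtraction clamps at zero instead of wrapping,
//which is the usual way to darken 8-bit pixels by a fixed amount.
_NEON2SSE_INLINE uint8x8_t example_darken_u8(uint8x8_t pixels, uint8x8_t amount)
{
    return vqsub_u8(pixels, amount); //max(pixels[i] - amount[i], 0) per lane
}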
4608
4609
4610 _NEON2SSESTORAGE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.s16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b)
4612 {
4613 uint16x4_t res64;
4614 return64(_mm_subs_epu16(_pM128i(a),_pM128i(b)));
4615 }
4616
4617
4618 _NEON2SSESTORAGE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b)
4620 {
4621 uint32x2_t res64;
4622 return64(vqsubq_u32(_pM128i(a), _pM128i(b)));
4623 }
4624
4625
4626 _NEON2SSESTORAGE uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
4628 {
4629 uint64x1_t res;
4630 uint64_t a64, b64;
4631 a64 = _Ui64(a);
4632 b64 = _Ui64(b);
4633 if (a64 > b64) {
4634 res.m64_u64[0] = a64 - b64;
4635 } else {
4636 res.m64_u64[0] = 0;
4637 }
4638 return res;
4639 }
4640
4641 _NEON2SSESTORAGE int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
4642 #define vqsubq_s8 _mm_subs_epi8
4643
4644 _NEON2SSESTORAGE int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
4645 #define vqsubq_s16 _mm_subs_epi16
4646
4647 _NEON2SSESTORAGE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b)
4649 {
//no corresponding x86 SIMD solution, special tricks are necessary. Overflow is possible only if a and b have opposite signs and the result has the opposite sign to a
4651 __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a;
4652 c7fffffff = _mm_set1_epi32(0x7fffffff);
4653 res = _mm_sub_epi32(a, b);
4654 res_sat = _mm_srli_epi32(a, 31);
4655 res_sat = _mm_add_epi32(res_sat, c7fffffff);
4656 res_xor_a = _mm_xor_si128(res, a);
4657 b_xor_a = _mm_xor_si128(b, a);
4658 res_xor_a = _mm_and_si128(b_xor_a, res_xor_a);
res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if <0, all zeros otherwise
4660 res_sat = _mm_and_si128(res_xor_a, res_sat);
4661 res = _mm_andnot_si128(res_xor_a, res);
4662 return _mm_or_si128(res, res_sat);
4663 }
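
//Scalar reference for one lane (sketch only): this is the rule the vector code above implements -
//a-b can overflow only when a and b have opposite signs and the wrapped result has the opposite sign to a,
//and the saturated value then depends only on the sign of a.
_NEON2SSE_INLINE int32_t example_qsub_s32_lane_ref(int32_t a, int32_t b)
{
    int64_t wide = (int64_t)a - (int64_t)b;
    if (wide > 0x7fffffffLL) wide = 0x7fffffffLL;
    if (wide < -0x80000000LL) wide = -0x80000000LL;
    return (int32_t)wide;
}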
4664
4665 _NEON2SSESTORAGE int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
4667 {
4668 _NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2];
4669 _NEON2SSE_ALIGN_16 uint64_t res[2];
4670 _mm_store_si128((__m128i*)atmp, a);
4671 _mm_store_si128((__m128i*)btmp, b);
4672 res[0] = atmp[0] - btmp[0];
4673 res[1] = atmp[1] - btmp[1];
4674 if (((res[0] ^ atmp[0]) & _SIGNBIT64) && ((atmp[0] ^ btmp[0]) & _SIGNBIT64)) {
4675 res[0] = (atmp[0] >> 63) ^ ~_SIGNBIT64;
4676 }
4677 if (((res[1] ^ atmp[1]) & _SIGNBIT64) && ((atmp[1] ^ btmp[1]) & _SIGNBIT64)) {
4678 res[1] = (atmp[1] >> 63) ^ ~_SIGNBIT64;
4679 }
4680 return _mm_load_si128((__m128i*)res);
4681 }
4682
4683 _NEON2SSESTORAGE uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
4684 #define vqsubq_u8 _mm_subs_epu8
4685
4686 _NEON2SSESTORAGE uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.s16 q0,q0,q0
4687 #define vqsubq_u16 _mm_subs_epu16
4688
4689 _NEON2SSESTORAGE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0
4691 {
4692 __m128i min, mask, sub;
4693 min = _MM_MIN_EPU32(a, b); //SSE4.1
4694 mask = _mm_cmpeq_epi32 (min, b);
4695 sub = _mm_sub_epi32 (a, b);
4696 return _mm_and_si128 ( sub, mask);
4697 }
4698
_NEON2SSESTORAGE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0
4700 #ifdef USE_SSE4
_NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b)
4702 {
4703 __m128i c80000000, subb, suba, cmp, sub;
4704 c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
4705 sub = _mm_sub_epi64 (a, b);
4706 suba = _mm_sub_epi64 (a, c80000000);
4707 subb = _mm_sub_epi64 (b, c80000000);
4708 cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned comparison, need to go to signed, SSE4.2!!!
4709 return _mm_and_si128 (sub, cmp); //saturation
4710 }
4711 #else
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
4713 {
4714 _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
4715 _mm_store_si128((__m128i*)atmp, a);
4716 _mm_store_si128((__m128i*)btmp, b);
4717 res[0] = (atmp[0] > btmp[0]) ? atmp[0] - btmp[0] : 0;
4718 res[1] = (atmp[1] > btmp[1]) ? atmp[1] - btmp[1] : 0;
4719 return _mm_load_si128((__m128i*)(res));
4720 }
4721 #endif
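
//Illustrative sketch (hypothetical helper): the SSE4 branch above obtains the missing unsigned 64-bit comparison
//by biasing both operands with 0x8000000000000000 so that a signed compare gives the unsigned ordering;
//xor-ing in the sign bit, as here, is equivalent to the subtraction used above.
#ifdef USE_SSE4
_NEON2SSE_INLINE __m128i example_cmpgt_epu64(__m128i a, __m128i b)
{
    __m128i bias = _mm_set_epi32(0x80000000, 0x0, 0x80000000, 0x0); //sign bit of each 64-bit lane
    return _mm_cmpgt_epi64(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias)); //SSE4.2
}
#endif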
4722
4723 //**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1 ******************************************************
4724 //****************************************************************
4725 _NEON2SSESTORAGE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b) // VHSUB.S8 d0,d0,d0
4727 {
4728 //no 8 bit shift available, internal overflow is possible, so let's go to 16 bit,
4729 int8x8_t res64;
4730 __m128i r16;
4731 int8x8_t r;
4732 r = vsub_s8 (a, b);
4733 r16 = _MM_CVTEPI8_EPI16 (_pM128i(r)); //SSE 4.1
4734 r16 = _mm_srai_epi16 (r16, 1); //SSE2
4735 r16 = _mm_packs_epi16 (r16,r16); //use low 64 bits
4736 return64(r16);
4737 }
4738
4739 _NEON2SSESTORAGE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b)
4741 {
4742 int16x4_t res64;
4743 return64(vhsubq_s16(_pM128i(a), _pM128i(b)));
4744 }
4745
4746
4747
4748 _NEON2SSESTORAGE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b)
4750 {
4751 int32x2_t res64;
4752 return64(vhsubq_s32(_pM128i(a), _pM128i(b)));
4753 }
4754
4755
4756 _NEON2SSESTORAGE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b)
4758 {
4759 uint8x8_t res64;
4760 return64(vhsubq_u8(_pM128i(a), _pM128i(b)));
4761 }
4762
4763 _NEON2SSESTORAGE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.s16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b)
4765 {
4766 uint16x4_t res64;
4767 return64(vhsubq_u16(_pM128i(a), _pM128i(b)));
4768 }
4769
4770 _NEON2SSESTORAGE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b)
4772 {
4773 uint32x2_t res64;
4774 return64(vhsubq_u32(_pM128i(a), _pM128i(b)));
4775 }
4776
4777 _NEON2SSESTORAGE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0
4779 {
//need to deal with the possibility of internal overflow
4781 __m128i c128, au,bu;
4782 c128 = _mm_set1_epi8((int8_t)128);
4783 au = _mm_add_epi8( a, c128);
4784 bu = _mm_add_epi8( b, c128);
4785 return vhsubq_u8(au,bu);
4786 }
4787
4788 _NEON2SSESTORAGE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0
4790 {
4791 //need to deal with the possibility of internal overflow
4792 __m128i c8000, au,bu;
4793 c8000 = _mm_set1_epi16((int16_t)0x8000);
4794 au = _mm_add_epi16( a, c8000);
4795 bu = _mm_add_epi16( b, c8000);
4796 return vhsubq_u16(au,bu);
4797 }
4798
4799 _NEON2SSESTORAGE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB.S32 q0,q0,q0
4801 {
4802 //need to deal with the possibility of internal overflow
4803 __m128i a2, b2,r, b_1;
4804 a2 = _mm_srai_epi32 (a,1);
4805 b2 = _mm_srai_epi32 (b,1);
4806 r = _mm_sub_epi32 (a2, b2);
4807 b_1 = _mm_andnot_si128(a, b); //!a and b
4808 b_1 = _mm_slli_epi32 (b_1,31);
4809 b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
4810 return _mm_sub_epi32(r,b_1);
4811 }
4812
4813 _NEON2SSESTORAGE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
_NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0
4815 {
4816 __m128i avg;
4817 avg = _mm_avg_epu8 (a, b);
4818 return _mm_sub_epi8(a, avg);
4819 }
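
//Worked note (documentation only): _mm_avg_epu8 computes (a+b+1)>>1, and a - ((a+b+1)>>1) equals (a-b)>>1
//rounded toward minus infinity, which matches the NEON halving subtract; e.g. a=5, b=2: avg=4, 5-4=1=(5-2)>>1.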
4820
4821 _NEON2SSESTORAGE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0
_NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0
4823 {
4824 __m128i avg;
4825 avg = _mm_avg_epu16 (a, b);
4826 return _mm_sub_epi16(a, avg);
4827 }
4828
4829 _NEON2SSESTORAGE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0
4831 {
4832 //need to deal with the possibility of internal overflow
4833 __m128i a2, b2,r, b_1;
4834 a2 = _mm_srli_epi32 (a,1);
4835 b2 = _mm_srli_epi32 (b,1);
4836 r = _mm_sub_epi32 (a2, b2);
4837 b_1 = _mm_andnot_si128(a, b); //!a and b
4838 b_1 = _mm_slli_epi32 (b_1,31);
4839 b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
4840 return _mm_sub_epi32(r,b_1);
4841 }
4842
4843 //******* Vector subtract high half (truncated) ** ************
4844 //************************************************************
4845 _NEON2SSESTORAGE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
_NEON2SSE_INLINE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b) // VSUBHN.I16 d0,q0,q0
4847 {
4848 int8x8_t res64;
4849 __m128i sum, sum8;
4850 sum = _mm_sub_epi16 (a, b);
4851 sum8 = _mm_srai_epi16 (sum, 8);
4852 sum8 = _mm_packs_epi16(sum8,sum8);
4853 return64(sum8);
4854 }
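
//Usage sketch (hypothetical helper): the "subtract high half" forms are a cheap way to narrow a 16-bit difference
//back to bytes when only the top 8 bits matter.
_NEON2SSE_INLINE int8x8_t example_narrow_diff_high(int16x8_t a, int16x8_t b)
{
    return vsubhn_s16(a, b); //(int8_t)((a[i] - b[i]) >> 8), truncated
}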
4855
4856 _NEON2SSESTORAGE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
_NEON2SSE_INLINE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b) // VSUBHN.I32 d0,q0,q0
4858 {
4859 int16x4_t res64;
4860 __m128i sum, sum16;
4861 sum = _mm_sub_epi32 (a, b);
4862 sum16 = _mm_srai_epi32 (sum, 16);
4863 sum16 = _mm_packs_epi32(sum16,sum16);
4864 return64(sum16);
4865 }
4866
4867 _NEON2SSESTORAGE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
_NEON2SSE_INLINE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b)
4869 {
4870 int32x2_t res64;
4871 __m128i sub;
4872 sub = _mm_sub_epi64 (a, b);
4873 sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6));
4874 return64(sub);
4875 }
4876
4877 _NEON2SSESTORAGE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
_NEON2SSE_INLINE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b) // VSUBHN.I16 d0,q0,q0
4879 {
4880 uint8x8_t res64;
4881 __m128i sum, sum8;
4882 sum = _mm_sub_epi16 (a, b);
4883 sum8 = _mm_srli_epi16 (sum, 8);
4884 sum8 = _mm_packus_epi16(sum8,sum8);
4885 return64(sum8);
4886 }
4887
4888 _NEON2SSESTORAGE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
_NEON2SSE_INLINE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b) // VSUBHN.I32 d0,q0,q0
4890 {
4891 uint16x4_t res64;
4892 __m128i sum, sum16;
4893 sum = _mm_sub_epi32 (a, b);
4894 sum16 = _mm_srli_epi32 (sum, 16);
4895 #ifdef USE_SSE4
4896 sum16 = _MM_PACKUS1_EPI32(sum16);
4897 #else
4898 sum16 = _mm_shuffle_epi8 (sum16, *(__m128i*) mask8_32_even_odd); //go to 16 bits
4899 #endif
4900 return64(sum16);
4901 }
4902
4903 _NEON2SSESTORAGE uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
4904 #define vsubhn_u64 vsubhn_s64
4905
4906 //************ Vector rounding subtract high half *********************
4907 //*********************************************************************
4908 _NEON2SSESTORAGE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
_NEON2SSE_INLINE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b) // VRSUBHN.I16 d0,q0,q0
4910 {
4911 int8x8_t res64;
4912 __m128i sub, mask1;
4913 sub = _mm_sub_epi16 (a, b);
4914 mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
4915 mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
4916 sub = _mm_srai_epi16 (sub, 8); //get high half
4917 sub = _mm_add_epi16 (sub, mask1); //actual rounding
4918 sub = _mm_packs_epi16 (sub, sub);
4919 return64(sub);
4920 }
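
//Scalar reference for one lane (sketch only, mirrors the vector arithmetic above before the final pack):
//the slli/srli pair simply extracts bit 7 of the difference, the rounding bit added before the high byte is taken.
_NEON2SSE_INLINE int16_t example_vrsubhn_rounding(int16_t diff)
{
    return (int16_t)((diff >> 8) + ((diff >> 7) & 1)); //high half plus the rounding bit
}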
4921
4922 _NEON2SSESTORAGE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
_NEON2SSE_INLINE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b) // VRSUBHN.I32 d0,q0,q0
4924 {
//SIMD may not be optimal, serial may be faster
4926 int16x4_t res64;
4927 __m128i sub, mask1;
4928 sub = _mm_sub_epi32 (a, b);
4929 mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
4930 mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
4931 sub = _mm_srai_epi32 (sub, 16); //get high half
4932 sub = _mm_add_epi32 (sub, mask1); //actual rounding
4933 sub = _mm_packs_epi32 (sub, sub);
4934 return64(sub);
4935 }
4936
4937 _NEON2SSESTORAGE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
_NEON2SSE_INLINE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b)
4939 {
//SIMD may not be optimal, serial may be faster
4941 int32x2_t res64;
4942 __m128i sub, mask1;
4943 sub = _mm_sub_epi64 (a, b);
4944 mask1 = _mm_slli_epi64(sub, 33); //shift left then back right to
4945 mask1 = _mm_srli_epi64(mask1,32); //get 31-th bit 1 or zero
4946 sub = _mm_add_epi64 (sub, mask1); //actual high half rounding
4947 sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6));
4948 return64(sub);
4949 }
4950
4951 _NEON2SSESTORAGE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
_NEON2SSE_INLINE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b) // VRSUBHN.I16 d0,q0,q0
4953 {
4954 uint8x8_t res64;
4955 __m128i sub, mask1;
4956 sub = _mm_sub_epi16 (a, b);
4957 mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
4958 mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero
4959 sub = _mm_srai_epi16 (sub, 8); //get high half
4960 sub = _mm_add_epi16 (sub, mask1); //actual rounding
4961 sub = _mm_packus_epi16 (sub, sub);
4962 return64(sub);
4963 }
4964
4965 _NEON2SSESTORAGE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
_NEON2SSE_INLINE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b) // VRSUBHN.I32 d0,q0,q0
4967 {
//SIMD may not be optimal, serial may be faster
4969 uint16x4_t res64;
4970 __m128i sub, mask1;
4971 sub = _mm_sub_epi32 (a, b);
4972 mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
4973 mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero
4974 sub = _mm_srai_epi32 (sub, 16); //get high half
4975 sub = _mm_add_epi32 (sub, mask1); //actual rounding
4976 #ifdef USE_SSE4
4977 sub = _MM_PACKUS1_EPI32 (sub);
4978 #else
4979 sub = _mm_shuffle_epi8 (sub, *(__m128i*) mask8_32_even_odd); //go to 16 bits
4980 #endif
4981 return64(sub);
4982 }
4983
4984 _NEON2SSESTORAGE uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
4985 #define vrsubhn_u64 vrsubhn_s64
4986
4987 //*********** Vector saturating doubling multiply subtract long ********************
4988 //************************************************************************************
4989 _NEON2SSESTORAGE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c)
4991 {
//not an optimal SIMD solution, serial may be faster
4993 __m128i res32, mask;
4994 int32x4_t res;
4995 _NEON2SSE_ALIGN_16 static const uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
4996 res = vmull_s16(b, c);
4997 res32 = _mm_slli_epi32 (res, 1); //double the result, saturation not considered
4998 mask = _mm_cmpeq_epi32 (res32, *(__m128i*)cmask);
4999 res32 = _mm_xor_si128 (res32, mask); //res32 saturated for 0x80000000
5000 return vqsubq_s32(a, res32); //saturation
5001 }
5002
5003 _NEON2SSESTORAGE int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
5005 {
5006 __m128i res64, mask;
5007 int64x2_t res;
5008 _NEON2SSE_ALIGN_16 static const uint64_t cmask[] = {0x8000000000000000, 0x8000000000000000};
5009 res = vmull_s32(b, c);
5010 res64 = _mm_slli_epi64 (res, 1); //double the result, saturation not considered
5011 mask = _MM_CMPEQ_EPI64 (res64, *(__m128i*)cmask);
res64 = _mm_xor_si128 (res64, mask); //res64 saturated for 0x8000000000000000
5013 return vqsubq_s64(a, res64); //saturation
5014 }
5015
5016 //****************** COMPARISON ***************************************
5017 //******************* Vector compare equal *************************************
5018 //****************************************************************************
5019 _NEON2SSESTORAGE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
_NEON2SSE_INLINE int8x8_t vceq_s8(int8x8_t a, int8x8_t b)
5021 {
5022 int8x8_t res64;
5023 return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
5024 }
5025
5026
5027 _NEON2SSESTORAGE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
_NEON2SSE_INLINE int16x4_t vceq_s16(int16x4_t a, int16x4_t b)
5029 {
5030 int16x4_t res64;
5031 return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
5032 }
5033
5034
5035 _NEON2SSESTORAGE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
_NEON2SSE_INLINE int32x2_t vceq_s32(int32x2_t a, int32x2_t b)
5037 {
5038 int32x2_t res64;
5039 return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
5040 }
5041
5042
5043 _NEON2SSESTORAGE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b)
5045 {
5046 uint32x2_t res64;
5047 __m128 res;
5048 res = _mm_cmpeq_ps(_pM128(a), _pM128(b) );
5049 return64f(res);
5050 }
5051
5052 _NEON2SSESTORAGE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
_NEON2SSE_INLINE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b)
5054 {
5055 uint8x8_t res64;
5056 return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
5057 }
5058
5059
5060 _NEON2SSESTORAGE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
_NEON2SSE_INLINE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b)
5062 {
5063 uint16x4_t res64;
5064 return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
5065 }
5066
5067
5068 _NEON2SSESTORAGE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b)
5070 {
5071 uint32x2_t res64;
5072 return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
5073 }
5074
5075
5076 _NEON2SSESTORAGE uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
5077 #define vceq_p8 vceq_u8
5078
5079
5080 _NEON2SSESTORAGE uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
5081 #define vceqq_s8 _mm_cmpeq_epi8
5082
5083 _NEON2SSESTORAGE uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
5084 #define vceqq_s16 _mm_cmpeq_epi16
5085
5086 _NEON2SSESTORAGE uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
5087 #define vceqq_s32 _mm_cmpeq_epi32
5088
5089 _NEON2SSESTORAGE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b)
5091 {
5092 __m128 res;
5093 res = _mm_cmpeq_ps(a,b);
5094 return _M128i(res);
5095 }
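
//Usage sketch (hypothetical helper): as on NEON, all these compares return an all-ones / all-zeros mask per lane,
//so the result can be fed straight into bitwise operations; here the mask keeps the lanes of x where x == y and zeroes the rest.
_NEON2SSE_INLINE float32x4_t example_keep_equal_lanes(float32x4_t x, float32x4_t y)
{
    uint32x4_t mask = vceqq_f32(x, y);
    return _mm_and_ps(x, _mm_castsi128_ps(mask));
}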
5096
5097 _NEON2SSESTORAGE uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
5098 #define vceqq_u8 _mm_cmpeq_epi8
5099
5100 _NEON2SSESTORAGE uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
5101 #define vceqq_u16 _mm_cmpeq_epi16
5102
5103 _NEON2SSESTORAGE uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
5104 #define vceqq_u32 _mm_cmpeq_epi32
5105
5106 _NEON2SSESTORAGE uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
5107 #define vceqq_p8 _mm_cmpeq_epi8
5108
5109 //******************Vector compare greater-than or equal*************************
5110 //*******************************************************************************
//IA SIMD has no greater-than-or-equal comparison for integers,
// only greater-than is available, so we need the following tricks
5113
5114 _NEON2SSESTORAGE uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
_NEON2SSE_INLINE int8x8_t vcge_s8(int8x8_t a, int8x8_t b)
5116 {
5117 int8x8_t res64;
5118 return64(vcgeq_s8(_pM128i(a), _pM128i(b)));
5119 }
5120
5121
5122 _NEON2SSESTORAGE uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
_NEON2SSE_INLINE int16x4_t vcge_s16(int16x4_t a, int16x4_t b)
5124 {
5125 int16x4_t res64;
5126 return64(vcgeq_s16(_pM128i(a), _pM128i(b)));
5127 }
5128
5129
5130 _NEON2SSESTORAGE uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
_NEON2SSE_INLINE int32x2_t vcge_s32(int32x2_t a, int32x2_t b)
5132 {
5133 int32x2_t res64;
5134 return64(vcgeq_s32(_pM128i(a), _pM128i(b)));
5135 }
5136
5137
5138 _NEON2SSESTORAGE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b)
5140 {
5141 uint32x2_t res64;
5142 __m128 res;
5143 res = _mm_cmpge_ps(_pM128(a),_pM128(b)); //use only 2 first entries
5144 return64f(res);
5145 }
5146
5147 _NEON2SSESTORAGE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
_NEON2SSE_INLINE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b)
5149 {
5150 uint8x8_t res64;
5151 return64(vcgeq_u8(_pM128i(a), _pM128i(b)));
5152 }
5153
5154
5155 _NEON2SSESTORAGE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0
_NEON2SSE_INLINE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b)
5157 {
5158 uint16x4_t res64;
5159 return64(vcgeq_u16(_pM128i(a), _pM128i(b)));
5160 }
5161
5162
5163 _NEON2SSESTORAGE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b)
5165 {
5166 //serial solution looks faster
5167 uint32x2_t res64;
5168 return64(vcgeq_u32 (_pM128i(a), _pM128i(b)));
5169 }
5170
5171
5172
5173 _NEON2SSESTORAGE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
_NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
5175 {
5176 __m128i m1, m2;
5177 m1 = _mm_cmpgt_epi8 ( a, b);
5178 m2 = _mm_cmpeq_epi8 ( a, b);
5179 return _mm_or_si128 ( m1, m2);
5180 }
5181
5182 _NEON2SSESTORAGE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
_NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
5184 {
5185 __m128i m1, m2;
5186 m1 = _mm_cmpgt_epi16 ( a, b);
5187 m2 = _mm_cmpeq_epi16 ( a, b);
5188 return _mm_or_si128 ( m1,m2);
5189 }
5190
5191 _NEON2SSESTORAGE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
5193 {
5194 __m128i m1, m2;
5195 m1 = _mm_cmpgt_epi32 (a, b);
5196 m2 = _mm_cmpeq_epi32 (a, b);
5197 return _mm_or_si128 (m1, m2);
5198 }
5199
5200 _NEON2SSESTORAGE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b)
5202 {
5203 __m128 res;
res = _mm_cmpge_ps(a,b);
5205 return *(__m128i*)&res;
5206 }
5207
5208 _NEON2SSESTORAGE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
_NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
5210 {
5211 //no unsigned chars comparison, only signed available,so need the trick
5212 __m128i cmp;
5213 cmp = _mm_max_epu8(a, b);
5214 return _mm_cmpeq_epi8(cmp, a); //a>=b
5215 }
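
//Usage sketch (hypothetical helper): max_epu8(a,b) == a holds exactly when a >= b, which is why the max/cmpeq pair
//above yields the unsigned >= mask that SSE lacks; a typical use is thresholding unsigned pixels.
_NEON2SSE_INLINE uint8x16_t example_threshold_mask_u8(uint8x16_t pixels, uint8_t threshold)
{
    return vcgeq_u8(pixels, _mm_set1_epi8((char)threshold)); //0xff where pixels[i] >= threshold
}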
5216
5217 _NEON2SSESTORAGE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
_NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
5219 {
5220 //no unsigned shorts comparison, only signed available,so need the trick
5221 #ifdef USE_SSE4
5222 __m128i cmp;
5223 cmp = _mm_max_epu16(a, b);
5224 return _mm_cmpeq_epi16(cmp, a); //a>=b
5225 #else
5226 __m128i as, mask;
5227 __m128i zero = _mm_setzero_si128();
5228 __m128i cffff = _mm_set1_epi16(0xffff);
5229 as = _mm_subs_epu16(b,a);
5230 mask = _mm_cmpgt_epi16(as, zero);
5231 return _mm_xor_si128 ( mask, cffff);
5232 #endif
5233 }
5234
5235 _NEON2SSESTORAGE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
5237 {
5238 //no unsigned ints comparison, only signed available,so need the trick
5239 #ifdef USE_SSE4
5240 __m128i cmp;
5241 cmp = _mm_max_epu32(a, b);
5242 return _mm_cmpeq_epi32(cmp, a); //a>=b
5243 #else
5244 //serial solution may be faster
5245 __m128i c80000000, as, bs, m1, m2;
5246 c80000000 = _mm_set1_epi32 (0x80000000);
5247 as = _mm_sub_epi32(a,c80000000);
5248 bs = _mm_sub_epi32(b,c80000000);
5249 m1 = _mm_cmpgt_epi32 (as, bs);
5250 m2 = _mm_cmpeq_epi32 (as, bs);
5251 return _mm_or_si128 ( m1, m2);
5252 #endif
5253 }
5254
5255 //**********************Vector compare less-than or equal******************************
5256 //***************************************************************************************
//IA SIMD has no less-than-or-equal comparison for integers, so we need the tricks
5258
5259 _NEON2SSESTORAGE uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
_NEON2SSE_INLINE int8x8_t vcle_s8(int8x8_t a, int8x8_t b)
5261 {
5262 int8x8_t res64;
5263 return64(vcleq_s8(_pM128i(a), _pM128i(b)));
5264 }
5265
5266
5267 _NEON2SSESTORAGE uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
_NEON2SSE_INLINE int16x4_t vcle_s16(int16x4_t a, int16x4_t b)
5269 {
5270 int16x4_t res64;
5271 return64(vcleq_s16(_pM128i(a), _pM128i(b)));
5272 }
5273
5274
5275 _NEON2SSESTORAGE uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
_NEON2SSE_INLINE int32x2_t vcle_s32(int32x2_t a, int32x2_t b)
5277 {
5278 int32x2_t res64;
5279 return64(vcleq_s32(_pM128i(a), _pM128i(b)));
5280 }
5281
5282
5283 _NEON2SSESTORAGE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0?
_NEON2SSE_INLINE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b)
5285 {
5286 uint32x2_t res64;
5287 __m128 res;
5288 res = _mm_cmple_ps(_pM128(a),_pM128(b));
5289 return64f(res);
5290 }
5291
5292 _NEON2SSESTORAGE uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
5293 #define vcle_u8(a,b) vcge_u8(b,a)
5294
5295
5296 _NEON2SSESTORAGE uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0
5297 #define vcle_u16(a,b) vcge_u16(b,a)
5298
5299
5300 _NEON2SSESTORAGE uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
5301 #define vcle_u32(a,b) vcge_u32(b,a)
5302
5303 _NEON2SSESTORAGE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
_NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
5305 {
5306 __m128i c1, res;
5307 c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
5308 res = _mm_cmpgt_epi8 ( a, b);
5309 return _mm_andnot_si128 (res, c1); //inverse the cmpgt result, get less-than-or-equal
5310 }
5311
5312 _NEON2SSESTORAGE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
_NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
5314 {
5315 __m128i c1, res;
5316 c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff....
5317 res = _mm_cmpgt_epi16 ( a, b);
5318 return _mm_andnot_si128 (res, c1);
5319 }
5320
5321 _NEON2SSESTORAGE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
5323 {
5324 __m128i c1, res;
5325 c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff....
5326 res = _mm_cmpgt_epi32 ( a, b);
5327 return _mm_andnot_si128 (res, c1);
5328 }
5329
5330 _NEON2SSESTORAGE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b)
5332 {
5333 __m128 res;
5334 res = _mm_cmple_ps(a,b);
5335 return *(__m128i*)&res;
5336 }
5337
5338 _NEON2SSESTORAGE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
5339 #ifdef USE_SSE4
_NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
5341 {
5342 //no unsigned chars comparison in SSE, only signed available,so need the trick
5343 __m128i cmp;
5344 cmp = _mm_min_epu8(a, b);
5345 return _mm_cmpeq_epi8(cmp, a); //a<=b
5346 }
5347 #else
5348 # define vcleq_u8(a,b) vcgeq_u8(b,a)
5349 #endif
5350
5351
5352 _NEON2SSESTORAGE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
5353 #ifdef USE_SSE4
_NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
5355 {
5356 //no unsigned shorts comparison in SSE, only signed available,so need the trick
5357 __m128i cmp;
5358 cmp = _mm_min_epu16(a, b);
5359 return _mm_cmpeq_epi16(cmp, a); //a<=b
5360 }
5361 #else
5362 # define vcleq_u16(a,b) vcgeq_u16(b,a)
5363 #endif
5364
5365
5366 _NEON2SSESTORAGE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
5367 #ifdef USE_SSE4
_NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
5369 {
//no unsigned ints comparison in SSE, only signed available, so need the trick
5371 __m128i cmp;
5372 cmp = _mm_min_epu32(a, b);
5373 return _mm_cmpeq_epi32(cmp, a); //a<=b
5374 }
5375 #else
//solution may not be optimal compared with the serial one
5377 # define vcleq_u32(a,b) vcgeq_u32(b,a)
5378 #endif
5379
5380
5381 //****** Vector compare greater-than ******************************************
5382 //**************************************************************************
5383 _NEON2SSESTORAGE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
_NEON2SSE_INLINE int8x8_t vcgt_s8(int8x8_t a, int8x8_t b)
5385 {
5386 int8x8_t res64;
5387 return64(_mm_cmpgt_epi8(_pM128i(a),_pM128i(b)));
5388 }
5389
5390
5391 _NEON2SSESTORAGE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
_NEON2SSE_INLINE int16x4_t vcgt_s16(int16x4_t a, int16x4_t b)
5393 {
5394 int16x4_t res64;
5395 return64(_mm_cmpgt_epi16(_pM128i(a),_pM128i(b)));
5396 }
5397
5398
5399 _NEON2SSESTORAGE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
_NEON2SSE_INLINE int32x2_t vcgt_s32(int32x2_t a, int32x2_t b)
5401 {
5402 int32x2_t res64;
5403 return64(_mm_cmpgt_epi32(_pM128i(a),_pM128i(b)));
5404 }
5405
5406
5407 _NEON2SSESTORAGE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b)
5409 {
5410 uint32x2_t res64;
5411 __m128 res;
5412 res = _mm_cmpgt_ps(_pM128(a),_pM128(b)); //use only 2 first entries
5413 return64f(res);
5414 }
5415
5416 _NEON2SSESTORAGE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
_NEON2SSE_INLINE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b)
5418 {
5419 uint8x8_t res64;
5420 return64(vcgtq_u8(_pM128i(a), _pM128i(b)));
5421 }
5422
5423
5424 _NEON2SSESTORAGE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
_NEON2SSE_INLINE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b)
5426 {
5427 uint16x4_t res64;
5428 return64(vcgtq_u16(_pM128i(a), _pM128i(b)));
5429 }
5430
5431
5432 _NEON2SSESTORAGE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b)
5434 {
5435 uint32x2_t res64;
5436 return64(vcgtq_u32(_pM128i(a), _pM128i(b)));
5437 }
5438
5439
5440 _NEON2SSESTORAGE uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
5441 #define vcgtq_s8 _mm_cmpgt_epi8
5442
5443 _NEON2SSESTORAGE uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
5444 #define vcgtq_s16 _mm_cmpgt_epi16
5445
5446 _NEON2SSESTORAGE uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
5447 #define vcgtq_s32 _mm_cmpgt_epi32
5448
5449 _NEON2SSESTORAGE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b)
5451 {
5452 __m128 res;
res = _mm_cmpgt_ps(a,b);
5454 return *(__m128i*)&res;
5455 }
5456
5457 _NEON2SSESTORAGE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
_NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0
5459 {
5460 //no unsigned chars comparison, only signed available,so need the trick
5461 __m128i as;
5462 __m128i zero = _mm_setzero_si128();
5463 as = _mm_subs_epu8(a, b);
5464 return _mm_cmpgt_epi8(as, zero);
5465 }
5466
5467 _NEON2SSESTORAGE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
_NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0
5469 {
5470 //no unsigned short comparison, only signed available,so need the trick
5471 __m128i as;
5472 __m128i zero = _mm_setzero_si128();
5473 as = _mm_subs_epu16(a, b);
5474 return _mm_cmpgt_epi16(as, zero);
5475 }
5476
5477 _NEON2SSESTORAGE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCGT.U32 q0, q0, q0
5479 {
5480 //no unsigned int comparison, only signed is available, so a trick is needed
5481 __m128i c80000000, as, bs;
5482 c80000000 = _mm_set1_epi32 (0x80000000);
5483 as = _mm_sub_epi32(a,c80000000);
5484 bs = _mm_sub_epi32(b,c80000000);
5485 return _mm_cmpgt_epi32 ( as, bs);
5486 }
5487
5488 //********************* Vector compare less-than **************************
5489 //*************************************************************************
5490 _NEON2SSESTORAGE uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
5491 #define vclt_s8(a,b) vcgt_s8(b,a) //swap the arguments!!
5492
5493
5494 _NEON2SSESTORAGE uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
5495 #define vclt_s16(a,b) vcgt_s16(b,a) //swap the arguments!!
5496
5497
5498 _NEON2SSESTORAGE uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
5499 #define vclt_s32(a,b) vcgt_s32(b,a) //swap the arguments!!
5500
5501
5502 _NEON2SSESTORAGE uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
5503 #define vclt_f32(a,b) vcgt_f32(b, a) //swap the arguments!!
5504
5505 _NEON2SSESTORAGE uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
5506 #define vclt_u8(a,b) vcgt_u8(b,a) //swap the arguments!!
5507
5508 _NEON2SSESTORAGE uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
5509 #define vclt_u16(a,b) vcgt_u16(b,a) //swap the arguments!!
5510
5511 _NEON2SSESTORAGE uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
5512 #define vclt_u32(a,b) vcgt_u32(b,a) //swap the arguments!!
5513
5514 _NEON2SSESTORAGE uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
5515 #define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!!
5516
5517 _NEON2SSESTORAGE uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
5518 #define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!!
5519
5520 _NEON2SSESTORAGE uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
5521 #define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!!
5522
5523 _NEON2SSESTORAGE uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
5524 #define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!!
5525
5526 _NEON2SSESTORAGE uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
5527 #define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!!
5528
5529 _NEON2SSESTORAGE uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
5530 #define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!!
5531
5532 _NEON2SSESTORAGE uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
5533 #define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!!
5534
5535 //*****************Vector compare absolute greater-than or equal ************
5536 //***************************************************************************
5537 _NEON2SSESTORAGE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
5538 _NEON2SSE_INLINE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b)
5539 {
5540 uint32x2_t res64;
5541 __m128i c7fffffff;
5542 __m128 a0, b0;
5543 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5544 a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5545 b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5546 a0 = _mm_cmpge_ps ( a0, b0);
5547 return64f(a0);
5548 }
5549
5550 _NEON2SSESTORAGE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
5551 _NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
5552 {
5553 __m128i c7fffffff;
5554 __m128 a0, b0;
5555 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5556 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5557 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5558 a0 = _mm_cmpge_ps ( a0, b0);
5559 return (*(__m128i*)&a0);
5560 }
5561
5562 //********Vector compare absolute less-than or equal ******************
5563 //********************************************************************
5564 _NEON2SSESTORAGE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
5565 _NEON2SSE_INLINE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b)
5566 {
5567 uint32x2_t res64;
5568 __m128i c7fffffff;
5569 __m128 a0, b0;
5570 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5571 a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5572 b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5573 a0 = _mm_cmple_ps (a0, b0);
5574 return64f(a0);
5575 }
5576
5577 _NEON2SSESTORAGE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
5578 _NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
5579 {
5580 __m128i c7fffffff;
5581 __m128 a0, b0;
5582 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5583 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5584 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5585 a0 = _mm_cmple_ps (a0, b0);
5586 return (*(__m128i*)&a0);
5587 }
5588
5589 //******** Vector compare absolute greater-than ******************
5590 //******************************************************************
5591 _NEON2SSESTORAGE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
5592 _NEON2SSE_INLINE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b)
5593 {
5594 uint32x2_t res64;
5595 __m128i c7fffffff;
5596 __m128 a0, b0;
5597 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5598 a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5599 b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5600 a0 = _mm_cmpgt_ps (a0, b0);
5601 return64f(a0);
5602 }
5603
5604 _NEON2SSESTORAGE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
5605 _NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
5606 {
5607 __m128i c7fffffff;
5608 __m128 a0, b0;
5609 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5610 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5611 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5612 a0 = _mm_cmpgt_ps (a0, b0);
5613 return (*(__m128i*)&a0);
5614 }
5615
5616 //***************Vector compare absolute less-than ***********************
5617 //*************************************************************************
5618 _NEON2SSESTORAGE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
5619 _NEON2SSE_INLINE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b)
5620 {
5621 uint32x2_t res64;
5622 __m128i c7fffffff;
5623 __m128 a0, b0;
5624 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5625 a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
5626 b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
5627 a0 = _mm_cmplt_ps (a0, b0);
5628 return64f(a0);
5629 }
5630
5631 _NEON2SSESTORAGE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
5632 _NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
5633 {
5634 __m128i c7fffffff;
5635 __m128 a0, b0;
5636 c7fffffff = _mm_set1_epi32 (0x7fffffff);
5637 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
5638 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
5639 a0 = _mm_cmplt_ps (a0, b0);
5640 return (*(__m128i*)&a0);
5641 }
5642
5643 //*************************Vector test bits************************************
5644 //*****************************************************************************
5645 /*VTST (Vector Test Bits) takes each element in a vector, and bitwise logical ANDs them
5646 with the corresponding element of a second vector. If the result is not zero, the
5647 corresponding element in the destination vector is set to all ones. Otherwise, it is set to
5648 all zeros. */
5649
5650 _NEON2SSESTORAGE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
5651 _NEON2SSE_INLINE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b)
5652 {
5653 int8x8_t res64;
5654 return64(vtstq_s8(_pM128i(a), _pM128i(b)));
5655 }
5656
5657
5658 _NEON2SSESTORAGE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
5659 _NEON2SSE_INLINE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b)
5660 {
5661 int16x4_t res64;
5662 return64(vtstq_s16(_pM128i(a), _pM128i(b)));
5663 }
5664
5665
5666 _NEON2SSESTORAGE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
5667 _NEON2SSE_INLINE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b)
5668 {
5669 int32x2_t res64;
5670 return64(vtstq_s32(_pM128i(a), _pM128i(b)));
5671 }
5672
5673
5674 _NEON2SSESTORAGE uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
5675 #define vtst_u8 vtst_s8
5676
5677 _NEON2SSESTORAGE uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
5678 #define vtst_u16 vtst_s16
5679
5680 _NEON2SSESTORAGE uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
5681 #define vtst_u32 vtst_s32
5682
5683
5684 _NEON2SSESTORAGE uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
5685 #define vtst_p8 vtst_u8
5686
5687 _NEON2SSESTORAGE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
5688 _NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0
5689 {
5690 __m128i zero, one, res;
5691 zero = _mm_setzero_si128 ();
5692 one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
5693 res = _mm_and_si128 (a, b);
5694 res = _mm_cmpeq_epi8 (res, zero);
5695 return _mm_xor_si128(res, one); //invert result
5696 }
5697
5698 _NEON2SSESTORAGE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
5699 _NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0
5700 {
5701 __m128i zero, one, res;
5702 zero = _mm_setzero_si128 ();
5703 one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
5704 res = _mm_and_si128 (a, b);
5705 res = _mm_cmpeq_epi16 (res, zero);
5706 return _mm_xor_si128(res, one); //invert result
5707 }
5708
5709 _NEON2SSESTORAGE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
5710 _NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0
5711 {
5712 __m128i zero, one, res;
5713 zero = _mm_setzero_si128 ();
5714 one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
5715 res = _mm_and_si128 (a, b);
5716 res = _mm_cmpeq_epi32 (res, zero);
5717 return _mm_xor_si128(res, one); //invert result
5718 }
5719
5720 _NEON2SSESTORAGE uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
5721 #define vtstq_u8 vtstq_s8
5722
5723 _NEON2SSESTORAGE uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
5724 #define vtstq_u16 vtstq_s16
5725
5726 _NEON2SSESTORAGE uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
5727 #define vtstq_u32 vtstq_s32
5728
5729 _NEON2SSESTORAGE uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
5730 #define vtstq_p8 vtstq_u8
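//Illustrative usage sketch for the VTST mapping above (not part of the original header; the
//NEON2SSE_USAGE_EXAMPLES guard and the example_ name are hypothetical). Each result lane becomes
//all-ones when (a & b) is non-zero in that lane, and all-zeros otherwise.
#ifdef NEON2SSE_USAGE_EXAMPLES //hypothetical guard, illustration only
_NEON2SSE_INLINE uint8x16_t example_vtstq_u8(void)
{
    uint8x16_t v    = _mm_set1_epi8(0x0f); //0x0f in every lane
    uint8x16_t mask = _mm_set1_epi8(0x04); //bit 2 is set in v, so every result lane becomes 0xff
    return vtstq_u8(v, mask); //with mask = 0x10 every result lane would be 0x00 instead
}
#endif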
5731
5732 //****************** Absolute difference ********************
5733 //*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |*****
5734 //************************************************************
5735 _NEON2SSESTORAGE int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
5736 _NEON2SSE_INLINE int8x8_t vabd_s8(int8x8_t a, int8x8_t b)
5737 {
5738 int8x8_t res64;
5739 return64(vabdq_s8(_pM128i(a), _pM128i(b)));
5740 }
5741
5742 _NEON2SSESTORAGE int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
5743 _NEON2SSE_INLINE int16x4_t vabd_s16(int16x4_t a, int16x4_t b)
5744 {
5745 int16x4_t res64;
5746 return64(vabdq_s16(_pM128i(a), _pM128i(b)));
5747 }
5748
5749 _NEON2SSESTORAGE int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
5750 _NEON2SSE_INLINE int32x2_t vabd_s32(int32x2_t a, int32x2_t b)
5751 {//need to deal with an intermediate overflow
5752 int32x2_t res;
5753 res.m64_i32[0] = (a.m64_i32[0] > b.m64_i32[0]) ? a.m64_i32[0] - b.m64_i32[0]: b.m64_i32[0] - a.m64_i32[0];
5754 res.m64_i32[1] = (a.m64_i32[1] > b.m64_i32[1]) ? a.m64_i32[1] - b.m64_i32[1]: b.m64_i32[1] - a.m64_i32[1];
5755 return res;
5756 }
5757
5758 _NEON2SSESTORAGE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
5759 _NEON2SSE_INLINE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b)
5760 {
5761 uint8x8_t res64;
5762 return64(vabdq_u8(_pM128i(a), _pM128i(b)));
5763 }
5764
5765 _NEON2SSESTORAGE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.s16 d0,d0,d0
5766 _NEON2SSE_INLINE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b)
5767 {
5768 uint16x4_t res64;
5769 return64(vabdq_u16(_pM128i(a), _pM128i(b)));
5770 }
5771
5772 _NEON2SSESTORAGE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
5773 _NEON2SSE_INLINE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b)
5774 {
5775 uint32x2_t res64;
5776 return64(vabdq_u32(_pM128i(a), _pM128i(b)));
5777 }
5778
5779 _NEON2SSESTORAGE float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
5780 _NEON2SSE_INLINE float32x2_t vabd_f32(float32x2_t a, float32x2_t b)
5781 {
5782 float32x4_t res;
5783 __m64_128 res64;
5784 res = vabdq_f32(_pM128(a), _pM128(b));
5785 _M64f(res64, res);
5786 return res64;
5787 }
5788
5789 _NEON2SSESTORAGE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
5790 _NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0
5791 { //need to deal with an intermediate overflow
5792 __m128i cmp, difab, difba;
5793 cmp = vcgtq_s8(a,b);
5794 difab = _mm_sub_epi8(a,b);
5795 difba = _mm_sub_epi8(b,a);
5796 difab = _mm_and_si128(cmp, difab);
5797 difba = _mm_andnot_si128(cmp, difba);
5798 return _mm_or_si128(difab, difba);
5799 }
5800
5801 _NEON2SSESTORAGE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
5802 _NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0
5803 {//need to deal with an intermediate overflow
5804 __m128i cmp, difab, difba;
5805 cmp = vcgtq_s16(a,b);
5806 difab = _mm_sub_epi16(a,b);
5807 difba = _mm_sub_epi16 (b,a);
5808 difab = _mm_and_si128(cmp, difab);
5809 difba = _mm_andnot_si128(cmp, difba);
5810 return _mm_or_si128(difab, difba);
5811 }
5812
5813 _NEON2SSESTORAGE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
5814 _NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0
5815 {//need to deal with an intermediate overflow
5816 __m128i cmp, difab, difba;
5817 cmp = vcgtq_s32(a,b);
5818 difab = _mm_sub_epi32(a,b);
5819 difba = _mm_sub_epi32(b,a);
5820 difab = _mm_and_si128(cmp, difab);
5821 difba = _mm_andnot_si128(cmp, difba);
5822 return _mm_or_si128(difab, difba);
5823 }
5824
5825 _NEON2SSESTORAGE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
5826 _NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned
5827 {
5828 __m128i difab, difba;
5829 difab = _mm_subs_epu8(a,b);
5830 difba = _mm_subs_epu8 (b,a);
5831 return _mm_or_si128(difab, difba);
5832 }
5833
5834 _NEON2SSESTORAGE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0
5835 _NEON2SSE_INLINE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b)
5836 {
5837 __m128i difab, difba;
5838 difab = _mm_subs_epu16(a,b);
5839 difba = _mm_subs_epu16 (b,a);
5840 return _mm_or_si128(difab, difba);
5841 }
5842
5843 _NEON2SSESTORAGE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
5844 _NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b)
5845 {
5846 __m128i cmp, difab, difba;
5847 cmp = vcgtq_u32(a,b);
5848 difab = _mm_sub_epi32(a,b);
5849 difba = _mm_sub_epi32 (b,a);
5850 difab = _mm_and_si128(cmp, difab);
5851 difba = _mm_andnot_si128(cmp, difba);
5852 return _mm_or_si128(difab, difba);
5853 }
5854
5855 _NEON2SSESTORAGE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
5856 _NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0
5857 {
5858 __m128i c1;
5859 __m128 res;
5860 c1 = _mm_set1_epi32(0x7fffffff);
5861 res = _mm_sub_ps (a, b);
5862 return _mm_and_ps (res, *(__m128*)&c1);
5863 }
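//Illustrative usage sketch for the absolute-difference mappings above (not part of the original
//header; the guard and the example_ name are hypothetical): every result lane holds |a - b|.
#ifdef NEON2SSE_USAGE_EXAMPLES //hypothetical guard, illustration only
_NEON2SSE_INLINE uint8x16_t example_vabdq_u8(void)
{
    uint8x16_t a = _mm_set1_epi8(10);
    uint8x16_t b = _mm_set1_epi8((char)250); //0xfa, i.e. 250 as an unsigned byte
    return vabdq_u8(a, b); //|10 - 250| = 240 (0xf0) in every lane
}
#endif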
5864
5865 //************ Absolute difference - long **************************
5866 //********************************************************************
5867 _NEON2SSESTORAGE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
5868 _NEON2SSE_INLINE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) // VABDL.S8 q0,d0,d0
5869 {
5870 __m128i a16, b16;
5871 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
5872 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
5873 return vabdq_s16(a16, b16);
5874
5875 }
5876
5877 _NEON2SSESTORAGE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
5878 _NEON2SSE_INLINE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) // VABDL.S16 q0,d0,d0
5879 {
5880 __m128i a32, b32;
5881 a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
5882 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
5883 return vabdq_s32(a32, b32);
5884 }
5885
5886 _NEON2SSESTORAGE int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
5887 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabdl_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
5888 {
5889 //no optimal SIMD solution, serial looks faster
5890 _NEON2SSE_ALIGN_16 int64_t res[2];
5891 if(a.m64_i32[0] > b.m64_i32[0]) res[0] = ( int64_t) a.m64_i32[0] - ( int64_t) b.m64_i32[0];
5892 else res[0] = ( int64_t) b.m64_i32[0] - ( int64_t) a.m64_i32[0];
5893 if(a.m64_i32[1] > b.m64_i32[1]) res[1] = ( int64_t) a.m64_i32[1] - ( int64_t) b.m64_i32[1];
5894 else res[1] = ( int64_t) b.m64_i32[1] - ( int64_t) a.m64_i32[1];
5895 return _mm_load_si128((__m128i*)res);
5896 }
5897
5898 _NEON2SSESTORAGE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
5899 _NEON2SSE_INLINE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b)
5900 {
5901 __m128i res;
5902 res = vsubl_u8(a,b);
5903 return _mm_abs_epi16(res);
5904 }
5905
5906 _NEON2SSESTORAGE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.s16 q0,d0,d0
5907 _NEON2SSE_INLINE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b)
5908 {
5909 __m128i res;
5910 res = vsubl_u16(a,b);
5911 return _mm_abs_epi32(res);
5912 }
5913
5914 _NEON2SSESTORAGE uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
5915 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
5916 {
5917 _NEON2SSE_ALIGN_16 uint64_t res[2];
5918 if(a.m64_u32[0] > b.m64_u32[0]) res[0] = ( uint64_t) a.m64_u32[0] - ( uint64_t) b.m64_u32[0];
5919 else res[0] = ( uint64_t) b.m64_u32[0] - ( uint64_t) a.m64_u32[0];
5920 if(a.m64_u32[1] > b.m64_u32[1]) res[1] = ( uint64_t) a.m64_u32[1] - ( uint64_t) b.m64_u32[1];
5921 else res[1] = ( uint64_t) b.m64_u32[1] - ( uint64_t) a.m64_u32[1];
5922 return _mm_load_si128((__m128i*)res);
5923 }
5924
5925 //**********Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | *************
5926 //*********************************************************************************************
5927 _NEON2SSESTORAGE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
5928 _NEON2SSE_INLINE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c)
5929 {
5930 int8x8_t res64;
5931 return64(vabaq_s8(_pM128i(a),_pM128i(b), _pM128i(c)));
5932 }
5933
5934 _NEON2SSESTORAGE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
5935 _NEON2SSE_INLINE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c)
5936 {
5937 int16x4_t res64;
5938 return64(vabaq_s16(_pM128i(a), _pM128i(b), _pM128i(c)));
5939 }
5940
5941 _NEON2SSESTORAGE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
5942 _NEON2SSE_INLINE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c)
5943 {
5944 int32x2_t res64;
5945 return64(vabaq_s32(_pM128i(a), _pM128i(b), _pM128i(c)));
5946 }
5947
5948 _NEON2SSESTORAGE uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
5949 _NEON2SSE_INLINE uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
5950 {
5951 int8x8_t res64;
5952 return64(vabaq_u8(_pM128i(a),_pM128i(b), _pM128i(c)));
5953 }
5954
5955
5956 _NEON2SSESTORAGE uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.s16 d0,d0,d0
5957 _NEON2SSE_INLINE uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c)
5958 {
5959 int16x4_t res64;
5960 return64(vabaq_u16(_pM128i(a), _pM128i(b), _pM128i(c)));
5961 }
5962
5963 _NEON2SSESTORAGE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
5964 _NEON2SSE_INLINE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c)
5965 {
5966 uint32x2_t res64;
5967 return64(vabaq_u32(_pM128i(a), _pM128i(b), _pM128i(c)));
5968 }
5969
5970 _NEON2SSESTORAGE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
5971 _NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0
5972 {
5973 int8x16_t sub;
5974 sub = vabdq_s8(b, c);
5975 return vaddq_s8( a, sub);
5976 }
5977
5978 _NEON2SSESTORAGE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
5979 _NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0
5980 {
5981 int16x8_t sub;
5982 sub = vabdq_s16(b, c);
5983 return vaddq_s16( a, sub);
5984 }
5985
5986 _NEON2SSESTORAGE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
5987 _NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0
5988 {
5989 int32x4_t sub;
5990 sub = vabdq_s32(b, c);
5991 return vaddq_s32( a, sub);
5992 }
5993
5994 _NEON2SSESTORAGE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
5995 _NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)
5996 {
5997 uint8x16_t sub;
5998 sub = vabdq_u8(b, c);
5999 return vaddq_u8( a, sub);
6000 }
6001
6002 _NEON2SSESTORAGE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.s16 q0,q0,q0
6003 _NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c)
6004 {
6005 uint16x8_t sub;
6006 sub = vabdq_u16(b, c);
6007 return vaddq_u16( a, sub);
6008 }
6009
6010 _NEON2SSESTORAGE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
6011 _NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c)
6012 {
6013 uint32x4_t sub;
6014 sub = vabdq_u32(b, c);
6015 return vaddq_u32( a, sub);
6016 }
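//Illustrative sketch of the accumulate-absolute-difference mappings above (not part of the
//original header; the guard and the example_ name are hypothetical): per lane acc += |b - c|,
//the usual building block of sum-of-absolute-differences style loops.
#ifdef NEON2SSE_USAGE_EXAMPLES //hypothetical guard, illustration only
_NEON2SSE_INLINE uint8x16_t example_vabaq_u8(uint8x16_t acc, uint8x16_t b, uint8x16_t c)
{
    return vabaq_u8(acc, b, c); //per lane: acc + |b - c| (wrapping add, as on NEON)
}
#endif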
6017
6018 //************** Absolute difference and accumulate - long ********************************
6019 //*************************************************************************************
6020 _NEON2SSESTORAGE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
6021 _NEON2SSE_INLINE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VABAL.S8 q0,d0,d0
6022 {
6023 __m128i b16, c16, res;
6024 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
6025 c16 = _MM_CVTEPI8_EPI16 (_pM128i(c)); //SSE4.1,
6026 res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
6027 return _mm_add_epi16 (a, res);
6028 }
6029
6030 _NEON2SSESTORAGE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
6031 _NEON2SSE_INLINE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VABAL.S16 q0,d0,d0
6032 {
6033 __m128i b32, c32, res;
6034 b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1
6035 c32 = _MM_CVTEPI16_EPI32(_pM128i(c)); //SSE4.1
6036 res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
6037 return _mm_add_epi32 (a, res);
6038 }
6039
6040 _NEON2SSESTORAGE int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
6041 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
6042 {
6043 __m128i res;
6044 res = vabdl_s32(b,c);
6045 return _mm_add_epi64(a, res);
6046 }
6047
6048 _NEON2SSESTORAGE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
6049 _NEON2SSE_INLINE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c)
6050 {
6051 __m128i b16, c16, res;
6052 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
6053 c16 = _MM_CVTEPU8_EPI16 (_pM128i(c)); //SSE4.1,
6054 res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
6055 return _mm_add_epi16 (a, res);
6056 }
6057
6058 _NEON2SSESTORAGE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.s16 q0,d0,d0
6059 _NEON2SSE_INLINE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c)
6060 {
6061 __m128i b32, c32, res;
6062 b32 = _MM_CVTEPU16_EPI32(_pM128i(b)); //SSE4.1
6063 c32 = _MM_CVTEPU16_EPI32(_pM128i(c)); //SSE4.1
6064 res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
6065 return _mm_add_epi32 (a, res);
6066 }
6067
6068 _NEON2SSESTORAGE uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
6069 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
6070 {
6071 __m128i res;
6072 res = vabdl_u32(b,c);
6073 return _mm_add_epi64(a, res);
6074 }
6075
6076 //***********************************************************************************
6077 //**************** Maximum and minimum operations **********************************
6078 //***********************************************************************************
6079 //************* Maximum: vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i] *******
6080 //***********************************************************************************
6081 _NEON2SSESTORAGE int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
6082 _NEON2SSE_INLINE int8x8_t vmax_s8(int8x8_t a, int8x8_t b)
6083 {
6084 int8x8_t res64;
6085 __m128i res;
6086 res = _MM_MAX_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6087 return64(res);
6088 }
6089
6090 _NEON2SSESTORAGE int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
6091 _NEON2SSE_INLINE int16x4_t vmax_s16(int16x4_t a, int16x4_t b)
6092 {
6093 int16x4_t res64;
6094 return64(_mm_max_epi16(_pM128i(a),_pM128i(b)));
6095 }
6096
6097 _NEON2SSESTORAGE int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
6098 _NEON2SSE_INLINE int32x2_t vmax_s32(int32x2_t a, int32x2_t b)
6099 {
6100 int32x2_t res64;
6101 __m128i res;
6102 res = _MM_MAX_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6103 return64(res);
6104 }
6105
6106 _NEON2SSESTORAGE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
6107 _NEON2SSE_INLINE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b)
6108 {
6109 uint8x8_t res64;
6110 return64(_mm_max_epu8(_pM128i(a),_pM128i(b)));
6111 }
6112
6113
6114 _NEON2SSESTORAGE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.s16 d0,d0,d0
6115 _NEON2SSE_INLINE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b)
6116 {
6117 uint16x4_t res64;
6118 return64(_MM_MAX_EPU16(_pM128i(a),_pM128i(b)));
6119 }
6120
6121
6122 _NEON2SSESTORAGE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
6123 _NEON2SSE_INLINE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b)
6124 {
6125 uint32x2_t res64;
6126 __m128i res;
6127 res = _MM_MAX_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial
6128 return64(res);
6129 }
6130
6131 _NEON2SSESTORAGE float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
6132 _NEON2SSE_INLINE float32x2_t vmax_f32(float32x2_t a, float32x2_t b)
6133 {
6134 //serial solution looks faster than SIMD one
6135 float32x2_t res;
6136 res.m64_f32[0] = (a.m64_f32[0] > b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
6137 res.m64_f32[1] = (a.m64_f32[1] > b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
6138 return res;
6139 }
6140
6141 _NEON2SSESTORAGE int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
6142 #define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1
6143
6144 _NEON2SSESTORAGE int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
6145 #define vmaxq_s16 _mm_max_epi16
6146
6147 _NEON2SSESTORAGE int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
6148 #define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1
6149
6150 _NEON2SSESTORAGE uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
6151 #define vmaxq_u8 _mm_max_epu8
6152
6153 _NEON2SSESTORAGE uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.s16 q0,q0,q0
6154 #define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1
6155
6156 _NEON2SSESTORAGE uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
6157 #define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1
6158
6159
6160 _NEON2SSESTORAGE float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
6161 #define vmaxq_f32 _mm_max_ps
6162
6163
6164 _NEON2SSESTORAGE float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0
6165 #define vmaxq_f64 _mm_max_pd
6166
6167
6168 //*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i] ********************************
6169 //***********************************************************************************************************
6170 _NEON2SSESTORAGE int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
6171 _NEON2SSE_INLINE int8x8_t vmin_s8(int8x8_t a, int8x8_t b)
6172 {
6173 int8x8_t res64;
6174 __m128i res;
6175 res = _MM_MIN_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6176 return64(res);
6177 }
6178
6179 _NEON2SSESTORAGE int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
6180 _NEON2SSE_INLINE int16x4_t vmin_s16(int16x4_t a, int16x4_t b)
6181 {
6182 int16x4_t res64;
6183 return64(_mm_min_epi16(_pM128i(a),_pM128i(b)));
6184 }
6185
6186
6187 _NEON2SSESTORAGE int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
6188 _NEON2SSE_INLINE int32x2_t vmin_s32(int32x2_t a, int32x2_t b)
6189 {
6190 int32x2_t res64;
6191 __m128i res;
6192 res = _MM_MIN_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
6193 return64(res);
6194 }
6195
6196 _NEON2SSESTORAGE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
6197 _NEON2SSE_INLINE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b)
6198 {
6199 uint8x8_t res64;
6200 return64(_mm_min_epu8(_pM128i(a),_pM128i(b)));
6201 }
6202
6203
6204 _NEON2SSESTORAGE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.s16 d0,d0,d0
6205 _NEON2SSE_INLINE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b)
6206 {
6207 uint16x4_t res64;
6208 return64(_MM_MIN_EPU16(_pM128i(a),_pM128i(b)));
6209 }
6210
6211
6212 _NEON2SSESTORAGE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
6213 _NEON2SSE_INLINE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b)
6214 {
6215 uint32x2_t res64;
6216 __m128i res;
6217 res = _MM_MIN_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial
6218 return64(res);
6219 }
6220
6221 _NEON2SSESTORAGE float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
6222 _NEON2SSE_INLINE float32x2_t vmin_f32(float32x2_t a, float32x2_t b)
6223 {
6224 //serial solution looks faster than SIMD one
6225 float32x2_t res;
6226 res.m64_f32[0] = (a.m64_f32[0] < b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
6227 res.m64_f32[1] = (a.m64_f32[1] < b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
6228 return res;
6229 }
6230
6231 _NEON2SSESTORAGE int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
6232 #define vminq_s8 _MM_MIN_EPI8 //SSE4.1
6233
6234 _NEON2SSESTORAGE int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
6235 #define vminq_s16 _mm_min_epi16
6236
6237 _NEON2SSESTORAGE int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
6238 #define vminq_s32 _MM_MIN_EPI32 //SSE4.1
6239
6240 _NEON2SSESTORAGE uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
6241 #define vminq_u8 _mm_min_epu8
6242
6243 _NEON2SSESTORAGE uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.s16 q0,q0,q0
6244 #define vminq_u16 _MM_MIN_EPU16 //SSE4.1
6245
6246 _NEON2SSESTORAGE uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
6247 #define vminq_u32 _MM_MIN_EPU32 //SSE4.1
6248
6249 _NEON2SSESTORAGE float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
6250 #define vminq_f32 _mm_min_ps
6251
6252
6253 _NEON2SSESTORAGE float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0
6254 #define vminq_f64 _mm_min_pd
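//Illustrative sketch using the max/min mappings above (not part of the original header; the
//guard and the example_ name are hypothetical): a per-lane clamp of x into [lo, hi].
#ifdef NEON2SSE_USAGE_EXAMPLES //hypothetical guard, illustration only
_NEON2SSE_INLINE float32x4_t example_clamp_f32(float32x4_t x, float32x4_t lo, float32x4_t hi)
{
    return vminq_f32(vmaxq_f32(x, lo), hi); //maps to _mm_min_ps(_mm_max_ps(x, lo), hi)
}
#endif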
6255
6256
6257 //************* Pairwise addition operations. **************************************
6258 //************************************************************************************
6259 //Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector
6260 _NEON2SSESTORAGE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
6261 _NEON2SSE_INLINE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b) // VPADD.I8 d0,d0,d0
6262 {
6263 //no 8 bit hadd in IA32, need to go to 16 bit and then pack
6264 int8x8_t res64;
6265 __m128i a16, b16, res;
6266 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
6267 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
6268 res = _mm_hadd_epi16 (a16, b16);
6269 res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit, use low 64 bits
6270 return64(res);
6271 }
6272
6273 _NEON2SSESTORAGE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
6274 _NEON2SSE_INLINE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b)
6275 {
6276 int16x4_t res64;
6277 __m128i hadd128;
6278 hadd128 = _mm_hadd_epi16 (_pM128i(a), _pM128i(b));
6279 hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6280 return64(hadd128);
6281 }
6282
6283
6284 _NEON2SSESTORAGE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
6285 _NEON2SSE_INLINE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b)
6286 {
6287 int32x2_t res64;
6288 __m128i hadd128;
6289 hadd128 = _mm_hadd_epi32 (_pM128i(a), _pM128i(b));
6290 hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6291 return64(hadd128);
6292 }
6293
6294
6295 _NEON2SSESTORAGE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
6296 _NEON2SSE_INLINE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b) // VPADD.I8 d0,d0,d0
6297 {
6298 // no 8 bit hadd in IA32, need to go to 16 bit and then pack
6299 uint8x8_t res64;
6300 // no unsigned _mm_hadd_ functions in IA32, but 8-bit unsigned values fit into 16-bit signed, so it works
6301 __m128i mask8, a16, b16, res;
6302 mask8 = _mm_set1_epi16(0xff);
6303 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
6304 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
6305 res = _mm_hadd_epi16 (a16, b16);
6306 res = _mm_and_si128(res, mask8); //to avoid saturation
6307 res = _mm_packus_epi16 (res,res); //use low 64 bits
6308 return64(res);
6309 }
6310
6311 _NEON2SSESTORAGE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
6312 _NEON2SSE_INLINE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b) // VPADD.I16 d0,d0,d0
6313 {
6314 // this solution may not be optimal; serial execution may be faster
6315 // no unsigned _mm_hadd_ functions in IA32, need to move from unsigned to signed
6316 uint16x4_t res64;
6317 __m128i c32767, cfffe, as, bs, res;
6318 c32767 = _mm_set1_epi16 (32767);
6319 cfffe = _mm_set1_epi16 ((int16_t)0xfffe);
6320 as = _mm_sub_epi16 (_pM128i(a), c32767);
6321 bs = _mm_sub_epi16 (_pM128i(b), c32767);
6322 res = _mm_hadd_epi16 (as, bs);
6323 res = _mm_add_epi16 (res, cfffe);
6324 res = _mm_shuffle_epi32 (res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6325 return64(res);
6326 }
6327
6328 _NEON2SSESTORAGE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
6329 _NEON2SSE_INLINE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b) //serial may be faster
6330 {
6331 //hadd doesn't work for unsigned values
6332 uint32x2_t res64;
6333 __m128i ab, ab_sh, res;
6334 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //a0 a1 b0 b1
6335 ab_sh = _mm_shuffle_epi32(ab, 1 | (0 << 2) | (3 << 4) | (2 << 6)); //a1, a0, b1, b0
6336 res = _mm_add_epi32(ab, ab_sh);
6337 res = _mm_shuffle_epi32(res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
6338 return64(res);
6339 }
6340
6341 _NEON2SSESTORAGE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
6342 _NEON2SSE_INLINE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b)
6343 {
6344 __m128 hadd128;
6345 __m64_128 res64;
6346 hadd128 = _mm_hadd_ps (_pM128(a), _pM128(b));
6347 hadd128 = _mm_shuffle_ps (hadd128, hadd128, _MM_SHUFFLE(3,1, 2, 0)); //use low 64 bits
6348 _M64f(res64, hadd128);
6349 return res64;
6350 }
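//Illustrative sketch of pairwise addition (not part of the original header; the guard and the
//example_ name are hypothetical): adjacent pairs of a and then of b are summed, so the result
//lanes are {a0+a1, b0+b1}.
#ifdef NEON2SSE_USAGE_EXAMPLES //hypothetical guard, illustration only
_NEON2SSE_INLINE int32x2_t example_vpadd_s32(void)
{
    int32x2_t a, b;
    a.m64_i32[0] = 1;  a.m64_i32[1] = 2;  //pair sum -> 3
    b.m64_i32[0] = 10; b.m64_i32[1] = 20; //pair sum -> 30
    return vpadd_s32(a, b);               //result lanes: {3, 30}
}
#endif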
6351
6352
6353 //************************** Long pairwise add **********************************
6354 //*********************************************************************************
6355 //Adds adjacent pairs of elements of a vector, sign or zero extends the results to twice their original width,
6356 // and places the final results in the destination vector.
6357
6358 _NEON2SSESTORAGE int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
6359 _NEON2SSE_INLINE int16x4_t vpaddl_s8(int8x8_t a) // VPADDL.S8 d0,d0
6360 {
6361 //no 8 bit hadd in IA32, need to go to 16 bit anyway
6362 __m128i a16;
6363 int16x4_t res64;
6364 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
6365 a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
6366 return64(a16);
6367 }
6368
6369 _NEON2SSESTORAGE int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
6370 _NEON2SSE_INLINE int32x2_t vpaddl_s16(int16x4_t a) // VPADDL.S16 d0,d0
6371 {
6372 // this solution may not be optimal; serial execution may be faster
6373 int32x2_t res64;
6374 __m128i r32_1;
6375 r32_1 = _MM_CVTEPI16_EPI32 (_pM128i(a));
6376 r32_1 = _mm_hadd_epi32(r32_1, r32_1); //use low 64 bits
6377 return64(r32_1);
6378 }
6379
6380 _NEON2SSESTORAGE int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
6381 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vpaddl_s32(int32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
6382 {
6383 int64x1_t res;
6384 res.m64_i64[0] = (int64_t)a.m64_i32[0] + (int64_t)a.m64_i32[1];
6385 return res;
6386 }
6387
6388 _NEON2SSESTORAGE uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
6389 _NEON2SSE_INLINE uint16x4_t vpaddl_u8(uint8x8_t a) // VPADDL.U8 d0,d0
6390 {
6391 // no 8 bit hadd in IA32, need to go to 16 bit
6392 // no unsigned _mm_hadd_ functions in IA32, but 8-bit unsigned values fit into 16-bit signed, so it works
6393 uint16x4_t res64;
6394 __m128i a16;
6395 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
6396 a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
6397 return64(a16);
6398 }
6399
6400 _NEON2SSESTORAGE uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.s16 d0,d0
6401 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpaddl_u16(uint16x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6402 {
6403 //serial solution looks faster than a SIMD one
6404 uint32x2_t res;
6405 res.m64_u32[0] = (uint32_t)a.m64_u16[0] + (uint32_t)a.m64_u16[1];
6406 res.m64_u32[1] = (uint32_t)a.m64_u16[2] + (uint32_t)a.m64_u16[3];
6407 return res;
6408 }
6409
6410 _NEON2SSESTORAGE uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
6411 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vpaddl_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
6412 {
6413 uint64x1_t res;
6414 res.m64_u64[0] = (uint64_t)a.m64_u32[0] + (uint64_t)a.m64_u32[1];
6415 return res;
6416 }
6417
6418 _NEON2SSESTORAGE int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
6419 _NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 q0,q0
6420 {
6421 //no 8 bit hadd in IA32, need to go to 16 bit
6422 __m128i r16_1, r16_2;
6423 r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
6424 //swap hi and low part of r to process the remaining data
6425 r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
6426 r16_2 = _MM_CVTEPI8_EPI16 (r16_2);
6427 return _mm_hadd_epi16 (r16_1, r16_2);
6428 }
6429
6430 _NEON2SSESTORAGE int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
6431 _NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0
6432 {
6433 //no 8 bit hadd in IA32, need to go to 16 bit
6434 __m128i r32_1, r32_2;
6435 r32_1 = _MM_CVTEPI16_EPI32(a);
6436 //swap hi and low part of r to process the remaining data
6437 r32_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
6438 r32_2 = _MM_CVTEPI16_EPI32 (r32_2);
6439 return _mm_hadd_epi32 (r32_1, r32_2);
6440 }
6441
6442 _NEON2SSESTORAGE int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
6443 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) // VPADDL.S32 q0,q0
6444 {
6445 _NEON2SSE_ALIGN_16 int32_t atmp[4];
6446 _NEON2SSE_ALIGN_16 int64_t res[2];
6447 _mm_store_si128((__m128i*)atmp, a);
6448 res[0] = (int64_t)atmp[0] + (int64_t)atmp[1];
6449 res[1] = (int64_t)atmp[2] + (int64_t)atmp[3];
6450 return _mm_load_si128((__m128i*)res);
6451 }
6452
6453 _NEON2SSESTORAGE uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
6454 _NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0
6455 {
6456 //no 8 bit hadd in IA32, need to go to 16 bit
6457 __m128i r16_1, r16_2;
6458 r16_1 = _MM_CVTEPU8_EPI16(a);
6459 //swap hi and low part of r to process the remaining data
6460 r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
6461 r16_2 = _MM_CVTEPU8_EPI16 (r16_2);
6462 return _mm_hadd_epi16 (r16_1, r16_2);
6463 }
6464
6465 _NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0
6466 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpaddlq_u16(uint16x8_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6467 {
6468 //serial solution looks faster than a SIMD one
6469 _NEON2SSE_ALIGN_16 uint16_t atmp[8];
6470 _NEON2SSE_ALIGN_16 uint32_t res[4];
6471 _mm_store_si128((__m128i*)atmp, a);
6472 res[0] = (uint32_t)atmp[0] + (uint32_t)atmp[1];
6473 res[1] = (uint32_t)atmp[2] + (uint32_t)atmp[3];
6474 res[2] = (uint32_t)atmp[4] + (uint32_t)atmp[5];
6475 res[3] = (uint32_t)atmp[6] + (uint32_t)atmp[7];
6476 return _mm_load_si128((__m128i*)res);
6477 }
6478
6479 _NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
6480 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpaddlq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6481 {
6482 _NEON2SSE_ALIGN_16 uint32_t atmp[4];
6483 _NEON2SSE_ALIGN_16 uint64_t res[2];
6484 _mm_store_si128((__m128i*)atmp, a);
6485 res[0] = (uint64_t)atmp[0] + (uint64_t)atmp[1];
6486 res[1] = (uint64_t)atmp[2] + (uint64_t)atmp[3];
6487 return _mm_load_si128((__m128i*)res);
6488 }
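//Illustrative sketch of the long pairwise add mappings above (not part of the original header;
//the guard and the example_ name are hypothetical): 16 unsigned bytes are reduced to 8 unsigned
//16-bit lanes, each holding the sum of one adjacent byte pair.
#ifdef NEON2SSE_USAGE_EXAMPLES //hypothetical guard, illustration only
_NEON2SSE_INLINE uint16x8_t example_vpaddlq_u8(void)
{
    uint8x16_t a = _mm_set1_epi8((char)200);
    return vpaddlq_u8(a); //every 16-bit lane holds 400, which would not fit in an 8-bit lane
}
#endif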
6489
6490 //************************ Long pairwise add and accumulate **************************
6491 //****************************************************************************************
6492 //VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector,
6493 // and accumulates the values of the results into the elements of the destination (wide) vector
6494 _NEON2SSESTORAGE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
6495 _NEON2SSE_INLINE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b)
6496 {
6497 int16x4_t res64;
6498 return64(vpadalq_s8(_pM128i(a), _pM128i(b)));
6499 }
6500
6501 _NEON2SSESTORAGE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
6502 _NEON2SSE_INLINE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b)
6503 {
6504 int32x2_t res64;
6505 return64(vpadalq_s16(_pM128i(a), _pM128i(b)));
6506 }
6507
6508
6509 _NEON2SSESTORAGE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
6510 _NEON2SSE_INLINE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b)
6511 {
6512 int64x1_t res;
6513 res.m64_i64[0] = (int64_t)b.m64_i32[0] + (int64_t)b.m64_i32[1] + a.m64_i64[0];
6514 return res;
6515 }
6516
6517 _NEON2SSESTORAGE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
6518 _NEON2SSE_INLINE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b)
6519 {
6520 uint16x4_t res64;
6521 return64(vpadalq_u8(_pM128i(a), _pM128i(b)));
6522 }
6523
6524
6525 _NEON2SSESTORAGE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.s16 d0,d0
6526 _NEON2SSE_INLINE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b)
6527 {
6528 uint32x2_t res64;
6529 return64(vpadalq_u16(_pM128i(a), _pM128i(b)));
6530 }
6531
6532 _NEON2SSESTORAGE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
6533 _NEON2SSE_INLINE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b)
6534 {
6535 uint64x1_t res;
6536 res.m64_u64[0] = (uint64_t)b.m64_u32[0] + (uint64_t)b.m64_u32[1] + a.m64_u64[0];
6537 return res;
6538 }
6539
6540 _NEON2SSESTORAGE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
6541 _NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0
6542 {
6543 int16x8_t pad;
6544 pad = vpaddlq_s8(b);
6545 return _mm_add_epi16 (a, pad);
6546 }
6547
6548 _NEON2SSESTORAGE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
6549 _NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0
6550 {
6551 int32x4_t pad;
6552 pad = vpaddlq_s16(b);
6553 return _mm_add_epi32(a, pad);
6554 }
6555
6556 _NEON2SSESTORAGE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
6557 _NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
6558 {
6559 int64x2_t pad;
6560 pad = vpaddlq_s32(b);
6561 return _mm_add_epi64 (a, pad);
6562 }
6563
6564 _NEON2SSESTORAGE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
6565 _NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0
6566 {
6567 uint16x8_t pad;
6568 pad = vpaddlq_u8(b);
6569 return _mm_add_epi16 (a, pad);
6570 }
6571
6572 _NEON2SSESTORAGE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.s16 q0,q0
6573 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6574 {
6575 uint32x4_t pad;
6576 pad = vpaddlq_u16(b);
6577 return _mm_add_epi32(a, pad);
6578 } //no optimal SIMD solution, serial is faster
6579
6580 _NEON2SSESTORAGE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
6581 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6582 {
6583 //no optimal SIMD solution, serial is faster
6584 uint64x2_t pad;
6585 pad = vpaddlq_u32(b);
6586 return _mm_add_epi64(a, pad);
6587 } //no optimal SIMD solution, serial is faster
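//Illustrative sketch of the pairwise add-and-accumulate mappings above (not part of the
//original header; the guard and the example_ name are hypothetical): a typical pattern is
//summing bytes into wider accumulators across loop iterations without overflow.
#ifdef NEON2SSE_USAGE_EXAMPLES //hypothetical guard, illustration only
_NEON2SSE_INLINE uint16x8_t example_vpadalq_u8(uint16x8_t acc, uint8x16_t bytes)
{
    return vpadalq_u8(acc, bytes); //acc[i] += bytes[2*i] + bytes[2*i+1]
}
#endif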
6588
6589 //********** Folding maximum *************************************
6590 //*******************************************************************
6591 //VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors,
6592 //and copies the larger of each pair into the corresponding element in the destination
6593 // no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison
6594 _NEON2SSESTORAGE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
6595 _NEON2SSE_INLINE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) // VPMAX.S8 d0,d0,d0
6596 {
6597 int8x8_t res64;
6598 __m128i ab, ab1, max;
6599 _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
6600 _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6601 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
6602 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs are swapped for vertical max finding
6603 max = _MM_MAX_EPI8 (ab, ab1); // SSE4.1
6604 max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
6605 return64(max); //we need 64 bits only
6606 }
6607
6608 _NEON2SSESTORAGE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
6609 _NEON2SSE_INLINE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) // VPMAX.S16 d0,d0,d0
6610 {
6611 //this solution may not be optimal compared with the serial one
6612 int16x4_t res64;
6613 __m128i ab, ab1, max;
6614 _NEON2SSE_ALIGN_16 static const int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
6615 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
6616 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs are swapped for vertical max finding, using the 8-bit shuffle and the corresponding mask
6617 max = _mm_max_epi16 (ab, ab1);
6618 max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
6619 return64(max);
6620 }
6621
6622 _NEON2SSESTORAGE int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
6623 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmax_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6624 {
6625 //serial solution looks faster than SIMD one
6626 int32x2_t res;
6627 res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
6628 res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
6629 return res;
6630 }
6631
6632 _NEON2SSESTORAGE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
6633 _NEON2SSE_INLINE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) // VPMAX.U8 d0,d0,d0
6634 {
6635 uint8x8_t res64;
6636 __m128i ab, ab1, max;
6637 _NEON2SSE_ALIGN_16 static const int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
6638 _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6639 ab = _mm_unpacklo_epi64 (_pM128i(a), _pM128i(b)); //ab
6640 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs are swapped for vertical max finding
6641 max = _mm_max_epu8 (ab, ab1); // SSE4.1
6642 max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
6643 return64(max);
6644 }
6645
6646 _NEON2SSESTORAGE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.s16 d0,d0,d0
6647 _NEON2SSE_INLINE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) // VPMAX.s16 d0,d0,d0
6648 {
6649 //this solution may not be optimal compared with the serial one
6650 uint16x4_t res64;
6651 __m128i ab, ab1, max;
6652 _NEON2SSE_ALIGN_16 static const uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
6653 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
6654 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs are swapped for vertical max finding, using the 8-bit shuffle and the corresponding mask
6655 max = _MM_MAX_EPU16 (ab, ab1);
6656 max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
6657 return64(max);
6658 }
6659
6660 _NEON2SSESTORAGE uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
6661 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6662 {
6663 //serial solution looks faster than SIMD one
6664 uint32x2_t res;
6665 res.m64_u32[0] = (a.m64_u32[0] < a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
6666 res.m64_u32[1] = (b.m64_u32[0] < b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
6667 return res;
6668 }
6669
6670 _NEON2SSESTORAGE float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
6671 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmax_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6672 {
6673 //serial solution looks faster than SIMD one
6674 float32x2_t res;
6675 res.m64_f32[0] = (a.m64_f32[0] < a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
6676 res.m64_f32[1] = (b.m64_f32[0] < b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
6677 return res;
6678 }
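//Illustrative sketch of the folding maximum mappings above (not part of the original header;
//the guard and the example_ name are hypothetical): the result holds the maximum of the a pair
//in lane 0 and the maximum of the b pair in lane 1.
#ifdef NEON2SSE_USAGE_EXAMPLES //hypothetical guard, illustration only
_NEON2SSE_INLINE float32x2_t example_vpmax_f32(void)
{
    float32x2_t a, b;
    a.m64_f32[0] = 1.0f; a.m64_f32[1] = 4.0f; //max of the a pair -> 4.0f
    b.m64_f32[0] = 3.0f; b.m64_f32[1] = 2.0f; //max of the b pair -> 3.0f
    return vpmax_f32(a, b);                   //result lanes: {4.0f, 3.0f}
}
#endif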
6679
6680 // ***************** Folding minimum ****************************
6681 // **************************************************************
6682 //vpmin -> takes minimum of adjacent pairs
6683 _NEON2SSESTORAGE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
6684 _NEON2SSE_INLINE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) // VPMIN.S8 d0,d0,d0
6685 {
6686 int8x8_t res64;
6687 __m128i ab, ab1, min;
6688 _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
6689 _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6690 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
6691 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
6692 min = _MM_MIN_EPI8 (ab, ab1); // SSE4.1
6693 min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
6694 return64(min);
6695 }
6696
6697 _NEON2SSESTORAGE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
6698 _NEON2SSE_INLINE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) // VPMIN.S16 d0,d0,d0
6699 {
6700 //this solution may not be optimal compared with the serial one
6701 int16x4_t res64;
6702 __m128i ab, ab1, min;
6703 _NEON2SSE_ALIGN_16 static const int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
6704 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
6705 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask
6706 min = _mm_min_epi16 (ab, ab1);
6707 min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
6708 return64(min);
6709 }
6710
6711 _NEON2SSESTORAGE int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
6712 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmin_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6713 {
6714 //serial solution looks faster than SIMD one
6715 int32x2_t res;
6716 res.m64_i32[0] = (a.m64_i32[0] > a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
6717 res.m64_i32[1] = (b.m64_i32[0] > b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
6718 return res;
6719 }
6720
6721 _NEON2SSESTORAGE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
6722 _NEON2SSE_INLINE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) // VPMIN.U8 d0,d0,d0
6723 {
6724 uint8x8_t res64;
6725 __m128i ab, ab1, min;
6726 _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
6727 _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
6728 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
6729 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
6730 min = _mm_min_epu8 (ab, ab1); // SSE4.1
6731 min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
6732 return64(min);
6733 }
6734
6735 _NEON2SSESTORAGE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.s16 d0,d0,d0
6736 _NEON2SSE_INLINE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) // VPMIN.s16 d0,d0,d0
6737 {
6738 //this solution may not be optimal compared with the serial one
6739 uint16x4_t res64;
6740 __m128i ab, ab1, min;
6741 _NEON2SSE_ALIGN_16 static const uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
6742 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
6743 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask
6744 min = _MM_MIN_EPU16 (ab, ab1);
6745 min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
6746 return64(min);
6747 }
6748
6749 _NEON2SSESTORAGE uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
6750 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6751 {
6752 //serial solution looks faster than SIMD one
6753 uint32x2_t res;
6754 res.m64_u32[0] = (a.m64_u32[0] > a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
6755 res.m64_u32[1] = (b.m64_u32[0] > b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
6756 return res;
6757 }
6758
6759 _NEON2SSESTORAGE float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
6760 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmin_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6761 {
6762 //serial solution looks faster than SIMD one
6763 float32x2_t res;
6764 res.m64_f32[0] = (a.m64_f32[0] > a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
6765 res.m64_f32[1] = (b.m64_f32[0] > b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
6766 return res;
6767 }
6768
6769 //***************************************************************
6770 //*********** Reciprocal/Sqrt ************************************
6771 //***************************************************************
6772 //****************** Reciprocal estimate *******************************
6773 //the ARM NEON and x86 SIMD results may be slightly different
6774 _NEON2SSESTORAGE float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
6775 _NEON2SSE_INLINE float32x2_t vrecpe_f32(float32x2_t a) //use low 64 bits
6776 {
6777 float32x4_t res;
6778 __m64_128 res64;
6779 res = _mm_rcp_ps(_pM128(a));
6780 _M64f(res64, res);
6781 return res64;
6782 }
6783
6784 _NEON2SSESTORAGE uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
6785 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrecpe_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6786 {
6787 //Input is a fixed-point number!!! No reciprocal for ints is available in IA32 SIMD
6788 uint32x2_t res;
6789 float resf, r;
6790 int i, q, s;
6791 for (i =0; i<2; i++){
6792 if((a.m64_u32[i] & 0x80000000) == 0) {
6793 res.m64_u32[i] = 0xffffffff;
6794 }else{
6795 resf = (float) (a.m64_u32[i] * (0.5f / ((uint32_t)1 << 31)));
6796 q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
6797 r = (float)(1.0 / (((float)q + 0.5) / 512.0)); /* reciprocal r */
6798 s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
6799 r = (float)s / 256.0;
6800 res.m64_u32[i] = r * ((uint32_t)1 << 31);
6801 }
6802 }
6803 return res;
6804 }
6805
6806 _NEON2SSESTORAGE float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
6807 #define vrecpeq_f32 _mm_rcp_ps
6808
6809
6810 _NEON2SSESTORAGE uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
6811 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6812 {
6813 //Input is a fixed-point number!!!
6814 //We implement the recip_estimate function as described in ARMv7 reference manual (VRECPE instruction) but use float instead of double
6815 _NEON2SSE_ALIGN_16 uint32_t atmp[4];
6816 _NEON2SSE_ALIGN_16 uint32_t res[4];
6817 _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000,0x80000000, 0x80000000,0x80000000};
6818 float resf, r;
6819 int i, q, s;
6820 __m128i res128, mask, zero;
6821 _mm_store_si128((__m128i*)atmp, a);
6822 zero = _mm_setzero_si128();
6823 for (i =0; i<4; i++){
6824 resf = (atmp[i] * (0.5f / ((uint32_t)1 << 31))); // 2.3283064365386963E-10 ~(0.5f / ((uint32_t)1 << 31))
6825 q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
6826 r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */
6827 s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
6828 r = (float)s / 256.0;
6829 res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
6830 }
6831 res128 = _mm_load_si128((__m128i*)res);
6832 mask = _mm_and_si128(a, *(__m128i*)c80000000);
6833 mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x7fffffff
6834 return _mm_or_si128(res128, mask);
6835 }
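//A rough worked illustration of the fixed-point convention above (numbers follow the code, treat them as approximate):
//the input lane is read as an unsigned fraction a/2^32 in [0.5, 1.0) (top bit set) and the estimated reciprocal,
//which lies in (1.0, 2.0], is returned scaled by 2^31. For example a = 0x80000000 represents 0.5 and the loop above
//produces approximately 0xff800000, i.e. an estimate of about 1.996; inputs without the top bit set return 0xffffffff.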
6836
6837 //**********Reciprocal square root estimate ****************
6838 //**********************************************************
6839 //no reciprocal square root for ints is available in IA32 SIMD, nor an unsigned int to float4 lanes conversion, so a serial solution looks faster
6840 //but the particular implementation of vrsqrte_u32 may vary across ARM compilers
6841 //the ARM NEON and x86 SIMD results may be slightly different
6842 _NEON2SSESTORAGE float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
6843 _NEON2SSE_INLINE float32x2_t vrsqrte_f32(float32x2_t a) //use low 64 bits
6844 {
6845 float32x4_t res;
6846 __m64_128 res64;
6847 res = _mm_rsqrt_ps(_pM128(a));
6848 _M64f(res64, res);
6849 return res64;
6850 }
6851
6852 _NEON2SSESTORAGE uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
6853 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6854 {
6855 //Input is a fixed-point number!!!
6856 //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
6857 uint32x2_t res;
6858 __m128 tmp;
6859 float r, resf, coeff;
6860 int i,q0, s;
6861 for (i =0; i<2; i++){
6862 if((a.m64_u32[i] & 0xc0000000) == 0) { //a <=0x3fffffff
6863 res.m64_u32[i] = 0xffffffff;
6864 }else{
6865 resf = (float) (a.m64_u32[i] * (0.5f / ((uint32_t)1 << 31)));
6866 coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/
6867 q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
6868 r = ((float)q0 + 0.5) / coeff;
6869 tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
6870 _mm_store_ss(&r, tmp);
6871 s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
6872 r = (float)(s / 256.0);
6873 res.m64_u32[i] = r * (((uint32_t)1) << 31);
6874 }
6875 }
6876 return res;
6877 }
6878
6879 _NEON2SSESTORAGE float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
6880 #define vrsqrteq_f32 _mm_rsqrt_ps
6881
6882 _NEON2SSESTORAGE uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
6883 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
6884 {
6885 //Input is a fixed-point number!!!
6886 //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
6887 _NEON2SSE_ALIGN_16 uint32_t atmp[4], res[4];
6888 _NEON2SSE_ALIGN_16 static const uint32_t c_c0000000[4] = {0xc0000000,0xc0000000, 0xc0000000,0xc0000000};
6889 __m128 tmp;
6890 __m128i res128, mask, zero;
6891 float r, resf, coeff;
6892 int i,q0, s;
6893 _mm_store_si128((__m128i*)atmp, a);
6894 zero = _mm_setzero_si128();
6895 for (i =0; i<4; i++){
6896 resf = (float) (atmp[i] * (0.5f / ((uint32_t)1 << 31)));
6897 coeff = (float)((resf < 0.5)? 512.0 : 256.0); /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/
6898 q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
6899 r = ((float)q0 + 0.5) / coeff;
6900 tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
6901 _mm_store_ss(&r, tmp);
6902 s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
6903 r = (float)s / 256.0;
6904 res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
6905 }
6906 res128 = _mm_load_si128((__m128i*)res);
6907 mask = _mm_and_si128(a, *(__m128i*)c_c0000000);
6908 mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x3fffffff
6909 return _mm_or_si128(res128, mask);
6910 }
6911 //************ Reciprocal estimate/step and 1/sqrt estimate/step ***************************
6912 //******************************************************************************************
6913 //******VRECPS (Vector Reciprocal Step) ***************************************************
6914 //multiplies the elements of one vector by the corresponding elements of another vector,
6915 //subtracts each of the results from 2, and places the final results into the elements of the destination vector.
6916
6917 _NEON2SSESTORAGE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
6918 _NEON2SSE_INLINE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b)
6919 {
6920 float32x4_t res;
6921 __m64_128 res64;
6922 res = vrecpsq_f32(_pM128(a), _pM128(b));
6923 _M64f(res64, res);
6924 return res64;
6925 }
6926
6927 _NEON2SSESTORAGE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
6928 _NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0
6929 {
6930 __m128 f2, mul;
6931 f2 = _mm_set1_ps(2.);
6932 mul = _mm_mul_ps(a,b);
6933 return _mm_sub_ps(f2,mul);
6934 }
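//A minimal usage sketch of the usual NEON refinement idiom (illustrative only; 'd' is a hypothetical input vector and
//vmulq_f32 is defined elsewhere in this file): each VRECPS step is one Newton-Raphson iteration x_new = x * (2 - d*x)
//and roughly doubles the number of correct bits of the initial vrecpeq_f32 estimate.
//    float32x4_t x = vrecpeq_f32(d);              //coarse estimate of 1/d
//    x = vmulq_f32(x, vrecpsq_f32(d, x));         //first refinement step
//    x = vmulq_f32(x, vrecpsq_f32(d, x));         //second refinement step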
6935
6936 //*****************VRSQRTS (Vector Reciprocal Square Root Step) *****************************
6937 //multiplies the elements of one vector by the corresponding elements of another vector,
6938 //subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector.
6939
6940 _NEON2SSESTORAGE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
6941 _NEON2SSE_INLINE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b)
6942 {
6943 float32x2_t res;
6944 res.m64_f32[0] = (3 - a.m64_f32[0] * b.m64_f32[0]) / 2;
6945 res.m64_f32[1] = (3 - a.m64_f32[1] * b.m64_f32[1]) / 2;
6946 return res;
6947 }
6948
6949 _NEON2SSESTORAGE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
6950 _NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0
6951 {
6952 __m128 f3, f05, mul;
6953 f3 = _mm_set1_ps(3.);
6954 f05 = _mm_set1_ps(0.5);
6955 mul = _mm_mul_ps(a,b);
6956 f3 = _mm_sub_ps(f3,mul);
6957 return _mm_mul_ps (f3, f05);
6958 }
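//A minimal usage sketch of the matching 1/sqrt refinement idiom (illustrative only; 'd' is a hypothetical input vector
//and vmulq_f32 is defined elsewhere in this file): each VRSQRTS step computes x_new = x * (3 - d*x*x) / 2.
//    float32x4_t x = vrsqrteq_f32(d);                             //coarse estimate of 1/sqrt(d)
//    x = vmulq_f32(x, vrsqrtsq_f32(vmulq_f32(d, x), x));          //one Newton-Raphson refinement step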
6959 //********************************************************************************************
6960 //***************************** Shifts by signed variable ***********************************
6961 //********************************************************************************************
6962 //***** Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) ***********************
6963 //********************************************************************************************
6964 //No such operations exist in IA32 SIMD, only shifts by a constant are available, so a serial solution is needed
6965 //helper macro. It matches the ARM implementation for big shifts
6966 #define SERIAL_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
6967 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
6968 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
6969 for (i = 0; i<LEN; i++) { \
6970 if( (btmp[i] >= lanesize)||(btmp[i] <= -lanesize) ) res[i] = 0; \
6971 else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \
6972 return _mm_load_si128((__m128i*)res);
6973
6974 #define SERIAL_SHIFT_64(TYPE, SIGN, LEN) \
6975 int ## TYPE ## x ## LEN ## _t res; int i, lanesize = sizeof(int ## TYPE ## _t) << 3; \
6976 for (i = 0; i<LEN; i++) { \
6977 if( (b.m64_i ## TYPE[i] >= lanesize)||(b.m64_i ## TYPE[i] <= -lanesize) ) res.m64_ ## SIGN ## TYPE[i] = 0; \
6978 else res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] >=0) ? a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i] : a.m64_ ## SIGN ## TYPE[i] >> (-b.m64_i ## TYPE[i]); } \
6979 return res;
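//For example (hypothetical lane values, 8-bit lanes), with a.m64_i8[i] = 4 the macros above give:
//    b.m64_i8[i] =  3   ->  res = 4 << 3 = 32
//    b.m64_i8[i] = -1   ->  res = 4 >> 1 = 2        (negative shift counts shift right)
//    b.m64_i8[i] =  9   ->  res = 0                 (|shift| >= lane size of 8 bits)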
6980
6981 _NEON2SSESTORAGE int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
6982 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6983 {
6984 SERIAL_SHIFT_64(8, i, 8)
6985 }
6986
6987 _NEON2SSESTORAGE int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
6988 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6989 {
6990 SERIAL_SHIFT_64(16, i, 4)
6991 }
6992
6993 _NEON2SSESTORAGE int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
6994 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
6995 {
6996 SERIAL_SHIFT_64(32, i, 2)
6997 }
6998
6999 _NEON2SSESTORAGE int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
7000 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7001 {
7002 SERIAL_SHIFT_64(64, i, 1)
7003 }
7004
7005 _NEON2SSESTORAGE uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
7006 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7007 {
7008 SERIAL_SHIFT_64(8, u, 8)
7009 }
7010
7011 _NEON2SSESTORAGE uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.s16 d0,d0,d0
7012 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7013 {
7014 SERIAL_SHIFT_64(16, u, 4)
7015 }
7016
7017 _NEON2SSESTORAGE uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
7018 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7019 {
7020 SERIAL_SHIFT_64(32, u, 2)
7021 }
7022
7023 _NEON2SSESTORAGE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
7024 _NEON2SSE_INLINE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b) //if we used the SERIAL_SHIFT macro we would need special processing for large shift amounts
7025 {
7026 SERIAL_SHIFT_64(64, u, 1)
7027 }
7028
7029 _NEON2SSESTORAGE int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
7030 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7031 {
7032 SERIAL_SHIFT(int8_t, int8_t, 16, 16)
7033 }
7034
7035 _NEON2SSESTORAGE int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
7036 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7037 {
7038 SERIAL_SHIFT(int16_t, int16_t, 8, 8)
7039 }
7040
7041 _NEON2SSESTORAGE int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
7042 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7043 {
7044 SERIAL_SHIFT(int32_t, int32_t, 4, 4)
7045 }
7046
7047 _NEON2SSESTORAGE int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
7048 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7049 {
7050 SERIAL_SHIFT(int64_t, int64_t, 2, 2)
7051 }
7052
7053 _NEON2SSESTORAGE uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
7054 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7055 {
7056 SERIAL_SHIFT(uint8_t, int8_t, 16, 16)
7057 }
7058
7059 _NEON2SSESTORAGE uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.s16 q0,q0,q0
7060 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7061 {
7062 SERIAL_SHIFT(uint16_t, int16_t, 8, 8)
7063 }
7064
7065 _NEON2SSESTORAGE uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
7066 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7067 {
7068 SERIAL_SHIFT(uint32_t, int32_t, 4, 4)
7069 }
7070
7071 _NEON2SSESTORAGE uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
7072 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7073 {
7074 SERIAL_SHIFT(uint64_t, int64_t, 2, 2)
7075 }
7076
7077
7078 //*********** Vector saturating shift left: (negative values shift right) **********************
7079 //********************************************************************************************
7080 //No such operations are available in IA32 SIMD, only shifts by a constant, so a serial solution is needed
7081 #define SERIAL_SATURATING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
7082 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
7083 int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
7084 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7085 for (i = 0; i<LEN; i++) { \
7086 if (atmp[i] ==0) res[i] = 0; \
7087 else{ \
7088 if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \
7089 else{ \
7090 if (btmp[i]>lanesize_1) { \
7091 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7092 }else{ \
7093 limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
7094 if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
7095 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7096 else res[i] = atmp[i] << btmp[i]; }}}} \
7097 return _mm_load_si128((__m128i*)res);
7098
7099 #define SERIAL_SATURATING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
7100 _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
7101 TYPE lanesize = (sizeof(TYPE) << 3); \
7102 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7103 for (i = 0; i<LEN; i++) { \
7104 if (atmp[i] ==0) {res[i] = 0; \
7105 }else{ \
7106 if(btmp[i] < 0) res[i] = atmp[i] >> (-btmp[i]); \
7107 else{ \
7108 if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
7109 else{ \
7110 limit = (TYPE) 1 << (lanesize - btmp[i]); \
7111 res[i] = ( atmp[i] >= limit) ? ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
7112 return _mm_load_si128((__m128i*)res);
7113
7114 #define SERIAL_SATURATING_SHIFT_SIGNED_64(TYPE, LEN) \
7115 int ## TYPE ## x ## LEN ## _t res; int ## TYPE ## _t limit; int i; \
7116 int lanesize_1 = (sizeof( int ## TYPE ## _t) << 3) - 1; \
7117 for (i = 0; i<LEN; i++) { \
7118 if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
7119 else{ \
7120 if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
7121 else{ \
7122 if (b.m64_i ## TYPE[i]>lanesize_1) { \
7123 res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
7124 }else{ \
7125 limit = (int ## TYPE ## _t) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
7126 if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
7127 res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
7128 else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
7129 return res;
7130
7131 #define SERIAL_SATURATING_SHIFT_UNSIGNED_64(TYPE, LEN) \
7132 int ## TYPE ## x ## LEN ## _t res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
7133 int ## TYPE ## _t lanesize = (sizeof(int ## TYPE ## _t) << 3); \
7134 for (i = 0; i<LEN; i++) { \
7135 if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
7136 }else{ \
7137 if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
7138 else{ \
7139 if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
7140 else{ \
7141 limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
7142 res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_u ## TYPE[i]; }}}} \
7143 return res;
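//For example (hypothetical int8 lane values) the saturating macros above behave as follows:
//    a =  100, b =  1  ->  100 << 1 = 200 does not fit in int8, so the result saturates to  127
//    a = -100, b =  1  ->  saturates to -128
//    a =  100, b = -2  ->  plain right shift, 100 >> 2 = 25 (no saturation is needed for right shifts)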
7144
7145 _NEON2SSESTORAGE int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
7146 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7147 {
7148 SERIAL_SATURATING_SHIFT_SIGNED_64(8,8)
7149 }
7150
7151 _NEON2SSESTORAGE int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
7152 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7153 {
7154 SERIAL_SATURATING_SHIFT_SIGNED_64(16,4)
7155 }
7156
7157 _NEON2SSESTORAGE int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
7158 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7159 {
7160 SERIAL_SATURATING_SHIFT_SIGNED_64(32,2)
7161 }
7162
7163 _NEON2SSESTORAGE int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
7164 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7165 {
7166 SERIAL_SATURATING_SHIFT_SIGNED_64(64,1)
7167 }
7168
7169 _NEON2SSESTORAGE uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
7170 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7171 {
7172 SERIAL_SATURATING_SHIFT_UNSIGNED_64(8,8)
7173 }
7174
7175 _NEON2SSESTORAGE uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.s16 d0,d0,d0
7176 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7177 {
7178 SERIAL_SATURATING_SHIFT_UNSIGNED_64(16,4)
7179 }
7180
7181 _NEON2SSESTORAGE uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
7182 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7183 {
7184 SERIAL_SATURATING_SHIFT_UNSIGNED_64(32,2)
7185 }
7186
7187 _NEON2SSESTORAGE uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
7188 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7189 {
7190 SERIAL_SATURATING_SHIFT_UNSIGNED_64(64,1)
7191 }
7192
7193 _NEON2SSESTORAGE int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
7194 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7195 {
7196 SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16)
7197 }
7198
7199 _NEON2SSESTORAGE int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
7200 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7201 {
7202 SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8)
7203 }
7204
7205 _NEON2SSESTORAGE int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
7206 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7207 {
7208 SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4)
7209 }
7210
7211 _NEON2SSESTORAGE int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
7212 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7213 {
7214 SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2)
7215 }
7216
7217 _NEON2SSESTORAGE uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
7218 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7219 {
7220 SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16)
7221 }
7222
7223 _NEON2SSESTORAGE uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.s16 q0,q0,q0
7224 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7225 {
7226 SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8)
7227 }
7228
7229 _NEON2SSESTORAGE uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
7230 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7231 {
7232 SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4)
7233 }
7234
7235 _NEON2SSESTORAGE uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
7236 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7237 {
7238 SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2)
7239 }
7240
7241
7242 //******** Vector rounding shift left: (negative values shift right) **********
7243 //****************************************************************************
7244 //No such operations are available in IA32 SIMD, only shifts by a constant, so a serial solution is needed
7245 //rounding makes sense for right shifts only.
7246 #define SERIAL_ROUNDING_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
7247 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; INTERNAL_TYPE i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
7248 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7249 for (i = 0; i<LEN; i++) { \
7250 if( btmp[i] >= 0) { \
7251 if(btmp[i] >= lanesize) res[i] = 0; \
7252 else res[i] = (atmp[i] << btmp[i]); \
7253 }else{ \
7254 res[i] = (btmp[i] < -lanesize) ? 0 : \
7255 (btmp[i] == -lanesize) ? (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) : \
7256 (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); }} \
7257 return _mm_load_si128((__m128i*)res);
7258
7259
7260 #define SERIAL_ROUNDING_SHIFT_64(TYPE, SIGN, LEN) \
7261 int ## TYPE ## x ## LEN ## _t res; int i; int lanesize = sizeof(int ## TYPE ## _t) << 3; \
7262 for (i = 0; i<LEN; i++) { \
7263 if( b.m64_i ## TYPE[i] >= 0) { \
7264 if(b.m64_i ## TYPE[i] >= lanesize) res.m64_ ## SIGN ## TYPE[i] = 0; \
7265 else res.m64_ ## SIGN ## TYPE[i] = (a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i]); \
7266 }else{ \
7267 res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] < -lanesize) ? 0 : \
7268 (b.m64_i ## TYPE[i] == -lanesize) ? (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) : \
7269 (a.m64_ ## SIGN ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); }} \
7270 return res;
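//For example (hypothetical int8 lane values) the rounding macros above give, for a = 9:
//    b = -1  ->  (9 >> 1) + rounding bit 1 = 5      i.e. 9/2 = 4.5 rounded to nearest
//    b = -2  ->  (9 >> 2) + rounding bit 0 = 2      i.e. 9/4 = 2.25 rounded to nearest
//    b =  2  ->  9 << 2 = 36                        (no rounding is applied to left shifts)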
7271
7272
7273 _NEON2SSESTORAGE int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
7274 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7275 {
7276 SERIAL_ROUNDING_SHIFT_64(8,i,8)
7277 }
7278
7279 _NEON2SSESTORAGE int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
7280 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7281 {
7282 SERIAL_ROUNDING_SHIFT_64(16,i,4)
7283 }
7284
7285 _NEON2SSESTORAGE int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
7286 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7287 {
7288 SERIAL_ROUNDING_SHIFT_64(32,i,2)
7289 }
7290
7291 _NEON2SSESTORAGE int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
7292 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7293 {
7294 SERIAL_ROUNDING_SHIFT_64(64,i,1)
7295 }
7296
7297 _NEON2SSESTORAGE uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
7298 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7299 {
7300 SERIAL_ROUNDING_SHIFT_64(8,u,8)
7301 }
7302
7303 _NEON2SSESTORAGE uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.s16 d0,d0,d0
7304 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7305 {
7306 SERIAL_ROUNDING_SHIFT_64(16,u,4)
7307 }
7308
7309 _NEON2SSESTORAGE uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
7310 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7311 {
7312 SERIAL_ROUNDING_SHIFT_64(32,u,2)
7313 }
7314
7315 _NEON2SSESTORAGE uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
7316 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7317 {
7318 SERIAL_ROUNDING_SHIFT_64(64,u,1)
7319 }
7320
7321 _NEON2SSESTORAGE int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
7322 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7323 {
7324 SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16)
7325 }
7326
7327 _NEON2SSESTORAGE int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
7328 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7329 {
7330 SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8)
7331 }
7332
7333 _NEON2SSESTORAGE int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
7334 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7335 {
7336 SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4)
7337 }
7338
7339 _NEON2SSESTORAGE int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
7340 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7341 {
7342 SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2)
7343 }
7344
7345 _NEON2SSESTORAGE uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
7346 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7347 {
7348 SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 16, 16)
7349 }
7350
7351 _NEON2SSESTORAGE uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.s16 q0,q0,q0
7352 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7353 {
7354 SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8)
7355 }
7356
7357 _NEON2SSESTORAGE uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
7358 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7359 {
7360 SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4)
7361 }
7362
7363 _NEON2SSESTORAGE uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
7364 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7365 {
7366 SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2)
7367 }
7368
7369
7370 //********** Vector saturating rounding shift left: (negative values shift right) ****************
7371 //*************************************************************************************************
7372 //No such operations exist in IA32 SIMD, only shifts by a constant are available, so a serial solution is needed
7373 //Saturation happens for left shifts only while rounding makes sense for right shifts only.
7374 #define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
7375 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
7376 int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
7377 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7378 for (i = 0; i<LEN; i++) { \
7379 if (atmp[i] ==0) res[i] = 0; \
7380 else{ \
7381 if(btmp[i] <0) res[i] = (btmp[i] < (-lanesize_1)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
7382 else{ \
7383 if (btmp[i]>lanesize_1) { \
7384 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7385 }else{ \
7386 limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
7387 if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
7388 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
7389 else res[i] = atmp[i] << btmp[i]; }}}} \
7390 return _mm_load_si128((__m128i*)res);
7391
7392 #define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
7393 _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
7394 int lanesize = (sizeof(TYPE) << 3); \
7395 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
7396 for (i = 0; i<LEN; i++) { \
7397 if (atmp[i] ==0) {res[i] = 0; \
7398 }else{ \
7399 if(btmp[i] < 0) res[i] = (btmp[i] < (-lanesize)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
7400 else{ \
7401 if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
7402 else{ \
7403 limit = (TYPE) 1 << (lanesize - btmp[i]); \
7404 res[i] = ( atmp[i] >= limit) ? ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
7405 return _mm_load_si128((__m128i*)res);
7406
7407 #define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(TYPE, LEN) \
7408 __m64_128 res; int ## TYPE ## _t limit; int i; \
7409 int lanesize_1 = (sizeof(int ## TYPE ## _t ) << 3) - 1; \
7410 for (i = 0; i<LEN; i++) { \
7411 if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
7412 else{ \
7413 if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize_1)) ? 0 : (a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_i ## TYPE[i] & ((int ## TYPE ## _t ) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
7414 else{ \
7415 if (b.m64_i ## TYPE[i]>lanesize_1) { \
7416 res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
7417 }else{ \
7418 limit = (int ## TYPE ## _t ) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
7419 if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
7420 res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
7421 else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
7422 return res;
7423
7424 #define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(TYPE, LEN) \
7425 __m64_128 res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
7426 int lanesize = (sizeof(int ## TYPE ## _t) << 3); \
7427 for (i = 0; i<LEN; i++) { \
7428 if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
7429 }else{ \
7430 if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize)) ? 0 : (a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_u ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
7431 else{ \
7432 if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
7433 else{ \
7434 limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
7435 res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
7436 return res;
7437
7438 _NEON2SSESTORAGE int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
7439 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7440 {
7441 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(8,8)
7442 }
7443
7444 _NEON2SSESTORAGE int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
7445 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7446 {
7447 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(16,4)
7448 }
7449
7450 _NEON2SSESTORAGE int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
7451 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7452 {
7453 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(32,2)
7454 }
7455
7456 _NEON2SSESTORAGE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
7457 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7458 {
7459 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(64,1)
7460 }
7461
7462 _NEON2SSESTORAGE uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
7463 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7464 {
7465 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(8,8)
7466 }
7467
7468 _NEON2SSESTORAGE uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.s16 d0,d0,d0
7469 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7470 {
7471 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(16,4)
7472 }
7473
7474 _NEON2SSESTORAGE uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
7475 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7476 {
7477 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(32,2)
7478 }
7479
7480 _NEON2SSESTORAGE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
7481 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7482 {
7483 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(64,1)
7484 }
7485
7486 _NEON2SSESTORAGE int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
7487 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7488 {
7489 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16)
7490 }
7491
7492 _NEON2SSESTORAGE int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
7493 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7494 {
7495 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8)
7496 }
7497
7498 _NEON2SSESTORAGE int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
7499 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7500 {
7501 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4)
7502 }
7503
7504 _NEON2SSESTORAGE int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
7505 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7506 {
7507 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2)
7508 }
7509
7510 _NEON2SSESTORAGE uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
7511 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7512 {
7513 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16)
7514 }
7515
7516 _NEON2SSESTORAGE uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.s16 q0,q0,q0
7517 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7518 {
7519 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8)
7520 }
7521
7522 _NEON2SSESTORAGE uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
7523 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7524 {
7525 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4)
7526 }
7527
7528 _NEON2SSESTORAGE uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
7529 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
7530 {
7531 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2)
7532 }
7533
7534 // *********************************************************************************
7535 // ***************************** Shifts by a constant *****************************
7536 // *********************************************************************************
7537 //**************** Vector shift right by constant*************************************
7538 //************************************************************************************
7539 _NEON2SSESTORAGE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
7540 _NEON2SSE_INLINE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VSHR.S8 d0,d0,#8
7541 {
7542 //no 8 bit shift available, go to 16 bit
7543 int8x8_t res64;
7544 __m128i r;
7545 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
7546 r = _mm_srai_epi16 (r, b); //SSE2
7547 r = _mm_packs_epi16 (r,r); //we need 64 bits only
7548 return64(r);
7549 }
7550
7551 _NEON2SSESTORAGE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
7552 _NEON2SSE_INLINE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b)
7553 {
7554 int16x4_t res64;
7555 return64(_mm_srai_epi16(_pM128i(a), b));
7556 }
7557
7558
7559 _NEON2SSESTORAGE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
7560 _NEON2SSE_INLINE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b)
7561 {
7562 int32x2_t res64;
7563 return64(_mm_srai_epi32(_pM128i(a), b));
7564 }
7565
7566 _NEON2SSESTORAGE int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
7567 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
7568 {
7569 //no arithmetic shift for 64bit values, serial solution used
7570 int64x1_t res;
7571 if(b>=64) res.m64_i64[0] = 0;
7572 else res.m64_i64[0] = (*(int64_t*)&a) >> b;
7573 return res;
7574 }
7575
7576 _NEON2SSESTORAGE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
7577 _NEON2SSE_INLINE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VSHR.U8 d0,d0,#8
7578 {
7579 //no 8 bit shift available, go to 16 bit
7580 uint8x8_t res64;
7581 __m128i r;
7582 r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
7583 r = _mm_srli_epi16 (r, b); //for unsigned variables we use the logical shift not arithmetical one
7584 r = _mm_packus_epi16 (r,r); //we need 64 bits only
7585 return64(r);
7586 }
7587
7588 _NEON2SSESTORAGE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.s16 d0,d0,#16
7589 _NEON2SSE_INLINE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b)
7590 {
7591 uint16x4_t res64;
7592 return64(_mm_srli_epi16(_pM128i(a), b));
7593 }
7594
7595
7596 _NEON2SSESTORAGE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
7597 _NEON2SSE_INLINE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b)
7598 {
7599 uint32x2_t res64;
7600 return64(_mm_srli_epi32(_pM128i(a), b));
7601 }
7602
7603
7604 _NEON2SSESTORAGE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
7605 _NEON2SSE_INLINE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
7606 {
7607 uint64x1_t res64;
7608 return64(_mm_srli_epi64(_pM128i(a), b));
7609 }
7610
7611
7612 _NEON2SSESTORAGE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
7613 _NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8
7614 {
7615 //no 8 bit shift available, go to 16 bit trick
7616 __m128i zero, mask0, a_sign, r, a_sign_mask;
7617 _NEON2SSE_ALIGN_16 static const int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0, 0x00f8, 0x00fc, 0x00fe, 0x00ff};
7618 zero = _mm_setzero_si128();
7619 mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
7620 a_sign = _mm_cmpgt_epi8 (zero, a); //ff if a<0 or zero if a>0
7621 r = _mm_srai_epi16 (a, b);
7622 a_sign_mask = _mm_and_si128 (mask0, a_sign);
7623 r = _mm_andnot_si128 (mask0, r);
7624 return _mm_or_si128 (r, a_sign_mask);
7625 }
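//A bit-level illustration of the 16 bit trick above (hypothetical bytes, b = 1): a low byte 0x9a (-102) packed next to
//a high byte 0x02 forms the 16 bit lane 0x029a; _mm_srai_epi16 yields 0x014d, so the low byte 0x4d has its top bit
//taken from the neighbouring element. Masking that "spoiled" bit with the byte's own sign restores 0xcd = -51 = -102 >> 1.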
7626
7627 _NEON2SSESTORAGE int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
7628 #define vshrq_n_s16 _mm_srai_epi16
7629
7630 _NEON2SSESTORAGE int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
7631 #define vshrq_n_s32 _mm_srai_epi32
7632
7633 _NEON2SSESTORAGE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
7634 _NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
7635 {
7636 //the SIMD implementation may not be optimal due to the absence of a 64-bit arithmetic shift in x86 SIMD
7637 __m128i c1, signmask,a0, res64;
7638 _NEON2SSE_ALIGN_16 static const uint64_t mask[] = {0x8000000000000000, 0x8000000000000000};
7639 c1 = _mm_cmpeq_epi32(a,a); //0xffffffffffffffff
7640 signmask = _mm_slli_epi64 (c1, (64 - b));
7641 a0 = _mm_or_si128(a, *(__m128i*)mask); //get the first bit
7642 a0 = _MM_CMPEQ_EPI64 (a, a0);
7643 signmask = _mm_and_si128(a0, signmask);
7644 res64 = _mm_srli_epi64 (a, b);
7645 return _mm_or_si128(res64, signmask);
7646 }
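//Illustration of the trick above (hypothetical values): for a negative element the top b bits of the logical shift result
//are zero, and OR-ing in signmask (all ones in those positions, selected only where the sign bit was set) reproduces the
//sign extension. E.g. a = 0xfffffffffffffff0 (-16), b = 4: srli gives 0x0fffffffffffffff, signmask = 0xf000000000000000,
//and the OR yields 0xffffffffffffffff = -1, which is -16 >> 4 arithmetically.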
7647
7648 _NEON2SSESTORAGE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
7649 _NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8
7650 {
7651 //no 8 bit shift available, need the special trick
7652 __m128i mask0, r;
7653 _NEON2SSE_ALIGN_16 static const uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f, 0xff07, 0xff03, 0xff01, 0xff00};
7654 mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
7655 r = _mm_srli_epi16 ( a, b);
7656 return _mm_and_si128 (r, mask0);
7657 }
7658
7659 _NEON2SSESTORAGE uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.s16 q0,q0,#16
7660 #define vshrq_n_u16 _mm_srli_epi16
7661
7662 _NEON2SSESTORAGE uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
7663 #define vshrq_n_u32 _mm_srli_epi32
7664
7665 _NEON2SSESTORAGE uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
7666 #define vshrq_n_u64 _mm_srli_epi64
7667
7668 //*************************** Vector shift left by constant *************************
7669 //*********************************************************************************
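//A minimal usage sketch of the plain (non-saturating) shift left, with arbitrary example values:
//    uint8x8_t v = vdup_n_u8(200);
//    uint8x8_t r = vshl_n_u8(v, 1);    //each lane: (200 << 1) & 0xff = 144, the overflow is simply truncated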
7670 _NEON2SSESTORAGE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
7671 _NEON2SSE_INLINE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VSHL.I8 d0,d0,#0
7672 {
7673 //no 8 bit shift available, go to 16 bit
7674 int8x8_t res64;
7675 __m128i r;
7676 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
7677 r = _mm_slli_epi16 (r, b); //SSE2
7678 r = _mm_shuffle_epi8 (r, *(__m128i*) mask8_16_even_odd); //return to 8 bit, we need 64 bits only
7679 return64(r);
7680 }
7681
7682 _NEON2SSESTORAGE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
7683 _NEON2SSE_INLINE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b)
7684 {
7685 int16x4_t res64;
7686 return64(_mm_slli_epi16(_pM128i(a), b));
7687 }
7688
7689
7690 _NEON2SSESTORAGE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
7691 _NEON2SSE_INLINE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b)
7692 {
7693 int32x2_t res64;
7694 return64(_mm_slli_epi32(_pM128i(a), b));
7695 }
7696
7697
7698 _NEON2SSESTORAGE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
7699 _NEON2SSE_INLINE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b)
7700 {
7701 int64x1_t res64;
7702 return64(_mm_slli_epi64(_pM128i(a), b));
7703 }
7704
7705
7706 _NEON2SSESTORAGE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
7707 _NEON2SSE_INLINE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b)
7708 {
7709 //no 8 bit shift available, go to 16 bit
7710 uint8x8_t res64;
7711 __m128i mask8;
7712 __m128i r;
7713 mask8 = _mm_set1_epi16(0xff);
7714 r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
7715 r = _mm_slli_epi16 (r, b); //SSE2
7716 r = _mm_and_si128(r, mask8); //to avoid saturation
7717 r = _mm_packus_epi16 (r,r); //we need 64 bits only
7718 return64(r);
7719 }
7720
7721 _NEON2SSESTORAGE uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
7722 #define vshl_n_u16 vshl_n_s16
7723
7724
7725 _NEON2SSESTORAGE uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
7726 #define vshl_n_u32 vshl_n_s32
7727
7728 _NEON2SSESTORAGE uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
7729 #define vshl_n_u64 vshl_n_s64
7730
7731 _NEON2SSESTORAGE int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
7732 #define vshlq_n_s8 vshlq_n_u8
7733
7734 _NEON2SSESTORAGE int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
7735 #define vshlq_n_s16 _mm_slli_epi16
7736
7737 _NEON2SSESTORAGE int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
7738 #define vshlq_n_s32 _mm_slli_epi32
7739
7740 _NEON2SSESTORAGE int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
7741 #define vshlq_n_s64 _mm_slli_epi64
7742
7743 _NEON2SSESTORAGE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
7744 _NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b)
7745 {
7746 //no 8 bit shift available, need the special trick
7747 __m128i mask0, r;
7748 _NEON2SSE_ALIGN_16 static const uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff, 0xe0ff, 0xc0ff, 0x80ff, 0xff};
7749 mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
7750 r = _mm_slli_epi16 ( a, b);
7751 return _mm_and_si128 (r, mask0);
7752 }
7753
7754 _NEON2SSESTORAGE uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
7755 #define vshlq_n_u16 vshlq_n_s16
7756
7757 _NEON2SSESTORAGE uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
7758 #define vshlq_n_u32 vshlq_n_s32
7759
7760 _NEON2SSESTORAGE uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
7761 #define vshlq_n_u64 vshlq_n_s64
7762
7763 //************* Vector rounding shift right by constant ******************
7764 //*************************************************************************
//No corresponding x86 intrinsics exist, so some tricks are needed
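//VRSHR #b is a right shift with rounding: (x + (1 << (b-1))) >> b, the sum being computed in wider precision.
//A minimal usage sketch with arbitrary example values:
//    uint8x8_t v = vdup_n_u8(7);
//    uint8x8_t r = vrshr_n_u8(v, 2);    //each lane: (7 + 2) >> 2 = 2, whereas vshr_n_u8(v, 2) gives 1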
7766 _NEON2SSESTORAGE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
7767 _NEON2SSE_INLINE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VRSHR.S8 d0,d0,#8
7768 {
7769 //no 8 bit shift available, go to 16 bit
7770 int8x8_t res64;
7771 __m128i r, maskb;
7772 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
7773 maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
7774 maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
7775 r = _mm_srai_epi16 (r, b);
7776 r = _mm_add_epi16 (r, maskb); //actual rounding
    r = _mm_packs_epi16 (r,r); //we need 64 bits only
7778 return64(r);
7779 }
7780
7781 _NEON2SSESTORAGE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
7782 _NEON2SSE_INLINE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b)
7783 {
7784 int16x4_t res64;
7785 return64(vrshrq_n_s16(_pM128i(a), b));
7786 }
7787
7788
7789 _NEON2SSESTORAGE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
7790 _NEON2SSE_INLINE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b)
7791 {
7792 int32x2_t res64;
7793 return64(vrshrq_n_s32(_pM128i(a), b));
7794 }
7795
7796
7797 _NEON2SSESTORAGE int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
7798 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
7799 {
7800 //serial solution is faster
7801 int64x1_t res;
7802 int64_t a_i64 = *( int64_t*)&a;
7803 if(b==64) {
        res.m64_i64[0] = 0; //for some compilers rounding happens and we need to use (a_i64 & _SIGNBIT64) >> 63;
7805 } else {
7806 int64_t maskb = a_i64 & (( int64_t)1 << (b - 1));
7807 res.m64_i64[0] = (a_i64 >> b) + (maskb >> (b - 1));
7808 }
7809 return res;
7810 }
7811
7812 _NEON2SSESTORAGE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
7813 _NEON2SSE_INLINE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VRSHR.U8 d0,d0,#8
7814 {
    //no 8 bit shift available, go to 16 bit; this solution may not be optimal compared with the serial one
7816 uint8x8_t res64;
7817 __m128i r, maskb;
7818 r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
7819 maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
7820 maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
7821 r = _mm_srli_epi16 (r, b);
7822 r = _mm_add_epi16 (r, maskb); //actual rounding
    r = _mm_packus_epi16 (r,r); //we need 64 bits only
7824 return64(r);
7825 }
7826
7827 _NEON2SSESTORAGE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.s16 d0,d0,#16
7828 _NEON2SSE_INLINE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b)
7829 {
7830 uint16x4_t res64;
7831 return64(vrshrq_n_u16(_pM128i(a), b));
7832 }
7833
7834
7835 _NEON2SSESTORAGE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
7836 _NEON2SSE_INLINE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b)
7837 {
7838 uint32x2_t res64;
7839 return64(vrshrq_n_u32(_pM128i(a), b));
7840 }
7841
7842
7843 _NEON2SSESTORAGE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
7844 _NEON2SSE_INLINE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
7845 {
7846 uint64x1_t res64;
7847 return64(vrshrq_n_u64(_pM128i(a), b));
7848 }
7849
7850 _NEON2SSESTORAGE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
7851 _NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8
7852 {
7853 //no 8 bit shift available, go to 16 bit trick
7854 __m128i r, mask1, maskb;
    _NEON2SSE_ALIGN_16 static const uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // bit (b-1), the rounding bit, set in each byte
    r = vshrq_n_s8 (a, b);
    mask1 = _mm_set1_epi16(mask2b[b]); // bit (b-1) set in each byte of the 16 bit lanes, needed for rounding
    maskb = _mm_and_si128(a, mask1); //the rounding bit of each byte or 0
7859 maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1
7860 return _mm_add_epi8(r, maskb); //actual rounding
7861 }
7862
7863 _NEON2SSESTORAGE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
7864 _NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
7865 {
7866 __m128i maskb, r;
7867 maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
7868 maskb = _mm_srli_epi16(maskb, 15); //1 or 0
7869 r = _mm_srai_epi16 (a, b);
7870 return _mm_add_epi16 (r, maskb); //actual rounding
7871 }
7872
7873 _NEON2SSESTORAGE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
7874 _NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
7875 {
7876 __m128i maskb, r;
7877 maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
7878 maskb = _mm_srli_epi32 (maskb,31); //1 or 0
7879 r = _mm_srai_epi32(a, b);
7880 return _mm_add_epi32 (r, maskb); //actual rounding
7881 }
7882
7883 _NEON2SSESTORAGE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
7884 _NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
7885 {
    //this solution may not be optimal compared with a serial one
7887 __m128i maskb;
7888 int64x2_t r;
7889 maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
7890 maskb = _mm_srli_epi64 (maskb,63); //1 or 0
7891 r = vshrq_n_s64(a, b);
7892 return _mm_add_epi64 (r, maskb); //actual rounding
7893 }
7894
7895 _NEON2SSESTORAGE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
7896 _NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8
7897 {
7898 //no 8 bit shift available, go to 16 bit trick
7899 __m128i r, mask1, maskb;
    _NEON2SSE_ALIGN_16 static const uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // bit (b-1), the rounding bit, set in each byte
    r = vshrq_n_u8 (a, b);
    mask1 = _mm_set1_epi16(mask2b[b]); // bit (b-1) set in each byte of the 16 bit lanes, needed for rounding
    maskb = _mm_and_si128(a, mask1); //the rounding bit of each byte or 0
7904 maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1
7905 return _mm_add_epi8(r, maskb); //actual rounding
7906 }
7907
7908 _NEON2SSESTORAGE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.s16 q0,q0,#16
7909 _NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
7910 {
7911 __m128i maskb, r;
7912 maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
7913 maskb = _mm_srli_epi16(maskb, 15); //1 or 0
7914 r = _mm_srli_epi16 (a, b);
7915 return _mm_add_epi16 (r, maskb); //actual rounding
7916 }
7917
7918 _NEON2SSESTORAGE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
7919 _NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
7920 {
7921 __m128i maskb, r;
7922 maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
7923 maskb = _mm_srli_epi32 (maskb,31); //1 or 0
7924 r = _mm_srli_epi32(a, b);
7925 return _mm_add_epi32 (r, maskb); //actual rounding
7926 }
7927
7928 _NEON2SSESTORAGE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
7929 _NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b)
7930 {
    //this solution may not be optimal compared with a serial one
7932 __m128i maskb, r;
7933 maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
7934 maskb = _mm_srli_epi64 (maskb,63); //1 or 0
7935 r = _mm_srli_epi64(a, b);
7936 return _mm_add_epi64 (r, maskb); //actual rounding
7937 }
7938
7939 //************* Vector shift right by constant and accumulate *********
7940 //*********************************************************************
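//VSRA accumulates the shifted second operand into the first one: a + (b >> c).
//A minimal usage sketch with arbitrary example values:
//    uint8x8_t acc = vdup_n_u8(10);
//    uint8x8_t x   = vdup_n_u8(200);
//    uint8x8_t r   = vsra_n_u8(acc, x, 4);    //each lane: 10 + (200 >> 4) = 22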
7941 _NEON2SSESTORAGE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
7942 _NEON2SSE_INLINE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VSRA.S8 d0,d0,#8
7943 {
7944 int8x8_t shift;
7945 shift = vshr_n_s8(b, c);
7946 return vadd_s8( a, shift);
7947 }
7948
7949 _NEON2SSESTORAGE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
7950 _NEON2SSE_INLINE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VSRA.S16 d0,d0,#16
7951 {
7952 int16x4_t shift;
7953 shift = vshr_n_s16( b, c);
7954 return vadd_s16(a, shift);
7955 }
7956
7957 _NEON2SSESTORAGE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
7958 _NEON2SSE_INLINE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VSRA.S32 d0,d0,#32
7959 {
    //may not be optimal compared with the serial execution
7961 int32x2_t shift;
7962 shift = vshr_n_s32(b, c);
7963 return vadd_s32( a, shift);
7964 }
7965
7966 _NEON2SSESTORAGE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
7967 _NEON2SSE_INLINE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
7968 {
    //may not be optimal compared with a serial solution
7970 int64x1_t shift;
7971 shift = vshr_n_s64(b, c);
7972 return vadd_s64( a, shift);
7973 }
7974
7975 _NEON2SSESTORAGE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
7976 _NEON2SSE_INLINE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VSRA.U8 d0,d0,#8
7977 {
7978 uint8x8_t shift;
7979 shift = vshr_n_u8(b, c);
7980 return vadd_u8(a, shift);
7981 }
7982
7983 _NEON2SSESTORAGE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.s16 d0,d0,#16
7984 _NEON2SSE_INLINE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VSRA.s16 d0,d0,#16
7985 {
7986 uint16x4_t shift;
7987 shift = vshr_n_u16(b, c);
7988 return vadd_u16(a,shift);
7989 }
7990
7991 _NEON2SSESTORAGE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
7992 _NEON2SSE_INLINE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VSRA.U32 d0,d0,#32
7993 {
    //may not be optimal compared with the serial execution
7995 uint32x2_t shift;
7996 shift = vshr_n_u32(b, c);
7997 return vadd_u32( a, shift);
7998 }
7999
8000 _NEON2SSESTORAGE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
8001 _NEON2SSE_INLINE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c) // VSRA.U64 d0,d0,#64
8002 {
    //may not be optimal compared with the serial execution
8004 uint64x1_t shift;
8005 shift = vshr_n_u64(b, c);
8006 return vadd_u64(a, shift);
8007 }
8008
8009 _NEON2SSESTORAGE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
8010 _NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8
8011 {
8012 int8x16_t shift;
8013 shift = vshrq_n_s8(b, c);
8014 return vaddq_s8(a, shift);
8015 }
8016
8017 _NEON2SSESTORAGE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
8018 _NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16
8019 {
8020 int16x8_t shift;
8021 shift = vshrq_n_s16(b, c);
8022 return vaddq_s16(a, shift);
8023 }
8024
8025 _NEON2SSESTORAGE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
8026 _NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32
8027 {
8028 int32x4_t shift;
8029 shift = vshrq_n_s32(b, c);
8030 return vaddq_s32(a, shift);
8031 }
8032
8033 _NEON2SSESTORAGE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
8034 _NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64
8035 {
8036 int64x2_t shift;
8037 shift = vshrq_n_s64(b, c);
8038 return vaddq_s64( a, shift);
8039 }
8040
8041 _NEON2SSESTORAGE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
8042 _NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8
8043 {
8044 uint8x16_t shift;
8045 shift = vshrq_n_u8(b, c);
8046 return vaddq_u8(a, shift);
8047 }
8048
8049 _NEON2SSESTORAGE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.s16 q0,q0,#16
8050 _NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VSRA.s16 q0,q0,#16
8051 {
8052 uint16x8_t shift;
8053 shift = vshrq_n_u16(b, c);
8054 return vaddq_u16(a, shift);
8055 }
8056
8057 _NEON2SSESTORAGE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
8058 _NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32
8059 {
8060 uint32x4_t shift;
8061 shift = vshrq_n_u32(b, c);
8062 return vaddq_u32(a, shift);
8063 }
8064
8065 _NEON2SSESTORAGE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
8066 _NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64
8067 {
8068 uint64x2_t shift;
8069 shift = vshrq_n_u64(b, c);
8070 return vaddq_u64(a, shift);
8071 }
8072
8073 //************* Vector rounding shift right by constant and accumulate ****************************
8074 //************************************************************************************************
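//VRSRA is the same accumulation as VSRA but uses the rounding shift: a + ((b + (1 << (c-1))) >> c).
//For instance, with arbitrary example values, vrsra_n_u8(vdup_n_u8(10), vdup_n_u8(7), 2) yields 10 + 2 = 12 in every lane.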
8075 _NEON2SSESTORAGE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
8076 _NEON2SSE_INLINE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VRSRA.S8 d0,d0,#8
8077 {
8078 int8x8_t shift;
8079 shift = vrshr_n_s8(b, c);
8080 return vadd_s8( a, shift);
8081 }
8082
8083 _NEON2SSESTORAGE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
8084 _NEON2SSE_INLINE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VRSRA.S16 d0,d0,#16
8085 {
8086 int16x4_t shift;
8087 shift = vrshr_n_s16( b, c);
8088 return vadd_s16(a, shift);
8089 }
8090
8091 _NEON2SSESTORAGE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
8092 _NEON2SSE_INLINE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VRSRA.S32 d0,d0,#32
8093 {
    //may not be optimal compared with the serial execution
8095 int32x2_t shift;
8096 shift = vrshr_n_s32(b, c);
8097 return vadd_s32( a, shift);
8098 }
8099
8100 _NEON2SSESTORAGE int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
8101 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
8102 {
8103 int64x1_t shift;
8104 shift = vrshr_n_s64(b, c);
8105 return vadd_s64( a, shift);
8106 }
8107
8108 _NEON2SSESTORAGE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
8109 _NEON2SSE_INLINE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VRSRA.U8 d0,d0,#8
8110 {
8111 uint8x8_t shift;
8112 shift = vrshr_n_u8(b, c);
8113 return vadd_u8(a, shift);
8114 }
8115
8116 _NEON2SSESTORAGE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.s16 d0,d0,#16
8117 _NEON2SSE_INLINE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VRSRA.s16 d0,d0,#16
8118 {
8119 uint16x4_t shift;
8120 shift = vrshr_n_u16(b, c);
8121 return vadd_u16(a,shift);
8122 }
8123
8124 _NEON2SSESTORAGE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
8125 _NEON2SSE_INLINE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VRSRA.U32 d0,d0,#32
8126 {
    //may not be optimal compared with the serial execution
8128 uint32x2_t shift;
8129 shift = vrshr_n_u32(b, c);
8130 return vadd_u32( a, shift);
8131 }
8132
8133 _NEON2SSESTORAGE uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
8134 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
8135 {
    //may not be optimal compared with the serial execution
8137 uint64x1_t shift;
8138 shift = vrshr_n_u64(b, c);
8139 return vadd_u64( a, shift);
8140 }
8141
8142 _NEON2SSESTORAGE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
8143 _NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8
8144 {
8145 int8x16_t shift;
8146 shift = vrshrq_n_s8(b, c);
8147 return vaddq_s8(a, shift);
8148 }
8149
8150 _NEON2SSESTORAGE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
8151 _NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16
8152 {
8153 int16x8_t shift;
8154 shift = vrshrq_n_s16(b, c);
8155 return vaddq_s16(a, shift);
8156 }
8157
8158 _NEON2SSESTORAGE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
8159 _NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32
8160 {
8161 int32x4_t shift;
8162 shift = vrshrq_n_s32(b, c);
8163 return vaddq_s32(a, shift);
8164 }
8165
8166 _NEON2SSESTORAGE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
8167 _NEON2SSE_INLINE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
8168 {
8169 int64x2_t shift;
8170 shift = vrshrq_n_s64(b, c);
8171 return vaddq_s64(a, shift);
8172 }
8173
8174 _NEON2SSESTORAGE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
8175 _NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8
8176 {
8177 uint8x16_t shift;
8178 shift = vrshrq_n_u8(b, c);
8179 return vaddq_u8(a, shift);
8180 }
8181
8182 _NEON2SSESTORAGE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.s16 q0,q0,#16
8183 _NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.s16 q0,q0,#16
8184 {
8185 uint16x8_t shift;
8186 shift = vrshrq_n_u16(b, c);
8187 return vaddq_u16(a, shift);
8188 }
8189
8190 _NEON2SSESTORAGE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
8191 _NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32
8192 {
8193 uint32x4_t shift;
8194 shift = vrshrq_n_u32(b, c);
8195 return vaddq_u32(a, shift);
8196 }
8197
8198 _NEON2SSESTORAGE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
8199 _NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c)
8200 {
8201 uint64x2_t shift;
8202 shift = vrshrq_n_u64(b, c);
8203 return vaddq_u64(a, shift);
8204 }
8205
8206 //**********************Vector saturating shift left by constant *****************************
8207 //********************************************************************************************
//we don't check const ranges, assuming they are met
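//VQSHL shifts left and saturates the result to the full range of the element type.
//A minimal usage sketch with arbitrary example values:
//    int8x8_t  s = vqshl_n_s8(vdup_n_s8(100), 1);    //100 << 1 = 200 > 127, every lane saturates to 127
//    uint8x8_t u = vqshl_n_u8(vdup_n_u8(200), 1);    //200 << 1 = 400 > 255, every lane saturates to 255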
8209 _NEON2SSESTORAGE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
8210 _NEON2SSE_INLINE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHL.S8 d0,d0,#0
8211 {
8212 //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
8213 int8x8_t res64;
8214 __m128i a128, r128;
8215 a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
8216 r128 = _mm_slli_epi16 (a128, b);
8217 r128 = _mm_packs_epi16 (r128,r128); //saturated s8, use 64 low bits only
8218 return64(r128);
8219 }
8220
8221 _NEON2SSESTORAGE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
8222 _NEON2SSE_INLINE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHL.S16 d0,d0,#0
8223 {
8224 // go to 32 bit to get the auto saturation (in packs function)
8225 int16x4_t res64;
8226 __m128i a128, r128;
8227 a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
8228 r128 = _mm_slli_epi32 (a128, b); //shift_res
8229 r128 = _mm_packs_epi32 (r128,r128); //saturated s16, use 64 low bits only
8230 return64(r128);
8231 }
8232
8233 _NEON2SSESTORAGE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
8234 _NEON2SSE_INLINE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b)
8235 {
8236 //serial execution may be faster
8237 int32x2_t res64;
8238 return64(vqshlq_n_s32 (_pM128i(a), b));
8239 }
8240
8241
8242 _NEON2SSESTORAGE int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
8243 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8244 {
8245 // no effective SIMD solution here
8246 int64x1_t res;
8247 int64_t bmask;
8248 int64_t a_i64 = *( int64_t*)&a;
8249 bmask = ( int64_t)1 << (63 - b); //positive
8250 if (a_i64 >= bmask) {
8251 res.m64_i64[0] = ~(_SIGNBIT64);
8252 } else {
8253 res.m64_i64[0] = (a_i64 <= -bmask) ? _SIGNBIT64 : a_i64 << b;
8254 }
8255 return res;
8256 }
8257
8258
8259 _NEON2SSESTORAGE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
8260 _NEON2SSE_INLINE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b) // VQSHL.U8 d0,d0,#0
8261 {
8262 //no 8 bit shift available in IA32 SIMD, go to 16 bit
8263 uint8x8_t res64;
8264 __m128i a128, r128;
8265 a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
8266 r128 = _mm_slli_epi16 (a128, b); //shift_res
8267 r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
8268 return64(r128);
8269 }
8270
8271 _NEON2SSESTORAGE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.s16 d0,d0,#0
8272 _NEON2SSE_INLINE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b) // VQSHL.s16 d0,d0,#0
8273 {
8274 // go to 32 bit to get the auto saturation (in packus function)
8275 uint16x4_t res64;
8276 __m128i a128, r128;
8277 a128 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE 4.1
8278 r128 = _mm_slli_epi32 (a128, b); //shift_res
    r128 = _MM_PACKUS1_EPI32 (r128); //saturated u16
8280 return64(r128);
8281 }
8282
8283 _NEON2SSESTORAGE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
8284 _NEON2SSE_INLINE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b)
8285 {
8286 uint32x2_t res64;
8287 return64(vqshlq_n_u32(_pM128i(a), b));
8288 }
8289
8290 _NEON2SSESTORAGE uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
8291 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8292 {
8293 // no effective SIMD solution here
8294 uint64x1_t res;
8295 uint64_t bmask;
8296 uint64_t a_i64 = *(uint64_t*)&a;
8297 bmask = ( uint64_t)1 << (64 - b);
8298 res.m64_u64[0] = (a_i64 >= bmask)&&(b>0) ? 0xffffffffffffffff : a_i64 << b; //if b=0 we are fine with any a
8299 return res;
8300 }
8301
8302 _NEON2SSESTORAGE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
8303 _NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0
8304 {
8305 // go to 16 bit to get the auto saturation (in packs function)
8306 __m128i a128, r128_1, r128_2;
8307 a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
8308 r128_1 = _mm_slli_epi16 (a128, b);
8309 //swap hi and low part of a128 to process the remaining data
8310 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8311 a128 = _MM_CVTEPI8_EPI16 (a128);
8312 r128_2 = _mm_slli_epi16 (a128, b);
8313 return _mm_packs_epi16 (r128_1, r128_2); //saturated s8
8314 }
8315
8316 _NEON2SSESTORAGE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
8317 _NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0
8318 {
    // a manual saturation solution looks LESS optimal than the 32-bit conversion one used here
8320 // go to 32 bit to get the auto saturation (in packs function)
8321 __m128i a128, r128_1, r128_2;
8322 a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
8323 r128_1 = _mm_slli_epi32 (a128, b); //shift_res
8324 //swap hi and low part of a128 to process the remaining data
8325 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8326 a128 = _MM_CVTEPI16_EPI32 (a128);
8327 r128_2 = _mm_slli_epi32 (a128, b);
8328 return _mm_packs_epi32 (r128_1, r128_2); //saturated s16
8329 }
8330
8331 _NEON2SSESTORAGE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
8332 _NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0
8333 {
    // no ready-made saturation option is available here (unlike the 8 and 16 bit cases above), special tricks are necessary
8335 __m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask;
8336 c1 = _mm_cmpeq_epi32(a,a); //0xff..ff
    maskA = _mm_srli_epi32(c1, b + 1); //mask for positive numbers: (b+1) zeros then (31-b) ones, the largest value that shifts left by b without overflow
8338 saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff...ff if we need saturation, 0 otherwise
8339 c7ffffff_mask = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff when needed and zeros if not
8340 shift_res = _mm_slli_epi32 (a, b);
8341 shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
8342 //result with positive numbers saturated
8343 shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask);
8344 //treat negative numbers
    maskA = _mm_slli_epi32(c1, 31 - b); //mask for negative numbers: (b+1) ones then (31-b) zeros, the most negative value that shifts left by b without overflow
8346 saturation_mask = _mm_cmpgt_epi32 (maskA,a); //0xff...ff if we need saturation, 0 otherwise
8347 c7ffffff_mask = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 when needed and zeros if not
8348 shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
8349 return _mm_or_si128 (c7ffffff_mask, shift_res_mask);
8350 }
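//Note on vqshlq_n_s32 above: a positive lane saturates to 0x7fffffff when a > 2^(31-b) - 1, a negative lane saturates
//to 0x80000000 when a < -2^(31-b), otherwise the plain shift result is kept; the masks above select between these cases.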
8351
8352 _NEON2SSESTORAGE int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
8353 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8354 {
8355 // no effective SIMD solution here
8356 _NEON2SSE_ALIGN_16 int64_t atmp[2], res[2];
8357 int64_t bmask;
8358 int i;
8359 bmask = ( int64_t)1 << (63 - b); //positive
8360 _mm_store_si128((__m128i*)atmp, a);
8361 for (i = 0; i<2; i++) {
8362 if (atmp[i] >= bmask) {
8363 res[i] = ~(_SIGNBIT64);
8364 } else {
8365 res[i] = (atmp[i] <= -bmask) ? _SIGNBIT64 : atmp[i] << b;
8366 }
8367 }
8368 return _mm_load_si128((__m128i*)res);
8369 }
8370
8371 _NEON2SSESTORAGE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
8372 _NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0
8373 {
8374 // go to 16 bit to get the auto saturation (in packs function)
8375 __m128i a128, r128_1, r128_2;
8376 a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1
8377 r128_1 = _mm_slli_epi16 (a128, b);
8378 //swap hi and low part of a128 to process the remaining data
8379 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8380 a128 = _MM_CVTEPU8_EPI16 (a128);
8381 r128_2 = _mm_slli_epi16 (a128, b);
8382 return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
8383 }
8384
8385 _NEON2SSESTORAGE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.s16 q0,q0,#0
8386 _NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.s16 q0,q0,#0
8387 {
    // a manual saturation solution looks more optimal than the 32-bit conversion one here
8389 __m128i cb, c8000, a_signed, saturation_mask, shift_res;
8390 cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 );
8391 c8000 = _mm_set1_epi16 ((int16_t)0x8000);
    //there is no unsigned 16-bit comparison in SSE, only a signed one, so a trick is needed
8393 a_signed = _mm_sub_epi16(a, c8000); //go to signed
8394 saturation_mask = _mm_cmpgt_epi16 (a_signed, cb);
8395 shift_res = _mm_slli_epi16 (a, b);
8396 return _mm_or_si128 (shift_res, saturation_mask);
8397 }
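//Note on vqshlq_n_u16 above: both the data and the threshold 2^(16-b) - 1 are biased by -0x8000 so that the signed
//comparison behaves like an unsigned one; the resulting all-ones mask is ORed in, forcing saturated lanes to 0xffff.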
8398
8399 _NEON2SSESTORAGE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
8400 _NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0
8401 {
8402 // manual saturation solution, no 64 bit saturation option, the serial version may be faster
8403 __m128i cb, c80000000, a_signed, saturation_mask, shift_res;
8404 cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 );
8405 c80000000 = _mm_set1_epi32 (0x80000000);
    //there is no unsigned 32-bit comparison in SSE, only a signed one, so a trick is needed
8407 a_signed = _mm_sub_epi32(a, c80000000); //go to signed
8408 saturation_mask = _mm_cmpgt_epi32 (a_signed, cb);
8409 shift_res = _mm_slli_epi32 (a, b);
8410 return _mm_or_si128 (shift_res, saturation_mask);
8411 }
8412
8413 _NEON2SSESTORAGE uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
8414 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8415 {
8416 // no effective SIMD solution here
8417 _NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2];
8418 uint64_t bmask;
8419 int i;
8420 bmask = ( uint64_t)1 << (64 - b);
8421 _mm_store_si128((__m128i*)atmp, a);
8422 for (i = 0; i<2; i++) {
8423 res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a
8424 }
8425 return _mm_load_si128((__m128i*)res);
8426 }
8427
8428 //**************Vector signed->unsigned saturating shift left by constant *************
8429 //*************************************************************************************
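//VQSHLU takes signed input and produces an unsigned result: negative lanes saturate to 0, overflowing lanes to the type maximum.
//For instance, with arbitrary example values, vqshlu_n_s8(vdup_n_s8(-5), 2) yields 0, while vqshlu_n_s8(vdup_n_s8(50), 2) yields 200.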
8430 _NEON2SSESTORAGE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
8431 _NEON2SSE_INLINE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHLU.S8 d0,d0,#0
8432 {
8433 //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
8434 uint8x8_t res64;
8435 __m128i a128, r128;
8436 a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
8437 r128 = _mm_slli_epi16 (a128, b);
8438 r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
8439 return64(r128);
8440 }
8441
8442 _NEON2SSESTORAGE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
8443 _NEON2SSE_INLINE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHLU.S16 d0,d0,#0
8444 {
8445 uint16x4_t res64;
8446 __m128i a128, r128;
8447 a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
8448 r128 = _mm_slli_epi32 (a128, b); //shift_res
    r128 = _MM_PACKUS1_EPI32 (r128); //saturated u16, use 64 low bits only
8450 return64(r128);
8451 }
8452
8453 _NEON2SSESTORAGE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
_NEON2SSE_INLINE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b)
{
    uint32x2_t res64;
8457 return64( vqshluq_n_s32(_pM128i(a), b));
8458 }
8459
8460 _NEON2SSESTORAGE uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
8461 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) // no effective SIMD solution here, serial execution looks faster
8462 {
8463 uint64x1_t res;
8464 uint64_t limit;
8465 if (a.m64_i64[0]<=0) {
8466 res.m64_u64[0] = 0;
8467 } else {
8468 limit = (uint64_t) 1 << (64 - b);
        res.m64_u64[0] = ( ((uint64_t)a.m64_i64[0]) >= limit) ? ~((uint64_t)0) : (uint64_t)a.m64_i64[0] << b;
8470 }
8471 return res;
8472 }
8473
8474 _NEON2SSESTORAGE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
8475 _NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0
8476 {
8477 __m128i a128, r128_1, r128_2;
8478 a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
8479 r128_1 = _mm_slli_epi16 (a128, b);
8480 //swap hi and low part of a128 to process the remaining data
8481 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8482 a128 = _MM_CVTEPI8_EPI16 (a128);
8483 r128_2 = _mm_slli_epi16 (a128, b);
8484 return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
8485 }
8486
8487 _NEON2SSESTORAGE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
8488 _NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0
8489 {
    // a manual saturation solution looks LESS optimal than the 32-bit conversion one used here
8491 __m128i a128, r128_1, r128_2;
8492 a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
8493 r128_1 = _mm_slli_epi32 (a128, b); //shift_res
8494 //swap hi and low part of a128 to process the remaining data
8495 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
8496 a128 = _MM_CVTEPI16_EPI32 (a128);
8497 r128_2 = _mm_slli_epi32 (a128, b);
    return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated u16
8499 }
8500
8501 _NEON2SSESTORAGE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
8502 _NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0
8503 {
    //this solution may not be optimal compared with the serial one
8505 __m128i zero, maskA, maskGT0, a0, a_masked, a_shift;
8506 zero = _mm_setzero_si128();
8507 maskA = _mm_cmpeq_epi32(a, a);
8508 maskA = _mm_slli_epi32(maskA,(32 - b)); // b ones and (32-b)zeros
8509 //saturate negative numbers to zero
8510 maskGT0 = _mm_cmpgt_epi32 (a, zero); // //0xffffffff if positive number and zero otherwise (negative numbers)
8511 a0 = _mm_and_si128 (a, maskGT0); //negative are zeros now
8512 //saturate positive to 0xffffffff
8513 a_masked = _mm_and_si128 (a0, maskA);
8514 a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation necessary 0 otherwise
8515 a_shift = _mm_slli_epi32 (a0, b);
8516 return _mm_or_si128 (a_shift, a_masked); //actual saturation
8517 }
8518
8519 _NEON2SSESTORAGE uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
8520 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
8521 {
8522 // no effective SIMD solution here, serial execution looks faster
8523 _NEON2SSE_ALIGN_16 int64_t atmp[2];
8524 _NEON2SSE_ALIGN_16 uint64_t res[2];
8525 uint64_t limit;
8526 int i;
8527 _mm_store_si128((__m128i*)atmp, a);
8528 for (i = 0; i<2; i++) {
8529 if (atmp[i]<=0) {
8530 res[i] = 0;
8531 } else {
8532 limit = (uint64_t) 1 << (64 - b);
            res[i] = ( ((uint64_t)atmp[i]) >= limit) ? ~((uint64_t)0) : (uint64_t)atmp[i] << b;
8534 }
8535 }
8536 return _mm_load_si128((__m128i*)res);
8537 }
8538
8539 //************** Vector narrowing shift right by constant **************
8540 //**********************************************************************
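//VSHRN shifts right and keeps only the low half of each lane, truncating without saturation.
//For instance, with an arbitrary example value, vshrn_n_s16(vdupq_n_s16(1000), 2) yields (int8_t)(1000 >> 2) = (int8_t)250 = -6 in every lane.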
8541 _NEON2SSESTORAGE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
8542 _NEON2SSE_INLINE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
8543 {
8544 int8x8_t res64;
8545 __m128i r16;
8546 r16 = vshrq_n_s16(a,b);
8547 r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8548 return64(r16);
8549 }
8550
8551 _NEON2SSESTORAGE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
8552 _NEON2SSE_INLINE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
8553 {
8554 int16x4_t res64;
8555 __m128i r32;
8556 r32 = vshrq_n_s32(a,b);
8557 r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask8_32_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8558 return64(r32);
8559 }
8560
8561 _NEON2SSESTORAGE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
8562 _NEON2SSE_INLINE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
8563 {
8564 int32x2_t res64;
8565 __m128i r64;
8566 r64 = vshrq_n_s64(a,b);
8567 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8568 return64(r64);
8569 }
8570
8571 _NEON2SSESTORAGE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
8572 _NEON2SSE_INLINE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
8573 {
8574 uint8x8_t res64;
8575 __m128i mask, r16;
8576 mask = _mm_set1_epi16(0xff);
8577 r16 = vshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8578 r16 = _mm_and_si128(r16, mask); //to avoid saturation
8579 r16 = _mm_packus_epi16 (r16,r16); //narrow, use low 64 bits only
8580 return64(r16);
8581 }
8582
8583 _NEON2SSESTORAGE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
8584 _NEON2SSE_INLINE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
8585 {
8586 uint16x4_t res64;
8587 __m128i mask, r32;
8588 mask = _mm_set1_epi32(0xffff);
8589 r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8590 r32 = _mm_and_si128(r32, mask); //to avoid saturation
8591 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
8592 return64(r32);
8593 }
8594
8595 _NEON2SSESTORAGE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
8596 _NEON2SSE_INLINE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
8597 {
8598 uint32x2_t res64;
8599 __m128i r64;
8600 r64 = vshrq_n_u64(a,b);
8601 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8602 return64(r64);
8603 }
8604
8605 //************** Vector signed->unsigned narrowing saturating shift right by constant ********
8606 //*********************************************************************************************
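//VQSHRUN shifts a signed value right and saturates it to the unsigned narrower type, so negative lanes become 0.
//For instance, with arbitrary example values, vqshrun_n_s16(vdupq_n_s16(-100), 1) yields 0 and vqshrun_n_s16(vdupq_n_s16(1000), 1) yields 255.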
8607 _NEON2SSESTORAGE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
8608 _NEON2SSE_INLINE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRUN.S16 d0,q0,#8
8609 {
8610 uint8x8_t res64;
8611 __m128i r16;
8612 r16 = vshrq_n_s16(a,b);
8613 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only
8614 return64(r16);
8615 }
8616
8617 _NEON2SSESTORAGE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
8618 _NEON2SSE_INLINE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRUN.S32 d0,q0,#16
8619 {
8620 uint16x4_t res64;
8621 __m128i r32;
8622 r32 = vshrq_n_s32(a,b);
8623 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow(signed to unsigned), use low 64 bits only
8624 return64(r32);
8625 }
8626
8627 _NEON2SSESTORAGE uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
8628 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
8629 {
8630 _NEON2SSE_ALIGN_16 int64_t atmp[2];
8631 uint32x2_t res;
8632 int64_t res64;
8633 _mm_store_si128((__m128i*)atmp, a);
8634 if (atmp[0] < 0) {
8635 res.m64_u32[0] = 0;
8636 } else {
8637 res64 = (atmp[0] >> b);
8638 res.m64_u32[0] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t) res64;
8639 }
8640 if (atmp[1] < 0) {
8641 res.m64_u32[1] = 0;
8642 } else {
8643 res64 = (atmp[1] >> b);
8644 res.m64_u32[1] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t)res64;
8645 }
8646 return res;
8647 }
8648
8649 //**** Vector signed->unsigned rounding narrowing saturating shift right by constant *****
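//VQRSHRUN is the rounding variant of VQSHRUN; for instance, with an arbitrary example value, vqrshrun_n_s16(vdupq_n_s16(5), 1) yields (5 + 1) >> 1 = 3.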
8650 _NEON2SSESTORAGE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
8651 _NEON2SSE_INLINE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRUN.S16 d0,q0,#8
8652 {
    //this solution may not be optimal compared with the serial one
8654 __m128i r16;
8655 uint8x8_t res64;
8656 r16 = vrshrq_n_s16(a,b);
8657 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only
8658 return64(r16);
8659 }
8660
8661 _NEON2SSESTORAGE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
8662 _NEON2SSE_INLINE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRUN.S32 d0,q0,#16
8663 {
    //this solution may not be optimal compared with the serial one
8665 __m128i r32;
8666 uint16x4_t res64;
8667 r32 = vrshrq_n_s32(a,b);
8668 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow (signed to unsigned), use low 64 bits only
8669 return64(r32);
8670 }
8671
8672 _NEON2SSESTORAGE uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
8673 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
8674 {
8675 _NEON2SSE_ALIGN_16 int64_t atmp[2];
8676 uint32x2_t res;
8677 int64_t res64;
8678 _mm_store_si128((__m128i*)atmp, a);
8679 if (atmp[0] < 0) {
8680 res.m64_u32[0] = 0;
8681 } else {
8682 res64 = (atmp[0] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1) );
8683 res.m64_u32[0] = (uint32_t) ((res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64);
8684 }
8685 if (atmp[1] < 0) {
8686 res.m64_u32[1] = 0;
8687 } else {
        res64 = (atmp[1] >> b) + ( (atmp[1] & ((int64_t)1 << (b - 1))) >> (b - 1) );
8689 res.m64_u32[1] = (uint32_t)((res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64);
8690 }
8691 return res;
8692 }
8693
8694 //***** Vector narrowing saturating shift right by constant ******
8695 //*****************************************************************
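//VQSHRN shifts right and saturates to the signed narrower type, in contrast to the truncating VSHRN above.
//For instance, with an arbitrary example value, vqshrn_n_s16(vdupq_n_s16(1000), 2) yields min(1000 >> 2, 127) = 127 in every lane.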
8696 _NEON2SSESTORAGE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
8697 _NEON2SSE_INLINE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRN.S16 d0,q0,#8
8698 {
8699 int8x8_t res64;
8700 __m128i r16;
8701 r16 = vshrq_n_s16(a,b);
8702 r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
8703 return64(r16);
8704 }
8705
8706 _NEON2SSESTORAGE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
8707 _NEON2SSE_INLINE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRN.S32 d0,q0,#16
8708 {
8709 int16x4_t res64;
8710 __m128i r32;
8711 r32 = vshrq_n_s32(a,b);
8712 r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only
8713 return64(r32);
8714 }
8715
8716 _NEON2SSESTORAGE int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
8717 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
8718 {
8719 //no optimal SIMD solution found
8720 _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2];
8721 int32x2_t res;
8722 _mm_store_si128((__m128i*)atmp, a);
8723 res64[0] = (atmp[0] >> b);
8724 res64[1] = (atmp[1] >> b);
8725 if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
8726 if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
8727 if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
8728 if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
8729 res.m64_i32[0] = (int32_t)res64[0];
8730 res.m64_i32[1] = (int32_t)res64[1];
8731 return res;
8732 }
8733
8734 _NEON2SSESTORAGE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.s16 d0,q0,#8
8735 _NEON2SSE_INLINE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQSHRN.s16 d0,q0,#8
8736 {
8737 uint8x8_t res64;
8738 __m128i r16;
8739 r16 = vshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8740 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
8741 return64(r16);
8742 }
8743
8744 _NEON2SSESTORAGE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
8745 _NEON2SSE_INLINE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQSHRN.U32 d0,q0,#16
8746 {
8747 uint16x4_t res64;
8748 __m128i r32;
    r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8750 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
8751 return64(r32);
8752 }
8753
8754 _NEON2SSESTORAGE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
8755 _NEON2SSE_INLINE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
8756 {
8757 //serial solution may be faster
8758 uint32x2_t res64;
8759 __m128i r64, res_hi, zero;
8760 zero = _mm_setzero_si128();
8761 r64 = vshrq_n_u64(a,b);
8762 res_hi = _mm_srli_epi64(r64, 32);
8763 res_hi = _mm_cmpgt_epi32(res_hi, zero);
8764 r64 = _mm_or_si128(r64, res_hi);
8765 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8766 return64(r64);
8767 }
8768
8769
8770 //********* Vector rounding narrowing shift right by constant *************************
8771 //****************************************************************************************
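//VRSHRN applies the rounding right shift before the truncating narrowing;
//for instance, with an arbitrary example value, vrshrn_n_s16(vdupq_n_s16(7), 2) yields (7 + 2) >> 2 = 2 in every lane.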
8772 _NEON2SSESTORAGE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
8773 _NEON2SSE_INLINE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
8774 {
8775 int8x8_t res64;
8776 __m128i r16;
8777 r16 = vrshrq_n_s16(a,b);
8778 r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8779 return64(r16);
8780 }
8781
8782 _NEON2SSESTORAGE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
8783 _NEON2SSE_INLINE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
8784 {
8785 int16x4_t res64;
8786 __m128i r32;
8787 r32 = vrshrq_n_s32(a,b);
8788 r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask8_32_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
8789 return64(r32);
8790 }
8791
8792 _NEON2SSESTORAGE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
8793 _NEON2SSE_INLINE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
8794 {
8795 int32x2_t res64;
8796 __m128i r64;
8797 r64 = vrshrq_n_s64(a,b);
8798 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8799 return64(r64);
8800 }
8801
8802 _NEON2SSESTORAGE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
8803 _NEON2SSE_INLINE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
8804 {
8805 uint8x8_t res64;
8806 __m128i mask, r16;
8807 mask = _mm_set1_epi16(0xff);
8808 r16 = vrshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8809 r16 = _mm_and_si128(r16, mask); //to avoid saturation
8810 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
8811 return64(r16);
8812 }
8813
8814 _NEON2SSESTORAGE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
8815 _NEON2SSE_INLINE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
8816 {
8817 uint16x4_t res64;
8818 __m128i mask, r32;
8819 mask = _mm_set1_epi32(0xffff);
    r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8821 r32 = _mm_and_si128(r32, mask); //to avoid saturation
8822 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
8823 return64(r32);
8824 }
8825
8826 _NEON2SSESTORAGE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
8827 _NEON2SSE_INLINE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) //serial solution may be faster
8828 {
8829 uint32x2_t res64;
8830 __m128i r64;
8831 r64 = vrshrq_n_u64(a,b);
8832 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
8833 return64(r64);
8834 }
8835
8836 //************* Vector rounding narrowing saturating shift right by constant ************
8837 //****************************************************************************************
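//VQRSHRN combines the rounding right shift with the saturating narrowing;
//for instance, with an arbitrary example value, vqrshrn_n_u16(vdupq_n_u16(1023), 2) yields min((1023 + 2) >> 2, 255) = 255 in every lane.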
8838 _NEON2SSESTORAGE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
8839 _NEON2SSE_INLINE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRN.S16 d0,q0,#8
8840 {
8841 int8x8_t res64;
8842 __m128i r16;
8843 r16 = vrshrq_n_s16(a,b);
8844 r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
8845 return64(r16);
8846 }
8847
8848 _NEON2SSESTORAGE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
8849 _NEON2SSE_INLINE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRN.S32 d0,q0,#16
8850 {
8851 int16x4_t res64;
8852 __m128i r32;
8853 r32 = vrshrq_n_s32(a,b);
8854 r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only
8855 return64(r32);
8856 }
8857
8858 _NEON2SSESTORAGE int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
8859 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
8860 {
8861 //no optimal SIMD solution found
8862 _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2], maskb[2];
8863 int32x2_t res;
8864 _mm_store_si128((__m128i*)atmp, a);
8865 maskb[0] = atmp[0] & (( int64_t)1 << (b - 1));
8866 res64[0] = (atmp[0] >> b) + (maskb[0] >> (b - 1)); //rounded result
8867 maskb[1] = atmp[1] & (( int64_t)1 << (b - 1));
8868 res64[1] = (atmp[1] >> b) + (maskb[1] >> (b - 1)); //rounded result
8869 if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
8870 if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
8871 if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
8872 if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
8873 res.m64_i32[0] = (int32_t)res64[0];
8874 res.m64_i32[1] = (int32_t)res64[1];
8875 return res;
8876 }
8877
8878 _NEON2SSESTORAGE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.s16 d0,q0,#8
8879 _NEON2SSE_INLINE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQRSHRN.s16 d0,q0,#8
8880 {
8881 uint8x8_t res64;
8882 __m128i r16;
8883 r16 = vrshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
8884 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only
8885 return64(r16);
8886 }
8887
8888 _NEON2SSESTORAGE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
8889 _NEON2SSE_INLINE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQRSHRN.U32 d0,q0,#16
8890 {
8891 uint16x4_t res64;
8892 __m128i r32;
8893     r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
8894 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
8895 return64(r32);
8896 }
8897
8898 _NEON2SSESTORAGE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
8899 _NEON2SSE_INLINE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
8900 {
8901 //serial solution may be faster
8902 uint32x2_t res64;
8903 __m128i r64, res_hi, zero;
8904 zero = _mm_setzero_si128();
8905 r64 = vrshrq_n_u64(a,b);
8906 res_hi = _mm_srli_epi64(r64, 32);
8907 res_hi = _mm_cmpgt_epi32(res_hi, zero);
8908 r64 = _mm_or_si128(r64, res_hi);
8909     r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //gather the two low 32-bit halves of the 64-bit results into the low 64 bits
8910 return64(r64);
8911 }
8912
8913 //************** Vector widening shift left by constant ****************
8914 //************************************************************************
8915 _NEON2SSESTORAGE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
8916 _NEON2SSE_INLINE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b) // VSHLL.S8 q0,d0,#0
8917 {
8918 __m128i r;
8919 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
8920 return _mm_slli_epi16 (r, b);
8921 }
8922
8923 _NEON2SSESTORAGE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
8924 _NEON2SSE_INLINE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b) // VSHLL.S16 q0,d0,#0
8925 {
8926 __m128i r;
8927 r = _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1,
8928 return _mm_slli_epi32 (r, b);
8929 }
8930
8931 _NEON2SSESTORAGE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
8932 _NEON2SSE_INLINE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b) // VSHLL.S32 q0,d0,#0
8933 {
8934 __m128i r;
8935 r = _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1,
8936 return _mm_slli_epi64 (r, b);
8937 }
8938
8939 _NEON2SSESTORAGE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
8940 _NEON2SSE_INLINE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b) // VSHLL.U8 q0,d0,#0
8941 {
8942 //no uint8 to uint16 conversion available, manual conversion used
8943 __m128i zero, r;
8944 zero = _mm_setzero_si128 ();
8945 r = _mm_unpacklo_epi8(_pM128i(a), zero);
8946 return _mm_slli_epi16 (r, b);
8947 }
8948
8949 _NEON2SSESTORAGE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.s16 q0,d0,#0
8950 _NEON2SSE_INLINE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b) // VSHLL.s16 q0,d0,#0
8951 {
8952 //no uint16 to uint32 conversion available, manual conversion used
8953 __m128i zero, r;
8954 zero = _mm_setzero_si128 ();
8955 r = _mm_unpacklo_epi16(_pM128i(a), zero);
8956 return _mm_slli_epi32 (r, b);
8957 }
8958
8959 _NEON2SSESTORAGE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
8960 _NEON2SSE_INLINE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b) // VSHLL.U32 q0,d0,#0
8961 {
8962 //no uint32 to uint64 conversion available, manual conversion used
8963 __m128i zero, r;
8964 zero = _mm_setzero_si128 ();
8965 r = _mm_unpacklo_epi32(_pM128i(a), zero);
8966 return _mm_slli_epi64 (r, b);
8967 }
8968
8969 //************************************************************************************
8970 //**************************** Shifts with insert ************************************
8971 //************************************************************************************
8972 //takes each element in a vector, shifts it by an immediate value,
8973 //and inserts the result into the destination vector. Bits shifted out of each element are lost.
8974
8975 //**************** Vector shift right and insert ************************************
8976 //Only the "c" leftmost bits of each element of "a" remain after the shift;
8977 //all other bits are taken from "b" logically shifted right by "c".
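//A minimal usage sketch, kept as a comment (the function name is illustrative only): per 8-bit lane the
//operation computes (a & ~(0xff >> c)) | ((uint8_t)b >> c), i.e. the top c bits of "a" with "b" shifted in below them.
/*
    uint8x16_t keep_high3_insert_low5(uint8x16_t a, uint8x16_t b)
    {
        //keeps the top 3 bits of every byte of a and fills the low 5 bits with b >> 3
        return vsriq_n_u8(a, b, 3);
    }
*/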
8978 _NEON2SSESTORAGE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
8979 _NEON2SSE_INLINE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c)
8980 {
8981 int8x8_t res64;
8982 return64(vsriq_n_s8(_pM128i(a),_pM128i(b), c));
8983 }
8984
8985
8986 _NEON2SSESTORAGE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
8987 _NEON2SSE_INLINE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c)
8988 {
8989 int16x4_t res64;
8990 return64(vsriq_n_s16(_pM128i(a),_pM128i(b), c));
8991 }
8992
8993
8994 _NEON2SSESTORAGE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
8995 _NEON2SSE_INLINE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c)
8996 {
8997 int32x2_t res64;
8998 return64(vsriq_n_s32(_pM128i(a),_pM128i(b), c));
8999 }
9000
9001
9002 _NEON2SSESTORAGE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
9003 _NEON2SSE_INLINE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
9004 {
9005 int64x1_t res;
9006 if (c ==64)
9007 res = a;
9008 else{
9009         res.m64_i64[0] = (int64_t)((b.m64_u64[0] >> c) | ((a.m64_u64[0] >> (64 - c)) << (64 - c))); //treat a and b as unsigned: the logical shifts give leading zeros in b and keep only the top c bits of a
9010 }
9011 return res;
9012 }
9013
9014 _NEON2SSESTORAGE uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
9015 #define vsri_n_u8 vsri_n_s8
9016
9017 _NEON2SSESTORAGE uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
9018 #define vsri_n_u16 vsri_n_s16
9019
9020 _NEON2SSESTORAGE uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
9021 #define vsri_n_u32 vsri_n_s32
9022
9023
9024 _NEON2SSESTORAGE uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
9025 #define vsri_n_u64 vsri_n_s64
9026
9027 _NEON2SSESTORAGE poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
9028 #define vsri_n_p8 vsri_n_u8
9029
9030 _NEON2SSESTORAGE poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
9031 #define vsri_n_p16 vsri_n_u16
9032
9033 _NEON2SSESTORAGE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
9034 _NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8
9035 {
9036 __m128i maskA, a_masked;
9037 uint8x16_t b_shift;
9038     _NEON2SSE_ALIGN_16 uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask; entry 0 is never used because c >= 1
9039 maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c)zeros
9040 a_masked = _mm_and_si128 (a, maskA);
9041 b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to logical shift
9042 return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a)
9043 }
9044
9045 _NEON2SSESTORAGE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
9046 _NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16
9047 {
9048     //to keep only the "c" leftmost bits of a we shift right and then back left, which zeroes the low (16-c) bits of a
9049 uint16x8_t b_shift;
9050 uint16x8_t a_c;
9051 b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to logical shift
9052 a_c = vshrq_n_u16( a, (16 - c));
9053     a_c = _mm_slli_epi16(a_c, (16 - c)); //the logical shifts zero the low (16-c) bits of a, keeping its top c bits
9054 return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
9055 }
9056
9057 _NEON2SSESTORAGE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
9058 _NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32
9059 {
9060     //to keep only the "c" leftmost bits of a we shift right and then back left, which zeroes the low (32-c) bits of a
9061 uint32x4_t b_shift;
9062 uint32x4_t a_c;
9063 b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to logical shift
9064 a_c = vshrq_n_u32( a, (32 - c));
9065     a_c = _mm_slli_epi32(a_c, (32 - c)); //the logical shifts zero the low (32-c) bits of a, keeping its top c bits
9066 return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
9067 }
9068
9069 _NEON2SSESTORAGE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
9070 _NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
9071 {
9072 //serial solution may be faster
9073 uint64x2_t b_shift;
9074 uint64x2_t a_c;
9075 b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to logical shift
9076 a_c = _mm_srli_epi64(a, (64 - c));
9077     a_c = _mm_slli_epi64(a_c, (64 - c)); //the logical shifts zero the low (64-c) bits of a, keeping its top c bits
9078 return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
9079 }
9080
9081 _NEON2SSESTORAGE uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
9082 #define vsriq_n_u8 vsriq_n_s8
9083
9084 _NEON2SSESTORAGE uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
9085 #define vsriq_n_u16 vsriq_n_s16
9086
9087 _NEON2SSESTORAGE uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
9088 #define vsriq_n_u32 vsriq_n_s32
9089
9090 _NEON2SSESTORAGE uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
9091 #define vsriq_n_u64 vsriq_n_s64
9092
9093 _NEON2SSESTORAGE poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
9094 #define vsriq_n_p8 vsriq_n_u8
9095
9096 _NEON2SSESTORAGE poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
9097 #define vsriq_n_p16 vsriq_n_u16
9098
9099 //***** Vector shift left and insert *********************************************
9100 //*********************************************************************************
9101 //Only the "c" rightmost bits of each element of "a" remain after the shift;
9102 //all other bits are taken from "b" shifted left by "c". The shift fills the low bits of b with zeros, so we combine "a" with the shifted "b".
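//A minimal usage sketch, kept as a comment (the function name is illustrative only): per 8-bit lane the
//operation computes (a & (0xff >> (8 - c))) | (uint8_t)(b << c), i.e. the low c bits of "a" with "b" shifted in above them.
/*
    uint8x16_t keep_low3_insert_high5(uint8x16_t a, uint8x16_t b)
    {
        //keeps the low 3 bits of every byte of a and fills the upper 5 bits with b << 3
        return vsliq_n_u8(a, b, 3);
    }
*/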
9103 _NEON2SSESTORAGE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
9104 _NEON2SSE_INLINE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c)
9105 {
9106 int8x8_t res64;
9107 return64(vsliq_n_s8(_pM128i(a),_pM128i(b), c));
9108 }
9109
9110
9111 _NEON2SSESTORAGE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
9112 _NEON2SSE_INLINE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c)
9113 {
9114 int16x4_t res64;
9115 return64(vsliq_n_s16(_pM128i(a),_pM128i(b), c));
9116 }
9117
9118
9119 _NEON2SSESTORAGE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
9120 _NEON2SSE_INLINE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c)
9121 {
9122 int32x2_t res64;
9123 return64(vsliq_n_s32(_pM128i(a),_pM128i(b), c));
9124 }
9125
9126 _NEON2SSESTORAGE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
9127 _NEON2SSE_INLINE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c)
9128 {
9129 int64x1_t res;
9130     res.m64_i64[0] = (c == 0) ? b.m64_i64[0] : (int64_t)((b.m64_u64[0] << c) | ((a.m64_u64[0] << (64 - c)) >> (64 - c))); //for c==0 a 64-bit shift would be undefined; otherwise treat a and b as unsigned to get well defined logical shifts
9131 return res;
9132 }
9133
9134
9135 _NEON2SSESTORAGE uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
9136 #define vsli_n_u8 vsli_n_s8
9137
9138 _NEON2SSESTORAGE uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
9139 #define vsli_n_u16 vsli_n_s16
9140
9141 _NEON2SSESTORAGE uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
9142 #define vsli_n_u32 vsli_n_s32
9143
9144 _NEON2SSESTORAGE uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
9145 #define vsli_n_u64 vsli_n_s64
9146
9147 _NEON2SSESTORAGE poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
9148 #define vsli_n_p8 vsli_n_u8
9149
9150 _NEON2SSESTORAGE poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
9151 #define vsli_n_p16 vsli_n_u16
9152
9153 _NEON2SSESTORAGE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
9154 _NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0
9155 {
9156 __m128i maskA, a_masked;
9157 int8x16_t b_shift;
9158 _NEON2SSE_ALIGN_16 uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask
9159 maskA = _mm_set1_epi8(maskRight[c]); // (8-c)zeros and c ones
9160 b_shift = vshlq_n_s8( b, c);
9161 a_masked = _mm_and_si128 (a, maskA);
9162 return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a)
9163 }
9164
9165 _NEON2SSESTORAGE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
9166 _NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0
9167 {
9168     //to keep only the "c" rightmost bits of a we shift left and then logically shift back right, which zeroes the top (16-c) bits of a
9169 int16x8_t b_shift;
9170 int16x8_t a_c;
9171 b_shift = vshlq_n_s16( b, c);
9172 a_c = vshlq_n_s16( a, (16 - c));
9173 a_c = _mm_srli_epi16(a_c, (16 - c));
9174 return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
9175 }
9176
9177 _NEON2SSESTORAGE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
9178 _NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0
9179 {
9180 //solution may be not optimal compared with the serial one
9181     //to keep only the "c" rightmost bits of a we shift left and then logically shift back right, which zeroes the top (32-c) bits of a
9182 int32x4_t b_shift;
9183 int32x4_t a_c;
9184 b_shift = vshlq_n_s32( b, c);
9185 a_c = vshlq_n_s32( a, (32 - c));
9186 a_c = _mm_srli_epi32(a_c, (32 - c));
9187 return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
9188 }
9189
9190 _NEON2SSESTORAGE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
9191 _NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0
9192 {
9193 //solution may be not optimal compared with the serial one
9194     //to keep only the "c" rightmost bits of a we shift left and then logically shift back right, which zeroes the top (64-c) bits of a
9195 int64x2_t b_shift;
9196 int64x2_t a_c;
9197 b_shift = vshlq_n_s64( b, c);
9198 a_c = vshlq_n_s64( a, (64 - c));
9199 a_c = _mm_srli_epi64(a_c, (64 - c));
9200 return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
9201 }
9202
9203 _NEON2SSESTORAGE uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
9204 #define vsliq_n_u8 vsliq_n_s8
9205
9206 _NEON2SSESTORAGE uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
9207 #define vsliq_n_u16 vsliq_n_s16
9208
9209 _NEON2SSESTORAGE uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
9210 #define vsliq_n_u32 vsliq_n_s32
9211
9212 _NEON2SSESTORAGE uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
9213 #define vsliq_n_u64 vsliq_n_s64
9214
9215 _NEON2SSESTORAGE poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
9216 #define vsliq_n_p8 vsliq_n_u8
9217
9218 _NEON2SSESTORAGE poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
9219 #define vsliq_n_p16 vsliq_n_u16
9220
9221 // ***********************************************************************************************
9222 // ****************** Loads and stores of a single vector ***************************************
9223 // ***********************************************************************************************
9224 //Performs loads and stores of a single vector of some type.
9225 //******************************* Loads ********************************************************
9226 // ***********************************************************************************************
9227 //In the general case we assume ptr is NOT 16-byte aligned and use __m128i _mm_loadu_si128 ((__m128i*) ptr);
9228 //on SSE3-capable systems __m128i _mm_lddqu_si128 (__m128i const* p) may be advantageous for unaligned access:
9229 //it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned access.
9230 //If ptr is known to be 16-byte aligned, __m128i _mm_load_si128 ((__m128i*) ptr) could be used instead.
9231 #define LOAD_SI128(ptr) \
9232         ( (((uintptr_t)(ptr) & 15) == 0 ) ? _mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr)) )
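//A minimal usage sketch, kept as a comment (the function name is illustrative only): the vld1q_* macros below
//work for any pointer alignment, LOAD_SI128 simply picks the aligned load when possible.
/*
    void copy16_bytes(uint8_t * dst, uint8_t const * src)
    {
        uint8x16_t v = vld1q_u8(src);  //16-byte load, aligned or not
        vst1q_u8(dst, v);              //vst1q_u8 is defined in the "Store" section below
    }
*/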
9233
9234 _NEON2SSESTORAGE uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
9235 #define vld1q_u8 LOAD_SI128
9236
9237 _NEON2SSESTORAGE uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
9238 #define vld1q_u16 LOAD_SI128
9239
9240 _NEON2SSESTORAGE uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
9241 #define vld1q_u32 LOAD_SI128
9242
9243 _NEON2SSESTORAGE uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9244 #define vld1q_u64 LOAD_SI128
9245
9246 _NEON2SSESTORAGE int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
9247 #define vld1q_s8 LOAD_SI128
9248
9249 _NEON2SSESTORAGE int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
9250 #define vld1q_s16 LOAD_SI128
9251
9252 _NEON2SSESTORAGE int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
9253 #define vld1q_s32 LOAD_SI128
9254
9255 _NEON2SSESTORAGE int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9256 #define vld1q_s64 LOAD_SI128
9257
9258 _NEON2SSESTORAGE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
9259 // IA32 SIMD doesn't work with 16-bit floats currently, so we need to go to 32 bits and then work with two 128-bit registers
9260 /* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0]
9261 {__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
9262 __m128 f2;
9263 f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]);
9264 }*/
9265
9266 _NEON2SSESTORAGE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
9267 _NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr)
9268 {
9269     if( (((uintptr_t)(ptr)) & 15 ) == 0 ) //16-byte aligned
9270 return _mm_load_ps(ptr);
9271 else
9272 return _mm_loadu_ps(ptr);
9273 }
9274
9275 _NEON2SSESTORAGE poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
9276 #define vld1q_p8 LOAD_SI128
9277
9278 _NEON2SSESTORAGE poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
9279 #define vld1q_p16 LOAD_SI128
9280
9281 _NEON2SSESTORAGE uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
9282 #define vld1_u8(ptr) *((__m64_128*)(ptr)) //was _mm_loadl_epi64((__m128i*)(ptr))
9283
9284 _NEON2SSESTORAGE uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
9285 #define vld1_u16 vld1_u8
9286
9287 _NEON2SSESTORAGE uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
9288 #define vld1_u32 vld1_u8
9289
9290
9291 _NEON2SSESTORAGE uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
9292 #define vld1_u64 vld1_u8
9293
9294 _NEON2SSESTORAGE int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
9295 #define vld1_s8 vld1_u8
9296
9297 _NEON2SSESTORAGE int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
9298 #define vld1_s16 vld1_u16
9299
9300 _NEON2SSESTORAGE int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
9301 #define vld1_s32 vld1_u32
9302
9303 _NEON2SSESTORAGE int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
9304 #define vld1_s64 vld1_u64
9305
9306 _NEON2SSESTORAGE float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
9307 // IA32 SIMD doesn't work with 16-bit floats currently, so we need to go to 32 bits, e.g. _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
9308
9309 _NEON2SSESTORAGE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
9310 _NEON2SSE_INLINE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr)
9311 {
9312 float32x2_t res;
9313 res.m64_f32[0] = *(ptr);
9314 res.m64_f32[1] = *(ptr + 1);
9315 return res;
9316 }
9317
9318 _NEON2SSESTORAGE poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
9319 #define vld1_p8 vld1_u8
9320
9321 _NEON2SSESTORAGE poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
9322 #define vld1_p16 vld1_u16
9323
9324
9325 _NEON2SSESTORAGE float64x2_t vld1q_f64(__transfersize(2) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9326 _NEON2SSE_INLINE float64x2_t vld1q_f64(__transfersize(2) float64_t const * ptr)
9327 {
9328     if ((((uintptr_t)(ptr)) & 15) == 0) //16-byte aligned
9329 return _mm_load_pd(ptr);
9330 else
9331 return _mm_loadu_pd(ptr);
9332 }
9333
9334
9335 //***********************************************************************************************************
9336 //******* Lane load functions - insert the data at vector's given position (lane) *************************
9337 //***********************************************************************************************************
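//A minimal usage sketch, kept as a comment (the function name is illustrative only); note that the lane
//number must be a compile-time constant, exactly as for the ARM NEON originals.
/*
    float32x4_t load_into_lane2(float32x4_t v, float32_t const * p)
    {
        return vld1q_lane_f32(p, v, 2); //*p goes to lane 2, the other lanes of v are unchanged
    }
*/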
9338 _NEON2SSESTORAGE uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
9339 #define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
9340
9341 _NEON2SSESTORAGE uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9342 #define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
9343
9344 _NEON2SSESTORAGE uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
9345 #define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
9346
9347 _NEON2SSESTORAGE uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
9348 #define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
9349
9350
9351 _NEON2SSESTORAGE int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
9352 #define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
9353
9354 _NEON2SSESTORAGE int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9355 #define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
9356
9357 _NEON2SSESTORAGE int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
9358 #define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
9359
9360 _NEON2SSESTORAGE float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9361 //current IA SIMD doesn't support float16
9362
9363 _NEON2SSESTORAGE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
9364 _NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane)
9365 {
9366     //ptr may not be 16-byte aligned, so load the scalar and insert it into the required lane
9367 __m128 p;
9368 p = _mm_set1_ps(*(ptr));
9369 return _MM_INSERT_PS(vec, p, _INSERTPS_NDX(0, lane));
9370 }
9371
9372 _NEON2SSESTORAGE int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
9373 #define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
9374
9375 _NEON2SSESTORAGE poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
9376 #define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
9377
9378 _NEON2SSESTORAGE poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
9379 #define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
9380
9381 _NEON2SSESTORAGE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
9382 _NEON2SSE_INLINE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane)
9383 {
9384 uint8x8_t res;
9385 res = vec;
9386 res.m64_u8[lane] = *(ptr);
9387 return res;
9388 }
9389
9390 _NEON2SSESTORAGE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9391 _NEON2SSE_INLINE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane)
9392 {
9393 uint16x4_t res;
9394 res = vec;
9395 res.m64_u16[lane] = *(ptr);
9396 return res;
9397 }
9398
9399 _NEON2SSESTORAGE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
9400 _NEON2SSE_INLINE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane)
9401 {
9402 uint32x2_t res;
9403 res = vec;
9404 res.m64_u32[lane] = *(ptr);
9405 return res;
9406 }
9407
9408 _NEON2SSESTORAGE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
9409 _NEON2SSE_INLINE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane)
9410 {
9411 uint64x1_t res;
9412 res.m64_u64[0] = *(ptr);
9413 return res;
9414 }
9415
9416
9417 _NEON2SSESTORAGE int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
9418 #define vld1_lane_s8(ptr, vec, lane) vld1_lane_u8((uint8_t*)ptr, vec, lane)
9419
9420 _NEON2SSESTORAGE int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9421 #define vld1_lane_s16(ptr, vec, lane) vld1_lane_u16((uint16_t*)ptr, vec, lane)
9422
9423 _NEON2SSESTORAGE int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
9424 #define vld1_lane_s32(ptr, vec, lane) vld1_lane_u32((uint32_t*)ptr, vec, lane)
9425
9426 _NEON2SSESTORAGE float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9427 //current IA SIMD doesn't support float16
9428
9429 _NEON2SSESTORAGE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
9430 _NEON2SSE_INLINE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane)
9431 {
9432 float32x2_t res;
9433 res = vec;
9434 res.m64_f32[lane] = *(ptr);
9435 return res;
9436 }
9437
9438 _NEON2SSESTORAGE int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
9439 #define vld1_lane_s64(ptr, vec, lane) vld1_lane_u64((uint64_t*)ptr, vec, lane)
9440
9441 _NEON2SSESTORAGE poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
9442 #define vld1_lane_p8 vld1_lane_u8
9443
9444 _NEON2SSESTORAGE poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
9445 #define vld1_lane_p16 vld1_lane_s16
9446
9447 // ****************** Load single value ( set all lanes of vector with same value from memory)**********************
9448 // ******************************************************************************************************************
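//A minimal usage sketch, kept as a comment (the function name is illustrative only): broadcasting a scalar
//loaded from memory to all lanes, e.g. a common weight applied to a whole vector.
/*
    uint16x8_t scale_all_lanes(uint16x8_t v, uint16_t const * weight)
    {
        uint16x8_t w = vld1q_dup_u16(weight); //all 8 lanes hold *weight
        return vmulq_u16(v, w);               //vmulq_u16 is provided earlier in this file
    }
*/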
9449 _NEON2SSESTORAGE uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9450 #define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr))
9451
9452 _NEON2SSESTORAGE uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9453 #define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr))
9454
9455 _NEON2SSESTORAGE uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9456 #define vld1q_dup_u32(ptr) _mm_set1_epi32(*(ptr))
9457
9458 _NEON2SSESTORAGE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
9459 _NEON2SSE_INLINE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr)
9460 {
9461 _NEON2SSE_ALIGN_16 uint64_t val[2] = {*(ptr), *(ptr)};
9462 return LOAD_SI128(val);
9463 }
9464
9465 _NEON2SSESTORAGE int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9466 #define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr))
9467
9468 _NEON2SSESTORAGE int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9469 #define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr))
9470
9471 _NEON2SSESTORAGE int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9472 #define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr))
9473
9474 _NEON2SSESTORAGE int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
9475 #define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr)
9476
9477 _NEON2SSESTORAGE float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
9478 //current IA SIMD doesn't support float16, need to go to 32 bits
9479
9480 _NEON2SSESTORAGE float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9481 #define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr))
9482
9483 _NEON2SSESTORAGE poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9484 #define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr))
9485
9486 _NEON2SSESTORAGE poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9487 #define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr))
9488
9489 _NEON2SSESTORAGE uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9490 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
9491 {
9492 uint8x8_t res;
9493 int i;
9494 for(i = 0; i<8; i++) {
9495 res.m64_u8[i] = *(ptr);
9496 }
9497 return res;
9498 }
9499
9500 _NEON2SSESTORAGE uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9501 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
9502 {
9503 uint16x4_t res;
9504 int i;
9505 for(i = 0; i<4; i++) {
9506 res.m64_u16[i] = *(ptr);
9507 }
9508 return res;
9509 }
9510
9511 _NEON2SSESTORAGE uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9512 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
9513 {
9514 uint32x2_t res;
9515 res.m64_u32[0] = *(ptr);
9516 res.m64_u32[1] = *(ptr);
9517 return res;
9518 }
9519
9520 _NEON2SSESTORAGE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
9521 _NEON2SSE_INLINE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr)
9522 {
9523 uint64x1_t res;
9524 res.m64_u64[0] = *(ptr);
9525 return res;
9526 }
9527
9528 _NEON2SSESTORAGE int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9529 #define vld1_dup_s8(ptr) vld1_dup_u8((uint8_t*)ptr)
9530
9531
9532 _NEON2SSESTORAGE int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9533 #define vld1_dup_s16(ptr) vld1_dup_u16((uint16_t*)ptr)
9534
9535
9536 _NEON2SSESTORAGE int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9537 #define vld1_dup_s32(ptr) vld1_dup_u32((uint32_t*)ptr)
9538
9539
9540 _NEON2SSESTORAGE int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
9541 #define vld1_dup_s64(ptr) vld1_dup_u64((uint64_t*)ptr)
9542
9543 _NEON2SSESTORAGE float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
9544 //current IA SIMD doesn't support float16
9545
9546 _NEON2SSESTORAGE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
9547 _NEON2SSE_INLINE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr)
9548 {
9549 float32x2_t res;
9550 res.m64_f32[0] = *(ptr);
9551 res.m64_f32[1] = res.m64_f32[0];
9552 return res; // use last 64bits only
9553 }
9554
9555 _NEON2SSESTORAGE poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
9556 #define vld1_dup_p8 vld1_dup_u8
9557
9558
9559 _NEON2SSESTORAGE poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
9560 #define vld1_dup_p16 vld1_dup_u16
9561
9562
9563 //*************************************************************************************
9564 //********************************* Store **********************************************
9565 //*************************************************************************************
9566 // If ptr is 16-byte aligned and the data should not pollute the cache, void _mm_stream_si128 ((__m128i*)ptr, val); may be used instead.
9567 //Here we assume ptr may be NOT 16-byte aligned. If it is known to be aligned we could use _mm_store_si128 as shown in the following macro.
9568 #define STORE_SI128(ptr, val) \
9569         ((((uintptr_t)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val))
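//A minimal usage sketch, kept as a comment (the function name is illustrative only): for a 16-byte aligned
//destination that will not be read back soon, _mm_stream_si128 could replace the store to bypass the cache.
/*
    void fill16_bytes(uint8_t * dst, uint8_t value)
    {
        uint8x16_t v = vdupq_n_u8(value); //broadcast the byte; vdupq_n_u8 is provided earlier in this file
        vst1q_u8(dst, v);                 //STORE_SI128 handles both aligned and unaligned dst
    }
*/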
9570
9571 _NEON2SSESTORAGE void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
9572 #define vst1q_u8 STORE_SI128
9573
9574 _NEON2SSESTORAGE void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
9575 #define vst1q_u16 STORE_SI128
9576
9577 _NEON2SSESTORAGE void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
9578 #define vst1q_u32 STORE_SI128
9579
9580 _NEON2SSESTORAGE void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
9581 #define vst1q_u64 STORE_SI128
9582
9583 _NEON2SSESTORAGE void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
9584 #define vst1q_s8 STORE_SI128
9585
9586 _NEON2SSESTORAGE void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
9587 #define vst1q_s16 STORE_SI128
9588
9589 _NEON2SSESTORAGE void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
9590 #define vst1q_s32 STORE_SI128
9591
9592 _NEON2SSESTORAGE void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
9593 #define vst1q_s64 STORE_SI128
9594
9595 _NEON2SSESTORAGE void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
9596 // IA32 SIMD doesn't work with 16bit floats currently
9597
9598 _NEON2SSESTORAGE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
9599 _NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val)
9600 {
9601     if( ((uintptr_t)(ptr) & 15) == 0 ) //16-byte aligned
9602 _mm_store_ps (ptr, val);
9603 else
9604 _mm_storeu_ps (ptr, val);
9605 }
9606
9607 _NEON2SSESTORAGE void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
9608 #define vst1q_p8 vst1q_u8
9609
9610 _NEON2SSESTORAGE void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
9611 #define vst1q_p16 vst1q_u16
9612
9613 _NEON2SSESTORAGE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
9614 _NEON2SSE_INLINE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val)
9615 {
9616 int i;
9617 for (i = 0; i<8; i++) {
9618 *(ptr + i) = ((uint8_t*)&val)[i];
9619 }
9620 //_mm_storel_epi64((__m128i*)ptr, val);
9621 return;
9622 }
9623
9624 _NEON2SSESTORAGE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
9625 _NEON2SSE_INLINE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val)
9626 {
9627 int i;
9628 for (i = 0; i<4; i++) {
9629 *(ptr + i) = ((uint16_t*)&val)[i];
9630 }
9631 //_mm_storel_epi64((__m128i*)ptr, val);
9632 return;
9633 }
9634
9635 _NEON2SSESTORAGE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
9636 _NEON2SSE_INLINE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val)
9637 {
9638 int i;
9639 for (i = 0; i<2; i++) {
9640 *(ptr + i) = ((uint32_t*)&val)[i];
9641 }
9642 //_mm_storel_epi64((__m128i*)ptr, val);
9643 return;
9644 }
9645
9646 _NEON2SSESTORAGE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
9647 _NEON2SSE_INLINE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val)
9648 {
9649 *(ptr) = *((uint64_t*)&val);
9650 //_mm_storel_epi64((__m128i*)ptr, val);
9651 return;
9652 }
9653
9654 _NEON2SSESTORAGE void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
9655 #define vst1_s8(ptr,val) vst1_u8((uint8_t*)ptr,val)
9656
9657 _NEON2SSESTORAGE void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
9658 #define vst1_s16(ptr,val) vst1_u16((uint16_t*)ptr,val)
9659
9660 _NEON2SSESTORAGE void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
9661 #define vst1_s32(ptr,val) vst1_u32((uint32_t*)ptr,val)
9662
9663 _NEON2SSESTORAGE void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
9664 #define vst1_s64(ptr,val) vst1_u64((uint64_t*)ptr,val)
9665
9666 _NEON2SSESTORAGE void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
9667 //current IA SIMD doesn't support float16
9668
9669 _NEON2SSESTORAGE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
9670 _NEON2SSE_INLINE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val)
9671 {
9672 *(ptr) = val.m64_f32[0];
9673 *(ptr + 1) = val.m64_f32[1];
9674 return;
9675 }
9676
9677 _NEON2SSESTORAGE void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
9678 #define vst1_p8 vst1_u8
9679
9680 _NEON2SSESTORAGE void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
9681 #define vst1_p16 vst1_u16
9682
9683 //***********Store a lane of a vector into memory (extract given lane) *********************
9684 //******************************************************************************************
9685 _NEON2SSESTORAGE void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
9686 #define vst1q_lane_u8(ptr, val, lane) *(ptr) = (uint8_t) _MM_EXTRACT_EPI8 (val, lane)
9687
9688 _NEON2SSESTORAGE void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9689 #define vst1q_lane_u16(ptr, val, lane) *(ptr) = (uint16_t) _MM_EXTRACT_EPI16 (val, lane)
9690
9691 _NEON2SSESTORAGE void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
9692 #define vst1q_lane_u32(ptr, val, lane) *(ptr) = (uint32_t) _MM_EXTRACT_EPI32 (val, lane)
9693
9694 _NEON2SSESTORAGE void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
9695 #define vst1q_lane_u64(ptr, val, lane) *(ptr) = (uint64_t) _MM_EXTRACT_EPI64 (val, lane)
9696
9697 _NEON2SSESTORAGE void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
9698 #define vst1q_lane_s8(ptr, val, lane) *(ptr) = (int8_t) _MM_EXTRACT_EPI8 (val, lane)
9699
9700 _NEON2SSESTORAGE void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9701 #define vst1q_lane_s16(ptr, val, lane) *(ptr) = (int16_t) _MM_EXTRACT_EPI16 (val, lane)
9702
9703 _NEON2SSESTORAGE void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
9704 #define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
9705
9706 _NEON2SSESTORAGE void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
9707 #define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
9708
9709 _NEON2SSESTORAGE void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9710 //current IA SIMD doesn't support float16
9711
9712 _NEON2SSESTORAGE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
9713 _NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane)
9714 {
9715 int32_t ilane;
9716 ilane = _MM_EXTRACT_PS(val,lane);
9717 *(ptr) = *((float*)&ilane);
9718 }
9719
9720 _NEON2SSESTORAGE void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
9721 #define vst1q_lane_p8 vst1q_lane_u8
9722
9723 _NEON2SSESTORAGE void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
9724 #define vst1q_lane_p16 vst1q_lane_s16
9725
9726 _NEON2SSESTORAGE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
9727 _NEON2SSE_INLINE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane)
9728 {
9729 *(ptr) = val.m64_u8[lane];
9730 }
9731
9732 _NEON2SSESTORAGE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9733 _NEON2SSE_INLINE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane)
9734 {
9735 *(ptr) = val.m64_u16[lane];
9736 }
9737
9738 _NEON2SSESTORAGE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
9739 _NEON2SSE_INLINE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane)
9740 {
9741 *(ptr) = val.m64_u32[lane];
9742 }
9743
9744 _NEON2SSESTORAGE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
9745 _NEON2SSE_INLINE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane)
9746 {
9747 *(ptr) = val.m64_u64[0];
9748 }
9749
9750 _NEON2SSESTORAGE void vst1_lane_s8(__transfersize(1) int8_t * ptr, int8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
9751 #define vst1_lane_s8(ptr, val, lane) vst1_lane_u8((uint8_t*)ptr, val, lane)
9752
9753 _NEON2SSESTORAGE void vst1_lane_s16(__transfersize(1) int16_t * ptr, int16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9754 #define vst1_lane_s16(ptr, val, lane) vst1_lane_u16((uint16_t*)ptr, val, lane)
9755
9756 _NEON2SSESTORAGE void vst1_lane_s32(__transfersize(1) int32_t * ptr, int32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
9757 #define vst1_lane_s32(ptr, val, lane) vst1_lane_u32((uint32_t*)ptr, val, lane)
9758
9759
9760 _NEON2SSESTORAGE void vst1_lane_s64(__transfersize(1) int64_t * ptr, int64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
9761 #define vst1_lane_s64(ptr, val, lane) vst1_lane_u64((uint64_t*)ptr, val, lane)
9762
9763
9764 _NEON2SSESTORAGE void vst1_lane_f16(__transfersize(1) __fp16 * ptr, float16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9765 //current IA SIMD doesn't support float16
9766
9767 _NEON2SSESTORAGE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
9768 _NEON2SSE_INLINE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane)
9769 {
9770 *(ptr) = val.m64_f32[lane];
9771 }
9772
9773 _NEON2SSESTORAGE void vst1_lane_p8(__transfersize(1) poly8_t * ptr, poly8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
9774 #define vst1_lane_p8 vst1_lane_u8
9775
9776 _NEON2SSESTORAGE void vst1_lane_p16(__transfersize(1) poly16_t * ptr, poly16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
9777 #define vst1_lane_p16 vst1_lane_s16
9778
9779 //***********************************************************************************************
9780 //**************** Loads and stores of an N-element structure **********************************
9781 //***********************************************************************************************
9782 //These intrinsics load or store an n-element structure. The corresponding array types are defined at the beginning of this file.
9783 //We assume ptr is NOT aligned in the general case; for more details see "Loads and stores of a single vector" above.
9784 //****************** 2 elements load *********************************************
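//A minimal de-interleaving sketch, kept as a comment (the function name is illustrative only): splitting an
//interleaved x0,y0,x1,y1,... byte stream into separate x and y vectors.
/*
    void split_xy(uint8_t const * interleaved, uint8_t * x, uint8_t * y)
    {
        uint8x16x2_t v = vld2q_u8(interleaved); //reads 32 bytes
        vst1q_u8(x, v.val[0]); //x0..x15
        vst1q_u8(y, v.val[1]); //y0..y15
    }
*/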
9785 _NEON2SSESTORAGE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
9786 _NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0]
9787 {
9788 uint8x16x2_t v;
9789 v.val[0] = vld1q_u8(ptr);
9790 v.val[1] = vld1q_u8((ptr + 16));
9791 v = vuzpq_s8(v.val[0], v.val[1]);
9792 return v;
9793 }
9794
9795 _NEON2SSESTORAGE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
9796 _NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr) // VLD2.16 {d0, d2}, [r0]
9797 {
9798 uint16x8x2_t v;
9799 v.val[0] = vld1q_u16( ptr);
9800 v.val[1] = vld1q_u16( (ptr + 8));
9801 v = vuzpq_s16(v.val[0], v.val[1]);
9802 return v;
9803 }
9804
9805 _NEON2SSESTORAGE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
9806 _NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
9807 {
9808 uint32x4x2_t v;
9809 v.val[0] = vld1q_u32 ( ptr);
9810 v.val[1] = vld1q_u32 ( (ptr + 4));
9811 v = vuzpq_s32(v.val[0], v.val[1]);
9812 return v;
9813 }
9814
9815 _NEON2SSESTORAGE int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr);
9816 #define vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr)
9817
9818 _NEON2SSESTORAGE int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
9819 #define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr)
9820
9821 _NEON2SSESTORAGE int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
9822 #define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr)
9823
9824
9825 _NEON2SSESTORAGE float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
9826 // IA32 SIMD doesn't work with 16-bit floats currently, so we need to go to 32 bits and then work with two 128-bit registers. See vld1q_f16 for an example.
9827
9828 _NEON2SSESTORAGE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
9829 _NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
9830 {
9831 float32x4x2_t v;
9832 v.val[0] = vld1q_f32 (ptr);
9833 v.val[1] = vld1q_f32 ((ptr + 4));
9834 v = vuzpq_f32(v.val[0], v.val[1]);
9835 return v;
9836 }
9837
9838 _NEON2SSESTORAGE poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
9839 #define vld2q_p8 vld2q_u8
9840
9841 _NEON2SSESTORAGE poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
9842 #define vld2q_p16 vld2q_u16
9843
9844 _NEON2SSESTORAGE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
9845 _NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr)
9846 {
9847 uint8x8x2_t v;
9848 __m128i ld128;
9849 ld128 = vld1q_u8(ptr); //merge two 64-bits in 128 bit
9850 ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask8_16_even_odd);
9851 vst1q_u8((v.val), ld128); // v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
9852 return v;
9853 }
9854
9855 _NEON2SSESTORAGE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
9856 _NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr)
9857 {
9858 _NEON2SSE_ALIGN_16 uint16x4x2_t v;
9859 __m128i ld128;
9860 ld128 = vld1q_u16(ptr); //merge two 64-bits in 128 bit
9861 ld128 = _mm_shuffle_epi8(ld128, *(__m128i*) mask8_32_even_odd);
9862 vst1q_u16((v.val), ld128);
9863 return v;
9864 }
9865
9866 _NEON2SSESTORAGE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
9867 _NEON2SSE_INLINE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr)
9868 {
9869 _NEON2SSE_ALIGN_16 uint32x2x2_t v;
9870 __m128i ld128;
9871 ld128 = vld1q_u32(ptr); //merge two 64-bits in 128 bit
9872 ld128 = _mm_shuffle_epi32(ld128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
9873 vst1q_u32((v.val), ld128);
9874 return v;
9875 }
9876
9877 _NEON2SSESTORAGE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9878 _NEON2SSE_INLINE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr)
9879 {
9880 uint64x1x2_t v;
9881 v.val[0].m64_u64[0] = *(ptr);
9882 v.val[1].m64_u64[0] = *(ptr + 1);
9883 return v;
9884 }
9885
9886 _NEON2SSESTORAGE int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
9887 #define vld2_s8(ptr) vld2_u8((uint8_t*)ptr)
9888
9889 _NEON2SSESTORAGE int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
9890 #define vld2_s16(ptr) vld2_u16((uint16_t*)ptr)
9891
9892 _NEON2SSESTORAGE int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
9893 #define vld2_s32(ptr) vld2_u32((uint32_t*)ptr)
9894
9895 _NEON2SSESTORAGE int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
9896 #define vld2_s64(ptr) vld2_u64((uint64_t*)ptr)
9897
9898 _NEON2SSESTORAGE float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
9899 // IA32 SIMD doesn't work with 16-bit floats currently, so we need to go to 32 bits and then work with two 128-bit registers. See vld1_f16 for an example.
9900
9901 _NEON2SSESTORAGE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
9902 _NEON2SSE_INLINE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr)
9903 {
9904 float32x2x2_t v;
9905 v.val[0].m64_f32[0] = *(ptr);
9906     res_hi = _mm_srli_epi64(r64, 32);
9907     res_hi = _mm_cmpeq_epi32(res_hi, zero); //0xffffffff in the 32-bit lanes whose high halves are zero
    res_hi = _mm_andnot_si128(res_hi, _mm_cmpeq_epi32(zero, zero)); //invert: all ones where the high half is non-zero (a signed compare with 0 would miss values >= 0x80000000)
9908     r64 = _mm_or_si128(r64, res_hi); //saturate those results to 0xffffffff
9909 return v;
9910 }
9911
9912 _NEON2SSESTORAGE poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
9913 #define vld2_p8 vld2_u8
9914
9915 _NEON2SSESTORAGE poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
9916 #define vld2_p16 vld2_u16
9917
9918 //******************** Triplets ***************************************
9919 //*********************************************************************
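//A minimal usage sketch, kept as a comment (the function name is illustrative only): de-interleaving packed
//RGB bytes into separate R, G and B planes.
/*
    void split_rgb(uint8_t const * rgb, uint8_t * r, uint8_t * g, uint8_t * b)
    {
        uint8x16x3_t v = vld3q_u8(rgb); //reads 48 bytes laid out as r0,g0,b0,r1,g1,b1,...
        vst1q_u8(r, v.val[0]);
        vst1q_u8(g, v.val[1]);
        vst1q_u8(b, v.val[2]);
    }
*/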
9920 _NEON2SSESTORAGE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
9921 _NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr) // VLD3.8 {d0, d2, d4}, [r0]
9922 {
9923 //a0,a1,a2,a3,...a7,a8,...a15, b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 ->
9924 //a:0,3,6,9,12,15,b:2,5,8,11,14, c:1,4,7,10,13
9925 //a:1,4,7,10,13, b:0,3,6,9,12,15,c:2,5,8,11,14,
9926 //a:2,5,8,11,14, b:1,4,7,10,13, c:0,3,6,9,12,15
9927 uint8x16x3_t v;
9928 __m128i tmp0, tmp1,tmp2, tmp3;
9929 _NEON2SSE_ALIGN_16 static const int8_t mask8_0[16] = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
9930 _NEON2SSE_ALIGN_16 static const int8_t mask8_1[16] = {2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13};
9931 _NEON2SSE_ALIGN_16 static const int8_t mask8_2[16] = {1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15};
9932
9933 v.val[0] = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, ...a15
9934 v.val[1] = vld1q_u8 ((ptr + 16)); //b0,b1,b2,b3...b7, ...b15
9935 v.val[2] = vld1q_u8 ((ptr + 32)); //c0,c1,c2,c3,...c7,...c15
9936
9937     tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a:0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14
9938 tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); //b:2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13
9939     tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2); //c:1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15
9940
9941 tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,0,0,0,0,0,a0,a3,a6,a9,a12,a15
9942 tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x
9943 tmp3 = _mm_slli_si128(tmp3, 5); //0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14,
9944     tmp3 = _mm_srli_si128(tmp3, 5); //a:0,3,6,9,12,15,b:2,5,8,11,14, 0,0,0,0,0
9945 v.val[0] = _mm_slli_si128(tmp2, 11); //0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13,
9946 v.val[0] = _mm_or_si128(v.val[0],tmp3); //a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13,
9947
9948 tmp3 = _mm_slli_si128(tmp0, 5); //0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13,
9949 tmp3 = _mm_srli_si128(tmp3, 11); //a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0
9950 v.val[1] = _mm_srli_si128(tmp1,5); //b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0
9951 v.val[1] = _mm_slli_si128(v.val[1], 5); //0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13,
9952 v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,C:1,4,7,10,13,
9953 v.val[1] = _mm_slli_si128(v.val[1],5); //0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15,
9954 v.val[1] = _mm_srli_si128(v.val[1], 5); //a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0
9955 tmp3 = _mm_srli_si128(tmp2,5); //c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0
9956 tmp3 = _mm_slli_si128(tmp3,11); //0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14,
9957 v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14,
9958
9959 tmp3 = _mm_srli_si128(tmp2,10); //c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0,
9960 tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15,
9961 v.val[2] = _mm_srli_si128(tmp1,11); //b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0
9962 v.val[2] = _mm_slli_si128(v.val[2],5); //0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0
9963 v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15,
9964 tmp0 = _mm_srli_si128(tmp0, 11); //a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0,
9965 v.val[2] = _mm_or_si128(v.val[2],tmp0); //a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15,
9966 return v;
9967 }
9968
9969 _NEON2SSESTORAGE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
9970 _NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr) // VLD3.16 {d0, d2, d4}, [r0]
9971 {
9972 //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
9973 uint16x8x3_t v;
9974 __m128i tmp0, tmp1,tmp2, tmp3;
9975 _NEON2SSE_ALIGN_16 static const int8_t mask16_0[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
9976 _NEON2SSE_ALIGN_16 static const int8_t mask16_1[16] = {2,3, 8,9, 14,15, 4,5, 10,11, 0,1, 6,7, 12,13};
9977 _NEON2SSE_ALIGN_16 static const int8_t mask16_2[16] = {4,5, 10,11, 0,1, 6,7, 12,13, 2,3, 8,9, 14,15};
9978
9979 v.val[0] = vld1q_u16 (ptr); //a0,a1,a2,a3,...a7,
9980 v.val[1] = vld1q_u16 ((ptr + 8)); //b0,b1,b2,b3...b7
9981 v.val[2] = vld1q_u16 ((ptr + 16)); //c0,c1,c2,c3,...c7
9982
9983 tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16_0); //a0,a3,a6,a1,a4,a7,a2,a5,
9984 tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask16_1); //b1,b4,b7,b2,b5,b0,b3,b6
9985 tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask16_2); //c2,c5, c0,c3,c6, c1,c4,c7
9986
9987 tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,a0,a3,a6,
9988 tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a0,a3,a6,b1,b4,b7,x,x
9989 tmp3 = _mm_slli_si128(tmp3, 4); //0,0, a0,a3,a6,b1,b4,b7
9990 tmp3 = _mm_srli_si128(tmp3, 4); //a0,a3,a6,b1,b4,b7,0,0
9991 v.val[0] = _mm_slli_si128(tmp2, 12); //0,0,0,0,0,0, c2,c5,
9992 v.val[0] = _mm_or_si128(v.val[0],tmp3); //a0,a3,a6,b1,b4,b7,c2,c5
9993
9994 tmp3 = _mm_slli_si128(tmp0, 4); //0,0,a0,a3,a6,a1,a4,a7
9995 tmp3 = _mm_srli_si128(tmp3,10); //a1,a4,a7, 0,0,0,0,0
9996 v.val[1] = _mm_srli_si128(tmp1,6); //b2,b5,b0,b3,b6,0,0
9997 v.val[1] = _mm_slli_si128(v.val[1], 6); //0,0,0,b2,b5,b0,b3,b6,
9998 v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,b0,b3,b6,
9999 v.val[1] = _mm_slli_si128(v.val[1],6); //0,0,0,a1,a4,a7,b2,b5,
10000 v.val[1] = _mm_srli_si128(v.val[1], 6); //a1,a4,a7,b2,b5,0,0,0,
10001 tmp3 = _mm_srli_si128(tmp2,4); //c0,c3,c6, c1,c4,c7,0,0
10002 tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,c0,c3,c6,
10003 v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,c0,c3,c6,
10004
10005 tmp3 = _mm_srli_si128(tmp2,10); //c1,c4,c7, 0,0,0,0,0
10006 tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0, c1,c4,c7,
10007 v.val[2] = _mm_srli_si128(tmp1,10); //b0,b3,b6,0,0, 0,0,0
10008 v.val[2] = _mm_slli_si128(v.val[2],4); //0,0, b0,b3,b6,0,0,0
10009 v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0, b0,b3,b6,c1,c4,c7,
10010 tmp0 = _mm_srli_si128(tmp0, 12); //a2,a5,0,0,0,0,0,0
10011 v.val[2] = _mm_or_si128(v.val[2],tmp0); //a2,a5,b0,b3,b6,c1,c4,c7,
10012 return v;
10013 }
10014
10015 _NEON2SSESTORAGE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
10016 _NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
10017 {
10018 //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
10019 uint32x4x3_t v;
10020 __m128i tmp0, tmp1,tmp2, tmp3;
10021 v.val[0] = vld1q_u32 (ptr); //a0,a1,a2,a3,
10022 v.val[1] = vld1q_u32 ((ptr + 4)); //b0,b1,b2,b3
10023 v.val[2] = vld1q_u32 ((ptr + 8)); //c0,c1,c2,c3,
10024
10025 tmp0 = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,a3,a1,a2
10026 tmp1 = _mm_shuffle_epi32(v.val[1], _SWAP_HI_LOW32); //b2,b3,b0,b1
10027 tmp2 = _mm_shuffle_epi32(v.val[2], 1 | (2 << 2) | (0 << 4) | (3 << 6)); //c1,c2, c0,c3
10028
10029 tmp3 = _mm_unpacklo_epi32(tmp1, tmp2); //b2,c1, b3,c2
10030 v.val[0] = _mm_unpacklo_epi64(tmp0,tmp3); //a0,a3,b2,c1
10031 tmp0 = _mm_unpackhi_epi32(tmp0, tmp1); //a1,b0, a2,b1
10032 v.val[1] = _mm_shuffle_epi32(tmp0, _SWAP_HI_LOW32 ); //a2,b1, a1,b0,
10033 v.val[1] = _mm_unpackhi_epi64(v.val[1], tmp3); //a1,b0, b3,c2
10034 v.val[2] = _mm_unpackhi_epi64(tmp0, tmp2); //a2,b1, c0,c3
10035 return v;
10036 }
10037
10038 _NEON2SSESTORAGE int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
10039 #define vld3q_s8(ptr) vld3q_u8((uint8_t*) (ptr))
10040
10041 _NEON2SSESTORAGE int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
10042 #define vld3q_s16(ptr) vld3q_u16((uint16_t*) (ptr))
10043
10044 _NEON2SSESTORAGE int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
10045 #define vld3q_s32(ptr) vld3q_u32((uint32_t*) (ptr))
10046
10047 _NEON2SSESTORAGE float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
10048 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10049
10050 _NEON2SSESTORAGE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
10051 _NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
10052 {
10053 //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
10054 float32x4x3_t v;
10055 __m128 tmp0, tmp1,tmp2, tmp3;
10056 v.val[0] = vld1q_f32 (ptr); //a0,a1,a2,a3,
10057 v.val[1] = vld1q_f32 ((ptr + 4)); //b0,b1,b2,b3
10058 v.val[2] = vld1q_f32 ((ptr + 8)); //c0,c1,c2,c3,
10059
10060 tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6))); //a0,a3,a1,a2
10061 tmp1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32)); //b2,b3,b0,b1
10062 tmp2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[2]), 1 | (2 << 2) | (0 << 4) | (3 << 6))); //c1,c2, c0,c3
10063 tmp3 = _mm_unpacklo_ps(tmp1, tmp2); //b2,c1, b3,c2
10064
10065 v.val[0] = _mm_movelh_ps(tmp0,tmp3); //a0,a3,b2,c1
10066 tmp0 = _mm_unpackhi_ps(tmp0, tmp1); //a1,b0, a2,b1
10067 v.val[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(tmp0), _SWAP_HI_LOW32 )); //a2,b1, a1,b0,
10068 v.val[1] = _mm_movehl_ps(tmp3,v.val[1]); //a1,b0, b3,c2
10069 v.val[2] = _mm_movehl_ps(tmp2,tmp0); //a2,b1, c0,c3
10070 return v;
10071 }
10072
_NEON2SSESTORAGE poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
10074 #define vld3q_p8 vld3q_u8
10075
10076 _NEON2SSESTORAGE poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
10077 #define vld3q_p16 vld3q_u16
10078
10079 _NEON2SSESTORAGE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
10080 _NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr) // VLD3.8 {d0, d1, d2}, [r0]
10081 {
10082 //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
10083 uint8x8x3_t v;
10084 __m128i val0, val1, val2, tmp0, tmp1;
10085 _NEON2SSE_ALIGN_16 static const int8_t mask8_0[16] = {0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14};
10086 _NEON2SSE_ALIGN_16 static const int8_t mask8_1[16] = {2,5, 0,3,6, 1,4,7, 0,0,0,0,0,0,0,0};
10087 val0 = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7
10088 val2 = _mm_loadl_epi64((__m128i*)(ptr + 16)); //c0,c1,c2,c3,...c7
10089
10090 tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask8_0); //a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6,
10091 tmp1 = _mm_shuffle_epi8(val2, *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x
10092 val0 = _mm_slli_si128(tmp0,10);
10093 val0 = _mm_srli_si128(val0,10); //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0
10094 val2 = _mm_slli_si128(tmp1,6); //0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x
10095 val0 = _mm_or_si128(val0,val2); //a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x
10096 _M64(v.val[0], val0);
10097 val1 = _mm_slli_si128(tmp0,5); //0,0,0,0,0,0,0,0,0,0,0, a1,a4,a7,b2,b5,
10098 val1 = _mm_srli_si128(val1,11); //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0,
10099 val2 = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0
10100 val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0
10101 val1 = _mm_or_si128(val1,val2); //a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x
10102 _M64(v.val[1], val1);
10103
10104 tmp0 = _mm_srli_si128(tmp0,11); //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0,
10105 val2 = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0
10106 val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c1,c4,c7,
10107 val2 = _mm_or_si128(tmp0, val2); //a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x
10108 _M64(v.val[2], val2);
10109 return v;
10110 }
10111
10112 _NEON2SSESTORAGE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10113 _NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr) // VLD3.16 {d0, d1, d2}, [r0]
10114 {
10115 //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
10116 uint16x4x3_t v;
10117 __m128i val0, val1, val2, tmp0, tmp1;
10118 _NEON2SSE_ALIGN_16 static const int8_t mask16[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
10119 val0 = vld1q_u16 (ptr); //a0,a1,a2,a3, b0,b1,b2,b3
10120 val2 = _mm_loadl_epi64((__m128i*)(ptr + 8)); //c0,c1,c2,c3, x,x,x,x
10121
10122 tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1
10123 tmp1 = _mm_shufflelo_epi16(val2, 201); //11 00 10 01 : c1, c2, c0, c3,
10124 val0 = _mm_slli_si128(tmp0,10);
10125 val0 = _mm_srli_si128(val0,10); //a0, a3, b2, 0,0, 0,0,
10126 val2 = _mm_slli_si128(tmp1,14); //0,0,0,0,0,0,0,c1
10127 val2 = _mm_srli_si128(val2,8); //0,0,0,c1,0,0,0,0
10128 val0 = _mm_or_si128(val0,val2); //a0, a3, b2, c1, x,x,x,x
10129 _M64(v.val[0], val0);
10130
10131 val1 = _mm_slli_si128(tmp0,4); //0,0,0,0,0,a1, b0, b3
10132 val1 = _mm_srli_si128(val1,10); //a1, b0, b3, 0,0, 0,0,
10133 val2 = _mm_srli_si128(tmp1,2); //c2, 0,0,0,0,0,0,0,
10134 val2 = _mm_slli_si128(val2,6); //0,0,0,c2,0,0,0,0
10135 val1 = _mm_or_si128(val1,val2); //a1, b0, b3, c2, x,x,x,x
10136 _M64(v.val[1], val1);
10137
10138 tmp0 = _mm_srli_si128(tmp0,12); //a2, b1,0,0,0,0,0,0
10139 tmp1 = _mm_srli_si128(tmp1,4);
10140 tmp1 = _mm_slli_si128(tmp1,4); //0,0,c0, c3,
10141 val2 = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3,
10142 _M64(v.val[2], val2);
10143 return v;
10144 }
10145
10146 _NEON2SSESTORAGE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
10147 _NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr) // VLD3.32 {d0, d1, d2}, [r0]
10148 {
10149 //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1
10150 uint32x2x3_t v;
10151 __m128i val0, val1, val2;
10152 val0 = vld1q_u32 (ptr); //a0,a1, b0,b1,
10153 val2 = _mm_loadl_epi64((__m128i*) (ptr + 4)); //c0,c1, x,x
10154
10155 val0 = _mm_shuffle_epi32(val0, 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,b1, a1, b0
10156 _M64(v.val[0], val0);
10157 val2 = _mm_slli_si128(val2, 8); //x, x,c0,c1,
10158 val1 = _mm_unpackhi_epi32(val0,val2); //a1,c0, b0, c1
10159 _M64(v.val[1], val1);
10160 val2 = _mm_srli_si128(val1, 8); //b0, c1, x, x,
10161 _M64(v.val[2], val2);
10162 return v;
10163 }
10164 _NEON2SSESTORAGE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10165 _NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
10166 {
10167 uint64x1x3_t v;
10168 v.val[0].m64_u64[0] = *(ptr);
10169 v.val[1].m64_u64[0] = *(ptr + 1);
10170 v.val[2].m64_u64[0] = *(ptr + 2);
10171 return v;
10172 }
10173
10174 _NEON2SSESTORAGE int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
10175 #define vld3_s8(ptr) vld3_u8((uint8_t*)ptr)
10176
10177 _NEON2SSESTORAGE int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10178 #define vld3_s16(ptr) vld3_u16((uint16_t*)ptr)
10179
10180 _NEON2SSESTORAGE int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
10181 #define vld3_s32(ptr) vld3_u32((uint32_t*)ptr)
10182
_NEON2SSESTORAGE int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10184 #define vld3_s64(ptr) vld3_u64((uint64_t*)ptr)
10185
10186 _NEON2SSESTORAGE float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10187 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10188
10189 _NEON2SSESTORAGE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
10190 _NEON2SSE_INLINE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr)
10191 {
10192 //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1
10193 float32x2x3_t v;
10194 v.val[0].m64_f32[0] = *(ptr);
10195 v.val[0].m64_f32[1] = *(ptr + 3);
10196
10197 v.val[1].m64_f32[0] = *(ptr + 1);
10198 v.val[1].m64_f32[1] = *(ptr + 4);
10199
10200 v.val[2].m64_f32[0] = *(ptr + 2);
10201 v.val[2].m64_f32[1] = *(ptr + 5);
10202 return v;
10203 }
10204
10205 _NEON2SSESTORAGE poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
10206 #define vld3_p8 vld3_u8
10207
10208 _NEON2SSESTORAGE poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
10209 #define vld3_p16 vld3_u16
10210
10211 //*************** Quadruples load ********************************
10212 //*****************************************************************
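//Usage sketch (illustrative comment only): vld4q_u8 below de-interleaves 64 packed bytes, e.g. 16 RGBA pixels,
//into separate per-channel vectors. The function and buffer names here are hypothetical.
/*
    void rgba32_to_planes(const uint8_t* rgba, uint8_t* r, uint8_t* g, uint8_t* b, uint8_t* a)
    {
        uint8x16x4_t px = vld4q_u8(rgba); //px.val[0] = R0..R15, ..., px.val[3] = A0..A15
        vst1q_u8(r, px.val[0]);
        vst1q_u8(g, px.val[1]);
        vst1q_u8(b, px.val[2]);
        vst1q_u8(a, px.val[3]);
    }
*/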
10213 _NEON2SSESTORAGE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
10214 _NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr) // VLD4.8 {d0, d2, d4, d6}, [r0]
10215 {
10216 uint8x16x4_t v;
10217 __m128i tmp3, tmp2, tmp1, tmp0;
10218
10219 v.val[0] = vld1q_u8 ( ptr); //a0,a1,a2,...a7, ...a15
10220 v.val[1] = vld1q_u8 ( (ptr + 16)); //b0, b1,b2,...b7.... b15
10221 v.val[2] = vld1q_u8 ( (ptr + 32)); //c0, c1,c2,...c7....c15
10222 v.val[3] = vld1q_u8 ( (ptr + 48)); //d0,d1,d2,...d7....d15
10223
10224 tmp0 = _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
10225 tmp1 = _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
10226 tmp2 = _mm_unpackhi_epi8(v.val[0],v.val[1]); //a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
10227 tmp3 = _mm_unpackhi_epi8(v.val[2],v.val[3]); //c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
10228
10229 v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11
10230 v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
10231 v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
10232 v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15
10233
10234 tmp0 = _mm_unpacklo_epi32(v.val[0], v.val[2] ); ///a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
10235 tmp1 = _mm_unpackhi_epi32(v.val[0], v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
10236 tmp2 = _mm_unpacklo_epi32(v.val[1], v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13,
10237 tmp3 = _mm_unpackhi_epi32(v.val[1], v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15
10238
10239 v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a4,a8,a12,b0,b4,b8,b12,c0,c4,c8,c12,d0,d4,d8,d12
10240 v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a1,a5, a9, a13, b1,b5, b9,b13, c1,c5, c9, c13, d1,d5, d9,d13
10241 v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //a2,a6, a10,a14, b2,b6, b10,b14,c2,c6, c10,c14, d2,d6, d10,d14
10242 v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //a3,a7, a11,a15, b3,b7, b11,b15,c3,c7, c11, c15,d3,d7, d11,d15
10243 return v;
10244 }
10245
10246 _NEON2SSESTORAGE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10247 _NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr) // VLD4.16 {d0, d2, d4, d6}, [r0]
10248 {
10249 uint16x8x4_t v;
10250 __m128i tmp3, tmp2, tmp1, tmp0;
10251 tmp0 = vld1q_u16 (ptr); //a0,a1,a2,...a7
10252 tmp1 = vld1q_u16 ((ptr + 8)); //b0, b1,b2,...b7
10253 tmp2 = vld1q_u16 ((ptr + 16)); //c0, c1,c2,...c7
10254 tmp3 = vld1q_u16 ((ptr + 24)); //d0,d1,d2,...d7
10255 v.val[0] = _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3,
10256 v.val[1] = _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3,
10257 v.val[2] = _mm_unpackhi_epi16(tmp0,tmp1); //a4,b4, a5,b5, a6,b6, a7,b7
10258 v.val[3] = _mm_unpackhi_epi16(tmp2,tmp3); //c4,d4, c5,d5, c6,d6, c7,d7
10259 tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]); //a0,a4, b0,b4, a1,a5, b1,b5
10260 tmp1 = _mm_unpackhi_epi16(v.val[0], v.val[2]); //a2,a6, b2,b6, a3,a7, b3,b7
10261 tmp2 = _mm_unpacklo_epi16(v.val[1], v.val[3]); //c0,c4, d0,d4, c1,c5, d1,d5
10262 tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]); //c2,c6, d2,d6, c3,c7, d3,d7
10263 v.val[0] = _mm_unpacklo_epi64(tmp0, tmp2); //a0,a4, b0,b4, c0,c4, d0,d4,
10264 v.val[1] = _mm_unpackhi_epi64(tmp0, tmp2); //a1,a5, b1,b5, c1,c5, d1,d5
10265 v.val[2] = _mm_unpacklo_epi64(tmp1, tmp3); //a2,a6, b2,b6, c2,c6, d2,d6,
10266 v.val[3] = _mm_unpackhi_epi64(tmp1, tmp3); //a3,a7, b3,b7, c3,c7, d3,d7
10267 return v;
10268 }
10269
10270 _NEON2SSESTORAGE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
10271 _NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
10272 {
10273 uint32x4x4_t v;
10274 __m128i tmp3, tmp2, tmp1, tmp0;
10275 v.val[0] = vld1q_u32 (ptr);
10276 v.val[1] = vld1q_u32 ((ptr + 4));
10277 v.val[2] = vld1q_u32 ((ptr + 8));
10278 v.val[3] = vld1q_u32 ((ptr + 12));
10279 tmp0 = _mm_unpacklo_epi32(v.val[0],v.val[1]);
10280 tmp1 = _mm_unpacklo_epi32(v.val[2],v.val[3]);
10281 tmp2 = _mm_unpackhi_epi32(v.val[0],v.val[1]);
10282 tmp3 = _mm_unpackhi_epi32(v.val[2],v.val[3]);
10283 v.val[0] = _mm_unpacklo_epi64(tmp0, tmp1);
10284 v.val[1] = _mm_unpackhi_epi64(tmp0, tmp1);
10285 v.val[2] = _mm_unpacklo_epi64(tmp2, tmp3);
10286 v.val[3] = _mm_unpackhi_epi64(tmp2, tmp3);
10287 return v;
10288 }
10289
10290 _NEON2SSESTORAGE int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
10291 #define vld4q_s8(ptr) vld4q_u8((uint8_t*)ptr)
10292
10293 _NEON2SSESTORAGE int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10294 #define vld4q_s16(ptr) vld4q_u16((uint16_t*)ptr)
10295
10296 _NEON2SSESTORAGE int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
10297 #define vld4q_s32(ptr) vld4q_u32((uint32_t*)ptr)
10298
10299 _NEON2SSESTORAGE float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10300 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10301
10302 _NEON2SSESTORAGE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
10303 _NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
10304 {
10305 float32x4x4_t v;
10306 __m128 tmp3, tmp2, tmp1, tmp0;
10307
10308 v.val[0] = vld1q_f32 ((float*) ptr);
10309 v.val[1] = vld1q_f32 ((float*) (ptr + 4));
10310 v.val[2] = vld1q_f32 ((float*) (ptr + 8));
10311 v.val[3] = vld1q_f32 ((float*) (ptr + 12));
10312 tmp0 = _mm_unpacklo_ps(v.val[0], v.val[1]);
10313 tmp2 = _mm_unpacklo_ps(v.val[2], v.val[3]);
10314 tmp1 = _mm_unpackhi_ps(v.val[0], v.val[1]);
10315 tmp3 = _mm_unpackhi_ps(v.val[2], v.val[3]);
10316 v.val[0] = _mm_movelh_ps(tmp0, tmp2);
10317 v.val[1] = _mm_movehl_ps(tmp2, tmp0);
10318 v.val[2] = _mm_movelh_ps(tmp1, tmp3);
10319 v.val[3] = _mm_movehl_ps(tmp3, tmp1);
10320 return v;
10321 }
10322
10323 _NEON2SSESTORAGE poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
10324 #define vld4q_p8 vld4q_u8
10325
10326 _NEON2SSESTORAGE poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
10327 #define vld4q_p16 vld4q_s16
10328
10329 _NEON2SSESTORAGE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
10330 _NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr) // VLD4.8 {d0, d1, d2, d3}, [r0]
10331 {
10332 uint8x8x4_t v;
10333 __m128i sh0, sh1;
10334 __m128i val0, val2;
    _NEON2SSE_ALIGN_16 static const int8_t mask4_8[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15};
10336
    val0 = vld1q_u8(( ptr)); //load the first and second 64-bit chunks into val[0], val[1]
    val2 = vld1q_u8(( ptr + 16)); //load the third and fourth 64-bit chunks into val[2], val[3]
10339
10340 sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_8);
10341 sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_8);
10342 val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29
10343 vst1q_u8(&v.val[0], val0 );
10344 val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31
10345 vst1q_u8(&v.val[2], val2 );
10346 return v;
10347 }
10348
10349 _NEON2SSESTORAGE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10350 _NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr) // VLD4.16 {d0, d1, d2, d3}, [r0]
10351 {
10352 uint16x4x4_t v;
10353 __m128i sh0, sh1;
10354 __m128i val0, val2;
10355 _NEON2SSE_ALIGN_16 static const int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; //0, 4, 1, 5, 2, 6, 3, 7
    val0 = vld1q_u16 ( (ptr)); //load the first and second 64-bit chunks into val[0], val[1]
    val2 = vld1q_u16 ( (ptr + 8)); //load the third and fourth 64-bit chunks into val[2], val[3]
10358 sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_16);
10359 sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_16);
10360 val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12, 1,5,9,13
10361 vst1q_u16(&v.val[0], val0 );
10362 val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14, 3,7,11,15
10363 vst1q_u16(&v.val[2], val2 );
10364 return v;
10365 }
10366
10367 _NEON2SSESTORAGE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
10368 _NEON2SSE_INLINE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr)
10369 {
10370 //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
10371 uint32x2x4_t v;
10372 __m128i val0, val01, val2;
10373 val0 = vld1q_u32 (ptr); //a0,a1, b0,b1,
10374 val2 = vld1q_u32 ((ptr + 4)); //c0,c1, d0,d1
10375 val01 = _mm_unpacklo_epi32(val0,val2); //a0, c0, a1,c1,
10376 val2 = _mm_unpackhi_epi32(val0,val2); //b0,d0, b1, d1
10377 vst1q_u32(&v.val[0], val01);
10378 vst1q_u32(&v.val[2], val2 );
10379 return v;
10380 }
10381
10382 _NEON2SSESTORAGE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10383 _NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
10384 {
10385 uint64x1x4_t v;
    v.val[0].m64_u64[0] = *(ptr); //load the four consecutive 64-bit values, one per output vector
    v.val[1].m64_u64[0] = *(ptr + 1);
    v.val[2].m64_u64[0] = *(ptr + 2);
    v.val[3].m64_u64[0] = *(ptr + 3);
10390 return v;
10391 }
10392
10393 _NEON2SSESTORAGE int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
10394 #define vld4_s8(ptr) vld4_u8((uint8_t*)ptr)
10395
10396 _NEON2SSESTORAGE int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10397 #define vld4_s16(ptr) vld4_u16((uint16_t*)ptr)
10398
10399 _NEON2SSESTORAGE int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
10400 #define vld4_s32(ptr) vld4_u32((uint32_t*)ptr)
10401
_NEON2SSESTORAGE int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10403 #define vld4_s64(ptr) vld4_u64((uint64_t*)ptr)
10404
10405 _NEON2SSESTORAGE float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10406 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10407
10408 _NEON2SSESTORAGE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
10409 _NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr) // VLD4.32 {d0, d1, d2, d3}, [r0]
10410 {
10411 //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
10412 float32x2x4_t res;
10413 res.val[0].m64_f32[0] = *(ptr);
10414 res.val[0].m64_f32[1] = *(ptr + 4);
10415 res.val[1].m64_f32[0] = *(ptr + 1);
10416 res.val[1].m64_f32[1] = *(ptr + 5);
10417 res.val[2].m64_f32[0] = *(ptr + 2);
10418 res.val[2].m64_f32[1] = *(ptr + 6);
10419 res.val[3].m64_f32[0] = *(ptr + 3);
10420 res.val[3].m64_f32[1] = *(ptr + 7);
10421 return res;
10422 }
10423
10424 _NEON2SSESTORAGE poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
10425 #define vld4_p8 vld4_u8
10426
10427 _NEON2SSESTORAGE poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
10428 #define vld4_p16 vld4_u16
10429
10430 //************* Duplicate (or propagate) ptr[0] to all val[0] lanes and ptr[1] to all val[1] lanes *******************
10431 //*******************************************************************************************************************
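//Usage sketch (illustrative comment only): vld2_dup_f32 below broadcasts a two-element constant, e.g. a
//{real, imaginary} filter coefficient, so val[0] holds ptr[0] in both lanes and val[1] holds ptr[1].
//The array name is hypothetical.
/*
    float32_t coeff[2] = {0.5f, -0.25f};
    float32x2x2_t c = vld2_dup_f32(coeff); //c.val[0] = {0.5, 0.5}, c.val[1] = {-0.25, -0.25}
*/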
10432 _NEON2SSESTORAGE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
10433 _NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr) // VLD2.8 {d0[], d1[]}, [r0]
10434 {
10435 uint8x8x2_t v;
10436 __m128i val0, val1;
10437 val0 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x
10438 val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x,
10439 val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x
10440 val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
10441 vst1q_u8(v.val, val0);
10442 return v;
10443 }
10444
10445 _NEON2SSESTORAGE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10446 _NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr) // VLD2.16 {d0[], d1[]}, [r0]
10447 {
10448 uint16x4x2_t v;
10449 __m128i val0, val1;
10450 val1 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x
10451 val0 = _mm_shufflelo_epi16(val1, 0); //00 00 00 00 (all 0)
10452 _M64(v.val[0], val0);
10453 val1 = _mm_shufflelo_epi16(val1, 85); //01 01 01 01 (all 1)
10454 _M64(v.val[1], val1);
10455 return v;
10456 }
10457
10458 _NEON2SSESTORAGE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
10459 _NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
10460 {
10461 uint32x2x2_t v;
10462 __m128i val0;
10463 val0 = LOAD_SI128(ptr); //0,1,x,x
10464 val0 = _mm_shuffle_epi32(val0, 0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1
10465 vst1q_u32(v.val, val0);
10466 return v;
10467 }
10468
10469 _NEON2SSESTORAGE uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
10470 #define vld2_dup_u64 vld2_u64
10471
10472 _NEON2SSESTORAGE int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
10473 #define vld2_dup_s8(ptr) vld2_dup_u8((uint8_t*)ptr)
10474
10475 _NEON2SSESTORAGE int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10476 #define vld2_dup_s16(ptr) vld2_dup_u16((uint16_t*)ptr)
10477
10478 _NEON2SSESTORAGE int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
10479 #define vld2_dup_s32(ptr) vld2_dup_u32((uint32_t*)ptr)
10480
10481 _NEON2SSESTORAGE int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
10482 #define vld2_dup_s64(ptr) vld2_dup_u64((uint64_t*)ptr)
10483
10484 _NEON2SSESTORAGE float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10485 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10486
10487 _NEON2SSESTORAGE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
10488 _NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
10489 {
10490 float32x2x2_t v;
10491 v.val[0].m64_f32[0] = *(ptr); //0,0
10492 v.val[0].m64_f32[1] = *(ptr); //0,0
10493 v.val[1].m64_f32[0] = *(ptr + 1); //1,1
10494 v.val[1].m64_f32[1] = *(ptr + 1); //1,1
10495 return v;
10496 }
10497
10498 _NEON2SSESTORAGE poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
10499 #define vld2_dup_p8 vld2_dup_u8
10500
10501 _NEON2SSESTORAGE poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
10502 #define vld2_dup_p16 vld2_dup_s16
10503
//************* Duplicate (or propagate) triplets: *******************
10505 //********************************************************************
10506 //ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes and ptr[2] to all val[2] lanes
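//Usage sketch (illustrative comment only): vld3_dup_u8 below splats a 3-byte constant such as an RGB color,
//one byte per output vector. The array name is hypothetical; it is padded because this implementation reads
//a full 16 bytes at ptr via LOAD_SI128.
/*
    _NEON2SSE_ALIGN_16 uint8_t color[16] = {255, 128, 0}; //only the first 3 bytes are meaningful
    uint8x8x3_t c = vld3_dup_u8(color); //c.val[0] = all 255, c.val[1] = all 128, c.val[2] = all 0
*/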
10507 _NEON2SSESTORAGE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
10508 _NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr) // VLD3.8 {d0[], d1[], d2[]}, [r0]
10509 {
10510 uint8x8x3_t v;
10511 __m128i val0, val1, val2;
10512 val0 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x
10513 val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x,
10514 val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x,
10515 val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
10516 val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x,
10517 vst1q_u8(v.val, val0);
10518 _M64(v.val[2], val2);
10519 return v;
10520 }
10521
10522 _NEON2SSESTORAGE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10523 _NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr) // VLD3.16 {d0[], d1[], d2[]}, [r0]
10524 {
10525 uint16x4x3_t v;
10526 __m128i val0, val1, val2;
10527 val2 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x
10528 val0 = _mm_shufflelo_epi16(val2, 0); //00 00 00 00 (all 0)
10529 val1 = _mm_shufflelo_epi16(val2, 85); //01 01 01 01 (all 1)
10530 val2 = _mm_shufflelo_epi16(val2, 170); //10 10 10 10 (all 2)
10531 _M64(v.val[0], val0);
10532 _M64(v.val[1], val1);
10533 _M64(v.val[2], val2);
10534 return v;
10535 }
10536
10537 _NEON2SSESTORAGE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
10538 _NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
10539 {
10540 uint32x2x3_t v;
10541 __m128i val0, val1, val2;
10542 val2 = LOAD_SI128(ptr); //0,1,2,x
10543 val0 = _mm_shuffle_epi32(val2, 0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2
10544 val1 = _mm_shuffle_epi32(val2, 1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2
10545 val2 = _mm_srli_si128(val0, 8); //2,2,0x0,0x0
10546 _M64(v.val[0], val0);
10547 _M64(v.val[1], val1);
10548 _M64(v.val[2], val2);
10549 return v;
10550 }
10551
10552 _NEON2SSESTORAGE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10553 _NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
10554 {
10555 uint64x1x3_t v;
10556 v.val[0].m64_u64[0] = *(ptr);
10557 v.val[1].m64_u64[0] = *(ptr + 1);
10558 v.val[2].m64_u64[0] = *(ptr + 2);
10559 return v;
10560 }
10561
10562 _NEON2SSESTORAGE int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
10563 #define vld3_dup_s8(ptr) vld3_dup_u8((uint8_t*)ptr)
10564
10565 _NEON2SSESTORAGE int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10566 #define vld3_dup_s16(ptr) vld3_dup_u16((uint16_t*)ptr)
10567
10568 _NEON2SSESTORAGE int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
10569 #define vld3_dup_s32(ptr) vld3_dup_u32((uint32_t*)ptr)
10570
_NEON2SSESTORAGE int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
10572 #define vld3_dup_s64(ptr) vld3_dup_u64((uint64_t*)ptr)
10573
10574
10575 _NEON2SSESTORAGE float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10576 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10577
10578 _NEON2SSESTORAGE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
10579 _NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
10580 {
10581 float32x2x3_t v;
10582 int i;
10583 for (i = 0; i<3; i++) {
10584 v.val[i].m64_f32[0] = *(ptr + i);
10585 v.val[i].m64_f32[1] = *(ptr + i);
10586 }
10587 return v;
10588 }
10589
10590 _NEON2SSESTORAGE poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
10591 #define vld3_dup_p8 vld3_dup_u8
10592
10593 _NEON2SSESTORAGE poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
10594 #define vld3_dup_p16 vld3_dup_s16
10595
10596
10597 //************* Duplicate (or propagate) quadruples: *******************
10598 //***********************************************************************
10599 //ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes, ptr[2] to all val[2] lanes and ptr[3] to all val[3] lanes
10600 _NEON2SSESTORAGE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10601 _NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr) // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10602 {
10603 uint8x8x4_t v;
10604 __m128i val0, val1, val2;
10605 val0 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x
10606 val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x,
10607 val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3
10608 val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
10609 val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3
10610 vst1q_u8(&v.val[0], val0);
10611 vst1q_u8(&v.val[2], val2);
10612 return v;
10613 }
10614
10615 _NEON2SSESTORAGE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10616 _NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr) // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10617 {
10618 uint16x4x4_t v;
10619 __m128i val0, val1, val2, val3;
10620 val3 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x
10621 val0 = _mm_shufflelo_epi16(val3, 0); //00 00 00 00 (all 0)
10622 val1 = _mm_shufflelo_epi16(val3, 85); //01 01 01 01 (all 1)
10623 val2 = _mm_shufflelo_epi16(val3, 170); //10 10 10 10 (all 2)
10624 val3 = _mm_shufflelo_epi16(val3, 255); //11 11 11 11 (all 3)
10625 _M64(v.val[0], val0);
10626 _M64(v.val[1], val1);
10627 _M64(v.val[2], val2);
10628 _M64(v.val[3], val3);
10629 return v;
10630 }
10631
10632 _NEON2SSESTORAGE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10633 _NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10634 {
10635 uint32x2x4_t v;
10636 __m128i val0, val1, val2, val3;
10637 val3 = LOAD_SI128(ptr); //0,1,2,3
10638 val0 = _mm_shuffle_epi32(val3, 0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3
10639 val1 = _mm_shuffle_epi32(val3, 1 | (1 << 2) | (2 << 4) | (3 << 6)); //1,1,2,3
10640 val2 = _mm_shuffle_epi32(val3, 2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3
    val3 = _mm_shuffle_epi32(val3, 3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,3,3
10642 _M64(v.val[0], val0);
10643 _M64(v.val[1], val1);
10644 _M64(v.val[2], val2);
10645 _M64(v.val[3], val3);
10646 return v;
10647 }
10648
10649 _NEON2SSESTORAGE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10650 _NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
10651 {
10652 uint64x1x4_t v;
10653 v.val[0].m64_u64[0] = *(ptr);
10654 v.val[1].m64_u64[0] = *(ptr + 1);
10655 v.val[2].m64_u64[0] = *(ptr + 2);
10656 v.val[3].m64_u64[0] = *(ptr + 3);
10657 return v;
10658 }
10659
10660 _NEON2SSESTORAGE int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10661 #define vld4_dup_s8(ptr) vld4_dup_u8((uint8_t*)ptr)
10662
10663 _NEON2SSESTORAGE int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10664 #define vld4_dup_s16(ptr) vld4_dup_u16((uint16_t*)ptr)
10665
10666 _NEON2SSESTORAGE int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10667 #define vld4_dup_s32(ptr) vld4_dup_u32((uint32_t*)ptr)
10668
_NEON2SSESTORAGE int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
10670 #define vld4_dup_s64(ptr) vld4_dup_u64((uint64_t*)ptr)
10671
10672 _NEON2SSESTORAGE float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10673 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
10674
10675 _NEON2SSESTORAGE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10676 _NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
10677 {
10678 float32x2x4_t v;
10679 int i;
10680 for (i = 0; i<4; i++) {
10681 v.val[i].m64_f32[0] = *(ptr + i);
10682 v.val[i].m64_f32[1] = *(ptr + i);
10683 }
10684 return v;
10685 }
10686
10687 _NEON2SSESTORAGE poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
10688 #define vld4_dup_p8 vld4_dup_u8
10689
10690 _NEON2SSESTORAGE poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
10691 #define vld4_dup_p16 vld4_dup_u16
10692
10693
10694 //**********************************************************************************
10695 //*******************Lane loads for an N-element structures ***********************
10696 //**********************************************************************************
10697 //********************** Lane pairs ************************************************
//does vld1_lane_xx: loads ptr[0] into src->val[0] and ptr[1] into src->val[1] at the given lane position
//we assume src is 16-byte aligned
10700
//!!!!!! The Microsoft compiler does not allow xxxxxx_2t structures to be passed as function arguments by value, resulting in the "formal parameter with __declspec(align('16')) won't be aligned" error
//To work around it, all functions below take pointers to xxxxxx_2t structures, and the corresponding original functions are redefined as macros
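//Illustrative example (comment only): user code keeps the standard NEON spelling, and the macros below pass the
//structure to the _ptr counterpart by address. The buffer name is hypothetical; vld2q_u32 is the regular
//structure load defined earlier in this file.
/*
    _NEON2SSE_ALIGN_16 uint32_t buf[10]; //hypothetical data: 4 interleaved {a,b} pairs plus one extra pair
    uint32x4x2_t acc = vld2q_u32(buf); //regular de-interleaving load of the first 8 values
    acc = vld2q_lane_u32(buf + 8, acc, 0); //expands to vld2q_lane_u32_ptr(buf + 8, &acc, 0)
*/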
10703
10704 //uint16x8x2_t vld2q_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10705 _NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane) // VLD2.16 {d0[0], d2[0]}, [r0]
10706 {
10707 uint16x8x2_t v;
10708 v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane);
10709 v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane);
10710 return v;
10711 }
10712 #define vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16_ptr(ptr, &src, lane)
10713
10714 //uint32x4x2_t vld2q_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
10715 _NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
10716 {
10717 uint32x4x2_t v;
10718 v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
10719 v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane);
10720 return v;
10721 }
10722 #define vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32_ptr(ptr, &src, lane)
10723
10724 //int16x8x2_t vld2q_lane_s16(__transfersize(2) int16_t const * ptr, int16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10725 _NEON2SSE_INLINE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t* src, __constrange(0,7) int lane)
10726 {
10727 int16x8x2_t v;
10728 v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane);
10729 v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane);
10730 return v;
10731 }
10732 #define vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16_ptr(ptr, &src, lane)
10733
10734 //int32x4x2_t vld2q_lane_s32(__transfersize(2) int32_t const * ptr, int32x4x2_t src, __constrange(0,3)int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
10735 _NEON2SSE_INLINE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t* src, __constrange(0,3) int lane)
10736 {
10737 int32x4x2_t v;
10738 v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
10739 v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane);
10740 return v;
10741 }
10742 #define vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32_ptr(ptr, &src, lane)
10743
10744 //float16x8x2_t vld2q_lane_f16(__transfersize(2) __fp16 const * ptr, float16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10745 //current IA SIMD doesn't support float16
10746
10747 //float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
10748 _NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
10749 {
10750 float32x4x2_t v;
10751 v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane);
10752 v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane);
10753 return v;
10754 }
10755 #define vld2q_lane_f32(ptr,src,lane) vld2q_lane_f32_ptr(ptr,&src,lane)
10756
10757 //poly16x8x2_t vld2q_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
10758 #define vld2q_lane_p16 vld2q_lane_u16
10759
10760 _NEON2SSESTORAGE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
10761 _NEON2SSE_INLINE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane) // VLD2.8 {d0[0], d1[0]}, [r0]
10762 {
10763 uint8x8x2_t v;
10764 v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
10765 v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
10766 return v;
10767 }
10768
10769 _NEON2SSESTORAGE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
10770 _NEON2SSE_INLINE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3) int lane)
10771 {
10772 uint16x4x2_t v;
10773 v.val[0] = vld1_lane_u16(ptr, src.val[0], lane);
10774 v.val[1] = vld1_lane_u16((ptr + 1), src.val[1], lane);
10775 return v;
10776 }
10777
10778 _NEON2SSESTORAGE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1)int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
10779 _NEON2SSE_INLINE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1) int lane)
10780 {
10781 uint32x2x2_t v;
10782 v.val[0] = vld1_lane_u32(ptr, src.val[0], lane);
10783 v.val[1] = vld1_lane_u32((ptr + 1), src.val[1], lane);
10784 return v;
10785 }
10786
10787 _NEON2SSESTORAGE int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
10788 #define vld2_lane_s8(ptr, src, lane) vld2_lane_u8(( uint8_t*) ptr, src, lane)
10789
10790 _NEON2SSESTORAGE int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
10791 #define vld2_lane_s16(ptr, src, lane) vld2_lane_u16(( uint16_t*) ptr, src, lane)
10792
10793 _NEON2SSESTORAGE int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
10794 #define vld2_lane_s32(ptr, src, lane) vld2_lane_u32(( uint32_t*) ptr, src, lane)
10795
10796 //float16x4x2_t vld2_lane_f16(__transfersize(2) __fp16 const * ptr, float16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
10797 //current IA SIMD doesn't support float16
10798
10799 _NEON2SSESTORAGE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t src,__constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
10800 _NEON2SSE_INLINE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t src,__constrange(0,1) int lane)
10801 {
10802 float32x2x2_t v;
10803 v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
10804 v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
10805 return v;
10806 }
10807
10808 //poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
10809 _NEON2SSESTORAGE poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
10810 #define vld2_lane_p8 vld2_lane_u8
10811
10812 //poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
10813 _NEON2SSESTORAGE poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
10814 #define vld2_lane_p16 vld2_lane_u16
10815
10816 //*********** Lane triplets **********************
10817 //*************************************************
//does vld1_lane_xx: loads ptr[0] into src->val[0], ptr[1] into src->val[1] and ptr[2] into src->val[2] at the given lane position
//we assume src is 16-byte aligned
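//Illustrative example (comment only): lane loads can gather (x,y,z) triplets, possibly from unrelated addresses,
//into structure-of-arrays registers, one triplet per lane. A contiguous hypothetical buffer is used here only to
//keep the sketch self-contained; the lane index must be a compile-time constant.
/*
    _NEON2SSE_ALIGN_16 float32_t pts[12]; //hypothetical buffer holding 4 (x,y,z) points
    float32x4x3_t xyz;
    xyz.val[0] = xyz.val[1] = xyz.val[2] = vdupq_n_f32(0.f);
    xyz = vld3q_lane_f32(pts,     xyz, 0);
    xyz = vld3q_lane_f32(pts + 3, xyz, 1);
    xyz = vld3q_lane_f32(pts + 6, xyz, 2);
    xyz = vld3q_lane_f32(pts + 9, xyz, 3);
    //now xyz.val[0] = {x0,x1,x2,x3}, xyz.val[1] = {y0,y1,y2,y3}, xyz.val[2] = {z0,z1,z2,z3}
*/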
10820
10821 //uint16x8x3_t vld3q_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x8x3_t src,__constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10822 _NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10823 {
10824 uint16x8x3_t v;
10825 v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
10826 v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
10827 v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
10828 return v;
10829 }
10830 #define vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16_ptr(ptr, &src, lane)
10831
10832 //uint32x4x3_t vld3q_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10833 _NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10834 {
10835 uint32x4x3_t v;
10836 v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
10837 v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
10838 v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
10839 return v;
10840 }
10841 #define vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32_ptr(ptr, &src, lane)
10842
10843 //int16x8x3_t vld3q_lane_s16(__transfersize(3) int16_t const * ptr, int16x8x3_t src, __constrange(0,7)int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10844 _NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10845 {
10846 int16x8x3_t v;
10847 v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
10848 v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
10849 v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
10850 return v;
10851 }
10852 #define vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16_ptr(ptr, &src, lane)
10853
10854 //int32x4x3_t vld3q_lane_s32(__transfersize(3) int32_t const * ptr, int32x4x3_t src, __constrange(0,3)int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10855 _NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10856 {
10857 int32x4x3_t v;
10858 v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
10859 v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
10860 v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
10861 return v;
10862 }
10863 #define vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32_ptr(ptr, &src, lane)
10864
10865 _NEON2SSESTORAGE float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10866 //current IA SIMD doesn't support float16
10867 #define vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16_ptr(ptr, &src, lane)
10868
10869
10870 //float32x4x3_t vld3q_lane_f32(__transfersize(3) float32_t const * ptr, float32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10871 _NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
10872 {
10873 float32x4x3_t v;
10874 v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
10875 v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
10876 v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
10877 return v;
10878 }
10879 #define vld3q_lane_f32(ptr,src,lane) vld3q_lane_f32_ptr(ptr,&src,lane)
10880
10881 _NEON2SSESTORAGE poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
10882 #define vld3q_lane_p16 vld3q_lane_u16
10883
10884 _NEON2SSESTORAGE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane);// VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
10885 _NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane) // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
10886 {
10887 uint8x8x3_t v;
10888 v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
10889 v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
10890 v.val[2] = vld1_lane_u8((ptr + 2), src.val[2], lane);
10891 return v;
10892 }
10893
10894 _NEON2SSESTORAGE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3)int lane);// VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10895 _NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3) int lane) // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10896 {
10897 uint16x4x3_t v;
10898 v.val[0] = vld1_lane_u16(ptr, src.val[0], lane);
10899 v.val[1] = vld1_lane_u16((ptr + 1), src.val[1], lane);
10900 v.val[2] = vld1_lane_u16((ptr + 2), src.val[2], lane);
10901 return v;
10902 }
10903
10904 _NEON2SSESTORAGE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1)int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10905 _NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10906 {
10907 //need to merge into 128 bit anyway
10908 uint32x2x3_t v;
    v.val[0] = vld1_lane_u32(ptr, src.val[0], lane);
    v.val[1] = vld1_lane_u32((ptr + 1), src.val[1], lane);
    v.val[2] = vld1_lane_u32((ptr + 2), src.val[2], lane);
10912 return v;
10913 }
10914
10915 _NEON2SSESTORAGE int8x8x3_t vld3_lane_s8(__transfersize(3) int8_t const * ptr, int8x8x3_t src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
10916 #define vld3_lane_s8(ptr, src, lane) vld3_lane_u8(( uint8_t*) ptr, src, lane)
10917
10918 _NEON2SSESTORAGE int16x4x3_t vld3_lane_s16(__transfersize(3) int16_t const * ptr, int16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10919 #define vld3_lane_s16(ptr, src, lane) vld3_lane_u16(( uint16_t*) ptr, src, lane)
10920
10921 _NEON2SSESTORAGE int32x2x3_t vld3_lane_s32(__transfersize(3) int32_t const * ptr, int32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10922 #define vld3_lane_s32(ptr, src, lane) vld3_lane_u32(( uint32_t*) ptr, src, lane)
10923
10924 _NEON2SSESTORAGE float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10925 //current IA SIMD doesn't support float16
10926
10927 _NEON2SSESTORAGE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10928 _NEON2SSE_INLINE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
10929 {
10930 float32x2x3_t v;
10931 v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
10932 v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
10933 v.val[2] = vld1_lane_f32((ptr + 2), src.val[2], lane);
10934 return v;
10935 }
10936
10937 _NEON2SSESTORAGE poly8x8x3_t vld3_lane_p8(__transfersize(3) poly8_t const * ptr, poly8x8x3_t src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
10938 #define vld3_lane_p8 vld3_lane_u8
10939
10940 _NEON2SSESTORAGE poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
10941 #define vld3_lane_p16 vld3_lane_u16
10942
10943 //******************* Lane Quadruples load ***************************
10944 //*********************************************************************
//does vld1_lane_xx: loads ptr[0] into src->val[0], ptr[1] into src->val[1], ptr[2] into src->val[2] and ptr[3] into src->val[3] at the given lane position
//we assume src is 16-byte aligned
10947
10948 //uint16x8x4_t vld4q_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x8x4_t src,__constrange(0,7) int lane)// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10949 _NEON2SSE_INLINE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t* src,__constrange(0,7) int lane)
10950 {
10951 uint16x8x4_t v;
10952 v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
10953 v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
10954 v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
10955 v.val[3] = _MM_INSERT_EPI16 ( src->val[3], ptr[3], lane);
10956 return v;
10957 }
10958 #define vld4q_lane_u16(ptr, src, lane) vld4q_lane_u16_ptr(ptr, &src, lane)
10959
10960 //uint32x4x4_t vld4q_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10961 _NEON2SSE_INLINE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t* src,__constrange(0,3) int lane)
10962 {
10963 uint32x4x4_t v;
10964 v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
10965 v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
10966 v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
10967 v.val[3] = _MM_INSERT_EPI32 ( src->val[3], ptr[3], lane);
10968 return v;
10969 }
10970 #define vld4q_lane_u32(ptr, src, lane) vld4q_lane_u32_ptr(ptr, &src, lane)
10971
10972 //int16x8x4_t vld4q_lane_s16(__transfersize(4) int16_t const * ptr, int16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10973 _NEON2SSESTORAGE int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10974 #define vld4q_lane_s16(ptr, src, lane) vld4q_lane_u16(( uint16_t*) ptr, src, lane)
10975
10976 //int32x4x4_t vld4q_lane_s32(__transfersize(4) int32_t const * ptr, int32x4x4_t src, __constrange(0,3)int lane);// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10977 _NEON2SSESTORAGE int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10978 #define vld4q_lane_s32(ptr, src, lane) vld4q_lane_u32(( uint32_t*) ptr, src, lane)
10979
10980 //float16x8x4_t vld4q_lane_f16(__transfersize(4) __fp16 const * ptr, float16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10981 _NEON2SSESTORAGE float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10982 //current IA SIMD doesn't support float16
10983
10984 //float32x4x4_t vld4q_lane_f32(__transfersize(4) float32_t const * ptr, float32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10985 _NEON2SSE_INLINE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t* src,__constrange(0,3) int lane)
10986 {
10987 float32x4x4_t v;
10988 v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
10989 v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
10990 v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
10991 v.val[3] = vld1q_lane_f32(&ptr[3], src->val[3], lane);
10992 return v;
10993 }
10994 #define vld4q_lane_f32(ptr,val,lane) vld4q_lane_f32_ptr(ptr,&val,lane)
10995
10996 //poly16x8x4_t vld4q_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x8x4_t src,__constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10997 _NEON2SSESTORAGE poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
10998 #define vld4q_lane_p16 vld4q_lane_u16
10999
11000 _NEON2SSESTORAGE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11001 _NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane)
11002 {
11003 uint8x8x4_t v;
11004 v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
11005 v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
11006 v.val[2] = vld1_lane_u8((ptr + 2), src.val[2], lane);
11007 v.val[3] = vld1_lane_u8((ptr + 3), src.val[3], lane);
11008 return v;
11009 }
11010
11011 _NEON2SSESTORAGE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11012 _NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3) int lane)
11013 {
11014 uint16x4x4_t v;
11015 v.val[0] = vld1_lane_u16(ptr, src.val[0], lane);
11016 v.val[1] = vld1_lane_u16((ptr + 1), src.val[1], lane);
11017 v.val[2] = vld1_lane_u16((ptr + 2), src.val[2], lane);
11018 v.val[3] = vld1_lane_u16((ptr + 3), src.val[3], lane);
11019 return v;
11020 }
11021
11022 _NEON2SSESTORAGE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1)int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11023 _NEON2SSE_INLINE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1) int lane)
11024 {
11025 uint32x2x4_t v;
11026 v.val[0] = vld1_lane_u32(ptr, src.val[0], lane);
11027 v.val[1] = vld1_lane_u32((ptr + 1), src.val[1], lane);
11028 v.val[2] = vld1_lane_u32((ptr + 2), src.val[2], lane);
11029 v.val[3] = vld1_lane_u32((ptr + 3), src.val[3], lane);
11030 return v;
11031 }
11032
11033 _NEON2SSESTORAGE int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11034 #define vld4_lane_s8(ptr,src,lane) vld4_lane_u8((uint8_t*)ptr,src,lane)
11035
11036 _NEON2SSESTORAGE int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11037 #define vld4_lane_s16(ptr,src,lane) vld4_lane_u16((uint16_t*)ptr,src,lane)
11038
11039 _NEON2SSESTORAGE int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11040 #define vld4_lane_s32(ptr,src,lane) vld4_lane_u32((uint32_t*)ptr,src,lane)
11041
11042 //float16x4x4_t vld4_lane_f16(__transfersize(4) __fp16 const * ptr, float16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11043 _NEON2SSESTORAGE float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane);
11044 //current IA SIMD doesn't support float16
11045
11046 _NEON2SSESTORAGE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11047 _NEON2SSE_INLINE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane)
11048 {
11049 //serial solution may be faster
11050 float32x2x4_t v;
11051 v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
11052 v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
11053 v.val[2] = vld1_lane_f32((ptr + 2), src.val[2], lane);
11054 v.val[3] = vld1_lane_f32((ptr + 3), src.val[3], lane);
11055 return v;
11056 }
11057
11058 _NEON2SSESTORAGE poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11059 #define vld4_lane_p8 vld4_lane_u8
11060
11061 _NEON2SSESTORAGE poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11062 #define vld4_lane_p16 vld4_lane_u16
11063
11064 //******************* Store duplets *********************************************
11065 //********************************************************************************
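//vst2 stores two vectors interleaved in memory: val.val[0][0], val.val[1][0], val.val[0][1], val.val[1][1], ...
//A minimal usage sketch (illustrative only; 'dst' and 'ab' are hypothetical names):
//    uint8_t dst[32];
//    uint8x16x2_t ab;
//    ab.val[0] = vdupq_n_u8(1);
//    ab.val[1] = vdupq_n_u8(2);
//    vst2q_u8(dst, ab);                                  //dst becomes 1,2,1,2,... (a0,b0,a1,b1,...)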
11066 //void vst2q_u8(__transfersize(32) uint8_t * ptr, uint8x16x2_t val)// VST2.8 {d0, d2}, [r0]
11067 _NEON2SSE_INLINE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t* val)
11068 {
11069 uint8x16x2_t v;
11070 v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]);
11071 v.val[1] = _mm_unpackhi_epi8(val->val[0], val->val[1]);
11072 vst1q_u8 (ptr, v.val[0]);
11073 vst1q_u8 ((ptr + 16), v.val[1]);
11074 }
11075 #define vst2q_u8(ptr, val) vst2q_u8_ptr(ptr, &val)
11076
11077 //void vst2q_u16(__transfersize(16) uint16_t * ptr, uint16x8x2_t val)// VST2.16 {d0, d2}, [r0]
11078 _NEON2SSE_INLINE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t* val)
11079 {
11080 uint16x8x2_t v;
11081 v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]);
11082 v.val[1] = _mm_unpackhi_epi16(val->val[0], val->val[1]);
11083 vst1q_u16 (ptr, v.val[0]);
11084 vst1q_u16 ((ptr + 8), v.val[1]);
11085 }
11086 #define vst2q_u16(ptr, val) vst2q_u16_ptr(ptr, &val)
11087
11088 //void vst2q_u32(__transfersize(8) uint32_t * ptr, uint32x4x2_t val)// VST2.32 {d0, d2}, [r0]
11089 _NEON2SSE_INLINE void vst2q_u32_ptr(__transfersize(8) uint32_t* ptr, uint32x4x2_t* val)
11090 {
11091 uint32x4x2_t v;
11092 v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]);
11093 v.val[1] = _mm_unpackhi_epi32(val->val[0], val->val[1]);
11094 vst1q_u32 (ptr, v.val[0]);
11095 vst1q_u32 ((ptr + 4), v.val[1]);
11096 }
11097 #define vst2q_u32(ptr, val) vst2q_u32_ptr(ptr, &val)
11098
11099 //void vst2q_s8(__transfersize(32) int8_t * ptr, int8x16x2_t val); // VST2.8 {d0, d2}, [r0]
11100 _NEON2SSESTORAGE void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val);
11101 #define vst2q_s8(ptr, val) vst2q_u8((uint8_t*)(ptr), val)
11102
11103 //void vst2q_s16(__transfersize(16) int16_t * ptr, int16x8x2_t val);// VST2.16 {d0, d2}, [r0]
11104 _NEON2SSESTORAGE void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val);
11105 #define vst2q_s16(ptr, val) vst2q_u16((uint16_t*)(ptr), val)
11106
11107 //void vst2q_s32(__transfersize(8) int32_t * ptr, int32x4x2_t val);// VST2.32 {d0, d2}, [r0]
11108 _NEON2SSESTORAGE void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val);
11109 #define vst2q_s32(ptr, val) vst2q_u32((uint32_t*)(ptr), val)
11110
11111 //void vst2q_f16(__transfersize(16) __fp16 * ptr, float16x8x2_t val);// VST2.16 {d0, d2}, [r0]
11112 _NEON2SSESTORAGE void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val);
11113 // IA32 SIMD doesn't work with 16bit floats currently
11114
11115 //void vst2q_f32(__transfersize(8) float32_t * ptr, float32x4x2_t val)// VST2.32 {d0, d2}, [r0]
11116 _NEON2SSE_INLINE void vst2q_f32_ptr(__transfersize(8) float32_t* ptr, float32x4x2_t* val)
11117 {
11118 float32x4x2_t v;
11119 v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]);
11120 v.val[1] = _mm_unpackhi_ps(val->val[0], val->val[1]);
11121 vst1q_f32 (ptr, v.val[0]);
11122 vst1q_f32 ((ptr + 4), v.val[1]);
11123 }
11124 #define vst2q_f32(ptr, val) vst2q_f32_ptr(ptr, &val)
11125
11126 //void vst2q_p8(__transfersize(32) poly8_t * ptr, poly8x16x2_t val);// VST2.8 {d0, d2}, [r0]
11127 _NEON2SSESTORAGE void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val);
11128 #define vst2q_p8 vst2q_u8
11129
11130 //void vst2q_p16(__transfersize(16) poly16_t * ptr, poly16x8x2_t val);// VST2.16 {d0, d2}, [r0]
11131 _NEON2SSESTORAGE void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val);
11132 #define vst2q_p16 vst2q_u16
11133
11134 _NEON2SSESTORAGE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val);// VST2.8 {d0, d1}, [r0]
11135 _NEON2SSE_INLINE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val)
11136 {
11137 __m128i v0;
11138 v0 = _mm_unpacklo_epi8(_pM128i(val.val[0]), _pM128i(val.val[1]));
11139 vst1q_u8 (ptr, v0);
11140 }
11141
11142 _NEON2SSESTORAGE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val);// VST2.16 {d0, d1}, [r0]
11143 _NEON2SSE_INLINE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val)
11144 {
11145 __m128i v0;
11146 v0 = _mm_unpacklo_epi16(_pM128i(val.val[0]), _pM128i(val.val[1]));
11147 vst1q_u16 (ptr, v0);
11148 }
11149
11150 _NEON2SSESTORAGE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val);// VST2.32 {d0, d1}, [r0]
11151 _NEON2SSE_INLINE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val)
11152 {
11153 __m128i v0;
11154 v0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), _pM128i(val.val[1]));
11155 vst1q_u32 (ptr, v0);
11156 }
11157
11158 _NEON2SSESTORAGE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val);// VST1.64 {d0, d1}, [r0]
11159 _NEON2SSE_INLINE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val)
11160 {
11161 *(ptr) = val.val[0].m64_u64[0];
11162 *(ptr + 1) = val.val[1].m64_u64[0];
11163 }
11164
11165 _NEON2SSESTORAGE void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val);// VST2.8 {d0, d1}, [r0]
11166 #define vst2_s8(ptr, val) vst2_u8((uint8_t*) ptr, val)
11167
11168 _NEON2SSESTORAGE void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
11169 #define vst2_s16(ptr,val) vst2_u16((uint16_t*) ptr, val)
11170
11171 _NEON2SSESTORAGE void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
11172 #define vst2_s32(ptr,val) vst2_u32((uint32_t*) ptr, val)
11173
11174 _NEON2SSESTORAGE void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val);
11175 #define vst2_s64(ptr,val) vst2_u64((uint64_t*) ptr,val)
11176
11177 //void vst2_f16(__transfersize(8) __fp16 * ptr, float16x4x2_t val); // VST2.16 {d0, d1}, [r0]
11178 //current IA SIMD doesn't support float16
11179
11180 _NEON2SSESTORAGE void vst2_f32(__transfersize(4) float32_t * ptr, float32x2x2_t val); // VST2.32 {d0, d1}, [r0]
11181 _NEON2SSE_INLINE void vst2_f32(__transfersize(4) float32_t* ptr, float32x2x2_t val)
11182 {
11183 *(ptr) = val.val[0].m64_f32[0];
11184 *(ptr + 1) = val.val[1].m64_f32[0];
11185 *(ptr + 2) = val.val[0].m64_f32[1];
11186 *(ptr + 3) = val.val[1].m64_f32[1];
11187 }
11188
11189 _NEON2SSESTORAGE void vst2_p8(__transfersize(16) poly8_t * ptr, poly8x8x2_t val); // VST2.8 {d0, d1}, [r0]
11190 #define vst2_p8 vst2_u8
11191
11192 _NEON2SSESTORAGE void vst2_p16(__transfersize(8) poly16_t * ptr, poly16x4x2_t val); // VST2.16 {d0, d1}, [r0]
11193 #define vst2_p16 vst2_u16
11194
11195 //******************** Triplets store *****************************************
11196 //******************************************************************************
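//vst3 stores three vectors interleaved in memory: val.val[0][0], val.val[1][0], val.val[2][0], val.val[0][1], ...
//A minimal usage sketch (illustrative only; 'dst' and 'rgb' are hypothetical names):
//    float32_t dst[12];
//    float32x4x3_t rgb;
//    rgb.val[0] = vdupq_n_f32(1.0f);                     //R plane
//    rgb.val[1] = vdupq_n_f32(2.0f);                     //G plane
//    rgb.val[2] = vdupq_n_f32(3.0f);                     //B plane
//    vst3q_f32(dst, rgb);                                //dst becomes R,G,B,R,G,B,... (12 floats)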
11197 //void vst3q_u8(__transfersize(48) uint8_t * ptr, uint8x16x3_t val)// VST3.8 {d0, d2, d4}, [r0]
11198 _NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t* val)
11199 {
11200 uint8x16x3_t v;
11201 __m128i v0,v1,v2, cff, bldmask;
11202 _NEON2SSE_ALIGN_16 static const uint8_t mask0[16] = {0, 1, 0xff, 2, 3,0xff, 4, 5,0xff, 6,7,0xff, 8,9,0xff, 10};
11203 _NEON2SSE_ALIGN_16 static const uint8_t mask1[16] = {0, 0xff, 1, 2, 0xff, 3, 4, 0xff, 5, 6, 0xff, 7,8,0xff, 9,10};
11204 _NEON2SSE_ALIGN_16 static const uint8_t mask2[16] = {0xff, 6, 7, 0xff, 8, 9,0xff, 10, 11,0xff, 12,13,0xff, 14,15,0xff};
11205 _NEON2SSE_ALIGN_16 static const uint8_t mask2lo[16] = {0xff,0xff, 0, 0xff,0xff, 1, 0xff,0xff, 2, 0xff,0xff, 3, 0xff,0xff, 4, 0xff};
11206 _NEON2SSE_ALIGN_16 static const uint8_t mask2med[16] = {0xff, 5, 0xff, 0xff, 6, 0xff,0xff, 7, 0xff,0xff, 8, 0xff,0xff, 9, 0xff, 0xff};
11207 _NEON2SSE_ALIGN_16 static const uint8_t mask2hi[16] = {10, 0xff,0xff, 11, 0xff,0xff, 12, 0xff,0xff, 13, 0xff,0xff, 14, 0xff, 0xff, 15};
11208
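    //Each 16-byte output block is built in two steps: the interleaved a/b bytes are shuffled into place
    //leaving gaps at the 0xff positions of the mask, the val->val[2] bytes are shuffled into those gaps only,
    //and the two results are merged with a byte blend whose selector marks the 0xff positions of the mask.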
11209 v0 = _mm_unpacklo_epi8(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22
11210 v2 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //24,25, 27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46
    v1 = _mm_alignr_epi8(v2, v0, 11); //16, 18,19, 21,22, 24,25, 27,28, 30,31, 33,34, 36,37, 39
11212 v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
11213 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
11214 cff = _mm_cmpeq_epi8(v0, v0); //all ff
11215 bldmask = _mm_cmpeq_epi8(*(__m128i*)mask0, cff);
11216 v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
11217 vst1q_u8(ptr, v.val[0]);
11218 v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
11219 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
11220 bldmask = _mm_cmpeq_epi8(*(__m128i*)mask1, cff);
11221 v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
11222 vst1q_u8((ptr + 16), v.val[1]);
11223 v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
11224 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
11225 bldmask = _mm_cmpeq_epi8(*(__m128i*)mask2, cff);
11226 v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
11227 vst1q_u8((ptr + 32), v.val[2]);
11228 }
11229 #define vst3q_u8(ptr, val) vst3q_u8_ptr(ptr, &val)
11230
11231 //void vst3q_u16(__transfersize(24) uint16_t * ptr, uint16x8x3_t val)// VST3.16 {d0, d2, d4}, [r0]
11232 _NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t* val)
11233 {
11234 uint16x8x3_t v;
11235 __m128i v0,v1,v2, cff, bldmask;
11236 _NEON2SSE_ALIGN_16 static const uint8_t mask0[16] = {0,1, 2,3, 0xff,0xff, 4,5, 6,7,0xff,0xff, 8,9,10,11};
11237 _NEON2SSE_ALIGN_16 static const uint8_t mask1[16] = {0xff, 0xff, 0,1, 2,3, 0xff,0xff, 4,5, 6,7, 0xff,0xff, 8,9};
11238 _NEON2SSE_ALIGN_16 static const uint8_t mask2[16] = {6,7,0xff,0xff, 8,9,10,11, 0xff, 0xff, 12,13,14,15, 0xff, 0xff};
11239 _NEON2SSE_ALIGN_16 static const uint8_t mask2lo[16] = {0xff,0xff, 0xff,0xff, 0,1, 0xff,0xff, 0xff,0xff, 2,3, 0xff,0xff, 0xff,0xff};
11240 _NEON2SSE_ALIGN_16 static const uint8_t mask2med[16] = {4,5, 0xff,0xff,0xff,0xff, 6,7, 0xff, 0xff,0xff,0xff, 8,9, 0xff, 0xff};
11241 _NEON2SSE_ALIGN_16 static const uint8_t mask2hi[16] = {0xff, 0xff, 10,11, 0xff, 0xff, 0xff, 0xff, 12,13, 0xff, 0xff, 0xff, 0xff,14,15};
11242
11243 v0 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10
11244 v2 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //12,13, 15,16, 18,19, 21,22,
11245 v1 = _mm_alignr_epi8(v2, v0, 12); //9,10, 12,13, 15,16, 18,19
11246 v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
11247 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
11248 cff = _mm_cmpeq_epi16(v0, v0); //all ff
11249 bldmask = _mm_cmpeq_epi16(*(__m128i*)mask0, cff);
11250 v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
11251 vst1q_u16(ptr, v.val[0]);
11252 v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
11253 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
11254 bldmask = _mm_cmpeq_epi16(*(__m128i*)mask1, cff);
11255 v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
11256 vst1q_u16((ptr + 8), v.val[1]);
11257 v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
11258 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
11259 bldmask = _mm_cmpeq_epi16(*(__m128i*)mask2, cff);
11260 v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
11261 vst1q_u16((ptr + 16), v.val[2]);
11262 }
11263 #define vst3q_u16(ptr, val) vst3q_u16_ptr(ptr, &val)
11264
11265 //void vst3q_u32(__transfersize(12) uint32_t * ptr, uint32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
11266 _NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t* val)
11267 {
11268 //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3
11269 uint32x4x3_t v;
11270 __m128i tmp0, tmp1,tmp2;
11271 tmp0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //a0,b0,a1,b1
11272 tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //a2,b2,a3,b3
11273 tmp2 = _mm_unpacklo_epi32(val->val[1], val->val[2]); //b0,c0,b1,c1
11274 v.val[1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp2),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(1,0,3,2))); //b1,c1,a2,b2,
11275 v.val[2] = _mm_unpackhi_epi64(tmp1, val->val[2]); //a3,b3, c2,c3
11276 v.val[2] = _mm_shuffle_epi32(v.val[2], 2 | (0 << 2) | (1 << 4) | (3 << 6)); //c2,a3,b3,c3
11277 tmp1 = _mm_unpacklo_epi32(tmp2,val->val[0]); //b0,a0,c0,a1
11278 v.val[0] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp0),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(3,2,1,0))); //a0,b0,c0,a1,
11279
11280 vst1q_u32(ptr, v.val[0]);
11281 vst1q_u32((ptr + 4), v.val[1]);
11282 vst1q_u32((ptr + 8), v.val[2]);
11283 }
11284 #define vst3q_u32(ptr, val) vst3q_u32_ptr(ptr, &val)
11285
11286 //void vst3q_s8(__transfersize(48) int8_t * ptr, int8x16x3_t val);
11287 _NEON2SSESTORAGE void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val);
11288 #define vst3q_s8(ptr, val) vst3q_u8((uint8_t*)(ptr), val)
11289
11290 //void vst3q_s16(__transfersize(24) int16_t * ptr, int16x8x3_t val);
11291 _NEON2SSESTORAGE void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val);
11292 #define vst3q_s16(ptr, val) vst3q_u16((uint16_t*)(ptr), val)
11293
11294 //void vst3q_s32(__transfersize(12) int32_t * ptr, int32x4x3_t val);
11295 _NEON2SSESTORAGE void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val);
11296 #define vst3q_s32(ptr, val) vst3q_u32((uint32_t*)(ptr), val)
11297
11298 //void vst3q_f16(__transfersize(24) __fp16 * ptr, float16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
11299 _NEON2SSESTORAGE void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val);
11300 // IA32 SIMD doesn't work with 16bit floats currently
11301
11302 //void vst3q_f32(__transfersize(12) float32_t * ptr, float32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
11303 _NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t* val)
11304 {
11305 float32x4x3_t v;
11306 __m128 tmp0, tmp1,tmp2;
11307 tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); //a0,b0,a1,b1
11308 tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); //a2,b2,a3,b3
11309 tmp2 = _mm_unpacklo_ps(val->val[1], val->val[2]); //b0,c0,b1,c1
11310 v.val[1] = _mm_shuffle_ps(tmp2,tmp1, _MM_SHUFFLE(1,0,3,2)); //b1,c1,a2,b2,
11311 v.val[2] = _mm_movehl_ps(val->val[2],tmp1); //a3,b3, c2,c3
11312 v.val[2] = _mm_shuffle_ps(v.val[2],v.val[2], _MM_SHUFFLE(3,1,0,2)); //c2,a3,b3,c3
11313 tmp1 = _mm_unpacklo_ps(tmp2,val->val[0]); //b0,a0,c0,a1
11314 v.val[0] = _mm_shuffle_ps(tmp0,tmp1, _MM_SHUFFLE(3,2,1,0)); //a0,b0,c0,a1,
11315
11316 vst1q_f32( ptr, v.val[0]);
11317 vst1q_f32( (ptr + 4), v.val[1]);
11318 vst1q_f32( (ptr + 8), v.val[2]);
11319 }
11320 #define vst3q_f32(ptr, val) vst3q_f32_ptr(ptr, &val)
11321
11322 //void vst3q_p8(__transfersize(48) poly8_t * ptr, poly8x16x3_t val);// VST3.8 {d0, d2, d4}, [r0]
11323 _NEON2SSESTORAGE void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val);
11324 #define vst3q_p8 vst3q_u8
11325
11326 //void vst3q_p16(__transfersize(24) poly16_t * ptr, poly16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
11327 _NEON2SSESTORAGE void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val);
11328 #define vst3q_p16 vst3q_u16
11329
11330 _NEON2SSESTORAGE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0]
11331 _NEON2SSE_INLINE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val)
11332 {
11333 __m128i tmp, sh0, sh1, val0, val2;
11334 _NEON2SSE_ALIGN_16 static const int8_t mask0[16] = { 0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19, 4, 12, 20, 5};
11335 _NEON2SSE_ALIGN_16 static const int8_t mask1[16] = {13, 21, 6, 14, 22, 7, 15, 23, 0,0,0,0,0,0,0,0};
11336 _NEON2SSE_ALIGN_16 static const uint8_t mask0_sel[16] = {0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0};
11337 _NEON2SSE_ALIGN_16 static const uint8_t mask1_sel[16] = {0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0,0,0,0,0,0,0,0};
11338 tmp = _mm_unpacklo_epi64(_pM128i(val.val[0]), _pM128i(val.val[1]) );
    sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0); //for bi>15 bi is wrapped (bi-=16)
11340 val2 = _pM128i(val.val[2]);
11341 sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0);
11342 val0 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel);
11343 vst1q_u8(ptr, val0); //store as 128 bit structure
    sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1); //for bi>15 bi is wrapped (bi-=16)
11345 sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1);
11346 val2 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel);
11347 _M64((*(__m64_128*)(ptr + 16)), val2); //need it to fit into *ptr memory
11348 }
11349
11350 _NEON2SSESTORAGE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
11351 _NEON2SSE_INLINE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val)
11352 {
11353 __m128i tmp, val0, val1, val2;
11354 _NEON2SSE_ALIGN_16 static const int8_t mask0[16] = {0,1, 8,9, 16,17, 2,3, 10,11, 18,19, 4,5, 12,13};
11355 _NEON2SSE_ALIGN_16 static const int8_t mask1[16] = {20,21, 6,7, 14,15, 22,23, 0,0,0,0,0,0,0,0};
    _NEON2SSE_ALIGN_16 static const uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff}; //all-ones words take the result from val0 (the a/b shuffle), zero words from val1 (the val.val[2] shuffle)
    _NEON2SSE_ALIGN_16 static const uint16_t mask1f[8] = {0xffff, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}; //all-ones words take the result from val1 (the val.val[2] shuffle), zero words from val0
11358 tmp = _mm_unpacklo_epi64(_pM128i(val.val[0]), _pM128i(val.val[1]));
11359 val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0);
11360 val2 = _pM128i(val.val[2]);
11361 val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0);
11362 val0 = _MM_BLENDV_EPI8(val1, val0, *(__m128i*)mask0f);
11363 vst1q_u16(ptr, val0); //store as 128 bit structure
11364 val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1);
11365 val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1);
11366 val1 = _MM_BLENDV_EPI8(val0, val1, *(__m128i*)mask1f); //change the operands order
11367 _M64((*(__m64_128*)(ptr + 8)), val1); //need it to fit into *ptr memory
11368 }
11369
11370 _NEON2SSESTORAGE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val);// VST3.32 {d0, d1, d2}, [r0]
11371 _NEON2SSE_INLINE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val)
11372 {
11373 //val.val[0]:0,3,val.val[1]:1,4; val.val[2]:2,5,x,x;
11374 __m128i val0, val1;
11375 val0 = _mm_unpacklo_epi64(_pM128i(val.val[1]), _pM128i(val.val[2])); //val[0]: 1,4,2,5
11376 val0 = _mm_shuffle_epi32(val0, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //1,2,4,5
11377 val1 = _mm_srli_si128(val0, 8); //4,5, x,x
11378 _M64((*(__m64_128*)(ptr + 4)), val1);
11379 val0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), val0); //0,1,3,2
11380 val0 = _mm_shuffle_epi32(val0, 0 | (1 << 2) | (3 << 4) | (2 << 6)); //0,1,2, 3
11381 vst1q_u32(ptr, val0); //store as 128 bit structure
11382 }
11383
11384 _NEON2SSESTORAGE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val);// VST1.64 {d0, d1, d2}, [r0]
11385 _NEON2SSE_INLINE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val)
11386 {
11387 *(ptr) = val.val[0].m64_u64[0];
11388 *(ptr + 1) = val.val[1].m64_u64[0];
11389 *(ptr + 2) = val.val[2].m64_u64[0];
11390 }
11391
11392 _NEON2SSESTORAGE void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
11393 #define vst3_s8(ptr, val) vst3_u8((uint8_t*)ptr, val)
11394
11395 _NEON2SSESTORAGE void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
11396 #define vst3_s16(ptr, val) vst3_u16((uint16_t*)ptr, val)
11397
11398 _NEON2SSESTORAGE void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
11399 #define vst3_s32(ptr, val) vst3_u32((uint32_t*)ptr, val)
11400
11401 _NEON2SSESTORAGE void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
11402 #define vst3_s64(ptr, val) vst3_u64((uint64_t*)ptr, val)
11403
11404 //void vst3_f16(__transfersize(12) __fp16 * ptr, float16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
11405 _NEON2SSESTORAGE void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
11406 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
11407
11408 _NEON2SSESTORAGE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val);// VST3.32 {d0, d1, d2}, [r0]
11409 _NEON2SSE_INLINE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val)
11410 {
    //val.val[0] holds output elements 0,3; val.val[1] holds 1,4; val.val[2] holds 2,5 -> stored to memory as 0,1,2,3,4,5
11412 *(ptr) = val.val[0].m64_f32[0];
11413 *(ptr + 1) = val.val[1].m64_f32[0];
11414 *(ptr + 2) = val.val[2].m64_f32[0];
11415 *(ptr + 3) = val.val[0].m64_f32[1];
11416 *(ptr + 4) = val.val[1].m64_f32[1];
11417 *(ptr + 5) = val.val[2].m64_f32[1];
11418 }
11419
11420 _NEON2SSESTORAGE void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0]
11421 #define vst3_p8 vst3_u8
11422
11423 _NEON2SSESTORAGE void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
11424 #define vst3_p16 vst3_u16
11425
11426 //*************** Quadruples store ********************************
11427 //*********************************************************************
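//vst4 stores four vectors fully interleaved: val.val[0][0], val.val[1][0], val.val[2][0], val.val[3][0], val.val[0][1], ...
//A minimal usage sketch (illustrative only; 'dst' and 'rgba' are hypothetical names):
//    uint8_t dst[64];
//    uint8x16x4_t rgba;
//    rgba.val[0] = vdupq_n_u8(1);                        //R
//    rgba.val[1] = vdupq_n_u8(2);                        //G
//    rgba.val[2] = vdupq_n_u8(3);                        //B
//    rgba.val[3] = vdupq_n_u8(4);                        //A
//    vst4q_u8(dst, rgba);                                //dst becomes R,G,B,A,R,G,B,A,... (64 bytes)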
11428 //void vst4q_u8(__transfersize(64) uint8_t * ptr, uint8x16x4_t val)// VST4.8 {d0, d2, d4, d6}, [r0]
11429 _NEON2SSE_INLINE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t* val)
11430 {
11431 __m128i tmp1, tmp2, res;
11432 tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]); // 0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29
11433 tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]); // 2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31
11434 res = _mm_unpacklo_epi16(tmp1, tmp2); //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15
11435 vst1q_u8(ptr, res);
11436 res = _mm_unpackhi_epi16(tmp1, tmp2); //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31
11437 vst1q_u8((ptr + 16), res);
    tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //32,33, 36,37, 40,41, 44,45, 48,49, 52,53, 56,57, 60,61
    tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]); //34,35, 38,39, 42,43, 46,47, 50,51, 54,55, 58,59, 62,63
    res = _mm_unpacklo_epi16(tmp1, tmp2); //32,33, 34,35, 36,37, 38,39, 40,41, 42,43, 44,45, 46,47
    vst1q_u8((ptr + 32), res);
    res = _mm_unpackhi_epi16(tmp1, tmp2); //48,49, 50,51, 52,53, 54,55, 56,57, 58,59, 60,61, 62,63
    vst1q_u8((ptr + 48), res);
11444 }
11445 #define vst4q_u8(ptr, val) vst4q_u8_ptr(ptr, &val)
11446
11447 //void vst4q_u16(__transfersize(32) uint16_t * ptr, uint16x8x4_t val)// VST4.16 {d0, d2, d4, d6}, [r0]
11448 _NEON2SSE_INLINE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t* val)
11449 {
11450 uint16x8x4_t v;
11451 __m128i tmp1, tmp2;
11452 tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
11453 tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
11454 v.val[0] = _mm_unpacklo_epi32(tmp1, tmp2);
11455 v.val[1] = _mm_unpackhi_epi32(tmp1, tmp2);
    tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //16,17, 20,21, 24,25, 28,29
    tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]); //18,19, 22,23, 26,27, 30,31
11458 v.val[2] = _mm_unpacklo_epi32(tmp1, tmp2);
11459 v.val[3] = _mm_unpackhi_epi32(tmp1, tmp2);
11460 vst1q_u16(ptr, v.val[0]);
11461 vst1q_u16((ptr + 8), v.val[1]);
11462 vst1q_u16((ptr + 16),v.val[2]);
11463 vst1q_u16((ptr + 24), v.val[3]);
11464 }
11465 #define vst4q_u16(ptr, val) vst4q_u16_ptr(ptr, &val)
11466
11467 //void vst4q_u32(__transfersize(16) uint32_t * ptr, uint32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
11468 _NEON2SSE_INLINE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t* val)
11469 {
    uint32x4x4_t v;
11471 __m128i tmp1, tmp2;
    tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //0,1, 4,5
    tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]); //2,3, 6,7
11474 v.val[0] = _mm_unpacklo_epi64(tmp1, tmp2);
11475 v.val[1] = _mm_unpackhi_epi64(tmp1, tmp2);
    tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //8,9, 12,13
    tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]); //10,11, 14,15
11478 v.val[2] = _mm_unpacklo_epi64(tmp1, tmp2);
11479 v.val[3] = _mm_unpackhi_epi64(tmp1, tmp2);
11480 vst1q_u32(ptr, v.val[0]);
11481 vst1q_u32((ptr + 4), v.val[1]);
11482 vst1q_u32((ptr + 8), v.val[2]);
11483 vst1q_u32((ptr + 12), v.val[3]);
11484 }
11485 #define vst4q_u32(ptr, val) vst4q_u32_ptr(ptr, &val)
11486
11487 //void vst4q_s8(__transfersize(64) int8_t * ptr, int8x16x4_t val);
11488 _NEON2SSESTORAGE void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val);
11489 #define vst4q_s8(ptr, val) vst4q_u8((uint8_t*)(ptr), val)
11490
11491 //void vst4q_s16(__transfersize(32) int16_t * ptr, int16x8x4_t val);
11492 _NEON2SSESTORAGE void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val);
11493 #define vst4q_s16(ptr, val) vst4q_u16((uint16_t*)(ptr), val)
11494
11495 //void vst4q_s32(__transfersize(16) int32_t * ptr, int32x4x4_t val);
11496 _NEON2SSESTORAGE void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val);
11497 #define vst4q_s32(ptr, val) vst4q_u32((uint32_t*)(ptr), val)
11498
11499 //void vst4q_f16(__transfersize(32) __fp16 * ptr, float16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
11500 _NEON2SSESTORAGE void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val);
11501 // IA32 SIMD doesn't work with 16bit floats currently
11502
11503 //void vst4q_f32(__transfersize(16) float32_t * ptr, float32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
11504 _NEON2SSE_INLINE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t* val)
11505 {
11506 __m128 tmp3, tmp2, tmp1, tmp0;
11507 float32x4x4_t v;
11508 tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]);
11509 tmp2 = _mm_unpacklo_ps(val->val[2], val->val[3]);
11510 tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]);
11511 tmp3 = _mm_unpackhi_ps(val->val[2], val->val[3]);
11512 v.val[0] = _mm_movelh_ps(tmp0, tmp2);
11513 v.val[1] = _mm_movehl_ps(tmp2, tmp0);
11514 v.val[2] = _mm_movelh_ps(tmp1, tmp3);
11515 v.val[3] = _mm_movehl_ps(tmp3, tmp1);
11516 vst1q_f32(ptr, v.val[0]);
11517 vst1q_f32((ptr + 4), v.val[1]);
11518 vst1q_f32((ptr + 8), v.val[2]);
11519 vst1q_f32((ptr + 12), v.val[3]);
11520 }
11521 #define vst4q_f32(ptr, val) vst4q_f32_ptr(ptr, &val)
11522
11523 //void vst4q_p8(__transfersize(64) poly8_t * ptr, poly8x16x4_t val);// VST4.8 {d0, d2, d4, d6}, [r0]
11524 _NEON2SSESTORAGE void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val);
11525 #define vst4q_p8 vst4q_u8
11526
11527 //void vst4q_p16(__transfersize(32) poly16_t * ptr, poly16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
11528 _NEON2SSESTORAGE void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val);
11529 #define vst4q_p16 vst4q_s16
11530
11531 _NEON2SSESTORAGE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0]
11532 _NEON2SSE_INLINE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val)
11533 {
11534 __m128i sh0, sh1, val0, val2;
11535 sh0 = _mm_unpacklo_epi8(_pM128i(val.val[0]),_pM128i(val.val[1])); // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7,
11536 sh1 = _mm_unpacklo_epi8(_pM128i(val.val[2]),_pM128i(val.val[3])); // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7
11537 val0 = _mm_unpacklo_epi16(sh0,sh1); // a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,
11538 val2 = _mm_unpackhi_epi16(sh0,sh1); //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7
11539 vst1q_u8(ptr, val0);
11540 vst1q_u8((ptr + 16), val2);
11541 }
11542
11543 _NEON2SSESTORAGE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
11544 _NEON2SSE_INLINE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val)
11545 {
11546 __m128i sh0, sh1, val0, val2;
11547 sh0 = _mm_unpacklo_epi16(_pM128i(val.val[0]),_pM128i(val.val[1])); //a0,a1,b0,b1,c0,c1,d0,d1,
11548 sh1 = _mm_unpacklo_epi16(_pM128i(val.val[2]),_pM128i(val.val[3])); //a2,a3,b2,b3,c2,c3,d2,d3
11549 val0 = _mm_unpacklo_epi32(sh0,sh1); // a0,a1,a2,a3,b0,b1,b2,b3
11550 val2 = _mm_unpackhi_epi32(sh0,sh1); // c0,c1,c2,c3,d0,d1,d2,d3
11551 vst1q_u16(ptr, val0); //store as 128 bit structure
11552 vst1q_u16((ptr + 8), val2);
11553 }
11554
11555 _NEON2SSESTORAGE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val);// VST4.32 {d0, d1, d2, d3}, [r0]
11556 _NEON2SSE_INLINE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val)
11557 {
11558 //0,4, 1,5, 2,6, 3,7
11559 __m128i sh0, sh1, val0, val1;
11560 sh0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), _pM128i(val.val[1])); //0,1,4,5
11561 sh1 = _mm_unpacklo_epi32(_pM128i(val.val[2]), _pM128i(val.val[3])); //2,3,6,7
    val0 = _mm_unpacklo_epi64(sh0,sh1); //0,1,2,3
    val1 = _mm_unpackhi_epi64(sh0,sh1); //4,5,6,7
11564 vst1q_u32(ptr, val0); //store as 128 bit structure
11565 vst1q_u32((ptr + 4), val1);
11566 }
11567
11568 _NEON2SSESTORAGE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val);// VST1.64 {d0, d1, d2, d3}, [r0]
11569 _NEON2SSE_INLINE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val)
11570 {
11571 *(ptr) = val.val[0].m64_u64[0];
11572 *(ptr + 1) = val.val[1].m64_u64[0];
11573 *(ptr + 2) = val.val[2].m64_u64[0];
11574 *(ptr + 3) = val.val[3].m64_u64[0];
11575 }
11576
11577 //void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val) //VST4.8 {d0, d1, d2, d3}, [r0]
11578 #define vst4_s8(ptr, val) vst4_u8((uint8_t*)ptr, val)
11579
11580 //void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val) // VST4.16 {d0, d1, d2, d3}, [r0]
11581 #define vst4_s16(ptr, val) vst4_u16((uint16_t*)ptr, val)
11582
11583 //void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val) // VST4.32 {d0, d1, d2, d3}, [r0]
11584 #define vst4_s32(ptr, val) vst4_u32((uint32_t*)ptr, val)
11585
11586 //void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
11587 _NEON2SSESTORAGE void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val);
11588 #define vst4_s64(ptr, val) vst4_u64((uint64_t*)ptr, val)
11589
11590 //void vst4_f16(__transfersize(16) __fp16 * ptr, float16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
11591 _NEON2SSESTORAGE void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val);
11592 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
11593
11594 _NEON2SSESTORAGE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val);// VST4.32 {d0, d1, d2, d3}, [r0]
11595 _NEON2SSE_INLINE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val)
11596 {
11597 //0,4, 1,5, 2,6, 3,7 -> 0,1, 2,3, 4,5, 6,7
11598 *(ptr) = val.val[0].m64_f32[0];
11599 *(ptr + 1) = val.val[1].m64_f32[0];
11600 *(ptr + 2) = val.val[2].m64_f32[0];
11601 *(ptr + 3) = val.val[3].m64_f32[0];
11602 *(ptr + 4) = val.val[0].m64_f32[1];
11603 *(ptr + 5) = val.val[1].m64_f32[1];
11604 *(ptr + 6) = val.val[2].m64_f32[1];
11605 *(ptr + 7) = val.val[3].m64_f32[1];
11606 }
11607
11608 _NEON2SSESTORAGE void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0]
11609 #define vst4_p8 vst4_u8
11610
11611 _NEON2SSESTORAGE void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
11612 #define vst4_p16 vst4_u16
11613
11614 //*********** Store a lane of a vector into memory (extract given lane) for a couple of vectors *********************
11615 //********************************************************************************************************************
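//A minimal usage sketch of a 2-vector lane store (illustrative only; 'dst' and 'ab' are hypothetical names):
//    uint32_t dst[2];
//    uint32x4x2_t ab;
//    ab.val[0] = vdupq_n_u32(7);
//    ab.val[1] = vdupq_n_u32(9);
//    vst2q_lane_u32(dst, ab, 3);                         //dst[0] = lane 3 of ab.val[0], dst[1] = lane 3 of ab.val[1]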
11616 //void vst2q_lane_u16(__transfersize(2) uint16_t * ptr, uint16x8x2_t val, __constrange(0,7) int lane)// VST2.16 {d0[0], d2[0]}, [r0]
11617 _NEON2SSE_INLINE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t* val, __constrange(0,7) int lane)
11618 {
11619 vst1q_lane_s16(ptr, val->val[0], lane);
11620 vst1q_lane_s16((ptr + 1), val->val[1], lane);
11621 }
11622 #define vst2q_lane_u16(ptr, val, lane) vst2q_lane_u16_ptr(ptr, &val, lane)
11623
11624 //void vst2q_lane_u32(__transfersize(2) uint32_t * ptr, uint32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
11625 _NEON2SSE_INLINE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t* ptr, uint32x4x2_t* val, __constrange(0,3) int lane)
11626 {
11627 vst1q_lane_u32(ptr, val->val[0], lane);
11628 vst1q_lane_u32((ptr + 1), val->val[1], lane);
11629 }
11630 #define vst2q_lane_u32(ptr, val, lane) vst2q_lane_u32_ptr(ptr, &val, lane)
11631
11632 //void vst2q_lane_s16(__transfersize(2) int16_t * ptr, int16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
11633 _NEON2SSESTORAGE void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane);
11634 #define vst2q_lane_s16(ptr, val, lane) vst2q_lane_u16((uint16_t*)ptr, val, lane)
11635
11636 //void vst2q_lane_s32(__transfersize(2) int32_t * ptr, int32x4x2_t val, __constrange(0,3) int lane);// VST2.32 {d0[0], d2[0]}, [r0]
11637 _NEON2SSESTORAGE void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane);
11638 #define vst2q_lane_s32(ptr, val, lane) vst2q_lane_u32((uint32_t*)ptr, val, lane)
11639
11640 //void vst2q_lane_f16(__transfersize(2) __fp16 * ptr, float16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
11641 _NEON2SSESTORAGE void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane);
11642 //current IA SIMD doesn't support float16
11643
11644 //void vst2q_lane_f32(__transfersize(2) float32_t * ptr, float32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
11645 _NEON2SSE_INLINE void vst2q_lane_f32_ptr(__transfersize(2) float32_t* ptr, float32x4x2_t* val, __constrange(0,3) int lane)
11646 {
11647 vst1q_lane_f32(ptr, val->val[0], lane);
11648 vst1q_lane_f32((ptr + 1), val->val[1], lane);
11649 }
11650 #define vst2q_lane_f32(ptr,src,lane) vst2q_lane_f32_ptr(ptr,&src,lane)
11651
11652 //void vst2q_lane_p16(__transfersize(2) poly16_t * ptr, poly16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
11653 _NEON2SSESTORAGE void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane);
11654 #define vst2q_lane_p16 vst2q_lane_s16
11655
11656 _NEON2SSESTORAGE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
11657 _NEON2SSE_INLINE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane) // VST2.8 {d0[0], d1[0]}, [r0]
11658 {
11659 *(ptr) = val.val[0].m64_u8[lane];
11660 *(ptr + 1) = val.val[1].m64_u8[lane];
11661 }
11662
11663 _NEON2SSESTORAGE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
11664 _NEON2SSE_INLINE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane)
11665 {
11666 *(ptr) = val.val[0].m64_u16[lane];
11667 *(ptr + 1) = val.val[1].m64_u16[lane];
11668 }
11669
11670 _NEON2SSESTORAGE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
11671 _NEON2SSE_INLINE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane)
11672 {
11673 *(ptr) = val.val[0].m64_u32[lane];
11674 *(ptr + 1) = val.val[1].m64_u32[lane];
11675 }
11676
11677 _NEON2SSESTORAGE void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
11678 #define vst2_lane_s8(ptr, val, lane) vst2_lane_u8((uint8_t*)ptr, val, lane)
11679
11680 _NEON2SSESTORAGE void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
11681 #define vst2_lane_s16(ptr, val, lane) vst2_lane_u16((uint16_t*)ptr, val, lane)
11682
11683 _NEON2SSESTORAGE void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
11684 #define vst2_lane_s32(ptr, val, lane) vst2_lane_u32((uint32_t*)ptr, val, lane)
11685
11686 //void vst2_lane_f16(__transfersize(2) __fp16 * ptr, float16x4x2_t val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
11687 //current IA SIMD doesn't support float16
11688
11689 _NEON2SSESTORAGE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
11690 _NEON2SSE_INLINE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane)
11691 {
11692 *(ptr) = val.val[0].m64_f32[lane];
11693 *(ptr + 1) = val.val[1].m64_f32[lane];
11694 }
11695
11696 _NEON2SSESTORAGE void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
11697 #define vst2_lane_p8 vst2_lane_u8
11698
11699 _NEON2SSESTORAGE void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
11700 #define vst2_lane_p16 vst2_lane_u16
11701
11702 //************************* Triple lanes stores *******************************************************
11703 //*******************************************************************************************************
11704 //void vst3q_lane_u16(__transfersize(3) uint16_t * ptr, uint16x8x3_t val, __constrange(0,7) int lane)// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11705 _NEON2SSE_INLINE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t* val, __constrange(0,7) int lane)
11706 {
11707 vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val, lane);
11708 vst1q_lane_u16((ptr + 2), val->val[2], lane);
11709 }
11710 #define vst3q_lane_u16(ptr, val, lane) vst3q_lane_u16_ptr(ptr, &val, lane)
11711
11712 //void vst3q_lane_u32(__transfersize(3) uint32_t * ptr, uint32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
11713 _NEON2SSE_INLINE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t* val, __constrange(0,3) int lane)
11714 {
11715 vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val, lane);
11716 vst1q_lane_u32((ptr + 2), val->val[2], lane);
11717 }
11718 #define vst3q_lane_u32(ptr, val, lane) vst3q_lane_u32_ptr(ptr, &val, lane)
11719
11720 //void vst3q_lane_s16(__transfersize(3) int16_t * ptr, int16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11721 _NEON2SSESTORAGE void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane);
11722 #define vst3q_lane_s16(ptr, val, lane) vst3q_lane_u16((uint16_t *)ptr, val, lane)
11723
11724 //void vst3q_lane_s32(__transfersize(3) int32_t * ptr, int32x4x3_t val, __constrange(0,3) int lane);// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
11725 _NEON2SSESTORAGE void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane);
11726 #define vst3q_lane_s32(ptr, val, lane) vst3q_lane_u32((uint32_t *)ptr, val, lane)
11727
11728 //void vst3q_lane_f16(__transfersize(3) __fp16 * ptr, float16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11729 _NEON2SSESTORAGE void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane);
11730 //current IA SIMD doesn't support float16
11731
11732 //void vst3q_lane_f32(__transfersize(3) float32_t * ptr, float32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
11733 _NEON2SSE_INLINE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t* val, __constrange(0,3) int lane)
11734 {
11735 vst1q_lane_f32(ptr, val->val[0], lane);
11736 vst1q_lane_f32((ptr + 1), val->val[1], lane);
11737 vst1q_lane_f32((ptr + 2), val->val[2], lane);
11738 }
11739 #define vst3q_lane_f32(ptr,val,lane) vst3q_lane_f32_ptr(ptr,&val,lane)
11740
11741 //void vst3q_lane_p16(__transfersize(3) poly16_t * ptr, poly16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
11742 _NEON2SSESTORAGE void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane);
11743 #define vst3q_lane_p16 vst3q_lane_s16
11744
11745 _NEON2SSESTORAGE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
11746 _NEON2SSE_INLINE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane)
11747 {
11748 *(ptr) = val.val[0].m64_u8[lane];
11749 *(ptr + 1) = val.val[1].m64_u8[lane];
11750 *(ptr + 2) = val.val[2].m64_u8[lane];
11751 }
11752
11753 _NEON2SSESTORAGE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11754 _NEON2SSE_INLINE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane)
11755 {
11756 *(ptr) = val.val[0].m64_u16[lane];
11757 *(ptr + 1) = val.val[1].m64_u16[lane];
11758 *(ptr + 2) = val.val[2].m64_u16[lane];
11759 }
11760
11761 _NEON2SSESTORAGE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
11762 _NEON2SSE_INLINE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane)
11763 {
11764 *(ptr) = val.val[0].m64_u32[lane];
11765 *(ptr + 1) = val.val[1].m64_u32[lane];
11766 *(ptr + 2) = val.val[2].m64_u32[lane];
11767 }
11768
11769 _NEON2SSESTORAGE void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
11770 #define vst3_lane_s8(ptr, val, lane) vst3_lane_u8((uint8_t *)ptr, val, lane)
11771
11772 _NEON2SSESTORAGE void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11773 #define vst3_lane_s16(ptr, val, lane) vst3_lane_u16((uint16_t *)ptr, val, lane)
11774
11775 _NEON2SSESTORAGE void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
11776 #define vst3_lane_s32(ptr, val, lane) vst3_lane_u32((uint32_t *)ptr, val, lane)
11777
11778 //void vst3_lane_f16(__transfersize(3) __fp16 * ptr, float16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11779 _NEON2SSESTORAGE void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane);
11780 //current IA SIMD doesn't support float16
11781
11782 _NEON2SSESTORAGE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
11783 _NEON2SSE_INLINE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane)
11784 {
11785 *(ptr) = val.val[0].m64_f32[lane];
11786 *(ptr + 1) = val.val[1].m64_f32[lane];
11787 *(ptr + 2) = val.val[2].m64_f32[lane];
11788 }
11789
11790 _NEON2SSESTORAGE void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
11791 #define vst3_lane_p8 vst3_lane_u8
11792
11793 _NEON2SSESTORAGE void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
11794 #define vst3_lane_p16 vst3_lane_u16
11795
11796 //******************************** Quadruple lanes stores ***********************************************
11797 //*******************************************************************************************************
11798 //void vst4q_lane_u16(__transfersize(4) uint16_t * ptr, uint16x8x4_t val, __constrange(0,7) int lane)// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11799 _NEON2SSE_INLINE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t* val4, __constrange(0,7) int lane)
11800 {
11801 vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val4->val, lane);
11802 vst2q_lane_u16_ptr((ptr + 2),((uint16x8x2_t*)val4->val + 1), lane);
11803 }
11804 #define vst4q_lane_u16(ptr, val, lane) vst4q_lane_u16_ptr(ptr, &val, lane)
11805
11806 //void vst4q_lane_u32(__transfersize(4) uint32_t * ptr, uint32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11807 _NEON2SSE_INLINE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t* val4, __constrange(0,3) int lane)
11808 {
11809 vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val4->val, lane);
11810 vst2q_lane_u32_ptr((ptr + 2), ((uint32x4x2_t*)val4->val + 1), lane);
11811 }
11812 #define vst4q_lane_u32(ptr, val, lane) vst4q_lane_u32_ptr(ptr, &val, lane)
11813
11814 //void vst4q_lane_s16(__transfersize(4) int16_t * ptr, int16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11815 _NEON2SSESTORAGE void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane);
11816 #define vst4q_lane_s16(ptr,val,lane) vst4q_lane_u16((uint16_t *)ptr,val,lane)
11817
11818 //void vst4q_lane_s32(__transfersize(4) int32_t * ptr, int32x4x4_t val, __constrange(0,3) int lane);// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11819 _NEON2SSESTORAGE void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane);
11820 #define vst4q_lane_s32(ptr,val,lane) vst4q_lane_u32((uint32_t *)ptr,val,lane)
11821
11822 //void vst4q_lane_f16(__transfersize(4) __fp16 * ptr, float16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11823 _NEON2SSESTORAGE void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane);
11824 //current IA SIMD doesn't support float16
11825
11826 //void vst4q_lane_f32(__transfersize(4) float32_t * ptr, float32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11827 _NEON2SSE_INLINE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t* val, __constrange(0,3) int lane)
11828 {
11829 vst1q_lane_f32(ptr, val->val[0], lane);
11830 vst1q_lane_f32((ptr + 1), val->val[1], lane);
11831 vst1q_lane_f32((ptr + 2), val->val[2], lane);
11832 vst1q_lane_f32((ptr + 3), val->val[3], lane);
11833 }
11834 #define vst4q_lane_f32(ptr,val,lane) vst4q_lane_f32_ptr(ptr,&val,lane)
11835
11836 //void vst4q_lane_p16(__transfersize(4) poly16_t * ptr, poly16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
11837 _NEON2SSESTORAGE void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane);
11838 #define vst4q_lane_p16 vst4q_lane_u16
11839
11840 _NEON2SSESTORAGE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11841 _NEON2SSE_INLINE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane)
11842 {
11843 *(ptr) = val.val[0].m64_u8[lane];
11844 *(ptr + 1) = val.val[1].m64_u8[lane];
11845 *(ptr + 2) = val.val[2].m64_u8[lane];
11846 *(ptr + 3) = val.val[3].m64_u8[lane];
11847 }
11848
11849 _NEON2SSESTORAGE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11850 _NEON2SSE_INLINE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane)
11851 {
11852 *(ptr) = val.val[0].m64_u16[lane];
11853 *(ptr + 1) = val.val[1].m64_u16[lane];
11854 *(ptr + 2) = val.val[2].m64_u16[lane];
11855 *(ptr + 3) = val.val[3].m64_u16[lane];
11856 }
11857
11858 _NEON2SSESTORAGE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane);// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11859 _NEON2SSE_INLINE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane)
11860 {
11861 *(ptr) = val.val[0].m64_u32[lane];
11862 *(ptr + 1) = val.val[1].m64_u32[lane];
11863 *(ptr + 2) = val.val[2].m64_u32[lane];
11864 *(ptr + 3) = val.val[3].m64_u32[lane];
11865 }
11866
11867 _NEON2SSESTORAGE void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11868 #define vst4_lane_s8(ptr, val, lane) vst4_lane_u8((uint8_t*)ptr, val, lane)
11869
11870 _NEON2SSESTORAGE void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11871 #define vst4_lane_s16(ptr, val, lane) vst4_lane_u16((uint16_t*)ptr, val, lane)
11872
11873 _NEON2SSESTORAGE void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane);// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11874 #define vst4_lane_s32(ptr, val, lane) vst4_lane_u32((uint32_t*)ptr, val, lane)
11875
11876 //void vst4_lane_f16(__transfersize(4) __fp16 * ptr, float16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11877 _NEON2SSESTORAGE void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane);
11878 //current IA SIMD doesn't support float16
11879
11880 _NEON2SSESTORAGE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane); // VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11881 _NEON2SSE_INLINE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane)
11882 {
11883 *(ptr) = val.val[0].m64_f32[lane];
11884 *(ptr + 1) = val.val[1].m64_f32[lane];
11885 *(ptr + 2) = val.val[2].m64_f32[lane];
11886 *(ptr + 3) = val.val[3].m64_f32[lane];
11887 }
11888
11889 _NEON2SSESTORAGE void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11890 #define vst4_lane_p8 vst4_lane_u8
11891
11892 _NEON2SSESTORAGE void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
11893 #define vst4_lane_p16 vst4_lane_u16
11894
11895 //**************************************************************************************************
11896 //************************ Extract lanes from a vector ********************************************
11897 //**************************************************************************************************
11898 //These intrinsics extract a single lane (element) from a vector.
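//A minimal usage sketch (illustrative only; the variable names are hypothetical):
//    uint16x8_t q = vdupq_n_u16(0x1234);
//    uint16_t lane3 = vgetq_lane_u16(q, 3);   //lane3 == 0x1234, maps to _MM_EXTRACT_EPI16 (PEXTRW) below
//As with the SSE extract intrinsics, the lane argument must be a compile time constant.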
11899 _NEON2SSESTORAGE uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
11900 #define vget_lane_u8(vec, lane) vec.m64_u8[lane]
11901
11902 _NEON2SSESTORAGE uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
11903 #define vget_lane_u16(vec, lane) vec.m64_u16[lane]
11904
11905
11906 _NEON2SSESTORAGE uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
11907 #define vget_lane_u32(vec, lane) vec.m64_u32[lane]
11908
11909 _NEON2SSESTORAGE int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
11910 #define vget_lane_s8(vec, lane) vec.m64_i8[lane]
11911
11912 _NEON2SSESTORAGE int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
11913 #define vget_lane_s16(vec, lane) vec.m64_i16[lane]
11914
11915 _NEON2SSESTORAGE int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
11916 #define vget_lane_s32(vec, lane) vec.m64_i32[lane]
11917
11918 _NEON2SSESTORAGE poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
11919 #define vget_lane_p8 vget_lane_u8
11920
11921 _NEON2SSESTORAGE poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
11922 #define vget_lane_p16 vget_lane_u16
11923
11924 _NEON2SSESTORAGE float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
11925 #define vget_lane_f32(vec, lane) vec.m64_f32[lane]
11926
11927 _NEON2SSESTORAGE uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
11928 #define vgetq_lane_u8 (uint8_t) _MM_EXTRACT_EPI8
11929
11930 _NEON2SSESTORAGE uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
11931 #define vgetq_lane_u16 (uint16_t) _MM_EXTRACT_EPI16
11932
11933 _NEON2SSESTORAGE uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
11934 #define vgetq_lane_u32 (uint32_t) _MM_EXTRACT_EPI32
11935
11936 _NEON2SSESTORAGE int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
11937 #define vgetq_lane_s8 _MM_EXTRACT_EPI8
11938
11939 _NEON2SSESTORAGE int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
11940 #define vgetq_lane_s16 _MM_EXTRACT_EPI16
11941
11942 _NEON2SSESTORAGE int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
11943 #define vgetq_lane_s32 _MM_EXTRACT_EPI32
11944
11945 _NEON2SSESTORAGE poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
11946 #define vgetq_lane_p8 vgetq_lane_u8
11947
11948 _NEON2SSESTORAGE poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
11949 #define vgetq_lane_p16 vgetq_lane_u16
11950
11951 _NEON2SSESTORAGE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
11952 _NEON2SSE_INLINE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane)
11953 {
11954 int32_t ilane;
11955 ilane = _MM_EXTRACT_PS(vec,lane);
11956 return *(float*)&ilane;
11957 }
11958
11959 _NEON2SSESTORAGE int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
11960 #define vget_lane_s64(vec, lane) vec.m64_i64[0]
11961
11962 _NEON2SSESTORAGE uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
11963 #define vget_lane_u64(vec, lane) vec.m64_u64[0]
11964
11965
11966 _NEON2SSESTORAGE int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
11967 #define vgetq_lane_s64 _MM_EXTRACT_EPI64
11968
11969 _NEON2SSESTORAGE uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
11970 #define vgetq_lane_u64 (uint64_t) _MM_EXTRACT_EPI64
11971
11972 // ***************** Set lanes within a vector ********************************************
11973 // **************************************************************************************
11974 //These intrinsics set a single lane (element) within a vector.
11975 //same functions as vld1_lane_xx ones, but take the value to be set directly.
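//A minimal usage sketch (illustrative only; the variable names are hypothetical):
//    int32x4_t q = vdupq_n_s32(0);
//    q = vsetq_lane_s32(-5, q, 2);            //q now holds {0, 0, -5, 0}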
11976
11977 _NEON2SSESTORAGE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
11978 _NEON2SSE_INLINE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane)
11979 {
11980 uint8_t val;
11981 val = value;
11982 return vld1_lane_u8(&val, vec, lane);
11983 }
11984
11985 _NEON2SSESTORAGE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
11986 _NEON2SSE_INLINE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane)
11987 {
11988 uint16_t val;
11989 val = value;
11990 return vld1_lane_u16(&val, vec, lane);
11991 }
11992
11993 _NEON2SSESTORAGE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
11994 _NEON2SSE_INLINE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane)
11995 {
11996 uint32_t val;
11997 val = value;
11998 return vld1_lane_u32(&val, vec, lane);
11999 }
12000
12001 _NEON2SSESTORAGE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
12002 _NEON2SSE_INLINE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane)
12003 {
12004 int8_t val;
12005 val = value;
12006 return vld1_lane_s8(&val, vec, lane);
12007 }
12008
12009 _NEON2SSESTORAGE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
12010 _NEON2SSE_INLINE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane)
12011 {
12012 int16_t val;
12013 val = value;
12014 return vld1_lane_s16(&val, vec, lane);
12015 }
12016
12017 _NEON2SSESTORAGE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
12018 _NEON2SSE_INLINE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane)
12019 {
12020 int32_t val;
12021 val = value;
12022 return vld1_lane_s32(&val, vec, lane);
12023 }
12024
12025 _NEON2SSESTORAGE poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
12026 #define vset_lane_p8 vset_lane_u8
12027
12028 _NEON2SSESTORAGE poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
12029 #define vset_lane_p16 vset_lane_u16
12030
12031 _NEON2SSESTORAGE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
12032 _NEON2SSE_INLINE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane)
12033 {
12034 float32_t val;
12035 val = value;
12036 return vld1_lane_f32(&val, vec, lane);
12037 }
12038
12039 _NEON2SSESTORAGE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
12040 _NEON2SSE_INLINE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane)
12041 {
12042 uint8_t val;
12043 val = value;
12044 return vld1q_lane_u8(&val, vec, lane);
12045 }
12046
12047 _NEON2SSESTORAGE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
12048 _NEON2SSE_INLINE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane)
12049 {
12050 uint16_t val;
12051 val = value;
12052 return vld1q_lane_u16(&val, vec, lane);
12053 }
12054
12055 _NEON2SSESTORAGE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
12056 _NEON2SSE_INLINE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane)
12057 {
12058 uint32_t val;
12059 val = value;
12060 return vld1q_lane_u32(&val, vec, lane);
12061 }
12062
12063 _NEON2SSESTORAGE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
12064 _NEON2SSE_INLINE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane)
12065 {
12066 int8_t val;
12067 val = value;
12068 return vld1q_lane_s8(&val, vec, lane);
12069 }
12070
12071 _NEON2SSESTORAGE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
12072 _NEON2SSE_INLINE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane)
12073 {
12074 int16_t val;
12075 val = value;
12076 return vld1q_lane_s16(&val, vec, lane);
12077 }
12078
12079 _NEON2SSESTORAGE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
12080 _NEON2SSE_INLINE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane)
12081 {
12082 int32_t val;
12083 val = value;
12084 return vld1q_lane_s32(&val, vec, lane);
12085 }
12086
12087 _NEON2SSESTORAGE poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
12088 #define vsetq_lane_p8 vsetq_lane_u8
12089
12090 _NEON2SSESTORAGE poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
12091 #define vsetq_lane_p16 vsetq_lane_u16
12092
12093 _NEON2SSESTORAGE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
12094 _NEON2SSE_INLINE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane)
12095 {
12096 float32_t val;
12097 val = value;
12098 return vld1q_lane_f32(&val, vec, lane);
12099 }
12100
12101 _NEON2SSESTORAGE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
12102 _NEON2SSE_INLINE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane)
12103 {
12104 int64_t val;
12105 val = value;
12106 return vld1_lane_s64(&val, vec, lane);
12107 }
12108
12109 _NEON2SSESTORAGE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
12110 _NEON2SSE_INLINE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane)
12111 {
12112 uint64_t val;
12113 val = value;
12114 return vld1_lane_u64(&val, vec, lane);
12115 }
12116
12117 _NEON2SSESTORAGE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
12118 _NEON2SSE_INLINE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane)
12119 {
12120 int64_t val;
12121 val = value;
12122 return vld1q_lane_s64(&val, vec, lane);
12123 }
12124
12125 _NEON2SSESTORAGE uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
12126 #define vsetq_lane_u64 vsetq_lane_s64
12127
12128 // *******************************************************************************
12129 // **************** Initialize a vector from bit pattern ***************************
12130 // *******************************************************************************
12131 //These intrinsics create a vector from a literal bit pattern.
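//The 64-bit pattern is reinterpreted, not converted. A minimal sketch (illustrative only):
//    int8x8_t v = vcreate_s8(0x0807060504030201ULL);   //on little-endian IA: v.m64_i8[0] == 1, v.m64_i8[7] == 8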
12132 _NEON2SSESTORAGE int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
12133 _NEON2SSE_INLINE int8x8_t vcreate_s8(uint64_t a)
12134 {
12135 return (*(__m64_128*)&(a)); //a macro could not be used here due to possible immediate value usage
12136 }
12137
12138 _NEON2SSESTORAGE int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
12139 #define vcreate_s16 vcreate_s8
12140
12141 _NEON2SSESTORAGE int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
12142 #define vcreate_s32 vcreate_s8
12143
12144 _NEON2SSESTORAGE float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
12145 //no IA32 SIMD available
12146
12147 _NEON2SSESTORAGE float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
12148 _NEON2SSE_INLINE float32x2_t vcreate_f32(uint64_t a)
12149 {
12150 return (*(__m64_128*)&(a)); //a macro could not be used here due to possible immediate value usage
12151 }
12152
12153 _NEON2SSESTORAGE uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
12154 #define vcreate_u8 vcreate_s8
12155
12156 _NEON2SSESTORAGE uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
12157 #define vcreate_u16 vcreate_s16
12158
12159 _NEON2SSESTORAGE uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
12160 #define vcreate_u32 vcreate_s32
12161
12162 _NEON2SSESTORAGE uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
12163 #define vcreate_u64 vcreate_s8
12164
12165
12166 _NEON2SSESTORAGE poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
12167 #define vcreate_p8 vcreate_u8
12168
12169 _NEON2SSESTORAGE poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
12170 #define vcreate_p16 vcreate_u16
12171
12172 _NEON2SSESTORAGE int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
12173 #define vcreate_s64 vcreate_u64
12174
12175 //********************* Set all lanes to same value ********************************
12176 //*********************************************************************************
12177 //These intrinsics set all lanes to the same value.
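//A minimal usage sketch (illustrative only; the variable names are hypothetical):
//    uint8x16_t ones = vdupq_n_u8(1);      //one _mm_set1_epi8 call
//    float32x2_t half = vdup_n_f32(0.5f);  //the 64-bit versions below are serial, hence the performance warnings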
12178 _NEON2SSESTORAGE uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
12179 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vdup_n_u8(uint8_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12180 {
12181 uint8x8_t res;
12182 int i;
12183 for (i = 0; i<8; i++) {
12184 res.m64_u8[i] = value;
12185 }
12186 return res;
12187 }
12188
12189 _NEON2SSESTORAGE uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
12190 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vdup_n_u16(uint16_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12191 {
12192 uint16x4_t res;
12193 int i;
12194 for (i = 0; i<4; i++) {
12195 res.m64_u16[i] = value;
12196 }
12197 return res;
12198 }
12199
12200 _NEON2SSESTORAGE uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
12201 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vdup_n_u32(uint32_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12202 {
12203 uint32x2_t res;
12204 res.m64_u32[0] = value;
12205 res.m64_u32[1] = value;
12206 return res;
12207 }
12208
12209 _NEON2SSESTORAGE int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
12210 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vdup_n_s8(int8_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12211 {
12212 int8x8_t res;
12213 int i;
12214 for (i = 0; i<8; i++) {
12215 res.m64_i8[i] = value;
12216 }
12217 return res;
12218 }
12219
12220 _NEON2SSESTORAGE int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
12221 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vdup_n_s16(int16_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12222 {
12223 int16x4_t res;
12224 int i;
12225 for (i = 0; i<4; i++) {
12226 res.m64_i16[i] = value;
12227 }
12228 return res;
12229 }
12230
12231 _NEON2SSESTORAGE int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
12232 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vdup_n_s32(int32_t value), _NEON2SSE_REASON_SLOW_SERIAL)
12233 {
12234 int32x2_t res;
12235 res.m64_i32[0] = value;
12236 res.m64_i32[1] = value;
12237 return res;
12238 }
12239
12240 _NEON2SSESTORAGE poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
12241 #define vdup_n_p8 vdup_n_u8
12242
12243 _NEON2SSESTORAGE poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
12244 #define vdup_n_p16 vdup_n_s16
12245
12246 _NEON2SSESTORAGE float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
12247 _NEON2SSE_INLINE float32x2_t vdup_n_f32(float32_t value)
12248 {
12249 float32x2_t res;
12250 res.m64_f32[0] = value;
12251 res.m64_f32[1] = value;
12252 return res;
12253 }
12254
12255 _NEON2SSESTORAGE uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
12256 #define vdupq_n_u8(value) _mm_set1_epi8((uint8_t) (value))
12257
12258 _NEON2SSESTORAGE uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
12259 #define vdupq_n_u16(value) _mm_set1_epi16((uint16_t) (value))
12260
12261 _NEON2SSESTORAGE uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
12262 #define vdupq_n_u32(value) _mm_set1_epi32((uint32_t) (value))
12263
12264 _NEON2SSESTORAGE int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
12265 #define vdupq_n_s8 _mm_set1_epi8
12266
12267 _NEON2SSESTORAGE int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
12268 #define vdupq_n_s16 _mm_set1_epi16
12269
12270 _NEON2SSESTORAGE int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
12271 #define vdupq_n_s32 _mm_set1_epi32
12272
12273 _NEON2SSESTORAGE poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
12274 #define vdupq_n_p8 vdupq_n_u8
12275
12276 _NEON2SSESTORAGE poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
12277 #define vdupq_n_p16 vdupq_n_u16
12278
12279 _NEON2SSESTORAGE float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
12280 #define vdupq_n_f32 _mm_set1_ps
12281
12282 _NEON2SSESTORAGE int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
12283 _NEON2SSE_INLINE int64x1_t vdup_n_s64(int64_t value)
12284 {
12285 int64x1_t res;
12286 res.m64_i64[0] = value;
12287 return res;
12288 }
12289
12290 _NEON2SSESTORAGE uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
12291 _NEON2SSE_INLINE uint64x1_t vdup_n_u64(uint64_t value)
12292 {
12293 uint64x1_t res;
12294 res.m64_u64[0] = value;
12295 return res;
12296 }
12297
12298 _NEON2SSESTORAGE int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
12299 _NEON2SSE_INLINE int64x2_t vdupq_n_s64(int64_t value)
12300 {
12301 _NEON2SSE_ALIGN_16 int64_t value2[2] = {value, value}; //value may be an immediate
12302 return LOAD_SI128(value2);
12303 }
12304
12305 _NEON2SSESTORAGE uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
12306 _NEON2SSE_INLINE uint64x2_t vdupq_n_u64(uint64_t value)
12307 {
12308 _NEON2SSE_ALIGN_16 uint64_t val[2] = {value, value}; //value may be an immediate
12309 return LOAD_SI128(val);
12310 }
12311
12312 //**** Set all lanes to same value ************************
12313 //Same functions as above - just aliases.********************
12314 //Probably they reflect the fact that the 128-bit versions of these functions use the VMOV instruction **********
12315 _NEON2SSESTORAGE uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
12316 #define vmov_n_u8 vdup_n_s8
12317
12318 _NEON2SSESTORAGE uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
12319 #define vmov_n_u16 vdup_n_s16
12320
12321 _NEON2SSESTORAGE uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
12322 #define vmov_n_u32 vdup_n_u32
12323
12324 _NEON2SSESTORAGE int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
12325 #define vmov_n_s8 vdup_n_s8
12326
12327 _NEON2SSESTORAGE int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
12328 #define vmov_n_s16 vdup_n_s16
12329
12330 _NEON2SSESTORAGE int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
12331 #define vmov_n_s32 vdup_n_s32
12332
12333 _NEON2SSESTORAGE poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
12334 #define vmov_n_p8 vdup_n_u8
12335
12336 _NEON2SSESTORAGE poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
12337 #define vmov_n_p16 vdup_n_s16
12338
12339 _NEON2SSESTORAGE float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
12340 #define vmov_n_f32 vdup_n_f32
12341
12342 _NEON2SSESTORAGE uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
12343 #define vmovq_n_u8 vdupq_n_u8
12344
12345 _NEON2SSESTORAGE uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
12346 #define vmovq_n_u16 vdupq_n_s16
12347
12348 _NEON2SSESTORAGE uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
12349 #define vmovq_n_u32 vdupq_n_u32
12350
12351 _NEON2SSESTORAGE int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
12352 #define vmovq_n_s8 vdupq_n_s8
12353
12354 _NEON2SSESTORAGE int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
12355 #define vmovq_n_s16 vdupq_n_s16
12356
12357 _NEON2SSESTORAGE int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
12358 #define vmovq_n_s32 vdupq_n_s32
12359
12360 _NEON2SSESTORAGE poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
12361 #define vmovq_n_p8 vdupq_n_u8
12362
12363 _NEON2SSESTORAGE poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
12364 #define vmovq_n_p16 vdupq_n_s16
12365
12366 _NEON2SSESTORAGE float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
12367 #define vmovq_n_f32 vdupq_n_f32
12368
12369 _NEON2SSESTORAGE int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
12370 #define vmov_n_s64 vdup_n_s64
12371
12372 _NEON2SSESTORAGE uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
12373 #define vmov_n_u64 vdup_n_u64
12374
12375 _NEON2SSESTORAGE int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
12376 #define vmovq_n_s64 vdupq_n_s64
12377
12378 _NEON2SSESTORAGE uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
12379 #define vmovq_n_u64 vdupq_n_u64
12380
12381 //**************Set all lanes to the value of one lane of a vector *************
12382 //****************************************************************************
12383 //here a shuffle is a better solution than lane extraction followed by a set1 function
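//For example (illustrative, d being a hypothetical uint16x4_t variable): vdupq_lane_u16(d, 3) below
//is a single PSHUFB with the byte mask {6,7, 6,7, ...} rather than a lane extract followed by _mm_set1_epi16.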
12384 _NEON2SSESTORAGE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
12385 _NEON2SSE_INLINE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane)
12386 {
12387 uint8x8_t res;
12388 uint8_t valane;
12389 int i = 0;
12390 valane = vec.m64_u8[lane];
12391 for (i = 0; i<8; i++) {
12392 res.m64_u8[i] = valane;
12393 }
12394 return res;
12395 }
12396
12397 _NEON2SSESTORAGE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
12398 _NEON2SSE_INLINE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane)
12399 {
12400 uint16x4_t res;
12401 uint16_t valane;
12402 valane = vec.m64_u16[lane];
12403 res.m64_u16[0] = valane;
12404 res.m64_u16[1] = valane;
12405 res.m64_u16[2] = valane;
12406 res.m64_u16[3] = valane;
12407 return res;
12408 }
12409
12410 _NEON2SSESTORAGE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
12411 _NEON2SSE_INLINE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane)
12412 {
12413 uint32x2_t res;
12414 res.m64_u32[0] = vec.m64_u32[lane];
12415 res.m64_u32[1] = res.m64_u32[0];
12416 return res;
12417 }
12418
12419 _NEON2SSESTORAGE int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
12420 #define vdup_lane_s8 vdup_lane_u8
12421
12422 _NEON2SSESTORAGE int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
12423 #define vdup_lane_s16 vdup_lane_u16
12424
12425 _NEON2SSESTORAGE int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
12426 #define vdup_lane_s32 vdup_lane_u32
12427
12428 _NEON2SSESTORAGE poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
12429 #define vdup_lane_p8 vdup_lane_u8
12430
12431 _NEON2SSESTORAGE poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
12432 #define vdup_lane_p16 vdup_lane_s16
12433
12434 _NEON2SSESTORAGE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
12435 _NEON2SSE_INLINE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane)
12436 {
12437 float32x2_t res;
12438 res.m64_f32[0] = vec.m64_f32[lane];
12439 res.m64_f32[1] = res.m64_f32[0];
12440 return res;
12441 }
12442
12443 _NEON2SSESTORAGE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
12444 _NEON2SSE_INLINE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) // VDUP.8 q0,d0[0]
12445 {
12446 const int8_t lane8 = (int8_t) lane;
12447 _NEON2SSE_ALIGN_16 int8_t lanemask8[16] = {lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8};
12448 return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*) lanemask8);
12449 }
12450
12451 _NEON2SSESTORAGE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
12452 _NEON2SSE_INLINE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) // VDUP.16 q0,d0[0]
12453 {
12454 //we could use 8bit shuffle for 16 bit as well
12455 const int8_t lane16 = ((int8_t) lane) << 1;
12456 const int8_t lane16_1 = lane16 + 1;
12457 _NEON2SSE_ALIGN_16 int8_t lanemask_e16[16] = {lane16, lane16_1, lane16, lane16_1, lane16, lane16_1, lane16, lane16_1,
12458 lane16, lane16_1, lane16, lane16_1, lane16, lane16_1, lane16, lane16_1};
12459 return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*)lanemask_e16);
12460 }
12461
12462 _NEON2SSESTORAGE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
12463 _NEON2SSE_INLINE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane)
12464 {
12465 //need to use a function, not a macro, to make it gcc friendly and to meet the immediate constant requirement for _mm_shuffle_epi32
12466 if (lane == 1)
12467 return _mm_shuffle_epi32 (_pM128i(vec), (1 | (1 << 2) | (1 << 4) | (1 << 6)) );
12468 else
12469 return _mm_shuffle_epi32 (_pM128i(vec), 0);
12470 }
12471
12472 _NEON2SSESTORAGE int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
12473 #define vdupq_lane_s8 vdupq_lane_u8
12474
12475 _NEON2SSESTORAGE int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
12476 #define vdupq_lane_s16 vdupq_lane_u16
12477
12478 _NEON2SSESTORAGE int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
12479 #define vdupq_lane_s32 vdupq_lane_u32
12480
12481 _NEON2SSESTORAGE poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
12482 #define vdupq_lane_p8 vdupq_lane_u8
12483
12484 _NEON2SSESTORAGE poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
12485 #define vdupq_lane_p16 vdupq_lane_s16
12486
12487 _NEON2SSESTORAGE float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
12488 #define vdupq_lane_f32(vec, lane) _mm_load1_ps((vec.m64_f32 + lane))
12489
12490 _NEON2SSESTORAGE int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
12491 #define vdup_lane_s64(vec,lane) vec
12492
12493 _NEON2SSESTORAGE uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
12494 #define vdup_lane_u64(vec,lane) vec
12495
12496 _NEON2SSESTORAGE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
12497 _NEON2SSE_INLINE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane)
12498 {
12499 __m128i vec128;
12500 vec128 = _pM128i(vec);
12501 return _mm_unpacklo_epi64(vec128,vec128);
12502 }
12503
12504 _NEON2SSESTORAGE uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
12505 #define vdupq_lane_u64 vdupq_lane_s64
12506
12507 // ********************************************************************
12508 // ******************** Combining vectors *****************************
12509 // ********************************************************************
12510 //These intrinsics join two 64-bit vectors into a single 128-bit vector.
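//A minimal sketch (illustrative; low and high are any 64-bit vectors of the matching type):
//    int8x16_t q = vcombine_s8(low, high);   //low -> bits 0..63, high -> bits 64..127, one _mm_unpacklo_epi64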
12511 _NEON2SSESTORAGE int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
12512 _NEON2SSE_INLINE int8x16_t vcombine_s8(int8x8_t low, int8x8_t high)
12513 {
12514 return _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) );
12515 }
12516
12517 _NEON2SSESTORAGE int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
12518 #define vcombine_s16 vcombine_s8
12519
12520 _NEON2SSESTORAGE int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
12521 #define vcombine_s32 vcombine_s8
12522
12523 _NEON2SSESTORAGE int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
12524 #define vcombine_s64 vcombine_s8
12525
12526 _NEON2SSESTORAGE float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
12527 //current IA SIMD doesn't support float16
12528
12529 _NEON2SSESTORAGE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
12530 _NEON2SSE_INLINE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high)
12531 {
12532 __m128i res;
12533 res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high) );
12534 return _M128(res);
12535 }
12536
12537 _NEON2SSESTORAGE uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
12538 #define vcombine_u8 vcombine_s8
12539
12540 _NEON2SSESTORAGE uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
12541 #define vcombine_u16 vcombine_s16
12542
12543 _NEON2SSESTORAGE uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
12544 #define vcombine_u32 vcombine_s32
12545
12546 _NEON2SSESTORAGE uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
12547 #define vcombine_u64 vcombine_s64
12548
12549 _NEON2SSESTORAGE poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
12550 #define vcombine_p8 vcombine_u8
12551
12552 _NEON2SSESTORAGE poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
12553 #define vcombine_p16 vcombine_u16
12554
12555 //**********************************************************************
12556 //************************* Splitting vectors **************************
12557 //**********************************************************************
12558 //**************** Get high part ******************************************
12559 //These intrinsics split a 128-bit vector into two component 64-bit vectors
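//A minimal sketch (illustrative, q being a hypothetical int8x16_t variable):
//    int8x8_t lo = vget_low_s8(q);    //bits 0..63 of q
//    int8x8_t hi = vget_high_s8(q);   //bits 64..127 of q, one _mm_unpackhi_epi64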
12560 _NEON2SSESTORAGE int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
12561 _NEON2SSE_INLINE int8x8_t vget_high_s8(int8x16_t a)
12562 {
12563 int8x8_t res64;
12564 __m128i res;
12565 res = _mm_unpackhi_epi64(a,a); //SSE2
12566 return64(res);
12567 }
12568
12569 _NEON2SSESTORAGE int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
12570 _NEON2SSE_INLINE int16x4_t vget_high_s16(int16x8_t a)
12571 {
12572 int16x4_t res64;
12573 __m128i res;
12574 res = _mm_unpackhi_epi64(a,a); //SSE2
12575 return64(res);
12576 }
12577
12578 _NEON2SSESTORAGE int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
12579 _NEON2SSE_INLINE int32x2_t vget_high_s32(int32x4_t a)
12580 {
12581 int32x2_t res64;
12582 __m128i res;
12583 res = _mm_unpackhi_epi64(a,a); //SSE2
12584 return64(res);
12585 }
12586
12587 _NEON2SSESTORAGE int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
12588 _NEON2SSE_INLINE int64x1_t vget_high_s64(int64x2_t a)
12589 {
12590 int64x1_t res64;
12591 __m128i res;
12592 res = _mm_unpackhi_epi64(a,a); //SSE2
12593 return64(res);
12594 }
12595
12596 _NEON2SSESTORAGE float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
12597 // IA32 SIMD doesn't work with 16-bit floats currently
12598
12599 _NEON2SSESTORAGE float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
12600 _NEON2SSE_INLINE float32x2_t vget_high_f32(float32x4_t a)
12601 {
12602 __m128i res;
12603 __m64_128 res64;
12604 res = _mm_unpackhi_epi64(_M128i(a),_M128i(a));
12605 return64(res);
12606 }
12607
12608 _NEON2SSESTORAGE uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
12609 #define vget_high_u8 vget_high_s8
12610
12611 _NEON2SSESTORAGE uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
12612 #define vget_high_u16 vget_high_s16
12613
12614 _NEON2SSESTORAGE uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
12615 #define vget_high_u32 vget_high_s32
12616
12617 _NEON2SSESTORAGE uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
12618 #define vget_high_u64 vget_high_s64
12619
12620 _NEON2SSESTORAGE poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
12621 #define vget_high_p8 vget_high_u8
12622
12623 _NEON2SSESTORAGE poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
12624 #define vget_high_p16 vget_high_u16
12625
12626 //********************** Get low part **********************
12627 //**********************************************************
12628 _NEON2SSESTORAGE int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
12629 _NEON2SSE_INLINE int8x8_t vget_low_s8(int8x16_t a) // VMOV d0,d0
12630 {
12631 int8x8_t res64;
12632 return64(a);
12633 }
12634
12635 _NEON2SSESTORAGE int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
12636 _NEON2SSE_INLINE int16x4_t vget_low_s16(int16x8_t a) // VMOV d0,d0
12637 {
12638 int16x4_t res64;
12639 return64(a);
12640 }
12641
12642 _NEON2SSESTORAGE int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
12643 _NEON2SSE_INLINE int32x2_t vget_low_s32(int32x4_t a) // VMOV d0,d0
12644 {
12645 int32x2_t res64;
12646 return64(a);
12647 }
12648
12649 _NEON2SSESTORAGE int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
12650 _NEON2SSE_INLINE int64x1_t vget_low_s64(int64x2_t a) // VMOV d0,d0
12651 {
12652 int64x1_t res64;
12653 return64 (a);
12654 }
12655
12656 _NEON2SSESTORAGE float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
12657 // IA32 SIMD doesn't work with 16-bit floats currently
12658
12659 _NEON2SSESTORAGE float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
12660 _NEON2SSE_INLINE float32x2_t vget_low_f32(float32x4_t a)
12661 {
12662 float32x2_t res64;
12663 _M64f(res64, a);
12664 return res64;
12665 }
12666
12667 _NEON2SSESTORAGE uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
12668 #define vget_low_u8 vget_low_s8
12669
12670 _NEON2SSESTORAGE uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
12671 #define vget_low_u16 vget_low_s16
12672
12673 _NEON2SSESTORAGE uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
12674 #define vget_low_u32 vget_low_s32
12675
12676 _NEON2SSESTORAGE uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
12677 #define vget_low_u64 vget_low_s64
12678
12679 _NEON2SSESTORAGE poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
12680 #define vget_low_p8 vget_low_u8
12681
12682 _NEON2SSESTORAGE poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
12683 #define vget_low_p16 vget_low_s16
12684
12685 //**************************************************************************
12686 //************************ Converting vectors **********************************
12687 //**************************************************************************
12688 //************* Convert from float ***************************************
12689 // the rounding mode may need to be set via _MM_SET_ROUNDING_MODE(x) accordingly
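//For example (illustrative): ARM VCVT truncates toward zero, while _mm_cvtps_epi32 used by vcvt_s32_f32 below
//obeys the current MXCSR rounding mode, so a caller relying on NEON semantics may need
//    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
//before the conversion (the 128-bit vcvtq_s32_f32 uses _mm_cvttps_epi32 and truncates regardless).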
12690 _NEON2SSESTORAGE int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
12691 _NEON2SSE_INLINE int32x2_t vcvt_s32_f32(float32x2_t a)
12692 {
12693 int32x2_t res64;
12694 __m128i res;
12695 res = _mm_cvtps_epi32(_pM128(a)); //use low 64 bits of result only
12696 return64(res);
12697 }
12698
12699 _NEON2SSESTORAGE uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
12700 _NEON2SSE_INLINE uint32x2_t vcvt_u32_f32(float32x2_t a)
12701 {
12702 uint32x2_t res64;
12703 __m128i res;
12704 res = vcvtq_u32_f32(_pM128(a));
12705 return64(res);
12706 }
12707
12708 _NEON2SSESTORAGE int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
12709 _NEON2SSE_INLINE int32x4_t vcvtq_s32_f32(float32x4_t a)
12710 {
12711 __m128 dif;
12712 __m128i res;
12713 //_mm_cvttps_epi32 incorrectly treats the case a >= 2.14748364e+009, therefore special processing is necessary
12714 _NEON2SSE_ALIGN_16 static const float32_t fmax[] = { 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f };
12715 dif = _mm_cmpge_ps(a, *(__m128*)fmax);
12716 res = _mm_cvttps_epi32(a);
12717 return _mm_xor_si128(res, _M128i(dif));
12718 }
12719
12720 _NEON2SSESTORAGE uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
12721 _NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a) // VCVT.U32.F32 q0, q0
12722 {
12723 //no single-instruction SSE solution, but it can be implemented as follows:
12724 __m128i res1, res2, zero, mask;
12725 __m128 max, min, dif;
12726 _NEON2SSE_ALIGN_16 static const float32_t fmax[] = { 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f };
12727 _NEON2SSE_ALIGN_16 static const float32_t fmax_unsigned[] = { 4.29496729e+009f, 4.29496729e+009f, 4.29496729e+009f, 4.29496729e+009f };
12728 zero = _mm_setzero_si128();
12729 mask = _mm_cmpgt_epi32(_M128i(a), zero);
12730 min = _mm_and_ps(_M128(mask), a);
12731 max = _mm_min_ps(min, *(__m128*)fmax_unsigned); //clamped in 0 - 4.29496729+009
12732
12733 dif = _mm_sub_ps(max, *(__m128*)fmax);
12734 mask = _mm_cmpgt_epi32(_M128i(dif),zero);
12735 dif = _mm_and_ps(_M128(mask), dif);
12736
12737 res1 = _mm_cvttps_epi32(dif);
12738 res2 = vcvtq_s32_f32(max);
12739 return _mm_add_epi32(res1, res2);
12740 }
12741
12742 // ***** Convert to the fixed point with the number of fraction bits specified by b ***********
12743 //*************************************************************************************************
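//For example (illustrative): with b = 8 fraction bits the result is value*2^8 truncated toward zero,
//so 1.5f becomes 384 and -1.5f becomes -384:
//    int32x4_t q = vcvtq_n_s32_f32(vdupq_n_f32(1.5f), 8);   //each lane == 384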
12744 _NEON2SSESTORAGE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
12745 _NEON2SSE_INLINE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b)
12746 {
12747 int32x2_t res64;
12748 return64(vcvtq_n_s32_f32(_pM128(a),b));
12749 }
12750
12751 _NEON2SSESTORAGE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
12752 _NEON2SSE_INLINE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b)
12753 {
12754 uint32x2_t res;
12755 float convconst;
12756 convconst = (float)((uint32_t)1 << b);
12757 res.m64_u32[0] = (uint32_t) (a.m64_f32[0] * convconst);
12758 res.m64_u32[1] = (uint32_t) (a.m64_f32[1] * convconst);
12759 return res;
12760 }
12761
12762 _NEON2SSESTORAGE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
12763 _NEON2SSE_INLINE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b)
12764 {
12765 float convconst;
12766 _NEON2SSE_ALIGN_16 static const uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
12767 __m128 cconst128;
12768 __m128i mask, res;
12769 convconst = (float)((uint32_t)1 << b);
12770 cconst128 = vdupq_n_f32(convconst);
12771 res = _mm_cvttps_epi32(_mm_mul_ps(a,cconst128));
12772 mask = _mm_cmpeq_epi32 (res, *(__m128i*)cmask);
12773 return _mm_xor_si128 (res, mask); //res saturated for 0x80000000
12774 }
12775
12776 _NEON2SSESTORAGE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
12777 _NEON2SSE_INLINE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b)
12778 {
12779 float convconst;
12780 __m128 cconst128;
12781 convconst = (float)((uint32_t)1 << b);
12782 cconst128 = vdupq_n_f32(convconst);
12783 return vcvtq_u32_f32(_mm_mul_ps(a,cconst128));
12784 }
12785
12786
12787 _NEON2SSESTORAGE int32x4_t vcvtnq_s32_f32(float32x4_t a); // VCVTN.S32.F32 q0, q0
12788 _NEON2SSE_INLINE int32x4_t vcvtnq_s32_f32(float32x4_t a)
12789 {
12790 return _mm_cvtps_epi32(a);
12791 }
12792
12793 //***************** Convert to float *************************
12794 //*************************************************************
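//A minimal sketch (illustrative only):
//    float32x4_t f = vcvtq_f32_s32(vdupq_n_s32(-3));   //each lane == -3.0f, one _mm_cvtepi32_ps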
12795 _NEON2SSESTORAGE float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
12796 _NEON2SSE_INLINE float32x2_t vcvt_f32_s32(int32x2_t a) //use low 64 bits
12797 {
12798 float32x2_t res;
12799 res.m64_f32[0] = (float) a.m64_i32[0];
12800 res.m64_f32[1] = (float) a.m64_i32[1];
12801 return res;
12802 }
12803
12804 _NEON2SSESTORAGE float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
12805 _NEON2SSE_INLINE float32x2_t vcvt_f32_u32(uint32x2_t a)
12806 {
12807 float32x2_t res;
12808 res.m64_f32[0] = (float) a.m64_u32[0];
12809 res.m64_f32[1] = (float) a.m64_u32[1];
12810 return res;
12811 }
12812
12813 _NEON2SSESTORAGE float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
12814 #define vcvtq_f32_s32(a) _mm_cvtepi32_ps(a)
12815
12816 _NEON2SSESTORAGE float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
12817 _NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32 q0, q0
12818 {
12819 //the solution may not be optimal
12820 __m128 two16, fHi, fLo;
12821 __m128i hi, lo;
12822 two16 = _mm_set1_ps((float)0x10000); //2^16
12823 // Avoid double rounding by doing two exact conversions
12824 // of high and low 16-bit segments
12825 hi = _mm_srli_epi32(a, 16);
12826 lo = _mm_srli_epi32(_mm_slli_epi32(a, 16), 16);
12827 fHi = _mm_mul_ps(_mm_cvtepi32_ps(hi), two16);
12828 fLo = _mm_cvtepi32_ps(lo);
12829 // do single rounding according to current rounding mode
12830 return _mm_add_ps(fHi, fLo);
12831 }
12832
12833 // ***** Convert to the float from fixed point with the number of fraction bits specified by b ***********
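//For example (illustrative, v being a hypothetical int32x2_t variable): the inverse of the conversion above,
//    float32x2_t f = vcvt_n_f32_s32(v, 8);   //each lane i becomes v.m64_i32[i] / 256.0f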
12834 _NEON2SSESTORAGE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
12835 _NEON2SSE_INLINE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b)
12836 {
12837 float32x2_t res;
12838 float convconst;
12839 convconst = (float)(1. / ((uint32_t)1 << b));
12840 res.m64_f32[0] = a.m64_i32[0] * convconst;
12841 res.m64_f32[1] = a.m64_i32[1] * convconst;
12842 return res;
12843 }
12844
12845 _NEON2SSESTORAGE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
12846 _NEON2SSE_INLINE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b) // VCVT.F32.U32 d0, d0, #32
12847 {
12848 float32x2_t res;
12849 float convconst;
12850 convconst = (float)(1. / ((uint32_t)1 << b));
12851 res.m64_f32[0] = a.m64_u32[0] * convconst;
12852 res.m64_f32[1] = a.m64_u32[1] * convconst;
12853 return res;
12854 }
12855
12856 _NEON2SSESTORAGE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
12857 _NEON2SSE_INLINE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b)
12858 {
12859 float convconst;
12860 __m128 cconst128, af;
12861 convconst = (float)(1. / ((uint32_t)1 << b));
12862 af = _mm_cvtepi32_ps(a);
12863 cconst128 = vdupq_n_f32(convconst);
12864 return _mm_mul_ps(af,cconst128);
12865 }
12866
12867 _NEON2SSESTORAGE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
12868 _NEON2SSE_INLINE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b)
12869 {
12870 float convconst;
12871 __m128 cconst128, af;
12872 convconst = (float)(1. / ((uint32_t)1 << b));
12873 af = vcvtq_f32_u32(a);
12874 cconst128 = vdupq_n_f32(convconst);
12875 return _mm_mul_ps(af,cconst128);
12876 }
12877
12878 //**************Convert between floats ***********************
12879 //************************************************************
12880 _NEON2SSESTORAGE float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
12881 //Intel SIMD doesn't support 16-bit floats currently
12882
12883 _NEON2SSESTORAGE float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
12884 //Intel SIMD doesn't support 16-bit floats currently, the only solution is to store the 16-bit floats and reload them as 32-bit floats
12885
12886 //************Vector narrow integer conversion (truncation) ******************
12887 //****************************************************************************
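//A minimal sketch (illustrative): vmovn_xx keeps the low half of each element and discards the high bits:
//    int8x8_t n = vmovn_s16(vdupq_n_s16(0x1234));   //every lane of n == 0x34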
12888 _NEON2SSESTORAGE int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
12889 _NEON2SSE_INLINE int8x8_t vmovn_s16(int16x8_t a) // VMOVN.I16 d0,q0
12890 {
12891 int8x8_t res64;
12892 __m128i res;
12893 res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_16_even_odd); //use 64 low bits only
12894 return64(res);
12895 }
12896
12897 _NEON2SSESTORAGE int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
12898 _NEON2SSE_INLINE int16x4_t vmovn_s32(int32x4_t a) // VMOVN.I32 d0,q0
12899 {
12900 int16x4_t res64;
12901 __m128i res;
12902 res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //use 64 low bits only
12903 return64(res);
12904 }
12905
12906 _NEON2SSESTORAGE int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
12907 _NEON2SSE_INLINE int32x2_t vmovn_s64(int64x2_t a)
12908 {
12909 //may be less efficient than a serial implementation
12910 int32x2_t res64;
12911 __m128i res;
12912 res = _mm_shuffle_epi32 (a, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //use 64 low bits only, _MM_SHUFFLE(3, 1, 2, 0)
12913 return64(res);
12914 }
12915
12916 _NEON2SSESTORAGE uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
12917 #define vmovn_u16 vmovn_s16
12918
12919 _NEON2SSESTORAGE uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
12920 #define vmovn_u32 vmovn_s32
12921
12922 _NEON2SSESTORAGE uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
12923 #define vmovn_u64 vmovn_s64
12924
12925 //**************** Vector long move ***********************
12926 //***********************************************************
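//A minimal sketch (illustrative): each element is widened with sign or zero extension:
//    int16x8_t w = vmovl_s8(vdup_n_s8(-1));   //every lane of w == -1 (PMOVSXBW when SSE4.1 is available)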
12927 _NEON2SSESTORAGE int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
12928 _NEON2SSE_INLINE int16x8_t vmovl_s8(int8x8_t a)
12929 {
12930 return _MM_CVTEPI8_EPI16(_pM128i(a)); //SSE4.1
12931 }
12932
12933 _NEON2SSESTORAGE int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
12934 _NEON2SSE_INLINE int32x4_t vmovl_s16(int16x4_t a)
12935 {
12936 return _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1
12937 }
12938
12939 _NEON2SSESTORAGE int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
12940 _NEON2SSE_INLINE int64x2_t vmovl_s32(int32x2_t a)
12941 {
12942 return _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1
12943 }
12944
12945 _NEON2SSESTORAGE uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
12946 _NEON2SSE_INLINE uint16x8_t vmovl_u8(uint8x8_t a)
12947 {
12948 return _MM_CVTEPU8_EPI16(_pM128i(a)); //SSE4.1
12949 }
12950
12951 _NEON2SSESTORAGE uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.s16 q0,d0
12952 _NEON2SSE_INLINE uint32x4_t vmovl_u16(uint16x4_t a)
12953 {
12954 return _MM_CVTEPU16_EPI32(_pM128i(a)); //SSE4.1
12955 }
12956
12957 _NEON2SSESTORAGE uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
12958 _NEON2SSE_INLINE uint64x2_t vmovl_u32(uint32x2_t a)
12959 {
12960 return _MM_CVTEPU32_EPI64(_pM128i(a)); //SSE4.1
12961 }
12962
12963 //*************Vector saturating narrow integer*****************
12964 //**************************************************************
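//A minimal sketch (illustrative): unlike vmovn_xx above, out of range values saturate instead of being truncated:
//    int8x8_t n = vqmovn_s16(vdupq_n_s16(300));   //every lane of n == 127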
12965 _NEON2SSESTORAGE int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
12966 _NEON2SSE_INLINE int8x8_t vqmovn_s16(int16x8_t a)
12967 {
12968 int8x8_t res64;
12969 __m128i res;
12970 res = _mm_packs_epi16(a, a);
12971 return64(res);
12972 }
12973
12974 _NEON2SSESTORAGE int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
12975 _NEON2SSE_INLINE int16x4_t vqmovn_s32(int32x4_t a)
12976 {
12977 int16x4_t res64;
12978 __m128i res;
12979 res = _mm_packs_epi32(a, a);
12980 return64(res);
12981 }
12982
12983 _NEON2SSESTORAGE int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
12984 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqmovn_s64(int64x2_t a),_NEON2SSE_REASON_SLOW_SERIAL) //no effective SIMD solution
12985 {
12986 int32x2_t res;
12987 _NEON2SSE_ALIGN_16 int64_t atmp[2];
12988 _mm_store_si128((__m128i*)atmp, a);
12989 if(atmp[0]>SINT_MAX) atmp[0] = SINT_MAX;
12990 if(atmp[0]<SINT_MIN) atmp[0] = SINT_MIN;
12991 if(atmp[1]>SINT_MAX) atmp[1] = SINT_MAX;
12992 if(atmp[1]<SINT_MIN) atmp[1] = SINT_MIN;
12993 res.m64_i32[0] = (int32_t)atmp[0];
12994 res.m64_i32[1] = (int32_t)atmp[1];
12995 return res;
12996 }
12997
12998 _NEON2SSESTORAGE uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.s16 d0,q0
12999 _NEON2SSE_INLINE uint8x8_t vqmovn_u16(uint16x8_t a) // VQMOVN.s16 d0,q0
13000 {
13001 //no uint16 to uint8 saturating conversion in SSE, need to truncate to the max signed value first. Also trying to avoid _mm_shuffle_epi8 because of its high latency on old Atom CPUs
13002 uint8x8_t res64;
13003 __m128i c7fff, a_trunc, mask_trunc;
13004 c7fff = _mm_set1_epi16 (0x7fff); // 15-th bit set to zero
13005 a_trunc = _mm_and_si128(a, c7fff); // a truncated to max signed
13006 mask_trunc = _mm_cmpgt_epi16(a_trunc, a); //if the masked value compares greater than the original then the 15th bit had been set initially
13007 mask_trunc = _mm_and_si128(mask_trunc, c7fff); //zero or c7fff if the 15th bit had been set initially
13008 a_trunc = _mm_or_si128(a_trunc, mask_trunc);
13009 a_trunc = _mm_packus_epi16 (a_trunc, a_trunc); //use low 64bits only
13010 return64(a_trunc);
13011 }
13012
13013 _NEON2SSESTORAGE uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
13014 _NEON2SSE_INLINE uint16x4_t vqmovn_u32(uint32x4_t a) // VQMOVN.U32 d0,q0
13015 {
13016 #ifdef USE_SSE4
13017 //no uint32 to uint16 saturating conversion in SSE, need to truncate to the max signed value first
13018 uint16x4_t res64;
13019 __m128i c7fffffff, a_trunc, mask_trunc;
13020 c7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); // 31st bit set to zero
13021 a_trunc = _mm_and_si128(a, c7fffffff); // a truncated to max signed
13022 mask_trunc = _mm_cmpgt_epi16(a_trunc, a); //if the masked value compares greater than the original then the 31st bit had been set initially
13023 mask_trunc = _mm_and_si128(mask_trunc, c7fffffff); //zero or c7fffffff if the 31st bit had been set initially
13024 a_trunc = _mm_or_si128(a_trunc, mask_trunc);
13025 a_trunc = _MM_PACKUS1_EPI32 (a_trunc); //use low 64bits only
13026 return64(a_trunc);
13027 #else
13028 uint16x4_t res64;
13029 __m128i res_hi, mask;
13030 mask = _mm_setzero_si128();
13031 res_hi = _mm_srli_epi32(a, 16);
13032 res_hi = _mm_cmpeq_epi16(res_hi, mask);
13033 mask = _mm_cmpeq_epi16(mask,mask); //all fff
13034 mask = _mm_andnot_si128(res_hi,mask); //invert res_hi to get the >16 bit numbers
13035 res_hi = _mm_or_si128(a, mask); //saturated res
13036 res_hi = _mm_shuffle_epi8 (res_hi, *(__m128i*) mask8_32_even_odd); //go to 16 bits
13037 return64(res_hi);
13038 #endif
13039 }
13040
13041 _NEON2SSESTORAGE uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
13042 _NEON2SSE_INLINE uint32x2_t vqmovn_u64(uint64x2_t a)
13043 {
13044 //serial solution may be faster
13045 uint32x2_t res64;
13046 __m128i res_hi, mask;
13047 mask = _mm_setzero_si128();
13048 res_hi = _mm_srli_epi64(a, 32);
13049 res_hi = _mm_cmpeq_epi32(res_hi, mask);
13050 mask = _mm_cmpeq_epi32(mask,mask); //all fff
13051 mask = _mm_andnot_si128(res_hi,mask); //invert res_hi to get the >32 bit numbers
13052 res_hi = _mm_or_si128(a, mask);
13053 res_hi = _mm_shuffle_epi32(res_hi, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
13054 return64(res_hi);
13055 }
13056 //************* Vector saturating narrow integer signed->unsigned **************
13057 //*****************************************************************************
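//A minimal sketch (illustrative): negative inputs saturate to 0, values above the unsigned maximum saturate to it:
//    uint8x8_t n = vqmovun_s16(vdupq_n_s16(-5));   //every lane of n == 0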
13058 _NEON2SSESTORAGE uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
13059 _NEON2SSE_INLINE uint8x8_t vqmovun_s16(int16x8_t a)
13060 {
13061 uint8x8_t res64;
13062 __m128i res;
13063 res = _mm_packus_epi16(a, a); //use low 64bits only
13064 return64(res);
13065 }
13066
13067 _NEON2SSESTORAGE uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
13068 _NEON2SSE_INLINE uint16x4_t vqmovun_s32(int32x4_t a)
13069 {
13070 uint16x4_t res64;
13071 __m128i res;
13072 res = _MM_PACKUS1_EPI32(a); //use low 64bits only
13073 return64(res);
13074 }
13075
13076 _NEON2SSESTORAGE uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
13077 _NEON2SSE_INLINE uint32x2_t vqmovun_s64(int64x2_t a)
13078 {
13079 uint32x2_t res64;
13080 __m128i res_hi,res_lo, zero, cmp;
13081 zero = _mm_setzero_si128();
13082 res_hi = _mm_srli_epi64(a, 32);
13083 cmp = _mm_cmpgt_epi32(zero, res_hi); //if cmp<0 the result should be zero
13084 res_lo = _mm_andnot_si128(cmp,a); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0
13085 cmp = _mm_cmpgt_epi32(res_hi,zero); //if cmp positive
13086 res_lo = _mm_or_si128(res_lo, cmp); //if cmp is positive we are out of 32 bits and need to saturate to 0xffffffff
13087 res_lo = _mm_shuffle_epi32(res_lo, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
13088 return64(res_lo);
13089 }
13090
13091 // ********************************************************
13092 // **************** Table look up **************************
13093 // ********************************************************
13094 //VTBL (Vector Table Lookup) uses byte indexes in a control vector to look up byte values
13095 //in a table and generate a new vector. Indexes out of range return 0.
13096 //for Intel SIMD we need to set the MSB to 1 for zero return
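//A minimal sketch (illustrative only; tbl and idx are hypothetical names):
//    uint8x8_t tbl = vcreate_u8(0x0706050403020100ULL);   //table bytes 0..7
//    uint8x8_t idx = vcreate_u8(0x1003020100010203ULL);   //the last index (0x10) is out of range
//    uint8x8_t r   = vtbl1_u8(tbl, idx);                  //r lanes: {3,2,1,0,1,2,3,0}, the out of range lane 7 == 0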
13097 _NEON2SSESTORAGE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
13098 _NEON2SSE_INLINE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b)
13099 {
13100 uint8x8_t res64;
13101 __m128i c7, maskgt, bmask, b128;
13102 c7 = _mm_set1_epi8 (7);
13103 b128 = _pM128i(b);
13104 maskgt = _mm_cmpgt_epi8(b128,c7);
13105 bmask = _mm_or_si128(b128,maskgt);
13106 bmask = _mm_shuffle_epi8(_pM128i(a),bmask);
13107 return64(bmask);
13108 }
13109
13110 _NEON2SSESTORAGE int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
13111 #define vtbl1_s8 vtbl1_u8
13112
13113 _NEON2SSESTORAGE poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
13114 #define vtbl1_p8 vtbl1_u8
13115
13116 _NEON2SSESTORAGE uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
13117 _NEON2SSE_INLINE uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b)
13118 {
13119 uint8x8_t res64;
13120 __m128i c15, a01, maskgt15, bmask, b128;
13121 c15 = _mm_set1_epi8 (15);
13122 b128 = _pM128i(b);
13123 maskgt15 = _mm_cmpgt_epi8(b128,c15);
13124 bmask = _mm_or_si128(b128, maskgt15);
13125 a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]), _pM128i(a.val[1]));
13126 a01 = _mm_shuffle_epi8(a01, bmask);
13127 return64(a01);
13128 }
13129
13130 //int8x8_t vtbl2_s8(int8x8x2_t a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0
13131 #define vtbl2_s8 vtbl2_u8
13132
13133 //poly8x8_t vtbl2_p8(poly8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
13134 #define vtbl2_p8 vtbl2_u8
13135
13136 _NEON2SSESTORAGE uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
13137 _NEON2SSE_INLINE uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b)
13138 {
13139 //the solution may not be optimal
13140 uint8x8_t res64;
13141 __m128i c15, c23, maskgt23, bmask, maskgt15, sh0, sh1, a01, b128;
13142 c15 = _mm_set1_epi8 (15);
13143 c23 = _mm_set1_epi8 (23);
13144 b128 = _pM128i(b);
13145 maskgt23 = _mm_cmpgt_epi8(b128,c23);
13146 bmask = _mm_or_si128(b128, maskgt23);
13147 maskgt15 = _mm_cmpgt_epi8(b128,c15);
13148 a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]),_pM128i(a.val[1]));
13149 sh0 = _mm_shuffle_epi8(a01, bmask);
13150 sh1 = _mm_shuffle_epi8(_pM128i(a.val[2]), bmask); //for bi>15 only the low 4 bits of bi are used, i.e. bi-16
13151 sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); //SSE4.1
13152 return64(sh0);
13153 }
13154
13155 _NEON2SSESTORAGE int8x8_t vtbl3_s8(int8x8x3_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
13156 #define vtbl3_s8 vtbl3_u8
13157
13158 _NEON2SSESTORAGE poly8x8_t vtbl3_p8(poly8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
13159 #define vtbl3_p8 vtbl3_u8
13160
13161 _NEON2SSESTORAGE uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
13162 _NEON2SSE_INLINE uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b)
13163 {
13164 //the solution may not be optimal
13165 uint8x8_t res64;
13166 __m128i c15, c31, maskgt31, bmask, maskgt15, sh0, sh1, a01, a23, b128;
13167 c15 = _mm_set1_epi8 (15);
13168 c31 = _mm_set1_epi8 (31);
13169 b128 = _pM128i(b);
13170 maskgt31 = _mm_cmpgt_epi8(b128,c31);
13171 bmask = _mm_or_si128(b128, maskgt31);
13172 maskgt15 = _mm_cmpgt_epi8(b128,c15);
13173 a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]),_pM128i(a.val[1]));
13174 a23 = _mm_unpacklo_epi64(_pM128i(a.val[2]),_pM128i(a.val[3]));
13175 sh0 = _mm_shuffle_epi8(a01, bmask);
13176 sh1 = _mm_shuffle_epi8(a23, bmask); //for bi>15 bi is wrapped (bi-=16)
13177 sh0 = _MM_BLENDV_EPI8 (sh0, sh1, maskgt15); //SSE4.1
13178 return64(sh0);
13179 }
13180
13181 _NEON2SSESTORAGE int8x8_t vtbl4_s8(int8x8x4_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
13182 #define vtbl4_s8 vtbl4_u8
13183
13184 _NEON2SSESTORAGE poly8x8_t vtbl4_p8(poly8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
13185 #define vtbl4_p8 vtbl4_u8
13186
13187 //****************** Extended table look up intrinsics ***************************
13188 //**********************************************************************************
13189 //VTBX (Vector Table Extension) works in the same way as VTBL does,
13190 // except that indexes out of range leave the destination element unchanged.
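//Illustrative sketch of the semantics (arbitrary example values, not taken from the original header):
//    destination a = {90,91,92,93,94,95,96,97}
//    table       b = {10,11,12,13,14,15,16,17}
//    indexes     c = { 3, 0, 7, 9, 1, 20, 2, 6}
//    vtbx1_u8(a, b, c) = {13,10,17,93,11,95,12,16}    //out-of-range indexes 9 and 20 keep the old a elements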
13191
13192 _NEON2SSESTORAGE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
13193 _NEON2SSE_INLINE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
13194 {
13195 uint8x8_t res64;
13196 __m128i c7, maskgt, sh, c128;
13197 c7 = _mm_set1_epi8 (7);
13198 c128 = _pM128i(c);
13199 maskgt = _mm_cmpgt_epi8(c128,c7);
13200 c7 = _mm_and_si128(maskgt,_pM128i(a));
13201 sh = _mm_shuffle_epi8(_pM128i(b),c128);
13202 sh = _mm_andnot_si128(maskgt,sh);
13203 sh = _mm_or_si128(sh,c7);
13204 return64(sh);
13205 }
13206
13207 _NEON2SSESTORAGE int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
13208 #define vtbx1_s8 vtbx1_u8
13209
13210 _NEON2SSESTORAGE poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
13211 #define vtbx1_p8 vtbx1_u8
13212
13213 _NEON2SSESTORAGE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
13214 _NEON2SSE_INLINE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c)
13215 {
13216 uint8x8_t res64;
13217 __m128i c15, b01, maskgt15, sh, c128;
13218 c15 = _mm_set1_epi8 (15);
13219 c128 = _pM128i(c);
13220 maskgt15 = _mm_cmpgt_epi8(c128, c15);
13221 c15 = _mm_and_si128(maskgt15, _pM128i(a));
13222 b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]), _pM128i(b.val[1]));
13223 sh = _mm_shuffle_epi8(b01, c128);
13224 sh = _mm_andnot_si128(maskgt15, sh);
13225 sh = _mm_or_si128(sh,c15);
13226 return64(sh);
13227 }
13228
13229 //int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
13230 #define vtbx2_s8 vtbx2_u8
13231
13232 //poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
13233 #define vtbx2_p8 vtbx2_u8
13234
13235 _NEON2SSESTORAGE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
13236 _NEON2SSE_INLINE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c)
13237 {
13238 //solution may be not optimal
13239 uint8x8_t res64;
13240 __m128i c15, c23, maskgt15, maskgt23, sh0, sh1, b01, c128;
13241 c15 = _mm_set1_epi8 (15);
13242 c23 = _mm_set1_epi8 (23);
13243 c128 = _pM128i(c);
13244 maskgt15 = _mm_cmpgt_epi8(c128,c15);
13245 maskgt23 = _mm_cmpgt_epi8(c128,c23);
13246 c23 = _mm_and_si128(maskgt23, _pM128i(a));
13247 b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]),_pM128i(b.val[1]));
13248 sh0 = _mm_shuffle_epi8(b01, c128);
13249 sh1 = _mm_shuffle_epi8(_pM128i(b.val[2]), c128); //for bi>15 bi is wrapped (bi-=16)
13250 sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
13251 sh0 = _mm_andnot_si128(maskgt23,sh0);
13252 sh0 = _mm_or_si128(sh0,c23);
13253 return64(sh0);
13254 }
13255
13256 _NEON2SSESTORAGE int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
13257 #define vtbx3_s8 vtbx3_u8
13258
13259 _NEON2SSESTORAGE poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
13260 #define vtbx3_p8 vtbx3_u8
13261
13262 _NEON2SSESTORAGE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
13263 _NEON2SSE_INLINE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c)
13264 {
13265 //solution may be not optimal
13266 uint8x8_t res64;
13267 __m128i c15, c31, maskgt15, maskgt31, sh0, sh1, b01, b23, c128;
13268 c15 = _mm_set1_epi8 (15);
13269 c31 = _mm_set1_epi8 (31);
13270 c128 = _pM128i(c);
13271 maskgt15 = _mm_cmpgt_epi8(c128,c15);
13272 maskgt31 = _mm_cmpgt_epi8(c128,c31);
13273 c31 = _mm_and_si128(maskgt31, _pM128i(a));
13274
13275 b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]),_pM128i(b.val[1]));
13276 b23 = _mm_unpacklo_epi64(_pM128i(b.val[2]),_pM128i(b.val[3]));
13277 sh0 = _mm_shuffle_epi8(b01, c128);
13278 sh1 = _mm_shuffle_epi8(b23, c128); //for bi>15 bi is wrapped (bi-=16)
13279 sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
13280 sh0 = _mm_andnot_si128(maskgt31,sh0);
13281 sh0 = _mm_or_si128(sh0,c31);
13282 return64(sh0);
13283 }
13284
13285 _NEON2SSESTORAGE int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
13286 #define vtbx4_s8 vtbx4_u8
13287
13288 _NEON2SSESTORAGE poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
13289 #define vtbx4_p8 vtbx4_u8
13290
13291 //*************************************************************************************************
13292 // *************************** Operations with a scalar value *********************************
13293 //*************************************************************************************************
13294
13295 //******* Vector multiply accumulate by scalar *************************************************
13296 //**********************************************************************************************
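//Each *_lane_* form below just broadcasts lane l of v and reuses the plain vector intrinsic,
//so per element the semantics are (a sketch with arbitrary example values): res[i] = a[i] + b[i] * v[l].
//E.g. with a = {1,2,3,4}, b = {10,20,30,40}, v = {5,6,7,8}:
//    vmla_lane_s16(a, b, v, 2) = {1+10*7, 2+20*7, 3+30*7, 4+40*7} = {71,142,213,284}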
13297 _NEON2SSESTORAGE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
13298 _NEON2SSE_INLINE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 d0, d0, d0[0]
13299 {
13300 int16_t c;
13301 int16x4_t scalar;
13302 c = vget_lane_s16(v, l);
13303 scalar = vdup_n_s16(c);
13304 return vmla_s16(a, b, scalar);
13305 }
13306
13307 _NEON2SSESTORAGE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
13308 _NEON2SSE_INLINE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 d0, d0, d0[0]
13309 {
13310 int32_t c;
13311 int32x2_t scalar;
13312 c = vget_lane_s32(v, l);
13313 scalar = vdup_n_s32(c);
13314 return vmla_s32(a, b, scalar);
13315 }
13316
13317 _NEON2SSESTORAGE uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
13318 #define vmla_lane_u16 vmla_lane_s16
13319
13320
13321 _NEON2SSESTORAGE uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
13322 #define vmla_lane_u32 vmla_lane_s32
13323
13324 _NEON2SSESTORAGE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0, d0, d0[0]
13325 _NEON2SSE_INLINE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
13326 {
13327 float32_t vlane;
13328 float32x2_t c;
13329 vlane = vget_lane_f32(v, l);
13330 c = vdup_n_f32(vlane);
13331 return vmla_f32(a,b,c);
13332 }
13333
13334 _NEON2SSESTORAGE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
13335 _NEON2SSE_INLINE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0]
13336 {
13337 int16_t vlane;
13338 int16x8_t c;
13339 vlane = vget_lane_s16(v, l);
13340 c = vdupq_n_s16(vlane);
13341 return vmlaq_s16(a,b,c);
13342 }
13343
13344 _NEON2SSESTORAGE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
13345 _NEON2SSE_INLINE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0]
13346 {
13347 int32_t vlane;
13348 int32x4_t c;
13349 vlane = vget_lane_s32(v, l);
13350 c = vdupq_n_s32(vlane);
13351 return vmlaq_s32(a,b,c);
13352 }
13353
13354 _NEON2SSESTORAGE uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
13355 #define vmlaq_lane_u16 vmlaq_lane_s16
13356
13357 _NEON2SSESTORAGE uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
13358 #define vmlaq_lane_u32 vmlaq_lane_s32
13359
13360 _NEON2SSESTORAGE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0]
13361 _NEON2SSE_INLINE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0]
13362 {
13363 float32_t vlane;
13364 float32x4_t c;
13365 vlane = vget_lane_f32(v, l);
13366 c = vdupq_n_f32(vlane);
13367 return vmlaq_f32(a,b,c);
13368 }
13369
13370 //***************** Vector widening multiply accumulate by scalar **********************
13371 //***************************************************************************************
13372 _NEON2SSESTORAGE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0]
13373 _NEON2SSE_INLINE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0]
13374 {
13375 int16_t vlane;
13376 int16x4_t c;
13377 vlane = vget_lane_s16(v, l);
13378 c = vdup_n_s16(vlane);
13379 return vmlal_s16(a, b, c);
13380 }
13381
13382 _NEON2SSESTORAGE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0]
13383 _NEON2SSE_INLINE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0]
13384 {
13385 int32_t vlane;
13386 int32x2_t c;
13387 vlane = vget_lane_s32(v, l);
13388 c = vdup_n_s32(vlane);
13389 return vmlal_s32(a, b, c);
13390 }
13391
13392 _NEON2SSESTORAGE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0]
13393 _NEON2SSE_INLINE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0]
13394 {
13395 uint16_t vlane;
13396 uint16x4_t c;
13397 vlane = vget_lane_u16(v, l);
13398 c = vdup_n_u16(vlane);
13399 return vmlal_u16(a, b, c);
13400 }
13401
13402 _NEON2SSESTORAGE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0]
13403 _NEON2SSE_INLINE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0]
13404 {
13405 uint32_t vlane;
13406 uint32x2_t c;
13407 vlane = vget_lane_u32(v, l);
13408 c = vdup_n_u32(vlane);
13409 return vmlal_u32(a, b, c);
13410 }
13411
13412 // ******** Vector widening saturating doubling multiply accumulate by scalar *******************************
13413 // ************************************************************************************************
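//Semantics sketch (per element): res[i] = saturate( a[i] + 2 * (int32_t)b[i] * (int32_t)v[l] ).
//E.g. with b[i] = 0x4000 and v[l] = 0x4000 the term added to a[i] is 2*0x4000*0x4000 = 0x20000000,
//and the 32-bit accumulation saturates to INT32_MAX on overflow.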
13414 _NEON2SSESTORAGE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0, d0, d0[0]
13415 _NEON2SSE_INLINE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
13416 {
13417 int16_t vlane;
13418 int16x4_t c;
13419 vlane = vget_lane_s16(v, l);
13420 c = vdup_n_s16(vlane);
13421 return vqdmlal_s16(a, b, c);
13422 }
13423
13424 _NEON2SSESTORAGE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0, d0, d0[0]
13425 _NEON2SSE_INLINE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l)
13426 {
13427 int32_t vlane;
13428 int32x2_t c;
13429 vlane = vget_lane_s32(v, l);
13430 c = vdup_n_s32(vlane);
13431 return vqdmlal_s32(a, b, c);
13432 }
13433
13434 // ****** Vector multiply subtract by scalar *****************
13435 // *************************************************************
13436 _NEON2SSESTORAGE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
13437 _NEON2SSE_INLINE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
13438 {
13439 int16_t vlane;
13440 int16x4_t c;
13441 vlane = vget_lane_s16(v, l);
13442 c = vdup_n_s16(vlane);
13443 return vmls_s16(a, b, c);
13444 }
13445
13446 _NEON2SSESTORAGE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
13447 _NEON2SSE_INLINE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
13448 {
13449 int32_t vlane;
13450 int32x2_t c;
13451 vlane = vget_lane_s32(v, l);
13452 c = vdup_n_s32(vlane);
13453 return vmls_s32(a, b, c);
13454 }
13455
13456 _NEON2SSESTORAGE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
13457 _NEON2SSE_INLINE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
13458 {
13459 uint16_t vlane;
13460 uint16x4_t c;
13461 vlane = vget_lane_u16(v, l);
13462 c = vdup_n_u16(vlane);
13463 return vmls_u16(a, b, c);
13464 }
13465
13466 _NEON2SSESTORAGE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
13467 _NEON2SSE_INLINE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
13468 {
13469 uint32_t vlane;
13470 uint32x2_t c;
13471 vlane = vget_lane_u32(v, l);
13472 c = vdup_n_u32(vlane);
13473 return vmls_u32(a, b, c);
13474 }
13475
13476 _NEON2SSESTORAGE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0, d0, d0[0]
13477 _NEON2SSE_INLINE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
13478 {
13479 float32_t vlane;
13480 float32x2_t c;
13481 vlane = (float) vget_lane_f32(v, l);
13482 c = vdup_n_f32(vlane);
13483 return vmls_f32(a,b,c);
13484 }
13485
13486 _NEON2SSESTORAGE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0]
13487 _NEON2SSE_INLINE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0]
13488 {
13489 int16_t vlane;
13490 int16x8_t c;
13491 vlane = vget_lane_s16(v, l);
13492 c = vdupq_n_s16(vlane);
13493 return vmlsq_s16(a, b,c);
13494 }
13495
13496 _NEON2SSESTORAGE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0]
13497 _NEON2SSE_INLINE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0]
13498 {
13499 int32_t vlane;
13500 int32x4_t c;
13501 vlane = vget_lane_s32(v, l);
13502 c = vdupq_n_s32(vlane);
13503 return vmlsq_s32(a,b,c);
13504 }
13505
13506 _NEON2SSESTORAGE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0]
13507 _NEON2SSE_INLINE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0]
13508 {
13509 uint16_t vlane;
13510 uint16x8_t c;
13511 vlane = vget_lane_u16(v, l);
13512 c = vdupq_n_u16(vlane);
13513 return vmlsq_u16(a,b,c);
13514 }
13515
13516 _NEON2SSESTORAGE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0]
13517 _NEON2SSE_INLINE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0]
13518 {
13519 uint32_t vlane;
13520 uint32x4_t c;
13521 vlane = vget_lane_u32(v, l);
13522 c = vdupq_n_u32(vlane);
13523 return vmlsq_u32(a,b,c);
13524 }
13525
13526 _NEON2SSESTORAGE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0, q0, d0[0]
13527 _NEON2SSE_INLINE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLS.F32 q0, q0, d0[0]
13528 {
13529 float32_t vlane;
13530 float32x4_t c;
13531 vlane = (float) vget_lane_f32(v, l);
13532 c = vdupq_n_f32(vlane);
13533 return vmlsq_f32(a,b,c);
13534 }
13535
13536 // **** Vector widening multiply subtract by scalar ****
13537 // ****************************************************
13538 _NEON2SSESTORAGE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0, d0[0]
13539 _NEON2SSE_INLINE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLSL.S16 q0, d0, d0[0]
13540 {
13541 int16_t vlane;
13542 int16x4_t c;
13543 vlane = vget_lane_s16(v, l);
13544 c = vdup_n_s16(vlane);
13545 return vmlsl_s16(a, b, c);
13546 }
13547
13548 _NEON2SSESTORAGE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0, d0[0]
13549 _NEON2SSE_INLINE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLSL.S32 q0, d0, d0[0]
13550 {
13551 int32_t vlane;
13552 int32x2_t c;
13553 vlane = vget_lane_s32(v, l);
13554 c = vdup_n_s32(vlane);
13555 return vmlsl_s32(a, b, c);
13556 }
13557
13558 _NEON2SSESTORAGE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.s16 q0, d0, d0[0]
13559 _NEON2SSE_INLINE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLSL.s16 q0, d0, d0[0]
13560 {
13561 uint16_t vlane;
13562 uint16x4_t c;
13563 vlane = vget_lane_u16(v, l);
13564 c = vdup_n_u16(vlane);
13565 return vmlsl_u16(a, b, c);
13566 }
13567
13568 _NEON2SSESTORAGE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0, d0, d0[0]
13569 _NEON2SSE_INLINE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLSL.U32 q0, d0, d0[0]
13570 {
13571 uint32_t vlane;
13572 uint32x2_t c;
13573 vlane = vget_lane_u32(v, l);
13574 c = vdup_n_u32(vlane);
13575 return vmlsl_u32(a, b, c);
13576 }
13577
13578 //********* Vector widening saturating doubling multiply subtract by scalar **************************
13579 //******************************************************************************************************
13580 _NEON2SSESTORAGE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0, d0, d0[0]
13581 _NEON2SSE_INLINE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
13582 {
13583 int16_t vlane;
13584 int16x4_t c;
13585 vlane = vget_lane_s16(v, l);
13586 c = vdup_n_s16(vlane);
13587 return vqdmlsl_s16(a, b, c);
13588 }
13589
13590 _NEON2SSESTORAGE int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0, d0, d0[0]
13591 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l), _NEON2SSE_REASON_SLOW_SERIAL)
13592 {
13593 int32_t vlane;
13594 int32x2_t c;
13595 vlane = vget_lane_s32(v, l);
13596 c = vdup_n_s32(vlane);
13597 return vqdmlsl_s32(a, b, c);
13598 }
13599 //********** Vector multiply with scalar *****************************
13600 _NEON2SSESTORAGE int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
13601 _NEON2SSE_INLINE int16x4_t vmul_n_s16(int16x4_t a, int16_t b) // VMUL.I16 d0,d0,d0[0]
13602 {
13603 int16x4_t b16x4;
13604 b16x4 = vdup_n_s16(b);
13605 return vmul_s16(a, b16x4);
13606 }
13607
13608 _NEON2SSESTORAGE int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
13609 _NEON2SSE_INLINE int32x2_t vmul_n_s32(int32x2_t a, int32_t b) // VMUL.I32 d0,d0,d0[0]
13610 {
13611 //serial solution looks faster
13612 int32x2_t b32x2;
13613 b32x2 = vdup_n_s32(b);
13614 return vmul_s32(a, b32x2);
13615 }
13616
13617 _NEON2SSESTORAGE float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
13618 _NEON2SSE_INLINE float32x2_t vmul_n_f32(float32x2_t a, float32_t b) // VMUL.F32 d0,d0,d0[0]
13619 {
13620 float32x2_t b32x2;
13621 b32x2 = vdup_n_f32(b);
13622 return vmul_f32(a, b32x2);
13623 }
13624
13625 _NEON2SSESTORAGE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
13626 _NEON2SSE_INLINE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b) // VMUL.I16 d0,d0,d0[0]
13627 {
13628 uint16x4_t b16x4;
13629 b16x4 = vdup_n_u16(b);
13630 return vmul_u16(a, b16x4);
13631 }
13632
13633 _NEON2SSESTORAGE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
13634 _NEON2SSE_INLINE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b) // VMUL.I32 d0,d0,d0[0]
13635 {
13636 //serial solution looks faster
13637 uint32x2_t b32x2;
13638 b32x2 = vdup_n_u32(b);
13639 return vmul_u32(a, b32x2);
13640 }
13641
13642 _NEON2SSESTORAGE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
13643 _NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b) // VMUL.I16 q0,q0,d0[0]
13644 {
13645 int16x8_t b16x8;
13646 b16x8 = vdupq_n_s16(b);
13647 return vmulq_s16(a, b16x8);
13648 }
13649
13650 _NEON2SSESTORAGE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
13651 _NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b) // VMUL.I32 q0,q0,d0[0]
13652 {
13653 int32x4_t b32x4;
13654 b32x4 = vdupq_n_s32(b);
13655 return vmulq_s32(a, b32x4);
13656 }
13657
13658 _NEON2SSESTORAGE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
13659 _NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b) // VMUL.F32 q0,q0,d0[0]
13660 {
13661 float32x4_t b32x4;
13662 b32x4 = vdupq_n_f32(b);
13663 return vmulq_f32(a, b32x4);
13664 }
13665
13666 _NEON2SSESTORAGE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
13667 _NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b) // VMUL.I16 q0,q0,d0[0]
13668 {
13669 uint16x8_t b16x8;
13670 b16x8 = vdupq_n_u16(b);
13671 return vmulq_u16(a, b16x8);
13672 }
13673
13674 _NEON2SSESTORAGE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
13675 _NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b) // VMUL.I32 q0,q0,d0[0]
13676 {
13677 uint32x4_t b32x4;
13678 b32x4 = vdupq_n_u32(b);
13679 return vmulq_u32(a, b32x4);
13680 }
13681
13682 //********** Vector multiply lane *****************************
13683 _NEON2SSESTORAGE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
13684 _NEON2SSE_INLINE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c)
13685 {
13686 int16x4_t b16x4;
13687 int16_t vlane;
13688 vlane = vget_lane_s16(b, c);
13689 b16x4 = vdup_n_s16(vlane);
13690 return vmul_s16(a, b16x4);
13691 }
13692
13693 _NEON2SSESTORAGE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
13694 _NEON2SSE_INLINE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c)
13695 {
13696 int32x2_t b32x2;
13697 int32_t vlane;
13698 vlane = vget_lane_s32(b, c);
13699 b32x2 = vdup_n_s32(vlane);
13700 return vmul_s32(a, b32x2);
13701 }
13702
13703 _NEON2SSESTORAGE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
13704 _NEON2SSE_INLINE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c)
13705 {
13706 float32x2_t b32x2;
13707 float32_t vlane;
13708 vlane = vget_lane_f32(b, c);
13709 b32x2 = vdup_n_f32(vlane);
13710 return vmul_f32(a, b32x2);
13711 }
13712
13713 _NEON2SSESTORAGE uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
13714 #define vmul_lane_u16 vmul_lane_s16
13715
13716 _NEON2SSESTORAGE uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
13717 #define vmul_lane_u32 vmul_lane_s32
13718
13719 _NEON2SSESTORAGE int16x8_t vmulq_lane_s16(int16x8_t a, int16x4_t b, __constrange(0,3) int c);
13720 _NEON2SSE_INLINE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c)
13721 {
13722 int16x8_t b16x8;
13723 int16_t vlane;
13724 vlane = vget_lane_s16(b, c);
13725 b16x8 = vdupq_n_s16(vlane);
13726 return vmulq_s16(a, b16x8);
13727 }
13728
13729 _NEON2SSESTORAGE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
13730 _NEON2SSE_INLINE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c)
13731 {
13732 int32x4_t b32x4;
13733 int32_t vlane;
13734 vlane = vget_lane_s32(b, c);
13735 b32x4 = vdupq_n_s32(vlane);
13736 return vmulq_s32(a, b32x4);
13737 }
13738
13739 _NEON2SSESTORAGE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
13740 _NEON2SSE_INLINE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c)
13741 {
13742 float32x4_t b32x4;
13743 float32_t vlane;
13744 vlane = vget_lane_f32(b, c);
13745 b32x4 = vdupq_n_f32(vlane);
13746 return vmulq_f32(a, b32x4);
13747 }
13748
13749 _NEON2SSESTORAGE uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
13750 #define vmulq_lane_u16 vmulq_lane_s16
13751
13752 _NEON2SSESTORAGE uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
13753 #define vmulq_lane_u32 vmulq_lane_s32
13754
13755 //**** Vector long multiply with scalar ************
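//Semantics sketch (per element): res[i] = (widened)vec1[i] * (widened)val2, so the widening must keep
//the signedness: e.g. vmull_n_u16 with vec1[i] = 50000 and val2 = 2 must give 100000, while a signed
//widening of the same bits would give -31072. Hence the unsigned variants below use the u16/u32 helpers.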
13756 _NEON2SSESTORAGE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
13757 _NEON2SSE_INLINE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2) // VMULL.S16 q0,d0,d0[0]
13758 {
13759 int16x4_t b16x4;
13760 b16x4 = vdup_n_s16(val2);
13761 return vmull_s16(vec1, b16x4);
13762 }
13763
13764 _NEON2SSESTORAGE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
13765 _NEON2SSE_INLINE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2) // VMULL.S32 q0,d0,d0[0]
13766 {
13767 int32x2_t b32x2;
13768 b32x2 = vdup_n_s32(val2);
13769 return vmull_s32(vec1, b32x2);
13770 }
13771
13772 _NEON2SSESTORAGE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.s16 q0,d0,d0[0]
13773 _NEON2SSE_INLINE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2) // VMULL.s16 q0,d0,d0[0]
13774 {
13775 uint16x4_t b16x4;
13776 b16x4 = vdup_n_u16(val2);
13777 return vmull_u16(vec1, b16x4);
13778 }
13779
13780 _NEON2SSESTORAGE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
13781 _NEON2SSE_INLINE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2) // VMULL.U32 q0,d0,d0[0]
13782 {
13783 uint32x2_t b32x2;
13784 b32x2 = vdup_n_u32(val2);
13785 return vmull_u32(vec1, b32x2);
13786 }
13787
13788 //**** Vector long multiply by scalar ****
13789 _NEON2SSESTORAGE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
13790 _NEON2SSE_INLINE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VMULL.S16 q0,d0,d0[0]
13791 {
13792 int16_t vlane;
13793 int16x4_t b;
13794 vlane = vget_lane_s16(val2, val3);
13795 b = vdup_n_s16(vlane);
13796 return vmull_s16(vec1, b);
13797 }
13798
13799 _NEON2SSESTORAGE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
13800 _NEON2SSE_INLINE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3) // VMULL.S32 q0,d0,d0[0]
13801 {
13802 int32_t vlane;
13803 int32x2_t b;
13804 vlane = vget_lane_s32(val2, val3);
13805 b = vdup_n_s32(vlane);
13806 return vmull_s32(vec1, b);
13807 }
13808
13809 _NEON2SSESTORAGE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.s16 q0,d0,d0[0]
13810 _NEON2SSE_INLINE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3) // VMULL.s16 q0,d0,d0[0]
13811 {
13812 uint16_t vlane;
13813 uint16x4_t b;
13814 vlane = vget_lane_u16(val2, val3);
13815 b = vdup_n_u16(vlane);
13816 return vmull_u16(vec1, b);
13817 }
13818
13819 _NEON2SSESTORAGE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
13820 _NEON2SSE_INLINE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3) // VMULL.U32 q0,d0,d0[0]
13821 {
13822 uint32_t vlane;
13823 uint32x2_t b;
13824 vlane = vget_lane_u32(val2, val3);
13825 b = vdup_n_u32(vlane);
13826 return vmull_u32(vec1, b);
13827 }
13828
13829 //********* Vector saturating doubling long multiply with scalar *******************
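//Semantics sketch (per element): res[i] = saturate( 2 * (widened)vec1[i] * (widened)val2 ).
//The only input that can saturate is INT16_MIN * INT16_MIN (INT32_MIN * INT32_MIN for the 32-bit form):
//e.g. vqdmull_n_s16 with both operands equal to -32768 yields INT32_MAX instead of the unrepresentable 2^31.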
13830 _NEON2SSESTORAGE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
13831 _NEON2SSE_INLINE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2)
13832 {
13833 //the serial solution may be faster due to saturation
13834 int16x4_t b;
13835 b = vdup_n_s16(val2);
13836 return vqdmull_s16(vec1, b);
13837 }
13838
13839 _NEON2SSESTORAGE int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
13840 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_SERIAL)
13841 {
13842 int32x2_t b;
13843 b = vdup_n_s32(val2);
13844 return vqdmull_s32(vec1,b); //slow serial function!!!!
13845 }
13846
13847 //************* Vector saturating doubling long multiply by scalar ***********************************************
13848 _NEON2SSESTORAGE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
13849 _NEON2SSE_INLINE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3)
13850 {
13851 int16_t c;
13852 int16x4_t scalar;
13853 c = vget_lane_s16(val2, val3);
13854 scalar = vdup_n_s16(c);
13855 return vqdmull_s16(vec1, scalar);
13856 }
13857
13858
13859 _NEON2SSESTORAGE int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
13860 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_SERIAL)
13861 {
13862 int32_t c;
13863 int32x2_t scalar;
13864 c = vget_lane_s32(val2, val3);
13865 scalar = vdup_n_s32(c);
13866 return vqdmull_s32(vec1,scalar); //slow serial function!!!!
13867 }
13868
13869 // *****Vector saturating doubling multiply high with scalar *****
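//Semantics sketch (per element, 16-bit case): res[i] = saturate( (2 * vec1[i] * val2) >> 16 ),
//i.e. the high half of the doubled product: e.g. vqdmulh_n_s16 with vec1[i] = 0x4000 and val2 = 0x2000
//gives 0x1000, and the INT16_MIN * INT16_MIN pair saturates to INT16_MAX.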
13870 _NEON2SSESTORAGE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
13871 _NEON2SSE_INLINE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2)
13872 {
13873 int16x4_t res64;
13874 return64(vqdmulhq_n_s16(_pM128i(vec1), val2));
13875 }
13876
13877 _NEON2SSESTORAGE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
13878 _NEON2SSE_INLINE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2)
13879 {
13880 int32x2_t res64;
13881 return64(vqdmulhq_n_s32(_pM128i(vec1), val2));
13882 }
13883
13884 _NEON2SSESTORAGE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
13885 _NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQDMULH.S16 q0,q0,d0[0]
13886 {
13887 //solution may be not optimal
13888 int16x8_t scalar;
13889 scalar = vdupq_n_s16(val2);
13890 return vqdmulhq_s16(vec1, scalar);
13891 }
13892
13893 _NEON2SSESTORAGE int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
13894 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13895 {
13896 int32x4_t scalar;
13897 scalar = vdupq_n_s32(val2);
13898 return vqdmulhq_s32(vec1, scalar);
13899 }
13900
13901 //***** Vector saturating doubling multiply high by scalar ****************
13902 _NEON2SSESTORAGE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
13903 _NEON2SSE_INLINE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 d0,d0,d0[0]
13904 {
13905 //solution may be not optimal
13906 int16_t vlane;
13907 int16x4_t scalar;
13908 vlane = vget_lane_s16(val2, val3);
13909 scalar = vdup_n_s16(vlane);
13910 return vqdmulh_s16(vec1, scalar);
13911 }
13912
13913 _NEON2SSESTORAGE int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
13914 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13915 {
13916 int32_t vlane;
13917 int32x2_t scalar;
13918 vlane = vget_lane_s32(val2, val3);
13919 scalar = vdup_n_s32(vlane);
13920 return vqdmulh_s32(vec1, scalar);
13921 }
13922
13923 _NEON2SSESTORAGE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
13924 _NEON2SSE_INLINE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 q0,q0,d0[0]
13925 {
13926 //solution may be not optimal
13927 int16_t vlane;
13928 int16x8_t scalar;
13929 vlane = vget_lane_s16(val2, val3);
13930 scalar = vdupq_n_s16(vlane );
13931 return vqdmulhq_s16(vec1, scalar);
13932 }
13933
13934 _NEON2SSESTORAGE int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
13935 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13936 {
13937 //solution may be not optimal
13938 int32_t vlane;
13939 int32x4_t scalar;
13940 vlane = vgetq_lane_s32(_pM128i(val2), val3);
13941 scalar = vdupq_n_s32(vlane );
13942 return vqdmulhq_s32(vec1, scalar);
13943 }
13944
13945 //******** Vector saturating rounding doubling multiply high with scalar ***
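//Semantics sketch (per element, 16-bit case): res[i] = saturate( (2 * vec1[i] * val2 + 0x8000) >> 16 ),
//the same as VQDMULH but rounded before the high half is taken: e.g. for vec1[i] = 0x4000 and val2 = 1
//VQDMULH returns 0 while VQRDMULH returns 1.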
13946 _NEON2SSESTORAGE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
13947 _NEON2SSE_INLINE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2) // VQRDMULH.S16 d0,d0,d0[0]
13948 {
13949 //solution may be not optimal
13950 int16x4_t scalar;
13951 scalar = vdup_n_s16(val2);
13952 return vqrdmulh_s16(vec1, scalar);
13953 }
13954
13955 _NEON2SSESTORAGE int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
13956 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13957 {
13958 int32x2_t scalar;
13959 scalar = vdup_n_s32(val2);
13960 return vqrdmulh_s32(vec1, scalar);
13961 }
13962
13963 _NEON2SSESTORAGE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
13964 _NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQRDMULH.S16 q0,q0,d0[0]
13965 {
13966 //solution may be not optimal
13967 int16x8_t scalar;
13968 scalar = vdupq_n_s16(val2);
13969 return vqrdmulhq_s16(vec1, scalar);
13970 }
13971
13972 _NEON2SSESTORAGE int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
13973 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13974 {
13975 int32x4_t scalar;
13976 scalar = vdupq_n_s32(val2);
13977 return vqrdmulhq_s32(vec1, scalar);
13978 }
13979
13980 //********* Vector rounding saturating doubling multiply high by scalar ****
13981 _NEON2SSESTORAGE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
13982 _NEON2SSE_INLINE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 d0,d0,d0[0]
13983 {
13984 //solution may be not optimal
13985 int16_t vlane;
13986 int16x4_t scalar;
13987 vlane = vget_lane_s16(val2, val3);
13988 scalar = vdup_n_s16(vlane);
13989 return vqrdmulh_s16(vec1, scalar);
13990 }
13991
13992 _NEON2SSESTORAGE int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
13993 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
13994 {
13995 int32_t vlane;
13996 int32x2_t scalar;
13997 vlane = vget_lane_s32(val2, val3);
13998 scalar = vdup_n_s32(vlane);
13999 return vqrdmulh_s32(vec1, scalar);
14000 }
14001
14002 _NEON2SSESTORAGE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
14003 _NEON2SSE_INLINE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 q0,q0,d0[0]
14004 {
14005 //solution may be not optimal
14006 int16_t vlane;
14007 int16x8_t scalar;
14008 vlane = vget_lane_s16(val2, val3);
14009 scalar = vdupq_n_s16(vlane);
14010 return vqrdmulhq_s16(vec1, scalar);
14011 }
14012
14013 _NEON2SSESTORAGE int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
14014 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
14015 {
14016 //solution may be not optimal
14017 int32_t vlane;
14018 int32x4_t scalar;
14019 vlane = vgetq_lane_s32(_pM128i(val2), val3);
14020 scalar = vdupq_n_s32(vlane );
14021 return vqrdmulhq_s32(vec1, scalar);
14022 }
14023
14024 //**************Vector multiply accumulate with scalar *******************
14025 _NEON2SSESTORAGE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
14026 _NEON2SSE_INLINE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLA.I16 d0, d0, d0[0]
14027 {
14028 int16x4_t scalar;
14029 scalar = vdup_n_s16(c);
14030 return vmla_s16(a, b, scalar);
14031 }
14032
14033 _NEON2SSESTORAGE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
14034 _NEON2SSE_INLINE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLA.I32 d0, d0, d0[0]
14035 {
14036 int32x2_t scalar;
14037 scalar = vdup_n_s32(c);
14038 return vmla_s32(a, b, scalar);
14039 }
14040
14041 _NEON2SSESTORAGE uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
14042 #define vmla_n_u16 vmla_n_s16
14043
14044
14045 _NEON2SSESTORAGE uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
14046 #define vmla_n_u32 vmla_n_s32
14047
14048
14049 _NEON2SSESTORAGE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
14050 _NEON2SSE_INLINE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) // VMLA.F32 d0, d0, d0[0]
14051 {
14052 float32x2_t scalar;
14053 scalar = vdup_n_f32(c);
14054 return vmla_f32(a, b, scalar);
14055 }
14056
14057 _NEON2SSESTORAGE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
14058 _NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLA.I16 q0, q0, d0[0]
14059 {
14060 int16x8_t scalar;
14061 scalar = vdupq_n_s16(c);
14062 return vmlaq_s16(a,b,scalar);
14063 }
14064
14065 _NEON2SSESTORAGE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
14066 _NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLA.I32 q0, q0, d0[0]
14067 {
14068 int32x4_t scalar;
14069 scalar = vdupq_n_s32(c);
14070 return vmlaq_s32(a,b,scalar);
14071 }
14072
14073 _NEON2SSESTORAGE uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
14074 #define vmlaq_n_u16 vmlaq_n_s16
14075
14076 _NEON2SSESTORAGE uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
14077 #define vmlaq_n_u32 vmlaq_n_s32
14078
14079 _NEON2SSESTORAGE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
14080 _NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) // VMLA.F32 q0, q0, d0[0]
14081 {
14082 float32x4_t scalar;
14083 scalar = vdupq_n_f32(c);
14084 return vmlaq_f32(a,b,scalar);
14085 }
14086
14087 //************Vector widening multiply accumulate with scalar****************************
14088 _NEON2SSESTORAGE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
14089 _NEON2SSE_INLINE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLAL.S16 q0, d0, d0[0]
14090 {
14091 int16x4_t vc;
14092 vc = vdup_n_s16(c);
14093 return vmlal_s16(a, b, vc);
14094 }
14095
14096 _NEON2SSESTORAGE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
14097 _NEON2SSE_INLINE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLAL.S32 q0, d0, d0[0]
14098 {
14099 int32x2_t vc;
14100 vc = vdup_n_s32(c);
14101 return vmlal_s32(a, b, vc);
14102 }
14103
14104 _NEON2SSESTORAGE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.s16 q0, d0, d0[0]
14105 _NEON2SSE_INLINE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLAL.s16 q0, d0, d0[0]
14106 {
14107 uint16x4_t vc;
14108 vc = vdup_n_u16(c);
14109 return vmlal_u16(a, b, vc);
14110 }
14111
14112 _NEON2SSESTORAGE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
14113 _NEON2SSE_INLINE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLAL.U32 q0, d0, d0[0]
14114 {
14115 uint32x2_t vc;
14116 vc = vdup_n_u32(c);
14117 return vmlal_u32(a, b, vc);
14118 }
14119
14120 //************ Vector widening saturating doubling multiply accumulate with scalar **************
14121 _NEON2SSESTORAGE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
14122 _NEON2SSE_INLINE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c)
14123 {
14124 //not an optimal SIMD solution, serial may be faster
14125 int16x4_t vc;
14126 vc = vdup_n_s16(c);
14127 return vqdmlal_s16(a, b, vc);
14128 }
14129
14130 _NEON2SSESTORAGE int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
14131 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
14132 {
14133 int32x2_t vc;
14134 vc = vdup_n_s32(c);
14135 return vqdmlal_s32(a, b, vc);
14136 }
14137
14138 //******** Vector multiply subtract with scalar **************
14139 _NEON2SSESTORAGE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
14140 _NEON2SSE_INLINE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLS.I16 d0, d0, d0[0]
14141 {
14142 int16x4_t vc;
14143 vc = vdup_n_s16(c);
14144 return vmls_s16(a, b, vc);
14145 }
14146
14147 _NEON2SSESTORAGE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
14148 _NEON2SSE_INLINE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLS.I32 d0, d0, d0[0]
14149 {
14150 int32x2_t vc;
14151 vc = vdup_n_s32(c);
14152 return vmls_s32(a, b, vc);
14153 }
14154
14155 _NEON2SSESTORAGE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
14156 _NEON2SSE_INLINE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) // VMLS.I16 d0, d0, d0[0]
14157 {
14158 uint16x4_t vc;
14159 vc = vdup_n_u16(c);
14160 return vmls_u16(a, b, vc);
14161 }
14162
14163 _NEON2SSESTORAGE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
14164 _NEON2SSE_INLINE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) // VMLS.I32 d0, d0, d0[0]
14165 {
14166 uint32x2_t vc;
14167 vc = vdup_n_u32(c);
14168 return vmls_u32(a, b, vc);
14169 }
14170
14171 _NEON2SSESTORAGE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
14172 _NEON2SSE_INLINE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c)
14173 {
14174 float32x2_t res;
14175 res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0] * c;
14176 res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1] * c;
14177 return res;
14178 }
14179
14180 _NEON2SSESTORAGE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
14181 _NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLS.I16 q0, q0, d0[0]
14182 {
14183 int16x8_t vc;
14184 vc = vdupq_n_s16(c);
14185 return vmlsq_s16(a, b,vc);
14186 }
14187
14188 _NEON2SSESTORAGE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
14189 _NEON2SSE_INLINE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLS.I32 q0, q0, d0[0]
14190 {
14191 int32x4_t vc;
14192 vc = vdupq_n_s32(c);
14193 return vmlsq_s32(a,b,vc);
14194 }
14195
14196 _NEON2SSESTORAGE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
14197 _NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) // VMLS.I16 q0, q0, d0[0]
14198 {
14199 uint16x8_t vc;
14200 vc = vdupq_n_u16(c);
14201 return vmlsq_u16(a,b,vc);
14202 }
14203
14204 _NEON2SSESTORAGE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
14205 _NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) // VMLS.I32 q0, q0, d0[0]
14206 {
14207 uint32x4_t vc;
14208 vc = vdupq_n_u32(c);
14209 return vmlsq_u32(a,b,vc);
14210 }
14211
14212 _NEON2SSESTORAGE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
14213 _NEON2SSE_INLINE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c)
14214 {
14215 float32x4_t vc;
14216 vc = vdupq_n_f32(c);
14217 return vmlsq_f32(a,b,vc);
14218 }
14219
14220 //**** Vector widening multiply subtract with scalar ******
14221 _NEON2SSESTORAGE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
14222 _NEON2SSE_INLINE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLSL.S16 q0, d0, d0[0]
14223 {
14224 int16x4_t vc;
14225 vc = vdup_n_s16(c);
14226 return vmlsl_s16(a, b, vc);
14227 }
14228
14229 _NEON2SSESTORAGE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
14230 _NEON2SSE_INLINE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLSL.S32 q0, d0, d0[0]
14231 {
14232 int32x2_t vc;
14233 vc = vdup_n_s32(c);
14234 return vmlsl_s32(a, b, vc);
14235 }
14236
14237 _NEON2SSESTORAGE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.s16 q0, d0, d0[0]
14238 _NEON2SSE_INLINE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLSL.s16 q0, d0, d0[0]
14239 {
14240 uint16x4_t vc;
14241 vc = vdup_n_u16(c);
14242 return vmlsl_u16(a, b, vc);
14243 }
14244
14245 _NEON2SSESTORAGE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
14246 _NEON2SSE_INLINE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLSL.U32 q0, d0, d0[0]
14247 {
14248 uint32x2_t vc;
14249 vc = vdup_n_u32(c);
14250 return vmlsl_u32(a, b, vc);
14251 }
14252
14253 //***** Vector widening saturating doubling multiply subtract with scalar *********
14254 //**********************************************************************************
14255 _NEON2SSESTORAGE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
14256 _NEON2SSE_INLINE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c)
14257 {
14258 int16x4_t vc;
14259 vc = vdup_n_s16(c);
14260 return vqdmlsl_s16(a, b, vc);
14261 }
14262
14263 _NEON2SSESTORAGE int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
14264 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
14265 {
14266 int32x2_t vc;
14267 vc = vdup_n_s32(c);
14268 return vqdmlsl_s32(a, b, vc);
14269 }
14270
14271 //******************* Vector extract ***********************************************
14272 //*************************************************************************************
14273 //VEXT (Vector Extract) extracts elements from the bottom end of the second operand
14274 //vector and the top end of the first, concatenates them, and places the result in the destination vector
14275 //the result is the top (n-c) elements of the first operand followed by the bottom c elements of the second, n being the element count
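//Worked example (arbitrary values): a = {0,1,2,3,4,5,6,7}, b = {10,11,12,13,14,15,16,17}, c = 3
//    vext_u8(a, b, 3) = {3,4,5,6,7,10,11,12}    //a[3..7] followed by b[0..2]
//The 128-bit forms below map this directly onto PALIGNR (_MM_ALIGNR_EPI8).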
14276 _NEON2SSESTORAGE int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
14277 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c),_NEON2SSE_REASON_SLOW_SERIAL)
14278 {
14279 int8x8_t res;
14280 int i;
14281 for (i = 0; i<8 - c; i++) {
14282 res.m64_i8[i] = a.m64_i8[i + c];
14283 }
14284 for(i = 0; i<c; i++) {
14285 res.m64_i8[8 - c + i] = b.m64_i8[i];
14286 }
14287 return res;
14288 }
14289
14290 _NEON2SSESTORAGE uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
14291 #define vext_u8 vext_s8
14292 //same result tested
14293
14294 _NEON2SSESTORAGE poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
14295 #define vext_p8 vext_u8
14296
14297 _NEON2SSESTORAGE int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
14298 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c), _NEON2SSE_REASON_SLOW_SERIAL)
14299 {
14300 int16x4_t res;
14301 int i;
14302 for (i = 0; i<4 - c; i++) {
14303 res.m64_i16[i] = a.m64_i16[i + c];
14304 }
14305 for(i = 0; i<c; i++) {
14306 res.m64_i16[4 - c + i] = b.m64_i16[i];
14307 }
14308 return res;
14309 }
14310
14311 _NEON2SSESTORAGE uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
14312 #define vext_u16 vext_s16
14313
14314 _NEON2SSESTORAGE poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
14315 #define vext_p16 vext_s16
14316
14317 _NEON2SSESTORAGE int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
14318 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
14319 {
14320 int32x2_t res;
14321 if (c==0) {
14322 res.m64_i32[0] = a.m64_i32[0];
14323 res.m64_i32[1] = a.m64_i32[1];
14324 } else {
14325 res.m64_i32[0] = a.m64_i32[1];
14326 res.m64_i32[1] = b.m64_i32[0];
14327 }
14328 return res;
14329 }
14330
14331 _NEON2SSESTORAGE float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
14332 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
14333 {
14334 float32x2_t res;
14335 if (c==0) {
14336 res.m64_f32[0] = a.m64_f32[0];
14337 res.m64_f32[1] = a.m64_f32[1];
14338 } else {
14339 res.m64_f32[0] = a.m64_f32[1];
14340 res.m64_f32[1] = b.m64_f32[0];
14341 }
14342 return res;
14343 }
14344
14345 _NEON2SSESTORAGE uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
14346 #define vext_u32 vext_s32
14347
14348
14349 _NEON2SSESTORAGE int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
14350 #define vext_s64(a,b,c) a
14351
14352 _NEON2SSESTORAGE uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
14353 #define vext_u64(a,b,c) a
14354
14355 _NEON2SSESTORAGE int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
14356 #define vextq_s8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
14357
14358 _NEON2SSESTORAGE uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
14359 #define vextq_u8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
14360
14361 _NEON2SSESTORAGE poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
14362 #define vextq_p8 vextq_s8
14363
14364 _NEON2SSESTORAGE int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
14365 #define vextq_s16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
14366
14367 _NEON2SSESTORAGE uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
14368 #define vextq_u16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
14369
14370 _NEON2SSESTORAGE poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
14371 #define vextq_p16 vextq_s16
14372
14373 _NEON2SSESTORAGE int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
14374 #define vextq_s32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
14375
14376 _NEON2SSESTORAGE uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
14377 #define vextq_u32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
14378
14379 _NEON2SSESTORAGE float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
14380 #define vextq_f32(a,b,c) _M128(vextq_s32(_M128i(a),_M128i(b),c) )
14381
14382 _NEON2SSESTORAGE int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
14383 #define vextq_s64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
14384
14385 _NEON2SSESTORAGE uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
14386 #define vextq_u64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
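//Note on the q-register VEXT mappings above: _MM_ALIGNR_EPI8(b,a,n) concatenates b:a and shifts right by n *bytes*,
//so the NEON lane count c is scaled by the element size (x2 for 16-bit, x4 for 32-bit, x8 for 64-bit lanes).
//Illustrative example (values assumed): vextq_s16(a, b, 3) maps to _MM_ALIGNR_EPI8(b, a, 6) and yields {a3,a4,a5,a6,a7,b0,b1,b2}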
14387
14388 //************ Reverse vector elements (swap endianness)*****************
14389 //*************************************************************************
14390 //VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
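//Illustrative example (values assumed): with d0 = {0,1,2,3,4,5,6,7} as int8,
// VREV64.8 d0,d0 -> {7,6,5,4,3,2,1,0} (bytes reversed within the whole 64-bit doubleword)
// VREV32.8 d0,d0 -> {3,2,1,0, 7,6,5,4} (bytes reversed within each 32-bit word)
// VREV16.8 d0,d0 -> {1,0, 3,2, 5,4, 7,6} (bytes reversed within each 16-bit halfword)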
14391 _NEON2SSESTORAGE int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
_NEON2SSE_INLINE int8x8_t vrev64_s8(int8x8_t vec)
14393 {
14394 int8x8_t res64;
14395 __m128i res;
14396 res = vrev64q_s8(_pM128i(vec));
14397 return64(res);
14398 }
14399
14400 _NEON2SSESTORAGE int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
_NEON2SSE_INLINE int16x4_t vrev64_s16(int16x4_t vec)
14402 {
14403 int16x4_t res64;
14404 __m128i res;
14405 res = vrev64q_s16(_pM128i(vec));
14406 return64(res);
14407 }
14408
14409 _NEON2SSESTORAGE int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
_NEON2SSE_INLINE int32x2_t vrev64_s32(int32x2_t vec)
14411 {
14412 int32x2_t res;
14413 res.m64_i32[0] = vec.m64_i32[1];
14414 res.m64_i32[1] = vec.m64_i32[0];
14415 return res;
14416 }
14417
14418 _NEON2SSESTORAGE uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
14419 #define vrev64_u8 vrev64_s8
14420
14421 _NEON2SSESTORAGE uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
14422 #define vrev64_u16 vrev64_s16
14423
14424 _NEON2SSESTORAGE uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
14425 #define vrev64_u32 vrev64_s32
14426
14427 _NEON2SSESTORAGE poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
14428 #define vrev64_p8 vrev64_u8
14429
14430 _NEON2SSESTORAGE poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
14431 #define vrev64_p16 vrev64_u16
14432
14433 _NEON2SSESTORAGE float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
_NEON2SSE_INLINE float32x2_t vrev64_f32(float32x2_t vec)
14435 {
14436 float32x2_t res;
14437 res.m64_f32[0] = vec.m64_f32[1];
14438 res.m64_f32[1] = vec.m64_f32[0];
14439 return res;
14440 }
14441
14442 _NEON2SSESTORAGE int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
_NEON2SSE_INLINE int8x16_t vrev64q_s8(int8x16_t vec) // VREV64.8 q0,q0
14444 {
14445 _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9, 8};
14446 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
14447 }
14448
14449 _NEON2SSESTORAGE int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
_NEON2SSE_INLINE int16x8_t vrev64q_s16(int16x8_t vec) // VREV64.16 q0,q0
14451 {
//there is no _mm_shuffle_epi16, so _mm_shuffle_epi8 is used with the corresponding mask
14453 _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e16[16] = {6,7, 4,5,2,3,0,1,14,15,12,13,10,11,8,9};
14454 return _mm_shuffle_epi8 (vec, *(__m128i*)mask_rev_e16);
14455 }
14456
14457 _NEON2SSESTORAGE int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
_NEON2SSE_INLINE int32x4_t vrev64q_s32(int32x4_t vec) // VREV64.32 q0,q0
14459 {
14460 return _mm_shuffle_epi32 (vec, 1 | (0 << 2) | (3 << 4) | (2 << 6) );
14461 }
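//The shuffle immediate above equals _MM_SHUFFLE(2,3,0,1) = 0xB1, i.e. the 32-bit lanes are picked in the order 1,0,3,2,
//which swaps the two words inside each 64-bit half - exactly the VREV64.32 behaviour.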
14462
14463 _NEON2SSESTORAGE uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
14464 #define vrev64q_u8 vrev64q_s8
14465
14466 _NEON2SSESTORAGE uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
14467 #define vrev64q_u16 vrev64q_s16
14468
14469 _NEON2SSESTORAGE uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
14470 #define vrev64q_u32 vrev64q_s32
14471
14472 _NEON2SSESTORAGE poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
14473 #define vrev64q_p8 vrev64q_u8
14474
14475 _NEON2SSESTORAGE poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
14476 #define vrev64q_p16 vrev64q_u16
14477
14478 _NEON2SSESTORAGE float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
14479 #define vrev64q_f32(vec) _mm_shuffle_ps (vec, vec, _MM_SHUFFLE(2,3, 0,1))
14480
14481 //******************** 32 bit shuffles **********************
14482 //************************************************************
14483 _NEON2SSESTORAGE int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
_NEON2SSE_INLINE int8x8_t vrev32_s8(int8x8_t vec)
14485 {
14486 int8x8_t res64;
14487 __m128i res;
14488 res = vrev32q_s8(_pM128i(vec));
14489 return64(res);
14490 }
14491
14492 _NEON2SSESTORAGE int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
_NEON2SSE_INLINE int16x4_t vrev32_s16(int16x4_t vec)
14494 {
14495 int16x4_t res64;
14496 __m128i res;
14497 res = vrev32q_s16(_pM128i(vec));
14498 return64(res);
14499 }
14500
14501 _NEON2SSESTORAGE uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
14502 #define vrev32_u8 vrev32_s8
14503
14504 _NEON2SSESTORAGE uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
14505 #define vrev32_u16 vrev32_s16
14506
14507 _NEON2SSESTORAGE poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
14508 #define vrev32_p8 vrev32_u8
14509
14510 _NEON2SSESTORAGE poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
14511 #define vrev32_p16 vrev32_u16
14512
14513 _NEON2SSESTORAGE int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
_NEON2SSE_INLINE int8x16_t vrev32q_s8(int8x16_t vec) // VREV32.8 q0,q0
14515 {
14516 _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
14517 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
14518 }
14519
14520 _NEON2SSESTORAGE int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
_NEON2SSE_INLINE int16x8_t vrev32q_s16(int16x8_t vec) // VREV32.16 q0,q0
14522 {
14523 _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {2,3,0,1, 6,7, 4,5, 10,11, 8,9, 14,15,12,13};
14524 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
14525 }
14526
14527 _NEON2SSESTORAGE uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
14528 #define vrev32q_u8 vrev32q_s8
14529
14530 _NEON2SSESTORAGE uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
14531 #define vrev32q_u16 vrev32q_s16
14532
14533 _NEON2SSESTORAGE poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
14534 #define vrev32q_p8 vrev32q_u8
14535
14536 _NEON2SSESTORAGE poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
14537 #define vrev32q_p16 vrev32q_u16
14538
14539 //************* 16 bit shuffles **********************
14540 //******************************************************
14541 _NEON2SSESTORAGE int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
_NEON2SSE_INLINE int8x8_t vrev16_s8(int8x8_t vec)
14543 {
14544 int8x8_t res64;
14545 __m128i res;
14546 res = vrev16q_s8(_pM128i(vec));
14547 return64(res);
14548 }
14549
14550 _NEON2SSESTORAGE uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
14551 #define vrev16_u8 vrev16_s8
14552
14553 _NEON2SSESTORAGE poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
14554 #define vrev16_p8 vrev16_u8
14555
14556 _NEON2SSESTORAGE int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
_NEON2SSE_INLINE int8x16_t vrev16q_s8(int8x16_t vec) // VREV16.8 q0,q0
14558 {
14559 _NEON2SSE_ALIGN_16 static const int8_t mask_rev8[16] = {1,0, 3,2, 5,4, 7,6, 9,8, 11, 10, 13, 12, 15, 14};
14560 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev8);
14561 }
14562
14563 _NEON2SSESTORAGE uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
14564 #define vrev16q_u8 vrev16q_s8
14565
14566 _NEON2SSESTORAGE poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
14567 #define vrev16q_p8 vrev16q_u8
14568
14569 //*********************************************************************
14570 //**************** Other single operand arithmetic *******************
14571 //*********************************************************************
14572
14573 //*********** Absolute: Vd[i] = |Va[i]| **********************************
14574 //************************************************************************
14575 _NEON2SSESTORAGE int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
_NEON2SSE_INLINE int8x8_t vabs_s8(int8x8_t a)
14577 {
14578 int8x8_t res64;
14579 __m128i res;
14580 res = _mm_abs_epi8(_pM128i(a));
14581 return64(res);
14582 }
14583
14584
14585 _NEON2SSESTORAGE int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
_NEON2SSE_INLINE int16x4_t vabs_s16(int16x4_t a)
14587 {
14588 int16x4_t res64;
14589 __m128i res;
14590 res = _mm_abs_epi16(_pM128i(a));
14591 return64(res);
14592 }
14593
14594 _NEON2SSESTORAGE int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
_NEON2SSE_INLINE int32x2_t vabs_s32(int32x2_t a)
14596 {
14597 int32x2_t res64;
14598 __m128i res;
14599 res = _mm_abs_epi32(_pM128i(a));
14600 return64(res);
14601 }
14602
14603 _NEON2SSESTORAGE float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
_NEON2SSE_INLINE float32x2_t vabs_f32(float32x2_t a) // VABS.F32 d0,d0
14605 {
14606 float32x4_t res;
14607 __m64_128 res64;
14608 _NEON2SSE_ALIGN_16 static const int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
14609 res = _mm_and_ps (_pM128(a), *(__m128*)c7fffffff); //use 64 low bits only
14610 _M64f(res64, res);
14611 return res64;
14612 }
14613
14614 _NEON2SSESTORAGE int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
14615 #define vabsq_s8 _mm_abs_epi8
14616
14617 _NEON2SSESTORAGE int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
14618 #define vabsq_s16 _mm_abs_epi16
14619
14620 _NEON2SSESTORAGE int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
14621 #define vabsq_s32 _mm_abs_epi32
14622
14623 _NEON2SSESTORAGE float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
_NEON2SSE_INLINE float32x4_t vabsq_f32(float32x4_t a) // VABS.F32 q0,q0
14625 {
14626 _NEON2SSE_ALIGN_16 static const int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
14627 return _mm_and_ps (a, *(__m128*)c7fffffff);
14628 }
14629
14630 #ifdef _NEON2SSE_64BIT
14631 _NEON2SSESTORAGE int64x2_t vabsq_s64(int64x2_t a); // VABS.S64 q0,q0
_NEON2SSE_INLINE int64x2_t vabsq_s64(int64x2_t a) // VABS.S64 q0,q0
14633 {
14634 __m128i sign = _mm_srai_epi32 (_mm_shuffle_epi32 (a, 0xf5), 31);
14635 return _mm_sub_epi64 (_mm_xor_si128 (a, sign), sign);
14636 }
14637
14638 _NEON2SSESTORAGE float64x2_t vabsq_f64(float64x2_t a); // VABS.F64 q0,q0
_NEON2SSE_INLINE float64x2_t vabsq_f64(float64x2_t a) // VABS.F64 q0,q0
14640 {
14641 _NEON2SSE_ALIGN_16 static const int64_t mask[2] = {0x7fffffffffffffffLL, 0x7fffffffffffffffLL};
14642 return _mm_and_pd (a, *(__m128d*)mask);
14643 }
14644 #endif
14645
14646 //****** Saturating absolute: Vd[i] = sat(|Va[i]|) *********************
14647 //**********************************************************************
//For signed-integer data types, the absolute value of the most negative value is not representable by the data type, so saturation takes place
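//e.g. vqabs_s8 of -128 returns +127 instead of wrapping back to -128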
14649 _NEON2SSESTORAGE int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
_NEON2SSE_INLINE int8x8_t vqabs_s8(int8x8_t a)
14651 {
14652 int8x8_t res64;
14653 __m128i res;
14654 res = vqabsq_s8(_pM128i(a));
14655 return64(res);
14656 }
14657
14658 _NEON2SSESTORAGE int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
_NEON2SSE_INLINE int16x4_t vqabs_s16(int16x4_t a)
14660 {
14661 int16x4_t res64;
14662 __m128i res;
14663 res = vqabsq_s16(_pM128i(a));
14664 return64(res);
14665 }
14666
14667 _NEON2SSESTORAGE int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
_NEON2SSE_INLINE int32x2_t vqabs_s32(int32x2_t a)
14669 {
14670 int32x2_t res64;
14671 __m128i res;
14672 res = vqabsq_s32(_pM128i(a));
14673 return64(res);
14674 }
14675
14676 _NEON2SSESTORAGE int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
_NEON2SSE_INLINE int8x16_t vqabsq_s8(int8x16_t a) // VQABS.S8 q0,q0
14678 {
14679 __m128i c_128, abs, abs_cmp;
14680 c_128 = _mm_set1_epi8 ((int8_t)0x80); //-128
14681 abs = _mm_abs_epi8 (a);
14682 abs_cmp = _mm_cmpeq_epi8 (abs, c_128);
14683 return _mm_xor_si128 (abs, abs_cmp);
14684 }
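//How the trick above works (example lane values assumed): for a lane equal to -128, _mm_abs_epi8 wraps to 0x80 again;
//the compare then yields 0xff for that lane only, and the final xor turns 0x80 into 0x7f (=127), i.e. the saturated result.
//All other lanes compare to 0x00, so the xor leaves their absolute values unchanged.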
14685
14686 _NEON2SSESTORAGE int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
_NEON2SSE_INLINE int16x8_t vqabsq_s16(int16x8_t a) // VQABS.S16 q0,q0
14688 {
14689 __m128i c_32768, abs, abs_cmp;
14690 c_32768 = _mm_set1_epi16 ((int16_t)0x8000); //-32768
14691 abs = _mm_abs_epi16 (a);
14692 abs_cmp = _mm_cmpeq_epi16 (abs, c_32768);
14693 return _mm_xor_si128 (abs, abs_cmp);
14694 }
14695
14696 _NEON2SSESTORAGE int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
_NEON2SSE_INLINE int32x4_t vqabsq_s32(int32x4_t a) // VQABS.S32 q0,q0
14698 {
14699 __m128i c80000000, abs, abs_cmp;
14700 c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
14701 abs = _mm_abs_epi32 (a);
14702 abs_cmp = _mm_cmpeq_epi32 (abs, c80000000);
14703 return _mm_xor_si128 (abs, abs_cmp);
14704 }
14705
14706 //*************** Negate: Vd[i] = - Va[i] *************************************
14707 //*****************************************************************************
//several Negate implementations are possible for SIMD,
//e.g. the _mm_sign_xx functions (a, vector of negative numbers), but the following one gives good performance:
_NEON2SSESTORAGE int8x8_t vneg_s8(int8x8_t a); // VNEG.S8 d0,d0
_NEON2SSE_INLINE int8x8_t vneg_s8(int8x8_t a)
14712 {
14713 int8x8_t res64;
14714 __m128i res;
14715 res = vnegq_s8(_pM128i(a));
14716 return64(res);
14717 }
14718
_NEON2SSESTORAGE int16x4_t vneg_s16(int16x4_t a); // VNEG.S16 d0,d0
_NEON2SSE_INLINE int16x4_t vneg_s16(int16x4_t a)
14721 {
14722 int16x4_t res64;
14723 __m128i res;
14724 res = vnegq_s16(_pM128i(a));
14725 return64(res);
14726 }
14727
_NEON2SSESTORAGE int32x2_t vneg_s32(int32x2_t a); // VNEG.S32 d0,d0
_NEON2SSE_INLINE int32x2_t vneg_s32(int32x2_t a)
14730 {
14731 int32x2_t res64;
14732 __m128i res;
14733 res = vnegq_s32(_pM128i(a));
14734 return64(res);
14735 }
14736
_NEON2SSESTORAGE float32x2_t vneg_f32(float32x2_t a); // VNEG.F32 d0,d0
_NEON2SSE_INLINE float32x2_t vneg_f32(float32x2_t a) // VNEG.F32 d0,d0
14739 {
14740 float32x4_t res;
14741 __m64_128 res64;
14742 _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
14743 res = _mm_xor_ps (_pM128(a), *(__m128*) c80000000); //use low 64 bits
14744 _M64f(res64, res);
14745 return res64;
14746 }
14747
_NEON2SSESTORAGE int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0
_NEON2SSE_INLINE int8x16_t vnegq_s8(int8x16_t a) // VNEG.S8 q0,q0
14750 {
14751 __m128i zero;
14752 zero = _mm_setzero_si128 ();
14753 return _mm_sub_epi8 (zero, a);
14754 } //or _mm_sign_epi8 (a, negative numbers vector)
14755
_NEON2SSESTORAGE int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0
_NEON2SSE_INLINE int16x8_t vnegq_s16(int16x8_t a) // VNEG.S16 q0,q0
14758 {
14759 __m128i zero;
14760 zero = _mm_setzero_si128 ();
14761 return _mm_sub_epi16 (zero, a);
14762 } //or _mm_sign_epi16 (a, negative numbers vector)
14763
_NEON2SSESTORAGE int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0
_NEON2SSE_INLINE int32x4_t vnegq_s32(int32x4_t a) // VNEG.S32 q0,q0
14766 {
14767 __m128i zero;
14768 zero = _mm_setzero_si128 ();
14769 return _mm_sub_epi32 (zero, a);
14770 } //or _mm_sign_epi32 (a, negative numbers vector)
14771
_NEON2SSESTORAGE float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0
_NEON2SSE_INLINE float32x4_t vnegq_f32(float32x4_t a) // VNEG.F32 q0,q0
14774 {
14775 _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
14776 return _mm_xor_ps (a, *(__m128*) c80000000);
14777 }
14778
14779 //************** Saturating Negate: sat(Vd[i] = - Va[i]) **************************
14780 //***************************************************************************************
//For signed-integer data types, the negation of the most negative value cannot be produced without saturation; with saturation the result is the maximum positive value
_NEON2SSESTORAGE int8x8_t vqneg_s8(int8x8_t a); // VQNEG.S8 d0,d0
_NEON2SSE_INLINE int8x8_t vqneg_s8(int8x8_t a)
14784 {
14785 int8x8_t res64;
14786 __m128i res;
14787 res = vqnegq_s8(_pM128i(a));
14788 return64(res);
14789 }
14790
_NEON2SSESTORAGE int16x4_t vqneg_s16(int16x4_t a); // VQNEG.S16 d0,d0
_NEON2SSE_INLINE int16x4_t vqneg_s16(int16x4_t a)
14793 {
14794 int16x4_t res64;
14795 __m128i res;
14796 res = vqnegq_s16(_pM128i(a));
14797 return64(res);
14798 }
14799
_NEON2SSESTORAGE int32x2_t vqneg_s32(int32x2_t a); // VQNEG.S32 d0,d0
_NEON2SSE_INLINE int32x2_t vqneg_s32(int32x2_t a)
14802 {
14803 int32x2_t res64;
14804 __m128i res;
14805 res = vqnegq_s32(_pM128i(a));
14806 return64(res);
14807 }
14808
_NEON2SSESTORAGE int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0
_NEON2SSE_INLINE int8x16_t vqnegq_s8(int8x16_t a) // VQNEG.S8 q0,q0
14811 {
14812 __m128i zero;
14813 zero = _mm_setzero_si128 ();
return _mm_subs_epi8 (zero, a); //saturating subtraction
14815 }
14816
_NEON2SSESTORAGE int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0
_NEON2SSE_INLINE int16x8_t vqnegq_s16(int16x8_t a) // VQNEG.S16 q0,q0
14819 {
14820 __m128i zero;
14821 zero = _mm_setzero_si128 ();
return _mm_subs_epi16 (zero, a); //saturating subtraction
14823 }
14824
_NEON2SSESTORAGE int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0
_NEON2SSE_INLINE int32x4_t vqnegq_s32(int32x4_t a) // VQNEG.S32 q0,q0
14827 {
//this solution may not be optimal compared with a serial one
14829 __m128i c80000000, zero, sub, cmp;
14830 c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
14831 zero = _mm_setzero_si128 ();
sub = _mm_sub_epi32 (zero, a); //subtraction
14833 cmp = _mm_cmpeq_epi32 (a, c80000000);
14834 return _mm_xor_si128 (sub, cmp);
14835 }
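//Example of the saturation handling above (lane values assumed): for a lane equal to 0x80000000 the plain subtraction
//returns 0x80000000 again, the compare produces an all-ones mask for that lane, and the xor flips it to 0x7fffffff
//(the saturated maximum); all other lanes get a zero mask and keep the exact negation.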
14836
14837 //****************** Count leading zeros ********************************
14838 //**************************************************************************
//no corresponding vector intrinsics in IA32, need to implement it. While the implementation is effective for 8 bits, it may not be for 16 and 32 bits
14840 _NEON2SSESTORAGE int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
_NEON2SSE_INLINE int8x8_t vclz_s8(int8x8_t a)
14842 {
14843 int8x8_t res64;
14844 __m128i res;
14845 res = vclzq_s8(_pM128i(a));
14846 return64(res);
14847 }
14848
14849 _NEON2SSESTORAGE int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
_NEON2SSE_INLINE int16x4_t vclz_s16(int16x4_t a)
14851 {
14852 int16x4_t res64;
14853 __m128i res;
14854 res = vclzq_s16(_pM128i(a));
14855 return64(res);
14856 }
14857
14858 _NEON2SSESTORAGE int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
_NEON2SSE_INLINE int32x2_t vclz_s32(int32x2_t a)
14860 {
14861 int32x2_t res64;
14862 __m128i res;
14863 res = vclzq_s32(_pM128i(a));
14864 return64(res);
14865 }
14866
14867
14868 _NEON2SSESTORAGE uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
14869 #define vclz_u8 vclz_s8
14870
14871 _NEON2SSESTORAGE uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
14872 #define vclz_u16 vclz_s16
14873
14874 _NEON2SSESTORAGE uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
14875 #define vclz_u32 vclz_s32
14876
14877 _NEON2SSESTORAGE int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
_NEON2SSE_INLINE int8x16_t vclzq_s8(int8x16_t a)
14879 {
14880 _NEON2SSE_ALIGN_16 static const int8_t mask_CLZ[16] = { /* 0 */ 4,/* 1 */ 3,/* 2 */ 2,/* 3 */ 2,
14881 /* 4 */ 1,/* 5 */ 1,/* 6 */ 1,/* 7 */ 1,
14882 /* 8 */ 0,/* 9 */ 0,/* a */ 0,/* b */ 0,
14883 /* c */ 0,/* d */ 0,/* e */ 0,/* f */ 0 };
14884 __m128i maskLOW, c4, lowclz, mask, hiclz;
14885 maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, don't need masking low to avoid zero if MSB is set - it happens automatically
14886 c4 = _mm_set1_epi8(4);
14887 lowclz = _mm_shuffle_epi8( *(__m128i*)mask_CLZ, a); //uses low 4 bits anyway
14888 mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits
14889 mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
14890 hiclz = _mm_shuffle_epi8( *(__m128i*) mask_CLZ, mask); //uses low 4 bits anyway
14891 mask = _mm_cmpeq_epi8(hiclz, c4); // shows the need to add lowclz zeros
14892 lowclz = _mm_and_si128(lowclz,mask);
14893 return _mm_add_epi8(lowclz, hiclz);
14894 }
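//Worked example for the nibble LUT above (byte values assumed):
// a = 0x1a: low nibble 0xa -> LUT 0, high nibble 0x1 -> LUT 3; the high result != 4, so the low part is discarded -> clz = 3
// a = 0x05: low nibble 0x5 -> LUT 1, high nibble 0x0 -> LUT 4; the high result == 4, so both are added -> clz = 4 + 1 = 5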
14895
14896 _NEON2SSESTORAGE int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
_NEON2SSE_INLINE int16x8_t vclzq_s16(int16x8_t a)
14898 {
14899 __m128i c7, res8x16, res8x16_swap;
14900 _NEON2SSE_ALIGN_16 static const int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
14901 _NEON2SSE_ALIGN_16 static const uint16_t mask8bit[8] = {0x00ff, 0x00ff, 0x00ff, 0x00ff,0x00ff, 0x00ff, 0x00ff, 0x00ff};
14902 c7 = _mm_srli_epi16(*(__m128i*)mask8bit, 5); //7
14903 res8x16 = vclzq_s8(a);
res8x16_swap = _mm_shuffle_epi8 (res8x16, *(__m128i*) mask8_sab); //horizontal pairs swap
14905 res8x16 = _mm_and_si128(res8x16, *(__m128i*)mask8bit); //lowclz
14906 res8x16_swap = _mm_and_si128(res8x16_swap, *(__m128i*)mask8bit); //hiclz
14907 c7 = _mm_cmpgt_epi16(res8x16_swap, c7); // shows the need to add lowclz zeros
14908 res8x16 = _mm_and_si128(res8x16, c7); //lowclz
14909 return _mm_add_epi16(res8x16_swap, res8x16);
14910 }
14911
14912 _NEON2SSESTORAGE int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
_NEON2SSE_INLINE int32x4_t vclzq_s32(int32x4_t a)
14914 {
14915 __m128i c55555555, c33333333, c0f0f0f0f, c3f, c32, tmp, tmp1, res;
14916 c55555555 = _mm_set1_epi32(0x55555555);
14917 c33333333 = _mm_set1_epi32(0x33333333);
14918 c0f0f0f0f = _mm_set1_epi32(0x0f0f0f0f);
14919 c3f = _mm_set1_epi32(0x3f);
14920 c32 = _mm_set1_epi32(32);
14921 tmp = _mm_srli_epi32(a, 1);
14922 res = _mm_or_si128(tmp, a); //atmp[i] |= (atmp[i] >> 1);
14923 tmp = _mm_srli_epi32(res, 2);
14924 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 2);
14925 tmp = _mm_srli_epi32(res, 4);
14926 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 4);
14927 tmp = _mm_srli_epi32(res, 8);
14928 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 8);
14929 tmp = _mm_srli_epi32(res, 16);
14930 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 16);
14931
14932 tmp = _mm_srli_epi32(res, 1);
14933 tmp = _mm_and_si128(tmp, c55555555);
14934 res = _mm_sub_epi32(res, tmp); //atmp[i] -= ((atmp[i] >> 1) & 0x55555555);
14935
14936 tmp = _mm_srli_epi32(res, 2);
14937 tmp = _mm_and_si128(tmp, c33333333);
14938 tmp1 = _mm_and_si128(res, c33333333);
14939 res = _mm_add_epi32(tmp, tmp1); //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333));
14940
14941 tmp = _mm_srli_epi32(res, 4);
14942 tmp = _mm_add_epi32(tmp, res);
14943 res = _mm_and_si128(tmp, c0f0f0f0f); //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 0x0f0f0f0f);
14944
14945 tmp = _mm_srli_epi32(res, 8);
14946 res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 8);
14947
14948 tmp = _mm_srli_epi32(res, 16);
14949 res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 16);
14950
14951 res = _mm_and_si128(res, c3f); //atmp[i] = atmp[i] & 0x0000003f;
14952
14953 return _mm_sub_epi32(c32, res); //res[i] = 32 - atmp[i];
14954 }
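//The code above first smears the highest set bit rightwards (so every bit below it becomes 1),
//then counts the set bits with the classic SWAR popcount and returns 32 - popcount.
//Illustrative value (assumed): a = 0x00010000 smears to 0x0001ffff, popcount = 17, result = 32 - 17 = 15.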
14955
14956 _NEON2SSESTORAGE uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
14957 #define vclzq_u8 vclzq_s8
14958
14959 _NEON2SSESTORAGE uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
14960 #define vclzq_u16 vclzq_s16
14961
14962 _NEON2SSESTORAGE uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
14963 #define vclzq_u32 vclzq_s32
14964
14965 //************** Count leading sign bits **************************
14966 //********************************************************************
14967 //VCLS (Vector Count Leading Sign bits) counts the number of consecutive bits following
14968 // the topmost bit, that are the same as the topmost bit, in each element in a vector
14969 //No corresponding vector intrinsics in IA32, need to implement it.
//While the implementation is effective for 8 bits, it may not be for 16 and 32 bits
14971 _NEON2SSESTORAGE int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
_NEON2SSE_INLINE int8x8_t vcls_s8(int8x8_t a)
14973 {
14974 int8x8_t res64;
14975 __m128i res;
14976 res = vclsq_s8(_pM128i(a));
14977 return64(res);
14978 }
14979
14980 _NEON2SSESTORAGE int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
_NEON2SSE_INLINE int16x4_t vcls_s16(int16x4_t a)
14982 {
14983 int16x4_t res64;
14984 __m128i res;
14985 res = vclsq_s16(_pM128i(a));
14986 return64(res);
14987 }
14988
14989 _NEON2SSESTORAGE int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
_NEON2SSE_INLINE int32x2_t vcls_s32(int32x2_t a)
14991 {
14992 int32x2_t res64;
14993 __m128i res;
14994 res = vclsq_s32(_pM128i(a));
14995 return64(res);
14996 }
14997
14998 _NEON2SSESTORAGE int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
_NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a)
15000 {
15001 __m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb;
15002 cff = _mm_cmpeq_epi8 (a,a); //0xff
15003 c80 = _mm_set1_epi8((int8_t)0x80);
15004 c1 = _mm_set1_epi8(1);
15005 a_mask = _mm_and_si128(a, c80);
15006 a_mask = _mm_cmpeq_epi8(a_mask, c80); //0xff if negative input and 0 if positive
15007 a_neg = _mm_xor_si128(a, cff);
15008 a_neg = _mm_and_si128(a_mask, a_neg);
15009 a_pos = _mm_andnot_si128(a_mask, a);
15010 a_comb = _mm_or_si128(a_pos, a_neg);
15011 a_comb = vclzq_s8(a_comb);
15012 return _mm_sub_epi8(a_comb, c1);
15013 }
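//Example of the sign handling above (int8 lane values assumed): for a = -16 (0xf0) the lane is complemented to 0x0f,
//vclzq_s8 gives 4 and the result is 4 - 1 = 3 sign bits after the MSB; for a = 0x1f the lane is used as is,
//vclzq_s8 gives 3 and the result is 2.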
15014
15015 _NEON2SSESTORAGE int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
_NEON2SSE_INLINE int16x8_t vclsq_s16(int16x8_t a)
15017 {
15018 __m128i cffff, c8000, c1, a_mask, a_neg, a_pos, a_comb;
15019 cffff = _mm_cmpeq_epi16(a,a);
15020 c8000 = _mm_slli_epi16(cffff, 15); //0x8000
15021 c1 = _mm_srli_epi16(cffff,15); //0x1
15022 a_mask = _mm_and_si128(a, c8000);
15023 a_mask = _mm_cmpeq_epi16(a_mask, c8000); //0xffff if negative input and 0 if positive
15024 a_neg = _mm_xor_si128(a, cffff);
15025 a_neg = _mm_and_si128(a_mask, a_neg);
15026 a_pos = _mm_andnot_si128(a_mask, a);
15027 a_comb = _mm_or_si128(a_pos, a_neg);
15028 a_comb = vclzq_s16(a_comb);
15029 return _mm_sub_epi16(a_comb, c1);
15030 }
15031
15032 _NEON2SSESTORAGE int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
_NEON2SSE_INLINE int32x4_t vclsq_s32(int32x4_t a)
15034 {
15035 __m128i cffffffff, c80000000, c1, a_mask, a_neg, a_pos, a_comb;
15036 cffffffff = _mm_cmpeq_epi32(a,a);
15037 c80000000 = _mm_slli_epi32(cffffffff, 31); //0x80000000
15038 c1 = _mm_srli_epi32(cffffffff,31); //0x1
15039 a_mask = _mm_and_si128(a, c80000000);
15040 a_mask = _mm_cmpeq_epi32(a_mask, c80000000); //0xffffffff if negative input and 0 if positive
15041 a_neg = _mm_xor_si128(a, cffffffff);
15042 a_neg = _mm_and_si128(a_mask, a_neg);
15043 a_pos = _mm_andnot_si128(a_mask, a);
15044 a_comb = _mm_or_si128(a_pos, a_neg);
15045 a_comb = vclzq_s32(a_comb);
15046 return _mm_sub_epi32(a_comb, c1);
15047 }
15048
15049 //************************* Count number of set bits ********************************
15050 //*************************************************************************************
//No corresponding SIMD solution. One option is to get the elements, convert each to 32 bits and then use the SSE4.2 _mm_popcnt_u32 (unsigned int v) intrinsic per element
//another option is the following algorithm:
15053
15054 _NEON2SSESTORAGE uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
_NEON2SSE_INLINE uint8x8_t vcnt_u8(uint8x8_t a)
15056 {
15057 uint8x8_t res64;
15058 __m128i res;
15059 res = vcntq_u8(_pM128i(a));
15060 return64(res);
15061 }
15062
15063 _NEON2SSESTORAGE int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
15064 #define vcnt_s8 vcnt_u8
15065
15066 _NEON2SSESTORAGE poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
15067 #define vcnt_p8 vcnt_u8
15068
15069 _NEON2SSESTORAGE uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
_NEON2SSE_INLINE uint8x16_t vcntq_u8(uint8x16_t a)
15071 {
15072 _NEON2SSE_ALIGN_16 static const int8_t mask_POPCOUNT[16] = { /* 0 */ 0,/* 1 */ 1,/* 2 */ 1,/* 3 */ 2,
15073 /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3,
15074 /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3,
15075 /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4};
15076 __m128i maskLOW, mask, lowpopcnt, hipopcnt;
15077 maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, need masking to avoid zero if MSB is set
15078 mask = _mm_and_si128(a, maskLOW);
15079 lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask); //uses low 4 bits anyway
15080 mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits
15081 mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
15082 hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask); //uses low 4 bits anyway
15083 return _mm_add_epi8(lowpopcnt, hipopcnt);
15084 }
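//Worked example for the popcount LUT above (byte value assumed): a = 0xb7 -> low nibble 0x7 contributes 3,
//high nibble 0xb contributes 3, so the lane result is 6 set bits.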
15085
15086 _NEON2SSESTORAGE int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
15087 #define vcntq_s8 vcntq_u8
15088
15089 _NEON2SSESTORAGE poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
15090 #define vcntq_p8 vcntq_u8
15091
15092 //**************************************************************************************
15093 //*********************** Logical operations ****************************************
15094 //**************************************************************************************
15095 //************************** Bitwise not ***********************************
//Several bitwise NOT implementations are possible for SIMD, e.g. XOR with all ones, but the following one gives good performance
15097 _NEON2SSESTORAGE int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
_NEON2SSE_INLINE int8x8_t vmvn_s8(int8x8_t a)
15099 {
15100 int8x8_t res64;
15101 __m128i res;
15102 res = vmvnq_s8(_pM128i(a));
15103 return64(res);
15104 }
15105
15106 _NEON2SSESTORAGE int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
_NEON2SSE_INLINE int16x4_t vmvn_s16(int16x4_t a)
15108 {
15109 int16x4_t res64;
15110 __m128i res;
15111 res = vmvnq_s16(_pM128i(a));
15112 return64(res);
15113 }
15114
15115 _NEON2SSESTORAGE int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
_NEON2SSE_INLINE int32x2_t vmvn_s32(int32x2_t a)
15117 {
15118 int32x2_t res64;
15119 __m128i res;
15120 res = vmvnq_s32(_pM128i(a));
15121 return64(res);
15122 }
15123
15124 _NEON2SSESTORAGE uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
15125 #define vmvn_u8 vmvn_s8
15126
15127 _NEON2SSESTORAGE uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
15128 #define vmvn_u16 vmvn_s16
15129
15130 _NEON2SSESTORAGE uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
15131 #define vmvn_u32 vmvn_s32
15132
15133 _NEON2SSESTORAGE poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
15134 #define vmvn_p8 vmvn_u8
15135
15136 _NEON2SSESTORAGE int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
_NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a) // VMVN q0,q0
15138 {
15139 __m128i c1;
15140 c1 = _mm_cmpeq_epi8 (a,a); //0xff
15141 return _mm_andnot_si128 (a, c1);
15142 }
15143
15144 _NEON2SSESTORAGE int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
_NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a) // VMVN q0,q0
15146 {
15147 __m128i c1;
15148 c1 = _mm_cmpeq_epi16 (a,a); //0xffff
15149 return _mm_andnot_si128 (a, c1);
15150 }
15151
15152 _NEON2SSESTORAGE int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
_NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a) // VMVN q0,q0
15154 {
15155 __m128i c1;
15156 c1 = _mm_cmpeq_epi32 (a,a); //0xffffffff
15157 return _mm_andnot_si128 (a, c1);
15158 }
15159
15160 _NEON2SSESTORAGE uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
15161 #define vmvnq_u8 vmvnq_s8
15162
15163 _NEON2SSESTORAGE uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
15164 #define vmvnq_u16 vmvnq_s16
15165
15166 _NEON2SSESTORAGE uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
15167 #define vmvnq_u32 vmvnq_s32
15168
15169 _NEON2SSESTORAGE poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
15170 #define vmvnq_p8 vmvnq_u8
15171
15172 //****************** Bitwise and ***********************
15173 //******************************************************
15174 _NEON2SSESTORAGE int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
_NEON2SSE_INLINE int8x8_t vand_s8(int8x8_t a, int8x8_t b)
15176 {
15177 int8x8_t res64;
15178 return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
15179 }
15180
15181 _NEON2SSESTORAGE int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
_NEON2SSE_INLINE int16x4_t vand_s16(int16x4_t a, int16x4_t b)
15183 {
15184 int16x4_t res64;
15185 return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
15186 }
15187
15188 _NEON2SSESTORAGE int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
_NEON2SSE_INLINE int32x2_t vand_s32(int32x2_t a, int32x2_t b)
15190 {
15191 int32x2_t res64;
15192 return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
15193 }
15194
15195
15196 _NEON2SSESTORAGE int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
_NEON2SSE_INLINE int64x1_t vand_s64(int64x1_t a, int64x1_t b)
15198 {
15199 int64x1_t res;
15200 res.m64_i64[0] = a.m64_i64[0] & b.m64_i64[0];
15201 return res;
15202 }
15203
15204 _NEON2SSESTORAGE uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
15205 #define vand_u8 vand_s8
15206
15207 _NEON2SSESTORAGE uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
15208 #define vand_u16 vand_s16
15209
15210 _NEON2SSESTORAGE uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
15211 #define vand_u32 vand_s32
15212
15213 _NEON2SSESTORAGE uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
15214 #define vand_u64 vand_s64
15215
15216
15217 _NEON2SSESTORAGE int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
15218 #define vandq_s8 _mm_and_si128
15219
15220 _NEON2SSESTORAGE int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
15221 #define vandq_s16 _mm_and_si128
15222
15223 _NEON2SSESTORAGE int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
15224 #define vandq_s32 _mm_and_si128
15225
15226 _NEON2SSESTORAGE int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
15227 #define vandq_s64 _mm_and_si128
15228
15229 _NEON2SSESTORAGE uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
15230 #define vandq_u8 _mm_and_si128
15231
15232 _NEON2SSESTORAGE uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
15233 #define vandq_u16 _mm_and_si128
15234
15235 _NEON2SSESTORAGE uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
15236 #define vandq_u32 _mm_and_si128
15237
15238 _NEON2SSESTORAGE uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
15239 #define vandq_u64 _mm_and_si128
15240
15241 //******************** Bitwise or *********************************
15242 //******************************************************************
15243 _NEON2SSESTORAGE int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
_NEON2SSE_INLINE int8x8_t vorr_s8(int8x8_t a, int8x8_t b)
15245 {
15246 int8x8_t res64;
15247 return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
15248 }
15249
15250
15251 _NEON2SSESTORAGE int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
_NEON2SSE_INLINE int16x4_t vorr_s16(int16x4_t a, int16x4_t b)
15253 {
15254 int16x4_t res64;
15255 return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
15256 }
15257
15258
15259 _NEON2SSESTORAGE int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
_NEON2SSE_INLINE int32x2_t vorr_s32(int32x2_t a, int32x2_t b)
15261 {
15262 int32x2_t res64;
15263 return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
15264 }
15265
15266
15267 _NEON2SSESTORAGE int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
_NEON2SSE_INLINE int64x1_t vorr_s64(int64x1_t a, int64x1_t b)
15269 {
15270 int64x1_t res;
15271 res.m64_i64[0] = a.m64_i64[0] | b.m64_i64[0];
15272 return res;
15273 }
15274
15275 _NEON2SSESTORAGE uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
15276 #define vorr_u8 vorr_s8
15277
15278 _NEON2SSESTORAGE uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
15279 #define vorr_u16 vorr_s16
15280
15281 _NEON2SSESTORAGE uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
15282 #define vorr_u32 vorr_s32
15283
15284 _NEON2SSESTORAGE uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
15285 #define vorr_u64 vorr_s64
15286
15287 _NEON2SSESTORAGE int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
15288 #define vorrq_s8 _mm_or_si128
15289
15290 _NEON2SSESTORAGE int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
15291 #define vorrq_s16 _mm_or_si128
15292
15293 _NEON2SSESTORAGE int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
15294 #define vorrq_s32 _mm_or_si128
15295
15296 _NEON2SSESTORAGE int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
15297 #define vorrq_s64 _mm_or_si128
15298
15299 _NEON2SSESTORAGE uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
15300 #define vorrq_u8 _mm_or_si128
15301
15302 _NEON2SSESTORAGE uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
15303 #define vorrq_u16 _mm_or_si128
15304
15305 _NEON2SSESTORAGE uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
15306 #define vorrq_u32 _mm_or_si128
15307
15308 _NEON2SSESTORAGE uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
15309 #define vorrq_u64 _mm_or_si128
15310
15311 //************* Bitwise exclusive or (EOR or XOR) ******************
15312 //*******************************************************************
15313 _NEON2SSESTORAGE int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
_NEON2SSE_INLINE int8x8_t veor_s8(int8x8_t a, int8x8_t b)
15315 {
15316 int8x8_t res64;
15317 return64(_mm_xor_si128(_pM128i(a),_pM128i(b)));
15318 }
15319
15320 _NEON2SSESTORAGE int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
15321 #define veor_s16 veor_s8
15322
15323 _NEON2SSESTORAGE int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
15324 #define veor_s32 veor_s8
15325
15326 _NEON2SSESTORAGE int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
_NEON2SSE_INLINE int64x1_t veor_s64(int64x1_t a, int64x1_t b)
15328 {
15329 int64x1_t res;
15330 res.m64_i64[0] = a.m64_i64[0] ^ b.m64_i64[0];
15331 return res;
15332 }
15333
15334 _NEON2SSESTORAGE uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
15335 #define veor_u8 veor_s8
15336
15337 _NEON2SSESTORAGE uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
15338 #define veor_u16 veor_s16
15339
15340 _NEON2SSESTORAGE uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
15341 #define veor_u32 veor_s32
15342
15343 _NEON2SSESTORAGE uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
15344 #define veor_u64 veor_s64
15345
15346 _NEON2SSESTORAGE int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
15347 #define veorq_s8 _mm_xor_si128
15348
15349 _NEON2SSESTORAGE int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
15350 #define veorq_s16 _mm_xor_si128
15351
15352 _NEON2SSESTORAGE int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
15353 #define veorq_s32 _mm_xor_si128
15354
15355 _NEON2SSESTORAGE int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
15356 #define veorq_s64 _mm_xor_si128
15357
15358 _NEON2SSESTORAGE uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
15359 #define veorq_u8 _mm_xor_si128
15360
15361 _NEON2SSESTORAGE uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
15362 #define veorq_u16 _mm_xor_si128
15363
15364 _NEON2SSESTORAGE uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
15365 #define veorq_u32 _mm_xor_si128
15366
15367 _NEON2SSESTORAGE uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
15368 #define veorq_u64 _mm_xor_si128
15369
15370 //********************** Bit Clear **********************************
15371 //*******************************************************************
15372 //Logical AND complement (AND negation or AND NOT)
15373 _NEON2SSESTORAGE int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
_NEON2SSE_INLINE int8x8_t vbic_s8(int8x8_t a, int8x8_t b)
15375 {
15376 int8x8_t res64;
15377 return64(_mm_andnot_si128(_pM128i(b),_pM128i(a))); //notice the arguments "swap"
15378 }
15379
15380 _NEON2SSESTORAGE int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
15381 #define vbic_s16 vbic_s8
15382
15383 _NEON2SSESTORAGE int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
15384 #define vbic_s32 vbic_s8
15385
15386 _NEON2SSESTORAGE int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
_NEON2SSE_INLINE int64x1_t vbic_s64(int64x1_t a, int64x1_t b)
15388 {
15389 int64x1_t res;
15390 res.m64_i64[0] = a.m64_i64[0] & (~b.m64_i64[0]);
15391 return res;
15392 }
15393
15394 _NEON2SSESTORAGE uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
15395 #define vbic_u8 vbic_s8
15396
15397 _NEON2SSESTORAGE uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
15398 #define vbic_u16 vbic_s16
15399
15400 _NEON2SSESTORAGE uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
15401 #define vbic_u32 vbic_s32
15402
15403 _NEON2SSESTORAGE uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
15404 #define vbic_u64 vbic_s64
15405
15406 _NEON2SSESTORAGE int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
15407 #define vbicq_s8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15408
15409 _NEON2SSESTORAGE int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
15410 #define vbicq_s16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15411
15412 _NEON2SSESTORAGE int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
15413 #define vbicq_s32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15414
15415 _NEON2SSESTORAGE int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
15416 #define vbicq_s64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15417
15418 _NEON2SSESTORAGE uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
15419 #define vbicq_u8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15420
15421 _NEON2SSESTORAGE uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
15422 #define vbicq_u16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15423
15424 _NEON2SSESTORAGE uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
15425 #define vbicq_u32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15426
15427 _NEON2SSESTORAGE uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
15428 #define vbicq_u64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
15429
15430 //**************** Bitwise OR complement ********************************
//*************************************************************************
//no exact IA32 match, need to implement it as follows
15433 _NEON2SSESTORAGE int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
_NEON2SSE_INLINE int8x8_t vorn_s8(int8x8_t a, int8x8_t b)
15435 {
15436 int8x8_t res64;
15437 return64(vornq_s8(_pM128i(a), _pM128i(b)));
15438 }
15439
15440
15441 _NEON2SSESTORAGE int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
_NEON2SSE_INLINE int16x4_t vorn_s16(int16x4_t a, int16x4_t b)
15443 {
15444 int16x4_t res64;
15445 return64(vornq_s16(_pM128i(a), _pM128i(b)));
15446 }
15447
15448
15449 _NEON2SSESTORAGE int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
_NEON2SSE_INLINE int32x2_t vorn_s32(int32x2_t a, int32x2_t b)
15451 {
15452 int32x2_t res64;
15453 return64(vornq_s32(_pM128i(a), _pM128i(b)));
15454 }
15455
15456
15457 _NEON2SSESTORAGE int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
_NEON2SSE_INLINE int64x1_t vorn_s64(int64x1_t a, int64x1_t b)
15459 {
15460 int64x1_t res;
15461 res.m64_i64[0] = a.m64_i64[0] | (~b.m64_i64[0]);
15462 return res;
15463 }
15464
15465 _NEON2SSESTORAGE uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
15466 #define vorn_u8 vorn_s8
15467
15468
15469 _NEON2SSESTORAGE uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
15470 #define vorn_u16 vorn_s16
15471
15472 _NEON2SSESTORAGE uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
15473 #define vorn_u32 vorn_s32
15474
15475 _NEON2SSESTORAGE uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
15476 #define vorn_u64 vorn_s64
15477
15478
15479 _NEON2SSESTORAGE int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
_NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b) // VORN q0,q0,q0
15481 {
15482 __m128i b1;
15483 b1 = vmvnq_s8( b); //bitwise not for b
15484 return _mm_or_si128 (a, b1);
15485 }
15486
15487 _NEON2SSESTORAGE int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
_NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b) // VORN q0,q0,q0
15489 {
15490 __m128i b1;
15491 b1 = vmvnq_s16( b); //bitwise not for b
15492 return _mm_or_si128 (a, b1);
15493 }
15494
15495 _NEON2SSESTORAGE int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
_NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b) // VORN q0,q0,q0
15497 {
15498 __m128i b1;
15499 b1 = vmvnq_s32( b); //bitwise not for b
15500 return _mm_or_si128 (a, b1);
15501 }
15502
15503 _NEON2SSESTORAGE int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
_NEON2SSE_INLINE int64x2_t vornq_s64(int64x2_t a, int64x2_t b)
15505 {
15506 __m128i c1, b1;
15507 c1 = _mm_cmpeq_epi8 (a, a); //all ones 0xfffffff...fffff
15508 b1 = _mm_andnot_si128 (b, c1);
15509 return _mm_or_si128 (a, b1);
15510 }
15511
15512 _NEON2SSESTORAGE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
_NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b) // VORN q0,q0,q0
15514 {
15515 __m128i b1;
15516 b1 = vmvnq_u8( b); //bitwise not for b
15517 return _mm_or_si128 (a, b1);
15518 }
15519
15520 _NEON2SSESTORAGE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
_NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b) // VORN q0,q0,q0
15522 {
15523 __m128i b1;
15524 b1 = vmvnq_s16( b); //bitwise not for b
15525 return _mm_or_si128 (a, b1);
15526 }
15527
15528 _NEON2SSESTORAGE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b) // VORN q0,q0,q0
15530 {
15531 __m128i b1;
15532 b1 = vmvnq_u32( b); //bitwise not for b
15533 return _mm_or_si128 (a, b1);
15534 }
15535 _NEON2SSESTORAGE uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
15536 #define vornq_u64 vornq_s64
15537
15538 //********************* Bitwise Select *****************************
15539 //******************************************************************
//Note: this intrinsic can compile to any of VBSL/VBIF/VBIT depending on register allocation.
15541
15542 //VBSL (Bitwise Select) selects each bit for the destination from the first operand if the
15543 //corresponding bit of the destination is 1, or from the second operand if the corresponding bit of the destination is 0.
15544
15545 //VBIF (Bitwise Insert if False) inserts each bit from the first operand into the destination
15546 //if the corresponding bit of the second operand is 0, otherwise leaves the destination bit unchanged
15547
15548 //VBIT (Bitwise Insert if True) inserts each bit from the first operand into the destination
15549 //if the corresponding bit of the second operand is 1, otherwise leaves the destination bit unchanged.
15550
//Only VBSL is implemented for SIMD
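//The SSE mappings below compute the usual select formula: result = (mask & b) | (~mask & c).
//Illustrative byte values (assumed): mask = 0xf0, b = 0xab, c = 0xcd -> (0xf0 & 0xab) | (0x0f & 0xcd) = 0xa0 | 0x0d = 0xad.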
15552 _NEON2SSESTORAGE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
_NEON2SSE_INLINE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c)
15554 {
15555 int8x8_t res64;
15556 __m128i res;
15557 res = vbslq_s8(_pM128i(a), _pM128i(b), _pM128i(c));
15558 return64(res);
15559 }
15560
15561 _NEON2SSESTORAGE int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
15562 #define vbsl_s16 vbsl_s8
15563
15564 _NEON2SSESTORAGE int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
15565 #define vbsl_s32 vbsl_s8
15566
15567 _NEON2SSESTORAGE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
_NEON2SSE_INLINE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c)
15569 {
15570 int64x1_t res;
15571 res.m64_i64[0] = (a.m64_i64[0] & b.m64_i64[0]) | ( (~a.m64_i64[0]) & c.m64_i64[0]);
15572 return res;
15573 }
15574
15575 _NEON2SSESTORAGE uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
15576 #define vbsl_u8 vbsl_s8
15577
15578 _NEON2SSESTORAGE uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
15579 #define vbsl_u16 vbsl_s8
15580
15581 _NEON2SSESTORAGE uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
15582 #define vbsl_u32 vbsl_s8
15583
15584 _NEON2SSESTORAGE uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
15585 #define vbsl_u64 vbsl_s64
15586
15587 _NEON2SSESTORAGE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
_NEON2SSE_INLINE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c)
15589 {
15590 __m128 sel1, sel2;
15591 __m64_128 res64;
15592 sel1 = _mm_and_ps (_pM128(a), _pM128(b));
15593 sel2 = _mm_andnot_ps (_pM128(a), _pM128(c));
15594 sel1 = _mm_or_ps (sel1, sel2);
15595 _M64f(res64, sel1);
15596 return res64;
15597 }
15598
15599 _NEON2SSESTORAGE poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
15600 #define vbsl_p8 vbsl_s8
15601
15602 _NEON2SSESTORAGE poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
15603 #define vbsl_p16 vbsl_s8
15604
15605 _NEON2SSESTORAGE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
_NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) // VBSL q0,q0,q0
15607 {
15608 __m128i sel1, sel2;
15609 sel1 = _mm_and_si128 (a, b);
15610 sel2 = _mm_andnot_si128 (a, c);
15611 return _mm_or_si128 (sel1, sel2);
15612 }
15613
15614 _NEON2SSESTORAGE int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
15615 #define vbslq_s16 vbslq_s8
15616
15617 _NEON2SSESTORAGE int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
15618 #define vbslq_s32 vbslq_s8
15619
15620 _NEON2SSESTORAGE int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
15621 #define vbslq_s64 vbslq_s8
15622
15623 _NEON2SSESTORAGE uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
15624 #define vbslq_u8 vbslq_s8
15625
15626 _NEON2SSESTORAGE uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
15627 #define vbslq_u16 vbslq_s8
15628
15629 _NEON2SSESTORAGE uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
15630 #define vbslq_u32 vbslq_s8
15631
15632 _NEON2SSESTORAGE uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
15633 #define vbslq_u64 vbslq_s8
15634
15635 _NEON2SSESTORAGE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
_NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) // VBSL q0,q0,q0
15637 {
15638 __m128 sel1, sel2;
15639 sel1 = _mm_and_ps (*(__m128*)&a, b);
15640 sel2 = _mm_andnot_ps (*(__m128*)&a, c);
15641 return _mm_or_ps (sel1, sel2);
15642 }
15643
15644 _NEON2SSESTORAGE poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
15645 #define vbslq_p8 vbslq_u8
15646
15647 _NEON2SSESTORAGE poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
15648 #define vbslq_p16 vbslq_s8
15649
15650 //************************************************************************************
15651 //**************** Transposition operations ****************************************
15652 //************************************************************************************
15653 //***************** Vector Transpose ************************************************
15654 //************************************************************************************
15655 //VTRN (Vector Transpose) treats the elements of its operand vectors as elements of 2 x 2 matrices, and transposes the matrices.
// making the results look like (a0, b0, a2, b2, a4, b4, ...) and (a1, b1, a3, b3, a5, b5, ...)
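//Illustrative d-register example with 16-bit lanes (values assumed): a = {a0,a1,a2,a3}, b = {b0,b1,b2,b3}
//gives val[0] = {a0,b0,a2,b2} and val[1] = {a1,b1,a3,b3}.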
15657 _NEON2SSESTORAGE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
_NEON2SSE_INLINE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b) // VTRN.8 d0,d0
15659 {
15660 int8x8x2_t val;
15661 __m128i tmp, val0;
15662 tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
15663 val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)mask8_32_even_odd); //(a0, b0, a2, b2, a4, b4, a6, b6), (a1,b1, a3,b3, a5,b5, a7,b7)
15664 vst1q_s8 (val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3, a5,b5, a7,b7),(a0, b0, a2, b2, a4, b4, a6, b6),
15665 return val;
15666 }
15667
15668 _NEON2SSESTORAGE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
_NEON2SSE_INLINE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b) // VTRN.16 d0,d0
15670 {
15671 int16x4x2_t val;
15672 __m128i tmp, val0;
15673 _NEON2SSE_ALIGN_16 static const int8_t maskdlv16[16] = {0,1, 2,3, 8,9, 10,11, 4,5, 6,7, 12,13, 14, 15};
15674 tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
15675 val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0, b0, a2, b2, a1,b1, a3, b3
15676 vst1q_s16(val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3),(a0, b0, a2, b2),
15677 return val;
15678 }
15679
15680 _NEON2SSESTORAGE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
_NEON2SSE_INLINE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b)
15682 {
15683 int32x2x2_t val;
15684 __m128i val0;
15685 val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1
15686 vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); //a1,b1, a0,b0,
15687 return val;
15688 }
15689
15690 _NEON2SSESTORAGE uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
15691 #define vtrn_u8 vtrn_s8
15692
15693 _NEON2SSESTORAGE uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
15694 #define vtrn_u16 vtrn_s16
15695
15696 _NEON2SSESTORAGE uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
15697 #define vtrn_u32 vtrn_s32
15698
15699 _NEON2SSESTORAGE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
vtrn_f32(float32x2_t a,float32x2_t b)15700 _NEON2SSE_INLINE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b)
15701 {
15702 float32x2x2_t val;
15703 val.val[0].m64_f32[0] = a.m64_f32[0];
15704 val.val[0].m64_f32[1] = b.m64_f32[0];
15705 val.val[1].m64_f32[0] = a.m64_f32[1];
15706 val.val[1].m64_f32[1] = b.m64_f32[1];
15707 return val; //a0,b0,a1,b1
15708 }
15709
15710 _NEON2SSESTORAGE poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
15711 #define vtrn_p8 vtrn_u8
15712
15713 _NEON2SSESTORAGE poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
15714 #define vtrn_p16 vtrn_s16

_NEON2SSESTORAGE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
_NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b) // VTRN.8 q0,q0
{
    int8x16x2_t r8x16;
    __m128i a_sh, b_sh;
    a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_16_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
    b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_16_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15

    r8x16.val[0] = _mm_unpacklo_epi8(a_sh, b_sh); //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14)
    r8x16.val[1] = _mm_unpackhi_epi8(a_sh, b_sh); //(a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15)
    return r8x16;
}

_NEON2SSESTORAGE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
_NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b) // VTRN.16 q0,q0
{
    int16x8x2_t v16x8;
    __m128i a_sh, b_sh;
    a_sh = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7
    b_sh = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7
    v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh); //a0, b0, a2, b2, a4, b4, a6, b6
    v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh); //a1, b1, a3, b3, a5, b5, a7, b7
    return v16x8;
}

_NEON2SSESTORAGE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
_NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b) // VTRN.32 q0,q0
{
    //may be not an optimal solution compared with the serial one
    int32x4x2_t v32x4;
    __m128i a_sh, b_sh;
    a_sh = _mm_shuffle_epi32 (a, 216); //216 = _MM_SHUFFLE(3,1,2,0), i.e. a0, a2, a1, a3
    b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3

    v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh); //a0, b0, a2, b2
    v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh); //a1, b1, a3, b3
    return v32x4;
}

_NEON2SSESTORAGE uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
#define vtrnq_u8 vtrnq_s8

_NEON2SSESTORAGE uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
#define vtrnq_u16 vtrnq_s16

_NEON2SSESTORAGE uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
#define vtrnq_u32 vtrnq_s32

_NEON2SSESTORAGE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
_NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b) // VTRN.32 q0,q0
{
    //may be not an optimal solution compared with the serial one
    float32x4x2_t f32x4;
    __m128 a_sh, b_sh;
    a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1, 2, 0)); //a0, a2, a1, a3, need to check endianness
    b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1, 2, 0)); //b0, b2, b1, b3, need to check endianness

    f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh); //a0, b0, a2, b2
    f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh); //a1, b1, a3, b3
    return f32x4;
}

_NEON2SSESTORAGE poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
#define vtrnq_p8 vtrnq_s8

_NEON2SSESTORAGE poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
#define vtrnq_p16 vtrnq_s16
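
//Illustrative only: transposing 2 x 2 blocks of two integer rows with vtrnq_s32. The helper name
//and the NEON2SSE_USAGE_EXAMPLES guard are hypothetical, not part of the NEON API.
#ifdef NEON2SSE_USAGE_EXAMPLES
_NEON2SSE_INLINE void neon2sse_example_trn_rows(const int32_t* row0, const int32_t* row1, int32_t* out0, int32_t* out1)
{
    int32x4_t r0 = vld1q_s32(row0); //(r00, r01, r02, r03)
    int32x4_t r1 = vld1q_s32(row1); //(r10, r11, r12, r13)
    int32x4x2_t t = vtrnq_s32(r0, r1);
    vst1q_s32(out0, t.val[0]); //(r00, r10, r02, r12)
    vst1q_s32(out1, t.val[1]); //(r01, r11, r03, r13)
}
#endif //NEON2SSE_USAGE_EXAMPLES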

//***************** Interleave elements ***************************
//*****************************************************************
//output has (a0,b0,a1,b1, a2,b2,.....)
_NEON2SSESTORAGE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
_NEON2SSE_INLINE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b) // VZIP.8 d0,d0
{
    int8x8x2_t val;
    __m128i val0;
    val0 = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b));
    vst1q_s8(val.val, val0); //_mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
    return val;
}

_NEON2SSESTORAGE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
_NEON2SSE_INLINE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b) // VZIP.16 d0,d0
{
    int16x4x2_t val;
    __m128i val0;
    val0 = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b));
    vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
    return val;
}

_NEON2SSESTORAGE int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
#define vzip_s32 vtrn_s32

_NEON2SSESTORAGE uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
#define vzip_u8 vzip_s8

_NEON2SSESTORAGE uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
#define vzip_u16 vzip_s16

_NEON2SSESTORAGE uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
#define vzip_u32 vzip_s32

_NEON2SSESTORAGE float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
#define vzip_f32 vtrn_f32

_NEON2SSESTORAGE poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
#define vzip_p8 vzip_u8

_NEON2SSESTORAGE poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
#define vzip_p16 vzip_u16

_NEON2SSESTORAGE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
_NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b) // VZIP.8 q0,q0
{
    int8x16x2_t r8x16;
    r8x16.val[0] = _mm_unpacklo_epi8(a, b);
    r8x16.val[1] = _mm_unpackhi_epi8(a, b);
    return r8x16;
}

_NEON2SSESTORAGE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
_NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b) // VZIP.16 q0,q0
{
    int16x8x2_t r16x8;
    r16x8.val[0] = _mm_unpacklo_epi16(a, b);
    r16x8.val[1] = _mm_unpackhi_epi16(a, b);
    return r16x8;
}

_NEON2SSESTORAGE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
_NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b) // VZIP.32 q0,q0
{
    int32x4x2_t r32x4;
    r32x4.val[0] = _mm_unpacklo_epi32(a, b);
    r32x4.val[1] = _mm_unpackhi_epi32(a, b);
    return r32x4;
}

_NEON2SSESTORAGE uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
#define vzipq_u8 vzipq_s8

_NEON2SSESTORAGE uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
#define vzipq_u16 vzipq_s16

_NEON2SSESTORAGE uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
#define vzipq_u32 vzipq_s32

_NEON2SSESTORAGE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
_NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b) // VZIP.32 q0,q0
{
    float32x4x2_t f32x4;
    f32x4.val[0] = _mm_unpacklo_ps ( a, b);
    f32x4.val[1] = _mm_unpackhi_ps ( a, b);
    return f32x4;
}

_NEON2SSESTORAGE poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
#define vzipq_p8 vzipq_u8

_NEON2SSESTORAGE poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
#define vzipq_p16 vzipq_u16
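
//Illustrative only: interleaving two separate 8-bit planes (e.g. two image channels) into a single
//(c0[0], c1[0], c0[1], c1[1], ...) buffer with vzipq_u8. The helper name and the
//NEON2SSE_USAGE_EXAMPLES guard are hypothetical, not part of the NEON API.
#ifdef NEON2SSE_USAGE_EXAMPLES
_NEON2SSE_INLINE void neon2sse_example_interleave2(const uint8_t* c0, const uint8_t* c1, uint8_t* dst)
{
    uint8x16_t v0 = vld1q_u8(c0); //16 samples of channel 0
    uint8x16_t v1 = vld1q_u8(c1); //16 samples of channel 1
    uint8x16x2_t z = vzipq_u8(v0, v1);
    vst1q_u8(dst, z.val[0]); //c0[0], c1[0], ..., c0[7], c1[7]
    vst1q_u8(dst + 16, z.val[1]); //c0[8], c1[8], ..., c0[15], c1[15]
}
#endif //NEON2SSE_USAGE_EXAMPLES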

//*********************** De-Interleave elements *************************
//*************************************************************************
//As a result of these functions the first val contains (a0,a2,a4,....,b0,b2,b4,...) and the second val contains (a1,a3,a5,....,b1,b3,b5,...)
//no such functions in IA32 SIMD, shuffle is required
_NEON2SSESTORAGE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
_NEON2SSE_INLINE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b) // VUZP.8 d0,d0
{
    int8x8x2_t val;
    __m128i tmp, val0;
    _NEON2SSE_ALIGN_16 static const int8_t maskdlv8[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11,15};
    tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
    val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv8); //(a0, a2, a4, a6, b0, b2, b4, b6), (a1, a3, a5, a7, b1,b3, b5, b7)
    vst1q_s8(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
    return val;
}

_NEON2SSESTORAGE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
_NEON2SSE_INLINE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b) // VUZP.16 d0,d0
{
    int16x4x2_t val;
    __m128i tmp, val0;
    _NEON2SSE_ALIGN_16 static const int8_t maskdlv16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15};
    tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
    val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0,a2, b0, b2, a1,a3, b1,b3
    vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
    return val;
}

_NEON2SSESTORAGE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
_NEON2SSE_INLINE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b) // VUZP.32 d0,d0
{
    int32x2x2_t val;
    __m128i val0;
    val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0, a1,b1
    vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
    return val;
}

_NEON2SSESTORAGE uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
#define vuzp_u8 vuzp_s8

_NEON2SSESTORAGE uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
#define vuzp_u16 vuzp_s16

_NEON2SSESTORAGE uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
#define vuzp_u32 vuzp_s32

_NEON2SSESTORAGE float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
#define vuzp_f32 vzip_f32

_NEON2SSESTORAGE poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
#define vuzp_p8 vuzp_u8

_NEON2SSESTORAGE poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
#define vuzp_p16 vuzp_u16

_NEON2SSESTORAGE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
_NEON2SSE_INLINE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b) // VUZP.8 q0,q0
{
    int8x16x2_t v8x16;
    __m128i a_sh, b_sh;
    a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_16_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
    b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_16_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
    //we need unpack64 to combine lower (upper) 64 bits from a with lower (upper) 64 bits from b
    v8x16.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, a4, a6, a8, a10, a12, a14, b0, b2, b4, b6, b8, b10, b12, b14
    v8x16.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, a9, a11, a13, a15, b1, b3, b5, b7, b9, b11, b13, b15
    return v8x16;
}

_NEON2SSESTORAGE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
_NEON2SSE_INLINE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b) // VUZP.16 q0,q0
{
    int16x8x2_t v16x8;
    __m128i a_sh, b_sh;
    a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7
    b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_32_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7
    v16x8.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, a4, a6, b0, b2, b4, b6
    v16x8.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, b1, b3, b5, b7
    return v16x8;
}

_NEON2SSESTORAGE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
_NEON2SSE_INLINE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b) // VUZP.32 q0,q0
{
    //may be not an optimal solution compared with the serial one
    int32x4x2_t v32x4;
    __m128i a_sh, b_sh;
    a_sh = _mm_shuffle_epi32 (a, 216); //216 = _MM_SHUFFLE(3,1,2,0), i.e. a0, a2, a1, a3
    b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3

    v32x4.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, b0, b2
    v32x4.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, b1, b3
    return v32x4;
}

_NEON2SSESTORAGE uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
#define vuzpq_u8 vuzpq_s8

_NEON2SSESTORAGE uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
#define vuzpq_u16 vuzpq_s16

_NEON2SSESTORAGE uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
#define vuzpq_u32 vuzpq_s32

_NEON2SSESTORAGE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
_NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b) // VUZP.32 q0,q0
{
    float32x4x2_t v32x4;
    v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0)); //a0, a2, b0, b2, need to check endianness however
    v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1)); //a1, a3, b1, b3, need to check endianness however
    return v32x4;
}

_NEON2SSESTORAGE poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
#define vuzpq_p8 vuzpq_u8

_NEON2SSESTORAGE poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
#define vuzpq_p16 vuzpq_u16
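
//Illustrative only: splitting an interleaved (re0, im0, re1, im1, ...) stream into separate real and
//imaginary planes with vuzpq_s32, i.e. the inverse of the interleave example above. The helper name
//and the NEON2SSE_USAGE_EXAMPLES guard are hypothetical, not part of the NEON API.
#ifdef NEON2SSE_USAGE_EXAMPLES
_NEON2SSE_INLINE void neon2sse_example_deinterleave_complex(const int32_t* src, int32_t* re, int32_t* im)
{
    int32x4_t v0 = vld1q_s32(src); //re0, im0, re1, im1
    int32x4_t v1 = vld1q_s32(src + 4); //re2, im2, re3, im3
    int32x4x2_t d = vuzpq_s32(v0, v1);
    vst1q_s32(re, d.val[0]); //re0, re1, re2, re3
    vst1q_s32(im, d.val[1]); //im0, im1, im2, im3
}
#endif //NEON2SSE_USAGE_EXAMPLES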

//##############################################################################################
//*********************** Reinterpret cast intrinsics.******************************************
//##############################################################################################
// Not a part of the official NEON instruction set but available in the gcc compiler *********************
_NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u32 (uint32x2_t t);
#define vreinterpret_p8_u32

_NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u16 (uint16x4_t t);
#define vreinterpret_p8_u16

_NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u8 (uint8x8_t t);
#define vreinterpret_p8_u8

_NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s32 (int32x2_t t);
#define vreinterpret_p8_s32

_NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s16 (int16x4_t t);
#define vreinterpret_p8_s16

_NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s8 (int8x8_t t);
#define vreinterpret_p8_s8

_NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u64 (uint64x1_t t);
#define vreinterpret_p8_u64

_NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s64 (int64x1_t t);
#define vreinterpret_p8_s64

_NEON2SSESTORAGE poly8x8_t vreinterpret_p8_f32 (float32x2_t t);
#define vreinterpret_p8_f32

_NEON2SSESTORAGE poly8x8_t vreinterpret_p8_p16 (poly16x4_t t);
#define vreinterpret_p8_p16

_NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u32 (uint32x4_t t);
#define vreinterpretq_p8_u32

_NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u16 (uint16x8_t t);
#define vreinterpretq_p8_u16

_NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u8 (uint8x16_t t);
#define vreinterpretq_p8_u8

_NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s32 (int32x4_t t);
#define vreinterpretq_p8_s32

_NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s16 (int16x8_t t);
#define vreinterpretq_p8_s16

_NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s8 (int8x16_t t);
#define vreinterpretq_p8_s8

_NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u64 (uint64x2_t t);
#define vreinterpretq_p8_u64

_NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s64 (int64x2_t t);
#define vreinterpretq_p8_s64

_NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_f32 (float32x4_t t);
#define vreinterpretq_p8_f32(t) _M128i(t)

_NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_p16 (poly16x8_t t);
#define vreinterpretq_p8_p16

_NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u32 (uint32x2_t t);
#define vreinterpret_p16_u32

_NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u16 (uint16x4_t t);
#define vreinterpret_p16_u16

_NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u8 (uint8x8_t t);
#define vreinterpret_p16_u8

_NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s32 (int32x2_t t);
#define vreinterpret_p16_s32

_NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s16 (int16x4_t t);
#define vreinterpret_p16_s16

_NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s8 (int8x8_t t);
#define vreinterpret_p16_s8

_NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u64 (uint64x1_t t);
#define vreinterpret_p16_u64

_NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s64 (int64x1_t t);
#define vreinterpret_p16_s64

_NEON2SSESTORAGE poly16x4_t vreinterpret_p16_f32 (float32x2_t t);
#define vreinterpret_p16_f32

_NEON2SSESTORAGE poly16x4_t vreinterpret_p16_p8 (poly8x8_t t);
#define vreinterpret_p16_p8

_NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_u32 (uint32x4_t t);
#define vreinterpretq_p16_u32

_NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_u16 (uint16x8_t t);
#define vreinterpretq_p16_u16

_NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_s32 (int32x4_t t);
#define vreinterpretq_p16_s32

_NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_s16 (int16x8_t t);
#define vreinterpretq_p16_s16

_NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_s8 (int8x16_t t);
#define vreinterpretq_p16_s8

_NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_u64 (uint64x2_t t);
#define vreinterpretq_p16_u64

_NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_s64 (int64x2_t t);
#define vreinterpretq_p16_s64

_NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_f32 (float32x4_t t);
#define vreinterpretq_p16_f32(t) _M128i(t)

_NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_p8 (poly8x16_t t);
#define vreinterpretq_p16_p8 vreinterpretq_s16_p8

//**** Integer to float ******
_NEON2SSESTORAGE float32x2_t vreinterpret_f32_u32 (uint32x2_t t);
_NEON2SSE_INLINE float32x2_t vreinterpret_f32_u32 (uint32x2_t t)
{
    return (*(__m64_128*)&(t));
}

_NEON2SSESTORAGE float32x2_t vreinterpret_f32_u16 (uint16x4_t t);
#define vreinterpret_f32_u16 vreinterpret_f32_u32

_NEON2SSESTORAGE float32x2_t vreinterpret_f32_u8 (uint8x8_t t);
#define vreinterpret_f32_u8 vreinterpret_f32_u32

_NEON2SSESTORAGE float32x2_t vreinterpret_f32_s32 (int32x2_t t);
#define vreinterpret_f32_s32 vreinterpret_f32_u32

_NEON2SSESTORAGE float32x2_t vreinterpret_f32_s16 (int16x4_t t);
#define vreinterpret_f32_s16 vreinterpret_f32_u32

_NEON2SSESTORAGE float32x2_t vreinterpret_f32_s8 (int8x8_t t);
#define vreinterpret_f32_s8 vreinterpret_f32_u32

_NEON2SSESTORAGE float32x2_t vreinterpret_f32_u64(uint64x1_t t);
#define vreinterpret_f32_u64 vreinterpret_f32_u32

_NEON2SSESTORAGE float32x2_t vreinterpret_f32_s64 (int64x1_t t);
#define vreinterpret_f32_s64 vreinterpret_f32_u32

_NEON2SSESTORAGE float32x2_t vreinterpret_f32_p16 (poly16x4_t t);
#define vreinterpret_f32_p16 vreinterpret_f32_u32

_NEON2SSESTORAGE float32x2_t vreinterpret_f32_p8 (poly8x8_t t);
#define vreinterpret_f32_p8 vreinterpret_f32_u32

_NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u32 (uint32x4_t t);
#define vreinterpretq_f32_u32(t) _M128(t)

_NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u16 (uint16x8_t t);
#define vreinterpretq_f32_u16 vreinterpretq_f32_u32

_NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u8 (uint8x16_t t);
#define vreinterpretq_f32_u8 vreinterpretq_f32_u32

_NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s32 (int32x4_t t);
#define vreinterpretq_f32_s32 vreinterpretq_f32_u32

_NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s16 (int16x8_t t);
#define vreinterpretq_f32_s16 vreinterpretq_f32_u32

_NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s8 (int8x16_t t);
#define vreinterpretq_f32_s8 vreinterpretq_f32_u32

_NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u64 (uint64x2_t t);
#define vreinterpretq_f32_u64 vreinterpretq_f32_u32

_NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s64 (int64x2_t t);
#define vreinterpretq_f32_s64 vreinterpretq_f32_u32

_NEON2SSESTORAGE float32x4_t vreinterpretq_f32_p16 (poly16x8_t t);
#define vreinterpretq_f32_p16 vreinterpretq_f32_u32

_NEON2SSESTORAGE float32x4_t vreinterpretq_f32_p8 (poly8x16_t t);
#define vreinterpretq_f32_p8 vreinterpretq_f32_u32

//*** Integer type conversions ******************
//no conversion is necessary for the following functions because the underlying data representation is the same
_NEON2SSESTORAGE int64x1_t vreinterpret_s64_u32 (uint32x2_t t);
#define vreinterpret_s64_u32

_NEON2SSESTORAGE int64x1_t vreinterpret_s64_u16 (uint16x4_t t);
#define vreinterpret_s64_u16

_NEON2SSESTORAGE int64x1_t vreinterpret_s64_u8 (uint8x8_t t);
#define vreinterpret_s64_u8

_NEON2SSESTORAGE int64x1_t vreinterpret_s64_s32 (int32x2_t t);
#define vreinterpret_s64_s32

_NEON2SSESTORAGE int64x1_t vreinterpret_s64_s16 (int16x4_t t);
#define vreinterpret_s64_s16

_NEON2SSESTORAGE int64x1_t vreinterpret_s64_s8 (int8x8_t t);
#define vreinterpret_s64_s8

_NEON2SSESTORAGE int64x1_t vreinterpret_s64_u64 (uint64x1_t t);
#define vreinterpret_s64_u64

_NEON2SSESTORAGE int64x1_t vreinterpret_s64_f32 (float32x2_t t);
#define vreinterpret_s64_f32

_NEON2SSESTORAGE int64x1_t vreinterpret_s64_p16 (poly16x4_t t);
#define vreinterpret_s64_p16

_NEON2SSESTORAGE int64x1_t vreinterpret_s64_p8 (poly8x8_t t);
#define vreinterpret_s64_p8

_NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u32 (uint32x4_t t);
#define vreinterpretq_s64_u32

_NEON2SSESTORAGE int64x2_t vreinterpretq_s64_s16 (int16x8_t t);
#define vreinterpretq_s64_s16

_NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u8 (uint8x16_t t);
#define vreinterpretq_s64_u8

_NEON2SSESTORAGE int64x2_t vreinterpretq_s64_s32 (int32x4_t t);
#define vreinterpretq_s64_s32

_NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u16 (uint16x8_t t);
#define vreinterpretq_s64_u16

_NEON2SSESTORAGE int64x2_t vreinterpretq_s64_s8 (int8x16_t t);
#define vreinterpretq_s64_s8

_NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u64 (uint64x2_t t);
#define vreinterpretq_s64_u64

_NEON2SSESTORAGE int64x2_t vreinterpretq_s64_f32 (float32x4_t t);
#define vreinterpretq_s64_f32(t) _M128i(t)

_NEON2SSESTORAGE int64x2_t vreinterpretq_s64_p16 (poly16x8_t t);
#define vreinterpretq_s64_p16

_NEON2SSESTORAGE int64x2_t vreinterpretq_s64_p8 (poly8x16_t t);
#define vreinterpretq_s64_p8

_NEON2SSESTORAGE uint64x1_t vreinterpret_u64_u32 (uint32x2_t t);
#define vreinterpret_u64_u32

_NEON2SSESTORAGE uint64x1_t vreinterpret_u64_u16 (uint16x4_t t);
#define vreinterpret_u64_u16

_NEON2SSESTORAGE uint64x1_t vreinterpret_u64_u8 (uint8x8_t t);
#define vreinterpret_u64_u8

_NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s32 (int32x2_t t);
#define vreinterpret_u64_s32

_NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s16 (int16x4_t t);
#define vreinterpret_u64_s16

_NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s8 (int8x8_t t);
#define vreinterpret_u64_s8

_NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s64 (int64x1_t t);
#define vreinterpret_u64_s64

_NEON2SSESTORAGE uint64x1_t vreinterpret_u64_f32 (float32x2_t t);
#define vreinterpret_u64_f32

_NEON2SSESTORAGE uint64x1_t vreinterpret_u64_p16 (poly16x4_t t);
#define vreinterpret_u64_p16

_NEON2SSESTORAGE uint64x1_t vreinterpret_u64_p8 (poly8x8_t t);
#define vreinterpret_u64_p8

_NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_u32 (uint32x4_t t);
#define vreinterpretq_u64_u32

_NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_u16 (uint16x8_t t);
#define vreinterpretq_u64_u16

_NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_u8 (uint8x16_t t);
#define vreinterpretq_u64_u8

_NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s32 (int32x4_t t);
#define vreinterpretq_u64_s32

_NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s16 (int16x8_t t);
#define vreinterpretq_u64_s16

_NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s8 (int8x16_t t);
#define vreinterpretq_u64_s8

_NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s64 (int64x2_t t);
#define vreinterpretq_u64_s64

_NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_f32 (float32x4_t t);
#define vreinterpretq_u64_f32(t) _M128i(t)

_NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_p16 (poly16x8_t t);
#define vreinterpretq_u64_p16

_NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_p8 (poly8x16_t t);
#define vreinterpretq_u64_p8

_NEON2SSESTORAGE int8x8_t vreinterpret_s8_u32 (uint32x2_t t);
#define vreinterpret_s8_u32

_NEON2SSESTORAGE int8x8_t vreinterpret_s8_u16 (uint16x4_t t);
#define vreinterpret_s8_u16

_NEON2SSESTORAGE int8x8_t vreinterpret_s8_u8 (uint8x8_t t);
#define vreinterpret_s8_u8

_NEON2SSESTORAGE int8x8_t vreinterpret_s8_s32 (int32x2_t t);
#define vreinterpret_s8_s32

_NEON2SSESTORAGE int8x8_t vreinterpret_s8_s16 (int16x4_t t);
#define vreinterpret_s8_s16

_NEON2SSESTORAGE int8x8_t vreinterpret_s8_u64 (uint64x1_t t);
#define vreinterpret_s8_u64

_NEON2SSESTORAGE int8x8_t vreinterpret_s8_s64 (int64x1_t t);
#define vreinterpret_s8_s64

_NEON2SSESTORAGE int8x8_t vreinterpret_s8_f32 (float32x2_t t);
#define vreinterpret_s8_f32

_NEON2SSESTORAGE int8x8_t vreinterpret_s8_p16 (poly16x4_t t);
#define vreinterpret_s8_p16

_NEON2SSESTORAGE int8x8_t vreinterpret_s8_p8 (poly8x8_t t);
#define vreinterpret_s8_p8

_NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u32 (uint32x4_t t);
#define vreinterpretq_s8_u32

_NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u16 (uint16x8_t t);
#define vreinterpretq_s8_u16

_NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u8 (uint8x16_t t);
#define vreinterpretq_s8_u8

_NEON2SSESTORAGE int8x16_t vreinterpretq_s8_s32 (int32x4_t t);
#define vreinterpretq_s8_s32

_NEON2SSESTORAGE int8x16_t vreinterpretq_s8_s16 (int16x8_t t);
#define vreinterpretq_s8_s16

_NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u64 (uint64x2_t t);
#define vreinterpretq_s8_u64

_NEON2SSESTORAGE int8x16_t vreinterpretq_s8_s64 (int64x2_t t);
#define vreinterpretq_s8_s64

_NEON2SSESTORAGE int8x16_t vreinterpretq_s8_f32 (float32x4_t t);
#define vreinterpretq_s8_f32(t) _M128i(t)

_NEON2SSESTORAGE int8x16_t vreinterpretq_s8_p16 (poly16x8_t t);
#define vreinterpretq_s8_p16

_NEON2SSESTORAGE int8x16_t vreinterpretq_s8_p8 (poly8x16_t t);
#define vreinterpretq_s8_p8

_NEON2SSESTORAGE int16x4_t vreinterpret_s16_u32 (uint32x2_t t);
#define vreinterpret_s16_u32

_NEON2SSESTORAGE int16x4_t vreinterpret_s16_u16 (uint16x4_t t);
#define vreinterpret_s16_u16

_NEON2SSESTORAGE int16x4_t vreinterpret_s16_u8 (uint8x8_t t);
#define vreinterpret_s16_u8

_NEON2SSESTORAGE int16x4_t vreinterpret_s16_s32 (int32x2_t t);
#define vreinterpret_s16_s32

_NEON2SSESTORAGE int16x4_t vreinterpret_s16_s8 (int8x8_t t);
#define vreinterpret_s16_s8

_NEON2SSESTORAGE int16x4_t vreinterpret_s16_u64 (uint64x1_t t);
#define vreinterpret_s16_u64

_NEON2SSESTORAGE int16x4_t vreinterpret_s16_s64 (int64x1_t t);
#define vreinterpret_s16_s64

_NEON2SSESTORAGE int16x4_t vreinterpret_s16_f32 (float32x2_t t);
#define vreinterpret_s16_f32

_NEON2SSESTORAGE int16x4_t vreinterpret_s16_p16 (poly16x4_t t);
#define vreinterpret_s16_p16

_NEON2SSESTORAGE int16x4_t vreinterpret_s16_p8 (poly8x8_t t);
#define vreinterpret_s16_p8

_NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u32 (uint32x4_t t);
#define vreinterpretq_s16_u32

_NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u16 (uint16x8_t t);
#define vreinterpretq_s16_u16

_NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u8 (uint8x16_t t);
#define vreinterpretq_s16_u8

_NEON2SSESTORAGE int16x8_t vreinterpretq_s16_s32 (int32x4_t t);
#define vreinterpretq_s16_s32

_NEON2SSESTORAGE int16x8_t vreinterpretq_s16_s8 (int8x16_t t);
#define vreinterpretq_s16_s8

_NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u64 (uint64x2_t t);
#define vreinterpretq_s16_u64

_NEON2SSESTORAGE int16x8_t vreinterpretq_s16_s64 (int64x2_t t);
#define vreinterpretq_s16_s64

_NEON2SSESTORAGE int16x8_t vreinterpretq_s16_f32 (float32x4_t t);
#define vreinterpretq_s16_f32(t) _M128i(t)

_NEON2SSESTORAGE int16x8_t vreinterpretq_s16_p16 (poly16x8_t t);
#define vreinterpretq_s16_p16

_NEON2SSESTORAGE int16x8_t vreinterpretq_s16_p8 (poly8x16_t t);
#define vreinterpretq_s16_p8

_NEON2SSESTORAGE int32x2_t vreinterpret_s32_u32 (uint32x2_t t);
#define vreinterpret_s32_u32

_NEON2SSESTORAGE int32x2_t vreinterpret_s32_u16 (uint16x4_t t);
#define vreinterpret_s32_u16

_NEON2SSESTORAGE int32x2_t vreinterpret_s32_u8 (uint8x8_t t);
#define vreinterpret_s32_u8

_NEON2SSESTORAGE int32x2_t vreinterpret_s32_s16 (int16x4_t t);
#define vreinterpret_s32_s16

_NEON2SSESTORAGE int32x2_t vreinterpret_s32_s8 (int8x8_t t);
#define vreinterpret_s32_s8

_NEON2SSESTORAGE int32x2_t vreinterpret_s32_u64 (uint64x1_t t);
#define vreinterpret_s32_u64

_NEON2SSESTORAGE int32x2_t vreinterpret_s32_s64 (int64x1_t t);
#define vreinterpret_s32_s64

_NEON2SSESTORAGE int32x2_t vreinterpret_s32_f32 (float32x2_t t);
#define vreinterpret_s32_f32

_NEON2SSESTORAGE int32x2_t vreinterpret_s32_p16 (poly16x4_t t);
#define vreinterpret_s32_p16

_NEON2SSESTORAGE int32x2_t vreinterpret_s32_p8 (poly8x8_t t);
#define vreinterpret_s32_p8

_NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u32 (uint32x4_t t);
#define vreinterpretq_s32_u32

_NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u16 (uint16x8_t t);
#define vreinterpretq_s32_u16

_NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u8 (uint8x16_t t);
#define vreinterpretq_s32_u8

_NEON2SSESTORAGE int32x4_t vreinterpretq_s32_s16 (int16x8_t t);
#define vreinterpretq_s32_s16

_NEON2SSESTORAGE int32x4_t vreinterpretq_s32_s8 (int8x16_t t);
#define vreinterpretq_s32_s8

_NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u64 (uint64x2_t t);
#define vreinterpretq_s32_u64

_NEON2SSESTORAGE int32x4_t vreinterpretq_s32_s64 (int64x2_t t);
#define vreinterpretq_s32_s64

_NEON2SSESTORAGE int32x4_t vreinterpretq_s32_f32 (float32x4_t t);
#define vreinterpretq_s32_f32(t) _M128i(t)

_NEON2SSESTORAGE int32x4_t vreinterpretq_s32_p16 (poly16x8_t t);
#define vreinterpretq_s32_p16

_NEON2SSESTORAGE int32x4_t vreinterpretq_s32_p8 (poly8x16_t t);
#define vreinterpretq_s32_p8

_NEON2SSESTORAGE uint8x8_t vreinterpret_u8_u32 (uint32x2_t t);
#define vreinterpret_u8_u32

_NEON2SSESTORAGE uint8x8_t vreinterpret_u8_u16 (uint16x4_t t);
#define vreinterpret_u8_u16

_NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s32 (int32x2_t t);
#define vreinterpret_u8_s32

_NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s16 (int16x4_t t);
#define vreinterpret_u8_s16

_NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s8 (int8x8_t t);
#define vreinterpret_u8_s8

_NEON2SSESTORAGE uint8x8_t vreinterpret_u8_u64 (uint64x1_t t);
#define vreinterpret_u8_u64

_NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s64 (int64x1_t t);
#define vreinterpret_u8_s64

_NEON2SSESTORAGE uint8x8_t vreinterpret_u8_f32 (float32x2_t t);
#define vreinterpret_u8_f32

_NEON2SSESTORAGE uint8x8_t vreinterpret_u8_p16 (poly16x4_t t);
#define vreinterpret_u8_p16

_NEON2SSESTORAGE uint8x8_t vreinterpret_u8_p8 (poly8x8_t t);
#define vreinterpret_u8_p8

_NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_u32 (uint32x4_t t);
#define vreinterpretq_u8_u32

_NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_u16 (uint16x8_t t);
#define vreinterpretq_u8_u16

_NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s32 (int32x4_t t);
#define vreinterpretq_u8_s32

_NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s16 (int16x8_t t);
#define vreinterpretq_u8_s16

_NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s8 (int8x16_t t);
#define vreinterpretq_u8_s8

_NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_u64 (uint64x2_t t);
#define vreinterpretq_u8_u64

_NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s64 (int64x2_t t);
#define vreinterpretq_u8_s64

_NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_f32 (float32x4_t t);
#define vreinterpretq_u8_f32(t) _M128i(t)

_NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_p16 (poly16x8_t t);
#define vreinterpretq_u8_p16

_NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_p8 (poly8x16_t t);
#define vreinterpretq_u8_p8

_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_u32 (uint32x2_t t);
#define vreinterpret_u16_u32

_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_u8 (uint8x8_t t);
#define vreinterpret_u16_u8

_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s32 (int32x2_t t);
#define vreinterpret_u16_s32

_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s16 (int16x4_t t);
#define vreinterpret_u16_s16

_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s8 (int8x8_t t);
#define vreinterpret_u16_s8

_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_u64 (uint64x1_t t);
#define vreinterpret_u16_u64

_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s64 (int64x1_t t);
#define vreinterpret_u16_s64

_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_f32 (float32x2_t t);
#define vreinterpret_u16_f32

_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_p16 (poly16x4_t t);
#define vreinterpret_u16_p16

_NEON2SSESTORAGE uint16x4_t vreinterpret_u16_p8 (poly8x8_t t);
#define vreinterpret_u16_p8

_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_u32 (uint32x4_t t);
#define vreinterpretq_u16_u32

_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_u8 (uint8x16_t t);
#define vreinterpretq_u16_u8

_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s32 (int32x4_t t);
#define vreinterpretq_u16_s32

_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s16 (int16x8_t t);
#define vreinterpretq_u16_s16

_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s8 (int8x16_t t);
#define vreinterpretq_u16_s8

_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_u64 (uint64x2_t t);
#define vreinterpretq_u16_u64

_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s64 (int64x2_t t);
#define vreinterpretq_u16_s64

_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_f32 (float32x4_t t);
#define vreinterpretq_u16_f32(t) _M128i(t)

_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_p16 (poly16x8_t t);
#define vreinterpretq_u16_p16

_NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_p8 (poly8x16_t t);
#define vreinterpretq_u16_p8

_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_u16 (uint16x4_t t);
#define vreinterpret_u32_u16

_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_u8 (uint8x8_t t);
#define vreinterpret_u32_u8

_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s32 (int32x2_t t);
#define vreinterpret_u32_s32

_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s16 (int16x4_t t);
#define vreinterpret_u32_s16

_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s8 (int8x8_t t);
#define vreinterpret_u32_s8

_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_u64 (uint64x1_t t);
#define vreinterpret_u32_u64

_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s64 (int64x1_t t);
#define vreinterpret_u32_s64

_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_f32 (float32x2_t t);
#define vreinterpret_u32_f32

_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_p16 (poly16x4_t t);
#define vreinterpret_u32_p16

_NEON2SSESTORAGE uint32x2_t vreinterpret_u32_p8 (poly8x8_t t);
#define vreinterpret_u32_p8

_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_u16 (uint16x8_t t);
#define vreinterpretq_u32_u16

_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_u8 (uint8x16_t t);
#define vreinterpretq_u32_u8

_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s32 (int32x4_t t);
#define vreinterpretq_u32_s32

_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s16 (int16x8_t t);
#define vreinterpretq_u32_s16

_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s8 (int8x16_t t);
#define vreinterpretq_u32_s8

_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_u64 (uint64x2_t t);
#define vreinterpretq_u32_u64

_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s64 (int64x2_t t);
#define vreinterpretq_u32_s64

_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_f32 (float32x4_t t);
#define vreinterpretq_u32_f32(t) _M128i(t)

_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_p16 (poly16x8_t t);
#define vreinterpretq_u32_p16

_NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_p8 (poly8x16_t t);
#define vreinterpretq_u32_p8

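//Illustrative only: the reinterpret casts above re-type a register without generating any
//instructions, so float bit tricks can be written portably. Here the sign bit of every lane is
//cleared by going f32 -> u32 -> f32. The helper name and the NEON2SSE_USAGE_EXAMPLES guard are
//hypothetical, not part of the NEON API.
#ifdef NEON2SSE_USAGE_EXAMPLES
_NEON2SSE_INLINE float32x4_t neon2sse_example_fabs_by_reinterpret(float32x4_t v)
{
    uint32x4_t bits = vreinterpretq_u32_f32(v); //same 128 bits, viewed as 4 x uint32
    uint32x4_t mask = vdupq_n_u32(0x7fffffff); //clear only the sign bit of each lane
    return vreinterpretq_f32_u32(vandq_u32(bits, mask));
}
#endif //NEON2SSE_USAGE_EXAMPLES
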
//************* Round ******************
_NEON2SSESTORAGE float32x4_t vrndnq_f32(float32x4_t a);
#ifdef USE_SSE4
#   define vrndnq_f32(a) _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
#else
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x4_t vrndnq_f32(float32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
    int i;
    _NEON2SSE_ALIGN_16 float32_t res[4];
    _mm_store_ps(res, a);
    for(i = 0; i<4; i++) {
        res[i] = nearbyintf(res[i]);
    }
    return _mm_load_ps(res);
}
#endif

_NEON2SSESTORAGE float64x2_t vrndnq_f64(float64x2_t a);
#ifdef USE_SSE4
#   define vrndnq_f64(a) _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
#else
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float64x2_t vrndnq_f64(float64x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
    //use the double-precision nearbyint here: nearbyintf would round via float and lose precision
    _NEON2SSE_ALIGN_16 float64_t res[2];
    _mm_store_pd(res, a);
    res[0] = nearbyint(res[0]);
    res[1] = nearbyint(res[1]);
    return _mm_load_pd(res);
}
#endif
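
//Illustrative only: vrndnq_f32 rounds to the nearest integer with ties to even, both in the SSE4
//_mm_round_ps path and in the serial nearbyintf() fallback (assuming the default round-to-nearest
//FP environment). The helper name and the NEON2SSE_USAGE_EXAMPLES guard are hypothetical.
#ifdef NEON2SSE_USAGE_EXAMPLES
_NEON2SSE_INLINE void neon2sse_example_round_to_even(float32_t* out4)
{
    _NEON2SSE_ALIGN_16 float32_t in[4] = {1.25f, 2.5f, 3.5f, -2.5f};
    float32x4_t r = vrndnq_f32(vld1q_f32(in));
    vst1q_f32(out4, r); //expected: 1.0f, 2.0f, 4.0f, -2.0f
}
#endif //NEON2SSE_USAGE_EXAMPLES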

//************* Sqrt ******************
_NEON2SSESTORAGE float32x4_t vsqrtq_f32(float32x4_t a);
#define vsqrtq_f32 _mm_sqrt_ps

_NEON2SSESTORAGE float64x2_t vsqrtq_f64(float64x2_t a);
#define vsqrtq_f64 _mm_sqrt_pd

#endif /* NEON2SSE_H */