/* Source: /aosp_15_r20/external/lzma/C/SwapBytes.c (revision f6dc9357d832569d4d1f5d24eacdb3935a1ae8e6) */
/* SwapBytes.c -- Byte Swap conversion filter
2024-03-01 : Igor Pavlov : Public domain */
3*f6dc9357SAndroid Build Coastguard Worker 
4*f6dc9357SAndroid Build Coastguard Worker #include "Precomp.h"
5*f6dc9357SAndroid Build Coastguard Worker 
6*f6dc9357SAndroid Build Coastguard Worker #include "Compiler.h"
7*f6dc9357SAndroid Build Coastguard Worker #include "CpuArch.h"
8*f6dc9357SAndroid Build Coastguard Worker #include "RotateDefs.h"
9*f6dc9357SAndroid Build Coastguard Worker #include "SwapBytes.h"
10*f6dc9357SAndroid Build Coastguard Worker 
11*f6dc9357SAndroid Build Coastguard Worker typedef UInt16 CSwapUInt16;
12*f6dc9357SAndroid Build Coastguard Worker typedef UInt32 CSwapUInt32;
13*f6dc9357SAndroid Build Coastguard Worker 
14*f6dc9357SAndroid Build Coastguard Worker // #define k_SwapBytes_Mode_BASE   0
15*f6dc9357SAndroid Build Coastguard Worker 
16*f6dc9357SAndroid Build Coastguard Worker #ifdef MY_CPU_X86_OR_AMD64
17*f6dc9357SAndroid Build Coastguard Worker 
/* x86 SIMD levels. The numeric order matters: later checks compare
   k_SwapBytes_Mode_MAX against these values with >=. */
#define k_SwapBytes_Mode_SSE2   1
#define k_SwapBytes_Mode_SSSE3  2
#define k_SwapBytes_Mode_AVX2   3

  /* k_SwapBytes_Mode_MAX = highest SIMD level this compiler is known to build.
     For clang/gcc each level is enabled per function via a target attribute. */
  // #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900)
  #if (defined(__clang__) && (__clang_major__ >= 4)) \
      || (defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40701))
      #define k_SwapBytes_Mode_MAX  k_SwapBytes_Mode_AVX2
      #define SWAP_ATTRIB_SSE2  __attribute__((__target__("sse2")))
      #define SWAP_ATTRIB_SSSE3 __attribute__((__target__("ssse3")))
      #define SWAP_ATTRIB_AVX2  __attribute__((__target__("avx2")))
  #elif defined(_MSC_VER)
    #if (_MSC_VER == 1900)
      #pragma warning(disable : 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
    #endif
    #if (_MSC_VER >= 1900)
      #define k_SwapBytes_Mode_MAX  k_SwapBytes_Mode_AVX2
    #elif (_MSC_VER >= 1500)  // (VS2008)
      #define k_SwapBytes_Mode_MAX  k_SwapBytes_Mode_SSSE3
    #elif (_MSC_VER >= 1310)  // (VS2003)
      #define k_SwapBytes_Mode_MAX  k_SwapBytes_Mode_SSE2
    #endif
  #endif // _MSC_VER
41*f6dc9357SAndroid Build Coastguard Worker 
42*f6dc9357SAndroid Build Coastguard Worker /*
43*f6dc9357SAndroid Build Coastguard Worker // for debug
44*f6dc9357SAndroid Build Coastguard Worker #ifdef k_SwapBytes_Mode_MAX
45*f6dc9357SAndroid Build Coastguard Worker #undef k_SwapBytes_Mode_MAX
46*f6dc9357SAndroid Build Coastguard Worker #endif
47*f6dc9357SAndroid Build Coastguard Worker */
48*f6dc9357SAndroid Build Coastguard Worker 
/* No SIMD level is available if the compiler checks did not set a maximum. */
#ifndef k_SwapBytes_Mode_MAX
#define k_SwapBytes_Mode_MAX 0
#endif

/* SSE2 is part of the AMD64 baseline, so it is the minimum level there
   whenever any SIMD level is available at all. */
#if (k_SwapBytes_Mode_MAX != 0) && defined(MY_CPU_AMD64)
  #define k_SwapBytes_Mode_MIN  k_SwapBytes_Mode_SSE2
#else
  #define k_SwapBytes_Mode_MIN  0
#endif

/* Enable the code for every level up to and including k_SwapBytes_Mode_MAX. */
#if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_AVX2)
  #define USE_SWAP_AVX2
#endif
#if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_SSSE3)
  #define USE_SWAP_SSSE3
#endif
#if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_SSE2)
  #define USE_SWAP_128
#endif

/* FORCE_SWAP_MODE is set when there is nothing to choose between: the minimum
   and maximum levels coincide, or there is no 128-bit code at all.
   NOTE(review): presumably this disables run-time mode selection; the
   consumer of this macro is outside this section - confirm. */
#if k_SwapBytes_Mode_MAX <= k_SwapBytes_Mode_MIN || !defined(USE_SWAP_128)
#define FORCE_SWAP_MODE
#endif
72*f6dc9357SAndroid Build Coastguard Worker 
73*f6dc9357SAndroid Build Coastguard Worker 
74*f6dc9357SAndroid Build Coastguard Worker #ifdef USE_SWAP_128
75*f6dc9357SAndroid Build Coastguard Worker /*
76*f6dc9357SAndroid Build Coastguard Worker  <mmintrin.h> MMX
77*f6dc9357SAndroid Build Coastguard Worker <xmmintrin.h> SSE
78*f6dc9357SAndroid Build Coastguard Worker <emmintrin.h> SSE2
79*f6dc9357SAndroid Build Coastguard Worker <pmmintrin.h> SSE3
80*f6dc9357SAndroid Build Coastguard Worker <tmmintrin.h> SSSE3
81*f6dc9357SAndroid Build Coastguard Worker <smmintrin.h> SSE4.1
82*f6dc9357SAndroid Build Coastguard Worker <nmmintrin.h> SSE4.2
83*f6dc9357SAndroid Build Coastguard Worker <ammintrin.h> SSE4A
84*f6dc9357SAndroid Build Coastguard Worker <wmmintrin.h> AES
85*f6dc9357SAndroid Build Coastguard Worker <immintrin.h> AVX, AVX2, FMA
86*f6dc9357SAndroid Build Coastguard Worker */
87*f6dc9357SAndroid Build Coastguard Worker 
88*f6dc9357SAndroid Build Coastguard Worker #include <emmintrin.h> // sse2
89*f6dc9357SAndroid Build Coastguard Worker // typedef __m128i v128;
90*f6dc9357SAndroid Build Coastguard Worker 
91*f6dc9357SAndroid Build Coastguard Worker #define SWAP2_128(i) { \
92*f6dc9357SAndroid Build Coastguard Worker   const __m128i v = *(const __m128i *)(const void *)(items + (i) * 8); \
93*f6dc9357SAndroid Build Coastguard Worker                     *(      __m128i *)(      void *)(items + (i) * 8) = \
94*f6dc9357SAndroid Build Coastguard Worker     _mm_or_si128( \
95*f6dc9357SAndroid Build Coastguard Worker       _mm_slli_epi16(v, 8), \
96*f6dc9357SAndroid Build Coastguard Worker       _mm_srli_epi16(v, 8)); }
97*f6dc9357SAndroid Build Coastguard Worker // _mm_or_si128() has more ports to execute than _mm_add_epi16().
98*f6dc9357SAndroid Build Coastguard Worker 
99*f6dc9357SAndroid Build Coastguard Worker static
100*f6dc9357SAndroid Build Coastguard Worker #ifdef SWAP_ATTRIB_SSE2
101*f6dc9357SAndroid Build Coastguard Worker SWAP_ATTRIB_SSE2
102*f6dc9357SAndroid Build Coastguard Worker #endif
103*f6dc9357SAndroid Build Coastguard Worker void
104*f6dc9357SAndroid Build Coastguard Worker Z7_FASTCALL
SwapBytes2_128(CSwapUInt16 * items,const CSwapUInt16 * lim)105*f6dc9357SAndroid Build Coastguard Worker SwapBytes2_128(CSwapUInt16 *items, const CSwapUInt16 *lim)
106*f6dc9357SAndroid Build Coastguard Worker {
107*f6dc9357SAndroid Build Coastguard Worker   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
108*f6dc9357SAndroid Build Coastguard Worker   do
109*f6dc9357SAndroid Build Coastguard Worker   {
110*f6dc9357SAndroid Build Coastguard Worker     SWAP2_128(0)  SWAP2_128(1)  items += 2 * 8;
111*f6dc9357SAndroid Build Coastguard Worker     SWAP2_128(0)  SWAP2_128(1)  items += 2 * 8;
112*f6dc9357SAndroid Build Coastguard Worker   }
113*f6dc9357SAndroid Build Coastguard Worker   while (items != lim);
114*f6dc9357SAndroid Build Coastguard Worker }
115*f6dc9357SAndroid Build Coastguard Worker 
116*f6dc9357SAndroid Build Coastguard Worker /*
117*f6dc9357SAndroid Build Coastguard Worker // sse2
118*f6dc9357SAndroid Build Coastguard Worker #define SWAP4_128_pack(i) { \
119*f6dc9357SAndroid Build Coastguard Worker   __m128i v = *(const __m128i *)(const void *)(items + (i) * 4); \
120*f6dc9357SAndroid Build Coastguard Worker   __m128i v0 = _mm_unpacklo_epi8(v, mask); \
121*f6dc9357SAndroid Build Coastguard Worker   __m128i v1 = _mm_unpackhi_epi8(v, mask); \
122*f6dc9357SAndroid Build Coastguard Worker   v0 = _mm_shufflelo_epi16(v0, 0x1b); \
123*f6dc9357SAndroid Build Coastguard Worker   v1 = _mm_shufflelo_epi16(v1, 0x1b); \
124*f6dc9357SAndroid Build Coastguard Worker   v0 = _mm_shufflehi_epi16(v0, 0x1b); \
125*f6dc9357SAndroid Build Coastguard Worker   v1 = _mm_shufflehi_epi16(v1, 0x1b); \
126*f6dc9357SAndroid Build Coastguard Worker   *(__m128i *)(void *)(items + (i) * 4) = _mm_packus_epi16(v0, v1); }
127*f6dc9357SAndroid Build Coastguard Worker 
128*f6dc9357SAndroid Build Coastguard Worker static
129*f6dc9357SAndroid Build Coastguard Worker #ifdef SWAP_ATTRIB_SSE2
130*f6dc9357SAndroid Build Coastguard Worker SWAP_ATTRIB_SSE2
131*f6dc9357SAndroid Build Coastguard Worker #endif
132*f6dc9357SAndroid Build Coastguard Worker void
133*f6dc9357SAndroid Build Coastguard Worker Z7_FASTCALL
134*f6dc9357SAndroid Build Coastguard Worker SwapBytes4_128_pack(CSwapUInt32 *items, const CSwapUInt32 *lim)
135*f6dc9357SAndroid Build Coastguard Worker {
136*f6dc9357SAndroid Build Coastguard Worker   const __m128i mask = _mm_setzero_si128();
137*f6dc9357SAndroid Build Coastguard Worker   // const __m128i mask = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0);
138*f6dc9357SAndroid Build Coastguard Worker   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
139*f6dc9357SAndroid Build Coastguard Worker   do
140*f6dc9357SAndroid Build Coastguard Worker   {
141*f6dc9357SAndroid Build Coastguard Worker     SWAP4_128_pack(0); items += 1 * 4;
142*f6dc9357SAndroid Build Coastguard Worker     // SWAP4_128_pack(0); SWAP4_128_pack(1); items += 2 * 4;
143*f6dc9357SAndroid Build Coastguard Worker   }
144*f6dc9357SAndroid Build Coastguard Worker   while (items != lim);
145*f6dc9357SAndroid Build Coastguard Worker }
146*f6dc9357SAndroid Build Coastguard Worker 
147*f6dc9357SAndroid Build Coastguard Worker // sse2
148*f6dc9357SAndroid Build Coastguard Worker #define SWAP4_128_shift(i) { \
149*f6dc9357SAndroid Build Coastguard Worker   __m128i v = *(const __m128i *)(const void *)(items + (i) * 4); \
150*f6dc9357SAndroid Build Coastguard Worker   __m128i v2; \
151*f6dc9357SAndroid Build Coastguard Worker   v2 = _mm_or_si128( \
152*f6dc9357SAndroid Build Coastguard Worker         _mm_slli_si128(_mm_and_si128(v, mask), 1), \
153*f6dc9357SAndroid Build Coastguard Worker         _mm_and_si128(_mm_srli_si128(v, 1), mask)); \
154*f6dc9357SAndroid Build Coastguard Worker   v = _mm_or_si128( \
155*f6dc9357SAndroid Build Coastguard Worker         _mm_slli_epi32(v, 24), \
156*f6dc9357SAndroid Build Coastguard Worker         _mm_srli_epi32(v, 24)); \
157*f6dc9357SAndroid Build Coastguard Worker   *(__m128i *)(void *)(items + (i) * 4) = _mm_or_si128(v2, v); }
158*f6dc9357SAndroid Build Coastguard Worker 
159*f6dc9357SAndroid Build Coastguard Worker static
160*f6dc9357SAndroid Build Coastguard Worker #ifdef SWAP_ATTRIB_SSE2
161*f6dc9357SAndroid Build Coastguard Worker SWAP_ATTRIB_SSE2
162*f6dc9357SAndroid Build Coastguard Worker #endif
163*f6dc9357SAndroid Build Coastguard Worker void
164*f6dc9357SAndroid Build Coastguard Worker Z7_FASTCALL
165*f6dc9357SAndroid Build Coastguard Worker SwapBytes4_128_shift(CSwapUInt32 *items, const CSwapUInt32 *lim)
166*f6dc9357SAndroid Build Coastguard Worker {
167*f6dc9357SAndroid Build Coastguard Worker   #define M1 0xff00
168*f6dc9357SAndroid Build Coastguard Worker   const __m128i mask = _mm_set_epi32(M1, M1, M1, M1);
169*f6dc9357SAndroid Build Coastguard Worker   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
170*f6dc9357SAndroid Build Coastguard Worker   do
171*f6dc9357SAndroid Build Coastguard Worker   {
172*f6dc9357SAndroid Build Coastguard Worker     // SWAP4_128_shift(0)  SWAP4_128_shift(1)  items += 2 * 4;
173*f6dc9357SAndroid Build Coastguard Worker     // SWAP4_128_shift(0)  SWAP4_128_shift(1)  items += 2 * 4;
174*f6dc9357SAndroid Build Coastguard Worker     SWAP4_128_shift(0); items += 1 * 4;
175*f6dc9357SAndroid Build Coastguard Worker   }
176*f6dc9357SAndroid Build Coastguard Worker   while (items != lim);
177*f6dc9357SAndroid Build Coastguard Worker }
178*f6dc9357SAndroid Build Coastguard Worker */
179*f6dc9357SAndroid Build Coastguard Worker 
180*f6dc9357SAndroid Build Coastguard Worker 
181*f6dc9357SAndroid Build Coastguard Worker #if defined(USE_SWAP_SSSE3) || defined(USE_SWAP_AVX2)
182*f6dc9357SAndroid Build Coastguard Worker 
/* Byte indexes of one 2-byte / 4-byte group starting at index (v), listed in
   reversed order. Used to build byte-shuffle control masks. */
#define SWAP_SHUF_REV_SEQ_2_VALS(v)                (v)+1, (v)
#define SWAP_SHUF_REV_SEQ_4_VALS(v)  (v)+3, (v)+2, (v)+1, (v)

/* 16 comma-separated byte indexes: 1,0, 3,2, ... 15,14
   (reverses the bytes of every 16-bit element of a 16-byte vector). */
#define SWAP2_SHUF_MASK_16_BYTES \
    SWAP_SHUF_REV_SEQ_2_VALS (0 * 2), \
    SWAP_SHUF_REV_SEQ_2_VALS (1 * 2), \
    SWAP_SHUF_REV_SEQ_2_VALS (2 * 2), \
    SWAP_SHUF_REV_SEQ_2_VALS (3 * 2), \
    SWAP_SHUF_REV_SEQ_2_VALS (4 * 2), \
    SWAP_SHUF_REV_SEQ_2_VALS (5 * 2), \
    SWAP_SHUF_REV_SEQ_2_VALS (6 * 2), \
    SWAP_SHUF_REV_SEQ_2_VALS (7 * 2)

/* 16 comma-separated byte indexes: 3,2,1,0, 7,6,5,4, ... 15,14,13,12
   (reverses the bytes of every 32-bit element of a 16-byte vector). */
#define SWAP4_SHUF_MASK_16_BYTES \
    SWAP_SHUF_REV_SEQ_4_VALS (0 * 4), \
    SWAP_SHUF_REV_SEQ_4_VALS (1 * 4), \
    SWAP_SHUF_REV_SEQ_4_VALS (2 * 4), \
    SWAP_SHUF_REV_SEQ_4_VALS (3 * 4)
201*f6dc9357SAndroid Build Coastguard Worker 
#if defined(USE_SWAP_AVX2)
/* if we use 256_BIT_INIT_MASK, each static mask array will be larger by 16 bytes */
// #define SWAP_USE_256_BIT_INIT_MASK
#endif

/* Size in bytes of each static shuffle-mask array; also used as its alignment. */
#if defined(SWAP_USE_256_BIT_INIT_MASK) && defined(USE_SWAP_AVX2)
#define SWAP_MASK_INIT_SIZE 32
#else
#define SWAP_MASK_INIT_SIZE 16
#endif
212*f6dc9357SAndroid Build Coastguard Worker 
213*f6dc9357SAndroid Build Coastguard Worker MY_ALIGN(SWAP_MASK_INIT_SIZE)
214*f6dc9357SAndroid Build Coastguard Worker static const Byte k_ShufMask_Swap2[] =
215*f6dc9357SAndroid Build Coastguard Worker {
216*f6dc9357SAndroid Build Coastguard Worker     SWAP2_SHUF_MASK_16_BYTES
217*f6dc9357SAndroid Build Coastguard Worker   #if SWAP_MASK_INIT_SIZE > 16
218*f6dc9357SAndroid Build Coastguard Worker   , SWAP2_SHUF_MASK_16_BYTES
219*f6dc9357SAndroid Build Coastguard Worker   #endif
220*f6dc9357SAndroid Build Coastguard Worker };
221*f6dc9357SAndroid Build Coastguard Worker 
222*f6dc9357SAndroid Build Coastguard Worker MY_ALIGN(SWAP_MASK_INIT_SIZE)
223*f6dc9357SAndroid Build Coastguard Worker static const Byte k_ShufMask_Swap4[] =
224*f6dc9357SAndroid Build Coastguard Worker {
225*f6dc9357SAndroid Build Coastguard Worker     SWAP4_SHUF_MASK_16_BYTES
226*f6dc9357SAndroid Build Coastguard Worker   #if SWAP_MASK_INIT_SIZE > 16
227*f6dc9357SAndroid Build Coastguard Worker   , SWAP4_SHUF_MASK_16_BYTES
228*f6dc9357SAndroid Build Coastguard Worker   #endif
229*f6dc9357SAndroid Build Coastguard Worker };
230*f6dc9357SAndroid Build Coastguard Worker 
231*f6dc9357SAndroid Build Coastguard Worker 
232*f6dc9357SAndroid Build Coastguard Worker #ifdef USE_SWAP_SSSE3
233*f6dc9357SAndroid Build Coastguard Worker 
234*f6dc9357SAndroid Build Coastguard Worker #include <tmmintrin.h> // ssse3
235*f6dc9357SAndroid Build Coastguard Worker 
236*f6dc9357SAndroid Build Coastguard Worker #define SHUF_128(i)   *(items + (i)) = \
237*f6dc9357SAndroid Build Coastguard Worker      _mm_shuffle_epi8(*(items + (i)), mask); // SSSE3
238*f6dc9357SAndroid Build Coastguard Worker 
239*f6dc9357SAndroid Build Coastguard Worker // Z7_NO_INLINE
240*f6dc9357SAndroid Build Coastguard Worker static
241*f6dc9357SAndroid Build Coastguard Worker #ifdef SWAP_ATTRIB_SSSE3
242*f6dc9357SAndroid Build Coastguard Worker SWAP_ATTRIB_SSSE3
243*f6dc9357SAndroid Build Coastguard Worker #endif
244*f6dc9357SAndroid Build Coastguard Worker Z7_ATTRIB_NO_VECTORIZE
245*f6dc9357SAndroid Build Coastguard Worker void
246*f6dc9357SAndroid Build Coastguard Worker Z7_FASTCALL
ShufBytes_128(void * items8,const void * lim8,const void * mask128_ptr)247*f6dc9357SAndroid Build Coastguard Worker ShufBytes_128(void *items8, const void *lim8, const void *mask128_ptr)
248*f6dc9357SAndroid Build Coastguard Worker {
249*f6dc9357SAndroid Build Coastguard Worker   __m128i *items = (__m128i *)items8;
250*f6dc9357SAndroid Build Coastguard Worker   const __m128i *lim = (const __m128i *)lim8;
251*f6dc9357SAndroid Build Coastguard Worker   // const __m128i mask = _mm_set_epi8(SHUF_SWAP2_MASK_16_VALS);
252*f6dc9357SAndroid Build Coastguard Worker   // const __m128i mask = _mm_set_epi8(SHUF_SWAP4_MASK_16_VALS);
253*f6dc9357SAndroid Build Coastguard Worker   // const __m128i mask = _mm_load_si128((const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
254*f6dc9357SAndroid Build Coastguard Worker   // const __m128i mask = _mm_load_si128((const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
255*f6dc9357SAndroid Build Coastguard Worker   // const __m128i mask = *(const __m128i *)(const void *)&(k_ShufMask_Swap4[0]);
256*f6dc9357SAndroid Build Coastguard Worker   const __m128i mask = *(const __m128i *)mask128_ptr;
257*f6dc9357SAndroid Build Coastguard Worker   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
258*f6dc9357SAndroid Build Coastguard Worker   do
259*f6dc9357SAndroid Build Coastguard Worker   {
260*f6dc9357SAndroid Build Coastguard Worker     SHUF_128(0)  SHUF_128(1)  items += 2;
261*f6dc9357SAndroid Build Coastguard Worker     SHUF_128(0)  SHUF_128(1)  items += 2;
262*f6dc9357SAndroid Build Coastguard Worker   }
263*f6dc9357SAndroid Build Coastguard Worker   while (items != lim);
264*f6dc9357SAndroid Build Coastguard Worker }
265*f6dc9357SAndroid Build Coastguard Worker 
266*f6dc9357SAndroid Build Coastguard Worker #endif // USE_SWAP_SSSE3
267*f6dc9357SAndroid Build Coastguard Worker 
268*f6dc9357SAndroid Build Coastguard Worker 
269*f6dc9357SAndroid Build Coastguard Worker 
270*f6dc9357SAndroid Build Coastguard Worker #ifdef USE_SWAP_AVX2
271*f6dc9357SAndroid Build Coastguard Worker 
272*f6dc9357SAndroid Build Coastguard Worker #include <immintrin.h> // avx, avx2
273*f6dc9357SAndroid Build Coastguard Worker #if defined(__clang__)
274*f6dc9357SAndroid Build Coastguard Worker #include <avxintrin.h>
275*f6dc9357SAndroid Build Coastguard Worker #include <avx2intrin.h>
276*f6dc9357SAndroid Build Coastguard Worker #endif
277*f6dc9357SAndroid Build Coastguard Worker 
278*f6dc9357SAndroid Build Coastguard Worker #define SHUF_256(i)   *(items + (i)) = \
279*f6dc9357SAndroid Build Coastguard Worker   _mm256_shuffle_epi8(*(items + (i)), mask); // AVX2
280*f6dc9357SAndroid Build Coastguard Worker 
281*f6dc9357SAndroid Build Coastguard Worker // Z7_NO_INLINE
282*f6dc9357SAndroid Build Coastguard Worker static
283*f6dc9357SAndroid Build Coastguard Worker #ifdef SWAP_ATTRIB_AVX2
284*f6dc9357SAndroid Build Coastguard Worker SWAP_ATTRIB_AVX2
285*f6dc9357SAndroid Build Coastguard Worker #endif
286*f6dc9357SAndroid Build Coastguard Worker Z7_ATTRIB_NO_VECTORIZE
287*f6dc9357SAndroid Build Coastguard Worker void
288*f6dc9357SAndroid Build Coastguard Worker Z7_FASTCALL
ShufBytes_256(void * items8,const void * lim8,const void * mask128_ptr)289*f6dc9357SAndroid Build Coastguard Worker ShufBytes_256(void *items8, const void *lim8, const void *mask128_ptr)
290*f6dc9357SAndroid Build Coastguard Worker {
291*f6dc9357SAndroid Build Coastguard Worker   __m256i *items = (__m256i *)items8;
292*f6dc9357SAndroid Build Coastguard Worker   const __m256i *lim = (const __m256i *)lim8;
293*f6dc9357SAndroid Build Coastguard Worker   /*
294*f6dc9357SAndroid Build Coastguard Worker   UNUSED_VAR(mask128_ptr)
295*f6dc9357SAndroid Build Coastguard Worker   __m256i mask =
296*f6dc9357SAndroid Build Coastguard Worker   for Swap4: _mm256_setr_epi8(SWAP4_SHUF_MASK_16_BYTES, SWAP4_SHUF_MASK_16_BYTES);
297*f6dc9357SAndroid Build Coastguard Worker   for Swap2: _mm256_setr_epi8(SWAP2_SHUF_MASK_16_BYTES, SWAP2_SHUF_MASK_16_BYTES);
298*f6dc9357SAndroid Build Coastguard Worker   */
299*f6dc9357SAndroid Build Coastguard Worker   const __m256i mask =
300*f6dc9357SAndroid Build Coastguard Worker  #if SWAP_MASK_INIT_SIZE > 16
301*f6dc9357SAndroid Build Coastguard Worker       *(const __m256i *)(const void *)mask128_ptr;
302*f6dc9357SAndroid Build Coastguard Worker  #else
303*f6dc9357SAndroid Build Coastguard Worker   /* msvc: broadcastsi128() version reserves the stack for no reason
304*f6dc9357SAndroid Build Coastguard Worker      msvc 19.29-: _mm256_insertf128_si256() / _mm256_set_m128i)) versions use non-avx movdqu   xmm0,XMMWORD PTR [r8]
305*f6dc9357SAndroid Build Coastguard Worker      msvc 19.30+ (VS2022): replaces _mm256_set_m128i(m,m) to vbroadcastf128(m) as we want
306*f6dc9357SAndroid Build Coastguard Worker   */
307*f6dc9357SAndroid Build Coastguard Worker   // _mm256_broadcastsi128_si256(*mask128_ptr);
308*f6dc9357SAndroid Build Coastguard Worker #if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 80000)
309*f6dc9357SAndroid Build Coastguard Worker   #define MY_mm256_set_m128i(hi, lo)  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)
310*f6dc9357SAndroid Build Coastguard Worker #else
311*f6dc9357SAndroid Build Coastguard Worker   #define MY_mm256_set_m128i  _mm256_set_m128i
312*f6dc9357SAndroid Build Coastguard Worker #endif
313*f6dc9357SAndroid Build Coastguard Worker       MY_mm256_set_m128i(
314*f6dc9357SAndroid Build Coastguard Worker         *(const __m128i *)mask128_ptr,
315*f6dc9357SAndroid Build Coastguard Worker         *(const __m128i *)mask128_ptr);
316*f6dc9357SAndroid Build Coastguard Worker  #endif
317*f6dc9357SAndroid Build Coastguard Worker 
318*f6dc9357SAndroid Build Coastguard Worker   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
319*f6dc9357SAndroid Build Coastguard Worker   do
320*f6dc9357SAndroid Build Coastguard Worker   {
321*f6dc9357SAndroid Build Coastguard Worker     SHUF_256(0)  SHUF_256(1)  items += 2;
322*f6dc9357SAndroid Build Coastguard Worker     SHUF_256(0)  SHUF_256(1)  items += 2;
323*f6dc9357SAndroid Build Coastguard Worker   }
324*f6dc9357SAndroid Build Coastguard Worker   while (items != lim);
325*f6dc9357SAndroid Build Coastguard Worker }
326*f6dc9357SAndroid Build Coastguard Worker 
327*f6dc9357SAndroid Build Coastguard Worker #endif // USE_SWAP_AVX2
328*f6dc9357SAndroid Build Coastguard Worker #endif // USE_SWAP_SSSE3 || USE_SWAP_AVX2
329*f6dc9357SAndroid Build Coastguard Worker #endif // USE_SWAP_128
330*f6dc9357SAndroid Build Coastguard Worker 
331*f6dc9357SAndroid Build Coastguard Worker 
332*f6dc9357SAndroid Build Coastguard Worker 
333*f6dc9357SAndroid Build Coastguard Worker // compile message "NEON intrinsics not available with the soft-float ABI"
334*f6dc9357SAndroid Build Coastguard Worker #elif defined(MY_CPU_ARM_OR_ARM64) \
335*f6dc9357SAndroid Build Coastguard Worker     && defined(MY_CPU_LE) \
336*f6dc9357SAndroid Build Coastguard Worker     && !defined(Z7_DISABLE_ARM_NEON)
337*f6dc9357SAndroid Build Coastguard Worker 
338*f6dc9357SAndroid Build Coastguard Worker   #if defined(__clang__) && (__clang_major__ >= 8) \
339*f6dc9357SAndroid Build Coastguard Worker     || defined(__GNUC__) && (__GNUC__ >= 6)
340*f6dc9357SAndroid Build Coastguard Worker     #if defined(__ARM_FP)
341*f6dc9357SAndroid Build Coastguard Worker     #if (defined(__ARM_ARCH) && (__ARM_ARCH >= 4)) \
342*f6dc9357SAndroid Build Coastguard Worker         || defined(MY_CPU_ARM64)
343*f6dc9357SAndroid Build Coastguard Worker     #if  defined(MY_CPU_ARM64) \
344*f6dc9357SAndroid Build Coastguard Worker       || !defined(Z7_CLANG_VERSION) \
345*f6dc9357SAndroid Build Coastguard Worker       || defined(__ARM_NEON)
346*f6dc9357SAndroid Build Coastguard Worker       #define USE_SWAP_128
347*f6dc9357SAndroid Build Coastguard Worker     #ifdef MY_CPU_ARM64
348*f6dc9357SAndroid Build Coastguard Worker       // #define SWAP_ATTRIB_NEON __attribute__((__target__("")))
349*f6dc9357SAndroid Build Coastguard Worker     #else
350*f6dc9357SAndroid Build Coastguard Worker #if defined(Z7_CLANG_VERSION)
351*f6dc9357SAndroid Build Coastguard Worker       // #define SWAP_ATTRIB_NEON __attribute__((__target__("neon")))
352*f6dc9357SAndroid Build Coastguard Worker #else
353*f6dc9357SAndroid Build Coastguard Worker       // #pragma message("SWAP_ATTRIB_NEON __attribute__((__target__(fpu=neon))")
354*f6dc9357SAndroid Build Coastguard Worker       #define SWAP_ATTRIB_NEON __attribute__((__target__("fpu=neon")))
355*f6dc9357SAndroid Build Coastguard Worker #endif
356*f6dc9357SAndroid Build Coastguard Worker     #endif // MY_CPU_ARM64
357*f6dc9357SAndroid Build Coastguard Worker     #endif // __ARM_NEON
358*f6dc9357SAndroid Build Coastguard Worker     #endif // __ARM_ARCH
359*f6dc9357SAndroid Build Coastguard Worker     #endif // __ARM_FP
360*f6dc9357SAndroid Build Coastguard Worker 
361*f6dc9357SAndroid Build Coastguard Worker   #elif defined(_MSC_VER)
362*f6dc9357SAndroid Build Coastguard Worker     #if (_MSC_VER >= 1910)
363*f6dc9357SAndroid Build Coastguard Worker       #define USE_SWAP_128
364*f6dc9357SAndroid Build Coastguard Worker     #endif
365*f6dc9357SAndroid Build Coastguard Worker   #endif
366*f6dc9357SAndroid Build Coastguard Worker 
367*f6dc9357SAndroid Build Coastguard Worker   #ifdef USE_SWAP_128
368*f6dc9357SAndroid Build Coastguard Worker   #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
369*f6dc9357SAndroid Build Coastguard Worker     #include <arm64_neon.h>
370*f6dc9357SAndroid Build Coastguard Worker   #else
371*f6dc9357SAndroid Build Coastguard Worker 
372*f6dc9357SAndroid Build Coastguard Worker /*
373*f6dc9357SAndroid Build Coastguard Worker #if !defined(__ARM_NEON)
374*f6dc9357SAndroid Build Coastguard Worker #if defined(Z7_GCC_VERSION) && (__GNUC__  <   5) \
375*f6dc9357SAndroid Build Coastguard Worker  || defined(Z7_GCC_VERSION) && (__GNUC__ ==   5) && (Z7_GCC_VERSION <  90201) \
376*f6dc9357SAndroid Build Coastguard Worker  || defined(Z7_GCC_VERSION) && (__GNUC__ ==   5) && (Z7_GCC_VERSION < 100100)
377*f6dc9357SAndroid Build Coastguard Worker Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
378*f6dc9357SAndroid Build Coastguard Worker #pragma message("#define __ARM_NEON 1")
379*f6dc9357SAndroid Build Coastguard Worker // #define __ARM_NEON 1
380*f6dc9357SAndroid Build Coastguard Worker Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
381*f6dc9357SAndroid Build Coastguard Worker #endif
382*f6dc9357SAndroid Build Coastguard Worker #endif
383*f6dc9357SAndroid Build Coastguard Worker */
384*f6dc9357SAndroid Build Coastguard Worker     #include <arm_neon.h>
385*f6dc9357SAndroid Build Coastguard Worker   #endif
386*f6dc9357SAndroid Build Coastguard Worker   #endif
387*f6dc9357SAndroid Build Coastguard Worker 
388*f6dc9357SAndroid Build Coastguard Worker #ifndef USE_SWAP_128
389*f6dc9357SAndroid Build Coastguard Worker   #define FORCE_SWAP_MODE
390*f6dc9357SAndroid Build Coastguard Worker #else
391*f6dc9357SAndroid Build Coastguard Worker 
/* ARM64 sets FORCE_SWAP_MODE; 32-bit ARM instead defines the NEON mode id.
   NOTE(review): presumably because NEON is always present on ARM64 and must be
   detected at run time on 32-bit ARM - the consumer is outside this section. */
#ifdef MY_CPU_ARM64
  // for debug : comment it
  #define FORCE_SWAP_MODE
#else
  #define k_SwapBytes_Mode_NEON 1
#endif
398*f6dc9357SAndroid Build Coastguard Worker // typedef uint8x16_t v128;
/* In-place byte reversal of the (i)-th 16-byte vector at `items` (pointer taken
   from the enclosing scope): SWAP2_128 reverses bytes inside each 16-bit
   element (items is a UInt16 pointer, 8 per vector), SWAP4_128 inside each
   32-bit element (items is a UInt32 pointer, 4 per vector). */
#define SWAP2_128(i)   *(uint8x16_t *)      (void *)(items + (i) * 8) = \
      vrev16q_u8(*(const uint8x16_t *)(const void *)(items + (i) * 8));
#define SWAP4_128(i)   *(uint8x16_t *)      (void *)(items + (i) * 4) = \
      vrev32q_u8(*(const uint8x16_t *)(const void *)(items + (i) * 4));
403*f6dc9357SAndroid Build Coastguard Worker 
404*f6dc9357SAndroid Build Coastguard Worker // Z7_NO_INLINE
/*
  NEON in-place byte swap of 16-bit items in [items, lim).

  Each loop iteration converts 4 vectors (32 items, 64 bytes). The loop is a
  do/while that stops only when `items == lim`, so the range must be non-empty
  and its length must be a multiple of 32 items.
*/
static
#ifdef SWAP_ATTRIB_NEON
SWAP_ATTRIB_NEON
#endif
Z7_ATTRIB_NO_VECTORIZE
void
Z7_FASTCALL
SwapBytes2_128(CSwapUInt16 *items, const CSwapUInt16 *lim)
{
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SWAP2_128(0)  SWAP2_128(1)  items += 2 * 8;
    SWAP2_128(0)  SWAP2_128(1)  items += 2 * 8;
  }
  while (items != lim);
}
422*f6dc9357SAndroid Build Coastguard Worker 
423*f6dc9357SAndroid Build Coastguard Worker // Z7_NO_INLINE
/*
  NEON in-place byte reversal of 32-bit items in [items, lim).

  Each loop iteration converts 4 vectors (16 items, 64 bytes). The loop is a
  do/while that stops only when `items == lim`, so the range must be non-empty
  and its length must be a multiple of 16 items.
*/
static
#ifdef SWAP_ATTRIB_NEON
SWAP_ATTRIB_NEON
#endif
Z7_ATTRIB_NO_VECTORIZE
void
Z7_FASTCALL
SwapBytes4_128(CSwapUInt32 *items, const CSwapUInt32 *lim)
{
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SWAP4_128(0)  SWAP4_128(1)  items += 2 * 4;
    SWAP4_128(0)  SWAP4_128(1)  items += 2 * 4;
  }
  while (items != lim);
}
441*f6dc9357SAndroid Build Coastguard Worker 
442*f6dc9357SAndroid Build Coastguard Worker #endif // USE_SWAP_128
443*f6dc9357SAndroid Build Coastguard Worker 
444*f6dc9357SAndroid Build Coastguard Worker #else // MY_CPU_ARM_OR_ARM64
445*f6dc9357SAndroid Build Coastguard Worker #define FORCE_SWAP_MODE
446*f6dc9357SAndroid Build Coastguard Worker #endif // MY_CPU_ARM_OR_ARM64
447*f6dc9357SAndroid Build Coastguard Worker 
448*f6dc9357SAndroid Build Coastguard Worker 
449*f6dc9357SAndroid Build Coastguard Worker 
450*f6dc9357SAndroid Build Coastguard Worker 
451*f6dc9357SAndroid Build Coastguard Worker 
452*f6dc9357SAndroid Build Coastguard Worker 
/* SWAP2_16(i): swaps the two bytes of the 16-bit item items[i] in place.
   It expects a variable named `items` in the scope where it is expanded. */
#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_X86)
  /* _byteswap_ushort() in MSVC x86 32-bit works via slow { mov dh, al; mov dl, ah }
     So we use own versions of byteswap function */
  #if (_MSC_VER < 1400 )  // old MSVC-X86 without _rotr16() support
    /* v + (v << 16) duplicates the 16-bit value into both halves of a 32-bit word;
       after (>> 8) the low 16 bits hold the two bytes in swapped order */
    #define SWAP2_16(i)  { UInt32 v = items[i];  v += (v << 16);  v >>= 8;  items[i] = (CSwapUInt16)v; }
  #else  // is new MSVC-X86 with fast _rotr16()
    #include <intrin.h>
    #define SWAP2_16(i)  { items[i] = _rotr16(items[i], 8); }
  #endif
#else  // is not MSVC-X86
  #define SWAP2_16(i)  { CSwapUInt16 v = items[i];  items[i] = Z7_BSWAP16(v); }
#endif  // MSVC-X86
465*f6dc9357SAndroid Build Coastguard Worker 
/* SWAP4_32(i): reverses the byte order of the 32-bit item items[i] in place.
   It expects a variable named `items` in the scope where it is expanded. */
#if defined(Z7_CPU_FAST_BSWAP_SUPPORTED)
  #define SWAP4_32(i)  { CSwapUInt32 v = items[i];  items[i] = Z7_BSWAP32(v); }
#else
  /* portable form: swap the bytes inside each 16-bit half,
     then exchange the two halves with a 16-bit rotation */
  #define SWAP4_32(i)  \
    { UInt32 v = items[i]; \
      v = ((v & 0xff00ff) << 8) + ((v >> 8) & 0xff00ff); \
      v = rotlFixed(v, 16); \
      items[i] = v; }
#endif
475*f6dc9357SAndroid Build Coastguard Worker 
476*f6dc9357SAndroid Build Coastguard Worker 
477*f6dc9357SAndroid Build Coastguard Worker 
478*f6dc9357SAndroid Build Coastguard Worker 
/* With a forced (compile-time) mode and 128-bit code available, the 128-bit
   routines become the defaults. On x86/x64 only DEFAULT_Swap2 is set here;
   DEFAULT_Swap4 is then chosen by the generic code below. */
#if defined(FORCE_SWAP_MODE) && defined(USE_SWAP_128)
  #define DEFAULT_Swap2  SwapBytes2_128
  #if !defined(MY_CPU_X86_OR_AMD64)
    #define DEFAULT_Swap4  SwapBytes4_128
  #endif
#endif
485*f6dc9357SAndroid Build Coastguard Worker 
486*f6dc9357SAndroid Build Coastguard Worker #if !defined(DEFAULT_Swap2) || !defined(DEFAULT_Swap4)
487*f6dc9357SAndroid Build Coastguard Worker 
/* Common declaration prefix of the non-vector SwapBytes*_64 / SwapBytes*_32
   functions defined below */
#define SWAP_BASE_FUNCS_PREFIXES \
Z7_FORCE_INLINE  \
static \
Z7_ATTRIB_NO_VECTOR  \
void Z7_FASTCALL


/* The ARM code below uses the `asm` keyword; this silences clang's
   -Wlanguage-extension-token diagnostic for ARM builds */
#if defined(MY_CPU_ARM_OR_ARM64)
#if defined(__clang__)
#pragma GCC diagnostic ignored "-Wlanguage-extension-token"
#endif
#endif
500*f6dc9357SAndroid Build Coastguard Worker 
501*f6dc9357SAndroid Build Coastguard Worker 
502*f6dc9357SAndroid Build Coastguard Worker #ifdef MY_CPU_64BIT
503*f6dc9357SAndroid Build Coastguard Worker 
/* SWAP2_64_VAR(v): swaps the two bytes of each 16-bit lane of the 64-bit variable v.
   SWAP4_64_VAR(v): reverses the four bytes of each 32-bit lane of the 64-bit variable v.
   Both expand to complete statement(s), including the trailing semicolon. */
#if defined(MY_CPU_ARM64) \
    && defined(__ARM_ARCH) && (__ARM_ARCH >= 8) \
    && (  (defined(__GNUC__) && (__GNUC__ >= 4)) \
       || (defined(__clang__) && (__clang_major__ >= 4)))

  /* ARM64 with GNU-style inline asm: REV16 / REV32 on a 64-bit register */
  #define SWAP2_64_VAR(v)  asm ("rev16 %x0,%x0" : "+r" (v));
  #define SWAP4_64_VAR(v)  asm ("rev32 %x0,%x0" : "+r" (v));

#else  // is not ARM64-GNU

/* SWAP2_64_VAR is intentionally left undefined for x86/x64 builds where
   k_SwapBytes_Mode_MIN is nonzero and USE_SWAP_128 is defined:
   the code below then uses SwapBytes2_128 as DEFAULT_Swap2 */
#if !defined(MY_CPU_X86_OR_AMD64) || (k_SwapBytes_Mode_MIN == 0) || !defined(USE_SWAP_128)
  #define SWAP2_64_VAR(v) \
    v = ( 0x00ff00ff00ff00ff & (v >> 8))  \
      + ((0x00ff00ff00ff00ff & v) << 8);
      /* plus gives faster code in MSVC */
#endif

#ifdef Z7_CPU_FAST_BSWAP_SUPPORTED
  /* full 64-bit byte reversal also exchanges the two 32-bit halves;
     the 32-bit rotation puts them back in place */
  #define SWAP4_64_VAR(v) \
    v = Z7_BSWAP64(v); \
    v = Z7_ROTL64(v, 32);
#else
  /* portable form: move each of the four byte positions of both 32-bit lanes
     with one mask-and-shift term */
  #define SWAP4_64_VAR(v) \
    v = ( 0x000000ff000000ff & (v >> 24))  \
      + ((0x000000ff000000ff & v) << 24 )  \
      + ( 0x0000ff000000ff00 & (v >>  8))  \
      + ((0x0000ff000000ff00 & v) <<  8 )  \
      ;
#endif

#endif  // ARM64-GNU
535*f6dc9357SAndroid Build Coastguard Worker 
536*f6dc9357SAndroid Build Coastguard Worker 
#ifdef SWAP2_64_VAR

/* SWAP2_64(i): loads the i-th 64-bit word (4 items of 16 bits) counted from
   `items`, swaps the bytes of each 16-bit lane with SWAP2_64_VAR(), and
   stores the word back. The access goes through a UInt64 pointer cast;
   z7_SwapBytes2() aligns `items` before it calls the block routine. */
#define SWAP2_64(i) { \
    UInt64 v = *(const UInt64 *)(const void *)(items + (i) * 4); \
    SWAP2_64_VAR(v) \
    *(UInt64 *)(void *)(items + (i) * 4) = v; }

/*
  SwapBytes2_64():
    Swaps the bytes of every 16-bit item in [items, lim), 64 bits at a time.
    One pass of the loop covers 2 * (2 * 4) = 16 items.
    The do/while form tests (items != lim) only after a full pass, so the
    caller must supply a non-empty range whose length is a multiple of 16 items.
*/
SWAP_BASE_FUNCS_PREFIXES
SwapBytes2_64(CSwapUInt16 *items, const CSwapUInt16 *lim)
{
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SWAP2_64(0)  SWAP2_64(1)  items += 2 * 4;
    SWAP2_64(0)  SWAP2_64(1)  items += 2 * 4;
  }
  while (items != lim);
}

  #define DEFAULT_Swap2  SwapBytes2_64
  #if !defined(FORCE_SWAP_MODE)
    /* 0: DEFAULT_Swap2 is not the 128-bit routine, so z7_SwapBytes2()
       also compiles a run-time branch to SwapBytes2_128 */
    #define SWAP2_DEFAULT_MODE 0
  #endif
#else // !defined(SWAP2_64_VAR)
  #define DEFAULT_Swap2  SwapBytes2_128
  #if !defined(FORCE_SWAP_MODE)
    /* 1: DEFAULT_Swap2 already is SwapBytes2_128 */
    #define SWAP2_DEFAULT_MODE 1
  #endif
#endif // SWAP2_64_VAR
566*f6dc9357SAndroid Build Coastguard Worker 
567*f6dc9357SAndroid Build Coastguard Worker 
568*f6dc9357SAndroid Build Coastguard Worker #define SWAP4_64(i) { \
569*f6dc9357SAndroid Build Coastguard Worker     UInt64 v = *(const UInt64 *)(const void *)(items + (i) * 2); \
570*f6dc9357SAndroid Build Coastguard Worker     SWAP4_64_VAR(v) \
571*f6dc9357SAndroid Build Coastguard Worker     *(UInt64 *)(void *)(items + (i) * 2) = v; }
572*f6dc9357SAndroid Build Coastguard Worker 
573*f6dc9357SAndroid Build Coastguard Worker SWAP_BASE_FUNCS_PREFIXES
SwapBytes4_64(CSwapUInt32 * items,const CSwapUInt32 * lim)574*f6dc9357SAndroid Build Coastguard Worker SwapBytes4_64(CSwapUInt32 *items, const CSwapUInt32 *lim)
575*f6dc9357SAndroid Build Coastguard Worker {
576*f6dc9357SAndroid Build Coastguard Worker   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
577*f6dc9357SAndroid Build Coastguard Worker   do
578*f6dc9357SAndroid Build Coastguard Worker   {
579*f6dc9357SAndroid Build Coastguard Worker     SWAP4_64(0)  SWAP4_64(1)  items += 2 * 2;
580*f6dc9357SAndroid Build Coastguard Worker     SWAP4_64(0)  SWAP4_64(1)  items += 2 * 2;
581*f6dc9357SAndroid Build Coastguard Worker   }
582*f6dc9357SAndroid Build Coastguard Worker   while (items != lim);
583*f6dc9357SAndroid Build Coastguard Worker }
584*f6dc9357SAndroid Build Coastguard Worker 
585*f6dc9357SAndroid Build Coastguard Worker #define DEFAULT_Swap4  SwapBytes4_64
586*f6dc9357SAndroid Build Coastguard Worker 
587*f6dc9357SAndroid Build Coastguard Worker #else  // is not 64BIT
588*f6dc9357SAndroid Build Coastguard Worker 
589*f6dc9357SAndroid Build Coastguard Worker 
/* SWAP2_32_VAR(v): swaps the two bytes of each 16-bit lane of the 32-bit variable v.
   It expands to complete statement(s), including the trailing semicolon. */
#if defined(MY_CPU_ARM_OR_ARM64) \
    && defined(__ARM_ARCH) && (__ARM_ARCH >= 6) \
    && (  (defined(__GNUC__) && (__GNUC__ >= 4)) \
       || (defined(__clang__) && (__clang_major__ >= 4)))

/* NOTE: this section is inside the "not MY_CPU_64BIT" branch of the enclosing
   conditional, so the MY_CPU_64BIT variant below is never selected here */
#ifdef MY_CPU_64BIT
  #define SWAP2_32_VAR(v)  asm ("rev16 %w0,%w0" : "+r" (v));
#else
  #define SWAP2_32_VAR(v)  asm ("rev16 %0,%0" : "+r" (v)); // for clang/gcc
    // asm ("rev16 %r0,%r0" : "+r" (a));  // for gcc
#endif

#elif defined(_MSC_VER) && (_MSC_VER < 1300) && defined(MY_CPU_X86) \
    || !defined(Z7_CPU_FAST_BSWAP_SUPPORTED) \
    || !defined(Z7_CPU_FAST_ROTATE_SUPPORTED)
  // old msvc doesn't support _byteswap_ulong()
  /* portable form: exchange the odd and even bytes with two mask-and-shift terms */
  #define SWAP2_32_VAR(v) \
    v = ((v & 0xff00ff) << 8) + ((v >> 8) & 0xff00ff);

#else  // is not ARM and is not old-MSVC-X86 and fast BSWAP/ROTATE are supported
  /* full 32-bit byte reversal also exchanges the two 16-bit halves;
     the 16-bit rotation puts them back in place */
  #define SWAP2_32_VAR(v) \
    v = Z7_BSWAP32(v); \
    v = rotlFixed(v, 16);

#endif  // GNU-ARM*
615*f6dc9357SAndroid Build Coastguard Worker 
616*f6dc9357SAndroid Build Coastguard Worker #define SWAP2_32(i) { \
617*f6dc9357SAndroid Build Coastguard Worker     UInt32 v = *(const UInt32 *)(const void *)(items + (i) * 2); \
618*f6dc9357SAndroid Build Coastguard Worker     SWAP2_32_VAR(v); \
619*f6dc9357SAndroid Build Coastguard Worker     *(UInt32 *)(void *)(items + (i) * 2) = v; }
620*f6dc9357SAndroid Build Coastguard Worker 
621*f6dc9357SAndroid Build Coastguard Worker 
622*f6dc9357SAndroid Build Coastguard Worker SWAP_BASE_FUNCS_PREFIXES
SwapBytes2_32(CSwapUInt16 * items,const CSwapUInt16 * lim)623*f6dc9357SAndroid Build Coastguard Worker SwapBytes2_32(CSwapUInt16 *items, const CSwapUInt16 *lim)
624*f6dc9357SAndroid Build Coastguard Worker {
625*f6dc9357SAndroid Build Coastguard Worker   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
626*f6dc9357SAndroid Build Coastguard Worker   do
627*f6dc9357SAndroid Build Coastguard Worker   {
628*f6dc9357SAndroid Build Coastguard Worker     SWAP2_32(0)  SWAP2_32(1)  items += 2 * 2;
629*f6dc9357SAndroid Build Coastguard Worker     SWAP2_32(0)  SWAP2_32(1)  items += 2 * 2;
630*f6dc9357SAndroid Build Coastguard Worker   }
631*f6dc9357SAndroid Build Coastguard Worker   while (items != lim);
632*f6dc9357SAndroid Build Coastguard Worker }
633*f6dc9357SAndroid Build Coastguard Worker 
634*f6dc9357SAndroid Build Coastguard Worker 
635*f6dc9357SAndroid Build Coastguard Worker SWAP_BASE_FUNCS_PREFIXES
SwapBytes4_32(CSwapUInt32 * items,const CSwapUInt32 * lim)636*f6dc9357SAndroid Build Coastguard Worker SwapBytes4_32(CSwapUInt32 *items, const CSwapUInt32 *lim)
637*f6dc9357SAndroid Build Coastguard Worker {
638*f6dc9357SAndroid Build Coastguard Worker   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
639*f6dc9357SAndroid Build Coastguard Worker   do
640*f6dc9357SAndroid Build Coastguard Worker   {
641*f6dc9357SAndroid Build Coastguard Worker     SWAP4_32(0)  SWAP4_32(1)  items += 2;
642*f6dc9357SAndroid Build Coastguard Worker     SWAP4_32(0)  SWAP4_32(1)  items += 2;
643*f6dc9357SAndroid Build Coastguard Worker   }
644*f6dc9357SAndroid Build Coastguard Worker   while (items != lim);
645*f6dc9357SAndroid Build Coastguard Worker }
646*f6dc9357SAndroid Build Coastguard Worker 
647*f6dc9357SAndroid Build Coastguard Worker #define DEFAULT_Swap2  SwapBytes2_32
648*f6dc9357SAndroid Build Coastguard Worker #define DEFAULT_Swap4  SwapBytes4_32
649*f6dc9357SAndroid Build Coastguard Worker #if !defined(FORCE_SWAP_MODE)
650*f6dc9357SAndroid Build Coastguard Worker   #define SWAP2_DEFAULT_MODE 0
651*f6dc9357SAndroid Build Coastguard Worker #endif
652*f6dc9357SAndroid Build Coastguard Worker 
653*f6dc9357SAndroid Build Coastguard Worker #endif // MY_CPU_64BIT
654*f6dc9357SAndroid Build Coastguard Worker #endif // if !defined(DEFAULT_Swap2) || !defined(DEFAULT_Swap4)
655*f6dc9357SAndroid Build Coastguard Worker 
656*f6dc9357SAndroid Build Coastguard Worker 
657*f6dc9357SAndroid Build Coastguard Worker 
#if !defined(FORCE_SWAP_MODE)
/* Run-time selected implementation level: written by z7_SwapBytesPrepare(),
   read by z7_SwapBytes2() / z7_SwapBytes4().
   As a static object it is 0 (base code) until z7_SwapBytesPrepare() runs. */
static unsigned g_SwapBytes_Mode;
#endif

/* size of largest unrolled loop iteration: 128 bytes = 4 * 32 bytes (AVX). */
#define SWAP_ITERATION_BLOCK_SIZE_MAX  (1 << 7)

// 32 bytes for (AVX) or 2 * 16-bytes for NEON.
#define SWAP_VECTOR_ALIGN_SIZE  (1 << 5)
667*f6dc9357SAndroid Build Coastguard Worker 
668*f6dc9357SAndroid Build Coastguard Worker Z7_NO_INLINE
z7_SwapBytes2(CSwapUInt16 * items,size_t numItems)669*f6dc9357SAndroid Build Coastguard Worker void z7_SwapBytes2(CSwapUInt16 *items, size_t numItems)
670*f6dc9357SAndroid Build Coastguard Worker {
671*f6dc9357SAndroid Build Coastguard Worker   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
672*f6dc9357SAndroid Build Coastguard Worker   for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (SWAP_VECTOR_ALIGN_SIZE - 1)) != 0; numItems--)
673*f6dc9357SAndroid Build Coastguard Worker   {
674*f6dc9357SAndroid Build Coastguard Worker     SWAP2_16(0)
675*f6dc9357SAndroid Build Coastguard Worker     items++;
676*f6dc9357SAndroid Build Coastguard Worker   }
677*f6dc9357SAndroid Build Coastguard Worker   {
678*f6dc9357SAndroid Build Coastguard Worker     const size_t k_Align_Mask = SWAP_ITERATION_BLOCK_SIZE_MAX / sizeof(CSwapUInt16) - 1;
679*f6dc9357SAndroid Build Coastguard Worker     size_t numItems2 = numItems;
680*f6dc9357SAndroid Build Coastguard Worker     CSwapUInt16 *lim;
681*f6dc9357SAndroid Build Coastguard Worker     numItems &= k_Align_Mask;
682*f6dc9357SAndroid Build Coastguard Worker     numItems2 &= ~(size_t)k_Align_Mask;
683*f6dc9357SAndroid Build Coastguard Worker     lim = items + numItems2;
684*f6dc9357SAndroid Build Coastguard Worker     if (numItems2 != 0)
685*f6dc9357SAndroid Build Coastguard Worker     {
686*f6dc9357SAndroid Build Coastguard Worker      #if !defined(FORCE_SWAP_MODE)
687*f6dc9357SAndroid Build Coastguard Worker       #ifdef MY_CPU_X86_OR_AMD64
688*f6dc9357SAndroid Build Coastguard Worker         #ifdef USE_SWAP_AVX2
689*f6dc9357SAndroid Build Coastguard Worker           if (g_SwapBytes_Mode > k_SwapBytes_Mode_SSSE3)
690*f6dc9357SAndroid Build Coastguard Worker             ShufBytes_256((__m256i *)(void *)items,
691*f6dc9357SAndroid Build Coastguard Worker                 (const __m256i *)(const void *)lim,
692*f6dc9357SAndroid Build Coastguard Worker                 (const __m128i *)(const void *)&(k_ShufMask_Swap2[0]));
693*f6dc9357SAndroid Build Coastguard Worker           else
694*f6dc9357SAndroid Build Coastguard Worker         #endif
695*f6dc9357SAndroid Build Coastguard Worker         #ifdef USE_SWAP_SSSE3
696*f6dc9357SAndroid Build Coastguard Worker           if (g_SwapBytes_Mode >= k_SwapBytes_Mode_SSSE3)
697*f6dc9357SAndroid Build Coastguard Worker             ShufBytes_128((__m128i *)(void *)items,
698*f6dc9357SAndroid Build Coastguard Worker                 (const __m128i *)(const void *)lim,
699*f6dc9357SAndroid Build Coastguard Worker                 (const __m128i *)(const void *)&(k_ShufMask_Swap2[0]));
700*f6dc9357SAndroid Build Coastguard Worker           else
701*f6dc9357SAndroid Build Coastguard Worker         #endif
702*f6dc9357SAndroid Build Coastguard Worker       #endif  // MY_CPU_X86_OR_AMD64
703*f6dc9357SAndroid Build Coastguard Worker       #if SWAP2_DEFAULT_MODE == 0
704*f6dc9357SAndroid Build Coastguard Worker           if (g_SwapBytes_Mode != 0)
705*f6dc9357SAndroid Build Coastguard Worker             SwapBytes2_128(items, lim);
706*f6dc9357SAndroid Build Coastguard Worker           else
707*f6dc9357SAndroid Build Coastguard Worker       #endif
708*f6dc9357SAndroid Build Coastguard Worker      #endif // FORCE_SWAP_MODE
709*f6dc9357SAndroid Build Coastguard Worker             DEFAULT_Swap2(items, lim);
710*f6dc9357SAndroid Build Coastguard Worker     }
711*f6dc9357SAndroid Build Coastguard Worker     items = lim;
712*f6dc9357SAndroid Build Coastguard Worker   }
713*f6dc9357SAndroid Build Coastguard Worker   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
714*f6dc9357SAndroid Build Coastguard Worker   for (; numItems != 0; numItems--)
715*f6dc9357SAndroid Build Coastguard Worker   {
716*f6dc9357SAndroid Build Coastguard Worker     SWAP2_16(0)
717*f6dc9357SAndroid Build Coastguard Worker     items++;
718*f6dc9357SAndroid Build Coastguard Worker   }
719*f6dc9357SAndroid Build Coastguard Worker }
720*f6dc9357SAndroid Build Coastguard Worker 
721*f6dc9357SAndroid Build Coastguard Worker 
722*f6dc9357SAndroid Build Coastguard Worker Z7_NO_INLINE
z7_SwapBytes4(CSwapUInt32 * items,size_t numItems)723*f6dc9357SAndroid Build Coastguard Worker void z7_SwapBytes4(CSwapUInt32 *items, size_t numItems)
724*f6dc9357SAndroid Build Coastguard Worker {
725*f6dc9357SAndroid Build Coastguard Worker   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
726*f6dc9357SAndroid Build Coastguard Worker   for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (SWAP_VECTOR_ALIGN_SIZE - 1)) != 0; numItems--)
727*f6dc9357SAndroid Build Coastguard Worker   {
728*f6dc9357SAndroid Build Coastguard Worker     SWAP4_32(0)
729*f6dc9357SAndroid Build Coastguard Worker     items++;
730*f6dc9357SAndroid Build Coastguard Worker   }
731*f6dc9357SAndroid Build Coastguard Worker   {
732*f6dc9357SAndroid Build Coastguard Worker     const size_t k_Align_Mask = SWAP_ITERATION_BLOCK_SIZE_MAX / sizeof(CSwapUInt32) - 1;
733*f6dc9357SAndroid Build Coastguard Worker     size_t numItems2 = numItems;
734*f6dc9357SAndroid Build Coastguard Worker     CSwapUInt32 *lim;
735*f6dc9357SAndroid Build Coastguard Worker     numItems &= k_Align_Mask;
736*f6dc9357SAndroid Build Coastguard Worker     numItems2 &= ~(size_t)k_Align_Mask;
737*f6dc9357SAndroid Build Coastguard Worker     lim = items + numItems2;
738*f6dc9357SAndroid Build Coastguard Worker     if (numItems2 != 0)
739*f6dc9357SAndroid Build Coastguard Worker     {
740*f6dc9357SAndroid Build Coastguard Worker      #if !defined(FORCE_SWAP_MODE)
741*f6dc9357SAndroid Build Coastguard Worker       #ifdef MY_CPU_X86_OR_AMD64
742*f6dc9357SAndroid Build Coastguard Worker         #ifdef USE_SWAP_AVX2
743*f6dc9357SAndroid Build Coastguard Worker           if (g_SwapBytes_Mode > k_SwapBytes_Mode_SSSE3)
744*f6dc9357SAndroid Build Coastguard Worker             ShufBytes_256((__m256i *)(void *)items,
745*f6dc9357SAndroid Build Coastguard Worker                 (const __m256i *)(const void *)lim,
746*f6dc9357SAndroid Build Coastguard Worker                 (const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
747*f6dc9357SAndroid Build Coastguard Worker           else
748*f6dc9357SAndroid Build Coastguard Worker         #endif
749*f6dc9357SAndroid Build Coastguard Worker         #ifdef USE_SWAP_SSSE3
750*f6dc9357SAndroid Build Coastguard Worker           if (g_SwapBytes_Mode >= k_SwapBytes_Mode_SSSE3)
751*f6dc9357SAndroid Build Coastguard Worker             ShufBytes_128((__m128i *)(void *)items,
752*f6dc9357SAndroid Build Coastguard Worker                 (const __m128i *)(const void *)lim,
753*f6dc9357SAndroid Build Coastguard Worker                 (const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
754*f6dc9357SAndroid Build Coastguard Worker           else
755*f6dc9357SAndroid Build Coastguard Worker         #endif
756*f6dc9357SAndroid Build Coastguard Worker       #else  // MY_CPU_X86_OR_AMD64
757*f6dc9357SAndroid Build Coastguard Worker 
758*f6dc9357SAndroid Build Coastguard Worker           if (g_SwapBytes_Mode != 0)
759*f6dc9357SAndroid Build Coastguard Worker             SwapBytes4_128(items, lim);
760*f6dc9357SAndroid Build Coastguard Worker           else
761*f6dc9357SAndroid Build Coastguard Worker       #endif  // MY_CPU_X86_OR_AMD64
762*f6dc9357SAndroid Build Coastguard Worker      #endif // FORCE_SWAP_MODE
763*f6dc9357SAndroid Build Coastguard Worker             DEFAULT_Swap4(items, lim);
764*f6dc9357SAndroid Build Coastguard Worker     }
765*f6dc9357SAndroid Build Coastguard Worker     items = lim;
766*f6dc9357SAndroid Build Coastguard Worker   }
767*f6dc9357SAndroid Build Coastguard Worker   Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
768*f6dc9357SAndroid Build Coastguard Worker   for (; numItems != 0; numItems--)
769*f6dc9357SAndroid Build Coastguard Worker   {
770*f6dc9357SAndroid Build Coastguard Worker     SWAP4_32(0)
771*f6dc9357SAndroid Build Coastguard Worker     items++;
772*f6dc9357SAndroid Build Coastguard Worker   }
773*f6dc9357SAndroid Build Coastguard Worker }
774*f6dc9357SAndroid Build Coastguard Worker 
775*f6dc9357SAndroid Build Coastguard Worker 
// #define SHOW_HW_STATUS

/* PRF(x): debug-output wrapper. It expands to its argument only if
   SHOW_HW_STATUS is defined; otherwise it expands to nothing. */
#ifdef SHOW_HW_STATUS
#include <stdio.h>
#define PRF(x) x
#else
#define PRF(x)
#endif
784*f6dc9357SAndroid Build Coastguard Worker 
z7_SwapBytesPrepare(void)785*f6dc9357SAndroid Build Coastguard Worker void z7_SwapBytesPrepare(void)
786*f6dc9357SAndroid Build Coastguard Worker {
787*f6dc9357SAndroid Build Coastguard Worker #ifndef FORCE_SWAP_MODE
788*f6dc9357SAndroid Build Coastguard Worker   unsigned mode = 0; // k_SwapBytes_Mode_BASE;
789*f6dc9357SAndroid Build Coastguard Worker 
790*f6dc9357SAndroid Build Coastguard Worker #ifdef MY_CPU_ARM_OR_ARM64
791*f6dc9357SAndroid Build Coastguard Worker   {
792*f6dc9357SAndroid Build Coastguard Worker     if (CPU_IsSupported_NEON())
793*f6dc9357SAndroid Build Coastguard Worker     {
794*f6dc9357SAndroid Build Coastguard Worker       // #pragma message ("=== SwapBytes NEON")
795*f6dc9357SAndroid Build Coastguard Worker       PRF(printf("\n=== SwapBytes NEON\n");)
796*f6dc9357SAndroid Build Coastguard Worker       mode = k_SwapBytes_Mode_NEON;
797*f6dc9357SAndroid Build Coastguard Worker     }
798*f6dc9357SAndroid Build Coastguard Worker   }
799*f6dc9357SAndroid Build Coastguard Worker #else // MY_CPU_ARM_OR_ARM64
800*f6dc9357SAndroid Build Coastguard Worker   {
801*f6dc9357SAndroid Build Coastguard Worker     #ifdef USE_SWAP_AVX2
802*f6dc9357SAndroid Build Coastguard Worker       if (CPU_IsSupported_AVX2())
803*f6dc9357SAndroid Build Coastguard Worker       {
804*f6dc9357SAndroid Build Coastguard Worker         // #pragma message ("=== SwapBytes AVX2")
805*f6dc9357SAndroid Build Coastguard Worker         PRF(printf("\n=== SwapBytes AVX2\n");)
806*f6dc9357SAndroid Build Coastguard Worker         mode = k_SwapBytes_Mode_AVX2;
807*f6dc9357SAndroid Build Coastguard Worker       }
808*f6dc9357SAndroid Build Coastguard Worker       else
809*f6dc9357SAndroid Build Coastguard Worker     #endif
810*f6dc9357SAndroid Build Coastguard Worker     #ifdef USE_SWAP_SSSE3
811*f6dc9357SAndroid Build Coastguard Worker       if (CPU_IsSupported_SSSE3())
812*f6dc9357SAndroid Build Coastguard Worker       {
813*f6dc9357SAndroid Build Coastguard Worker         // #pragma message ("=== SwapBytes SSSE3")
814*f6dc9357SAndroid Build Coastguard Worker         PRF(printf("\n=== SwapBytes SSSE3\n");)
815*f6dc9357SAndroid Build Coastguard Worker         mode = k_SwapBytes_Mode_SSSE3;
816*f6dc9357SAndroid Build Coastguard Worker       }
817*f6dc9357SAndroid Build Coastguard Worker       else
818*f6dc9357SAndroid Build Coastguard Worker     #endif
819*f6dc9357SAndroid Build Coastguard Worker     #if !defined(MY_CPU_AMD64)
820*f6dc9357SAndroid Build Coastguard Worker       if (CPU_IsSupported_SSE2())
821*f6dc9357SAndroid Build Coastguard Worker     #endif
822*f6dc9357SAndroid Build Coastguard Worker       {
823*f6dc9357SAndroid Build Coastguard Worker         // #pragma message ("=== SwapBytes SSE2")
824*f6dc9357SAndroid Build Coastguard Worker         PRF(printf("\n=== SwapBytes SSE2\n");)
825*f6dc9357SAndroid Build Coastguard Worker         mode = k_SwapBytes_Mode_SSE2;
826*f6dc9357SAndroid Build Coastguard Worker       }
827*f6dc9357SAndroid Build Coastguard Worker   }
828*f6dc9357SAndroid Build Coastguard Worker #endif // MY_CPU_ARM_OR_ARM64
829*f6dc9357SAndroid Build Coastguard Worker   g_SwapBytes_Mode = mode;
830*f6dc9357SAndroid Build Coastguard Worker   // g_SwapBytes_Mode = 0; // for debug
831*f6dc9357SAndroid Build Coastguard Worker #endif // FORCE_SWAP_MODE
832*f6dc9357SAndroid Build Coastguard Worker   PRF(printf("\n=== SwapBytesPrepare\n");)
833*f6dc9357SAndroid Build Coastguard Worker }
834*f6dc9357SAndroid Build Coastguard Worker 
835*f6dc9357SAndroid Build Coastguard Worker #undef PRF
836