1 /* SwapBytes.c -- Byte Swap conversion filter
2 2024-03-01 : Igor Pavlov : Public domain */
3
4 #include "Precomp.h"
5
6 #include "Compiler.h"
7 #include "CpuArch.h"
8 #include "RotateDefs.h"
9 #include "SwapBytes.h"
10
11 typedef UInt16 CSwapUInt16;
12 typedef UInt32 CSwapUInt32;
13
14 // #define k_SwapBytes_Mode_BASE 0
15
16 #ifdef MY_CPU_X86_OR_AMD64
17
18 #define k_SwapBytes_Mode_SSE2 1
19 #define k_SwapBytes_Mode_SSSE3 2
20 #define k_SwapBytes_Mode_AVX2 3
21
22 // #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900)
23 #if defined(__clang__) && (__clang_major__ >= 4) \
24 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40701)
25 #define k_SwapBytes_Mode_MAX k_SwapBytes_Mode_AVX2
26 #define SWAP_ATTRIB_SSE2 __attribute__((__target__("sse2")))
27 #define SWAP_ATTRIB_SSSE3 __attribute__((__target__("ssse3")))
28 #define SWAP_ATTRIB_AVX2 __attribute__((__target__("avx2")))
29 #elif defined(_MSC_VER)
30 #if (_MSC_VER == 1900)
31 #pragma warning(disable : 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
32 #endif
33 #if (_MSC_VER >= 1900)
34 #define k_SwapBytes_Mode_MAX k_SwapBytes_Mode_AVX2
35 #elif (_MSC_VER >= 1500) // (VS2008)
36 #define k_SwapBytes_Mode_MAX k_SwapBytes_Mode_SSSE3
37 #elif (_MSC_VER >= 1310) // (VS2003)
38 #define k_SwapBytes_Mode_MAX k_SwapBytes_Mode_SSE2
39 #endif
40 #endif // _MSC_VER
41
42 /*
43 // for debug
44 #ifdef k_SwapBytes_Mode_MAX
45 #undef k_SwapBytes_Mode_MAX
46 #endif
47 */
48
49 #ifndef k_SwapBytes_Mode_MAX
50 #define k_SwapBytes_Mode_MAX 0
51 #endif
52
53 #if (k_SwapBytes_Mode_MAX != 0) && defined(MY_CPU_AMD64)
54 #define k_SwapBytes_Mode_MIN k_SwapBytes_Mode_SSE2
55 #else
56 #define k_SwapBytes_Mode_MIN 0
57 #endif
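/* note (illustrative): on x86-64 (MY_CPU_AMD64) SSE2 is part of the baseline ABI,
   so the minimum runtime mode is SSE2 and the vectorized body below never needs
   to fall back to a scalar-only path on that target. */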
58
59 #if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_AVX2)
60 #define USE_SWAP_AVX2
61 #endif
62 #if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_SSSE3)
63 #define USE_SWAP_SSSE3
64 #endif
65 #if (k_SwapBytes_Mode_MAX >= k_SwapBytes_Mode_SSE2)
66 #define USE_SWAP_128
67 #endif
68
69 #if k_SwapBytes_Mode_MAX <= k_SwapBytes_Mode_MIN || !defined(USE_SWAP_128)
70 #define FORCE_SWAP_MODE
71 #endif
72
73
74 #ifdef USE_SWAP_128
75 /*
76 <mmintrin.h> MMX
77 <xmmintrin.h> SSE
78 <emmintrin.h> SSE2
79 <pmmintrin.h> SSE3
80 <tmmintrin.h> SSSE3
81 <smmintrin.h> SSE4.1
82 <nmmintrin.h> SSE4.2
83 <ammintrin.h> SSE4A
84 <wmmintrin.h> AES
85 <immintrin.h> AVX, AVX2, FMA
86 */
87
88 #include <emmintrin.h> // sse2
89 // typedef __m128i v128;
90
91 #define SWAP2_128(i) { \
92 const __m128i v = *(const __m128i *)(const void *)(items + (i) * 8); \
93 *( __m128i *)( void *)(items + (i) * 8) = \
94 _mm_or_si128( \
95 _mm_slli_epi16(v, 8), \
96 _mm_srli_epi16(v, 8)); }
97 // _mm_or_si128() can execute on more ports than _mm_add_epi16().
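/* illustrative note (not used by this file): per 16-bit lane the SWAP2_128 macro
   computes the same result as the scalar expression
     x = (CSwapUInt16)((x << 8) | (x >> 8));
   but for 8 lanes at once in a single 128-bit register. */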
98
99 static
100 #ifdef SWAP_ATTRIB_SSE2
101 SWAP_ATTRIB_SSE2
102 #endif
103 void
104 Z7_FASTCALL
105 SwapBytes2_128(CSwapUInt16 *items, const CSwapUInt16 *lim)
106 {
107 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
108 do
109 {
110 SWAP2_128(0) SWAP2_128(1) items += 2 * 8;
111 SWAP2_128(0) SWAP2_128(1) items += 2 * 8;
112 }
113 while (items != lim);
114 }
115
116 /*
117 // sse2
118 #define SWAP4_128_pack(i) { \
119 __m128i v = *(const __m128i *)(const void *)(items + (i) * 4); \
120 __m128i v0 = _mm_unpacklo_epi8(v, mask); \
121 __m128i v1 = _mm_unpackhi_epi8(v, mask); \
122 v0 = _mm_shufflelo_epi16(v0, 0x1b); \
123 v1 = _mm_shufflelo_epi16(v1, 0x1b); \
124 v0 = _mm_shufflehi_epi16(v0, 0x1b); \
125 v1 = _mm_shufflehi_epi16(v1, 0x1b); \
126 *(__m128i *)(void *)(items + (i) * 4) = _mm_packus_epi16(v0, v1); }
127
128 static
129 #ifdef SWAP_ATTRIB_SSE2
130 SWAP_ATTRIB_SSE2
131 #endif
132 void
133 Z7_FASTCALL
134 SwapBytes4_128_pack(CSwapUInt32 *items, const CSwapUInt32 *lim)
135 {
136 const __m128i mask = _mm_setzero_si128();
137 // const __m128i mask = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0);
138 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
139 do
140 {
141 SWAP4_128_pack(0); items += 1 * 4;
142 // SWAP4_128_pack(0); SWAP4_128_pack(1); items += 2 * 4;
143 }
144 while (items != lim);
145 }
146
147 // sse2
148 #define SWAP4_128_shift(i) { \
149 __m128i v = *(const __m128i *)(const void *)(items + (i) * 4); \
150 __m128i v2; \
151 v2 = _mm_or_si128( \
152 _mm_slli_si128(_mm_and_si128(v, mask), 1), \
153 _mm_and_si128(_mm_srli_si128(v, 1), mask)); \
154 v = _mm_or_si128( \
155 _mm_slli_epi32(v, 24), \
156 _mm_srli_epi32(v, 24)); \
157 *(__m128i *)(void *)(items + (i) * 4) = _mm_or_si128(v2, v); }
158
159 static
160 #ifdef SWAP_ATTRIB_SSE2
161 SWAP_ATTRIB_SSE2
162 #endif
163 void
164 Z7_FASTCALL
165 SwapBytes4_128_shift(CSwapUInt32 *items, const CSwapUInt32 *lim)
166 {
167 #define M1 0xff00
168 const __m128i mask = _mm_set_epi32(M1, M1, M1, M1);
169 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
170 do
171 {
172 // SWAP4_128_shift(0) SWAP4_128_shift(1) items += 2 * 4;
173 // SWAP4_128_shift(0) SWAP4_128_shift(1) items += 2 * 4;
174 SWAP4_128_shift(0); items += 1 * 4;
175 }
176 while (items != lim);
177 }
178 */
179
180
181 #if defined(USE_SWAP_SSSE3) || defined(USE_SWAP_AVX2)
182
183 #define SWAP_SHUF_REV_SEQ_2_VALS(v) (v)+1, (v)
184 #define SWAP_SHUF_REV_SEQ_4_VALS(v) (v)+3, (v)+2, (v)+1, (v)
185
186 #define SWAP2_SHUF_MASK_16_BYTES \
187 SWAP_SHUF_REV_SEQ_2_VALS (0 * 2), \
188 SWAP_SHUF_REV_SEQ_2_VALS (1 * 2), \
189 SWAP_SHUF_REV_SEQ_2_VALS (2 * 2), \
190 SWAP_SHUF_REV_SEQ_2_VALS (3 * 2), \
191 SWAP_SHUF_REV_SEQ_2_VALS (4 * 2), \
192 SWAP_SHUF_REV_SEQ_2_VALS (5 * 2), \
193 SWAP_SHUF_REV_SEQ_2_VALS (6 * 2), \
194 SWAP_SHUF_REV_SEQ_2_VALS (7 * 2)
195
196 #define SWAP4_SHUF_MASK_16_BYTES \
197 SWAP_SHUF_REV_SEQ_4_VALS (0 * 4), \
198 SWAP_SHUF_REV_SEQ_4_VALS (1 * 4), \
199 SWAP_SHUF_REV_SEQ_4_VALS (2 * 4), \
200 SWAP_SHUF_REV_SEQ_4_VALS (3 * 4)
201
202 #if defined(USE_SWAP_AVX2)
203 /* if we use 256_BIT_INIT_MASK, each static mask array becomes 16 bytes larger */
204 // #define SWAP_USE_256_BIT_INIT_MASK
205 #endif
206
207 #if defined(SWAP_USE_256_BIT_INIT_MASK) && defined(USE_SWAP_AVX2)
208 #define SWAP_MASK_INIT_SIZE 32
209 #else
210 #define SWAP_MASK_INIT_SIZE 16
211 #endif
212
213 MY_ALIGN(SWAP_MASK_INIT_SIZE)
214 static const Byte k_ShufMask_Swap2[] =
215 {
216 SWAP2_SHUF_MASK_16_BYTES
217 #if SWAP_MASK_INIT_SIZE > 16
218 , SWAP2_SHUF_MASK_16_BYTES
219 #endif
220 };
221
222 MY_ALIGN(SWAP_MASK_INIT_SIZE)
223 static const Byte k_ShufMask_Swap4[] =
224 {
225 SWAP4_SHUF_MASK_16_BYTES
226 #if SWAP_MASK_INIT_SIZE > 16
227 , SWAP4_SHUF_MASK_16_BYTES
228 #endif
229 };
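/* for reference (illustrative expansion, not an extra definition), the macros
   above produce these 16 shuffle indices:
     k_ShufMask_Swap2: 1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14
     k_ShufMask_Swap4: 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
   so pshufb reverses the bytes inside each 2-byte or 4-byte element. */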
230
231
232 #ifdef USE_SWAP_SSSE3
233
234 #include <tmmintrin.h> // ssse3
235
236 #define SHUF_128(i) *(items + (i)) = \
237 _mm_shuffle_epi8(*(items + (i)), mask); // SSSE3
238
239 // Z7_NO_INLINE
240 static
241 #ifdef SWAP_ATTRIB_SSSE3
242 SWAP_ATTRIB_SSSE3
243 #endif
244 Z7_ATTRIB_NO_VECTORIZE
245 void
246 Z7_FASTCALL
247 ShufBytes_128(void *items8, const void *lim8, const void *mask128_ptr)
248 {
249 __m128i *items = (__m128i *)items8;
250 const __m128i *lim = (const __m128i *)lim8;
251 // const __m128i mask = _mm_set_epi8(SHUF_SWAP2_MASK_16_VALS);
252 // const __m128i mask = _mm_set_epi8(SHUF_SWAP4_MASK_16_VALS);
253 // const __m128i mask = _mm_load_si128((const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
254 // const __m128i mask = _mm_load_si128((const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
255 // const __m128i mask = *(const __m128i *)(const void *)&(k_ShufMask_Swap4[0]);
256 const __m128i mask = *(const __m128i *)mask128_ptr;
257 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
258 do
259 {
260 SHUF_128(0) SHUF_128(1) items += 2;
261 SHUF_128(0) SHUF_128(1) items += 2;
262 }
263 while (items != lim);
264 }
265
266 #endif // USE_SWAP_SSSE3
267
268
269
270 #ifdef USE_SWAP_AVX2
271
272 #include <immintrin.h> // avx, avx2
273 #if defined(__clang__)
274 #include <avxintrin.h>
275 #include <avx2intrin.h>
276 #endif
277
278 #define SHUF_256(i) *(items + (i)) = \
279 _mm256_shuffle_epi8(*(items + (i)), mask); // AVX2
280
281 // Z7_NO_INLINE
282 static
283 #ifdef SWAP_ATTRIB_AVX2
284 SWAP_ATTRIB_AVX2
285 #endif
286 Z7_ATTRIB_NO_VECTORIZE
287 void
288 Z7_FASTCALL
289 ShufBytes_256(void *items8, const void *lim8, const void *mask128_ptr)
290 {
291 __m256i *items = (__m256i *)items8;
292 const __m256i *lim = (const __m256i *)lim8;
293 /*
294 UNUSED_VAR(mask128_ptr)
295 __m256i mask =
296 for Swap4: _mm256_setr_epi8(SWAP4_SHUF_MASK_16_BYTES, SWAP4_SHUF_MASK_16_BYTES);
297 for Swap2: _mm256_setr_epi8(SWAP2_SHUF_MASK_16_BYTES, SWAP2_SHUF_MASK_16_BYTES);
298 */
299 const __m256i mask =
300 #if SWAP_MASK_INIT_SIZE > 16
301 *(const __m256i *)(const void *)mask128_ptr;
302 #else
303 /* msvc: the broadcastsi128() version reserves stack space for no reason
304    msvc 19.29-: the _mm256_insertf128_si256() / _mm256_set_m128i() versions use non-avx movdqu xmm0,XMMWORD PTR [r8]
305    msvc 19.30+ (VS2022): replaces _mm256_set_m128i(m,m) with vbroadcastf128(m), as we want
306 */
307 // _mm256_broadcastsi128_si256(*mask128_ptr);
308 #if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 80000)
309 #define MY_mm256_set_m128i(hi, lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)
310 #else
311 #define MY_mm256_set_m128i _mm256_set_m128i
312 #endif
313 MY_mm256_set_m128i(
314 *(const __m128i *)mask128_ptr,
315 *(const __m128i *)mask128_ptr);
316 #endif
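/* note (illustrative): MY_mm256_set_m128i(m, m) above simply duplicates the 16-byte
   shuffle mask into both 128-bit lanes of the 256-bit register, because
   _mm256_shuffle_epi8() shuffles bytes independently within each 128-bit lane. */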
317
318 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
319 do
320 {
321 SHUF_256(0) SHUF_256(1) items += 2;
322 SHUF_256(0) SHUF_256(1) items += 2;
323 }
324 while (items != lim);
325 }
326
327 #endif // USE_SWAP_AVX2
328 #endif // USE_SWAP_SSSE3 || USE_SWAP_AVX2
329 #endif // USE_SWAP_128
330
331
332
333 // compiler message: "NEON intrinsics not available with the soft-float ABI"
334 #elif defined(MY_CPU_ARM_OR_ARM64) \
335 && defined(MY_CPU_LE) \
336 && !defined(Z7_DISABLE_ARM_NEON)
337
338 #if defined(__clang__) && (__clang_major__ >= 8) \
339 || defined(__GNUC__) && (__GNUC__ >= 6)
340 #if defined(__ARM_FP)
341 #if (defined(__ARM_ARCH) && (__ARM_ARCH >= 4)) \
342 || defined(MY_CPU_ARM64)
343 #if defined(MY_CPU_ARM64) \
344 || !defined(Z7_CLANG_VERSION) \
345 || defined(__ARM_NEON)
346 #define USE_SWAP_128
347 #ifdef MY_CPU_ARM64
348 // #define SWAP_ATTRIB_NEON __attribute__((__target__("")))
349 #else
350 #if defined(Z7_CLANG_VERSION)
351 // #define SWAP_ATTRIB_NEON __attribute__((__target__("neon")))
352 #else
353 // #pragma message("SWAP_ATTRIB_NEON __attribute__((__target__(fpu=neon))")
354 #define SWAP_ATTRIB_NEON __attribute__((__target__("fpu=neon")))
355 #endif
356 #endif // MY_CPU_ARM64
357 #endif // __ARM_NEON
358 #endif // __ARM_ARCH
359 #endif // __ARM_FP
360
361 #elif defined(_MSC_VER)
362 #if (_MSC_VER >= 1910)
363 #define USE_SWAP_128
364 #endif
365 #endif
366
367 #ifdef USE_SWAP_128
368 #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
369 #include <arm64_neon.h>
370 #else
371
372 /*
373 #if !defined(__ARM_NEON)
374 #if defined(Z7_GCC_VERSION) && (__GNUC__ < 5) \
375 || defined(Z7_GCC_VERSION) && (__GNUC__ == 5) && (Z7_GCC_VERSION < 90201) \
376 || defined(Z7_GCC_VERSION) && (__GNUC__ == 5) && (Z7_GCC_VERSION < 100100)
377 Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
378 #pragma message("#define __ARM_NEON 1")
379 // #define __ARM_NEON 1
380 Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
381 #endif
382 #endif
383 */
384 #include <arm_neon.h>
385 #endif
386 #endif
387
388 #ifndef USE_SWAP_128
389 #define FORCE_SWAP_MODE
390 #else
391
392 #ifdef MY_CPU_ARM64
393 // for debug: comment out the following define
394 #define FORCE_SWAP_MODE
395 #else
396 #define k_SwapBytes_Mode_NEON 1
397 #endif
398 // typedef uint8x16_t v128;
399 #define SWAP2_128(i) *(uint8x16_t *) (void *)(items + (i) * 8) = \
400 vrev16q_u8(*(const uint8x16_t *)(const void *)(items + (i) * 8));
401 #define SWAP4_128(i) *(uint8x16_t *) (void *)(items + (i) * 4) = \
402 vrev32q_u8(*(const uint8x16_t *)(const void *)(items + (i) * 4));
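/* note (illustrative): vrev16q_u8() / vrev32q_u8() reverse the byte order inside
   each 16-bit / 32-bit element of a 128-bit vector, so one NEON instruction
   swaps 8 or 4 items at a time. */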
403
404 // Z7_NO_INLINE
405 static
406 #ifdef SWAP_ATTRIB_NEON
407 SWAP_ATTRIB_NEON
408 #endif
409 Z7_ATTRIB_NO_VECTORIZE
410 void
411 Z7_FASTCALL
412 SwapBytes2_128(CSwapUInt16 *items, const CSwapUInt16 *lim)
413 {
414 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
415 do
416 {
417 SWAP2_128(0) SWAP2_128(1) items += 2 * 8;
418 SWAP2_128(0) SWAP2_128(1) items += 2 * 8;
419 }
420 while (items != lim);
421 }
422
423 // Z7_NO_INLINE
424 static
425 #ifdef SWAP_ATTRIB_NEON
426 SWAP_ATTRIB_NEON
427 #endif
428 Z7_ATTRIB_NO_VECTORIZE
429 void
430 Z7_FASTCALL
431 SwapBytes4_128(CSwapUInt32 *items, const CSwapUInt32 *lim)
432 {
433 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
434 do
435 {
436 SWAP4_128(0) SWAP4_128(1) items += 2 * 4;
437 SWAP4_128(0) SWAP4_128(1) items += 2 * 4;
438 }
439 while (items != lim);
440 }
441
442 #endif // USE_SWAP_128
443
444 #else // MY_CPU_ARM_OR_ARM64
445 #define FORCE_SWAP_MODE
446 #endif // MY_CPU_ARM_OR_ARM64
447
448
449
450
451
452
453 #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_X86)
454 /* _byteswap_ushort() in MSVC x86 32-bit works via slow { mov dh, al; mov dl, ah }
455 So we use our own versions of the byteswap function */
456 #if (_MSC_VER < 1400 ) // old MSVC-X86 without _rotr16() support
457 #define SWAP2_16(i) { UInt32 v = items[i]; v += (v << 16); v >>= 8; items[i] = (CSwapUInt16)v; }
458 #else // is new MSVC-X86 with fast _rotr16()
459 #include <intrin.h>
460 #define SWAP2_16(i) { items[i] = _rotr16(items[i], 8); }
461 #endif
462 #else // is not MSVC-X86
463 #define SWAP2_16(i) { CSwapUInt16 v = items[i]; items[i] = Z7_BSWAP16(v); }
464 #endif // MSVC-X86
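/* worked example (illustrative): for items[i] == 0x1234 every SWAP2_16 variant
   above yields 0x3412:
     _rotr16(0x1234, 8)                   -> 0x3412
     v = 0x1234; v += v << 16; v >>= 8;   -> 0x00123412, (CSwapUInt16)v == 0x3412
     Z7_BSWAP16(0x1234)                   -> 0x3412 */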
465
466 #if defined(Z7_CPU_FAST_BSWAP_SUPPORTED)
467 #define SWAP4_32(i) { CSwapUInt32 v = items[i]; items[i] = Z7_BSWAP32(v); }
468 #else
469 #define SWAP4_32(i) \
470 { UInt32 v = items[i]; \
471 v = ((v & 0xff00ff) << 8) + ((v >> 8) & 0xff00ff); \
472 v = rotlFixed(v, 16); \
473 items[i] = v; }
474 #endif
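/* worked example (illustrative): for items[i] == 0x11223344 the non-BSWAP
   SWAP4_32 branch computes
     ((v & 0xff00ff) << 8) + ((v >> 8) & 0xff00ff)  -> 0x22114433
     rotlFixed(0x22114433, 16)                      -> 0x44332211
   which equals Z7_BSWAP32(0x11223344). */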
475
476
477
478
479 #if defined(FORCE_SWAP_MODE) && defined(USE_SWAP_128)
480 #define DEFAULT_Swap2 SwapBytes2_128
481 #if !defined(MY_CPU_X86_OR_AMD64)
482 #define DEFAULT_Swap4 SwapBytes4_128
483 #endif
484 #endif
485
486 #if !defined(DEFAULT_Swap2) || !defined(DEFAULT_Swap4)
487
488 #define SWAP_BASE_FUNCS_PREFIXES \
489 Z7_FORCE_INLINE \
490 static \
491 Z7_ATTRIB_NO_VECTOR \
492 void Z7_FASTCALL
493
494
495 #if defined(MY_CPU_ARM_OR_ARM64)
496 #if defined(__clang__)
497 #pragma GCC diagnostic ignored "-Wlanguage-extension-token"
498 #endif
499 #endif
500
501
502 #ifdef MY_CPU_64BIT
503
504 #if defined(MY_CPU_ARM64) \
505 && defined(__ARM_ARCH) && (__ARM_ARCH >= 8) \
506 && ( (defined(__GNUC__) && (__GNUC__ >= 4)) \
507 || (defined(__clang__) && (__clang_major__ >= 4)))
508
509 #define SWAP2_64_VAR(v) asm ("rev16 %x0,%x0" : "+r" (v));
510 #define SWAP4_64_VAR(v) asm ("rev32 %x0,%x0" : "+r" (v));
511
512 #else // is not ARM64-GNU
513
514 #if !defined(MY_CPU_X86_OR_AMD64) || (k_SwapBytes_Mode_MIN == 0) || !defined(USE_SWAP_128)
515 #define SWAP2_64_VAR(v) \
516 v = ( 0x00ff00ff00ff00ff & (v >> 8)) \
517 + ((0x00ff00ff00ff00ff & v) << 8);
518 /* '+' (instead of '|') gives faster code in MSVC */
519 #endif
520
521 #ifdef Z7_CPU_FAST_BSWAP_SUPPORTED
522 #define SWAP4_64_VAR(v) \
523 v = Z7_BSWAP64(v); \
524 v = Z7_ROTL64(v, 32);
525 #else
526 #define SWAP4_64_VAR(v) \
527 v = ( 0x000000ff000000ff & (v >> 24)) \
528 + ((0x000000ff000000ff & v) << 24 ) \
529 + ( 0x0000ff000000ff00 & (v >> 8)) \
530 + ((0x0000ff000000ff00 & v) << 8 ) \
531 ;
532 #endif
533
534 #endif // ARM64-GNU
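/* worked example (illustrative) for v == 0x1122334455667788:
     SWAP2_64_VAR(v): swaps the bytes inside each 16-bit lane -> 0x2211443366558877
     SWAP4_64_VAR(v): swaps the bytes inside each 32-bit lane -> 0x4433221188776655
   (all variants of these macros above produce the same results) */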
535
536
537 #ifdef SWAP2_64_VAR
538
539 #define SWAP2_64(i) { \
540 UInt64 v = *(const UInt64 *)(const void *)(items + (i) * 4); \
541 SWAP2_64_VAR(v) \
542 *(UInt64 *)(void *)(items + (i) * 4) = v; }
543
544 SWAP_BASE_FUNCS_PREFIXES
545 SwapBytes2_64(CSwapUInt16 *items, const CSwapUInt16 *lim)
546 {
547 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
548 do
549 {
550 SWAP2_64(0) SWAP2_64(1) items += 2 * 4;
551 SWAP2_64(0) SWAP2_64(1) items += 2 * 4;
552 }
553 while (items != lim);
554 }
555
556 #define DEFAULT_Swap2 SwapBytes2_64
557 #if !defined(FORCE_SWAP_MODE)
558 #define SWAP2_DEFAULT_MODE 0
559 #endif
560 #else // !defined(SWAP2_64_VAR)
561 #define DEFAULT_Swap2 SwapBytes2_128
562 #if !defined(FORCE_SWAP_MODE)
563 #define SWAP2_DEFAULT_MODE 1
564 #endif
565 #endif // SWAP2_64_VAR
566
567
568 #define SWAP4_64(i) { \
569 UInt64 v = *(const UInt64 *)(const void *)(items + (i) * 2); \
570 SWAP4_64_VAR(v) \
571 *(UInt64 *)(void *)(items + (i) * 2) = v; }
572
573 SWAP_BASE_FUNCS_PREFIXES
574 SwapBytes4_64(CSwapUInt32 *items, const CSwapUInt32 *lim)
575 {
576 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
577 do
578 {
579 SWAP4_64(0) SWAP4_64(1) items += 2 * 2;
580 SWAP4_64(0) SWAP4_64(1) items += 2 * 2;
581 }
582 while (items != lim);
583 }
584
585 #define DEFAULT_Swap4 SwapBytes4_64
586
587 #else // is not 64BIT
588
589
590 #if defined(MY_CPU_ARM_OR_ARM64) \
591 && defined(__ARM_ARCH) && (__ARM_ARCH >= 6) \
592 && ( (defined(__GNUC__) && (__GNUC__ >= 4)) \
593 || (defined(__clang__) && (__clang_major__ >= 4)))
594
595 #ifdef MY_CPU_64BIT
596 #define SWAP2_32_VAR(v) asm ("rev16 %w0,%w0" : "+r" (v));
597 #else
598 #define SWAP2_32_VAR(v) asm ("rev16 %0,%0" : "+r" (v)); // for clang/gcc
599 // asm ("rev16 %r0,%r0" : "+r" (a)); // for gcc
600 #endif
601
602 #elif defined(_MSC_VER) && (_MSC_VER < 1300) && defined(MY_CPU_X86) \
603 || !defined(Z7_CPU_FAST_BSWAP_SUPPORTED) \
604 || !defined(Z7_CPU_FAST_ROTATE_SUPPORTED)
605 // old msvc doesn't support _byteswap_ulong()
606 #define SWAP2_32_VAR(v) \
607 v = ((v & 0xff00ff) << 8) + ((v >> 8) & 0xff00ff);
608
609 #else // is not ARM and is not old-MSVC-X86 and fast BSWAP/ROTATE are supported
610 #define SWAP2_32_VAR(v) \
611 v = Z7_BSWAP32(v); \
612 v = rotlFixed(v, 16);
613
614 #endif // GNU-ARM*
615
616 #define SWAP2_32(i) { \
617 UInt32 v = *(const UInt32 *)(const void *)(items + (i) * 2); \
618 SWAP2_32_VAR(v); \
619 *(UInt32 *)(void *)(items + (i) * 2) = v; }
620
621
622 SWAP_BASE_FUNCS_PREFIXES
623 SwapBytes2_32(CSwapUInt16 *items, const CSwapUInt16 *lim)
624 {
625 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
626 do
627 {
628 SWAP2_32(0) SWAP2_32(1) items += 2 * 2;
629 SWAP2_32(0) SWAP2_32(1) items += 2 * 2;
630 }
631 while (items != lim);
632 }
633
634
635 SWAP_BASE_FUNCS_PREFIXES
636 SwapBytes4_32(CSwapUInt32 *items, const CSwapUInt32 *lim)
637 {
638 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
639 do
640 {
641 SWAP4_32(0) SWAP4_32(1) items += 2;
642 SWAP4_32(0) SWAP4_32(1) items += 2;
643 }
644 while (items != lim);
645 }
646
647 #define DEFAULT_Swap2 SwapBytes2_32
648 #define DEFAULT_Swap4 SwapBytes4_32
649 #if !defined(FORCE_SWAP_MODE)
650 #define SWAP2_DEFAULT_MODE 0
651 #endif
652
653 #endif // MY_CPU_64BIT
654 #endif // if !defined(DEFAULT_Swap2) || !defined(DEFAULT_Swap4)
655
656
657
658 #if !defined(FORCE_SWAP_MODE)
659 static unsigned g_SwapBytes_Mode;
660 #endif
661
662 /* size of largest unrolled loop iteration: 128 bytes = 4 * 32 bytes (AVX). */
663 #define SWAP_ITERATION_BLOCK_SIZE_MAX (1 << 7)
664
665 // 32 bytes for AVX, or 2 * 16 bytes for NEON.
666 #define SWAP_VECTOR_ALIGN_SIZE (1 << 5)
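/* note (illustrative): z7_SwapBytes2() / z7_SwapBytes4() below work in three phases:
   a scalar head loop until the pointer is aligned to SWAP_VECTOR_ALIGN_SIZE,
   a vectorized body covering a multiple of SWAP_ITERATION_BLOCK_SIZE_MAX bytes,
   and a scalar tail loop for the remaining items. */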
667
668 Z7_NO_INLINE
669 void z7_SwapBytes2(CSwapUInt16 *items, size_t numItems)
670 {
671 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
672 for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (SWAP_VECTOR_ALIGN_SIZE - 1)) != 0; numItems--)
673 {
674 SWAP2_16(0)
675 items++;
676 }
677 {
678 const size_t k_Align_Mask = SWAP_ITERATION_BLOCK_SIZE_MAX / sizeof(CSwapUInt16) - 1;
679 size_t numItems2 = numItems;
680 CSwapUInt16 *lim;
681 numItems &= k_Align_Mask;
682 numItems2 &= ~(size_t)k_Align_Mask;
683 lim = items + numItems2;
684 if (numItems2 != 0)
685 {
686 #if !defined(FORCE_SWAP_MODE)
687 #ifdef MY_CPU_X86_OR_AMD64
688 #ifdef USE_SWAP_AVX2
689 if (g_SwapBytes_Mode > k_SwapBytes_Mode_SSSE3)
690 ShufBytes_256((__m256i *)(void *)items,
691 (const __m256i *)(const void *)lim,
692 (const __m128i *)(const void *)&(k_ShufMask_Swap2[0]));
693 else
694 #endif
695 #ifdef USE_SWAP_SSSE3
696 if (g_SwapBytes_Mode >= k_SwapBytes_Mode_SSSE3)
697 ShufBytes_128((__m128i *)(void *)items,
698 (const __m128i *)(const void *)lim,
699 (const __m128i *)(const void *)&(k_ShufMask_Swap2[0]));
700 else
701 #endif
702 #endif // MY_CPU_X86_OR_AMD64
703 #if SWAP2_DEFAULT_MODE == 0
704 if (g_SwapBytes_Mode != 0)
705 SwapBytes2_128(items, lim);
706 else
707 #endif
708 #endif // FORCE_SWAP_MODE
709 DEFAULT_Swap2(items, lim);
710 }
711 items = lim;
712 }
713 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
714 for (; numItems != 0; numItems--)
715 {
716 SWAP2_16(0)
717 items++;
718 }
719 }
720
721
722 Z7_NO_INLINE
723 void z7_SwapBytes4(CSwapUInt32 *items, size_t numItems)
724 {
725 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
726 for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (SWAP_VECTOR_ALIGN_SIZE - 1)) != 0; numItems--)
727 {
728 SWAP4_32(0)
729 items++;
730 }
731 {
732 const size_t k_Align_Mask = SWAP_ITERATION_BLOCK_SIZE_MAX / sizeof(CSwapUInt32) - 1;
733 size_t numItems2 = numItems;
734 CSwapUInt32 *lim;
735 numItems &= k_Align_Mask;
736 numItems2 &= ~(size_t)k_Align_Mask;
737 lim = items + numItems2;
738 if (numItems2 != 0)
739 {
740 #if !defined(FORCE_SWAP_MODE)
741 #ifdef MY_CPU_X86_OR_AMD64
742 #ifdef USE_SWAP_AVX2
743 if (g_SwapBytes_Mode > k_SwapBytes_Mode_SSSE3)
744 ShufBytes_256((__m256i *)(void *)items,
745 (const __m256i *)(const void *)lim,
746 (const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
747 else
748 #endif
749 #ifdef USE_SWAP_SSSE3
750 if (g_SwapBytes_Mode >= k_SwapBytes_Mode_SSSE3)
751 ShufBytes_128((__m128i *)(void *)items,
752 (const __m128i *)(const void *)lim,
753 (const __m128i *)(const void *)&(k_ShufMask_Swap4[0]));
754 else
755 #endif
756 #else // MY_CPU_X86_OR_AMD64
757
758 if (g_SwapBytes_Mode != 0)
759 SwapBytes4_128(items, lim);
760 else
761 #endif // MY_CPU_X86_OR_AMD64
762 #endif // FORCE_SWAP_MODE
763 DEFAULT_Swap4(items, lim);
764 }
765 items = lim;
766 }
767 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
768 for (; numItems != 0; numItems--)
769 {
770 SWAP4_32(0)
771 items++;
772 }
773 }
774
775
776 // #define SHOW_HW_STATUS
777
778 #ifdef SHOW_HW_STATUS
779 #include <stdio.h>
780 #define PRF(x) x
781 #else
782 #define PRF(x)
783 #endif
784
785 void z7_SwapBytesPrepare(void)
786 {
787 #ifndef FORCE_SWAP_MODE
788 unsigned mode = 0; // k_SwapBytes_Mode_BASE;
789
790 #ifdef MY_CPU_ARM_OR_ARM64
791 {
792 if (CPU_IsSupported_NEON())
793 {
794 // #pragma message ("=== SwapBytes NEON")
795 PRF(printf("\n=== SwapBytes NEON\n");)
796 mode = k_SwapBytes_Mode_NEON;
797 }
798 }
799 #else // MY_CPU_ARM_OR_ARM64
800 {
801 #ifdef USE_SWAP_AVX2
802 if (CPU_IsSupported_AVX2())
803 {
804 // #pragma message ("=== SwapBytes AVX2")
805 PRF(printf("\n=== SwapBytes AVX2\n");)
806 mode = k_SwapBytes_Mode_AVX2;
807 }
808 else
809 #endif
810 #ifdef USE_SWAP_SSSE3
811 if (CPU_IsSupported_SSSE3())
812 {
813 // #pragma message ("=== SwapBytes SSSE3")
814 PRF(printf("\n=== SwapBytes SSSE3\n");)
815 mode = k_SwapBytes_Mode_SSSE3;
816 }
817 else
818 #endif
819 #if !defined(MY_CPU_AMD64)
820 if (CPU_IsSupported_SSE2())
821 #endif
822 {
823 // #pragma message ("=== SwapBytes SSE2")
824 PRF(printf("\n=== SwapBytes SSE2\n");)
825 mode = k_SwapBytes_Mode_SSE2;
826 }
827 }
828 #endif // MY_CPU_ARM_OR_ARM64
829 g_SwapBytes_Mode = mode;
830 // g_SwapBytes_Mode = 0; // for debug
831 #endif // FORCE_SWAP_MODE
832 PRF(printf("\n=== SwapBytesPrepare\n");)
833 }
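/* usage sketch (illustrative; buf16/buf32 and the item counts are hypothetical
   caller variables, not part of this file):
     z7_SwapBytesPrepare();              // select SSE2/SSSE3/AVX2/NEON mode once at startup
     z7_SwapBytes2(buf16, numItems16);   // byte-swap an array of 16-bit items in place
     z7_SwapBytes4(buf32, numItems32);   // byte-swap an array of 32-bit items in place */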
834
835 #undef PRF
836