xref: /aosp_15_r20/external/lzma/C/CpuArch.c (revision f6dc9357d832569d4d1f5d24eacdb3935a1ae8e6)
1 /* CpuArch.c -- CPU specific code
2 Igor Pavlov : Public domain */
3 
4 #include "Precomp.h"
5 
6 // #include <stdio.h>
7 
8 #include "CpuArch.h"
9 
10 #ifdef MY_CPU_X86_OR_AMD64
11 
12 #undef NEED_CHECK_FOR_CPUID
13 #if !defined(MY_CPU_AMD64)
14 #define NEED_CHECK_FOR_CPUID
15 #endif
16 
17 /*
18   cpuid instruction supports (subFunction) parameter in ECX,
19   that is used only with some specific (function) parameter values.
20   most functions use only (subFunction==0).
21 */
22 /*
23   __cpuid(): MSVC and GCC/CLANG use same function/macro name
24              but parameters are different.
25    We use MSVC __cpuid() parameters style for our z7_x86_cpuid() function.
26 */
27 
28 #if defined(__GNUC__) /* && (__GNUC__ >= 10) */ \
29     || defined(__clang__) /* && (__clang_major__ >= 10) */
30 
31 /* there was some CLANG/GCC compilers that have issues with
32    rbx(ebx) handling in asm blocks in -fPIC mode (__PIC__ is defined).
33    compiler's <cpuid.h> contains the macro __cpuid() that is similar to our code.
34    The history of __cpuid() changes in CLANG/GCC:
35    GCC:
36      2007: it preserved ebx for (__PIC__ && __i386__)
37      2013: it preserved rbx and ebx for __PIC__
38      2014: it doesn't preserves rbx and ebx anymore
39      we suppose that (__GNUC__ >= 5) fixed that __PIC__ ebx/rbx problem.
40    CLANG:
41      2014+: it preserves rbx, but only for 64-bit code. No __PIC__ check.
42    Why CLANG cares about 64-bit mode only, and doesn't care about ebx (in 32-bit)?
43    Do we need __PIC__ test for CLANG or we must care about rbx even if
44    __PIC__ is not defined?
45 */
46 
47 #define ASM_LN "\n"
48 
49 #if defined(MY_CPU_AMD64) && defined(__PIC__) \
50     && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))
51 
52   /* "=&r" selects free register. It can select even rbx, if that register is free.
53      "=&D" for (RDI) also works, but the code can be larger with "=&D"
54      "2"(subFun) : 2 is (zero-based) index in the output constraint list "=c" (ECX). */
55 
56 #define x86_cpuid_MACRO_2(p, func, subFunc) { \
57   __asm__ __volatile__ ( \
58     ASM_LN   "mov     %%rbx, %q1"  \
59     ASM_LN   "cpuid"               \
60     ASM_LN   "xchg    %%rbx, %q1"  \
61     : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }
62 
63 #elif defined(MY_CPU_X86) && defined(__PIC__) \
64     && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))
65 
66 #define x86_cpuid_MACRO_2(p, func, subFunc) { \
67   __asm__ __volatile__ ( \
68     ASM_LN   "mov     %%ebx, %k1"  \
69     ASM_LN   "cpuid"               \
70     ASM_LN   "xchg    %%ebx, %k1"  \
71     : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }
72 
73 #else
74 
75 #define x86_cpuid_MACRO_2(p, func, subFunc) { \
76   __asm__ __volatile__ ( \
77     ASM_LN   "cpuid"               \
78     : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }
79 
80 #endif
81 
82 #define x86_cpuid_MACRO(p, func)  x86_cpuid_MACRO_2(p, func, 0)
83 
// Reads cpuid leaf (func) with subleaf 0 into p[0..3] = EAX,EBX,ECX,EDX
// (MSVC __cpuid() parameter style, per the comment above).
void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
  x86_cpuid_MACRO(p, func)
}
88 
// Same as z7_x86_cpuid(), but also passes (subFunc) in ECX —
// needed for leaves such as 7 that select a subleaf via ECX.
static
void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
{
  x86_cpuid_MACRO_2(p, func, subFunc)
}
94 
95 
// Returns the maximum supported standard cpuid leaf (EAX of leaf 0),
// or 0 when the cpuid instruction itself is unavailable.
Z7_NO_INLINE
UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
{
 #if defined(NEED_CHECK_FOR_CPUID)
  // 32-bit x86 only: cpuid may be absent on very old CPUs. Detect it by
  // trying to toggle the EFLAGS.ID bit (bit 21): if the toggle sticks,
  // cpuid is supported. (Macro name keeps the original "EFALGS" spelling.)
  #define EFALGS_CPUID_BIT 21
  UInt32 a;
  __asm__ __volatile__ (
    ASM_LN   "pushf"
    ASM_LN   "pushf"
    ASM_LN   "pop     %0"
    // ASM_LN   "movl    %0, %1"
    // ASM_LN   "xorl    $0x200000, %0"
    ASM_LN   "btc     %1, %0"          // flip the ID bit in the copy
    ASM_LN   "push    %0"
    ASM_LN   "popf"                    // try to load the flipped flags
    ASM_LN   "pushf"
    ASM_LN   "pop     %0"
    ASM_LN   "xorl    (%%esp), %0"     // diff against the saved original flags

    ASM_LN   "popf"                    // restore the original flags
    ASM_LN
    : "=&r" (a) // "=a"
    : "i" (EFALGS_CPUID_BIT)
    );
  if ((a & (1 << EFALGS_CPUID_BIT)) == 0)
    return 0;  // ID bit did not toggle: cpuid is not supported
 #endif
  {
    UInt32 p[4];
    x86_cpuid_MACRO(p, 0)   // leaf 0: EAX = max standard leaf
    return p[0];
  }
}
129 
130 #undef ASM_LN
131 
132 #elif !defined(_MSC_VER)
133 
134 /*
135 // for gcc/clang and other: we can try to use __cpuid macro:
136 #include <cpuid.h>
137 void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
138 {
139   __cpuid(func, p[0], p[1], p[2], p[3]);
140 }
141 UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
142 {
143   return (UInt32)__get_cpuid_max(0, NULL);
144 }
145 */
146 // for unsupported cpuid:
z7_x86_cpuid(UInt32 p[4],UInt32 func)147 void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
148 {
149   UNUSED_VAR(func)
150   p[0] = p[1] = p[2] = p[3] = 0;
151 }
// Fallback: no cpuid support, so the maximum supported leaf is 0.
UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
{
  return 0;
}
156 
157 #else // _MSC_VER
158 
159 #if !defined(MY_CPU_AMD64)
160 
// MSVC x86 (32-bit): naked fastcall function written in inline asm.
// Returns the maximum supported standard cpuid leaf, or 0 when the cpuid
// instruction is unavailable (detected by toggling the EFLAGS.ID bit).
UInt32 __declspec(naked) Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
{
  #if defined(NEED_CHECK_FOR_CPUID)
  #define EFALGS_CPUID_BIT 21
  __asm   pushfd
  __asm   pushfd
  /*
  __asm   pop     eax
  // __asm   mov     edx, eax
  __asm   btc     eax, EFALGS_CPUID_BIT
  __asm   push    eax
  */
  __asm   btc     dword ptr [esp], EFALGS_CPUID_BIT  // flip ID bit in flags copy
  __asm   popfd                                      // try to load flipped flags
  __asm   pushfd
  __asm   pop     eax
  // __asm   xor     eax, edx
  __asm   xor     eax, [esp]                         // diff vs saved original flags
  // __asm   push    edx
  __asm   popfd                                      // restore original flags
  __asm   and     eax, (1 shl EFALGS_CPUID_BIT)
  __asm   jz end_func                                // bit didn't toggle: no cpuid
  #endif
  __asm   push    ebx
  __asm   xor     eax, eax    // func
  __asm   xor     ecx, ecx    // subFunction (optional) for (func == 0)
  __asm   cpuid
  __asm   pop     ebx
  #if defined(NEED_CHECK_FOR_CPUID)
  end_func:
  #endif
  __asm   ret 0
}
194 
// MSVC x86 (32-bit): naked fastcall cpuid.
// fastcall passes ECX = p, EDX = func; stores EAX,EBX,ECX,EDX into p[0..3].
void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
  UNUSED_VAR(p)
  UNUSED_VAR(func)
  __asm   push    ebx
  __asm   push    edi
  __asm   mov     edi, ecx    // p
  __asm   mov     eax, edx    // func
  __asm   xor     ecx, ecx    // subfunction (optional) for (func == 0)
  __asm   cpuid
  __asm   mov     [edi     ], eax
  __asm   mov     [edi +  4], ebx
  __asm   mov     [edi +  8], ecx
  __asm   mov     [edi + 12], edx
  __asm   pop     edi
  __asm   pop     ebx
  __asm   ret     0
}
213 
// MSVC x86 (32-bit): naked fastcall cpuid with explicit subleaf.
// fastcall passes ECX = p, EDX = func; the 3rd argument (subFunc) arrives
// on the stack and is popped by the callee (ret 4).
static
void __declspec(naked) Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
{
  UNUSED_VAR(p)
  UNUSED_VAR(func)
  UNUSED_VAR(subFunc)
  __asm   push    ebx
  __asm   push    edi
  __asm   mov     edi, ecx    // p
  __asm   mov     eax, edx    // func
  __asm   mov     ecx, [esp + 12]  // subFunc (+12 = 2 pushes + return address)
  __asm   cpuid
  __asm   mov     [edi     ], eax
  __asm   mov     [edi +  4], ebx
  __asm   mov     [edi +  8], ecx
  __asm   mov     [edi + 12], edx
  __asm   pop     edi
  __asm   pop     ebx
  __asm   ret     4           // fastcall: callee removes the 4-byte stack arg
}
234 
235 #else // MY_CPU_AMD64
236 
237     #if _MSC_VER >= 1600
238       #include <intrin.h>
239       #define MY_cpuidex  __cpuidex
240 
// MSVC >= 2010: __cpuidex() passes the subleaf in ECX directly.
static
void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
{
  __cpuidex((int *)p, func, subFunc);
}
246 
247     #else
248 /*
249  __cpuid (func == (0 or 7)) requires subfunction number in ECX.
250   MSDN: The __cpuid intrinsic clears the ECX register before calling the cpuid instruction.
251    __cpuid() in new MSVC clears ECX.
252    __cpuid() in old MSVC (14.00) x64 doesn't clear ECX
253  We still can use __cpuid for low (func) values that don't require ECX,
254  but __cpuid() in old MSVC will be incorrect for some func values: (func == 7).
255  So here we use the hack for old MSVC to send (subFunction) in ECX register to cpuid instruction,
256  where ECX value is first parameter for FASTCALL / NO_INLINE func.
257  So the caller of MY_cpuidex_HACK() sets ECX as subFunction, and
258  old MSVC for __cpuid() doesn't change ECX and cpuid instruction gets (subFunction) value.
259 
260 DON'T remove Z7_NO_INLINE and Z7_FASTCALL for MY_cpuidex_HACK(): !!!
261 */
262 static
MY_cpuidex_HACK(Int32 subFunction,Int32 func,Int32 * CPUInfo)263 Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(Int32 subFunction, Int32 func, Int32 *CPUInfo)
264 {
265   UNUSED_VAR(subFunction)
266   __cpuid(CPUInfo, func);
267 }
268       #define MY_cpuidex(info, func, func2)  MY_cpuidex_HACK(func2, func, info)
269       #pragma message("======== MY_cpuidex_HACK WAS USED ========")
// Old-MSVC path: route through MY_cpuidex_HACK so ECX carries the subleaf.
static
void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
{
  MY_cpuidex_HACK(subFunc, func, (Int32 *)p);
}
275     #endif // _MSC_VER >= 1600
276 
277 #if !defined(MY_CPU_AMD64)
278 /* inlining for __cpuid() in MSVC x86 (32-bit) produces big ineffective code,
279    so we disable inlining here */
280 Z7_NO_INLINE
281 #endif
// MSVC via intrinsic: reads cpuid leaf (func), subleaf 0, into p[0..3].
void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
  MY_cpuidex((Int32 *)p, (Int32)func, 0);
}
286 
287 Z7_NO_INLINE
z7_x86_cpuid_GetMaxFunc(void)288 UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
289 {
290   Int32 a[4];
291   MY_cpuidex(a, 0, 0);
292   return a[0];
293 }
294 
295 #endif // MY_CPU_AMD64
296 #endif // _MSC_VER
297 
298 #if defined(NEED_CHECK_FOR_CPUID)
299 #define CHECK_CPUID_IS_SUPPORTED { if (z7_x86_cpuid_GetMaxFunc() == 0) return 0; }
300 #else
301 #define CHECK_CPUID_IS_SUPPORTED
302 #endif
303 #undef NEED_CHECK_FOR_CPUID
304 
305 
// Reads cpuid leaf 1 into p[0..3]; returns False only when cpuid itself
// is unavailable (possible on old 32-bit x86 CPUs).
static
BoolInt x86cpuid_Func_1(UInt32 *p)
{
  CHECK_CPUID_IS_SUPPORTED
  z7_x86_cpuid(p, 1);
  return True;
}
313 
314 /*
315 static const UInt32 kVendors[][1] =
316 {
317   { 0x756E6547 }, // , 0x49656E69, 0x6C65746E },
318   { 0x68747541 }, // , 0x69746E65, 0x444D4163 },
319   { 0x746E6543 }  // , 0x48727561, 0x736C7561 }
320 };
321 */
322 
323 /*
324 typedef struct
325 {
326   UInt32 maxFunc;
327   UInt32 vendor[3];
328   UInt32 ver;
329   UInt32 b;
330   UInt32 c;
331   UInt32 d;
332 } Cx86cpuid;
333 
334 enum
335 {
336   CPU_FIRM_INTEL,
337   CPU_FIRM_AMD,
338   CPU_FIRM_VIA
339 };
340 int x86cpuid_GetFirm(const Cx86cpuid *p);
341 #define x86cpuid_ver_GetFamily(ver) (((ver >> 16) & 0xff0) | ((ver >> 8) & 0xf))
342 #define x86cpuid_ver_GetModel(ver)  (((ver >> 12) &  0xf0) | ((ver >> 4) & 0xf))
343 #define x86cpuid_ver_GetStepping(ver) (ver & 0xf)
344 
345 int x86cpuid_GetFirm(const Cx86cpuid *p)
346 {
347   unsigned i;
348   for (i = 0; i < sizeof(kVendors) / sizeof(kVendors[0]); i++)
349   {
350     const UInt32 *v = kVendors[i];
351     if (v[0] == p->vendor[0]
352         // && v[1] == p->vendor[1]
353         // && v[2] == p->vendor[2]
354         )
355       return (int)i;
356   }
357   return -1;
358 }
359 
360 BoolInt CPU_Is_InOrder()
361 {
362   Cx86cpuid p;
363   UInt32 family, model;
364   if (!x86cpuid_CheckAndRead(&p))
365     return True;
366 
367   family = x86cpuid_ver_GetFamily(p.ver);
368   model = x86cpuid_ver_GetModel(p.ver);
369 
370   switch (x86cpuid_GetFirm(&p))
371   {
372     case CPU_FIRM_INTEL: return (family < 6 || (family == 6 && (
373         // In-Order Atom CPU
374            model == 0x1C  // 45 nm, N4xx, D4xx, N5xx, D5xx, 230, 330
375         || model == 0x26  // 45 nm, Z6xx
376         || model == 0x27  // 32 nm, Z2460
377         || model == 0x35  // 32 nm, Z2760
378         || model == 0x36  // 32 nm, N2xxx, D2xxx
379         )));
380     case CPU_FIRM_AMD: return (family < 5 || (family == 5 && (model < 6 || model == 0xA)));
381     case CPU_FIRM_VIA: return (family < 6 || (family == 6 && model < 0xF));
382   }
383   return False; // v23 : unknown processors are not In-Order
384 }
385 */
386 
387 #ifdef _WIN32
388 #include "7zWindows.h"
389 #endif
390 
391 #if !defined(MY_CPU_AMD64) && defined(_WIN32)
392 
393 /* for legacy SSE ia32: there is no user-space cpu instruction to check
394    that OS supports SSE register storing/restoring on context switches.
395    So we need some OS-specific function to check that it's safe to use SSE registers.
396 */
397 
// Win32 x86 (32-bit): checks that the OS saves/restores SSE registers on
// context switch. Windows 2000+ (major version >= 5) is assumed to do so.
Z7_FORCE_INLINE
static BoolInt CPU_Sys_Is_SSE_Supported(void)
{
#ifdef _MSC_VER
  #pragma warning(push)
  #pragma warning(disable : 4996) // `GetVersion': was declared deprecated
#endif
  /* low byte is major version of Windows
     We suppose that any Windows version since
     Windows2000 (major == 5) supports SSE registers */
  return (Byte)GetVersion() >= 5;
#if defined(_MSC_VER)
  #pragma warning(pop)
#endif
}
413 #define CHECK_SYS_SSE_SUPPORT if (!CPU_Sys_Is_SSE_Supported()) return False;
414 #else
415 #define CHECK_SYS_SSE_SUPPORT
416 #endif
417 
418 
419 #if !defined(MY_CPU_AMD64)
420 
CPU_IsSupported_CMOV(void)421 BoolInt CPU_IsSupported_CMOV(void)
422 {
423   UInt32 a[4];
424   if (!x86cpuid_Func_1(&a[0]))
425     return 0;
426   return (BoolInt)(a[3] >> 15) & 1;
427 }
428 
CPU_IsSupported_SSE(void)429 BoolInt CPU_IsSupported_SSE(void)
430 {
431   UInt32 a[4];
432   CHECK_SYS_SSE_SUPPORT
433   if (!x86cpuid_Func_1(&a[0]))
434     return 0;
435   return (BoolInt)(a[3] >> 25) & 1;
436 }
437 
CPU_IsSupported_SSE2(void)438 BoolInt CPU_IsSupported_SSE2(void)
439 {
440   UInt32 a[4];
441   CHECK_SYS_SSE_SUPPORT
442   if (!x86cpuid_Func_1(&a[0]))
443     return 0;
444   return (BoolInt)(a[3] >> 26) & 1;
445 }
446 
447 #endif
448 
449 
// Returns ECX of cpuid leaf 1 (feature flags), or 0 when cpuid or
// OS-level SSE support is missing.
static UInt32 x86cpuid_Func_1_ECX(void)
{
  UInt32 a[4];
  CHECK_SYS_SSE_SUPPORT
  if (!x86cpuid_Func_1(&a[0]))
    return 0;
  return a[2];
}
458 
CPU_IsSupported_AES(void)459 BoolInt CPU_IsSupported_AES(void)
460 {
461   return (BoolInt)(x86cpuid_Func_1_ECX() >> 25) & 1;
462 }
463 
CPU_IsSupported_SSSE3(void)464 BoolInt CPU_IsSupported_SSSE3(void)
465 {
466   return (BoolInt)(x86cpuid_Func_1_ECX() >> 9) & 1;
467 }
468 
CPU_IsSupported_SSE41(void)469 BoolInt CPU_IsSupported_SSE41(void)
470 {
471   return (BoolInt)(x86cpuid_Func_1_ECX() >> 19) & 1;
472 }
473 
// Reports SHA-1/SHA-256 extensions: bit 29 of EBX from cpuid leaf 7.
BoolInt CPU_IsSupported_SHA(void)
{
  CHECK_SYS_SSE_SUPPORT

  if (z7_x86_cpuid_GetMaxFunc() < 7)
    return False;
  {
    UInt32 d[4];
    z7_x86_cpuid(d, 7);
    return (BoolInt)(d[1] >> 29) & 1;
  }
}
486 
487 
// Reports SHA-512 extensions: bit 0 of EAX from cpuid leaf 7, subleaf 1.
// Requires AVX2 because the SHA-512 instructions use 256-bit registers.
BoolInt CPU_IsSupported_SHA512(void)
{
  if (!CPU_IsSupported_AVX2()) return False; // maybe CPU_IsSupported_AVX() is enough here

  if (z7_x86_cpuid_GetMaxFunc() < 7)
    return False;
  {
    UInt32 d[4];
    z7_x86_cpuid_subFunc(d, 7, 0);
    if (d[0] < 1) // d[0] - is max supported subleaf value
      return False;
    z7_x86_cpuid_subFunc(d, 7, 1);
    return (BoolInt)(d[0]) & 1;
  }
}
503 
504 /*
505 MSVC: _xgetbv() intrinsic is available since VS2010SP1.
506    MSVC also defines (_XCR_XFEATURE_ENABLED_MASK) macro in
507    <immintrin.h> that we can use or check.
508    For any 32-bit x86 we can use asm code in MSVC,
509    but MSVC asm code is huge after compilation.
510    So _xgetbv() is better
511 
512 ICC: _xgetbv() intrinsic is available (in what version of ICC?)
513    ICC defines (__GNUC___) and it supports gnu assembler
514    also ICC supports MASM style code with -use-msasm switch.
515    but ICC doesn't support __attribute__((__target__))
516 
517 GCC/CLANG 9:
518   _xgetbv() is macro that works via __builtin_ia32_xgetbv()
519   and we need __attribute__((__target__("xsave")).
520   But with __target__("xsave") the function will be not
521   inlined to function that has no __target__("xsave") attribute.
522   If we want _xgetbv() call inlining, then we should use asm version
523   instead of calling _xgetbv().
524   Note:intrinsic is broke before GCC 8.2:
525     https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85684
526 */
527 
528 #if    defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1100) \
529     || defined(_MSC_VER) && (_MSC_VER >= 1600) && (_MSC_FULL_VER >= 160040219)  \
530     || defined(__GNUC__) && (__GNUC__ >= 9) \
531     || defined(__clang__) && (__clang_major__ >= 9)
532 // we define ATTRIB_XGETBV, if we want to use predefined _xgetbv() from compiler
533 #if defined(__INTEL_COMPILER)
534 #define ATTRIB_XGETBV
535 #elif defined(__GNUC__) || defined(__clang__)
536 // we don't define ATTRIB_XGETBV here, because asm version is better for inlining.
537 // #define ATTRIB_XGETBV __attribute__((__target__("xsave")))
538 #else
539 #define ATTRIB_XGETBV
540 #endif
541 #endif
542 
543 #if defined(ATTRIB_XGETBV)
544 #include <immintrin.h>
545 #endif
546 
547 
548 // XFEATURE_ENABLED_MASK/XCR0
549 #define MY_XCR_XFEATURE_ENABLED_MASK 0
550 
// Reads extended control register (num) via XGETBV and returns the 64-bit
// value (EDX:EAX). XCR0 (num == 0) is the bitmask of CPU states the OS
// saves/restores on context switch.
#if defined(ATTRIB_XGETBV)
ATTRIB_XGETBV
#endif
static UInt64 x86_xgetbv_0(UInt32 num)
{
#if defined(ATTRIB_XGETBV)
  // compiler-provided _xgetbv() intrinsic / builtin
  {
    return
      #if (defined(_MSC_VER))
        _xgetbv(num);
      #else
        __builtin_ia32_xgetbv(
          #if !defined(__clang__)
            (int)
          #endif
            num);
      #endif
  }

#elif defined(__GNUC__) || defined(__clang__) || defined(__SUNPRO_CC)

  UInt32 a, d;
 #if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4))
  __asm__
  (
    "xgetbv"
    : "=a"(a), "=d"(d) : "c"(num) : "cc"
  );
 #else // is old gcc
  // old assemblers don't know the mnemonic; emit the opcode bytes directly
  __asm__
  (
    ".byte 0x0f, 0x01, 0xd0" "\n\t"
    : "=a"(a), "=d"(d) : "c"(num) : "cc"
  );
 #endif
  return ((UInt64)d << 32) | a;
  // return a;

#elif defined(_MSC_VER) && !defined(MY_CPU_AMD64)

  // old 32-bit MSVC: no intrinsic / no mnemonic; emit the opcode bytes
  UInt32 a, d;
  __asm {
    push eax
    push edx
    push ecx
    mov ecx, num;
    // xor ecx, ecx // = MY_XCR_XFEATURE_ENABLED_MASK
    _emit 0x0f
    _emit 0x01
    _emit 0xd0
    mov a, eax
    mov d, edx
    pop ecx
    pop edx
    pop eax
  }
  return ((UInt64)d << 32) | a;
  // return a;

#else // it's unknown compiler
  // #error "Need xgetbv function"
  UNUSED_VAR(num)
  // for MSVC-X64 we could call external function from external file.
  /* Actually we had checked OSXSAVE/AVX in cpuid before.
     So it's expected that OS supports at least AVX and below. */
  // if (num != MY_XCR_XFEATURE_ENABLED_MASK) return 0; // if not XCR0
  return
      // (1 << 0) |  // x87
        (1 << 1)   // SSE
      | (1 << 2);  // AVX

#endif
}
624 
625 #ifdef _WIN32
626 /*
627   Windows versions do not know about new ISA extensions that
628   can be introduced. But we still can use new extensions,
629   even if Windows doesn't report about supporting them,
630   But we can use new extensions, only if Windows knows about new ISA extension
631   that changes the number or size of registers: SSE, AVX/XSAVE, AVX512
632   So it's enough to check
633     MY_PF_AVX_INSTRUCTIONS_AVAILABLE
634       instead of
635     MY_PF_AVX2_INSTRUCTIONS_AVAILABLE
636 */
637 #define MY_PF_XSAVE_ENABLED                            17
638 // #define MY_PF_SSSE3_INSTRUCTIONS_AVAILABLE             36
639 // #define MY_PF_SSE4_1_INSTRUCTIONS_AVAILABLE            37
640 // #define MY_PF_SSE4_2_INSTRUCTIONS_AVAILABLE            38
641 // #define MY_PF_AVX_INSTRUCTIONS_AVAILABLE               39
642 // #define MY_PF_AVX2_INSTRUCTIONS_AVAILABLE              40
643 // #define MY_PF_AVX512F_INSTRUCTIONS_AVAILABLE           41
644 #endif
645 
// True only when both the CPU supports AVX and the OS saves/restores the
// AVX (YMM) state: checked via CPUID.1:ECX (AVX, OSXSAVE) and XGETBV(XCR0).
BoolInt CPU_IsSupported_AVX(void)
{
  #ifdef _WIN32
  if (!IsProcessorFeaturePresent(MY_PF_XSAVE_ENABLED))
    return False;
  /* PF_AVX_INSTRUCTIONS_AVAILABLE probably is supported starting from
     some latest Win10 revisions. But we need AVX in older Windows also.
     So we don't use the following check: */
  /*
  if (!IsProcessorFeaturePresent(MY_PF_AVX_INSTRUCTIONS_AVAILABLE))
    return False;
  */
  #endif

  /*
    OS must use new special XSAVE/XRSTOR instructions to save
    AVX registers when it is required for context switching.
    At OS startup:
      OS sets CR4.OSXSAVE flag to signal the processor that OS supports the XSAVE extensions.
      Also OS sets bitmask in XCR0 register that defines what
      registers will be processed by XSAVE instruction:
        XCR0.SSE[bit 0] - x87 registers and state
        XCR0.SSE[bit 1] - SSE registers and state
        XCR0.AVX[bit 2] - AVX registers and state
    CR4.OSXSAVE is reflected to CPUID.1:ECX.OSXSAVE[bit 27].
       So we can read that bit in user-space.
    XCR0 is available for reading in user-space by new XGETBV instruction.
  */
  {
    const UInt32 c = x86cpuid_Func_1_ECX();
    if (0 == (1
        & (c >> 28)   // AVX instructions are supported by hardware
        & (c >> 27))) // OSXSAVE bit: XSAVE and related instructions are enabled by OS.
      return False;
  }

  /* also we can check
     CPUID.1:ECX.XSAVE [bit 26] : that shows that
        XSAVE, XRESTOR, XSETBV, XGETBV instructions are supported by hardware.
     But that check is redundant, because if OSXSAVE bit is set, then XSAVE is also set */

  /* If OS has enabled XSAVE extension instructions (OSXSAVE == 1),
     in most cases we expect that OS also will support storing/restoring
     for AVX and SSE states at least.
     But to be sure of that we call the user-space instruction
     XGETBV(0) to get XCR0 value that contains bitmask that defines
     what exact states(registers) OS has enabled for storing/restoring.
  */

  {
    const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK);
    // printf("\n=== XGetBV=0x%x\n", bm);
    return 1
        & (BoolInt)(bm >> 1)  // SSE state is supported (set by OS) for storing/restoring
        & (BoolInt)(bm >> 2); // AVX state is supported (set by OS) for storing/restoring
  }
  // since Win7SP1: we can use GetEnabledXStateFeatures();
}
704 
705 
// True when AVX is usable (CPU + OS) and cpuid leaf 7 reports AVX2
// (EBX bit 5).
BoolInt CPU_IsSupported_AVX2(void)
{
  if (!CPU_IsSupported_AVX())
    return False;
  if (z7_x86_cpuid_GetMaxFunc() < 7)
    return False;
  {
    UInt32 d[4];
    z7_x86_cpuid(d, 7);
    // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]);
    return 1
      & (BoolInt)(d[1] >> 5); // avx2
  }
}
720 
721 #if 0
722 BoolInt CPU_IsSupported_AVX512F_AVX512VL(void)
723 {
724   if (!CPU_IsSupported_AVX())
725     return False;
726   if (z7_x86_cpuid_GetMaxFunc() < 7)
727     return False;
728   {
729     UInt32 d[4];
730     BoolInt v;
731     z7_x86_cpuid(d, 7);
732     // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]);
733     v = 1
734       & (BoolInt)(d[1] >> 16)  // avx512f
735       & (BoolInt)(d[1] >> 31); // avx512vl
736     if (!v)
737       return False;
738   }
739   {
740     const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK);
741     // printf("\n=== XGetBV=0x%x\n", bm);
742     return 1
743         & (BoolInt)(bm >> 5)  // OPMASK
744         & (BoolInt)(bm >> 6)  // ZMM upper 256-bit
745         & (BoolInt)(bm >> 7); // ZMM16 ... ZMM31
746   }
747 }
748 #endif
749 
// True when AVX is usable (CPU + OS) and cpuid leaf 7 reports both
// AVX2 (EBX bit 5) and VAES (ECX bit 9).
BoolInt CPU_IsSupported_VAES_AVX2(void)
{
  if (!CPU_IsSupported_AVX())
    return False;
  if (z7_x86_cpuid_GetMaxFunc() < 7)
    return False;
  {
    UInt32 d[4];
    z7_x86_cpuid(d, 7);
    // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]);
    return 1
      & (BoolInt)(d[1] >> 5) // avx2
      // & (d[1] >> 31) // avx512vl
      & (BoolInt)(d[2] >> 9); // vaes // VEX-256/EVEX
  }
}
766 
// Reports 1GB-page support: bit 26 of EDX from extended cpuid leaf
// 0x80000001 (after checking that this extended leaf exists).
BoolInt CPU_IsSupported_PageGB(void)
{
  CHECK_CPUID_IS_SUPPORTED
  {
    UInt32 d[4];
    z7_x86_cpuid(d, 0x80000000);
    if (d[0] < 0x80000001)
      return False;
    z7_x86_cpuid(d, 0x80000001);
    return (BoolInt)(d[3] >> 26) & 1;
  }
}
779 
780 
781 #elif defined(MY_CPU_ARM_OR_ARM64)
782 
783 #ifdef _WIN32
784 
785 #include "7zWindows.h"
786 
// Win32/ARM: ask the OS whether ARMv8 CRC32 instructions are available.
BoolInt CPU_IsSupported_CRC32(void)  { return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
// Win32/ARM: ask the OS whether ARMv8 crypto (AES/SHA) instructions are available.
BoolInt CPU_IsSupported_CRYPTO(void) { return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
// Win32/ARM: ask the OS whether NEON instructions are available.
BoolInt CPU_IsSupported_NEON(void)   { return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) ? 1 : 0; }
790 
791 #else
792 
793 #if defined(__APPLE__)
794 
795 /*
796 #include <stdio.h>
797 #include <string.h>
798 static void Print_sysctlbyname(const char *name)
799 {
800   size_t bufSize = 256;
801   char buf[256];
802   int res = sysctlbyname(name, &buf, &bufSize, NULL, 0);
803   {
804     int i;
805     printf("\nres = %d : %s : '%s' : bufSize = %d, numeric", res, name, buf, (unsigned)bufSize);
806     for (i = 0; i < 20; i++)
807       printf(" %2x", (unsigned)(Byte)buf[i]);
808 
809   }
810 }
811 */
812 /*
813   Print_sysctlbyname("hw.pagesize");
814   Print_sysctlbyname("machdep.cpu.brand_string");
815 */
816 
z7_sysctlbyname_Get_BoolInt(const char * name)817 static BoolInt z7_sysctlbyname_Get_BoolInt(const char *name)
818 {
819   UInt32 val = 0;
820   if (z7_sysctlbyname_Get_UInt32(name, &val) == 0 && val == 1)
821     return 1;
822   return 0;
823 }
824 
// macOS: sysctl flag for ARMv8 CRC32 instructions.
BoolInt CPU_IsSupported_CRC32(void)
{
  return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_crc32");
}
829 
// macOS: sysctl flag for NEON (Advanced SIMD) support.
BoolInt CPU_IsSupported_NEON(void)
{
  return z7_sysctlbyname_Get_BoolInt("hw.optional.neon");
}
834 
// macOS: sysctl flag for the ARMv8.2 SHA-512 extension.
BoolInt CPU_IsSupported_SHA512(void)
{
  return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha512");
}
839 
840 /*
841 BoolInt CPU_IsSupported_SHA3(void)
842 {
843   return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha3");
844 }
845 */
846 
847 #ifdef MY_CPU_ARM64
848 #define APPLE_CRYPTO_SUPPORT_VAL 1
849 #else
850 #define APPLE_CRYPTO_SUPPORT_VAL 0
851 #endif
852 
// Apple: base crypto extensions assumed present on arm64, absent on 32-bit ARM.
BoolInt CPU_IsSupported_SHA1(void) { return APPLE_CRYPTO_SUPPORT_VAL; }
// Apple: base crypto extensions assumed present on arm64, absent on 32-bit ARM.
BoolInt CPU_IsSupported_SHA2(void) { return APPLE_CRYPTO_SUPPORT_VAL; }
// Apple: base crypto extensions assumed present on arm64, absent on 32-bit ARM.
BoolInt CPU_IsSupported_AES (void) { return APPLE_CRYPTO_SUPPORT_VAL; }
856 
857 
858 #else // __APPLE__
859 
860 #if defined(__GLIBC__) && (__GLIBC__ * 100 + __GLIBC_MINOR__ >= 216)
861   #define Z7_GETAUXV_AVAILABLE
862 #else
863 // #pragma message("=== is not NEW GLIBC === ")
864   #if defined __has_include
865   #if __has_include (<sys/auxv.h>)
866 // #pragma message("=== sys/auxv.h is avail=== ")
867     #define Z7_GETAUXV_AVAILABLE
868   #endif
869   #endif
870 #endif
871 
872 #ifdef Z7_GETAUXV_AVAILABLE
873 // #pragma message("=== Z7_GETAUXV_AVAILABLE === ")
874 #include <sys/auxv.h>
875 #define USE_HWCAP
876 #endif
877 
878 #ifdef USE_HWCAP
879 
880 #if defined(__FreeBSD__)
// FreeBSD has no getauxval(); emulate it with elf_aux_info(),
// returning 0 when the requested entry is unavailable.
static unsigned long MY_getauxval(int aux)
{
  unsigned long value = 0;
  if (elf_aux_info(aux, &value, sizeof(value)) != 0)
    return 0;
  return value;
}
888 #else
889 #define MY_getauxval  getauxval
890   #if defined __has_include
891   #if __has_include (<asm/hwcap.h>)
892 #include <asm/hwcap.h>
893   #endif
894   #endif
895 #endif
896 
897   #define MY_HWCAP_CHECK_FUNC_2(name1, name2) \
898   BoolInt CPU_IsSupported_ ## name1(void) { return (MY_getauxval(AT_HWCAP)  & (HWCAP_  ## name2)); }
899 
900 #ifdef MY_CPU_ARM64
901   #define MY_HWCAP_CHECK_FUNC(name) \
902   MY_HWCAP_CHECK_FUNC_2(name, name)
903 #if 1 || defined(__ARM_NEON)
  // ARM64 baseline always includes NEON (ASIMD), so no runtime check is needed.
  BoolInt CPU_IsSupported_NEON(void) { return True; }
905 #else
  // generate CPU_IsSupported_NEON() from the AT_HWCAP ASIMD bit
  MY_HWCAP_CHECK_FUNC_2(NEON, ASIMD)
907 #endif
908 // MY_HWCAP_CHECK_FUNC (ASIMD)
909 #elif defined(MY_CPU_ARM)
910   #define MY_HWCAP_CHECK_FUNC(name) \
911   BoolInt CPU_IsSupported_ ## name(void) { return (MY_getauxval(AT_HWCAP2) & (HWCAP2_ ## name)); }
912   MY_HWCAP_CHECK_FUNC_2(NEON, NEON)
913 #endif
914 
915 #else // USE_HWCAP
916 
917   #define MY_HWCAP_CHECK_FUNC(name) \
918   BoolInt CPU_IsSupported_ ## name(void) { return 0; }
919 #if defined(__ARM_NEON)
920   BoolInt CPU_IsSupported_NEON(void) { return True; }
921 #else
922   MY_HWCAP_CHECK_FUNC(NEON)
923 #endif
924 
925 #endif // USE_HWCAP
926 
927 MY_HWCAP_CHECK_FUNC (CRC32)
928 MY_HWCAP_CHECK_FUNC (SHA1)
929 MY_HWCAP_CHECK_FUNC (SHA2)
930 MY_HWCAP_CHECK_FUNC (AES)
931 #ifdef MY_CPU_ARM64
932 // <hwcap.h> supports HWCAP_SHA512 and HWCAP_SHA3 since 2017.
933 // we define them here, if they are not defined
934 #ifndef HWCAP_SHA3
935 // #define HWCAP_SHA3    (1 << 17)
936 #endif
937 #ifndef HWCAP_SHA512
938 // #pragma message("=== HWCAP_SHA512 define === ")
939 #define HWCAP_SHA512  (1 << 21)
940 #endif
941 MY_HWCAP_CHECK_FUNC (SHA512)
942 // MY_HWCAP_CHECK_FUNC (SHA3)
943 #endif
944 
945 #endif // __APPLE__
946 #endif // _WIN32
947 
948 #endif // MY_CPU_ARM_OR_ARM64
949 
950 
951 
952 #ifdef __APPLE__
953 
954 #include <sys/sysctl.h>
955 
// Thin wrapper over sysctlbyname(); *bufSize is in/out
// (capacity of buf on entry, size of the result on return).
int z7_sysctlbyname_Get(const char *name, void *buf, size_t *bufSize)
{
  return sysctlbyname(name, buf, bufSize, NULL, 0);
}
960 
// Reads a UInt32-sized sysctl value into *val.
// Returns 0 on success, EFAULT when the reported size is not 4 bytes,
// or the sysctlbyname() error code otherwise.
int z7_sysctlbyname_Get_UInt32(const char *name, UInt32 *val)
{
  size_t size = sizeof(*val);
  const int res = z7_sysctlbyname_Get(name, val, &size);
  if (res != 0)
    return res;
  return (size == sizeof(*val)) ? 0 : EFAULT;
}
969 
970 #endif
971