xref: /aosp_15_r20/external/mesa3d/src/util/blake3/blake3_dispatch.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 #include <stdbool.h>
2 #include <stddef.h>
3 #include <stdint.h>
4 
5 #include "blake3_impl.h"
6 
7 #if defined(_MSC_VER)
8 #include <Windows.h>
9 #endif
10 
11 #if defined(IS_X86)
12 #if defined(_MSC_VER)
13 #include <intrin.h>
14 #elif defined(__GNUC__)
15 #include <immintrin.h>
16 #else
17 #undef IS_X86 /* Unimplemented! */
18 #endif
19 #endif
20 
/*
 * Decide whether the cached CPU-feature word uses C11 atomics.
 * A build may force the choice by predefining BLAKE3_ATOMICS. Otherwise it is
 * switched on only when the compiler understands __has_include, finds
 * <stdatomic.h>, and is not MSVC; in every other case it defaults to 0.
 */
#if !defined(BLAKE3_ATOMICS)
#if defined(__has_include)
#if __has_include(<stdatomic.h>) && !defined(_MSC_VER)
#define BLAKE3_ATOMICS 1
#endif
#endif
#if !defined(BLAKE3_ATOMICS)
#define BLAKE3_ATOMICS 0
#endif
#endif /* BLAKE3_ATOMICS */
32 
/*
 * Load/store wrappers for the cached CPU-feature word.
 *
 *  - BLAKE3_ATOMICS != 0: the word is an `_Atomic int`, accessed with plain
 *    reads and assignments.
 *  - MSVC without BLAKE3_ATOMICS: the word is a LONG accessed through the
 *    Interlocked* intrinsics.
 *  - Otherwise: a plain `int` with no synchronization at all.
 *
 * Arguments and expansions are fully parenthesized so the macros behave like
 * single expressions regardless of the operators around them (for example,
 * `ATOMIC_STORE(v, 7) + 1` stores 7, not 8).
 */
#if BLAKE3_ATOMICS
#define ATOMIC_INT _Atomic int
#define ATOMIC_LOAD(x) (x)
#define ATOMIC_STORE(x, y) ((x) = (y))
#elif defined(_MSC_VER)
#define ATOMIC_INT LONG
#define ATOMIC_LOAD(x) InterlockedOr(&(x), 0)
#define ATOMIC_STORE(x, y) InterlockedExchange(&(x), (y))
#else
#define ATOMIC_INT int
#define ATOMIC_LOAD(x) (x)
#define ATOMIC_STORE(x, y) ((x) = (y))
#endif

/* Evaluates x and discards the result, silencing unused-variable warnings. */
#define MAYBE_UNUSED(x) (void)((x))
48 
49 #if defined(IS_X86)
/*
 * Executes XGETBV with ECX = 0 and returns the 64-bit result (EDX:EAX).
 * The only caller in this file checks the CPUID OSXSAVE bit before calling.
 */
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t lo = 0;
  uint32_t hi = 0;
  /* The low half arrives in EAX, the high half in EDX. */
  __asm__ __volatile__("xgetbv\n" : "=a"(lo), "=d"(hi) : "c"(0));
  const uint64_t high_half = (uint64_t)hi << 32;
  return high_half | lo;
#endif
}
59 
/*
 * Executes CPUID for leaf `id` and writes EAX, EBX, ECX, EDX (in that order)
 * to out[0..3].
 */
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#else
  uint32_t a;
  uint32_t b;
  uint32_t c;
  uint32_t d;
#if defined(__i386__) || defined(_M_IX86)
  /*
   * On 32-bit x86 EBX is not named as an asm output: its old value is parked
   * in a scratch register before CPUID and swapped back afterwards, leaving
   * the CPUID EBX result in the scratch register.
   * NOTE(review): presumably because EBX may be reserved (e.g. PIC) -- confirm.
   */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(a), "=r"(b), "=c"(c), "=d"(d)
                       : "a"(id));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                       : "a"(id));
#endif
  out[0] = a;
  out[1] = b;
  out[2] = c;
  out[3] = d;
#endif
}
75 
/*
 * Executes CPUID for leaf `id` with sub-leaf `sid` (passed in ECX) and writes
 * EAX, EBX, ECX, EDX (in that order) to out[0..3].
 */
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#else
  uint32_t a;
  uint32_t b;
  uint32_t c;
  uint32_t d;
#if defined(__i386__) || defined(_M_IX86)
  /*
   * On 32-bit x86 EBX is not named as an asm output: its old value is parked
   * in a scratch register before CPUID and swapped back afterwards, leaving
   * the CPUID EBX result in the scratch register.
   * NOTE(review): presumably because EBX may be reserved (e.g. PIC) -- confirm.
   */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(a), "=r"(b), "=c"(c), "=d"(d)
                       : "a"(id), "c"(sid));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                       : "a"(id), "c"(sid));
#endif
  out[0] = a;
  out[1] = b;
  out[2] = c;
  out[3] = d;
#endif
}
91 
92 #endif
93 
/*
 * Bit flags describing the SIMD instruction sets found at runtime.
 * Flags are OR-ed together into one word; UNDEFINED is the sentinel stored in
 * that word before detection has run.
 */
enum cpu_feature {
  SSE2 = 0x01,
  SSSE3 = 0x02,
  SSE41 = 0x04,
  AVX = 0x08,
  AVX2 = 0x10,
  AVX512F = 0x20,
  AVX512VL = 0x40,
  /* ... */
  UNDEFINED = 0x40000000
};
105 
106 #if !defined(BLAKE3_TESTING)
107 static /* Allow the variable to be controlled manually for testing */
108 #endif
109     ATOMIC_INT g_cpu_features = UNDEFINED;
110 
111 #if !defined(BLAKE3_TESTING)
112 static
113 #endif
114     enum cpu_feature
get_cpu_features(void)115     get_cpu_features(void) {
116 
117   /* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */
118   enum cpu_feature features = ATOMIC_LOAD(g_cpu_features);
119   if (features != UNDEFINED) {
120     return features;
121   } else {
122 #if defined(IS_X86)
123     uint32_t regs[4] = {0};
124     uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
125     (void)edx;
126     features = 0;
127     cpuid(regs, 0);
128     const int max_id = *eax;
129     cpuid(regs, 1);
130 #if defined(__amd64__) || defined(_M_X64)
131     features |= SSE2;
132 #else
133     if (*edx & (1UL << 26))
134       features |= SSE2;
135 #endif
136     if (*ecx & (1UL << 9))
137       features |= SSSE3;
138     if (*ecx & (1UL << 19))
139       features |= SSE41;
140 
141     if (*ecx & (1UL << 27)) { // OSXSAVE
142       const uint64_t mask = xgetbv();
143       if ((mask & 6) == 6) { // SSE and AVX states
144         if (*ecx & (1UL << 28))
145           features |= AVX;
146         if (max_id >= 7) {
147           cpuidex(regs, 7, 0);
148           if (*ebx & (1UL << 5))
149             features |= AVX2;
150           if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
151             if (*ebx & (1UL << 31))
152               features |= AVX512VL;
153             if (*ebx & (1UL << 16))
154               features |= AVX512F;
155           }
156         }
157       }
158     }
159     ATOMIC_STORE(g_cpu_features, features);
160     return features;
161 #else
162     /* How to detect NEON? */
163     return 0;
164 #endif
165   }
166 }
167 
/*
 * Runs the compression function in place on `cv` using the first
 * implementation, in the order below, that is compiled in and supported by
 * the CPU: AVX-512 (selected on the AVX512VL flag), SSE4.1, SSE2, and finally
 * the portable fallback. All arguments are forwarded unchanged.
 */
void blake3_compress_in_place(uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags) {
#if defined(IS_X86)
  const enum cpu_feature caps = get_cpu_features();
  /* Every branch below may be compiled out by the BLAKE3_NO_* switches. */
  MAYBE_UNUSED(caps);

#if !defined(BLAKE3_NO_AVX512)
  if ((caps & AVX512VL) != 0) {
    blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((caps & SSE41) != 0) {
    blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((caps & SSE2) != 0) {
    blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
    return;
  }
#endif
#endif /* IS_X86 */

  blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
}
196 
/*
 * Runs the compression function and writes the 64-byte result to `out`,
 * using the first implementation, in the order below, that is compiled in and
 * supported by the CPU: AVX-512 (selected on the AVX512VL flag), SSE4.1,
 * SSE2, and finally the portable fallback. All arguments are forwarded
 * unchanged.
 */
void blake3_compress_xof(const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags,
                         uint8_t out[64]) {
#if defined(IS_X86)
  const enum cpu_feature caps = get_cpu_features();
  /* Every branch below may be compiled out by the BLAKE3_NO_* switches. */
  MAYBE_UNUSED(caps);

#if !defined(BLAKE3_NO_AVX512)
  if ((caps & AVX512VL) != 0) {
    blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((caps & SSE41) != 0) {
    blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((caps & SSE2) != 0) {
    blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
#endif /* IS_X86 */

  blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
}
225 
/*
 * Hashes several inputs by forwarding all arguments to the first
 * implementation, in the order below, that is compiled in and supported by
 * the CPU: AVX-512 (needs both AVX512F and AVX512VL), AVX2, SSE4.1, SSE2.
 * When none of those is taken, the NEON implementation is used if
 * BLAKE3_USE_NEON == 1, otherwise the portable one.
 */
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature caps = get_cpu_features();
  /* Every branch below may be compiled out by the BLAKE3_NO_* switches. */
  MAYBE_UNUSED(caps);

#if !defined(BLAKE3_NO_AVX512)
  const int avx512_mask = AVX512F | AVX512VL;
  if ((caps & avx512_mask) == avx512_mask) {
    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_AVX2)
  if ((caps & AVX2) != 0) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((caps & SSE41) != 0) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((caps & SSE2) != 0) {
    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#endif /* IS_X86 */

#if BLAKE3_USE_NEON == 1
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
#else
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
#endif
}
277 
// The dynamically detected SIMD degree of the current platform.
// Returns 16 when both AVX512F and AVX512VL are available, 8 for AVX2, and 4
// for SSE4.1 or SSE2 (each only if not compiled out). Otherwise returns 4
// when BLAKE3_USE_NEON == 1 and 1 in every remaining case.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
  const enum cpu_feature caps = get_cpu_features();
  // Every branch below may be compiled out by the BLAKE3_NO_* switches.
  MAYBE_UNUSED(caps);

#if !defined(BLAKE3_NO_AVX512)
  const int avx512_mask = AVX512F | AVX512VL;
  if ((caps & avx512_mask) == avx512_mask) {
    return 16;
  }
#endif

#if !defined(BLAKE3_NO_AVX2)
  if ((caps & AVX2) != 0) {
    return 8;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((caps & SSE41) != 0) {
    return 4;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((caps & SSE2) != 0) {
    return 4;
  }
#endif
#endif /* IS_X86 */

#if BLAKE3_USE_NEON == 1
  return 4;
#else
  return 1;
#endif
}
309