1 #include <stdbool.h>
2 #include <stddef.h>
3 #include <stdint.h>
4
5 #include "blake3_impl.h"
6
7 #if defined(_MSC_VER)
8 #include <Windows.h>
9 #endif
10
11 #if defined(IS_X86)
12 #if defined(_MSC_VER)
13 #include <intrin.h>
14 #elif defined(__GNUC__)
15 #include <immintrin.h>
16 #else
17 #undef IS_X86 /* Unimplemented! */
18 #endif
19 #endif
20
/*
 * BLAKE3_ATOMICS selects how the cached CPU-feature word is accessed.
 * The build may predefine it to 0 or 1. Otherwise it defaults to 1 when the
 * compiler supports __has_include, <stdatomic.h> is available and the
 * compiler is not MSVC, and to 0 in every other case.
 */
#if !defined(BLAKE3_ATOMICS)
#if defined(__has_include)
#if __has_include(<stdatomic.h>) && !defined(_MSC_VER)
#define BLAKE3_ATOMICS 1
#else
#define BLAKE3_ATOMICS 0
#endif /* __has_include(<stdatomic.h>) && !defined(_MSC_VER) */
#else
#define BLAKE3_ATOMICS 0
#endif /* defined(__has_include) */
#endif /* BLAKE3_ATOMICS */

/*
 * ATOMIC_INT          - type of the shared feature word.
 * ATOMIC_LOAD(x)      - read x.
 * ATOMIC_STORE(x, y)  - write y into x.
 *
 * Arguments and expansions are fully parenthesized so the macros behave like
 * ordinary expressions wherever they are used. Do not rely on the value of
 * ATOMIC_STORE(): the assignment forms yield the stored value, whereas
 * InterlockedExchange yields the previous one.
 */
#if BLAKE3_ATOMICS
/* Plain reads and assignments of an _Atomic object are atomic in C11. */
#define ATOMIC_INT _Atomic int
#define ATOMIC_LOAD(x) (x)
#define ATOMIC_STORE(x, y) ((x) = (y))
#elif defined(_MSC_VER)
#define ATOMIC_INT LONG
#define ATOMIC_LOAD(x) InterlockedOr(&(x), 0)
#define ATOMIC_STORE(x, y) InterlockedExchange(&(x), (y))
#else
/* No atomics available: fall back to an ordinary int. */
#define ATOMIC_INT int
#define ATOMIC_LOAD(x) (x)
#define ATOMIC_STORE(x, y) ((x) = (y))
#endif

/* Marks a variable as intentionally unused to silence compiler warnings. */
#define MAYBE_UNUSED(x) (void)((x))
48
49 #if defined(IS_X86)
/*
 * Executes XGETBV with register index 0 and returns the 64-bit result
 * (per the Intel SDM this is XCR0, the OS-enabled extended-state mask).
 */
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t lo = 0;
  uint32_t hi = 0;
  /* ECX carries the register index; the value comes back in EDX:EAX. */
  __asm__ __volatile__("xgetbv\n" : "=a"(lo), "=d"(hi) : "c"(0));
  const uint64_t upper = (uint64_t)hi << 32;
  return upper | lo;
#endif
}
59
/*
 * Runs CPUID for leaf `id` and stores EAX, EBX, ECX, EDX into out[0..3],
 * in that order.
 */
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#else
  uint32_t a = 0, b = 0, c = 0, d = 0;
#if defined(__i386__) || defined(_M_IX86)
  /* 32-bit: EBX is saved into a scratch register and swapped back after
   * CPUID, so the compiler never sees EBX itself as an asm output. */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(a), "=r"(b), "=c"(c), "=d"(d)
                       : "a"(id));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                       : "a"(id));
#endif
  out[0] = a;
  out[1] = b;
  out[2] = c;
  out[3] = d;
#endif
}
75
/*
 * Runs CPUID for leaf `id` with sub-leaf `sid` (passed in ECX) and stores
 * EAX, EBX, ECX, EDX into out[0..3], in that order.
 */
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#else
  uint32_t a = 0, b = 0, c = 0, d = 0;
#if defined(__i386__) || defined(_M_IX86)
  /* 32-bit: EBX is saved into a scratch register and swapped back after
   * CPUID, so the compiler never sees EBX itself as an asm output. */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(a), "=r"(b), "=c"(c), "=d"(d)
                       : "a"(id), "c"(sid));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                       : "a"(id), "c"(sid));
#endif
  out[0] = a;
  out[1] = b;
  out[2] = c;
  out[3] = d;
#endif
}
91
92 #endif
93
/*
 * Bit flags describing the SIMD instruction sets usable on this CPU.
 * Values are OR-ed together into a single feature word; UNDEFINED marks a
 * feature word that has not been computed yet.
 */
enum cpu_feature {
  SSE2 = 0x01,
  SSSE3 = 0x02,
  SSE41 = 0x04,
  AVX = 0x08,
  AVX2 = 0x10,
  AVX512F = 0x20,
  AVX512VL = 0x40,
  /* ... */
  UNDEFINED = 0x40000000
};
105
106 #if !defined(BLAKE3_TESTING)
107 static /* Allow the variable to be controlled manually for testing */
108 #endif
109 ATOMIC_INT g_cpu_features = UNDEFINED;
110
111 #if !defined(BLAKE3_TESTING)
112 static
113 #endif
114 enum cpu_feature
get_cpu_features(void)115 get_cpu_features(void) {
116
117 /* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */
118 enum cpu_feature features = ATOMIC_LOAD(g_cpu_features);
119 if (features != UNDEFINED) {
120 return features;
121 } else {
122 #if defined(IS_X86)
123 uint32_t regs[4] = {0};
124 uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3];
125 (void)edx;
126 features = 0;
127 cpuid(regs, 0);
128 const int max_id = *eax;
129 cpuid(regs, 1);
130 #if defined(__amd64__) || defined(_M_X64)
131 features |= SSE2;
132 #else
133 if (*edx & (1UL << 26))
134 features |= SSE2;
135 #endif
136 if (*ecx & (1UL << 9))
137 features |= SSSE3;
138 if (*ecx & (1UL << 19))
139 features |= SSE41;
140
141 if (*ecx & (1UL << 27)) { // OSXSAVE
142 const uint64_t mask = xgetbv();
143 if ((mask & 6) == 6) { // SSE and AVX states
144 if (*ecx & (1UL << 28))
145 features |= AVX;
146 if (max_id >= 7) {
147 cpuidex(regs, 7, 0);
148 if (*ebx & (1UL << 5))
149 features |= AVX2;
150 if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
151 if (*ebx & (1UL << 31))
152 features |= AVX512VL;
153 if (*ebx & (1UL << 16))
154 features |= AVX512F;
155 }
156 }
157 }
158 }
159 ATOMIC_STORE(g_cpu_features, features);
160 return features;
161 #else
162 /* How to detect NEON? */
163 return 0;
164 #endif
165 }
166 }
167
/*
 * Runs the in-place compression function, forwarding all arguments to the
 * first implementation that is both compiled in and reported by
 * get_cpu_features(): AVX-512, then SSE4.1, then SSE2. The portable
 * implementation is the fallback.
 */
void blake3_compress_in_place(uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu);
#if !defined(BLAKE3_NO_AVX512)
  if ((cpu & AVX512VL) != 0) {
    blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
    return;
  }
#endif /* !BLAKE3_NO_AVX512 */
#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
    return;
  }
#endif /* !BLAKE3_NO_SSE41 */
#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
    return;
  }
#endif /* !BLAKE3_NO_SSE2 */
#endif /* IS_X86 */
  /* No usable SIMD implementation. */
  blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
}
196
/*
 * Runs the XOF compression function, writing its result to `out`. All
 * arguments are forwarded to the first implementation that is both compiled
 * in and reported by get_cpu_features(): AVX-512, then SSE4.1, then SSE2.
 * The portable implementation is the fallback.
 */
void blake3_compress_xof(const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags,
                         uint8_t out[64]) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu);
#if !defined(BLAKE3_NO_AVX512)
  if ((cpu & AVX512VL) != 0) {
    blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
    return;
  }
#endif /* !BLAKE3_NO_AVX512 */
#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
    return;
  }
#endif /* !BLAKE3_NO_SSE41 */
#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
    return;
  }
#endif /* !BLAKE3_NO_SSE2 */
#endif /* IS_X86 */
  /* No usable SIMD implementation. */
  blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
}
225
/*
 * Hashes many inputs by forwarding all arguments to the widest available
 * implementation. On x86 the order is AVX-512 (requires both AVX512F and
 * AVX512VL), AVX2, SSE4.1, SSE2, each only if compiled in and reported by
 * get_cpu_features(). If none is taken, the NEON implementation is used when
 * BLAKE3_USE_NEON == 1, otherwise the portable one.
 */
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu);
#if !defined(BLAKE3_NO_AVX512)
  {
    const int avx512_needed = AVX512F | AVX512VL;
    if ((cpu & avx512_needed) == avx512_needed) {
      blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                              increment_counter, flags, flags_start,
                              flags_end, out);
      return;
    }
  }
#endif /* !BLAKE3_NO_AVX512 */
#if !defined(BLAKE3_NO_AVX2)
  if ((cpu & AVX2) != 0) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif /* !BLAKE3_NO_AVX2 */
#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif /* !BLAKE3_NO_SSE41 */
#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif /* !BLAKE3_NO_SSE2 */
#endif /* IS_X86 */

#if BLAKE3_USE_NEON == 1
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
#else
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
#endif
}
277
// The dynamically detected SIMD degree of the current platform.
// Returns 16 for AVX-512 (AVX512F and AVX512VL both present), 8 for AVX2,
// 4 for SSE4.1, SSE2 or NEON, and 1 when no SIMD implementation applies.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu);
#if !defined(BLAKE3_NO_AVX512)
  {
    const int avx512_needed = AVX512F | AVX512VL;
    if ((cpu & avx512_needed) == avx512_needed) {
      return 16;
    }
  }
#endif /* !BLAKE3_NO_AVX512 */
#if !defined(BLAKE3_NO_AVX2)
  if ((cpu & AVX2) != 0) {
    return 8;
  }
#endif /* !BLAKE3_NO_AVX2 */
#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    return 4;
  }
#endif /* !BLAKE3_NO_SSE41 */
#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    return 4;
  }
#endif /* !BLAKE3_NO_SSE2 */
#endif /* IS_X86 */
#if BLAKE3_USE_NEON == 1
  return 4;
#else
  return 1;
#endif
}
309