1 /*
2 * xxHash - Extremely Fast Hash algorithm
3 * Header File
4 * Copyright (C) 2012-2024 Yann Collet
5 *
6 * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions are
10 * met:
11 *
12 * * Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * * Redistributions in binary form must reproduce the above
15 * copyright notice, this list of conditions and the following disclaimer
16 * in the documentation and/or other materials provided with the
17 * distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
31 * You can contact the author at:
32 * - xxHash homepage: https://www.xxhash.com
33 * - xxHash source repository: https://github.com/Cyan4973/xxHash
34 */
35 /*!
36 * @mainpage xxHash
37 *
38 * @file xxhash.h
39 * xxHash prototypes and implementation
40 */
41 /* TODO: update */
42 /* Notice extracted from xxHash homepage:
43
44 xxHash is an extremely fast hash algorithm, running at RAM speed limits.
45 It also successfully passes all tests from the SMHasher suite.
46
Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo
@3GHz)

Name            Speed       Q.Score   Author
xxHash          5.4 GB/s     10
CrapWow         3.2 GB/s      2       Andrew
MurmurHash 3a   2.7 GB/s     10       Austin Appleby
SpookyHash      2.0 GB/s     10       Bob Jenkins
SBox            1.4 GB/s      9       Bret Mulvey
Lookup3         1.2 GB/s      9       Bob Jenkins
SuperFastHash   1.2 GB/s      1       Paul Hsieh
CityHash64      1.05 GB/s    10       Pike & Alakuijala
FNV             0.55 GB/s     5       Fowler, Noll, Vo
CRC32           0.43 GB/s     9
MD5-32          0.33 GB/s    10       Ronald L. Rivest
SHA1-32         0.28 GB/s    10
63
64 Q.Score is a measure of quality of the hash function.
65 It depends on successfully passing SMHasher test set.
66 10 is a perfect score.
67
68 Note: SMHasher's CRC32 implementation is not the fastest one.
69 Other speed-oriented implementations can be faster,
70 especially in combination with PCLMUL instruction:
71 https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735
72
73 A 64-bit version, named XXH64, is available since r35.
74 It offers much better speed, but for 64-bit applications only.
Name     Speed on 64 bits    Speed on 32 bits
XXH64       13.8 GB/s            1.9 GB/s
XXH32        6.8 GB/s            6.0 GB/s
78 */
79
80 #if defined(__cplusplus)
81 extern "C" {
82
83 #endif
84
85 /* ****************************
86 * INLINE mode
87 ******************************/
88 /*!
89 * XXH_INLINE_ALL (and XXH_PRIVATE_API)
90 * Use these build macros to inline xxhash into the target unit.
91 * Inlining improves performance on small inputs, especially when the length is
92 * expressed as a compile-time constant:
93 *
94 * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
95 *
96 * It also keeps xxHash symbols private to the unit, so they are not exported.
97 *
98 * Usage:
99 * #define XXH_INLINE_ALL
100 * #include "xxhash.h"
101 *
102 * Do not compile and link xxhash.o as a separate object, as it is not useful.
103 */
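/*
 * Illustrative sketch (not part of the library): a translation unit that
 * inlines xxHash privately. The file name and the helper below are made up
 * for the example.
 *
 *     // my_component.c
 *     #define XXH_INLINE_ALL
 *     #include "xxhash.h"
 *
 *     static size_t bucket_of(const char* key, size_t len, size_t nb_buckets) {
 *         return (size_t)(XXH3_64bits(key, len) % nb_buckets);  // inlined, not exported
 *     }
 */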
104 #if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) && \
105 !defined(XXH_INLINE_ALL_31684351384)
106 /* this section should be traversed only once */
107 #define XXH_INLINE_ALL_31684351384
108 /* give access to the advanced API, required to compile implementations */
109 #undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */
110 #define XXH_STATIC_LINKING_ONLY
111 /* make all functions private */
112 #undef XXH_PUBLIC_API
113 #if defined(__GNUC__)
114 #define XXH_PUBLIC_API static __inline __attribute__((unused))
115 #elif defined(__cplusplus) || \
116 (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
117 #define XXH_PUBLIC_API static inline
118 #elif defined(_MSC_VER)
119 #define XXH_PUBLIC_API static __inline
120 #else
121 /* note: this version may generate warnings for unused static functions */
122 #define XXH_PUBLIC_API static
123 #endif
124
125 /*
126 * This part deals with the special case where a unit wants to inline xxHash,
127 * but "xxhash.h" has previously been included without XXH_INLINE_ALL,
128 * such as part of some previously included *.h header file.
129 * Without further action, the new include would just be ignored,
130 * and functions would effectively _not_ be inlined (silent failure).
131 * The following macros solve this situation by prefixing all inlined names,
132 * avoiding naming collision with previous inclusions.
133 */
134 /* Before that, we unconditionally #undef all symbols,
135 * in case they were already defined with XXH_NAMESPACE.
136 * They will then be redefined for XXH_INLINE_ALL
137 */
138 #undef XXH_versionNumber
139 /* XXH32 */
140 #undef XXH32
141 #undef XXH32_createState
142 #undef XXH32_freeState
143 #undef XXH32_reset
144 #undef XXH32_update
145 #undef XXH32_digest
146 #undef XXH32_copyState
147 #undef XXH32_canonicalFromHash
148 #undef XXH32_hashFromCanonical
149 /* XXH64 */
150 #undef XXH64
151 #undef XXH64_createState
152 #undef XXH64_freeState
153 #undef XXH64_reset
154 #undef XXH64_update
155 #undef XXH64_digest
156 #undef XXH64_copyState
157 #undef XXH64_canonicalFromHash
158 #undef XXH64_hashFromCanonical
159 /* XXH3_64bits */
160 #undef XXH3_64bits
161 #undef XXH3_64bits_withSecret
162 #undef XXH3_64bits_withSeed
163 #undef XXH3_createState
164 #undef XXH3_freeState
165 #undef XXH3_copyState
166 #undef XXH3_64bits_reset
167 #undef XXH3_64bits_reset_withSeed
168 #undef XXH3_64bits_reset_withSecret
169 #undef XXH3_64bits_update
170 #undef XXH3_64bits_digest
171 #undef XXH3_generateSecret
172 /* XXH3_128bits */
173 #undef XXH128
174 #undef XXH3_128bits
175 #undef XXH3_128bits_withSeed
176 #undef XXH3_128bits_withSecret
177 #undef XXH3_128bits_reset
178 #undef XXH3_128bits_reset_withSeed
179 #undef XXH3_128bits_reset_withSecret
180 #undef XXH3_128bits_update
181 #undef XXH3_128bits_digest
182 #undef XXH128_isEqual
183 #undef XXH128_cmp
184 #undef XXH128_canonicalFromHash
185 #undef XXH128_hashFromCanonical
186 /* Finally, free the namespace itself */
187 #undef XXH_NAMESPACE
188
189 /* employ the namespace for XXH_INLINE_ALL */
190 #define XXH_NAMESPACE XXH_INLINE_
191 /*
192 * Some identifiers (enums, type names) are not symbols,
193 * but they must nonetheless be renamed to avoid redeclaration.
194 * Alternative solution: do not redeclare them.
195 * However, this requires some #ifdefs, and has a more dispersed impact.
196 * Meanwhile, renaming can be achieved in a single place.
197 */
198 #define XXH_IPREF(Id) XXH_NAMESPACE##Id
199 #define XXH_OK XXH_IPREF(XXH_OK)
200 #define XXH_ERROR XXH_IPREF(XXH_ERROR)
201 #define XXH_errorcode XXH_IPREF(XXH_errorcode)
202 #define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t)
203 #define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t)
204 #define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t)
205 #define XXH32_state_s XXH_IPREF(XXH32_state_s)
206 #define XXH32_state_t XXH_IPREF(XXH32_state_t)
207 #define XXH64_state_s XXH_IPREF(XXH64_state_s)
208 #define XXH64_state_t XXH_IPREF(XXH64_state_t)
209 #define XXH3_state_s XXH_IPREF(XXH3_state_s)
210 #define XXH3_state_t XXH_IPREF(XXH3_state_t)
211 #define XXH128_hash_t XXH_IPREF(XXH128_hash_t)
212 /* Ensure the header is parsed again, even if it was previously included */
213 #undef XXHASH_H_5627135585666179
214 #undef XXHASH_H_STATIC_13879238742
215 #endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
216
217 /* ****************************************************************
218 * Stable API
219 *****************************************************************/
220 #ifndef XXHASH_H_5627135585666179
221 #define XXHASH_H_5627135585666179 1
222
223 /*!
224 * @defgroup public Public API
225 * Contains details on the public xxHash functions.
226 * @{
227
228 */
229 /* specific declaration modes for Windows */
230 #if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
231 #if defined(WIN32) && defined(_MSC_VER) && \
232 (defined(XXH_IMPORT) || defined(XXH_EXPORT))
233 #ifdef XXH_EXPORT
234 #define XXH_PUBLIC_API __declspec(dllexport)
235 #elif XXH_IMPORT
236 #define XXH_PUBLIC_API __declspec(dllimport)
237 #endif
238 #else
239 #define XXH_PUBLIC_API /* do nothing */
240 #endif
241 #endif
242
243 #ifdef XXH_DOXYGEN
244 /*!
245 * @brief Emulate a namespace by transparently prefixing all symbols.
246 *
247 * If you want to include _and expose_ xxHash functions from within your own
248 * library, but also want to avoid symbol collisions with other libraries
249 * which may also include xxHash, you can use XXH_NAMESPACE to automatically
250 * prefix any public symbol from xxhash library with the value of
251 * XXH_NAMESPACE (therefore, avoid empty or numeric values).
252 *
253 * Note that no change is required within the calling program as long as it
254 * includes `xxhash.h`: Regular symbol names will be automatically
255 * translated by this header.
256 */
257 #define XXH_NAMESPACE /* YOUR NAME HERE */
258 #undef XXH_NAMESPACE
259 #endif
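/*
 * Illustrative sketch, assuming a hypothetical prefix `mylib_`: the library
 * unit is built with a namespace, while calling code keeps using the regular
 * names, which this header rewrites via macros.
 *
 *     // build the implementation with: -DXXH_NAMESPACE=mylib_
 *     // the exported symbol is then mylib_XXH32, not XXH32
 *
 *     #include <string.h>
 *     #include "xxhash.h"               // compiled with the same -D flag
 *
 *     static XXH32_hash_t hash_msg(const char* msg) {
 *         return XXH32(msg, strlen(msg), 0);   // expands to mylib_XXH32(...)
 *     }
 */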
260
261 #ifdef XXH_NAMESPACE
262 #define XXH_CAT(A, B) A##B
263 #define XXH_NAME2(A, B) XXH_CAT(A, B)
264 #define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
265 /* XXH32 */
266 #define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
267 #define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
268 #define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
269 #define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
270 #define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
271 #define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
272 #define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
273 #define XXH32_canonicalFromHash \
274 XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
275 #define XXH32_hashFromCanonical \
276 XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
277 /* XXH64 */
278 #define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
279 #define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
280 #define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
281 #define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
282 #define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
283 #define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
284 #define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
285 #define XXH64_canonicalFromHash \
286 XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
287 #define XXH64_hashFromCanonical \
288 XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
289 /* XXH3_64bits */
290 #define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
291 #define XXH3_64bits_withSecret \
292 XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
293 #define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
294 #define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
295 #define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
296 #define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
297 #define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
298 #define XXH3_64bits_reset_withSeed \
299 XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
300 #define XXH3_64bits_reset_withSecret \
301 XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
302 #define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
303 #define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
304 #define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
305 /* XXH3_128bits */
306 #define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
307 #define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
308 #define XXH3_128bits_withSeed \
309 XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
310 #define XXH3_128bits_withSecret \
311 XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
312 #define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
313 #define XXH3_128bits_reset_withSeed \
314 XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
315 #define XXH3_128bits_reset_withSecret \
316 XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
317 #define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
318 #define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
319 #define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
320 #define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
321 #define XXH128_canonicalFromHash \
322 XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
323 #define XXH128_hashFromCanonical \
324 XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
325 #endif
326
327 /* *************************************
328 * Version
329 ***************************************/
330 #define XXH_VERSION_MAJOR 0
331 #define XXH_VERSION_MINOR 8
332 #define XXH_VERSION_RELEASE 1
333 #define XXH_VERSION_NUMBER \
334 (XXH_VERSION_MAJOR * 100 * 100 + XXH_VERSION_MINOR * 100 + \
335 XXH_VERSION_RELEASE)
336
337 /*!
338 * @brief Obtains the xxHash version.
339 *
340 * This is only useful when xxHash is compiled as a shared library, as it is
341 * independent of the version defined in the header.
342 *
 * @return `XXH_VERSION_NUMBER` as of when the library was compiled.
344 */
345 XXH_PUBLIC_API unsigned XXH_versionNumber(void);
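/*
 * Illustrative sketch (not library code): decoding the packed value returned
 * by XXH_versionNumber(), using the XXH_VERSION_NUMBER layout defined above.
 *
 *     #include <stdio.h>
 *     #include "xxhash.h"
 *
 *     static void print_runtime_version(void) {
 *         unsigned const v = XXH_versionNumber();
 *         printf("xxHash %u.%u.%u\n", v / (100 * 100), (v / 100) % 100, v % 100);
 *     }
 */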
346
347 /* ****************************
348 * Definitions
349 ******************************/
350 #include <stddef.h> /* size_t */
351 typedef enum { XXH_OK = 0, XXH_ERROR } XXH_errorcode;
352
353 /*-**********************************************************************
354 * 32-bit hash
355 ************************************************************************/
356 #if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */
357 /*!
358 * @brief An unsigned 32-bit integer.
359 *
360 * Not necessarily defined to `uint32_t` but functionally equivalent.
361 */
362 typedef uint32_t XXH32_hash_t;
363
364 #elif !defined(__VMS) && \
365 (defined(__cplusplus) || \
366 (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */))
367 #include <stdint.h>
368 typedef uint32_t XXH32_hash_t;
369
370 #else
371 #include <limits.h>
372 #if UINT_MAX == 0xFFFFFFFFUL
373 typedef unsigned int XXH32_hash_t;
374 #else
375 #if ULONG_MAX == 0xFFFFFFFFUL
376 typedef unsigned long XXH32_hash_t;
377 #else
378 #error "unsupported platform: need a 32-bit type"
379 #endif
380 #endif
381 #endif
382
383 /*!
384 * @}
385 *
386 * @defgroup xxh32_family XXH32 family
387 * @ingroup public
388 * Contains functions used in the classic 32-bit xxHash algorithm.
389 *
390 * @note
391 * XXH32 is considered rather weak by today's standards.
392 * The @ref xxh3_family provides competitive speed for both 32-bit and 64-bit
393 * systems, and offers true 64/128 bit hash results. It provides a superior
394 * level of dispersion, and greatly reduces the risks of collisions.
395 *
396 * @see @ref xxh64_family, @ref xxh3_family : Other xxHash families
397 * @see @ref xxh32_impl for implementation details
398 * @{
399
400 */
401
402 /*!
403 * @brief Calculates the 32-bit hash of @p input using xxHash32.
404 *
405 * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
406 *
407 * @param input The block of data to be hashed, at least @p length bytes in
408 * size.
409 * @param length The length of @p input, in bytes.
410 * @param seed The 32-bit seed to alter the hash's output predictably.
411 *
412 * @pre
413 * The memory between @p input and @p input + @p length must be valid,
414 * readable, contiguous memory. However, if @p length is `0`, @p input may be
415 * `NULL`. In C++, this also must be *TriviallyCopyable*.
416 *
417 * @return The calculated 32-bit hash value.
418 *
419 * @see
420 * XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
421 * Direct equivalents for the other variants of xxHash.
422 * @see
423 * XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version.
424 */
425 XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t length,
426 XXH32_hash_t seed);
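/*
 * Illustrative sketch: one-shot hashing of a memory buffer with XXH32().
 * The helper name is made up; the seed value is arbitrary (0 is the
 * conventional default).
 *
 *     #include <string.h>
 *     #include "xxhash.h"
 *
 *     static XXH32_hash_t hash_string32(const char* str) {
 *         return XXH32(str, strlen(str), 0);   // seed = 0
 *     }
 */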
427
428 /*!
429 * Streaming functions generate the xxHash value from an incremental input.
430 * This method is slower than single-call functions, due to state management.
431 * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
432 *
433 * An XXH state must first be allocated using `XXH*_createState()`.
434 *
435 * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
436 *
437 * Then, feed the hash state by calling `XXH*_update()` as many times as
438 * necessary.
439 *
440 * The function returns an error code, with 0 meaning OK, and any other value
441 * meaning there is an error.
442 *
443 * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
 * This function returns the nn-bit hash as an int or long long.
445 *
446 * It's still possible to continue inserting input into the hash state after a
447 * digest, and generate new hash values later on by invoking `XXH*_digest()`.
448 *
449 * When done, release the state using `XXH*_freeState()`.
450 *
451 * Example code for incrementally hashing a file:
452 * @code{.c}
 * #include <stdio.h>
 * #include <assert.h>
 * #include <xxhash.h>
455 * #define BUFFER_SIZE 256
456 *
457 * // Note: XXH64 and XXH3 use the same interface.
458 * XXH32_hash_t
459 * hashFile(FILE* stream)
460 * {
461
462 * XXH32_state_t* state;
463 * unsigned char buf[BUFFER_SIZE];
464 * size_t amt;
465 * XXH32_hash_t hash;
466 *
467 * state = XXH32_createState(); // Create a state
468 * assert(state != NULL); // Error check here
469 * XXH32_reset(state, 0xbaad5eed); // Reset state with our seed
470 * while ((amt = fread(buf, 1, sizeof(buf), stream)) != 0) {
471
472 * XXH32_update(state, buf, amt); // Hash the file in chunks
473 * }
474 * hash = XXH32_digest(state); // Finalize the hash
475 * XXH32_freeState(state); // Clean up
476 * return hash;
477 * }
478 * @endcode
479 */
480
481 /*!
482 * @typedef struct XXH32_state_s XXH32_state_t
483 * @brief The opaque state struct for the XXH32 streaming API.
484 *
485 * @see XXH32_state_s for details.
486 */
487 typedef struct XXH32_state_s XXH32_state_t;
488
489 /*!
490 * @brief Allocates an @ref XXH32_state_t.
491 *
492 * Must be freed with XXH32_freeState().
493 * @return An allocated XXH32_state_t on success, `NULL` on failure.
494 */
495 XXH_PUBLIC_API XXH32_state_t *XXH32_createState(void);
496 /*!
497 * @brief Frees an @ref XXH32_state_t.
498 *
499 * Must be allocated with XXH32_createState().
500 * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref
501 * XXH32_createState().
502 * @return XXH_OK.
503 */
504 XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr);
505 /*!
506 * @brief Copies one @ref XXH32_state_t to another.
507 *
508 * @param dst_state The state to copy to.
509 * @param src_state The state to copy from.
510 * @pre
511 * @p dst_state and @p src_state must not be `NULL` and must not overlap.
512 */
513 XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t *dst_state,
514 const XXH32_state_t *src_state);
515
516 /*!
517 * @brief Resets an @ref XXH32_state_t to begin a new hash.
518 *
519 * This function resets and seeds a state. Call it before @ref XXH32_update().
520 *
521 * @param statePtr The state struct to reset.
522 * @param seed The 32-bit seed to alter the hash result predictably.
523 *
524 * @pre
525 * @p statePtr must not be `NULL`.
526 *
527 * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
528 */
529 XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr,
530 XXH32_hash_t seed);
531
532 /*!
533 * @brief Consumes a block of @p input to an @ref XXH32_state_t.
534 *
535 * Call this to incrementally consume blocks of data.
536 *
537 * @param statePtr The state struct to update.
538 * @param input The block of data to be hashed, at least @p length bytes in
539 * size.
540 * @param length The length of @p input, in bytes.
541 *
542 * @pre
543 * @p statePtr must not be `NULL`.
544 * @pre
545 * The memory between @p input and @p input + @p length must be valid,
546 * readable, contiguous memory. However, if @p length is `0`, @p input may be
547 * `NULL`. In C++, this also must be *TriviallyCopyable*.
548 *
549 * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
550 */
551 XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *statePtr,
552 const void *input, size_t length);
553
554 /*!
555 * @brief Returns the calculated hash value from an @ref XXH32_state_t.
556 *
557 * @note
558 * Calling XXH32_digest() will not affect @p statePtr, so you can update,
559 * digest, and update again.
560 *
561 * @param statePtr The state struct to calculate the hash from.
562 *
563 * @pre
564 * @p statePtr must not be `NULL`.
565 *
566 * @return The calculated xxHash32 value from that state.
567 */
568 XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *statePtr);
569
570 /******* Canonical representation *******/
571
572 /*
573 * The default return values from XXH functions are unsigned 32 and 64 bit
574 * integers.
 * This is the simplest and fastest format for further post-processing.
576 *
577 * However, this leaves open the question of what is the order on the byte
578 * level, since little and big endian conventions will store the same number
579 * differently.
580 *
581 * The canonical representation settles this issue by mandating big-endian
582 * convention, the same convention as human-readable numbers (large digits
583 * first).
584 *
585 * When writing hash values to storage, sending them over a network, or printing
586 * them, it's highly recommended to use the canonical representation to ensure
587 * portability across a wider range of systems, present and future.
588 *
589 * The following functions allow transformation of hash values to and from
590 * canonical format.
591 */
592
593 /*!
594 * @brief Canonical (big endian) representation of @ref XXH32_hash_t.
595 */
596 typedef struct {
597
598 unsigned char digest[4]; /*!< Hash bytes, big endian */
599
600 } XXH32_canonical_t;
601
602 /*!
603 * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.
604 *
605 * @param dst The @ref XXH32_canonical_t pointer to be stored to.
606 * @param hash The @ref XXH32_hash_t to be converted.
607 *
608 * @pre
609 * @p dst must not be `NULL`.
610 */
611 XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t *dst,
612 XXH32_hash_t hash);
613
614 /*!
615 * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.
616 *
617 * @param src The @ref XXH32_canonical_t to convert.
618 *
619 * @pre
620 * @p src must not be `NULL`.
621 *
622 * @return The converted hash.
623 */
624 XXH_PUBLIC_API XXH32_hash_t
625 XXH32_hashFromCanonical(const XXH32_canonical_t *src);
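/*
 * Illustrative sketch: round-tripping a hash through its canonical form,
 * for instance before writing it to disk or sending it over a network.
 * The FILE* handling is only sketched; error policy is the caller's choice.
 *
 *     #include <stdio.h>
 *     #include "xxhash.h"
 *
 *     static void store_hash(FILE* f, XXH32_hash_t hash) {
 *         XXH32_canonical_t c;
 *         XXH32_canonicalFromHash(&c, hash);        // always big endian
 *         fwrite(c.digest, 1, sizeof(c.digest), f);
 *     }
 *
 *     static XXH32_hash_t load_hash(FILE* f) {
 *         XXH32_canonical_t c;
 *         if (fread(c.digest, 1, sizeof(c.digest), f) != sizeof(c.digest))
 *             return 0;                             // read error: caller decides
 *         return XXH32_hashFromCanonical(&c);       // back to a native integer
 *     }
 */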
626
627 #ifdef __has_attribute
628 #define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
629 #else
630 #define XXH_HAS_ATTRIBUTE(x) 0
631 #endif
632
633 /* C-language Attributes are added in C23. */
634 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && \
635 defined(__has_c_attribute)
636 #define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
637 #else
638 #define XXH_HAS_C_ATTRIBUTE(x) 0
639 #endif
640
641 #if defined(__cplusplus) && defined(__has_cpp_attribute)
642 #define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
643 #else
644 #define XXH_HAS_CPP_ATTRIBUTE(x) 0
645 #endif
646
/*
  Define the XXH_FALLTHROUGH macro for annotating switch cases with the
  'fallthrough' attribute introduced in C++17 and C23.
  C++17: https://en.cppreference.com/w/cpp/language/attributes/fallthrough
  C23:   https://en.cppreference.com/w/c/language/attributes/fallthrough
*/
653 #if XXH_HAS_C_ATTRIBUTE(x)
654 #define XXH_FALLTHROUGH [[fallthrough]]
655 #elif XXH_HAS_CPP_ATTRIBUTE(x)
656 #define XXH_FALLTHROUGH [[fallthrough]]
657 #elif XXH_HAS_ATTRIBUTE(__fallthrough__)
658 #define XXH_FALLTHROUGH __attribute__((fallthrough))
659 #else
660 #define XXH_FALLTHROUGH
661 #endif
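/*
 * Illustrative sketch (made-up function, not library code): XXH_FALLTHROUGH
 * silences "implicit fallthrough" warnings when a case label intentionally
 * continues into the next one.
 *
 *     #include <stddef.h>
 *
 *     static unsigned sum_tail(const unsigned char* p, size_t n) {
 *         unsigned acc = 0;
 *         switch (n) {
 *             case 3: acc += p[2];
 *                     XXH_FALLTHROUGH;
 *             case 2: acc += p[1];
 *                     XXH_FALLTHROUGH;
 *             case 1: acc += p[0];
 *                     break;
 *         }
 *         return acc;
 *     }
 */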
662
663 /*!
664 * @}
665 * @ingroup public
666 * @{
667
668 */
669
670 #ifndef XXH_NO_LONG_LONG
671 /*-**********************************************************************
672 * 64-bit hash
673 ************************************************************************/
674 #if defined(XXH_DOXYGEN) /* don't include <stdint.h> */
675 /*!
676 * @brief An unsigned 64-bit integer.
677 *
678 * Not necessarily defined to `uint64_t` but functionally equivalent.
679 */
680 typedef uint64_t XXH64_hash_t;
681 #elif !defined(__VMS) && \
682 (defined(__cplusplus) || (defined(__STDC_VERSION__) && \
683 (__STDC_VERSION__ >= 199901L) /* C99 */))
684 #include <stdint.h>
685 typedef uint64_t XXH64_hash_t;
686 #else
687 #include <limits.h>
688 #if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL
689 /* LP64 ABI says uint64_t is unsigned long */
690 typedef unsigned long XXH64_hash_t;
691 #else
692 /* the following type must have a width of 64-bit */
693 typedef unsigned long long XXH64_hash_t;
694 #endif
695 #endif
696
697 /*!
698 * @}
699 *
700 * @defgroup xxh64_family XXH64 family
701 * @ingroup public
702 * @{
703
704 * Contains functions used in the classic 64-bit xxHash algorithm.
705 *
706 * @note
707 * XXH3 provides competitive speed for both 32-bit and 64-bit systems,
708 * and offers true 64/128 bit hash results. It provides a superior level of
709 * dispersion, and greatly reduces the risks of collisions.
710 */
711
712 /*!
713 * @brief Calculates the 64-bit hash of @p input using xxHash64.
714 *
715 * This function usually runs faster on 64-bit systems, but slower on 32-bit
716 * systems (see benchmark).
717 *
718 * @param input The block of data to be hashed, at least @p length bytes in
719 * size.
720 * @param length The length of @p input, in bytes.
721 * @param seed The 64-bit seed to alter the hash's output predictably.
722 *
723 * @pre
724 * The memory between @p input and @p input + @p length must be valid,
725 * readable, contiguous memory. However, if @p length is `0`, @p input may be
726 * `NULL`. In C++, this also must be *TriviallyCopyable*.
727 *
728 * @return The calculated 64-bit hash.
729 *
730 * @see
731 * XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
732 * Direct equivalents for the other variants of xxHash.
733 * @see
734 * XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version.
735 */
736 XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t length,
737 XXH64_hash_t seed);
738
739 /******* Streaming *******/
740 /*!
741 * @brief The opaque state struct for the XXH64 streaming API.
742 *
743 * @see XXH64_state_s for details.
744 */
745 typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */
746 XXH_PUBLIC_API XXH64_state_t *XXH64_createState(void);
747 XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t *statePtr);
748 XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t *dst_state,
749 const XXH64_state_t *src_state);
750
751 XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t *statePtr,
752 XXH64_hash_t seed);
753 XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t *statePtr,
754 const void *input, size_t length);
755 XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t *statePtr);
756
757 /******* Canonical representation *******/
758 typedef struct {
759
760 unsigned char digest[sizeof(XXH64_hash_t)];
761
762 } XXH64_canonical_t;
763
764 XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst,
765 XXH64_hash_t hash);
766 XXH_PUBLIC_API XXH64_hash_t
767 XXH64_hashFromCanonical(const XXH64_canonical_t *src);
768
769 /*!
770 * @}
771 * ************************************************************************
772 * @defgroup xxh3_family XXH3 family
773 * @ingroup public
774 * @{
775
776 *
777 * XXH3 is a more recent hash algorithm featuring:
778 * - Improved speed for both small and large inputs
779 * - True 64-bit and 128-bit outputs
780 * - SIMD acceleration
781 * - Improved 32-bit viability
782 *
783 * Speed analysis methodology is explained here:
784 *
785 * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
786 *
 * Compared to XXH64, expect XXH3 to run approximately
 * ~2x faster on large inputs and >3x faster on small ones,
 * though exact differences vary depending on the platform.
790 *
791 * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,
792 * but does not require it.
793 * Any 32-bit and 64-bit targets that can run XXH32 smoothly
794 * can run XXH3 at competitive speeds, even without vector support.
795 * Further details are explained in the implementation.
796 *
797 * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
798 * ZVector and scalar targets. This can be controlled via the XXH_VECTOR macro.
799 *
800 * XXH3 implementation is portable:
801 * it has a generic C90 formulation that can be compiled on any platform,
 * all implementations generate exactly the same hash value on all platforms.
803 * Starting from v0.8.0, it's also labelled "stable", meaning that
804 * any future version will also generate the same hash value.
805 *
806 * XXH3 offers 2 variants, _64bits and _128bits.
807 *
808 * When only 64 bits are needed, prefer invoking the _64bits variant, as it
809 * reduces the amount of mixing, resulting in faster speed on small inputs.
810 * It's also generally simpler to manipulate a scalar return type than a struct.
811 *
812 * The API supports one-shot hashing, streaming mode, and custom secrets.
813 */
814
815 /*-**********************************************************************
816 * XXH3 64-bit variant
817 ************************************************************************/
818
819 /* XXH3_64bits():
820 * default 64-bit variant, using default secret and default seed of 0.
821 * It's the fastest variant. */
822 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *data, size_t len);
823
824 /*
825 * XXH3_64bits_withSeed():
826 * This variant generates a custom secret on the fly
827 * based on default secret altered using the `seed` value.
828 * While this operation is decently fast, note that it's not completely free.
829 * Note: seed==0 produces the same results as XXH3_64bits().
830 */
831 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void *data, size_t len,
832 XXH64_hash_t seed);
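/*
 * Illustrative sketch comparing the unseeded and seeded 64-bit one-shot
 * variants. The non-zero seed below is arbitrary, chosen only for the example.
 *
 *     #include "xxhash.h"
 *
 *     static void demo_xxh3_seeds(const void* data, size_t len) {
 *         XXH64_hash_t const h_default = XXH3_64bits(data, len);
 *         XXH64_hash_t const h_seed0   = XXH3_64bits_withSeed(data, len, 0);
 *         // h_default == h_seed0 : a zero seed selects the default secret
 *         XXH64_hash_t const h_seeded  = XXH3_64bits_withSeed(data, len, 20230101);
 *         (void)h_default; (void)h_seed0; (void)h_seeded;
 *     }
 */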
833
834 /*!
835 * The bare minimum size for a custom secret.
836 *
837 * @see
838 * XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(),
839 * XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret().
840 */
841 #define XXH3_SECRET_SIZE_MIN 136
842
843 /*
844 * XXH3_64bits_withSecret():
845 * It's possible to provide any blob of bytes as a "secret" to generate the
846 * hash. This makes it more difficult for an external actor to prepare an
847 * intentional collision. The main condition is that secretSize *must* be large
848 * enough (>= XXH3_SECRET_SIZE_MIN). However, the quality of produced hash
849 * values depends on secret's entropy. Technically, the secret must look like a
850 * bunch of random bytes. Avoid "trivial" or structured data such as repeated
851 * sequences or a text document. Whenever unsure about the "randomness" of the
852 * blob of bytes, consider relabelling it as a "custom seed" instead, and employ
853 * "XXH3_generateSecret()" (see below) to generate a high entropy secret derived
854 * from the custom seed.
855 */
856 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *data, size_t len,
857 const void *secret,
858 size_t secretSize);
859
860 /******* Streaming *******/
861 /*
862 * Streaming requires state maintenance.
863 * This operation costs memory and CPU.
864 * As a consequence, streaming is slower than one-shot hashing.
865 * For better performance, prefer one-shot functions whenever applicable.
866 */
867
868 /*!
869 * @brief The state struct for the XXH3 streaming API.
870 *
871 * @see XXH3_state_s for details.
872 */
873 typedef struct XXH3_state_s XXH3_state_t;
874 XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void);
875 XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr);
876 XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t *dst_state,
877 const XXH3_state_t *src_state);
878
879 /*
880 * XXH3_64bits_reset():
881 * Initialize with default parameters.
882 * digest will be equivalent to `XXH3_64bits()`.
883 */
884 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t *statePtr);
885 /*
886 * XXH3_64bits_reset_withSeed():
887 * Generate a custom secret from `seed`, and store it into `statePtr`.
888 * digest will be equivalent to `XXH3_64bits_withSeed()`.
889 */
890 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t *statePtr,
891 XXH64_hash_t seed);
892 /*
893 * XXH3_64bits_reset_withSecret():
894 * `secret` is referenced, it _must outlive_ the hash streaming session.
895 * Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`,
896 * and the quality of produced hash values depends on secret's entropy
897 * (secret's content should look like a bunch of random bytes).
898 * When in doubt about the randomness of a candidate `secret`,
899 * consider employing `XXH3_generateSecret()` instead (see below).
900 */
901 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(
902 XXH3_state_t *statePtr, const void *secret, size_t secretSize);
903
904 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update(XXH3_state_t *statePtr,
905 const void *input,
906 size_t length);
907 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest(const XXH3_state_t *statePtr);
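/*
 * Illustrative sketch: hashing two discontiguous buffers with the XXH3
 * streaming API, equivalent to hashing their concatenation in one shot.
 * The helper name is made up for the example.
 *
 *     #include "xxhash.h"
 *
 *     static XXH64_hash_t hash_two_parts(const void* p1, size_t n1,
 *                                        const void* p2, size_t n2) {
 *         XXH64_hash_t h = 0;
 *         XXH3_state_t* const state = XXH3_createState();
 *         if (state == NULL) return 0;               // allocation failure
 *         if (XXH3_64bits_reset(state) == XXH_OK
 *             && XXH3_64bits_update(state, p1, n1) == XXH_OK
 *             && XXH3_64bits_update(state, p2, n2) == XXH_OK) {
 *             h = XXH3_64bits_digest(state);
 *         }
 *         XXH3_freeState(state);
 *         return h;
 *     }
 */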
908
909 /* note : canonical representation of XXH3 is the same as XXH64
910 * since they both produce XXH64_hash_t values */
911
912 /*-**********************************************************************
913 * XXH3 128-bit variant
914 ************************************************************************/
915
916 /*!
917 * @brief The return value from 128-bit hashes.
918 *
919 * Stored in little endian order, although the fields themselves are in native
920 * endianness.
921 */
922 typedef struct {
923
924 XXH64_hash_t low64; /*!< `value & 0xFFFFFFFFFFFFFFFF` */
925 XXH64_hash_t high64; /*!< `value >> 64` */
926
927 } XXH128_hash_t;
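/*
 * Illustrative sketch: printing an XXH128_hash_t. Emitting high64 first
 * matches the big-endian canonical representation defined further below.
 *
 *     #include <stdio.h>
 *     #include "xxhash.h"
 *
 *     static void print_hash128(XXH128_hash_t h) {
 *         printf("%016llx%016llx\n",
 *                (unsigned long long)h.high64,
 *                (unsigned long long)h.low64);
 *     }
 */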
928
929 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void *data, size_t len);
930 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void *data, size_t len,
931 XXH64_hash_t seed);
932 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void *data,
933 size_t len,
934 const void *secret,
935 size_t secretSize);
936
937 /******* Streaming *******/
938 /*
939 * Streaming requires state maintenance.
940 * This operation costs memory and CPU.
941 * As a consequence, streaming is slower than one-shot hashing.
942 * For better performance, prefer one-shot functions whenever applicable.
943 *
944 * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
945 * Use already declared XXH3_createState() and XXH3_freeState().
946 *
947 * All reset and streaming functions have same meaning as their 64-bit
948 * counterpart.
949 */
950
951 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t *statePtr);
952 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t *statePtr,
953 XXH64_hash_t seed);
954 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(
955 XXH3_state_t *statePtr, const void *secret, size_t secretSize);
956
957 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t *statePtr,
958 const void *input,
959 size_t length);
960 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest(const XXH3_state_t *statePtr);
961
/* The following helper functions make it possible to compare XXH128_hash_t
 * values. Since XXH128_hash_t is a structure, this capability is not offered
 * by the language.
965 * Note: For better performance, these functions can be inlined using
966 * XXH_INLINE_ALL */
967
968 /*!
969 * XXH128_isEqual():
970 * Return: 1 if `h1` and `h2` are equal, 0 if they are not.
971 */
972 XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
973
974 /*!
975 * XXH128_cmp():
976 *
977 * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
978 *
979 * return: >0 if *h128_1 > *h128_2
980 * =0 if *h128_1 == *h128_2
981 * <0 if *h128_1 < *h128_2
982 */
983 XXH_PUBLIC_API int XXH128_cmp(const void *h128_1, const void *h128_2);
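/*
 * Illustrative sketch: since XXH128_cmp() follows the qsort()/bsearch()
 * comparator contract, an array of 128-bit hashes can be sorted directly.
 *
 *     #include <stdlib.h>
 *     #include "xxhash.h"
 *
 *     static void sort_hashes(XXH128_hash_t* hashes, size_t count) {
 *         qsort(hashes, count, sizeof(hashes[0]), XXH128_cmp);
 *     }
 */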
984
985 /******* Canonical representation *******/
986 typedef struct {
987
988 unsigned char digest[sizeof(XXH128_hash_t)];
989
990 } XXH128_canonical_t;
991
992 XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t *dst,
993 XXH128_hash_t hash);
994 XXH_PUBLIC_API XXH128_hash_t
995 XXH128_hashFromCanonical(const XXH128_canonical_t *src);
996
997 #endif /* XXH_NO_LONG_LONG */
998
999 /*!
1000 * @}
1001 */
1002 #endif /* XXHASH_H_5627135585666179 */
1003
1004 #if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
1005 #define XXHASH_H_STATIC_13879238742
1006 /* ****************************************************************************
1007 * This section contains declarations which are not guaranteed to remain stable.
1008 * They may change in future versions, becoming incompatible with a different
1009 * version of the library.
1010 * These declarations should only be used with static linking.
1011 * Never use them in association with dynamic linking!
1012 *****************************************************************************
1013 */
1014
1015 /*
1016 * These definitions are only present to allow static allocation
1017 * of XXH states, on stack or in a struct, for example.
1018 * Never **ever** access their members directly.
1019 */
1020
1021 /*!
1022 * @internal
1023 * @brief Structure for XXH32 streaming API.
1024 *
1025 * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
1026 * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
1027 * an opaque type. This allows fields to safely be changed.
1028 *
1029 * Typedef'd to @ref XXH32_state_t.
1030 * Do not access the members of this struct directly.
1031 * @see XXH64_state_s, XXH3_state_s
1032 */
1033 struct XXH32_state_s {
1034
1035 XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */
1036 XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref
1037 total_len_32 overflow) */
1038 XXH32_hash_t v1; /*!< First accumulator lane */
1039 XXH32_hash_t v2; /*!< Second accumulator lane */
1040 XXH32_hash_t v3; /*!< Third accumulator lane */
1041 XXH32_hash_t v4; /*!< Fourth accumulator lane */
1042 XXH32_hash_t mem32[4]; /*!< Internal buffer for partial reads. Treated as
1043 unsigned char[16]. */
1044 XXH32_hash_t memsize; /*!< Amount of data in @ref mem32 */
1045 XXH32_hash_t reserved; /*!< Reserved field. Do not read or write to it, it may
1046 be removed. */
1047
1048 }; /* typedef'd to XXH32_state_t */
1049
1050 #ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */
1051
1052 /*!
1053 * @internal
1054 * @brief Structure for XXH64 streaming API.
1055 *
1056 * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
1057 * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
1058 * an opaque type. This allows fields to safely be changed.
1059 *
1060 * Typedef'd to @ref XXH64_state_t.
1061 * Do not access the members of this struct directly.
1062 * @see XXH32_state_s, XXH3_state_s
1063 */
1064 struct XXH64_state_s {
1065
1066 XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */
1067 XXH64_hash_t v1; /*!< First accumulator lane */
1068 XXH64_hash_t v2; /*!< Second accumulator lane */
1069 XXH64_hash_t v3; /*!< Third accumulator lane */
1070 XXH64_hash_t v4; /*!< Fourth accumulator lane */
1071 XXH64_hash_t mem64[4]; /*!< Internal buffer for partial reads. Treated as
1072 unsigned char[32]. */
1073 XXH32_hash_t memsize; /*!< Amount of data in @ref mem64 */
1074 XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyways*/
1075 XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it, it
1076 may be removed. */
1077
1078 }; /* typedef'd to XXH64_state_t */
1079
1080 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 \
1081 */
1082 #include <stdalign.h>
1083 #define XXH_ALIGN(n) alignas(n)
1084 #elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */
1085 /* In C++ alignas() is a keyword */
1086 #define XXH_ALIGN(n) alignas(n)
1087 #elif defined(__GNUC__)
1088 #define XXH_ALIGN(n) __attribute__((aligned(n)))
1089 #elif defined(_MSC_VER)
1090 #define XXH_ALIGN(n) __declspec(align(n))
1091 #else
1092 #define XXH_ALIGN(n) /* disabled */
1093 #endif
1094
1095 /* Old GCC versions only accept the attribute after the type in structures.
1096 */
1097 #if !(defined(__STDC_VERSION__) && \
1098 (__STDC_VERSION__ >= 201112L)) /* C11+ */ \
1099 && !(defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \
1100 && defined(__GNUC__)
1101 #define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
1102 #else
1103 #define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
1104 #endif
1105
1106 /*!
1107 * @brief The size of the internal XXH3 buffer.
1108 *
1109 * This is the optimal update size for incremental hashing.
1110 *
1111 * @see XXH3_64b_update(), XXH3_128b_update().
1112 */
1113 #define XXH3_INTERNALBUFFER_SIZE 256
1114
1115 /*!
1116 * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
1117 *
1118 * This is the size used in @ref XXH3_kSecret and the seeded functions.
1119 *
1120 * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
1121 */
1122 #define XXH3_SECRET_DEFAULT_SIZE 192
1123
1124 /*!
1125 * @internal
1126 * @brief Structure for XXH3 streaming API.
1127 *
1128 * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
1129 * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.
1130 * Otherwise it is an opaque type.
 * Never use this definition in combination with a dynamic library.
1132 * This allows fields to safely be changed in the future.
1133 *
1134 * @note ** This structure has a strict alignment requirement of 64 bytes!! **
1135 * Do not allocate this with `malloc()` or `new`,
1136 * it will not be sufficiently aligned.
1137 * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.
1138 *
1139 * Typedef'd to @ref XXH3_state_t.
1140 * Do never access the members of this struct directly.
1141 *
1142 * @see XXH3_INITSTATE() for stack initialization.
1143 * @see XXH3_createState(), XXH3_freeState().
1144 * @see XXH32_state_s, XXH64_state_s
1145 */
1146 struct XXH3_state_s {
1147
1148 XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
1149 /*!< The 8 accumulators. Similar to `vN` in @ref XXH32_state_s::v1 and @ref
1150 * XXH64_state_s */
1151 XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
1152 /*!< Used to store a custom secret generated from a seed. */
1153 XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
1154 /*!< The internal buffer. @see XXH32_state_s::mem32 */
1155 XXH32_hash_t bufferedSize;
1156 /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
1157 XXH32_hash_t reserved32;
1158 /*!< Reserved field. Needed for padding on 64-bit. */
1159 size_t nbStripesSoFar;
  /*!< Number of stripes processed. */
1161 XXH64_hash_t totalLen;
1162 /*!< Total length hashed. 64-bit even on 32-bit targets. */
1163 size_t nbStripesPerBlock;
1164 /*!< Number of stripes per block. */
1165 size_t secretLimit;
1166 /*!< Size of @ref customSecret or @ref extSecret */
1167 XXH64_hash_t seed;
1168 /*!< Seed for _withSeed variants. Must be zero otherwise, @see
1169 * XXH3_INITSTATE() */
1170 XXH64_hash_t reserved64;
1171 /*!< Reserved field. */
1172 const unsigned char *extSecret;
1173 /*!< Reference to an external secret for the _withSecret variants, NULL
1174 * for other variants. */
1175 /* note: there may be some padding at the end due to alignment on 64 bytes */
1176
1177 }; /* typedef'd to XXH3_state_t */
1178
1179 #undef XXH_ALIGN_MEMBER
1180
1181 /*!
1182 * @brief Initializes a stack-allocated `XXH3_state_s`.
1183 *
1184 * When the @ref XXH3_state_t structure is merely emplaced on stack,
1185 * it should be initialized with XXH3_INITSTATE() or a memset()
1186 * in case its first reset uses XXH3_NNbits_reset_withSeed().
1187 * This init can be omitted if the first reset uses default or _withSecret
1188 * mode. This operation isn't necessary when the state is created with
1189 * XXH3_createState(). Note that this doesn't prepare the state for a
1190 * streaming operation, it's still necessary to use XXH3_NNbits_reset*()
1191 * afterwards.
1192 */
1193 #define XXH3_INITSTATE(XXH3_state_ptr) \
1194 { (XXH3_state_ptr)->seed = 0; }
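/*
 * Illustrative sketch (made-up helper): a stack-allocated state, which
 * requires this static section (XXH_STATIC_LINKING_ONLY or XXH_INLINE_ALL).
 * XXH3_INITSTATE() is only needed because the first reset below is a
 * _withSeed() variant.
 *
 *     static XXH64_hash_t hash_on_stack(const void* data, size_t len,
 *                                       XXH64_hash_t seed) {
 *         XXH3_state_t state;              // 64-byte alignment comes from its type
 *         XXH3_INITSTATE(&state);
 *         if (XXH3_64bits_reset_withSeed(&state, seed) != XXH_OK) return 0;
 *         if (XXH3_64bits_update(&state, data, len) != XXH_OK) return 0;
 *         return XXH3_64bits_digest(&state);
 *     }
 */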
1195
1196 /* === Experimental API === */
1197 /* Symbols defined below must be considered tied to a specific library version.
1198 */
1199
1200 /*
1201 * XXH3_generateSecret():
1202 *
1203 * Derive a high-entropy secret from any user-defined content, named customSeed.
1204 * The generated secret can be used in combination with `*_withSecret()`
1205 * functions. The `_withSecret()` variants are useful to provide a higher level
1206 * of protection than 64-bit seed, as it becomes much more difficult for an
1207 * external actor to guess how to impact the calculation logic.
1208 *
1209 * The function accepts as input a custom seed of any length and any content,
1210 * and derives from it a high-entropy secret of length XXH3_SECRET_DEFAULT_SIZE
1211 * into an already allocated buffer secretBuffer.
 * The generated secret is _always_ XXH3_SECRET_DEFAULT_SIZE bytes long.
1213 *
1214 * The generated secret can then be used with any `*_withSecret()` variant.
1215 * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`,
1216 * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()`
1217 * are part of this list. They all accept a `secret` parameter
1218 * which must be very long for implementation reasons (>= XXH3_SECRET_SIZE_MIN)
1219 * _and_ feature very high entropy (consist of random-looking bytes).
1220 * These conditions can be a high bar to meet, so
1221 * this function can be used to generate a secret of proper quality.
1222 *
 * customSeed can be anything. It can have any size, even a very small one,
 * and its content can be anything, even a trivially "low entropy" source
 * such as a bunch of zeroes. The resulting `secret` will nonetheless provide
 * all expected qualities.
1227 *
1228 * Supplying NULL as the customSeed copies the default secret into
1229 * `secretBuffer`. When customSeedSize > 0, supplying NULL as customSeed is
1230 * undefined behavior.
1231 */
1232 XXH_PUBLIC_API void XXH3_generateSecret(void *secretBuffer,
1233 const void *customSeed,
1234 size_t customSeedSize);
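/*
 * Illustrative sketch: deriving a full-quality secret from arbitrary
 * low-entropy material, then using it with a _withSecret() variant. The
 * passphrase and helper name are placeholders made up for the example.
 *
 *     #include <string.h>
 *     #include "xxhash.h"
 *
 *     static XXH64_hash_t hash_with_custom_secret(const void* data, size_t len) {
 *         unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
 *         const char*   material = "any bytes, any length";
 *         XXH3_generateSecret(secret, material, strlen(material));
 *         return XXH3_64bits_withSecret(data, len, secret, sizeof(secret));
 *     }
 */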
1235
1236 /* simple short-cut to pre-selected XXH3_128bits variant */
1237 XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len,
1238 XXH64_hash_t seed);
1239
1240 #endif /* XXH_NO_LONG_LONG */
1241 #if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
1242 #define XXH_IMPLEMENTATION
1243 #endif
1244
1245 #endif /* defined(XXH_STATIC_LINKING_ONLY) && \
1246 !defined(XXHASH_H_STATIC_13879238742) */
1247
1248 /* ======================================================================== */
1249 /* ======================================================================== */
1250 /* ======================================================================== */
1251
1252 /*-**********************************************************************
1253 * xxHash implementation
1254 *-**********************************************************************
1255 * xxHash's implementation used to be hosted inside xxhash.c.
1256 *
1257 * However, inlining requires implementation to be visible to the compiler,
1258 * hence be included alongside the header.
1259 * Previously, implementation was hosted inside xxhash.c,
1260 * which was then #included when inlining was activated.
1261 * This construction created issues with a few build and install systems,
1262 * as it required xxhash.c to be stored in /include directory.
1263 *
1264 * xxHash implementation is now directly integrated within xxhash.h.
1265 * As a consequence, xxhash.c is no longer needed in /include.
1266 *
1267 * xxhash.c is still available and is still useful.
1268 * In a "normal" setup, when xxhash is not inlined,
1269 * xxhash.h only exposes the prototypes and public symbols,
1270 * while xxhash.c can be built into an object file xxhash.o
1271 * which can then be linked into the final binary.
1272 ************************************************************************/
1273
1274 #if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) || \
1275 defined(XXH_IMPLEMENTATION)) && \
1276 !defined(XXH_IMPLEM_13a8737387)
1277 #define XXH_IMPLEM_13a8737387
1278
1279 /* *************************************
1280 * Tuning parameters
1281 ***************************************/
1282
1283 /*!
1284 * @defgroup tuning Tuning parameters
1285 * @{
1286
1287 *
1288 * Various macros to control xxHash's behavior.
1289 */
1290 #ifdef XXH_DOXYGEN
1291 /*!
1292 * @brief Define this to disable 64-bit code.
1293 *
1294 * Useful if only using the @ref xxh32_family and you have a strict C90
1295 * compiler.
1296 */
1297 #define XXH_NO_LONG_LONG
1298 #undef XXH_NO_LONG_LONG /* don't actually */
1299 /*!
1300 * @brief Controls how unaligned memory is accessed.
1301 *
1302 * By default, access to unaligned memory is controlled by `memcpy()`, which
1303 * is safe and portable.
1304 *
1305 * Unfortunately, on some target/compiler combinations, the generated
1306 * assembly is sub-optimal.
1307 *
1308 * The below switch allow selection of a different access method
1309 * in the search for improved performance.
1310 *
1311 * @par Possible options:
1312 *
1313 * - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`
1314 * @par
1315 * Use `memcpy()`. Safe and portable. Note that most modern compilers
1316 * will eliminate the function call and treat it as an unaligned access.
1317 *
1318 * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))`
1319 * @par
1320 * Depends on compiler extensions and is therefore not portable.
1321 * This method is safe _if_ your compiler supports it,
1322 * and *generally* as fast or faster than `memcpy`.
1323 *
1324 * - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast
1325 * @par
1326 * Casts directly and dereferences. This method doesn't depend on the
1327 * compiler, but it violates the C standard as it directly dereferences
1328 * an unaligned pointer. It can generate buggy code on targets which do not
1329 * support unaligned memory accesses, but in some circumstances, it's
1330 * the only known way to get the most performance.
1331 *
1332 * - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift
1333 * @par
1334 * Also portable. This can generate the best code on old compilers which
1335 * don't inline small `memcpy()` calls, and it might also be faster on
1336 * big-endian systems which lack a native byteswap instruction. However,
1337 * some compilers will emit literal byteshifts even if the target supports
1338 * unaligned access.
1339 * .
1340 *
1341 * @warning
1342 * Methods 1 and 2 rely on implementation-defined behavior. Use these with
1343 * care, as what works on one compiler/platform/optimization level may
1344 * cause another to read garbage data or even crash.
1345 *
1346 * See https://stackoverflow.com/a/32095106/646947 for details.
1347 *
1348 * Prefer these methods in priority order (0 > 3 > 1 > 2)
1349 */
1350 #define XXH_FORCE_MEMORY_ACCESS 0
1351 /*!
1352 * @def XXH_ACCEPT_NULL_INPUT_POINTER
1353 * @brief Whether to add explicit `NULL` checks.
1354 *
1355 * If the input pointer is `NULL` and the length is non-zero, xxHash's
1356 * default behavior is to dereference it, triggering a segfault.
1357 *
1358 * When this macro is enabled, xxHash actively checks the input for a null
1359 * pointer. If it is, the result for null input pointers is the same as a
1360 * zero-length input.
1361 */
1362 #define XXH_ACCEPT_NULL_INPUT_POINTER 0
1363 /*!
1364 * @def XXH_FORCE_ALIGN_CHECK
1365 * @brief If defined to non-zero, adds a special path for aligned inputs
1366 * (XXH32() and XXH64() only).
1367 *
1368 * This is an important performance trick for architectures without decent
1369 * unaligned memory access performance.
1370 *
1371 * It checks for input alignment, and when conditions are met, uses a "fast
1372 * path" employing direct 32-bit/64-bit reads, resulting in _dramatically
1373 * faster_ read speed.
1374 *
1375 * The check costs one initial branch per hash, which is generally
1376 * negligible, but not zero.
1377 *
1378 * Moreover, it's not useful to generate an additional code path if memory
1379 * access uses the same instruction for both aligned and unaligned
1380 * addresses (e.g. x86 and aarch64).
1381 *
1382 * In these cases, the alignment check can be removed by setting this macro
1383 * to 0. Then the code will always use unaligned memory access. Align check
1384 * is automatically disabled on x86, x64 & arm64, which are platforms known
1385 * to offer good unaligned memory accesses performance.
1386 *
1387 * This option does not affect XXH3 (only XXH32 and XXH64).
1388 */
1389 #define XXH_FORCE_ALIGN_CHECK 0
1390
1391 /*!
1392 * @def XXH_NO_INLINE_HINTS
1393 * @brief When non-zero, sets all functions to `static`.
1394 *
1395 * By default, xxHash tries to force the compiler to inline almost all
1396 * internal functions.
1397 *
1398 * This can usually improve performance due to reduced jumping and improved
1399 * constant folding, but significantly increases the size of the binary
1400 * which might not be favorable.
1401 *
1402 * Additionally, sometimes the forced inlining can be detrimental to
1403 * performance, depending on the architecture.
1404 *
1405 * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
1406 * compiler full control on whether to inline or not.
1407 *
1408 * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
1409 * -fno-inline with GCC or Clang, this will automatically be defined.
1410 */
1411 #define XXH_NO_INLINE_HINTS 0
1412
1413 /*!
1414 * @def XXH_REROLL
1415 * @brief Whether to reroll `XXH32_finalize`.
1416 *
1417 * For performance, `XXH32_finalize` uses an unrolled loop
1418 * in the form of a switch statement.
1419 *
1420 * This is not always desirable, as it generates larger code,
1421 * and depending on the architecture, may even be slower
1422 *
1423 * This is automatically defined with `-Os`/`-Oz` on GCC and Clang.
1424 */
1425 #define XXH_REROLL 0
1426
1427 /*!
1428 * @internal
1429 * @brief Redefines old internal names.
1430 *
1431 * For compatibility with code that uses xxHash's internals before the names
1432 * were changed to improve namespacing. There is no other reason to use
1433 * this.
1434 */
1435 #define XXH_OLD_NAMES
1436 #undef XXH_OLD_NAMES /* don't actually use, it is ugly. */
1437 #endif /* XXH_DOXYGEN */
1438 /*!
1439 * @}
1440 */
1441
1442 #ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command \
1443 line for example */
1444 /* prefer __packed__ structures (method 1) for gcc on armv7+ and mips */
1445 #if !defined(__clang__) && \
1446 ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
1447 (defined(__GNUC__) && \
1448 ((defined(__ARM_ARCH) && __ARM_ARCH >= 7) || \
1449 (defined(__mips__) && (__mips <= 5 || __mips_isa_rev < 6) && \
1450 (!defined(__mips16) || defined(__mips_mips16e2))))))
1451 #define XXH_FORCE_MEMORY_ACCESS 1
1452 #endif
1453 #endif
1454
1455 #ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */
1456 #define XXH_ACCEPT_NULL_INPUT_POINTER 0
1457 #endif
1458
1459 #ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
1460 #if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || \
1461 defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) /* visual */
1462 #define XXH_FORCE_ALIGN_CHECK 0
1463 #else
1464 #define XXH_FORCE_ALIGN_CHECK 1
1465 #endif
1466 #endif
1467
1468 #ifndef XXH_NO_INLINE_HINTS
1469 #if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
1470 || defined(__NO_INLINE__) /* -O0, -fno-inline */
1471 #define XXH_NO_INLINE_HINTS 1
1472 #else
1473 #define XXH_NO_INLINE_HINTS 0
1474 #endif
1475 #endif
1476
1477 #ifndef XXH_REROLL
1478 #if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ || \
1479 (defined(__GNUC__) && !defined(__clang__))
1480 /* The if/then loop is preferable to switch/case on gcc (on x64) */
1481 #define XXH_REROLL 1
1482 #else
1483 #define XXH_REROLL 0
1484 #endif
1485 #endif
1486
1487 /*!
1488 * @defgroup impl Implementation
1489 * @{
1490
1491 */
1492
1493 /* *************************************
1494 * Includes & Memory related functions
1495 ***************************************/
1496 /*
1497 * Modify the local functions below should you wish to use
1498 * different memory routines for malloc() and free()
1499 */
1500 #include <stdlib.h>
1501
1502 /*!
1503 * @internal
1504 * @brief Modify this function to use a different routine than malloc().
1505 */
1506 static void *XXH_malloc(size_t s) {
1507
1508 return malloc(s);
1509
1510 }
1511
1512 /*!
1513 * @internal
1514 * @brief Modify this function to use a different routine than free().
1515 */
1516 static void XXH_free(void *p) {
1517
1518 free(p);
1519
1520 }
1521
1522 #include <string.h>
1523
1524 /*!
1525 * @internal
1526 * @brief Modify this function to use a different routine than memcpy().
1527 */
1528 static void *XXH_memcpy(void *dest, const void *src, size_t size) {
1529
1530 return memcpy(dest, src, size);
1531
1532 }
1533
1534 #include <limits.h> /* ULLONG_MAX */
1535
1536 /* *************************************
1537 * Compiler Specific Options
1538 ***************************************/
1539 #ifdef _MSC_VER /* Visual Studio warning fix */
1540 #pragma warning(disable : 4127) /* disable: C4127: conditional expression \
1541 is constant */
1542 #endif
1543
1544 #if XXH_NO_INLINE_HINTS /* disable inlining hints */
1545 #if defined(__GNUC__)
1546 #define XXH_FORCE_INLINE static __attribute__((unused))
1547 #else
1548 #define XXH_FORCE_INLINE static
1549 #endif
1550 #define XXH_NO_INLINE static
1551 /* enable inlining hints */
1552 #elif defined(_MSC_VER) /* Visual Studio */
1553 #define XXH_FORCE_INLINE static __forceinline
1554 #define XXH_NO_INLINE static __declspec(noinline)
1555 #elif defined(__GNUC__)
1556 #define XXH_FORCE_INLINE \
1557 static __inline__ __attribute__((always_inline, unused))
1558 #define XXH_NO_INLINE static __attribute__((noinline))
1559 #elif defined(__cplusplus) || \
1560 (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */
1561 #define XXH_FORCE_INLINE static inline
1562 #define XXH_NO_INLINE static
1563 #else
1564 #define XXH_FORCE_INLINE static
1565 #define XXH_NO_INLINE static
1566 #endif
1567
1568 /* *************************************
1569 * Debug
1570 ***************************************/
1571 /*!
1572 * @ingroup tuning
1573 * @def XXH_DEBUGLEVEL
1574 * @brief Sets the debugging level.
1575 *
1576 * XXH_DEBUGLEVEL is expected to be defined externally, typically via the
1577 * compiler's command line options. The value must be a number.
1578 */
1579 #ifndef XXH_DEBUGLEVEL
1580 #ifdef DEBUGLEVEL /* backwards compat */
1581 #define XXH_DEBUGLEVEL DEBUGLEVEL
1582 #else
1583 #define XXH_DEBUGLEVEL 0
1584 #endif
1585 #endif
1586
1587 #if (XXH_DEBUGLEVEL >= 1)
1588 #include <assert.h> /* note: can still be disabled with NDEBUG */
1589 #define XXH_ASSERT(c) assert(c)
1590 #else
1591 #define XXH_ASSERT(c) ((void)0)
1592 #endif
1593
1594 /* note: use after variable declarations */
1595 #ifndef XXH_STATIC_ASSERT
1596 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */
1597 #include <assert.h>
1598 #define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \
1599 do { \
1600 \
1601 static_assert((c), m); \
1602 \
1603 } while (0)
1604
1605 #elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */
1606 #define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \
1607 do { \
1608 \
1609 static_assert((c), m); \
1610 \
1611 } while (0)
1612
1613 #else
1614 #define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \
1615 do { \
1616 \
1617 struct xxh_sa { \
1618 \
1619 char x[(c) ? 1 : -1]; \
1620 \
1621 }; \
1622 \
1623 } while (0)
1624
1625 #endif
1626 #define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c), #c)
1627 #endif
1628
1629 /*!
1630 * @internal
1631 * @def XXH_COMPILER_GUARD(var)
1632 * @brief Used to prevent unwanted optimizations for @p var.
1633 *
1634 * It uses an empty GCC inline assembly statement with a register constraint
1635 * which forces @p var into a general purpose register (eg eax, ebx, ecx
1636 * on x86) and marks it as modified.
1637 *
1638 * This is used in a few places to avoid unwanted autovectorization (e.g.
1639 * XXH32_round()). All vectorization we want is explicit via intrinsics,
1640 * and _usually_ isn't wanted elsewhere.
1641 *
1642 * We also use it to prevent unwanted constant folding for AArch64 in
1643 * XXH3_initCustomSecret_scalar().
1644 */
1645 #ifdef __GNUC__
1646 #define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r"(var))
1647 #else
1648 #define XXH_COMPILER_GUARD(var) ((void)0)
1649 #endif
1650
1651 /* *************************************
1652 * Basic Types
1653 ***************************************/
1654 #if !defined(__VMS) && \
1655 (defined(__cplusplus) || \
1656 (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */))
1657 #include <stdint.h>
1658 typedef uint8_t xxh_u8;
1659 #else
1660 typedef unsigned char xxh_u8;
1661 #endif
1662 typedef XXH32_hash_t xxh_u32;
1663
1664 #ifdef XXH_OLD_NAMES
1665 #define BYTE xxh_u8
1666 #define U8 xxh_u8
1667 #define U32 xxh_u32
1668 #endif
1669
1670 /* *** Memory access *** */
1671
1672 /*!
1673 * @internal
1674 * @fn xxh_u32 XXH_read32(const void* ptr)
1675 * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.
1676 *
1677 * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
1678 *
1679 * @param ptr The pointer to read from.
1680 * @return The 32-bit native endian integer from the bytes at @p ptr.
1681 */
1682
1683 /*!
1684 * @internal
1685 * @fn xxh_u32 XXH_readLE32(const void* ptr)
1686 * @brief Reads an unaligned 32-bit little endian integer from @p ptr.
1687 *
1688 * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
1689 *
1690 * @param ptr The pointer to read from.
1691 * @return The 32-bit little endian integer from the bytes at @p ptr.
1692 */
1693
1694 /*!
1695 * @internal
1696 * @fn xxh_u32 XXH_readBE32(const void* ptr)
1697 * @brief Reads an unaligned 32-bit big endian integer from @p ptr.
1698 *
1699 * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
1700 *
1701 * @param ptr The pointer to read from.
1702 * @return The 32-bit big endian integer from the bytes at @p ptr.
1703 */
1704
1705 /*!
1706 * @internal
1707 * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align)
1708 * @brief Like @ref XXH_readLE32(), but has an option for aligned reads.
1709 *
1710 * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
1711 * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is
1712 * always @ref XXH_alignment::XXH_unaligned.
1713 *
1714 * @param ptr The pointer to read from.
1715 * @param align Whether @p ptr is aligned.
1716 * @pre
1717 * If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte
1718 * aligned.
1719 * @return The 32-bit little endian integer from the bytes at @p ptr.
1720 */
1721
1722 #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3))
1723 /*
1724 * Manual byteshift. Best for old compilers which don't inline memcpy.
1725 * We actually directly use XXH_readLE32 and XXH_readBE32.
1726 */
1727 #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 2))
1728
1729 /*
1730  * Force direct memory access. Only works on CPUs which support unaligned memory
1731 * access in hardware.
1732 */
1733 static xxh_u32 XXH_read32(const void *memPtr) {
1734
1735 return *(const xxh_u32 *)memPtr;
1736
1737 }
1738
1739 #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 1))
1740
1741 /*
1742 * __pack instructions are safer but compiler specific, hence potentially
1743 * problematic for some compilers.
1744 *
1745 * Currently only defined for GCC and ICC.
1746 */
1747 #ifdef XXH_OLD_NAMES
1748 typedef union {
1749
1750 xxh_u32 u32;
1751
1752 } __attribute__((packed)) unalign;
1753
1754 #endif
1755 static xxh_u32 XXH_read32(const void *ptr) {
1756
1757 typedef union {
1758
1759 xxh_u32 u32;
1760
1761 } __attribute__((packed)) xxh_unalign;
1762
1763 return ((const xxh_unalign *)ptr)->u32;
1764
1765 }
1766
1767 #else
1768
1769 /*
1770 * Portable and safe solution. Generally efficient.
1771 * see: https://stackoverflow.com/a/32095106/646947
1772 */
1773 static xxh_u32 XXH_read32(const void *memPtr) {
1774
1775 xxh_u32 val;
1776 memcpy(&val, memPtr, sizeof(val));
1777 return val;
1778
1779 }
1780
1781 #endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
1782
1783 /* *** Endianness *** */
1784
1785 /*!
1786 * @ingroup tuning
1787 * @def XXH_CPU_LITTLE_ENDIAN
1788 * @brief Whether the target is little endian.
1789 *
1790 * Defined to 1 if the target is little endian, or 0 if it is big endian.
1791 * It can be defined externally, for example on the compiler command line.
1792 *
1793 * If it is not defined,
1794 * a runtime check (which is usually constant folded) is used instead.
1795 *
1796 * @note
1797 * This is not necessarily defined to an integer constant.
1798 *
1799 * @see XXH_isLittleEndian() for the runtime check.
1800 */
1801 #ifndef XXH_CPU_LITTLE_ENDIAN
1802 /*
1803 * Try to detect endianness automatically, to avoid the nonstandard behavior
1804 * in `XXH_isLittleEndian()`
1805 */
1806 #if defined(_WIN32) /* Windows is always little endian */ \
1807 || defined(__LITTLE_ENDIAN__) || \
1808 (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
1809 #define XXH_CPU_LITTLE_ENDIAN 1
1810 #elif defined(__BIG_ENDIAN__) || \
1811 (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
1812 #define XXH_CPU_LITTLE_ENDIAN 0
1813 #else
1814 /*!
1815 * @internal
1816 * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN.
1817 *
1818 * Most compilers will constant fold this.
1819 */
1820 static int XXH_isLittleEndian(void) {
1821
1822 /*
1823 * Portable and well-defined behavior.
1824 * Don't use static: it is detrimental to performance.
1825 */
1826 const union {
1827
1828 xxh_u32 u;
1829 xxh_u8 c[4];
1830
1831 } one = {1};
1832
1833 return one.c[0];
1834
1835 }
1836
1837 #define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian()
1838 #endif
1839 #endif
1840
1841 /* ****************************************
1842 * Compiler-specific Functions and Macros
1843 ******************************************/
1844 #define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
1845
1846 #ifdef __has_builtin
1847 #define XXH_HAS_BUILTIN(x) __has_builtin(x)
1848 #else
1849 #define XXH_HAS_BUILTIN(x) 0
1850 #endif
1851
1852 /*!
1853 * @internal
1854 * @def XXH_rotl32(x,r)
1855 * @brief 32-bit rotate left.
1856 *
1857 * @param x The 32-bit integer to be rotated.
1858 * @param r The number of bits to rotate.
1859 * @pre
1860 * @p r > 0 && @p r < 32
1861 * @note
1862 * @p x and @p r may be evaluated multiple times.
1863 * @return The rotated result.
1864 */
1865 #if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) && \
1866 XXH_HAS_BUILTIN(__builtin_rotateleft64)
1867 #define XXH_rotl32 __builtin_rotateleft32
1868 #define XXH_rotl64 __builtin_rotateleft64
1869 /* Note: although _rotl exists for MinGW (GCC under Windows), its performance
1870  * seems poor */
1871 #elif defined(_MSC_VER)
1872 #define XXH_rotl32(x, r) _rotl(x, r)
1873 #define XXH_rotl64(x, r) _rotl64(x, r)
1874 #else
1875 #define XXH_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))
1876 #define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r))))
1877 #endif
1878
1879 /*!
1880 * @internal
1881 * @fn xxh_u32 XXH_swap32(xxh_u32 x)
1882 * @brief A 32-bit byteswap.
1883 *
1884 * @param x The 32-bit integer to byteswap.
1885 * @return @p x, byteswapped.
1886 */
1887 #if defined(_MSC_VER) /* Visual Studio */
1888 #define XXH_swap32 _byteswap_ulong
1889 #elif XXH_GCC_VERSION >= 403
1890 #define XXH_swap32 __builtin_bswap32
1891 #else
1892 static xxh_u32 XXH_swap32(xxh_u32 x) {
1893
1894 return ((x << 24) & 0xff000000) | ((x << 8) & 0x00ff0000) |
1895 ((x >> 8) & 0x0000ff00) | ((x >> 24) & 0x000000ff);
1896
1897 }
1898
1899 #endif
1900
1901 /* ***************************
1902 * Memory reads
1903 *****************************/
1904
1905 /*!
1906 * @internal
1907 * @brief Enum to indicate whether a pointer is aligned.
1908 */
1909 typedef enum {
1910
1911 XXH_aligned, /*!< Aligned */
1912 XXH_unaligned /*!< Possibly unaligned */
1913
1914 } XXH_alignment;
1915
1916 /*
1917 * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.
1918 *
1919 * This is ideal for older compilers which don't inline memcpy.
1920 */
1921 #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3))
1922
1923 XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void *memPtr) {
1924
1925 const xxh_u8 *bytePtr = (const xxh_u8 *)memPtr;
1926 return bytePtr[0] | ((xxh_u32)bytePtr[1] << 8) | ((xxh_u32)bytePtr[2] << 16) |
1927 ((xxh_u32)bytePtr[3] << 24);
1928
1929 }
1930
1931 XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void *memPtr) {
1932
1933 const xxh_u8 *bytePtr = (const xxh_u8 *)memPtr;
1934 return bytePtr[3] | ((xxh_u32)bytePtr[2] << 8) | ((xxh_u32)bytePtr[1] << 16) |
1935 ((xxh_u32)bytePtr[0] << 24);
1936
1937 }
1938
1939 #else
1940 XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void *ptr) {
1941
1942 return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
1943
1944 }
1945
1946 static xxh_u32 XXH_readBE32(const void *ptr) {
1947
1948 return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
1949
1950 }
1951
1952 #endif
1953
1954 XXH_FORCE_INLINE xxh_u32 XXH_readLE32_align(const void *ptr,
1955 XXH_alignment align) {
1956
1957 if (align == XXH_unaligned) {
1958
1959 return XXH_readLE32(ptr);
1960
1961 } else {
1962
1963 return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32 *)ptr
1964 : XXH_swap32(*(const xxh_u32 *)ptr);
1965
1966 }
1967
1968 }
1969
1970 /* *************************************
1971 * Misc
1972 ***************************************/
1973 /*! @ingroup public */
1974 XXH_PUBLIC_API unsigned XXH_versionNumber(void) {
1975
1976 return XXH_VERSION_NUMBER;
1977
1978 }
1979
1980 /* *******************************************************************
1981 * 32-bit hash functions
1982 *********************************************************************/
1983 /*!
1984 * @}
1985 * @defgroup xxh32_impl XXH32 implementation
1986 * @ingroup impl
1987 * @{
1988
1989 */
1990 /* #define instead of static const, to be used as initializers */
1991 #define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */
1992 #define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */
1993 #define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */
1994 #define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */
1995 #define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */
1996
1997 #ifdef XXH_OLD_NAMES
1998 #define PRIME32_1 XXH_PRIME32_1
1999 #define PRIME32_2 XXH_PRIME32_2
2000 #define PRIME32_3 XXH_PRIME32_3
2001 #define PRIME32_4 XXH_PRIME32_4
2002 #define PRIME32_5 XXH_PRIME32_5
2003 #endif
2004
2005 /*!
2006 * @internal
2007 * @brief Normal stripe processing routine.
2008 *
2009 * This shuffles the bits so that any bit from @p input impacts several bits in
2010 * @p acc.
2011 *
2012 * @param acc The accumulator lane.
2013 * @param input The stripe of input to mix.
2014 * @return The mixed accumulator lane.
2015 */
2016 static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) {
2017
2018 acc += input * XXH_PRIME32_2;
2019 acc = XXH_rotl32(acc, 13);
2020 acc *= XXH_PRIME32_1;
2021 #if (defined(__SSE4_1__) || defined(__aarch64__)) && \
2022 !defined(XXH_ENABLE_AUTOVECTORIZE)
2023 /*
2024 * UGLY HACK:
2025 * A compiler fence is the only thing that prevents GCC and Clang from
2026 * autovectorizing the XXH32 loop (pragmas and attributes don't work for some
2027 * reason) without globally disabling SSE4.1.
2028 *
2029 * The reason we want to avoid vectorization is because despite working on
2030 * 4 integers at a time, there are multiple factors slowing XXH32 down on
2031 * SSE4:
2032 * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
2033 * newer chips!) making it slightly slower to multiply four integers at
2034  * once compared to four integers independently. Even on Sandy/Ivy Bridge,
2035  * where pmulld was fastest, it is still not worth going into SSE
2036  * just to multiply unless doing a long operation.
2037 *
2038 * - Four instructions are required to rotate,
2039 * movqda tmp, v // not required with VEX encoding
2040 * pslld tmp, 13 // tmp <<= 13
2041 * psrld v, 19 // x >>= 19
2042 * por v, tmp // x |= tmp
2043 * compared to one for scalar:
2044 * roll v, 13 // reliably fast across the board
2045 * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason
2046 *
2047 * - Instruction level parallelism is actually more beneficial here because
2048 * the SIMD actually serializes this operation: While v1 is rotating, v2
2049 * can load data, while v3 can multiply. SSE forces them to operate
2050 * together.
2051 *
2052 * This is also enabled on AArch64, as Clang autovectorizes it incorrectly
2053 * and it is pointless writing a NEON implementation that is basically the
2054 * same speed as scalar for XXH32.
2055 */
2056 XXH_COMPILER_GUARD(acc);
2057 #endif
2058 return acc;
2059
2060 }
2061
2062 /*!
2063 * @internal
2064 * @brief Mixes all bits to finalize the hash.
2065 *
2066 * The final mix ensures that all input bits have a chance to impact any bit in
2067 * the output digest, resulting in an unbiased distribution.
2068 *
2069 * @param h32 The hash to avalanche.
2070 * @return The avalanched hash.
2071 */
2072 static xxh_u32 XXH32_avalanche(xxh_u32 h32) {
2073
2074 h32 ^= h32 >> 15;
2075 h32 *= XXH_PRIME32_2;
2076 h32 ^= h32 >> 13;
2077 h32 *= XXH_PRIME32_3;
2078 h32 ^= h32 >> 16;
2079 return (h32);
2080
2081 }
2082
2083 #define XXH_get32bits(p) XXH_readLE32_align(p, align)
2084
2085 #define XXH_PROCESS1 \
2086 do { \
2087 \
2088 h32 += (*ptr++) * XXH_PRIME32_5; \
2089 h32 = XXH_rotl32(h32, 11) * XXH_PRIME32_1; \
2090 \
2091 } while (0)
2092
2093 #define XXH_PROCESS4 \
2094 do { \
2095 \
2096 h32 += XXH_get32bits(ptr) * XXH_PRIME32_3; \
2097 ptr += 4; \
2098 h32 = XXH_rotl32(h32, 17) * XXH_PRIME32_4; \
2099 \
2100 } while (0)
2101
2102 /*!
2103 * @internal
2104 * @brief Processes the last 0-15 bytes of @p ptr.
2105 *
2106 * There may be up to 15 bytes remaining to consume from the input.
2107 * This final stage will digest them to ensure that all input bytes are present
2108 * in the final mix.
2109 *
2110 * @param h32 The hash to finalize.
2111 * @param ptr The pointer to the remaining input.
2112 * @param len The remaining length, modulo 16.
2113 * @param align Whether @p ptr is aligned.
2114 * @return The finalized hash.
2115 */
2116 static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len,
2117 XXH_alignment align) {
2118
2119 /* Compact rerolled version */
2120 if (XXH_REROLL) {
2121
2122 len &= 15;
2123 while (len >= 4) {
2124
2125 XXH_PROCESS4;
2126 len -= 4;
2127
2128 }
2129
2130 while (len > 0) {
2131
2132 XXH_PROCESS1;
2133 --len;
2134
2135 }
2136
2137 return XXH32_avalanche(h32);
2138
2139 } else {
2140
2141 switch (len & 15) /* or switch(bEnd - p) */ {
2142
2143 case 12:
2144 XXH_PROCESS4;
2145 XXH_FALLTHROUGH;
2146 case 8:
2147 XXH_PROCESS4;
2148 XXH_FALLTHROUGH;
2149 case 4:
2150 XXH_PROCESS4;
2151 return XXH32_avalanche(h32);
2152
2153 case 13:
2154 XXH_PROCESS4;
2155 XXH_FALLTHROUGH;
2156 case 9:
2157 XXH_PROCESS4;
2158 XXH_FALLTHROUGH;
2159 case 5:
2160 XXH_PROCESS4;
2161 XXH_PROCESS1;
2162 return XXH32_avalanche(h32);
2163
2164 case 14:
2165 XXH_PROCESS4;
2166 XXH_FALLTHROUGH;
2167 case 10:
2168 XXH_PROCESS4;
2169 XXH_FALLTHROUGH;
2170 case 6:
2171 XXH_PROCESS4;
2172 XXH_PROCESS1;
2173 XXH_PROCESS1;
2174 return XXH32_avalanche(h32);
2175
2176 case 15:
2177 XXH_PROCESS4;
2178 XXH_FALLTHROUGH;
2179 case 11:
2180 XXH_PROCESS4;
2181 XXH_FALLTHROUGH;
2182 case 7:
2183 XXH_PROCESS4;
2184 XXH_FALLTHROUGH;
2185 case 3:
2186 XXH_PROCESS1;
2187 XXH_FALLTHROUGH;
2188 case 2:
2189 XXH_PROCESS1;
2190 XXH_FALLTHROUGH;
2191 case 1:
2192 XXH_PROCESS1;
2193 XXH_FALLTHROUGH;
2194 case 0:
2195 return XXH32_avalanche(h32);
2196
2197 }
2198
2199 XXH_ASSERT(0);
2200 return h32; /* reaching this point is deemed impossible */
2201
2202 }
2203
2204 }
2205
2206 #ifdef XXH_OLD_NAMES
2207 #define PROCESS1 XXH_PROCESS1
2208 #define PROCESS4 XXH_PROCESS4
2209 #else
2210 #undef XXH_PROCESS1
2211 #undef XXH_PROCESS4
2212 #endif
2213
2214 /*!
2215 * @internal
2216 * @brief The implementation for @ref XXH32().
2217 *
2218 * @param input, len, seed Directly passed from @ref XXH32().
2219 * @param align Whether @p input is aligned.
2220 * @return The calculated hash.
2221 */
2222 XXH_FORCE_INLINE xxh_u32 XXH32_endian_align(const xxh_u8 *input, size_t len,
2223 xxh_u32 seed, XXH_alignment align) {
2224
2225 const xxh_u8 *bEnd = input ? input + len : NULL;
2226 xxh_u32 h32;
2227
2228 #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
2229 (XXH_ACCEPT_NULL_INPUT_POINTER >= 1)
2230 if (input == NULL) {
2231
2232 len = 0;
2233 bEnd = input = (const xxh_u8 *)(size_t)16;
2234
2235 }
2236
2237 #endif
2238
2239 if (len >= 16) {
2240
2241 const xxh_u8 *const limit = bEnd - 15;
2242 xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
2243 xxh_u32 v2 = seed + XXH_PRIME32_2;
2244 xxh_u32 v3 = seed + 0;
2245 xxh_u32 v4 = seed - XXH_PRIME32_1;
2246
2247 do {
2248
2249 v1 = XXH32_round(v1, XXH_get32bits(input));
2250 input += 4;
2251 v2 = XXH32_round(v2, XXH_get32bits(input));
2252 input += 4;
2253 v3 = XXH32_round(v3, XXH_get32bits(input));
2254 input += 4;
2255 v4 = XXH32_round(v4, XXH_get32bits(input));
2256 input += 4;
2257
2258 } while (input < limit);
2259
2260 h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) +
2261 XXH_rotl32(v4, 18);
2262
2263 } else {
2264
2265 h32 = seed + XXH_PRIME32_5;
2266
2267 }
2268
2269 h32 += (xxh_u32)len;
2270
2271 return XXH32_finalize(h32, input, len & 15, align);
2272
2273 }
2274
2275 /*! @ingroup xxh32_family */
2276 XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t len,
2277 XXH32_hash_t seed) {
2278
2279 #if 0
2280 /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
2281 XXH32_state_t state;
2282 XXH32_reset(&state, seed);
2283 XXH32_update(&state, (const xxh_u8*)input, len);
2284 return XXH32_digest(&state);
2285 #else
2286 if (XXH_FORCE_ALIGN_CHECK) {
2287
2288 if ((((size_t)input) & 3) ==
2289       0) { /* Input is 4-byte aligned, leverage the speed benefit */
2290 return XXH32_endian_align((const xxh_u8 *)input, len, seed, XXH_aligned);
2291
2292 }
2293
2294 }
2295
2296 return XXH32_endian_align((const xxh_u8 *)input, len, seed, XXH_unaligned);
2297 #endif
2298
2299 }
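
/*
 * Illustrative sketch (hedged, not part of the library): a minimal one-shot
 * use of XXH32() as defined above. `example_oneshot` and its buffer are
 * hypothetical names chosen for this example only.
 *
 *   #include <stdio.h>
 *
 *   static void example_oneshot(void) {
 *
 *     const char   data[] = "hello world";
 *     XXH32_hash_t hash = XXH32(data, sizeof(data) - 1, 0);   // seed = 0
 *     printf("%08x\n", (unsigned)hash);
 *
 *   }
 *
 * The same (input, seed) pair always produces the same 32-bit hash,
 * independently of platform endianness.
 */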
2300
2301 /******* Hash streaming *******/
2302 /*!
2303 * @ingroup xxh32_family
2304 */
2305 XXH_PUBLIC_API XXH32_state_t *XXH32_createState(void) {
2306
2307 return (XXH32_state_t *)XXH_malloc(sizeof(XXH32_state_t));
2308
2309 }
2310
2311 /*! @ingroup xxh32_family */
2312 XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr) {
2313
2314 XXH_free(statePtr);
2315 return XXH_OK;
2316
2317 }
2318
2319 /*! @ingroup xxh32_family */
2320 XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t *dstState,
2321 const XXH32_state_t *srcState) {
2322
2323 memcpy(dstState, srcState, sizeof(*dstState));
2324
2325 }
2326
2327 /*! @ingroup xxh32_family */
2328 XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr,
2329 XXH32_hash_t seed) {
2330
2331 XXH32_state_t state; /* using a local state to memcpy() in order to avoid
2332 strict-aliasing warnings */
2333 memset(&state, 0, sizeof(state));
2334 state.v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
2335 state.v2 = seed + XXH_PRIME32_2;
2336 state.v3 = seed + 0;
2337 state.v4 = seed - XXH_PRIME32_1;
2338 /* do not write into reserved, planned to be removed in a future version */
2339 memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved));
2340 return XXH_OK;
2341
2342 }
2343
2344 /*! @ingroup xxh32_family */
2345 XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *state,
2346 const void *input, size_t len) {
2347
2348 if (input == NULL)
2349 #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
2350 (XXH_ACCEPT_NULL_INPUT_POINTER >= 1)
2351 return XXH_OK;
2352 #else
2353 return XXH_ERROR;
2354 #endif
2355
2356 {
2357
2358 const xxh_u8 *p = (const xxh_u8 *)input;
2359 const xxh_u8 *const bEnd = p + len;
2360
2361 state->total_len_32 += (XXH32_hash_t)len;
2362 state->large_len |=
2363 (XXH32_hash_t)((len >= 16) | (state->total_len_32 >= 16));
2364
2365 if (state->memsize + len < 16) { /* fill in tmp buffer */
2366 XXH_memcpy((xxh_u8 *)(state->mem32) + state->memsize, input, len);
2367 state->memsize += (XXH32_hash_t)len;
2368 return XXH_OK;
2369
2370 }
2371
2372 if (state->memsize) { /* some data left from previous update */
2373 XXH_memcpy((xxh_u8 *)(state->mem32) + state->memsize, input,
2374 16 - state->memsize);
2375 {
2376
2377 const xxh_u32 *p32 = state->mem32;
2378 state->v1 = XXH32_round(state->v1, XXH_readLE32(p32));
2379 p32++;
2380 state->v2 = XXH32_round(state->v2, XXH_readLE32(p32));
2381 p32++;
2382 state->v3 = XXH32_round(state->v3, XXH_readLE32(p32));
2383 p32++;
2384 state->v4 = XXH32_round(state->v4, XXH_readLE32(p32));
2385
2386 }
2387
2388 p += 16 - state->memsize;
2389 state->memsize = 0;
2390
2391 }
2392
2393 if (p <= bEnd - 16) {
2394
2395 const xxh_u8 *const limit = bEnd - 16;
2396 xxh_u32 v1 = state->v1;
2397 xxh_u32 v2 = state->v2;
2398 xxh_u32 v3 = state->v3;
2399 xxh_u32 v4 = state->v4;
2400
2401 do {
2402
2403 v1 = XXH32_round(v1, XXH_readLE32(p));
2404 p += 4;
2405 v2 = XXH32_round(v2, XXH_readLE32(p));
2406 p += 4;
2407 v3 = XXH32_round(v3, XXH_readLE32(p));
2408 p += 4;
2409 v4 = XXH32_round(v4, XXH_readLE32(p));
2410 p += 4;
2411
2412 } while (p <= limit);
2413
2414 state->v1 = v1;
2415 state->v2 = v2;
2416 state->v3 = v3;
2417 state->v4 = v4;
2418
2419 }
2420
2421 if (p < bEnd) {
2422
2423 XXH_memcpy(state->mem32, p, (size_t)(bEnd - p));
2424 state->memsize = (unsigned)(bEnd - p);
2425
2426 }
2427
2428 }
2429
2430 return XXH_OK;
2431
2432 }
2433
2434 /*! @ingroup xxh32_family */
2435 XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *state) {
2436
2437 xxh_u32 h32;
2438
2439 if (state->large_len) {
2440
2441 h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) +
2442 XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
2443
2444 } else {
2445
2446 h32 = state->v3 /* == seed */ + XXH_PRIME32_5;
2447
2448 }
2449
2450 h32 += state->total_len_32;
2451
2452 return XXH32_finalize(h32, (const xxh_u8 *)state->mem32, state->memsize,
2453 XXH_aligned);
2454
2455 }
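
/*
 * Illustrative sketch (hedged, not part of the library): hashing data that
 * arrives in chunks with the streaming API above. `hash_chunks`, `chunks` and
 * `sizes` are hypothetical names used only for this example; error handling
 * (e.g. a NULL return from XXH32_createState()) is omitted for brevity.
 *
 *   static XXH32_hash_t hash_chunks(const void *const *chunks,
 *                                   const size_t *sizes, size_t count,
 *                                   XXH32_hash_t seed) {
 *
 *     size_t         i;
 *     XXH32_hash_t   result;
 *     XXH32_state_t *st = XXH32_createState();   // heap-allocated state
 *     XXH32_reset(st, seed);                     // start a new hash
 *     for (i = 0; i < count; i++)
 *       XXH32_update(st, chunks[i], sizes[i]);   // consume each chunk
 *     result = XXH32_digest(st);                 // equals one-shot XXH32()
 *     XXH32_freeState(st);
 *     return result;
 *
 *   }
 *
 * XXH32_digest() does not modify the state, so an intermediate digest can be
 * taken at any point and updating can continue afterwards.
 */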
2456
2457 /******* Canonical representation *******/
2458
2459 /*!
2460 * @ingroup xxh32_family
2461 * The default return values from XXH functions are unsigned 32 and 64 bit
2462 * integers.
2463 *
2464 * The canonical representation uses big endian convention, the same convention
2465 * as human-readable numbers (large digits first).
2466 *
2467 * This way, hash values can be written into a file or buffer, remaining
2468 * comparable across different systems.
2469 *
2470 * The following functions allow transformation of hash values to and from their
2471 * canonical format.
2472 */
2473 XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t *dst,
2474 XXH32_hash_t hash) {
2475
2476 XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
2477 if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
2478 memcpy(dst, &hash, sizeof(*dst));
2479
2480 }
2481
2482 /*! @ingroup xxh32_family */
2483 XXH_PUBLIC_API XXH32_hash_t
2484 XXH32_hashFromCanonical(const XXH32_canonical_t *src) {
2485
2486 return XXH_readBE32(src);
2487
2488 }
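
/*
 * Illustrative round-trip sketch (hedged, not part of the library): the
 * canonical form above lets a hash be stored and reloaded portably. `c`, `h`
 * and `h2` are arbitrary example names.
 *
 *   XXH32_canonical_t c;
 *   XXH32_hash_t      h = XXH32("abc", 3, 0);
 *
 *   XXH32_canonicalFromHash(&c, h);         // big-endian bytes, safe to write to disk
 *   // ... later, possibly on a machine with different endianness ...
 *   XXH32_hash_t h2 = XXH32_hashFromCanonical(&c);   // h2 == h
 */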
2489
2490 #ifndef XXH_NO_LONG_LONG
2491
2492 /* *******************************************************************
2493 * 64-bit hash functions
2494 *********************************************************************/
2495 /*!
2496 * @}
2497 * @ingroup impl
2498 * @{
2499
2500 */
2501 /******* Memory access *******/
2502
2503 typedef XXH64_hash_t xxh_u64;
2504
2505 #ifdef XXH_OLD_NAMES
2506 #define U64 xxh_u64
2507 #endif
2508
2509 #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3))
2510 /*
2511 * Manual byteshift. Best for old compilers which don't inline memcpy.
2512 * We actually directly use XXH_readLE64 and XXH_readBE64.
2513 */
2514 #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 2))
2515
2516 /* Force direct memory access. Only works on CPUs which support unaligned memory
2517 * access in hardware */
2518 static xxh_u64 XXH_read64(const void *memPtr) {
2519
2520 return *(const xxh_u64 *)memPtr;
2521
2522 }
2523
2524 #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 1))
2525
2526 /*
2527 * __pack instructions are safer, but compiler specific, hence potentially
2528 * problematic for some compilers.
2529 *
2530 * Currently only defined for GCC and ICC.
2531 */
2532 #ifdef XXH_OLD_NAMES
2533 typedef union {
2534
2535 xxh_u32 u32;
2536 xxh_u64 u64;
2537
2538 } __attribute__((packed)) unalign64;
2539
2540 #endif
2541 static xxh_u64 XXH_read64(const void *ptr) {
2542
2543 typedef union {
2544
2545 xxh_u32 u32;
2546 xxh_u64 u64;
2547
2548 } __attribute__((packed)) xxh_unalign64;
2549
2550 return ((const xxh_unalign64 *)ptr)->u64;
2551
2552 }
2553
2554 #else
2555
2556 /*
2557 * Portable and safe solution. Generally efficient.
2558 * see: https://stackoverflow.com/a/32095106/646947
2559 */
2560 static xxh_u64 XXH_read64(const void *memPtr) {
2561
2562 xxh_u64 val;
2563 memcpy(&val, memPtr, sizeof(val));
2564 return val;
2565
2566 }
2567
2568 #endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
2569
2570 #if defined(_MSC_VER) /* Visual Studio */
2571 #define XXH_swap64 _byteswap_uint64
2572 #elif XXH_GCC_VERSION >= 403
2573 #define XXH_swap64 __builtin_bswap64
2574 #else
2575 static xxh_u64 XXH_swap64(xxh_u64 x) {
2576
2577 return ((x << 56) & 0xff00000000000000ULL) |
2578 ((x << 40) & 0x00ff000000000000ULL) |
2579 ((x << 24) & 0x0000ff0000000000ULL) |
2580 ((x << 8) & 0x000000ff00000000ULL) |
2581 ((x >> 8) & 0x00000000ff000000ULL) |
2582 ((x >> 24) & 0x0000000000ff0000ULL) |
2583 ((x >> 40) & 0x000000000000ff00ULL) |
2584 ((x >> 56) & 0x00000000000000ffULL);
2585
2586 }
2587
2588 #endif
2589
2590 /* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */
2591 #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3))
2592
2593 XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void *memPtr) {
2594
2595 const xxh_u8 *bytePtr = (const xxh_u8 *)memPtr;
2596 return bytePtr[0] | ((xxh_u64)bytePtr[1] << 8) | ((xxh_u64)bytePtr[2] << 16) |
2597 ((xxh_u64)bytePtr[3] << 24) | ((xxh_u64)bytePtr[4] << 32) |
2598 ((xxh_u64)bytePtr[5] << 40) | ((xxh_u64)bytePtr[6] << 48) |
2599 ((xxh_u64)bytePtr[7] << 56);
2600
2601 }
2602
2603 XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void *memPtr) {
2604
2605 const xxh_u8 *bytePtr = (const xxh_u8 *)memPtr;
2606 return bytePtr[7] | ((xxh_u64)bytePtr[6] << 8) | ((xxh_u64)bytePtr[5] << 16) |
2607 ((xxh_u64)bytePtr[4] << 24) | ((xxh_u64)bytePtr[3] << 32) |
2608 ((xxh_u64)bytePtr[2] << 40) | ((xxh_u64)bytePtr[1] << 48) |
2609 ((xxh_u64)bytePtr[0] << 56);
2610
2611 }
2612
2613 #else
2614 XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void *ptr) {
2615
2616 return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
2617
2618 }
2619
2620 static xxh_u64 XXH_readBE64(const void *ptr) {
2621
2622 return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
2623
2624 }
2625
2626 #endif
2627
2628 XXH_FORCE_INLINE xxh_u64 XXH_readLE64_align(const void *ptr,
2629 XXH_alignment align) {
2630
2631 if (align == XXH_unaligned)
2632 return XXH_readLE64(ptr);
2633 else
2634 return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64 *)ptr
2635 : XXH_swap64(*(const xxh_u64 *)ptr);
2636
2637 }
2638
2639 /******* xxh64 *******/
2640 /*!
2641 * @}
2642 * @defgroup xxh64_impl XXH64 implementation
2643 * @ingroup impl
2644 * @{
2645
2646 */
2647 /* #define rather than static const, to be used as initializers */
2648 #define XXH_PRIME64_1 \
2649 0x9E3779B185EBCA87ULL /*!< \
2650 0b1001111000110111011110011011000110000101111010111100101010000111 \
2651 */
2652 #define XXH_PRIME64_2 \
2653 0xC2B2AE3D27D4EB4FULL /*!< \
2654 0b1100001010110010101011100011110100100111110101001110101101001111 \
2655 */
2656 #define XXH_PRIME64_3 \
2657 0x165667B19E3779F9ULL /*!< \
2658 0b0001011001010110011001111011000110011110001101110111100111111001 \
2659 */
2660 #define XXH_PRIME64_4 \
2661 0x85EBCA77C2B2AE63ULL /*!< \
2662 0b1000010111101011110010100111011111000010101100101010111001100011 \
2663 */
2664 #define XXH_PRIME64_5 \
2665 0x27D4EB2F165667C5ULL /*!< \
2666 0b0010011111010100111010110010111100010110010101100110011111000101 \
2667 */
2668
2669 #ifdef XXH_OLD_NAMES
2670 #define PRIME64_1 XXH_PRIME64_1
2671 #define PRIME64_2 XXH_PRIME64_2
2672 #define PRIME64_3 XXH_PRIME64_3
2673 #define PRIME64_4 XXH_PRIME64_4
2674 #define PRIME64_5 XXH_PRIME64_5
2675 #endif
2676
2677 static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) {
2678
2679 acc += input * XXH_PRIME64_2;
2680 acc = XXH_rotl64(acc, 31);
2681 acc *= XXH_PRIME64_1;
2682 return acc;
2683
2684 }
2685
2686 static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) {
2687
2688 val = XXH64_round(0, val);
2689 acc ^= val;
2690 acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4;
2691 return acc;
2692
2693 }
2694
2695 static xxh_u64 XXH64_avalanche(xxh_u64 h64) {
2696
2697 h64 ^= h64 >> 33;
2698 h64 *= XXH_PRIME64_2;
2699 h64 ^= h64 >> 29;
2700 h64 *= XXH_PRIME64_3;
2701 h64 ^= h64 >> 32;
2702 return h64;
2703
2704 }
2705
2706 #define XXH_get64bits(p) XXH_readLE64_align(p, align)
2707
2708 static xxh_u64 XXH64_finalize(xxh_u64 h64, const xxh_u8 *ptr, size_t len,
2709 XXH_alignment align) {
2710
2711 len &= 31;
2712 while (len >= 8) {
2713
2714 xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
2715 ptr += 8;
2716 h64 ^= k1;
2717 h64 = XXH_rotl64(h64, 27) * XXH_PRIME64_1 + XXH_PRIME64_4;
2718 len -= 8;
2719
2720 }
2721
2722 if (len >= 4) {
2723
2724 h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
2725 ptr += 4;
2726 h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
2727 len -= 4;
2728
2729 }
2730
2731 while (len > 0) {
2732
2733 h64 ^= (*ptr++) * XXH_PRIME64_5;
2734 h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1;
2735 --len;
2736
2737 }
2738
2739 return XXH64_avalanche(h64);
2740
2741 }
2742
2743 #ifdef XXH_OLD_NAMES
2744 #define PROCESS1_64 XXH_PROCESS1_64
2745 #define PROCESS4_64 XXH_PROCESS4_64
2746 #define PROCESS8_64 XXH_PROCESS8_64
2747 #else
2748 #undef XXH_PROCESS1_64
2749 #undef XXH_PROCESS4_64
2750 #undef XXH_PROCESS8_64
2751 #endif
2752
2753 XXH_FORCE_INLINE xxh_u64 XXH64_endian_align(const xxh_u8 *input, size_t len,
2754 xxh_u64 seed, XXH_alignment align) {
2755
2756 const xxh_u8 *bEnd = input ? input + len : NULL;
2757 xxh_u64 h64;
2758
2759 #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
2760 (XXH_ACCEPT_NULL_INPUT_POINTER >= 1)
2761 if (input == NULL) {
2762
2763 len = 0;
2764 bEnd = input = (const xxh_u8 *)(size_t)32;
2765
2766 }
2767
2768 #endif
2769
2770 if (len >= 32) {
2771
2772 const xxh_u8 *const limit = bEnd - 32;
2773 xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
2774 xxh_u64 v2 = seed + XXH_PRIME64_2;
2775 xxh_u64 v3 = seed + 0;
2776 xxh_u64 v4 = seed - XXH_PRIME64_1;
2777
2778 do {
2779
2780 v1 = XXH64_round(v1, XXH_get64bits(input));
2781 input += 8;
2782 v2 = XXH64_round(v2, XXH_get64bits(input));
2783 input += 8;
2784 v3 = XXH64_round(v3, XXH_get64bits(input));
2785 input += 8;
2786 v4 = XXH64_round(v4, XXH_get64bits(input));
2787 input += 8;
2788
2789 } while (input <= limit);
2790
2791 h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) +
2792 XXH_rotl64(v4, 18);
2793 h64 = XXH64_mergeRound(h64, v1);
2794 h64 = XXH64_mergeRound(h64, v2);
2795 h64 = XXH64_mergeRound(h64, v3);
2796 h64 = XXH64_mergeRound(h64, v4);
2797
2798 } else {
2799
2800 h64 = seed + XXH_PRIME64_5;
2801
2802 }
2803
2804 h64 += (xxh_u64)len;
2805
2806 return XXH64_finalize(h64, input, len, align);
2807
2808 }
2809
2810 /*! @ingroup xxh64_family */
2811 XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t len,
2812 XXH64_hash_t seed) {
2813
2814 #if 0
2815 /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
2816 XXH64_state_t state;
2817 XXH64_reset(&state, seed);
2818 XXH64_update(&state, (const xxh_u8*)input, len);
2819 return XXH64_digest(&state);
2820 #else
2821 if (XXH_FORCE_ALIGN_CHECK) {
2822
2823 if ((((size_t)input) & 7) ==
2824 0) { /* Input is aligned, let's leverage the speed advantage */
2825 return XXH64_endian_align((const xxh_u8 *)input, len, seed, XXH_aligned);
2826
2827 }
2828
2829 }
2830
2831 return XXH64_endian_align((const xxh_u8 *)input, len, seed, XXH_unaligned);
2832
2833 #endif
2834
2835 }
2836
2837 /******* Hash Streaming *******/
2838
2839 /*! @ingroup xxh64_family*/
2840 XXH_PUBLIC_API XXH64_state_t *XXH64_createState(void) {
2841
2842 return (XXH64_state_t *)XXH_malloc(sizeof(XXH64_state_t));
2843
2844 }
2845
2846 /*! @ingroup xxh64_family */
2847 XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t *statePtr) {
2848
2849 XXH_free(statePtr);
2850 return XXH_OK;
2851
2852 }
2853
2854 /*! @ingroup xxh64_family */
2855 XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t *dstState,
2856 const XXH64_state_t *srcState) {
2857
2858 memcpy(dstState, srcState, sizeof(*dstState));
2859
2860 }
2861
2862 /*! @ingroup xxh64_family */
2863 XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t *statePtr,
2864 XXH64_hash_t seed) {
2865
2866 XXH64_state_t state; /* use a local state to memcpy() in order to avoid
2867 strict-aliasing warnings */
2868 memset(&state, 0, sizeof(state));
2869 state.v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
2870 state.v2 = seed + XXH_PRIME64_2;
2871 state.v3 = seed + 0;
2872 state.v4 = seed - XXH_PRIME64_1;
2873 /* do not write into reserved64, might be removed in a future version */
2874 memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64));
2875 return XXH_OK;
2876
2877 }
2878
2879 /*! @ingroup xxh64_family */
2880 XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t *state,
2881 const void *input, size_t len) {
2882
2883 if (input == NULL)
2884 #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
2885 (XXH_ACCEPT_NULL_INPUT_POINTER >= 1)
2886 return XXH_OK;
2887 #else
2888 return XXH_ERROR;
2889 #endif
2890
2891 {
2892
2893 const xxh_u8 *p = (const xxh_u8 *)input;
2894 const xxh_u8 *const bEnd = p + len;
2895
2896 state->total_len += len;
2897
2898 if (state->memsize + len < 32) { /* fill in tmp buffer */
2899 XXH_memcpy(((xxh_u8 *)state->mem64) + state->memsize, input, len);
2900 state->memsize += (xxh_u32)len;
2901 return XXH_OK;
2902
2903 }
2904
2905 if (state->memsize) { /* tmp buffer is full */
2906 XXH_memcpy(((xxh_u8 *)state->mem64) + state->memsize, input,
2907 32 - state->memsize);
2908 state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64 + 0));
2909 state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64 + 1));
2910 state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64 + 2));
2911 state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64 + 3));
2912 p += 32 - state->memsize;
2913 state->memsize = 0;
2914
2915 }
2916
2917 if (p + 32 <= bEnd) {
2918
2919 const xxh_u8 *const limit = bEnd - 32;
2920 xxh_u64 v1 = state->v1;
2921 xxh_u64 v2 = state->v2;
2922 xxh_u64 v3 = state->v3;
2923 xxh_u64 v4 = state->v4;
2924
2925 do {
2926
2927 v1 = XXH64_round(v1, XXH_readLE64(p));
2928 p += 8;
2929 v2 = XXH64_round(v2, XXH_readLE64(p));
2930 p += 8;
2931 v3 = XXH64_round(v3, XXH_readLE64(p));
2932 p += 8;
2933 v4 = XXH64_round(v4, XXH_readLE64(p));
2934 p += 8;
2935
2936 } while (p <= limit);
2937
2938 state->v1 = v1;
2939 state->v2 = v2;
2940 state->v3 = v3;
2941 state->v4 = v4;
2942
2943 }
2944
2945 if (p < bEnd) {
2946
2947 XXH_memcpy(state->mem64, p, (size_t)(bEnd - p));
2948 state->memsize = (unsigned)(bEnd - p);
2949
2950 }
2951
2952 }
2953
2954 return XXH_OK;
2955
2956 }
2957
2958 /*! @ingroup xxh64_family */
2959 XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t *state) {
2960
2961 xxh_u64 h64;
2962
2963 if (state->total_len >= 32) {
2964
2965 xxh_u64 const v1 = state->v1;
2966 xxh_u64 const v2 = state->v2;
2967 xxh_u64 const v3 = state->v3;
2968 xxh_u64 const v4 = state->v4;
2969
2970 h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) +
2971 XXH_rotl64(v4, 18);
2972 h64 = XXH64_mergeRound(h64, v1);
2973 h64 = XXH64_mergeRound(h64, v2);
2974 h64 = XXH64_mergeRound(h64, v3);
2975 h64 = XXH64_mergeRound(h64, v4);
2976
2977 } else {
2978
2979 h64 = state->v3 /*seed*/ + XXH_PRIME64_5;
2980
2981 }
2982
2983 h64 += (xxh_u64)state->total_len;
2984
2985 return XXH64_finalize(h64, (const xxh_u8 *)state->mem64,
2986 (size_t)state->total_len, XXH_aligned);
2987
2988 }
2989
2990 /******* Canonical representation *******/
2991
2992 /*! @ingroup xxh64_family */
2993 XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst,
2994 XXH64_hash_t hash) {
2995
2996 XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
2997 if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
2998 memcpy(dst, &hash, sizeof(*dst));
2999
3000 }
3001
3002 /*! @ingroup xxh64_family */
3003 XXH_PUBLIC_API XXH64_hash_t
3004 XXH64_hashFromCanonical(const XXH64_canonical_t *src) {
3005
3006 return XXH_readBE64(src);
3007
3008 }
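
/*
 * Illustrative sketch (hedged, not part of the library): XXH64 mirrors the
 * XXH32 API with 64-bit values. `save_hash64` is a hypothetical helper that
 * combines a one-shot hash with canonical (big-endian) serialization.
 *
 *   static void save_hash64(const void *data, size_t len, unsigned char out[8]) {
 *
 *     XXH64_hash_t      h = XXH64(data, len, 0);   // seed = 0
 *     XXH64_canonical_t c;
 *     XXH64_canonicalFromHash(&c, h);              // endianness-independent bytes
 *     memcpy(out, &c, sizeof(c));                  // 8 bytes, portable ordering
 *
 *   }
 */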
3009
3010 #ifndef XXH_NO_XXH3
3011
3012 /* *********************************************************************
3013 * XXH3
3014 * New generation hash designed for speed on small keys and vectorization
3015 ************************************************************************ */
3016 /*!
3017 * @}
3018 * @defgroup xxh3_impl XXH3 implementation
3019 * @ingroup impl
3020 * @{
3021
3022 */
3023
3024 /* === Compiler specifics === */
3025
3026 #if ((defined(sun) || defined(__sun)) && \
3027 __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested \
3028 with GCC 5.5 */
3029 #define XXH_RESTRICT /* disable */
3030 #elif defined(__STDC_VERSION__) && \
3031 __STDC_VERSION__ >= 199901L /* >= C99 */
3032 #define XXH_RESTRICT restrict
3033 #else
3034 /* Note: it might be useful to define __restrict or __restrict__ for
3035 * some C++ compilers */
3036 #define XXH_RESTRICT /* disable */
3037 #endif
3038
3039 #if (defined(__GNUC__) && (__GNUC__ >= 3)) || \
3040 (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || \
3041 defined(__clang__)
3042 #define XXH_likely(x) __builtin_expect(x, 1)
3043 #define XXH_unlikely(x) __builtin_expect(x, 0)
3044 #else
3045 #define XXH_likely(x) (x)
3046 #define XXH_unlikely(x) (x)
3047 #endif
3048
3049 #if defined(__GNUC__)
3050 #if defined(__AVX2__)
3051 #include <immintrin.h>
3052 #elif defined(__SSE2__)
3053 #include <emmintrin.h>
3054 #elif defined(__ARM_NEON__) || defined(__ARM_NEON)
3055 #define inline __inline__ /* circumvent a clang bug */
3056 #include <arm_neon.h>
3057 #undef inline
3058 #endif
3059 #elif defined(_MSC_VER)
3060 #include <intrin.h>
3061 #endif
3062
3063 /*
3064 * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
3065 * remaining a true 64-bit/128-bit hash function.
3066 *
3067 * This is done by prioritizing a subset of 64-bit operations that can be
3068 * emulated without too many steps on the average 32-bit machine.
3069 *
3070 * For example, these two lines seem similar, and run equally fast on
3071 * 64-bit:
3072 *
3073 * xxh_u64 x;
3074 * x ^= (x >> 47); // good
3075 * x ^= (x >> 13); // bad
3076 *
3077 * However, to a 32-bit machine, there is a major difference.
3078 *
3079 * x ^= (x >> 47) looks like this:
3080 *
3081 * x.lo ^= (x.hi >> (47 - 32));
3082 *
3083 * while x ^= (x >> 13) looks like this:
3084 *
3085 * // note: funnel shifts are not usually cheap.
3086 * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
3087 * x.hi ^= (x.hi >> 13);
3088 *
3089 * The first one is significantly faster than the second, simply because
3090 * the shift is larger than 32. This means:
3091 * - All the bits we need are in the upper 32 bits, so we can ignore the
3092 * lower 32 bits in the shift.
3093 * - The shift result will always fit in the lower 32 bits, and
3094 * therefore, we can ignore the upper 32 bits in the xor.
3095 *
3096 * Thanks to this optimization, XXH3 only requires these features to be
3097 * efficient:
3098 *
3099 * - Usable unaligned access
3100 * - A 32-bit or 64-bit ALU
3101 * - If 32-bit, a decent ADC instruction
3102 * - A 32 or 64-bit multiply with a 64-bit result
3103 * - For the 128-bit variant, a decent byteswap helps short inputs.
3104 *
3105 * The first two are already required by XXH32, and almost all 32-bit and
3106 * 64-bit platforms which can run XXH32 can run XXH3 efficiently.
3107 *
3108 * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is
3109 * one notable exception.
3110 *
3111 * First of all, Thumb-1 lacks support for the UMULL instruction which
3112 * performs the important long multiply. This means numerous __aeabi_lmul
3113 * calls.
3114 *
3115 * Second of all, the 8 functional registers are just not enough.
3116 * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic
3117 * need Lo registers, and this shuffling results in thousands more MOVs
3118 * than A32.
3119 *
3120 * A32 and T32 don't have this limitation. They can access all 14
3121 * registers, do a 32->64 multiply with UMULL, and the flexible operand
3122 * allowing free shifts is helpful, too.
3123 *
3124 * Therefore, we do a quick sanity check.
3125 *
3126 * If compiling Thumb-1 for a target which supports ARM instructions, we
3127 * will emit a warning, as it is not a "sane" platform to compile for.
3128 *
3129 * Usually, if this happens, it is because of an accident and you probably
3130 * need to specify -march, as you likely meant to compile for a newer
3131 * architecture.
3132 *
3133 * Credit: large sections of the vectorial and asm source code paths
3134 * have been contributed by @easyaspi314
3135 */
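
/*
 * Hedged illustration of the reasoning above (not code used by the library):
 * on a 32-bit target a 64-bit value is held as a {lo, hi} pair, so a shift by
 * 47 touches only the high word while a shift by 13 needs a funnel shift
 * across both words. `xxh_u64_32`, `xorshift47` and `xorshift13` are
 * hypothetical names for this sketch only.
 *
 *   typedef struct { xxh_u32 lo, hi; } xxh_u64_32;
 *
 *   static xxh_u64_32 xorshift47(xxh_u64_32 x) {   // x ^= x >> 47  (cheap)
 *
 *     x.lo ^= x.hi >> (47 - 32);                   // only the high word matters
 *     return x;
 *
 *   }
 *
 *   static xxh_u64_32 xorshift13(xxh_u64_32 x) {   // x ^= x >> 13  (expensive)
 *
 *     x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));  // funnel shift across words
 *     x.hi ^= x.hi >> 13;
 *     return x;
 *
 *   }
 */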
3136 #if defined(__thumb__) && !defined(__thumb2__) && \
3137 defined(__ARM_ARCH_ISA_ARM)
3138 #warning "XXH3 is highly inefficient without ARM or Thumb-2."
3139 #endif
3140
3141 /* ==========================================
3142 * Vectorization detection
3143 * ========================================== */
3144
3145 #ifdef XXH_DOXYGEN
3146 /*!
3147 * @ingroup tuning
3148 * @brief Overrides the vectorization implementation chosen for XXH3.
3149 *
3150 * Can be defined to 0 to disable SIMD or any of the values mentioned in
3151 * @ref XXH_VECTOR_TYPE.
3152 *
3153 * If this is not defined, it uses predefined macros to determine the
3154 * best implementation.
3155 */
3156 #define XXH_VECTOR XXH_SCALAR
3157 /*!
3158 * @ingroup tuning
3159 * @brief Possible values for @ref XXH_VECTOR.
3160 *
3161 * Note that these are actually implemented as macros.
3162 *
3163 * If this is not defined, it is detected automatically.
3164 * @ref XXH_X86DISPATCH overrides this.
3165 */
3166 enum XXH_VECTOR_TYPE /* fake enum */ {
3167
3168 XXH_SCALAR = 0, /*!< Portable scalar version */
3169 XXH_SSE2 = 1, /*!<
3170 * SSE2 for Pentium 4, Opteron, all x86_64.
3171 *
3172 * @note SSE2 is also guaranteed on Windows 10, macOS, and
3173 * Android x86.
3174 */
3175 XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */
3176 XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */
3177 XXH_NEON = 4, /*!< NEON for most ARMv7-A and all AArch64 */
3178 XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */
3179
3180 };
3181
3182 /*!
3183 * @ingroup tuning
3184 * @brief Selects the minimum alignment for XXH3's accumulators.
3185 *
3186   * When using SIMD, this should match the alignment required for said
3187 * vector type, so, for example, 32 for AVX2.
3188 *
3189 * Default: Auto detected.
3190 */
3191 #define XXH_ACC_ALIGN 8
3192 #endif
3193
3194 /* Actual definition */
3195 #ifndef XXH_DOXYGEN
3196 #define XXH_SCALAR 0
3197 #define XXH_SSE2 1
3198 #define XXH_AVX2 2
3199 #define XXH_AVX512 3
3200 #define XXH_NEON 4
3201 #define XXH_VSX 5
3202 #endif
3203
3204 #ifndef XXH_VECTOR /* can be defined on command line */
3205 #if defined(__AVX512F__)
3206 #define XXH_VECTOR XXH_AVX512
3207 #elif defined(__AVX2__)
3208 #define XXH_VECTOR XXH_AVX2
3209 #elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || \
3210 (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
3211 #define XXH_VECTOR XXH_SSE2
3212 #elif defined(__GNUC__) /* msvc support maybe later */ \
3213 && (defined(__ARM_NEON__) || defined(__ARM_NEON)) && \
3214 (defined( \
3215 __LITTLE_ENDIAN__) /* We only support little endian NEON */ \
3216 || (defined(__BYTE_ORDER__) && \
3217 __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
3218 #define XXH_VECTOR XXH_NEON
3219 #elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) || \
3220 (defined(__s390x__) && defined(__VEC__)) && \
3221 defined(__GNUC__) /* TODO: IBM XL */
3222 #define XXH_VECTOR XXH_VSX
3223 #else
3224 #define XXH_VECTOR XXH_SCALAR
3225 #endif
3226 #endif
3227
3228 /*
3229 * Controls the alignment of the accumulator,
3230 * for compatibility with aligned vector loads, which are usually faster.
3231 */
3232 #ifndef XXH_ACC_ALIGN
3233 #if defined(XXH_X86DISPATCH)
3234 #define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */
3235 #elif XXH_VECTOR == XXH_SCALAR /* scalar */
3236 #define XXH_ACC_ALIGN 8
3237 #elif XXH_VECTOR == XXH_SSE2 /* sse2 */
3238 #define XXH_ACC_ALIGN 16
3239 #elif XXH_VECTOR == XXH_AVX2 /* avx2 */
3240 #define XXH_ACC_ALIGN 32
3241 #elif XXH_VECTOR == XXH_NEON /* neon */
3242 #define XXH_ACC_ALIGN 16
3243 #elif XXH_VECTOR == XXH_VSX /* vsx */
3244 #define XXH_ACC_ALIGN 16
3245 #elif XXH_VECTOR == XXH_AVX512 /* avx512 */
3246 #define XXH_ACC_ALIGN 64
3247 #endif
3248 #endif
3249
3250 #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 || \
3251 XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
3252 #define XXH_SEC_ALIGN XXH_ACC_ALIGN
3253 #else
3254 #define XXH_SEC_ALIGN 8
3255 #endif
3256
3257 /*
3258 * UGLY HACK:
3259 * GCC usually generates the best code with -O3 for xxHash.
3260 *
3261 * However, when targeting AVX2, it is overzealous in its unrolling
3262 * resulting in code roughly 3/4 the speed of Clang.
3263 *
3264 * There are other issues, such as GCC splitting _mm256_loadu_si256 into
3265 * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization
3266 * which only applies to Sandy and Ivy Bridge... which don't even support
3267 * AVX2.
3268 *
3269 * That is why when compiling the AVX2 version, it is recommended to use
3270 * either -O2 -mavx2 -march=haswell or -O2 -mavx2
3271 * -mno-avx256-split-unaligned-load for decent performance, or to use
3272 * Clang instead.
3273 *
3274 * Fortunately, we can control the first one with a pragma that forces GCC
3275 * into -O2, but the other one we can't control without "failed to inline
3276 * always inline function due to target mismatch" warnings.
3277 */
3278 #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
3279 && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
3280 && defined(__OPTIMIZE__) && \
3281 !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
3282 #pragma GCC push_options
3283 #pragma GCC optimize("-O2")
3284 #endif
3285
3286 #if XXH_VECTOR == XXH_NEON
3287 /*
3288 * NEON's setup for vmlal_u32 is a little more complicated than it is on
3289 * SSE2, AVX2, and VSX.
3290 *
3291 * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an
3292 * upcast.
3293 *
3294 * To do the same operation, the 128-bit 'Q' register needs to be split
3295 * into two 64-bit 'D' registers, performing this operation::
3296 *
3297   *   [                a                 |                 b                ]
3298   *            |              '---------. .--------'              |
3299   *            |                         x                        |
3300   *            |              .---------' '--------.              |
3301   *   [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[    a >> 32     |     b >> 32    ]
3302 *
3303 * Due to significant changes in aarch64, the fastest method for aarch64
3304 * is completely different than the fastest method for ARMv7-A.
3305 *
3306 * ARMv7-A treats D registers as unions overlaying Q registers, so
3307 * modifying D11 will modify the high half of Q5. This is similar to how
3308 * modifying AH will only affect bits 8-15 of AX on x86.
3309 *
3310 * VZIP takes two registers, and puts even lanes in one register and odd
3311 * lanes in the other.
3312 *
3313 * On ARMv7-A, this strangely modifies both parameters in place instead
3314 * of taking the usual 3-operand form.
3315 *
3316 * Therefore, if we want to do this, we can simply use a D-form VZIP.32
3317 * on the lower and upper halves of the Q register to end up with the
3318 * high and low halves where we want - all in one instruction.
3319 *
3320   *   vzip.32 d10, d11    @ d10 = { d10[0], d11[0] };  d11 = { d10[1], d11[1] }
3323 *
3324   * Unfortunately we need inline assembly for this: instructions
3325   * modifying two registers at once cannot be expressed in GCC or Clang's IR,
3326   * so they would have to create a copy.
3327 *
3328 * aarch64 requires a different approach.
3329 *
3330 * In order to make it easier to write a decent compiler for aarch64,
3331 * many quirks were removed, such as conditional execution.
3332 *
3333 * NEON was also affected by this.
3334 *
3335 * aarch64 cannot access the high bits of a Q-form register, and writes
3336 * to a D-form register zero the high bits, similar to how writes to
3337 * W-form scalar registers (or DWORD registers on x86_64) work.
3338 *
3339 * The formerly free vget_high intrinsics now require a vext (with a few
3340 * exceptions)
3341 *
3342 * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the
3343 * equivalent of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to
3344 * only modify one operand.
3345 *
3346 * The equivalent of the VZIP.32 on the lower and upper halves would be
3347 * this mess:
3348 *
3349   *   ext  v2.4s, v0.4s, v0.4s, #2     // v2 = { v0[2], v0[3], v0[0], v0[1] }
3350   *   zip1 v1.2s, v0.2s, v2.2s         // v1 = { v0[0], v2[0] }
3351   *   zip2 v0.2s, v0.2s, v1.2s         // v0 = { v0[1], v2[1] }
3352 *
3353 * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64
3354 * (SHRN):
3355 *
3356 * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32);
3357 * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
3358 *
3359 * This is available on ARMv7-A, but is less efficient than a single
3360 * VZIP.32.
3361 */
3362
3363 /*!
3364 * Function-like macro:
3365  *   void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo,
3366  *                           uint32x2_t &outHi)
3367  *   {
3368  *
3369 * outLo = (uint32x2_t)(in & 0xFFFFFFFF);
3370 * outHi = (uint32x2_t)(in >> 32);
3371 * in = UNDEFINED;
3372 * }
3373 */
3374 #if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
3375 && defined(__GNUC__) && !defined(__aarch64__) && \
3376 !defined(__arm64__)
3377 #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
3378 do { \
3379 \
3380 /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, \
3381 * %f0 = upper D half */ \
3382 /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 \
3383 */ \
3384 /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 \
3385 */ \
3386 __asm__("vzip.32 %e0, %f0" : "+w"(in)); \
3387 (outLo) = vget_low_u32(vreinterpretq_u32_u64(in)); \
3388 (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \
3389 \
3390 } while (0)
3391
3392 #else
3393 #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
3394 do { \
3395 \
3396 (outLo) = vmovn_u64(in); \
3397 (outHi) = vshrn_n_u64((in), 32); \
3398 \
3399 } while (0)
3400
3401 #endif
3402 #endif /* XXH_VECTOR == XXH_NEON */
3403
3404 /*
3405 * VSX and Z Vector helpers.
3406 *
3407 * This is very messy, and any pull requests to clean this up are welcome.
3408 *
3409 * There are a lot of problems with supporting VSX and s390x, due to
3410 * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
3411 */
3412 #if XXH_VECTOR == XXH_VSX
3413 #if defined(__s390x__)
3414 #include <s390intrin.h>
3415 #else
3416       /* gcc's altivec.h has the unwanted side effect of unconditionally
3417        * #defining the bool, vector, and pixel keywords, which breaks
3418        * programs already using these keywords for other
3419 * purposes. The paragraph defining these macros is skipped when
3420 * __APPLE_ALTIVEC__ is defined.
3421 * __APPLE_ALTIVEC__ is _generally_ defined automatically by the
3422 * compiler, but it seems that, in some cases, it isn't. Force the
3423 * build macro to be defined, so that keywords are not altered.
3424 */
3425 #if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__)
3426 #define __APPLE_ALTIVEC__
3427 #endif
3428 #include <altivec.h>
3429 #endif
3430
3431 typedef __vector unsigned long long xxh_u64x2;
3432 typedef __vector unsigned char xxh_u8x16;
3433 typedef __vector unsigned xxh_u32x4;
3434
3435 #ifndef XXH_VSX_BE
3436 #if defined(__BIG_ENDIAN__) || \
3437 (defined(__BYTE_ORDER__) && \
3438 __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
3439 #define XXH_VSX_BE 1
3440 #elif defined(__VEC_ELEMENT_REG_ORDER__) && \
3441 __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
3442 #warning \
3443 "-maltivec=be is not recommended. Please use native endianness."
3444 #define XXH_VSX_BE 1
3445 #else
3446 #define XXH_VSX_BE 0
3447 #endif
3448 #endif /* !defined(XXH_VSX_BE) */
3449
3450 #if XXH_VSX_BE
3451 #if defined(__POWER9_VECTOR__) || \
3452 (defined(__clang__) && defined(__s390x__))
3453 #define XXH_vec_revb vec_revb
3454 #else
3455 /*!
3456 * A polyfill for POWER9's vec_revb().
3457 */
3458 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) {
3459
3460 xxh_u8x16 const vByteSwap = {0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
3461 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08};
3462 return vec_perm(val, val, vByteSwap);
3463
3464 }
3465
3466 #endif
3467 #endif /* XXH_VSX_BE */
3468
3469 /*!
3470 * Performs an unaligned vector load and byte swaps it on big endian.
3471 */
3472 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) {
3473
3474 xxh_u64x2 ret;
3475 memcpy(&ret, ptr, sizeof(xxh_u64x2));
3476 #if XXH_VSX_BE
3477 ret = XXH_vec_revb(ret);
3478 #endif
3479 return ret;
3480
3481 }
3482
3483 /*
3484 * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
3485 *
3486 * These intrinsics weren't added until GCC 8, despite existing for a
3487        * while before that, and they are endian dependent. Also, their
3488        * meanings swap depending on the version.
3489        */
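       /*
        * Illustrative sketch of what the even/odd multiplies compute. The
        * lane numbering below is assumed only for exposition (the numbering
        * is exactly what differs across compilers and endiannesses): for
        * 32-bit lanes a[0..3] and b[0..3],
        *
        *   mule: result[i] = (xxh_u64)a[2*i]     * (xxh_u64)b[2*i];     // even lanes
        *   mulo: result[i] = (xxh_u64)a[2*i + 1] * (xxh_u64)b[2*i + 1]; // odd lanes
        *
        * Either way, each call yields two full 64-bit products.
        */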
3490 #if defined(__s390x__)
3491 /* s390x is always big endian, no issue on this platform */
3492 #define XXH_vec_mulo vec_mulo
3493 #define XXH_vec_mule vec_mule
3494 #elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)
3495       /* Clang has a better way to control this: we can just use the builtin,
3496        * which doesn't swap. */
3497 #define XXH_vec_mulo __builtin_altivec_vmulouw
3498 #define XXH_vec_mule __builtin_altivec_vmuleuw
3499 #else
3500 /* gcc needs inline assembly */
3501 /* Adapted from
3502 * https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
3503 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) {
3504
3505 xxh_u64x2 result;
3506 __asm__("vmulouw %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
3507 return result;
3508
3509 }
3510
3511 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) {
3512
3513 xxh_u64x2 result;
3514 __asm__("vmuleuw %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
3515 return result;
3516
3517 }
3518
3519 #endif /* XXH_vec_mulo, XXH_vec_mule */
3520 #endif /* XXH_VECTOR == XXH_VSX */
3521
3522 /* prefetch
3523  * can be disabled by defining the XXH_NO_PREFETCH build macro */
3524 #if defined(XXH_NO_PREFETCH)
3525 #define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
3526 #else
3527 #if defined(_MSC_VER) && \
3528 (defined(_M_X64) || \
3529 defined( \
3530 _M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */
3531 #include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
3532 #define XXH_PREFETCH(ptr) \
3533 _mm_prefetch((const char *)(ptr), _MM_HINT_T0)
3534 #elif defined(__GNUC__) && \
3535 ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)))
3536 #define XXH_PREFETCH(ptr) \
3537 __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
3538 #else
3539 #define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
3540 #endif
3541 #endif /* XXH_NO_PREFETCH */
3542
3543 /* ==========================================
3544 * XXH3 default settings
3545 * ========================================== */
3546
3547 #define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */
3548
3549 #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
3550 #error "default keyset is not large enough"
3551 #endif
3552
3553 /*! Pseudorandom secret taken directly from FARSH. */
3554 XXH_ALIGN(64)
3555 static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
3556
3557 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c,
3558 0xf7, 0x21, 0xad, 0x1c, 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb,
3559 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, 0xcb, 0x79, 0xe6, 0x4e,
3560 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
3561 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6,
3562 0x81, 0x3a, 0x26, 0x4c, 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb,
3563 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, 0x71, 0x64, 0x48, 0x97,
3564 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
3565 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7,
3566 0xc7, 0x0b, 0x4f, 0x1d, 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31,
3567 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, 0xea, 0xc5, 0xac, 0x83,
3568 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
3569 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26,
3570 0x29, 0xd4, 0x68, 0x9e, 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc,
3571 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, 0x45, 0xcb, 0x3a, 0x8f,
3572 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
3573
3574 };
3575
3576 #ifdef XXH_OLD_NAMES
3577 #define kSecret XXH3_kSecret
3578 #endif
3579
3580 #ifdef XXH_DOXYGEN
3581 /*!
3582 * @brief Calculates a 32-bit to 64-bit long multiply.
3583 *
3584 * Implemented as a macro.
3585 *
3586 * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it
3587  * doesn't need to (but it shouldn't need to anyway; a 64x64 multiply is
3588  * only about 7 instructions). Since we know that this will _always_ emit
3589 * `MULL`, we use that instead of the normal method.
3590 *
3591 * If you are compiling for platforms like Thumb-1 and don't have a better
3592 * option, you may also want to write your own long multiply routine here.
3593 *
3594 * @param x, y Numbers to be multiplied
3595 * @return 64-bit product of the low 32 bits of @p x and @p y.
3596 */
3597 XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y) {
3598
3599 return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
3600
3601 }
3602
3603 #elif defined(_MSC_VER) && defined(_M_IX86)
3604 #include <intrin.h>
3605 #define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
3606 #else
3607 /*
3608 * Downcast + upcast is usually better than masking on older compilers
3609 * like GCC 4.2 (especially 32-bit ones), all without affecting newer
3610 * compilers.
3611 *
3612 * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both
3613 * operands and perform a full 64x64 multiply -- entirely redundant on
3614 * 32-bit.
3615 */
3616 #define XXH_mult32to64(x, y) \
3617 ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
3618 #endif
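/*
 * Worked example (illustration only): XXH_mult32to64(0xFFFFFFFF, 2)
 * == 0x1FFFFFFFEULL, i.e. the full 64-bit product of the two low 32-bit
 * halves, with no truncation.
 */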
3619
3620 /*!
3621 * @brief Calculates a 64->128-bit long multiply.
3622 *
3623 * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar
3624 * version.
3625 *
3626 * @param lhs, rhs The 64-bit integers to be multiplied
3627 * @return The 128-bit result represented in an @ref XXH128_hash_t.
3628 */
3629 static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) {
3630
3631 /*
3632 * GCC/Clang __uint128_t method.
3633 *
3634 * On most 64-bit targets, GCC and Clang define a __uint128_t type.
3635  * This is usually the best way, as it typically compiles to a native
3636  * long 64-bit multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
3637 *
3638 * Usually.
3639 *
3640  * However, on some 32-bit platforms, Clang (and Emscripten) define this
3641  * type despite not having the native arithmetic for it. This results in a laggy
3642 * compiler builtin call which calculates a full 128-bit multiply.
3643 * In that case it is best to use the portable one.
3644 * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
3645 */
3646 #if defined(__GNUC__) && !defined(__wasm__) && \
3647 defined(__SIZEOF_INT128__) || \
3648 (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
3649
3650 __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
3651 XXH128_hash_t r128;
3652 r128.low64 = (xxh_u64)(product);
3653 r128.high64 = (xxh_u64)(product >> 64);
3654 return r128;
3655
3656 /*
3657 * MSVC for x64's _umul128 method.
3658 *
3659 * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64
3660 * *HighProduct);
3661 *
3662 * This compiles to single operand MUL on x64.
3663 */
3664 #elif defined(_M_X64) || defined(_M_IA64)
3665
3666 #ifndef _MSC_VER
3667 #pragma intrinsic(_umul128)
3668 #endif
3669 xxh_u64 product_high;
3670 xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
3671 XXH128_hash_t r128;
3672 r128.low64 = product_low;
3673 r128.high64 = product_high;
3674 return r128;
3675
3676 #else
3677 /*
3678 * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
3679 *
3680 * This is a fast and simple grade school multiply, which is shown below
3681 * with base 10 arithmetic instead of base 0x100000000.
3682 *
3683 * 9 3 // D2 lhs = 93
3684 * x 7 5 // D2 rhs = 75
3685 * ----------
3686 * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
3687 * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
3688 * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
3689 * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
3690 * ---------
3691 * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
3692 * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
3693 * ---------
3694 * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
3695 *
3696 * The reasons for adding the products like this are:
3697 * 1. It avoids manual carry tracking. Just like how
3698 * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
3699 * This avoids a lot of complexity.
3700 *
3701 * 2. It hints for, and on Clang, compiles to, the powerful UMAAL
3702 * instruction available in ARM's Digital Signal Processing extension
3703 * in 32-bit ARMv6 and later, which is shown below:
3704 *
3705 * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
3706 * {
3707
3708 * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
3709 * *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
3710 * *RdHi = (xxh_u32)(product >> 32);
3711 * }
3712 *
3713 * This instruction was designed for efficient long multiplication, and
3714 * allows this to be calculated in only 4 instructions at speeds
3715 * comparable to some 64-bit ALUs.
3716 *
3717 * 3. It isn't terrible on other platforms. Usually this will be a couple
3718 * of 32-bit ADD/ADCs.
3719 */
3720
3721 /* First calculate all of the cross products. */
3722 xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
3723 xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
3724 xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
3725 xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);
3726
3727 /* Now add the products together. These will never overflow. */
3728 xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
3729 xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
3730 xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
3731
3732 XXH128_hash_t r128;
3733 r128.low64 = lower;
3734 r128.high64 = upper;
3735 return r128;
3736 #endif
3737
3738 }
3739
3740 /*!
3741 * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it.
3742 *
3743 * The reason for the separate function is to prevent passing too many structs
3744 * around by value. This will hopefully inline the multiply, but we don't force
3745 * it.
3746 *
3747 * @param lhs, rhs The 64-bit integers to multiply
3748 * @return The low 64 bits of the product XOR'd by the high 64 bits.
3749 * @see XXH_mult64to128()
3750 */
3751 static xxh_u64 XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) {
3752
3753 XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
3754 return product.low64 ^ product.high64;
3755
3756 }
3757
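/*
 * Equivalent formulation, as a sketch assuming a compiler with a 128-bit
 * integer type (the function above may or may not compute it this way):
 *
 *   __uint128_t const p = (__uint128_t)lhs * rhs;
 *   xxh_u64 const fold  = (xxh_u64)p ^ (xxh_u64)(p >> 64);
 */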
3758 /*! Seems to produce slightly better code on GCC for some reason. */
3759 XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) {
3760
3761 XXH_ASSERT(0 <= shift && shift < 64);
3762 return v64 ^ (v64 >> shift);
3763
3764 }
3765
3766 /*
3767 * This is a fast avalanche stage,
3768 * suitable when input bits are already partially mixed
3769 */
3770 static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) {
3771
3772 h64 = XXH_xorshift64(h64, 37);
3773 h64 *= 0x165667919E3779F9ULL;
3774 h64 = XXH_xorshift64(h64, 32);
3775 return h64;
3776
3777 }
3778
3779 /*
3780 * This is a stronger avalanche,
3781 * inspired by Pelle Evensen's rrmxmx
3782 * preferable when input has not been previously mixed
3783 */
3784 static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) {
3785
3786 /* this mix is inspired by Pelle Evensen's rrmxmx */
3787 h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);
3788 h64 *= 0x9FB21C651E98DF25ULL;
3789 h64 ^= (h64 >> 35) + len;
3790 h64 *= 0x9FB21C651E98DF25ULL;
3791 return XXH_xorshift64(h64, 28);
3792
3793 }
3794
3795 /* ==========================================
3796 * Short keys
3797 * ==========================================
3798 * One of the shortcomings of XXH32 and XXH64 was that their performance was
3799  * sub-optimal on short lengths. They used an iterative algorithm which strongly
3800 * favored lengths that were a multiple of 4 or 8.
3801 *
3802 * Instead of iterating over individual inputs, we use a set of single shot
3803 * functions which piece together a range of lengths and operate in constant
3804 * time.
3805 *
3806 * Additionally, the number of multiplies has been significantly reduced. This
3807 * reduces latency, especially when emulating 64-bit multiplies on 32-bit.
3808 *
3809 * Depending on the platform, this may or may not be faster than XXH32, but it
3810 * is almost guaranteed to be faster than XXH64.
3811 */
3812
3813 /*
3814 * At very short lengths, there isn't enough input to fully hide secrets, or use
3815 * the entire secret.
3816 *
3817 * There is also only a limited amount of mixing we can do before significantly
3818 * impacting performance.
3819 *
3820 * Therefore, we use different sections of the secret and always mix two secret
3821 * samples with an XOR. This should have no effect on performance on the
3822 * seedless or withSeed variants because everything _should_ be constant folded
3823 * by modern compilers.
3824 *
3825 * The XOR mixing hides individual parts of the secret and increases entropy.
3826 *
3827 * This adds an extra layer of strength for custom secrets.
3828 */
3829 XXH_FORCE_INLINE XXH64_hash_t XXH3_len_1to3_64b(const xxh_u8 *input, size_t len,
3830 const xxh_u8 *secret,
3831 XXH64_hash_t seed) {
3832
3833 XXH_ASSERT(input != NULL);
3834 XXH_ASSERT(1 <= len && len <= 3);
3835 XXH_ASSERT(secret != NULL);
3836 /*
3837 * len = 1: combined = { input[0], 0x01, input[0], input[0] }
3838 * len = 2: combined = { input[1], 0x02, input[0], input[1] }
3839 * len = 3: combined = { input[2], 0x03, input[0], input[1] }
3840 */
3841 {
3842
3843 xxh_u8 const c1 = input[0];
3844 xxh_u8 const c2 = input[len >> 1];
3845 xxh_u8 const c3 = input[len - 1];
3846 xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) |
3847 ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
3848 xxh_u64 const bitflip =
3849 (XXH_readLE32(secret) ^ XXH_readLE32(secret + 4)) + seed;
3850 xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;
3851 return XXH64_avalanche(keyed);
3852
3853 }
3854
3855 }
3856
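/*
 * Worked example of the packing above (illustration): with len == 1 and
 * input[0] == 0x42, c1 == c2 == c3 == 0x42, so
 *   combined == (0x42 << 16) | (0x42 << 24) | 0x42 | (1 << 8) == 0x42420142.
 */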
3857 XXH_FORCE_INLINE XXH64_hash_t XXH3_len_4to8_64b(const xxh_u8 *input, size_t len,
3858 const xxh_u8 *secret,
3859 XXH64_hash_t seed) {
3860
3861 XXH_ASSERT(input != NULL);
3862 XXH_ASSERT(secret != NULL);
3863 XXH_ASSERT(4 <= len && len <= 8);
3864 seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
3865 {
3866
3867 xxh_u32 const input1 = XXH_readLE32(input);
3868 xxh_u32 const input2 = XXH_readLE32(input + len - 4);
3869 xxh_u64 const bitflip =
3870 (XXH_readLE64(secret + 8) ^ XXH_readLE64(secret + 16)) - seed;
3871 xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);
3872 xxh_u64 const keyed = input64 ^ bitflip;
3873 return XXH3_rrmxmx(keyed, len);
3874
3875 }
3876
3877 }
3878
3879 XXH_FORCE_INLINE XXH64_hash_t XXH3_len_9to16_64b(const xxh_u8 *input,
3880 size_t len,
3881 const xxh_u8 *secret,
3882 XXH64_hash_t seed) {
3883
3884 XXH_ASSERT(input != NULL);
3885 XXH_ASSERT(secret != NULL);
3886 XXH_ASSERT(9 <= len && len <= 16);
3887 {
3888
3889 xxh_u64 const bitflip1 =
3890 (XXH_readLE64(secret + 24) ^ XXH_readLE64(secret + 32)) + seed;
3891 xxh_u64 const bitflip2 =
3892 (XXH_readLE64(secret + 40) ^ XXH_readLE64(secret + 48)) - seed;
3893 xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1;
3894 xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;
3895 xxh_u64 const acc = len + XXH_swap64(input_lo) + input_hi +
3896 XXH3_mul128_fold64(input_lo, input_hi);
3897 return XXH3_avalanche(acc);
3898
3899 }
3900
3901 }
3902
3903 XXH_FORCE_INLINE XXH64_hash_t XXH3_len_0to16_64b(const xxh_u8 *input,
3904 size_t len,
3905 const xxh_u8 *secret,
3906 XXH64_hash_t seed) {
3907
3908 XXH_ASSERT(len <= 16);
3909 {
3910
3911 if (XXH_likely(len > 8))
3912 return XXH3_len_9to16_64b(input, len, secret, seed);
3913 if (XXH_likely(len >= 4))
3914 return XXH3_len_4to8_64b(input, len, secret, seed);
3915 if (len) return XXH3_len_1to3_64b(input, len, secret, seed);
3916 return XXH64_avalanche(
3917 seed ^ (XXH_readLE64(secret + 56) ^ XXH_readLE64(secret + 64)));
3918
3919 }
3920
3921 }
3922
3923 /*
3924 * DISCLAIMER: There are known *seed-dependent* multicollisions here due to
3925 * multiplication by zero, affecting hashes of lengths 17 to 240.
3926 *
3927 * However, they are very unlikely.
3928 *
3929 * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
3930 * unseeded non-cryptographic hashes, it does not attempt to defend itself
3931 * against specially crafted inputs, only random inputs.
3932 *
3933 * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
3934 * cancelling out the secret is taken an arbitrary number of times (addressed
3935 * in XXH3_accumulate_512), this collision is very unlikely with random inputs
3936 * and/or proper seeding:
3937 *
3938 * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
3939 * function that is only called up to 16 times per hash with up to 240 bytes of
3940 * input.
3941 *
3942 * This is not too bad for a non-cryptographic hash function, especially with
3943 * only 64 bit outputs.
3944 *
3945 * The 128-bit variant (which trades some speed for strength) is NOT affected
3946 * by this, although it is always a good idea to use a proper seed if you care
3947 * about strength.
3948 */
3949 XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8 *XXH_RESTRICT input,
3950 const xxh_u8 *XXH_RESTRICT secret,
3951 xxh_u64 seed64) {
3952
3953 #if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
3954 && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \
3955 && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like \
3956 XXH32 hack */
3957 /*
3958 * UGLY HACK:
3959 * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
3960 * slower code.
3961 *
3962 * By forcing seed64 into a register, we disrupt the cost model and
3963 * cause it to scalarize. See `XXH32_round()`
3964 *
3965 * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
3966 * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
3967 * GCC 9.2, despite both emitting scalar code.
3968 *
3969 * GCC generates much better scalar code than Clang for the rest of XXH3,
3970 * which is why finding a more optimal codepath is an interest.
3971 */
3972 XXH_COMPILER_GUARD(seed64);
3973 #endif
3974 {
3975
3976 xxh_u64 const input_lo = XXH_readLE64(input);
3977 xxh_u64 const input_hi = XXH_readLE64(input + 8);
3978 return XXH3_mul128_fold64(input_lo ^ (XXH_readLE64(secret) + seed64),
3979 input_hi ^ (XXH_readLE64(secret + 8) - seed64));
3980
3981 }
3982
3983 }
3984
3985 /* For mid range keys, XXH3 uses a Mum-hash variant. */
3986 XXH_FORCE_INLINE XXH64_hash_t XXH3_len_17to128_64b(
3987 const xxh_u8 *XXH_RESTRICT input, size_t len,
3988 const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) {
3989
3990 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
3991 (void)secretSize;
3992 XXH_ASSERT(16 < len && len <= 128);
3993
3994 {
3995
3996 xxh_u64 acc = len * XXH_PRIME64_1;
3997 if (len > 32) {
3998
3999 if (len > 64) {
4000
4001 if (len > 96) {
4002
4003 acc += XXH3_mix16B(input + 48, secret + 96, seed);
4004 acc += XXH3_mix16B(input + len - 64, secret + 112, seed);
4005
4006 }
4007
4008 acc += XXH3_mix16B(input + 32, secret + 64, seed);
4009 acc += XXH3_mix16B(input + len - 48, secret + 80, seed);
4010
4011 }
4012
4013 acc += XXH3_mix16B(input + 16, secret + 32, seed);
4014 acc += XXH3_mix16B(input + len - 32, secret + 48, seed);
4015
4016 }
4017
4018 acc += XXH3_mix16B(input + 0, secret + 0, seed);
4019 acc += XXH3_mix16B(input + len - 16, secret + 16, seed);
4020
4021 return XXH3_avalanche(acc);
4022
4023 }
4024
4025 }
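/*
 * For reference, the nesting above means the number of XXH3_mix16B() calls
 * scales with the input length: 17-32 bytes use 2 calls, 33-64 use 4,
 * 65-96 use 6, and 97-128 use 8.
 */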
4026
4027 #define XXH3_MIDSIZE_MAX 240
4028
4029 XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b(
4030 const xxh_u8 *XXH_RESTRICT input, size_t len,
4031 const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) {
4032
4033 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
4034 (void)secretSize;
4035 XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
4036
4037 #define XXH3_MIDSIZE_STARTOFFSET 3
4038 #define XXH3_MIDSIZE_LASTOFFSET 17
4039
4040 {
4041
4042 xxh_u64 acc = len * XXH_PRIME64_1;
4043 int const nbRounds = (int)len / 16;
4044 int i;
4045 for (i = 0; i < 8; i++) {
4046
4047 acc += XXH3_mix16B(input + (16 * i), secret + (16 * i), seed);
4048
4049 }
4050
4051 acc = XXH3_avalanche(acc);
4052 XXH_ASSERT(nbRounds >= 8);
4053 #if defined(__clang__) /* Clang */ \
4054 && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
4055 && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */
4056 /*
4057 * UGLY HACK:
4058  * Clang for ARMv7-A tries to vectorize this loop, similar to GCC on x86.
4059  * Everywhere else, it uses scalar code.
4060 *
4061 * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
4062 * would still be slower than UMAAL (see XXH_mult64to128).
4063 *
4064 * Unfortunately, Clang doesn't handle the long multiplies properly and
4065 * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
4066 * scalarized into an ugly mess of VMOV.32 instructions.
4067 *
4068 * This mess is difficult to avoid without turning autovectorization
4069  * off completely, but these issues are usually relatively minor and/or
4070  * not worth fixing.
4071 *
4072 * This loop is the easiest to fix, as unlike XXH32, this pragma
4073 * _actually works_ because it is a loop vectorization instead of an
4074 * SLP vectorization.
4075 */
4076 #pragma clang loop vectorize(disable)
4077 #endif
4078 for (i = 8; i < nbRounds; i++) {
4079
4080 acc +=
4081 XXH3_mix16B(input + (16 * i),
4082 secret + (16 * (i - 8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
4083
4084 }
4085
4086 /* last bytes */
4087 acc += XXH3_mix16B(input + len - 16,
4088 secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET,
4089 seed);
4090 return XXH3_avalanche(acc);
4091
4092 }
4093
4094 }
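/*
 * For reference: with 128 < len <= 240, nbRounds == len / 16 ranges from 8
 * to 15, so the second loop above adds between 0 and 7 extra XXH3_mix16B()
 * rounds on top of the first 8.
 */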
4095
4096 /* ======= Long Keys ======= */
4097
4098 #define XXH_STRIPE_LEN 64
4099 #define XXH_SECRET_CONSUME_RATE \
4100 8 /* nb of secret bytes consumed at each accumulation */
4101 #define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))
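/* With XXH_STRIPE_LEN == 64 and 8-byte (xxh_u64) lanes, XXH_ACC_NB evaluates
 * to 8: the accumulator is eight 64-bit lanes, i.e. one 512-bit state. */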
4102
4103 #ifdef XXH_OLD_NAMES
4104 #define STRIPE_LEN XXH_STRIPE_LEN
4105 #define ACC_NB XXH_ACC_NB
4106 #endif
4107
4108 XXH_FORCE_INLINE void XXH_writeLE64(void *dst, xxh_u64 v64) {
4109
4110 if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
4111 memcpy(dst, &v64, sizeof(v64));
4112
4113 }
4114
4115 /* Several intrinsic functions below are supposed to accept __int64 as
4116 * argument, as documented in
4117 * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .
4118 * However, several environments do not define __int64 type,
4119 * requiring a workaround.
4120 */
4121 #if !defined(__VMS) && \
4122 (defined(__cplusplus) || (defined(__STDC_VERSION__) && \
4123 (__STDC_VERSION__ >= 199901L) /* C99 */))
4124 typedef int64_t xxh_i64;
4125 #else
4126 /* the following type must have a width of 64-bit */
4127 typedef long long xxh_i64;
4128 #endif
4129
4130 /*
4131 * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the
4132 * most optimized.
4133 *
4134 * It is a hardened version of UMAC, based off of FARSH's implementation.
4135 *
4136 * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
4137 * implementations, and it is ridiculously fast.
4138 *
4139 * We harden it by mixing the original input to the accumulators as well as
4140 * the product.
4141 *
4142 * This means that in the (relatively likely) case of a multiply by zero,
4143 * the original input is preserved.
4144 *
4145 * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
4146 * cross-pollination, as otherwise the upper and lower halves would be
4147 * essentially independent.
4148 *
4149 * This doesn't matter on 64-bit hashes since they all get merged together
4150 * in the end, so we skip the extra step.
4151 *
4152 * Both XXH3_64bits and XXH3_128bits use this subroutine.
4153 */
4154
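/*
 * A scalar sketch of what one 64-bit lane of the accumulate step computes.
 * It mirrors XXH3_accumulate_512_scalar further below; the SIMD variants
 * compute the same thing lane by lane:
 *
 *   data_key    = data ^ secret;
 *   acc[i ^ 1] += data;                                        // keep raw input
 *   acc[i]     += (data_key & 0xFFFFFFFF) * (data_key >> 32);  // 32x32->64
 */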
4155 #if (XXH_VECTOR == XXH_AVX512) || \
4156 (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0)
4157
4158 #ifndef XXH_TARGET_AVX512
4159 #define XXH_TARGET_AVX512 /* disable attribute target */
4160 #endif
4161
4162 XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_accumulate_512_avx512(
4163 void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
4164 const void *XXH_RESTRICT secret) {
4165
4166 __m512i *const xacc = (__m512i *)acc;
4167 XXH_ASSERT((((size_t)acc) & 63) == 0);
4168 XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
4169
4170 {
4171
4172 /* data_vec = input[0]; */
4173 __m512i const data_vec = _mm512_loadu_si512(input);
4174 /* key_vec = secret[0]; */
4175 __m512i const key_vec = _mm512_loadu_si512(secret);
4176 /* data_key = data_vec ^ key_vec; */
4177 __m512i const data_key = _mm512_xor_si512(data_vec, key_vec);
4178 /* data_key_lo = data_key >> 32; */
4179 __m512i const data_key_lo =
4180 _mm512_shuffle_epi32(data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
4181 /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
4182 __m512i const product = _mm512_mul_epu32(data_key, data_key_lo);
4183 /* xacc[0] += swap(data_vec); */
4184 __m512i const data_swap =
4185 _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2));
4186 __m512i const sum = _mm512_add_epi64(*xacc, data_swap);
4187 /* xacc[0] += product; */
4188 *xacc = _mm512_add_epi64(product, sum);
4189
4190 }
4191
4192 }
4193
4194 /*
4195 * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
4196 *
4197 * Multiplication isn't perfect, as explained by Google in HighwayHash:
4198 *
4199 * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
4200 * // varying degrees. In descending order of goodness, bytes
4201 * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
4202 * // As expected, the upper and lower bytes are much worse.
4203 *
4204 * Source:
4205 * https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291
4206 *
4207 * Since our algorithm uses a pseudorandom secret to add some variance into the
4208 * mix, we don't need to (or want to) mix as often or as much as HighwayHash
4209 * does.
4210 *
4211 * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid
4212 * extraction.
4213 *
4214 * Both XXH3_64bits and XXH3_128bits use this subroutine.
4215 */
4216
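/*
 * A scalar sketch of the scramble step, per 64-bit lane (it mirrors
 * XXH3_scrambleAcc_scalar further below):
 *
 *   acc ^= acc >> 47;
 *   acc ^= secret;
 *   acc *= XXH_PRIME32_1;
 */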
4217 XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_scrambleAcc_avx512(
4218 void *XXH_RESTRICT acc, const void *XXH_RESTRICT secret) {
4219
4220 XXH_ASSERT((((size_t)acc) & 63) == 0);
4221 XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
4222 {
4223
4224 __m512i *const xacc = (__m512i *)acc;
4225 const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
4226
4227 /* xacc[0] ^= (xacc[0] >> 47) */
4228 __m512i const acc_vec = *xacc;
4229 __m512i const shifted = _mm512_srli_epi64(acc_vec, 47);
4230 __m512i const data_vec = _mm512_xor_si512(acc_vec, shifted);
4231 /* xacc[0] ^= secret; */
4232 __m512i const key_vec = _mm512_loadu_si512(secret);
4233 __m512i const data_key = _mm512_xor_si512(data_vec, key_vec);
4234
4235 /* xacc[0] *= XXH_PRIME32_1; */
4236 __m512i const data_key_hi =
4237 _mm512_shuffle_epi32(data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
4238 __m512i const prod_lo = _mm512_mul_epu32(data_key, prime32);
4239 __m512i const prod_hi = _mm512_mul_epu32(data_key_hi, prime32);
4240 *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
4241
4242 }
4243
4244 }
4245
4246 XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_initCustomSecret_avx512(
4247 void *XXH_RESTRICT customSecret, xxh_u64 seed64) {
4248
4249 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);
4250 XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);
4251 XXH_ASSERT(((size_t)customSecret & 63) == 0);
4252 (void)(&XXH_writeLE64);
4253 {
4254
4255 int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
4256 __m512i const seed = _mm512_mask_set1_epi64(
4257 _mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64));
4258
4259 const __m512i *const src = (const __m512i *)((const void *)XXH3_kSecret);
4260 __m512i *const dest = (__m512i *)customSecret;
4261 int i;
4262 XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */
4263 XXH_ASSERT(((size_t)dest & 63) == 0);
4264 for (i = 0; i < nbRounds; ++i) {
4265
4266       /* GCC has a bug: _mm512_stream_load_si512 accepts 'void*', not 'void
4267        * const*', so this will warn "discards 'const' qualifier". */
4268 union {
4269
4270 const __m512i *cp;
4271 void *p;
4272
4273 } remote_const_void;
4274
4275 remote_const_void.cp = src + i;
4276 dest[i] =
4277 _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed);
4278
4279 }
4280
4281 }
4282
4283 }
4284
4285 #endif
4286
4287 #if (XXH_VECTOR == XXH_AVX2) || \
4288 (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0)
4289
4290 #ifndef XXH_TARGET_AVX2
4291 #define XXH_TARGET_AVX2 /* disable attribute target */
4292 #endif
4293
4294 XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_accumulate_512_avx2(
4295 void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
4296 const void *XXH_RESTRICT secret) {
4297
4298 XXH_ASSERT((((size_t)acc) & 31) == 0);
4299 {
4300
4301 __m256i *const xacc = (__m256i *)acc;
4302 /* Unaligned. This is mainly for pointer arithmetic, and because
4303 * _mm256_loadu_si256 requires a const __m256i * pointer for some reason.
4304 */
4305 const __m256i *const xinput = (const __m256i *)input;
4306 /* Unaligned. This is mainly for pointer arithmetic, and because
4307 * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
4308 const __m256i *const xsecret = (const __m256i *)secret;
4309
4310 size_t i;
4311 for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) {
4312
4313 /* data_vec = xinput[i]; */
4314 __m256i const data_vec = _mm256_loadu_si256(xinput + i);
4315 /* key_vec = xsecret[i]; */
4316 __m256i const key_vec = _mm256_loadu_si256(xsecret + i);
4317 /* data_key = data_vec ^ key_vec; */
4318 __m256i const data_key = _mm256_xor_si256(data_vec, key_vec);
4319 /* data_key_lo = data_key >> 32; */
4320 __m256i const data_key_lo =
4321 _mm256_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
4322 /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
4323 __m256i const product = _mm256_mul_epu32(data_key, data_key_lo);
4324 /* xacc[i] += swap(data_vec); */
4325 __m256i const data_swap =
4326 _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
4327 __m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
4328 /* xacc[i] += product; */
4329 xacc[i] = _mm256_add_epi64(product, sum);
4330
4331 }
4332
4333 }
4334
4335 }
4336
4337 XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_scrambleAcc_avx2(
4338 void *XXH_RESTRICT acc, const void *XXH_RESTRICT secret) {
4339
4340 XXH_ASSERT((((size_t)acc) & 31) == 0);
4341 {
4342
4343 __m256i *const xacc = (__m256i *)acc;
4344 /* Unaligned. This is mainly for pointer arithmetic, and because
4345 * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
4346 const __m256i *const xsecret = (const __m256i *)secret;
4347 const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);
4348
4349 size_t i;
4350 for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) {
4351
4352 /* xacc[i] ^= (xacc[i] >> 47) */
4353 __m256i const acc_vec = xacc[i];
4354 __m256i const shifted = _mm256_srli_epi64(acc_vec, 47);
4355 __m256i const data_vec = _mm256_xor_si256(acc_vec, shifted);
4356 /* xacc[i] ^= xsecret; */
4357 __m256i const key_vec = _mm256_loadu_si256(xsecret + i);
4358 __m256i const data_key = _mm256_xor_si256(data_vec, key_vec);
4359
4360 /* xacc[i] *= XXH_PRIME32_1; */
4361 __m256i const data_key_hi =
4362 _mm256_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
4363 __m256i const prod_lo = _mm256_mul_epu32(data_key, prime32);
4364 __m256i const prod_hi = _mm256_mul_epu32(data_key_hi, prime32);
4365 xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
4366
4367 }
4368
4369 }
4370
4371 }
4372
4373 XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(
4374 void *XXH_RESTRICT customSecret, xxh_u64 seed64) {
4375
4376 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);
4377 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);
4378 XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);
4379 (void)(&XXH_writeLE64);
4380 XXH_PREFETCH(customSecret);
4381 {
4382
4383 __m256i const seed =
4384 _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64,
4385 (xxh_i64)(0U - seed64), (xxh_i64)seed64);
4386
4387 const __m256i *const src = (const __m256i *)((const void *)XXH3_kSecret);
4388 __m256i *dest = (__m256i *)customSecret;
4389
4390 #if defined(__GNUC__) || defined(__clang__)
4391 /*
4392  * On GCC & Clang, marking 'dest' as modified causes the compiler to:
4393  * - not extract the secret from sse registers in the internal loop
4394  * - use fewer common registers, and avoid pushing these regs onto the stack
4395 */
4396 XXH_COMPILER_GUARD(dest);
4397 #endif
4398 XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */
4399 XXH_ASSERT(((size_t)dest & 31) == 0);
4400
4401    /* GCC -O2 needs the loop unrolled manually */
4402 dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src + 0), seed);
4403 dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src + 1), seed);
4404 dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src + 2), seed);
4405 dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src + 3), seed);
4406 dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src + 4), seed);
4407 dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src + 5), seed);
4408
4409 }
4410
4411 }
4412
4413 #endif
4414
4415 /* x86dispatch always generates SSE2 */
4416 #if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)
4417
4418 #ifndef XXH_TARGET_SSE2
4419 #define XXH_TARGET_SSE2 /* disable attribute target */
4420 #endif
4421
4422 XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_accumulate_512_sse2(
4423 void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
4424 const void *XXH_RESTRICT secret) {
4425
4426 /* SSE2 is just a half-scale version of the AVX2 version. */
4427 XXH_ASSERT((((size_t)acc) & 15) == 0);
4428 {
4429
4430 __m128i *const xacc = (__m128i *)acc;
4431 /* Unaligned. This is mainly for pointer arithmetic, and because
4432 * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
4433 const __m128i *const xinput = (const __m128i *)input;
4434 /* Unaligned. This is mainly for pointer arithmetic, and because
4435 * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
4436 const __m128i *const xsecret = (const __m128i *)secret;
4437
4438 size_t i;
4439 for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {
4440
4441 /* data_vec = xinput[i]; */
4442 __m128i const data_vec = _mm_loadu_si128(xinput + i);
4443 /* key_vec = xsecret[i]; */
4444 __m128i const key_vec = _mm_loadu_si128(xsecret + i);
4445 /* data_key = data_vec ^ key_vec; */
4446 __m128i const data_key = _mm_xor_si128(data_vec, key_vec);
4447 /* data_key_lo = data_key >> 32; */
4448 __m128i const data_key_lo =
4449 _mm_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
4450 /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
4451 __m128i const product = _mm_mul_epu32(data_key, data_key_lo);
4452 /* xacc[i] += swap(data_vec); */
4453 __m128i const data_swap =
4454 _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
4455 __m128i const sum = _mm_add_epi64(xacc[i], data_swap);
4456 /* xacc[i] += product; */
4457 xacc[i] = _mm_add_epi64(product, sum);
4458
4459 }
4460
4461 }
4462
4463 }
4464
4465 XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_scrambleAcc_sse2(
4466 void *XXH_RESTRICT acc, const void *XXH_RESTRICT secret) {
4467
4468 XXH_ASSERT((((size_t)acc) & 15) == 0);
4469 {
4470
4471 __m128i *const xacc = (__m128i *)acc;
4472 /* Unaligned. This is mainly for pointer arithmetic, and because
4473 * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
4474 const __m128i *const xsecret = (const __m128i *)secret;
4475 const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);
4476
4477 size_t i;
4478 for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {
4479
4480 /* xacc[i] ^= (xacc[i] >> 47) */
4481 __m128i const acc_vec = xacc[i];
4482 __m128i const shifted = _mm_srli_epi64(acc_vec, 47);
4483 __m128i const data_vec = _mm_xor_si128(acc_vec, shifted);
4484 /* xacc[i] ^= xsecret[i]; */
4485 __m128i const key_vec = _mm_loadu_si128(xsecret + i);
4486 __m128i const data_key = _mm_xor_si128(data_vec, key_vec);
4487
4488 /* xacc[i] *= XXH_PRIME32_1; */
4489 __m128i const data_key_hi =
4490 _mm_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
4491 __m128i const prod_lo = _mm_mul_epu32(data_key, prime32);
4492 __m128i const prod_hi = _mm_mul_epu32(data_key_hi, prime32);
4493 xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
4494
4495 }
4496
4497 }
4498
4499 }
4500
4501 XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(
4502 void *XXH_RESTRICT customSecret, xxh_u64 seed64) {
4503
4504 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
4505 (void)(&XXH_writeLE64);
4506 {
4507
4508 int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);
4509
4510 #if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
4511 /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */
4512 XXH_ALIGN(16)
4513 const xxh_i64 seed64x2[2] = {(xxh_i64)seed64, (xxh_i64)(0U - seed64)};
4514 __m128i const seed = _mm_load_si128((__m128i const *)seed64x2);
4515 #else
4516 __m128i const seed =
4517 _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64);
4518 #endif
4519 int i;
4520
4521 const void *const src16 = XXH3_kSecret;
4522 __m128i *dst16 = (__m128i *)customSecret;
4523 #if defined(__GNUC__) || defined(__clang__)
4524 /*
4525  * On GCC & Clang, marking 'dst16' as modified causes the compiler to:
4526  * - not extract the secret from sse registers in the internal loop
4527  * - use fewer common registers, and avoid pushing these regs onto the stack
4528 */
4529 XXH_COMPILER_GUARD(dst16);
4530 #endif
4531 XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */
4532 XXH_ASSERT(((size_t)dst16 & 15) == 0);
4533
4534 for (i = 0; i < nbRounds; ++i) {
4535
4536 dst16[i] =
4537 _mm_add_epi64(_mm_load_si128((const __m128i *)src16 + i), seed);
4538
4539 }
4540
4541 }
4542
4543 }
4544
4545 #endif
4546
4547 #if (XXH_VECTOR == XXH_NEON)
4548
4549 XXH_FORCE_INLINE void XXH3_accumulate_512_neon(
4550 void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
4551 const void *XXH_RESTRICT secret) {
4552
4553 XXH_ASSERT((((size_t)acc) & 15) == 0);
4554 {
4555
4556 uint64x2_t *const xacc = (uint64x2_t *)acc;
4557 /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7.
4558 */
4559 uint8_t const *const xinput = (const uint8_t *)input;
4560 uint8_t const *const xsecret = (const uint8_t *)secret;
4561
4562 size_t i;
4563 for (i = 0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) {
4564
4565 /* data_vec = xinput[i]; */
4566 uint8x16_t data_vec = vld1q_u8(xinput + (i * 16));
4567 /* key_vec = xsecret[i]; */
4568 uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
4569 uint64x2_t data_key;
4570 uint32x2_t data_key_lo, data_key_hi;
4571 /* xacc[i] += swap(data_vec); */
4572 uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
4573 uint64x2_t const swapped = vextq_u64(data64, data64, 1);
4574 xacc[i] = vaddq_u64(xacc[i], swapped);
4575 /* data_key = data_vec ^ key_vec; */
4576 data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
4577 /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
4578 * data_key_hi = (uint32x2_t) (data_key >> 32);
4579 * data_key = UNDEFINED; */
4580 XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
4581 /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
4582 xacc[i] = vmlal_u32(xacc[i], data_key_lo, data_key_hi);
4583
4584 }
4585
4586 }
4587
4588 }
4589
4590 XXH_FORCE_INLINE void XXH3_scrambleAcc_neon(void *XXH_RESTRICT acc,
4591 const void *XXH_RESTRICT secret) {
4592
4593 XXH_ASSERT((((size_t)acc) & 15) == 0);
4594
4595 {
4596
4597 uint64x2_t *xacc = (uint64x2_t *)acc;
4598 uint8_t const *xsecret = (uint8_t const *)secret;
4599 uint32x2_t prime = vdup_n_u32(XXH_PRIME32_1);
4600
4601 size_t i;
4602 for (i = 0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) {
4603
4604 /* xacc[i] ^= (xacc[i] >> 47); */
4605 uint64x2_t acc_vec = xacc[i];
4606 uint64x2_t shifted = vshrq_n_u64(acc_vec, 47);
4607 uint64x2_t data_vec = veorq_u64(acc_vec, shifted);
4608
4609 /* xacc[i] ^= xsecret[i]; */
4610 uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
4611 uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec));
4612
4613 /* xacc[i] *= XXH_PRIME32_1 */
4614 uint32x2_t data_key_lo, data_key_hi;
4615 /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF);
4616 * data_key_hi = (uint32x2_t) (xacc[i] >> 32);
4617 * xacc[i] = UNDEFINED; */
4618 XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
4619 { /*
4620 * prod_hi = (data_key >> 32) * XXH_PRIME32_1;
4621 *
4622 * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will
4623 * incorrectly "optimize" this:
4624 * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b));
4625 * shifted = vshll_n_u32(tmp, 32);
4626 * to this:
4627 * tmp = "vmulq_u64"(a, b); // no such thing!
4628 * shifted = vshlq_n_u64(tmp, 32);
4629 *
4630 * However, unlike SSE, Clang lacks a 64-bit multiply routine
4631 * for NEON, and it scalarizes two 64-bit multiplies instead.
4632 *
4633 * vmull_u32 has the same timing as vmul_u32, and it avoids
4634 * this bug completely.
4635 * See https://bugs.llvm.org/show_bug.cgi?id=39967
4636 */
4637 uint64x2_t prod_hi = vmull_u32(data_key_hi, prime);
4638 /* xacc[i] = prod_hi << 32; */
4639 xacc[i] = vshlq_n_u64(prod_hi, 32);
4640 /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */
4641 xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime);
4642
4643 }
4644
4645 }
4646
4647 }
4648
4649 }
4650
4651 #endif
4652
4653 #if (XXH_VECTOR == XXH_VSX)
4654
4655 XXH_FORCE_INLINE void XXH3_accumulate_512_vsx(void *XXH_RESTRICT acc,
4656 const void *XXH_RESTRICT input,
4657 const void *XXH_RESTRICT secret) {
4658
4659 xxh_u64x2 *const xacc = (xxh_u64x2 *)acc; /* presumed aligned */
4660 xxh_u64x2 const *const xinput =
4661 (xxh_u64x2 const *)input; /* no alignment restriction */
4662 xxh_u64x2 const *const xsecret =
4663 (xxh_u64x2 const *)secret; /* no alignment restriction */
4664 xxh_u64x2 const v32 = {32, 32};
4665 size_t i;
4666 for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
4667
4668 /* data_vec = xinput[i]; */
4669 xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);
4670 /* key_vec = xsecret[i]; */
4671 xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
4672 xxh_u64x2 const data_key = data_vec ^ key_vec;
4673 /* shuffled = (data_key << 32) | (data_key >> 32); */
4674 xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
4675 /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled &
4676 * 0xFFFFFFFF); */
4677 xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
4678 xacc[i] += product;
4679
4680 /* swap high and low halves */
4681 #ifdef __s390x__
4682 xacc[i] += vec_permi(data_vec, data_vec, 2);
4683 #else
4684 xacc[i] += vec_xxpermdi(data_vec, data_vec, 2);
4685 #endif
4686
4687 }
4688
4689 }
4690
4691 XXH_FORCE_INLINE void XXH3_scrambleAcc_vsx(void *XXH_RESTRICT acc,
4692 const void *XXH_RESTRICT secret) {
4693
4694 XXH_ASSERT((((size_t)acc) & 15) == 0);
4695
4696 {
4697
4698 xxh_u64x2 *const xacc = (xxh_u64x2 *)acc;
4699 const xxh_u64x2 *const xsecret = (const xxh_u64x2 *)secret;
4700 /* constants */
4701 xxh_u64x2 const v32 = {32, 32};
4702 xxh_u64x2 const v47 = {47, 47};
4703 xxh_u32x4 const prime = {XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1,
4704 XXH_PRIME32_1};
4705 size_t i;
4706 for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
4707
4708 /* xacc[i] ^= (xacc[i] >> 47); */
4709 xxh_u64x2 const acc_vec = xacc[i];
4710 xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
4711
4712 /* xacc[i] ^= xsecret[i]; */
4713 xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
4714 xxh_u64x2 const data_key = data_vec ^ key_vec;
4715
4716 /* xacc[i] *= XXH_PRIME32_1 */
4717 /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime &
4718 * 0xFFFFFFFF); */
4719 xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime);
4720 /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */
4721 xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime);
4722 xacc[i] = prod_odd + (prod_even << v32);
4723
4724 }
4725
4726 }
4727
4728 }
4729
4730 #endif
4731
4732 /* scalar variants - universal */
4733
4734 XXH_FORCE_INLINE void XXH3_accumulate_512_scalar(
4735 void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
4736 const void *XXH_RESTRICT secret) {
4737
4738 xxh_u64 *const xacc = (xxh_u64 *)acc; /* presumed aligned */
4739 const xxh_u8 *const xinput =
4740 (const xxh_u8 *)input; /* no alignment restriction */
4741 const xxh_u8 *const xsecret =
4742 (const xxh_u8 *)secret; /* no alignment restriction */
4743 size_t i;
4744 XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN - 1)) == 0);
4745 for (i = 0; i < XXH_ACC_NB; i++) {
4746
4747 xxh_u64 const data_val = XXH_readLE64(xinput + 8 * i);
4748 xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i * 8);
4749 xacc[i ^ 1] += data_val; /* swap adjacent lanes */
4750 xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
4751
4752 }
4753
4754 }
4755
4756 XXH_FORCE_INLINE void XXH3_scrambleAcc_scalar(void *XXH_RESTRICT acc,
4757 const void *XXH_RESTRICT secret) {
4758
4759 xxh_u64 *const xacc = (xxh_u64 *)acc; /* presumed aligned */
4760 const xxh_u8 *const xsecret =
4761 (const xxh_u8 *)secret; /* no alignment restriction */
4762 size_t i;
4763 XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN - 1)) == 0);
4764 for (i = 0; i < XXH_ACC_NB; i++) {
4765
4766 xxh_u64 const key64 = XXH_readLE64(xsecret + 8 * i);
4767 xxh_u64 acc64 = xacc[i];
4768 acc64 = XXH_xorshift64(acc64, 47);
4769 acc64 ^= key64;
4770 acc64 *= XXH_PRIME32_1;
4771 xacc[i] = acc64;
4772
4773 }
4774
4775 }
4776
4777 XXH_FORCE_INLINE void XXH3_initCustomSecret_scalar(
4778 void *XXH_RESTRICT customSecret, xxh_u64 seed64) {
4779
4780 /*
4781 * We need a separate pointer for the hack below,
4782 * which requires a non-const pointer.
4783 * Any decent compiler will optimize this out otherwise.
4784 */
4785 const xxh_u8 *kSecretPtr = XXH3_kSecret;
4786 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
4787
4788 #if defined(__clang__) && defined(__aarch64__)
4789 /*
4790 * UGLY HACK:
4791 * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are
4792 * placed sequentially, in order, at the top of the unrolled loop.
4793 *
4794 * While MOVK is great for generating constants (2 cycles for a 64-bit
4795 * constant compared to 4 cycles for LDR), long MOVK chains stall the
4796 * integer pipelines:
4797 * I L S
4798 * MOVK
4799 * MOVK
4800 * MOVK
4801 * MOVK
4802 * ADD
4803 * SUB STR
4804 * STR
4805 * By forcing loads from memory (as the asm line causes Clang to assume
4806  * that kSecretPtr has been changed), the pipelines are used more
4807 * efficiently:
4808 * I L S
4809 * LDR
4810 * ADD LDR
4811 * SUB STR
4812 * STR
4813 * XXH3_64bits_withSeed, len == 256, Snapdragon 835
4814 * without hack: 2654.4 MB/s
4815 * with hack: 3202.9 MB/s
4816 */
4817 XXH_COMPILER_GUARD(kSecretPtr);
4818 #endif
4819 /*
4820 * Note: in debug mode, this overrides the asm optimization
4821 * and Clang will emit MOVK chains again.
4822 */
4823 XXH_ASSERT(kSecretPtr == XXH3_kSecret);
4824
4825 {
4826
4827 int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
4828 int i;
4829 for (i = 0; i < nbRounds; i++) {
4830
4831 /*
4832 * The asm hack causes Clang to assume that kSecretPtr aliases with
4833 * customSecret, and on aarch64, this prevented LDP from merging two
4834 * loads together for free. Putting the loads together before the stores
4835 * properly generates LDP.
4836 */
4837 xxh_u64 lo = XXH_readLE64(kSecretPtr + 16 * i) + seed64;
4838 xxh_u64 hi = XXH_readLE64(kSecretPtr + 16 * i + 8) - seed64;
4839 XXH_writeLE64((xxh_u8 *)customSecret + 16 * i, lo);
4840 XXH_writeLE64((xxh_u8 *)customSecret + 16 * i + 8, hi);
4841
4842 }
4843
4844 }
4845
4846 }
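/*
 * For reference: with the default 192-byte secret, nbRounds == 192 / 16 == 12,
 * and each round rewrites one 16-byte pair as
 * { readLE64(kSecretPtr + 16*i) + seed64, readLE64(kSecretPtr + 16*i + 8) - seed64 }.
 */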
4847
4848 typedef void (*XXH3_f_accumulate_512)(void *XXH_RESTRICT, const void *,
4849 const void *);
4850 typedef void (*XXH3_f_scrambleAcc)(void *XXH_RESTRICT, const void *);
4851 typedef void (*XXH3_f_initCustomSecret)(void *XXH_RESTRICT, xxh_u64);
4852
4853 #if (XXH_VECTOR == XXH_AVX512)
4854
4855 #define XXH3_accumulate_512 XXH3_accumulate_512_avx512
4856 #define XXH3_scrambleAcc XXH3_scrambleAcc_avx512
4857 #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
4858
4859 #elif (XXH_VECTOR == XXH_AVX2)
4860
4861 #define XXH3_accumulate_512 XXH3_accumulate_512_avx2
4862 #define XXH3_scrambleAcc XXH3_scrambleAcc_avx2
4863 #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
4864
4865 #elif (XXH_VECTOR == XXH_SSE2)
4866
4867 #define XXH3_accumulate_512 XXH3_accumulate_512_sse2
4868 #define XXH3_scrambleAcc XXH3_scrambleAcc_sse2
4869 #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
4870
4871 #elif (XXH_VECTOR == XXH_NEON)
4872
4873 #define XXH3_accumulate_512 XXH3_accumulate_512_neon
4874 #define XXH3_scrambleAcc XXH3_scrambleAcc_neon
4875 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
4876
4877 #elif (XXH_VECTOR == XXH_VSX)
4878
4879 #define XXH3_accumulate_512 XXH3_accumulate_512_vsx
4880 #define XXH3_scrambleAcc XXH3_scrambleAcc_vsx
4881 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
4882
4883 #else /* scalar */
4884
4885 #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
4886 #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
4887 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
4888
4889 #endif
4890
4891 #ifndef XXH_PREFETCH_DIST
4892 #ifdef __clang__
4893 #define XXH_PREFETCH_DIST 320
4894 #else
4895 #if (XXH_VECTOR == XXH_AVX512)
4896 #define XXH_PREFETCH_DIST 512
4897 #else
4898 #define XXH_PREFETCH_DIST 384
4899 #endif
4900 #endif /* __clang__ */
4901 #endif /* XXH_PREFETCH_DIST */
4902
4903 /*
4904 * XXH3_accumulate()
4905 * Loops over XXH3_accumulate_512().
4906 * Assumption: nbStripes will not overflow the secret size
4907 */
4908 XXH_FORCE_INLINE void XXH3_accumulate(xxh_u64 *XXH_RESTRICT acc,
4909 const xxh_u8 *XXH_RESTRICT input,
4910 const xxh_u8 *XXH_RESTRICT secret,
4911 size_t nbStripes,
4912 XXH3_f_accumulate_512 f_acc512) {
4913
4914 size_t n;
4915 for (n = 0; n < nbStripes; n++) {
4916
4917 const xxh_u8 *const in = input + n * XXH_STRIPE_LEN;
4918 XXH_PREFETCH(in + XXH_PREFETCH_DIST);
4919 f_acc512(acc, in, secret + n * XXH_SECRET_CONSUME_RATE);
4920
4921 }
4922
4923 }
4924
4925 XXH_FORCE_INLINE void XXH3_hashLong_internal_loop(
4926 xxh_u64 *XXH_RESTRICT acc, const xxh_u8 *XXH_RESTRICT input, size_t len,
4927 const xxh_u8 *XXH_RESTRICT secret, size_t secretSize,
4928 XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) {
4929
4930 size_t const nbStripesPerBlock =
4931 (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
4932 size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock;
4933 size_t const nb_blocks = (len - 1) / block_len;
4934
4935 size_t n;
4936
4937 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
4938
4939 for (n = 0; n < nb_blocks; n++) {
4940
4941 XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock,
4942 f_acc512);
4943 f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);
4944
4945 }
4946
4947 /* last partial block */
4948 XXH_ASSERT(len > XXH_STRIPE_LEN);
4949 {
4950
4951 size_t const nbStripes =
4952 ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
4953 XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
4954 XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes,
4955 f_acc512);
4956
4957 /* last stripe */
4958 {
4959
4960 const xxh_u8 *const p = input + len - XXH_STRIPE_LEN;
4961 #define XXH_SECRET_LASTACC_START \
4962 7 /* not aligned on 8, last secret is different from acc & scrambler \
4963 */
4964 f_acc512(acc, p,
4965 secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
4966
4967 }
4968
4969 }
4970
4971 }
4972
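/*
 * Worked example (illustrative only, assuming the default 192-byte secret,
 * with XXH_STRIPE_LEN == 64 and XXH_SECRET_CONSUME_RATE == 8):
 *   nbStripesPerBlock = (192 - 64) / 8 = 16 stripes
 *   block_len         = 64 * 16       = 1024 bytes
 * A 4000-byte input is processed as (4000 - 1) / 1024 = 3 full blocks, each
 * followed by one scramble, then a partial block of
 * ((4000 - 1) - 3 * 1024) / 64 = 14 stripes, and finally the last (possibly
 * overlapping) stripe read from input + len - XXH_STRIPE_LEN.
 */
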
4973 XXH_FORCE_INLINE xxh_u64 XXH3_mix2Accs(const xxh_u64 *XXH_RESTRICT acc,
4974 const xxh_u8 *XXH_RESTRICT secret) {
4975
4976 return XXH3_mul128_fold64(acc[0] ^ XXH_readLE64(secret),
4977 acc[1] ^ XXH_readLE64(secret + 8));
4978
4979 }
4980
4981 static XXH64_hash_t XXH3_mergeAccs(const xxh_u64 *XXH_RESTRICT acc,
4982 const xxh_u8 *XXH_RESTRICT secret,
4983 xxh_u64 start) {
4984
4985 xxh_u64 result64 = start;
4986 size_t i = 0;
4987
4988 for (i = 0; i < 4; i++) {
4989
4990 result64 += XXH3_mix2Accs(acc + 2 * i, secret + 16 * i);
4991 #if defined(__clang__) /* Clang */ \
4992 && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \
4993 && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
4994 && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */
4995 /*
4996 * UGLY HACK:
4997 * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
4998 * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.
4999 * XXH3_64bits, len == 256, Snapdragon 835:
5000 * without hack: 2063.7 MB/s
5001 * with hack: 2560.7 MB/s
5002 */
5003 XXH_COMPILER_GUARD(result64);
5004 #endif
5005
5006 }
5007
5008 return XXH3_avalanche(result64);
5009
5010 }
5011
5012 #define XXH3_INIT_ACC \
5013 { \
5014 \
5015 XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \
5016 XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 \
5017 \
5018 }
5019
5020 XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_internal(
5021 const void *XXH_RESTRICT input, size_t len, const void *XXH_RESTRICT secret,
5022 size_t secretSize, XXH3_f_accumulate_512 f_acc512,
5023 XXH3_f_scrambleAcc f_scramble) {
5024
5025 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
5026
5027 XXH3_hashLong_internal_loop(acc, (const xxh_u8 *)input, len,
5028 (const xxh_u8 *)secret, secretSize, f_acc512,
5029 f_scramble);
5030
5031 /* converge into final hash */
5032 XXH_STATIC_ASSERT(sizeof(acc) == 64);
5033 /* do not align on 8, so that the secret is different from the accumulator
5034 */
5035 #define XXH_SECRET_MERGEACCS_START 11
5036 XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
5037 return XXH3_mergeAccs(acc,
5038 (const xxh_u8 *)secret + XXH_SECRET_MERGEACCS_START,
5039 (xxh_u64)len * XXH_PRIME64_1);
5040
5041 }
5042
5043 /*
5044 * It's important for performance that XXH3_hashLong is not inlined.
5045 */
5046 XXH_NO_INLINE XXH64_hash_t XXH3_hashLong_64b_withSecret(
5047 const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
5048 const xxh_u8 *XXH_RESTRICT secret, size_t secretLen) {
5049
5050 (void)seed64;
5051 return XXH3_hashLong_64b_internal(input, len, secret, secretLen,
5052 XXH3_accumulate_512, XXH3_scrambleAcc);
5053
5054 }
5055
5056 /*
5057 * It's important for performance that XXH3_hashLong is not inlined.
5058 * Since the function is not inlined, the compiler may not be able to understand
5059 * that, in some scenarios, its `secret` argument is actually a compile time
5060 * constant. This variant enforces that the compiler can detect that, and uses
5061 * this opportunity to streamline the generated code for better performance.
5062 */
5063 XXH_NO_INLINE XXH64_hash_t XXH3_hashLong_64b_default(
5064 const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
5065 const xxh_u8 *XXH_RESTRICT secret, size_t secretLen) {
5066
5067 (void)seed64;
5068 (void)secret;
5069 (void)secretLen;
5070 return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret,
5071 sizeof(XXH3_kSecret), XXH3_accumulate_512,
5072 XXH3_scrambleAcc);
5073
5074 }
5075
5076 /*
5077 * XXH3_hashLong_64b_withSeed():
5078 * Generate a custom key based on alteration of default XXH3_kSecret with the
5079 * seed, and then use this key for long mode hashing.
5080 *
5081 * This operation is decently fast but nonetheless costs a little bit of time.
5082 * Try to avoid it whenever possible (typically when seed==0).
5083 *
5084 * It's important for performance that XXH3_hashLong is not inlined. Not sure
5085 * why (uop cache maybe?), but the difference is large and easily measurable.
5086 */
5087 XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_withSeed_internal(
5088 const void *input, size_t len, XXH64_hash_t seed,
5089 XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble,
5090 XXH3_f_initCustomSecret f_initSec) {
5091
5092 if (seed == 0)
5093 return XXH3_hashLong_64b_internal(
5094 input, len, XXH3_kSecret, sizeof(XXH3_kSecret), f_acc512, f_scramble);
5095 {
5096
5097 XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
5098 f_initSec(secret, seed);
5099 return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
5100 f_acc512, f_scramble);
5101
5102 }
5103
5104 }
5105
5106 /*
5107 * It's important for performance that XXH3_hashLong is not inlined.
5108 */
5109 XXH_NO_INLINE XXH64_hash_t XXH3_hashLong_64b_withSeed(const void *input,
5110 size_t len,
5111 XXH64_hash_t seed,
5112 const xxh_u8 *secret,
5113 size_t secretLen) {
5114
5115 (void)secret;
5116 (void)secretLen;
5117 return XXH3_hashLong_64b_withSeed_internal(
5118 input, len, seed, XXH3_accumulate_512, XXH3_scrambleAcc,
5119 XXH3_initCustomSecret);
5120
5121 }
5122
5123 typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void *XXH_RESTRICT, size_t,
5124 XXH64_hash_t,
5125 const xxh_u8 *XXH_RESTRICT, size_t);
5126
5127 XXH_FORCE_INLINE XXH64_hash_t
5128 XXH3_64bits_internal(const void *XXH_RESTRICT input, size_t len,
5129 XXH64_hash_t seed64, const void *XXH_RESTRICT secret,
5130 size_t secretLen, XXH3_hashLong64_f f_hashLong) {
5131
5132 XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
5133 /*
5134 * If an action is to be taken if `secretLen` condition is not respected,
5135 * it should be done here.
5136 * For now, it's a contract pre-condition.
5137 * Adding a check and a branch here would cost performance at every hash.
5138 * Also, note that function signature doesn't offer room to return an error.
5139 */
5140 if (len <= 16)
5141 return XXH3_len_0to16_64b((const xxh_u8 *)input, len,
5142 (const xxh_u8 *)secret, seed64);
5143 if (len <= 128)
5144 return XXH3_len_17to128_64b((const xxh_u8 *)input, len,
5145 (const xxh_u8 *)secret, secretLen, seed64);
5146 if (len <= XXH3_MIDSIZE_MAX)
5147 return XXH3_len_129to240_64b((const xxh_u8 *)input, len,
5148 (const xxh_u8 *)secret, secretLen, seed64);
5149 return f_hashLong(input, len, seed64, (const xxh_u8 *)secret, secretLen);
5150
5151 }
5152
5153 /* === Public entry point === */
5154
5155 /*! @ingroup xxh3_family */
5156 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *input, size_t len) {
5157
5158 return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret),
5159 XXH3_hashLong_64b_default);
5160
5161 }
5162
5163 /*! @ingroup xxh3_family */
5164 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *input,
5165 size_t len,
5166 const void *secret,
5167 size_t secretSize) {
5168
5169 return XXH3_64bits_internal(input, len, 0, secret, secretSize,
5170 XXH3_hashLong_64b_withSecret);
5171
5172 }
5173
5174 /*! @ingroup xxh3_family */
5175 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void *input, size_t len,
5176 XXH64_hash_t seed) {
5177
5178 return XXH3_64bits_internal(input, len, seed, XXH3_kSecret,
5179 sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
5180
5181 }
5182
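/*
 * Usage sketch (illustrative, not part of the library): the three one-shot
 * 64-bit entry points above differ only in how the secret is selected.
 * The function name and the seed value below are hypothetical.
 *
 *   #include "xxhash.h"
 *
 *   XXH64_hash_t example_oneshot_64(const void* data, size_t size)
 *   {
 *       XXH64_hash_t const h_default = XXH3_64bits(data, size);   // default secret, seed 0
 *       XXH64_hash_t const h_seeded  = XXH3_64bits_withSeed(data, size, 0xC0FFEEULL);
 *       return h_default ^ h_seeded;   // combined only to use both results
 *   }
 */
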
5183 /* === XXH3 streaming === */
5184
5185 /*
5186 * Malloc's a pointer that is always aligned to align.
5187 *
5188 * This must be freed with `XXH_alignedFree()`.
5189 *
5190 * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte
5191 * alignment on 32-bit. This isn't enough for the 32 byte aligned loads used by
5192 * AVX2, nor, on 32-bit, for the 16 byte aligned loads used by SSE2 and NEON.
5193 *
5194 * This underalignment previously caused a rather obvious crash which went
5195 * completely unnoticed due to XXH3_createState() not actually being tested.
5196 * Credit to RedSpah for noticing this bug.
5197 *
5198 * The alignment is done manually: Functions like posix_memalign or _mm_malloc
5199 * are avoided: To maintain portability, we would have to write a fallback
5200 * like this anyways, and besides, testing for the existence of library
5201 * functions without relying on external build tools is impossible.
5202 *
5203 * The method is simple: Overallocate, manually align, and store the offset
5204 * to the original behind the returned pointer.
5205 *
5206 * Align must be a power of 2 and 8 <= align <= 128.
5207 */
5208 static void *XXH_alignedMalloc(size_t s, size_t align) {
5209
5210 XXH_ASSERT(align <= 128 && align >= 8); /* range check */
5211 XXH_ASSERT((align & (align - 1)) == 0); /* power of 2 */
5212 XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */
5213 { /* Overallocate to make room for manual realignment and an offset byte */
5214 xxh_u8 *base = (xxh_u8 *)XXH_malloc(s + align);
5215 if (base != NULL) {
5216
5217 /*
5218 * Get the offset needed to align this pointer.
5219 *
5220 * Even if the returned pointer is aligned, there will always be
5221 * at least one byte to store the offset to the original pointer.
5222 */
5223 size_t offset = align - ((size_t)base & (align - 1)); /* base % align */
5224 /* Add the offset for the now-aligned pointer */
5225 xxh_u8 *ptr = base + offset;
5226
5227 XXH_ASSERT((size_t)ptr % align == 0);
5228
5229 /* Store the offset immediately before the returned pointer. */
5230 ptr[-1] = (xxh_u8)offset;
5231 return ptr;
5232
5233 }
5234
5235 return NULL;
5236
5237 }
5238
5239 }
5240
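/*
 * Worked example (illustrative only): with align == 64 and a base pointer whose
 * low 6 bits are 0x28 (40), offset = 64 - 40 = 24, so the returned pointer is
 * base + 24 (64-byte aligned) and ptr[-1] stores 24. If base happens to be
 * aligned already, offset == 64: a full extra block is skipped, which is why
 * there is always at least one byte available in front of ptr for the offset.
 */
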
5241 /*
5242 * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
5243 * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.
5244 */
5245 static void XXH_alignedFree(void *p) {
5246
5247 if (p != NULL) {
5248
5249 xxh_u8 *ptr = (xxh_u8 *)p;
5250     /* Get the offset byte stored by XXH_alignedMalloc(). */
5251 xxh_u8 offset = ptr[-1];
5252 /* Free the original malloc'd pointer */
5253 xxh_u8 *base = ptr - offset;
5254 XXH_free(base);
5255
5256 }
5257
5258 }
5259
5260 /*! @ingroup xxh3_family */
5261 XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void) {
5262
5263 XXH3_state_t *const state =
5264 (XXH3_state_t *)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
5265 if (state == NULL) return NULL;
5266 XXH3_INITSTATE(state);
5267 return state;
5268
5269 }
5270
5271 /*! @ingroup xxh3_family */
5272 XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr) {
5273
5274 XXH_alignedFree(statePtr);
5275 return XXH_OK;
5276
5277 }
5278
5279 /*! @ingroup xxh3_family */
5280 XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t *dst_state,
5281 const XXH3_state_t *src_state) {
5282
5283 memcpy(dst_state, src_state, sizeof(*dst_state));
5284
5285 }
5286
5287 static void XXH3_reset_internal(XXH3_state_t *statePtr, XXH64_hash_t seed,
5288 const void *secret, size_t secretSize) {
5289
5290 size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
5291 size_t const initLength =
5292 offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
5293 XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);
5294 XXH_ASSERT(statePtr != NULL);
5295 /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */
5296 memset((char *)statePtr + initStart, 0, initLength);
5297 statePtr->acc[0] = XXH_PRIME32_3;
5298 statePtr->acc[1] = XXH_PRIME64_1;
5299 statePtr->acc[2] = XXH_PRIME64_2;
5300 statePtr->acc[3] = XXH_PRIME64_3;
5301 statePtr->acc[4] = XXH_PRIME64_4;
5302 statePtr->acc[5] = XXH_PRIME32_2;
5303 statePtr->acc[6] = XXH_PRIME64_5;
5304 statePtr->acc[7] = XXH_PRIME32_1;
5305 statePtr->seed = seed;
5306 statePtr->extSecret = (const unsigned char *)secret;
5307 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
5308 statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
5309 statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
5310
5311 }
5312
5313 /*! @ingroup xxh3_family */
5314 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t *statePtr) {
5315
5316 if (statePtr == NULL) return XXH_ERROR;
5317 XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
5318 return XXH_OK;
5319
5320 }
5321
5322 /*! @ingroup xxh3_family */
5323 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(
5324 XXH3_state_t *statePtr, const void *secret, size_t secretSize) {
5325
5326 if (statePtr == NULL) return XXH_ERROR;
5327 XXH3_reset_internal(statePtr, 0, secret, secretSize);
5328 if (secret == NULL) return XXH_ERROR;
5329 if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
5330 return XXH_OK;
5331
5332 }
5333
5334 /*! @ingroup xxh3_family */
5335 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t *statePtr,
5336 XXH64_hash_t seed) {
5337
5338 if (statePtr == NULL) return XXH_ERROR;
5339 if (seed == 0) return XXH3_64bits_reset(statePtr);
5340 if (seed != statePtr->seed)
5341 XXH3_initCustomSecret(statePtr->customSecret, seed);
5342 XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
5343 return XXH_OK;
5344
5345 }
5346
5347 /* Note : when XXH3_consumeStripes() is invoked,
5348  * the caller must guarantee that at least one more byte of input remains to be
5349  * consumed afterwards,
5350  * so that the function can blindly consume all stripes using the "normal"
5351  * secret segment */
5352 XXH_FORCE_INLINE void XXH3_consumeStripes(
5353 xxh_u64 *XXH_RESTRICT acc, size_t *XXH_RESTRICT nbStripesSoFarPtr,
5354 size_t nbStripesPerBlock, const xxh_u8 *XXH_RESTRICT input,
5355 size_t nbStripes, const xxh_u8 *XXH_RESTRICT secret, size_t secretLimit,
5356 XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) {
5357
5358 XXH_ASSERT(nbStripes <=
5359 nbStripesPerBlock); /* can handle max 1 scramble per invocation */
5360 XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
5361 if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) {
5362
5363 /* need a scrambling operation */
5364 size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr;
5365 size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock;
5366 XXH3_accumulate(acc, input,
5367 secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE,
5368 nbStripesToEndofBlock, f_acc512);
5369 f_scramble(acc, secret + secretLimit);
5370 XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret,
5371 nbStripesAfterBlock, f_acc512);
5372 *nbStripesSoFarPtr = nbStripesAfterBlock;
5373
5374 } else {
5375
5376 XXH3_accumulate(acc, input,
5377 secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE,
5378 nbStripes, f_acc512);
5379 *nbStripesSoFarPtr += nbStripes;
5380
5381 }
5382
5383 }
5384
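/*
 * Worked example (illustrative only, assuming the default secret, i.e.
 * nbStripesPerBlock == 16): if *nbStripesSoFarPtr == 14 and nbStripes == 4,
 * the first branch runs: 2 stripes finish the current block, the accumulator
 * is scrambled once, the remaining 2 stripes restart from the beginning of the
 * secret, and *nbStripesSoFarPtr becomes 2. With nbStripes == 1 instead, the
 * second branch simply advances *nbStripesSoFarPtr to 15.
 */
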
5385 /*
5386 * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
5387 */
5388 XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state,
5389 const xxh_u8 *input, size_t len,
5390 XXH3_f_accumulate_512 f_acc512,
5391 XXH3_f_scrambleAcc f_scramble) {
5392
5393 if (input == NULL)
5394 #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
5395 (XXH_ACCEPT_NULL_INPUT_POINTER >= 1)
5396 return XXH_OK;
5397 #else
5398 return XXH_ERROR;
5399 #endif
5400
5401 {
5402
5403 const xxh_u8 *const bEnd = input + len;
5404 const unsigned char *const secret =
5405 (state->extSecret == NULL) ? state->customSecret : state->extSecret;
5406
5407 state->totalLen += len;
5408 XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
5409
5410 if (state->bufferedSize + len <=
5411 XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */
5412 XXH_memcpy(state->buffer + state->bufferedSize, input, len);
5413 state->bufferedSize += (XXH32_hash_t)len;
5414 return XXH_OK;
5415
5416 }
5417
5418 /* total input is now > XXH3_INTERNALBUFFER_SIZE */
5419
5420 #define XXH3_INTERNALBUFFER_STRIPES \
5421 (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
5422 XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN ==
5423 0); /* clean multiple */
5424
5425 /*
5426 * Internal buffer is partially filled (always, except at beginning)
5427 * Complete it, then consume it.
5428 */
5429 if (state->bufferedSize) {
5430
5431 size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
5432 XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
5433 input += loadSize;
5434 XXH3_consumeStripes(state->acc, &state->nbStripesSoFar,
5435 state->nbStripesPerBlock, state->buffer,
5436 XXH3_INTERNALBUFFER_STRIPES, secret,
5437 state->secretLimit, f_acc512, f_scramble);
5438 state->bufferedSize = 0;
5439
5440 }
5441
5442 XXH_ASSERT(input < bEnd);
5443
5444 /* Consume input by a multiple of internal buffer size */
5445 if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
5446
5447 const xxh_u8 *const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
5448 do {
5449
5450 XXH3_consumeStripes(state->acc, &state->nbStripesSoFar,
5451 state->nbStripesPerBlock, input,
5452 XXH3_INTERNALBUFFER_STRIPES, secret,
5453 state->secretLimit, f_acc512, f_scramble);
5454 input += XXH3_INTERNALBUFFER_SIZE;
5455
5456 } while (input < limit);
5457
5458 /* for last partial stripe */
5459 memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN,
5460 input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
5461
5462 }
5463
5464 XXH_ASSERT(input < bEnd);
5465
5466 /* Some remaining input (always) : buffer it */
5467 XXH_memcpy(state->buffer, input, (size_t)(bEnd - input));
5468 state->bufferedSize = (XXH32_hash_t)(bEnd - input);
5469
5470 }
5471
5472 return XXH_OK;
5473
5474 }
5475
5476 /*! @ingroup xxh3_family */
5477 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update(XXH3_state_t *state,
5478 const void *input, size_t len) {
5479
5480 return XXH3_update(state, (const xxh_u8 *)input, len, XXH3_accumulate_512,
5481 XXH3_scrambleAcc);
5482
5483 }
5484
5485 XXH_FORCE_INLINE void XXH3_digest_long(XXH64_hash_t *acc,
5486 const XXH3_state_t *state,
5487 const unsigned char *secret) {
5488
5489 /*
5490 * Digest on a local copy. This way, the state remains unaltered, and it can
5491 * continue ingesting more input afterwards.
5492 */
5493 memcpy(acc, state->acc, sizeof(state->acc));
5494 if (state->bufferedSize >= XXH_STRIPE_LEN) {
5495
5496 size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
5497 size_t nbStripesSoFar = state->nbStripesSoFar;
5498 XXH3_consumeStripes(acc, &nbStripesSoFar, state->nbStripesPerBlock,
5499 state->buffer, nbStripes, secret, state->secretLimit,
5500 XXH3_accumulate_512, XXH3_scrambleAcc);
5501 /* last stripe */
5502 XXH3_accumulate_512(acc,
5503 state->buffer + state->bufferedSize - XXH_STRIPE_LEN,
5504 secret + state->secretLimit - XXH_SECRET_LASTACC_START);
5505
5506 } else { /* bufferedSize < XXH_STRIPE_LEN */
5507
5508 xxh_u8 lastStripe[XXH_STRIPE_LEN];
5509 size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
5510 XXH_ASSERT(state->bufferedSize >
5511 0); /* there is always some input buffered */
5512 memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize,
5513 catchupSize);
5514 memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
5515 XXH3_accumulate_512(acc, lastStripe,
5516 secret + state->secretLimit - XXH_SECRET_LASTACC_START);
5517
5518 }
5519
5520 }
5521
5522 /*! @ingroup xxh3_family */
5523 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest(const XXH3_state_t *state) {
5524
5525 const unsigned char *const secret =
5526 (state->extSecret == NULL) ? state->customSecret : state->extSecret;
5527 if (state->totalLen > XXH3_MIDSIZE_MAX) {
5528
5529 XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
5530 XXH3_digest_long(acc, state, secret);
5531 return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
5532 (xxh_u64)state->totalLen * XXH_PRIME64_1);
5533
5534 }
5535
5536 /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
5537 if (state->seed)
5538 return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen,
5539 state->seed);
5540 return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
5541 secret, state->secretLimit + XXH_STRIPE_LEN);
5542
5543 }
5544
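/*
 * Streaming usage sketch (illustrative, not part of the library): hash a buffer
 * in two chunks and check against the one-shot result. The function name and
 * chunk split are hypothetical.
 *
 *   #include <assert.h>
 *   #include "xxhash.h"
 *
 *   XXH64_hash_t example_streaming_64(const void* data, size_t size)
 *   {
 *       XXH3_state_t* const state = XXH3_createState();
 *       XXH64_hash_t hash = 0;
 *       if (state != NULL && XXH3_64bits_reset(state) == XXH_OK) {
 *           size_t const half = size / 2;
 *           XXH3_64bits_update(state, data, half);
 *           XXH3_64bits_update(state, (const char*)data + half, size - half);
 *           hash = XXH3_64bits_digest(state);
 *           assert(hash == XXH3_64bits(data, size));   // same result as one-shot
 *       }
 *       XXH3_freeState(state);
 *       return hash;
 *   }
 */
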
5545 #define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
5546
5547 /*! @ingroup xxh3_family */
5548 XXH_PUBLIC_API void XXH3_generateSecret(void *secretBuffer,
5549 const void *customSeed,
5550 size_t customSeedSize) {
5551
5552 XXH_ASSERT(secretBuffer != NULL);
5553 if (customSeedSize == 0) {
5554
5555 memcpy(secretBuffer, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
5556 return;
5557
5558 }
5559
5560 XXH_ASSERT(customSeed != NULL);
5561
5562 {
5563
5564 size_t const segmentSize = sizeof(XXH128_hash_t);
5565 size_t const nbSegments = XXH_SECRET_DEFAULT_SIZE / segmentSize;
5566 XXH128_canonical_t scrambler;
5567 XXH64_hash_t seeds[12];
5568 size_t segnb;
5569 XXH_ASSERT(nbSegments == 12);
5570 XXH_ASSERT(segmentSize * nbSegments ==
5571 XXH_SECRET_DEFAULT_SIZE); /* exact multiple */
5572 XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
5573
5574 /*
5575 * Copy customSeed to seeds[], truncating or repeating as necessary.
5576 */
5577 {
5578
5579 size_t toFill = XXH_MIN(customSeedSize, sizeof(seeds));
5580 size_t filled = toFill;
5581 memcpy(seeds, customSeed, toFill);
5582 while (filled < sizeof(seeds)) {
5583
5584 toFill = XXH_MIN(filled, sizeof(seeds) - filled);
5585 memcpy((char *)seeds + filled, seeds, toFill);
5586 filled += toFill;
5587
5588 }
5589
5590 }
5591
5592 /* generate secret */
5593 memcpy(secretBuffer, &scrambler, sizeof(scrambler));
5594 for (segnb = 1; segnb < nbSegments; segnb++) {
5595
5596 size_t const segmentStart = segnb * segmentSize;
5597 XXH128_canonical_t segment;
5598 XXH128_canonicalFromHash(&segment,
5599 XXH128(&scrambler, sizeof(scrambler),
5600 XXH_readLE64(seeds + segnb) + segnb));
5601 memcpy((char *)secretBuffer + segmentStart, &segment, sizeof(segment));
5602
5603 }
5604
5605 }
5606
5607 }
5608
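/*
 * Custom-secret usage sketch (illustrative, not part of the library): derive a
 * full-size secret from low-entropy seed material, then hash with it. The
 * 192-byte buffer size matches XXH_SECRET_DEFAULT_SIZE in this file; treat it
 * as an assumption if that macro is not visible from your translation unit.
 *
 *   #include "xxhash.h"
 *
 *   XXH64_hash_t example_custom_secret(const void* data, size_t size)
 *   {
 *       static const char seedMaterial[] = "application-specific seed material";
 *       unsigned char secret[192];
 *       XXH3_generateSecret(secret, seedMaterial, sizeof(seedMaterial));
 *       return XXH3_64bits_withSecret(data, size, secret, sizeof(secret));
 *   }
 */
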
5609 /* ==========================================
5610 * XXH3 128 bits (a.k.a XXH128)
5611 * ==========================================
5612 * XXH3's 128-bit variant has better mixing and strength than the 64-bit
5613 * variant, even without counting the significantly larger output size.
5614 *
5615 * For example, extra steps are taken to avoid the seed-dependent collisions
5616 * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
5617 *
5618 * This strength naturally comes at the cost of some speed, especially on short
5619 * lengths. Note that longer hashes are about as fast as the 64-bit version
5620 * due to it using only a slight modification of the 64-bit loop.
5621 *
5622 * XXH128 is also more oriented towards 64-bit machines. It is still extremely
5623 * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
5624 */
5625
5626 XXH_FORCE_INLINE XXH128_hash_t XXH3_len_1to3_128b(const xxh_u8 *input,
5627 size_t len,
5628 const xxh_u8 *secret,
5629 XXH64_hash_t seed) {
5630
5631 /* A doubled version of 1to3_64b with different constants. */
5632 XXH_ASSERT(input != NULL);
5633 XXH_ASSERT(1 <= len && len <= 3);
5634 XXH_ASSERT(secret != NULL);
5635 /*
5636 * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
5637 * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
5638 * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
5639 */
5640 {
5641
5642 xxh_u8 const c1 = input[0];
5643 xxh_u8 const c2 = input[len >> 1];
5644 xxh_u8 const c3 = input[len - 1];
5645 xxh_u32 const combinedl = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) |
5646 ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
5647 xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
5648 xxh_u64 const bitflipl =
5649 (XXH_readLE32(secret) ^ XXH_readLE32(secret + 4)) + seed;
5650 xxh_u64 const bitfliph =
5651 (XXH_readLE32(secret + 8) ^ XXH_readLE32(secret + 12)) - seed;
5652 xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl;
5653 xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;
5654 XXH128_hash_t h128;
5655 h128.low64 = XXH64_avalanche(keyed_lo);
5656 h128.high64 = XXH64_avalanche(keyed_hi);
5657 return h128;
5658
5659 }
5660
5661 }
5662
5663 XXH_FORCE_INLINE XXH128_hash_t XXH3_len_4to8_128b(const xxh_u8 *input,
5664 size_t len,
5665 const xxh_u8 *secret,
5666 XXH64_hash_t seed) {
5667
5668 XXH_ASSERT(input != NULL);
5669 XXH_ASSERT(secret != NULL);
5670 XXH_ASSERT(4 <= len && len <= 8);
5671 seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
5672 {
5673
5674 xxh_u32 const input_lo = XXH_readLE32(input);
5675 xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
5676 xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
5677 xxh_u64 const bitflip =
5678 (XXH_readLE64(secret + 16) ^ XXH_readLE64(secret + 24)) + seed;
5679 xxh_u64 const keyed = input_64 ^ bitflip;
5680
5681     /* Shift len left so the added term is even: an odd prime plus an even
5682      * value keeps the multiplier odd, avoiding a multiply by an even number. */
5683 XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));
5684
5685 m128.high64 += (m128.low64 << 1);
5686 m128.low64 ^= (m128.high64 >> 3);
5687
5688 m128.low64 = XXH_xorshift64(m128.low64, 35);
5689 m128.low64 *= 0x9FB21C651E98DF25ULL;
5690 m128.low64 = XXH_xorshift64(m128.low64, 28);
5691 m128.high64 = XXH3_avalanche(m128.high64);
5692 return m128;
5693
5694 }
5695
5696 }
5697
5698 XXH_FORCE_INLINE XXH128_hash_t XXH3_len_9to16_128b(const xxh_u8 *input,
5699 size_t len,
5700 const xxh_u8 *secret,
5701 XXH64_hash_t seed) {
5702
5703 XXH_ASSERT(input != NULL);
5704 XXH_ASSERT(secret != NULL);
5705 XXH_ASSERT(9 <= len && len <= 16);
5706 {
5707
5708 xxh_u64 const bitflipl =
5709 (XXH_readLE64(secret + 32) ^ XXH_readLE64(secret + 40)) - seed;
5710 xxh_u64 const bitfliph =
5711 (XXH_readLE64(secret + 48) ^ XXH_readLE64(secret + 56)) + seed;
5712 xxh_u64 const input_lo = XXH_readLE64(input);
5713 xxh_u64 input_hi = XXH_readLE64(input + len - 8);
5714 XXH128_hash_t m128 =
5715 XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);
5716 /*
5717 * Put len in the middle of m128 to ensure that the length gets mixed to
5718 * both the low and high bits in the 128x64 multiply below.
5719 */
5720 m128.low64 += (xxh_u64)(len - 1) << 54;
5721 input_hi ^= bitfliph;
5722 /*
5723 * Add the high 32 bits of input_hi to the high 32 bits of m128, then
5724 * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
5725 * the high 64 bits of m128.
5726 *
5727 * The best approach to this operation is different on 32-bit and 64-bit.
5728 */
5729 if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */
5730 /*
5731 * 32-bit optimized version, which is more readable.
5732 *
5733 * On 32-bit, it removes an ADC and delays a dependency between the two
5734 * halves of m128.high64, but it generates an extra mask on 64-bit.
5735 */
5736 m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) +
5737 XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
5738
5739 } else {
5740
5741 /*
5742 * 64-bit optimized (albeit more confusing) version.
5743 *
5744 * Uses some properties of addition and multiplication to remove the mask:
5745 *
5746 * Let:
5747 * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
5748 * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
5749 * c = XXH_PRIME32_2
5750 *
5751 * a + (b * c)
5752 * Inverse Property: x + y - x == y
5753 * a + (b * (1 + c - 1))
5754 * Distributive Property: x * (y + z) == (x * y) + (x * z)
5755 * a + (b * 1) + (b * (c - 1))
5756 * Identity Property: x * 1 == x
5757 * a + b + (b * (c - 1))
5758 *
5759 * Substitute a, b, and c:
5760 * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 -
5761 * 1))
5762 *
5763 * Since input_hi.hi + input_hi.lo == input_hi, we get this:
5764 * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
5765 */
5766 m128.high64 +=
5767 input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1);
5768
5769 }
5770
5771 /* m128 ^= XXH_swap64(m128 >> 64); */
5772 m128.low64 ^= XXH_swap64(m128.high64);
5773
5774 { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
5775 XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2);
5776 h128.high64 += m128.high64 * XXH_PRIME64_2;
5777
5778 h128.low64 = XXH3_avalanche(h128.low64);
5779 h128.high64 = XXH3_avalanche(h128.high64);
5780 return h128;
5781
5782 }
5783
5784 }
5785
5786 }
5787
5788 /*
5789 * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
5790 */
5791 XXH_FORCE_INLINE XXH128_hash_t XXH3_len_0to16_128b(const xxh_u8 *input,
5792 size_t len,
5793 const xxh_u8 *secret,
5794 XXH64_hash_t seed) {
5795
5796 XXH_ASSERT(len <= 16);
5797 {
5798
5799 if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
5800 if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
5801 if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
5802 {
5803
5804 XXH128_hash_t h128;
5805 xxh_u64 const bitflipl =
5806 XXH_readLE64(secret + 64) ^ XXH_readLE64(secret + 72);
5807 xxh_u64 const bitfliph =
5808 XXH_readLE64(secret + 80) ^ XXH_readLE64(secret + 88);
5809 h128.low64 = XXH64_avalanche(seed ^ bitflipl);
5810 h128.high64 = XXH64_avalanche(seed ^ bitfliph);
5811 return h128;
5812
5813 }
5814
5815 }
5816
5817 }
5818
5819 /*
5820 * A bit slower than XXH3_mix16B, but handles multiply by zero better.
5821 */
5822 XXH_FORCE_INLINE XXH128_hash_t XXH128_mix32B(XXH128_hash_t acc,
5823 const xxh_u8 *input_1,
5824 const xxh_u8 *input_2,
5825 const xxh_u8 *secret,
5826 XXH64_hash_t seed) {
5827
5828 acc.low64 += XXH3_mix16B(input_1, secret + 0, seed);
5829 acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
5830 acc.high64 += XXH3_mix16B(input_2, secret + 16, seed);
5831 acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
5832 return acc;
5833
5834 }
5835
5836 XXH_FORCE_INLINE XXH128_hash_t XXH3_len_17to128_128b(
5837 const xxh_u8 *XXH_RESTRICT input, size_t len,
5838 const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) {
5839
5840 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
5841 (void)secretSize;
5842 XXH_ASSERT(16 < len && len <= 128);
5843
5844 {
5845
5846 XXH128_hash_t acc;
5847 acc.low64 = len * XXH_PRIME64_1;
5848 acc.high64 = 0;
5849 if (len > 32) {
5850
5851 if (len > 64) {
5852
5853 if (len > 96) {
5854
5855 acc = XXH128_mix32B(acc, input + 48, input + len - 64, secret + 96,
5856 seed);
5857
5858 }
5859
5860 acc =
5861 XXH128_mix32B(acc, input + 32, input + len - 48, secret + 64, seed);
5862
5863 }
5864
5865 acc = XXH128_mix32B(acc, input + 16, input + len - 32, secret + 32, seed);
5866
5867 }
5868
5869 acc = XXH128_mix32B(acc, input, input + len - 16, secret, seed);
5870 {
5871
5872 XXH128_hash_t h128;
5873 h128.low64 = acc.low64 + acc.high64;
5874 h128.high64 = (acc.low64 * XXH_PRIME64_1) + (acc.high64 * XXH_PRIME64_4) +
5875 ((len - seed) * XXH_PRIME64_2);
5876 h128.low64 = XXH3_avalanche(h128.low64);
5877 h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
5878 return h128;
5879
5880 }
5881
5882 }
5883
5884 }
5885
5886 XXH_NO_INLINE XXH128_hash_t XXH3_len_129to240_128b(
5887 const xxh_u8 *XXH_RESTRICT input, size_t len,
5888 const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) {
5889
5890 XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
5891 (void)secretSize;
5892 XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
5893
5894 {
5895
5896 XXH128_hash_t acc;
5897 int const nbRounds = (int)len / 32;
5898 int i;
5899 acc.low64 = len * XXH_PRIME64_1;
5900 acc.high64 = 0;
5901 for (i = 0; i < 4; i++) {
5902
5903 acc = XXH128_mix32B(acc, input + (32 * i), input + (32 * i) + 16,
5904 secret + (32 * i), seed);
5905
5906 }
5907
5908 acc.low64 = XXH3_avalanche(acc.low64);
5909 acc.high64 = XXH3_avalanche(acc.high64);
5910 XXH_ASSERT(nbRounds >= 4);
5911 for (i = 4; i < nbRounds; i++) {
5912
5913 acc = XXH128_mix32B(acc, input + (32 * i), input + (32 * i) + 16,
5914 secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)),
5915 seed);
5916
5917 }
5918
5919 /* last bytes */
5920 acc = XXH128_mix32B(
5921 acc, input + len - 16, input + len - 32,
5922 secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
5923 0ULL - seed);
5924
5925 {
5926
5927 XXH128_hash_t h128;
5928 h128.low64 = acc.low64 + acc.high64;
5929 h128.high64 = (acc.low64 * XXH_PRIME64_1) + (acc.high64 * XXH_PRIME64_4) +
5930 ((len - seed) * XXH_PRIME64_2);
5931 h128.low64 = XXH3_avalanche(h128.low64);
5932 h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
5933 return h128;
5934
5935 }
5936
5937 }
5938
5939 }
5940
5941 XXH_FORCE_INLINE XXH128_hash_t XXH3_hashLong_128b_internal(
5942 const void *XXH_RESTRICT input, size_t len,
5943 const xxh_u8 *XXH_RESTRICT secret, size_t secretSize,
5944 XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) {
5945
5946 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
5947
5948 XXH3_hashLong_internal_loop(acc, (const xxh_u8 *)input, len, secret,
5949 secretSize, f_acc512, f_scramble);
5950
5951 /* converge into final hash */
5952 XXH_STATIC_ASSERT(sizeof(acc) == 64);
5953 XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
5954 {
5955
5956 XXH128_hash_t h128;
5957 h128.low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
5958 (xxh_u64)len * XXH_PRIME64_1);
5959 h128.high64 = XXH3_mergeAccs(
5960 acc, secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
5961 ~((xxh_u64)len * XXH_PRIME64_2));
5962 return h128;
5963
5964 }
5965
5966 }
5967
5968 /*
5969 * It's important for performance that XXH3_hashLong is not inlined.
5970 */
5971 XXH_NO_INLINE XXH128_hash_t XXH3_hashLong_128b_default(
5972 const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
5973 const void *XXH_RESTRICT secret, size_t secretLen) {
5974
5975 (void)seed64;
5976 (void)secret;
5977 (void)secretLen;
5978 return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret,
5979 sizeof(XXH3_kSecret), XXH3_accumulate_512,
5980 XXH3_scrambleAcc);
5981
5982 }
5983
5984 /*
5985 * It's important for performance that XXH3_hashLong is not inlined.
5986 */
5987 XXH_NO_INLINE XXH128_hash_t XXH3_hashLong_128b_withSecret(
5988 const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
5989 const void *XXH_RESTRICT secret, size_t secretLen) {
5990
5991 (void)seed64;
5992 return XXH3_hashLong_128b_internal(input, len, (const xxh_u8 *)secret,
5993 secretLen, XXH3_accumulate_512,
5994 XXH3_scrambleAcc);
5995
5996 }
5997
5998 XXH_FORCE_INLINE XXH128_hash_t XXH3_hashLong_128b_withSeed_internal(
5999 const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
6000 XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble,
6001 XXH3_f_initCustomSecret f_initSec) {
6002
6003 if (seed64 == 0)
6004 return XXH3_hashLong_128b_internal(
6005 input, len, XXH3_kSecret, sizeof(XXH3_kSecret), f_acc512, f_scramble);
6006 {
6007
6008 XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
6009 f_initSec(secret, seed64);
6010 return XXH3_hashLong_128b_internal(input, len, (const xxh_u8 *)secret,
6011 sizeof(secret), f_acc512, f_scramble);
6012
6013 }
6014
6015 }
6016
6017 /*
6018 * It's important for performance that XXH3_hashLong is not inlined.
6019 */
6020 XXH_NO_INLINE XXH128_hash_t
6021 XXH3_hashLong_128b_withSeed(const void *input, size_t len, XXH64_hash_t seed64,
6022 const void *XXH_RESTRICT secret, size_t secretLen) {
6023
6024 (void)secret;
6025 (void)secretLen;
6026 return XXH3_hashLong_128b_withSeed_internal(
6027 input, len, seed64, XXH3_accumulate_512, XXH3_scrambleAcc,
6028 XXH3_initCustomSecret);
6029
6030 }
6031
6032 typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void *XXH_RESTRICT, size_t,
6033 XXH64_hash_t,
6034 const void *XXH_RESTRICT, size_t);
6035
6036 XXH_FORCE_INLINE XXH128_hash_t
6037 XXH3_128bits_internal(const void *input, size_t len, XXH64_hash_t seed64,
6038 const void *XXH_RESTRICT secret, size_t secretLen,
6039 XXH3_hashLong128_f f_hl128) {
6040
6041 XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
6042 /*
6043 * If an action is to be taken if `secret` conditions are not respected,
6044 * it should be done here.
6045 * For now, it's a contract pre-condition.
6046 * Adding a check and a branch here would cost performance at every hash.
6047 */
6048 if (len <= 16)
6049 return XXH3_len_0to16_128b((const xxh_u8 *)input, len,
6050 (const xxh_u8 *)secret, seed64);
6051 if (len <= 128)
6052 return XXH3_len_17to128_128b((const xxh_u8 *)input, len,
6053 (const xxh_u8 *)secret, secretLen, seed64);
6054 if (len <= XXH3_MIDSIZE_MAX)
6055 return XXH3_len_129to240_128b((const xxh_u8 *)input, len,
6056 (const xxh_u8 *)secret, secretLen, seed64);
6057 return f_hl128(input, len, seed64, secret, secretLen);
6058
6059 }
6060
6061 /* === Public XXH128 API === */
6062
6063 /*! @ingroup xxh3_family */
6064 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void *input, size_t len) {
6065
6066 return XXH3_128bits_internal(input, len, 0, XXH3_kSecret,
6067 sizeof(XXH3_kSecret),
6068 XXH3_hashLong_128b_default);
6069
6070 }
6071
6072 /*! @ingroup xxh3_family */
6073 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void *input,
6074 size_t len,
6075 const void *secret,
6076 size_t secretSize) {
6077
6078 return XXH3_128bits_internal(input, len, 0, (const xxh_u8 *)secret,
6079 secretSize, XXH3_hashLong_128b_withSecret);
6080
6081 }
6082
6083 /*! @ingroup xxh3_family */
6084 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void *input,
6085 size_t len,
6086 XXH64_hash_t seed) {
6087
6088 return XXH3_128bits_internal(input, len, seed, XXH3_kSecret,
6089 sizeof(XXH3_kSecret),
6090 XXH3_hashLong_128b_withSeed);
6091
6092 }
6093
6094 /*! @ingroup xxh3_family */
6095 XXH_PUBLIC_API XXH128_hash_t XXH128(const void *input, size_t len,
6096 XXH64_hash_t seed) {
6097
6098 return XXH3_128bits_withSeed(input, len, seed);
6099
6100 }
6101
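/*
 * Usage sketch (illustrative, not part of the library): XXH128() is simply the
 * seeded 128-bit one-shot; the seed value below is arbitrary.
 *
 *   #include "xxhash.h"
 *
 *   int example_xxh128(const void* data, size_t size)
 *   {
 *       XXH128_hash_t const a = XXH128(data, size, 42);
 *       XXH128_hash_t const b = XXH3_128bits_withSeed(data, size, 42);
 *       return XXH128_isEqual(a, b);   // always 1 for identical input and seed
 *   }
 */
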
6102 /* === XXH3 128-bit streaming === */
6103
6104 /*
6105  * All the functions are actually the same as for the 64-bit streaming variant.
6106 * The only difference is the finalization routine.
6107 */
6108
6109 /*! @ingroup xxh3_family */
6110 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t *statePtr) {
6111
6112 if (statePtr == NULL) return XXH_ERROR;
6113 XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
6114 return XXH_OK;
6115
6116 }
6117
6118 /*! @ingroup xxh3_family */
6119 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(
6120 XXH3_state_t *statePtr, const void *secret, size_t secretSize) {
6121
6122 if (statePtr == NULL) return XXH_ERROR;
6123 XXH3_reset_internal(statePtr, 0, secret, secretSize);
6124 if (secret == NULL) return XXH_ERROR;
6125 if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
6126 return XXH_OK;
6127
6128 }
6129
6130 /*! @ingroup xxh3_family */
6131 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t *statePtr,
6132 XXH64_hash_t seed) {
6133
6134 if (statePtr == NULL) return XXH_ERROR;
6135 if (seed == 0) return XXH3_128bits_reset(statePtr);
6136 if (seed != statePtr->seed)
6137 XXH3_initCustomSecret(statePtr->customSecret, seed);
6138 XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
6139 return XXH_OK;
6140
6141 }
6142
6143 /*! @ingroup xxh3_family */
6144 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t *state,
6145 const void *input,
6146 size_t len) {
6147
6148 return XXH3_update(state, (const xxh_u8 *)input, len, XXH3_accumulate_512,
6149 XXH3_scrambleAcc);
6150
6151 }
6152
6153 /*! @ingroup xxh3_family */
6154 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest(const XXH3_state_t *state) {
6155
6156 const unsigned char *const secret =
6157 (state->extSecret == NULL) ? state->customSecret : state->extSecret;
6158 if (state->totalLen > XXH3_MIDSIZE_MAX) {
6159
6160 XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
6161 XXH3_digest_long(acc, state, secret);
6162 XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >=
6163 sizeof(acc) + XXH_SECRET_MERGEACCS_START);
6164 {
6165
6166 XXH128_hash_t h128;
6167 h128.low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
6168 (xxh_u64)state->totalLen * XXH_PRIME64_1);
6169 h128.high64 =
6170 XXH3_mergeAccs(acc,
6171 secret + state->secretLimit + XXH_STRIPE_LEN -
6172 sizeof(acc) - XXH_SECRET_MERGEACCS_START,
6173 ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
6174 return h128;
6175
6176 }
6177
6178 }
6179
6180 /* len <= XXH3_MIDSIZE_MAX : short code */
6181 if (state->seed)
6182 return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen,
6183 state->seed);
6184 return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
6185 secret, state->secretLimit + XXH_STRIPE_LEN);
6186
6187 }
6188
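/*
 * Streaming sketch (illustrative, not part of the library): the 128-bit
 * streaming API mirrors the 64-bit one; only the digest type changes.
 *
 *   #include "xxhash.h"
 *
 *   XXH128_hash_t example_streaming_128(const void* data, size_t size)
 *   {
 *       XXH128_hash_t h128 = {0, 0};
 *       XXH3_state_t* const state = XXH3_createState();
 *       if (state != NULL && XXH3_128bits_reset(state) == XXH_OK) {
 *           XXH3_128bits_update(state, data, size);
 *           h128 = XXH3_128bits_digest(state);
 *       }
 *       XXH3_freeState(state);
 *       return h128;
 *   }
 */
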
6189 /* 128-bit utility functions */
6190
6191 #include <string.h> /* memcmp, memcpy */
6192
6193 /* return : 1 if equal, 0 if different */
6194 /*! @ingroup xxh3_family */
6195 XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) {
6196
6197 /* note : XXH128_hash_t is compact, it has no padding byte */
6198 return !(memcmp(&h1, &h2, sizeof(h1)));
6199
6200 }
6201
6202 /* This prototype is compatible with stdlib's qsort().
6203 * return : >0 if *h128_1 > *h128_2
6204 * <0 if *h128_1 < *h128_2
6205 * =0 if *h128_1 == *h128_2 */
6206 /*! @ingroup xxh3_family */
6207 XXH_PUBLIC_API int XXH128_cmp(const void *h128_1, const void *h128_2) {
6208
6209 XXH128_hash_t const h1 = *(const XXH128_hash_t *)h128_1;
6210 XXH128_hash_t const h2 = *(const XXH128_hash_t *)h128_2;
6211 int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
6212 /* note : bets that, in most cases, hash values are different */
6213 if (hcmp) return hcmp;
6214 return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
6215
6216 }
6217
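/*
 * Sorting sketch (illustrative, not part of the library): XXH128_cmp() matches
 * the qsort() comparator contract, so an array of XXH128_hash_t can be ordered
 * directly. The array and count below are hypothetical.
 *
 *   #include <stdlib.h>
 *   #include "xxhash.h"
 *
 *   void example_sort_hashes(XXH128_hash_t* hashes, size_t count)
 *   {
 *       qsort(hashes, count, sizeof(hashes[0]), XXH128_cmp);
 *   }
 */
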
6218 /*====== Canonical representation ======*/
6219 /*! @ingroup xxh3_family */
6220 XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t *dst,
6221 XXH128_hash_t hash) {
6222
6223 XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
6224 if (XXH_CPU_LITTLE_ENDIAN) {
6225
6226 hash.high64 = XXH_swap64(hash.high64);
6227 hash.low64 = XXH_swap64(hash.low64);
6228
6229 }
6230
6231 memcpy(dst, &hash.high64, sizeof(hash.high64));
6232 memcpy((char *)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
6233
6234 }
6235
6236 /*! @ingroup xxh3_family */
6237 XXH_PUBLIC_API XXH128_hash_t
6238 XXH128_hashFromCanonical(const XXH128_canonical_t *src) {
6239
6240 XXH128_hash_t h;
6241 h.high64 = XXH_readBE64(src);
6242 h.low64 = XXH_readBE64(src->digest + 8);
6243 return h;
6244
6245 }
6246
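/*
 * Serialization sketch (illustrative, not part of the library): the canonical
 * form is a fixed big-endian byte layout suitable for storage or transmission;
 * reading it back yields the original hash on any platform.
 *
 *   #include <string.h>
 *   #include "xxhash.h"
 *
 *   void example_canonical_roundtrip(XXH128_hash_t h, unsigned char out[16])
 *   {
 *       XXH128_canonical_t canon;
 *       XXH128_canonicalFromHash(&canon, h);
 *       memcpy(out, canon.digest, sizeof(canon.digest));   // 16 canonical bytes
 *       // XXH128_hashFromCanonical(&canon) reproduces h exactly.
 *   }
 */
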
6247 /* Pop our optimization override from above */
6248 #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
6249 && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
6250 && defined(__OPTIMIZE__) && \
6251 !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
6252 #pragma GCC pop_options
6253 #endif
6254
6255 #endif /* XXH_NO_LONG_LONG */
6256
6257 #endif /* XXH_NO_XXH3 */
6258
6259 /*!
6260 * @}
6261 */
6262 #endif /* XXH_IMPLEMENTATION */
6263
6264 #if defined(__cplusplus)
6265
6266 }
6267
6268 #endif
6269
6270