xref: /aosp_15_r20/external/AFLplusplus/include/xxhash.h (revision 08b48e0b10e97b33e7b60c5b6e2243bd915777f2)
1 /*
2  * xxHash - Extremely Fast Hash algorithm
3  * Header File
4  * Copyright (C) 2012-2024 Yann Collet
5  *
6  * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are
10  * met:
11  *
12  *    * Redistributions of source code must retain the above copyright
13  *      notice, this list of conditions and the following disclaimer.
14  *    * Redistributions in binary form must reproduce the above
15  *      copyright notice, this list of conditions and the following disclaimer
16  *      in the documentation and/or other materials provided with the
17  *      distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  * You can contact the author at:
32  *   - xxHash homepage: https://www.xxhash.com
33  *   - xxHash source repository: https://github.com/Cyan4973/xxHash
34  */
35 /*!
36  * @mainpage xxHash
37  *
38  * @file xxhash.h
39  * xxHash prototypes and implementation
40  */
41 /* TODO: update */
42 /* Notice extracted from xxHash homepage:
43 
44 xxHash is an extremely fast hash algorithm, running at RAM speed limits.
45 It also successfully passes all tests from the SMHasher suite.
46 
47 Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo
48 @3GHz)
49 
50 Name            Speed       Q.Score   Author
51 xxHash          5.4 GB/s     10
52 CrapWow         3.2 GB/s      2       Andrew
53 MurmurHash 3a   2.7 GB/s     10       Austin Appleby
54 SpookyHash      2.0 GB/s     10       Bob Jenkins
55 SBox            1.4 GB/s      9       Bret Mulvey
56 Lookup3         1.2 GB/s      9       Bob Jenkins
57 SuperFastHash   1.2 GB/s      1       Paul Hsieh
58 CityHash64      1.05 GB/s    10       Pike & Alakuijala
59 FNV             0.55 GB/s     5       Fowler, Noll, Vo
60 CRC32           0.43 GB/s     9
61 MD5-32          0.33 GB/s    10       Ronald L. Rivest
62 SHA1-32         0.28 GB/s    10
63 
64 Q.Score is a measure of quality of the hash function.
65 It depends on successfully passing SMHasher test set.
66 10 is a perfect score.
67 
68 Note: SMHasher's CRC32 implementation is not the fastest one.
69 Other speed-oriented implementations can be faster,
70 especially in combination with PCLMUL instruction:
71 https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735
72 
73 A 64-bit version, named XXH64, is available since r35.
74 It offers much better speed, but for 64-bit applications only.
75 Name     Speed on 64 bits    Speed on 32 bits
76 XXH64       13.8 GB/s            1.9 GB/s
77 XXH32        6.8 GB/s            6.0 GB/s
78 */
79 
80 #if defined(__cplusplus)
81 extern "C" {
82 
83 #endif
84 
85 /* ****************************
86  *  INLINE mode
87  ******************************/
88 /*!
89  * XXH_INLINE_ALL (and XXH_PRIVATE_API)
90  * Use these build macros to inline xxhash into the target unit.
91  * Inlining improves performance on small inputs, especially when the length is
92  * expressed as a compile-time constant:
93  *
94  *      https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
95  *
96  * It also keeps xxHash symbols private to the unit, so they are not exported.
97  *
98  * Usage:
99  *     #define XXH_INLINE_ALL
100  *     #include "xxhash.h"
101  *
102  * Do not compile and link xxhash.o as a separate object, as it is not useful.
103  */
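/*!
 * Illustrative sketch (not part of the library): a translation unit that
 * inlines xxHash for a fixed-size key. The struct and function names below are
 * hypothetical, chosen only to demonstrate the intended usage.
 * @code{.c}
 *    #define XXH_INLINE_ALL
 *    #include "xxhash.h"
 *
 *    typedef struct { char bytes[16]; } key16;  // length known at compile time
 *
 *    static XXH64_hash_t hash_key(const key16* k)
 *    {
 *        // sizeof(*k) is a compile-time constant, which is where inlining
 *        // pays off most on small inputs.
 *        return XXH3_64bits(k, sizeof(*k));
 *    }
 * @endcode
 */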
104 #if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) && \
105     !defined(XXH_INLINE_ALL_31684351384)
106 /* this section should be traversed only once */
107   #define XXH_INLINE_ALL_31684351384
108 /* give access to the advanced API, required to compile implementations */
109   #undef XXH_STATIC_LINKING_ONLY                       /* avoid macro redef */
110   #define XXH_STATIC_LINKING_ONLY
111 /* make all functions private */
112   #undef XXH_PUBLIC_API
113   #if defined(__GNUC__)
114     #define XXH_PUBLIC_API static __inline __attribute__((unused))
115   #elif defined(__cplusplus) || \
116       (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
117     #define XXH_PUBLIC_API static inline
118   #elif defined(_MSC_VER)
119     #define XXH_PUBLIC_API static __inline
120   #else
121   /* note: this version may generate warnings for unused static functions */
122     #define XXH_PUBLIC_API static
123   #endif
124 
125 /*
126  * This part deals with the special case where a unit wants to inline xxHash,
127  * but "xxhash.h" has previously been included without XXH_INLINE_ALL,
128  * such as part of some previously included *.h header file.
129  * Without further action, the new include would just be ignored,
130  * and functions would effectively _not_ be inlined (silent failure).
131  * The following macros solve this situation by prefixing all inlined names,
132  * avoiding naming collision with previous inclusions.
133  */
134 /* Before that, we unconditionally #undef all symbols,
135  * in case they were already defined with XXH_NAMESPACE.
136  * They will then be redefined for XXH_INLINE_ALL
137  */
138   #undef XXH_versionNumber
139 /* XXH32 */
140   #undef XXH32
141   #undef XXH32_createState
142   #undef XXH32_freeState
143   #undef XXH32_reset
144   #undef XXH32_update
145   #undef XXH32_digest
146   #undef XXH32_copyState
147   #undef XXH32_canonicalFromHash
148   #undef XXH32_hashFromCanonical
149 /* XXH64 */
150   #undef XXH64
151   #undef XXH64_createState
152   #undef XXH64_freeState
153   #undef XXH64_reset
154   #undef XXH64_update
155   #undef XXH64_digest
156   #undef XXH64_copyState
157   #undef XXH64_canonicalFromHash
158   #undef XXH64_hashFromCanonical
159 /* XXH3_64bits */
160   #undef XXH3_64bits
161   #undef XXH3_64bits_withSecret
162   #undef XXH3_64bits_withSeed
163   #undef XXH3_createState
164   #undef XXH3_freeState
165   #undef XXH3_copyState
166   #undef XXH3_64bits_reset
167   #undef XXH3_64bits_reset_withSeed
168   #undef XXH3_64bits_reset_withSecret
169   #undef XXH3_64bits_update
170   #undef XXH3_64bits_digest
171   #undef XXH3_generateSecret
172 /* XXH3_128bits */
173   #undef XXH128
174   #undef XXH3_128bits
175   #undef XXH3_128bits_withSeed
176   #undef XXH3_128bits_withSecret
177   #undef XXH3_128bits_reset
178   #undef XXH3_128bits_reset_withSeed
179   #undef XXH3_128bits_reset_withSecret
180   #undef XXH3_128bits_update
181   #undef XXH3_128bits_digest
182   #undef XXH128_isEqual
183   #undef XXH128_cmp
184   #undef XXH128_canonicalFromHash
185   #undef XXH128_hashFromCanonical
186 /* Finally, free the namespace itself */
187   #undef XXH_NAMESPACE
188 
189 /* employ the namespace for XXH_INLINE_ALL */
190   #define XXH_NAMESPACE XXH_INLINE_
191 /*
192  * Some identifiers (enums, type names) are not symbols,
193  * but they must nonetheless be renamed to avoid redeclaration.
194  * Alternative solution: do not redeclare them.
195  * However, this requires some #ifdefs, and has a more dispersed impact.
196  * Meanwhile, renaming can be achieved in a single place.
197  */
198   #define XXH_IPREF(Id) XXH_NAMESPACE##Id
199   #define XXH_OK XXH_IPREF(XXH_OK)
200   #define XXH_ERROR XXH_IPREF(XXH_ERROR)
201   #define XXH_errorcode XXH_IPREF(XXH_errorcode)
202   #define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t)
203   #define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t)
204   #define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t)
205   #define XXH32_state_s XXH_IPREF(XXH32_state_s)
206   #define XXH32_state_t XXH_IPREF(XXH32_state_t)
207   #define XXH64_state_s XXH_IPREF(XXH64_state_s)
208   #define XXH64_state_t XXH_IPREF(XXH64_state_t)
209   #define XXH3_state_s XXH_IPREF(XXH3_state_s)
210   #define XXH3_state_t XXH_IPREF(XXH3_state_t)
211   #define XXH128_hash_t XXH_IPREF(XXH128_hash_t)
212 /* Ensure the header is parsed again, even if it was previously included */
213   #undef XXHASH_H_5627135585666179
214   #undef XXHASH_H_STATIC_13879238742
215 #endif                                 /* XXH_INLINE_ALL || XXH_PRIVATE_API */
216 
217 /* ****************************************************************
218  *  Stable API
219  *****************************************************************/
220 #ifndef XXHASH_H_5627135585666179
221   #define XXHASH_H_5627135585666179 1
222 
223   /*!
224    * @defgroup public Public API
225    * Contains details on the public xxHash functions.
226    * @{
227 
228    */
229   /* specific declaration modes for Windows */
230   #if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
231     #if defined(WIN32) && defined(_MSC_VER) && \
232         (defined(XXH_IMPORT) || defined(XXH_EXPORT))
233       #ifdef XXH_EXPORT
234         #define XXH_PUBLIC_API __declspec(dllexport)
235       #elif defined(XXH_IMPORT)
236         #define XXH_PUBLIC_API __declspec(dllimport)
237       #endif
238     #else
239       #define XXH_PUBLIC_API                                  /* do nothing */
240     #endif
241   #endif
242 
243   #ifdef XXH_DOXYGEN
244     /*!
245      * @brief Emulate a namespace by transparently prefixing all symbols.
246      *
247      * If you want to include _and expose_ xxHash functions from within your own
248      * library, but also want to avoid symbol collisions with other libraries
249      * which may also include xxHash, you can use XXH_NAMESPACE to automatically
250      * prefix any public symbol from xxhash library with the value of
251      * XXH_NAMESPACE (therefore, avoid empty or numeric values).
252      *
253      * Note that no change is required within the calling program as long as it
254      * includes `xxhash.h`: Regular symbol names will be automatically
255      * translated by this header.
256      */
257     #define XXH_NAMESPACE                                 /* YOUR NAME HERE */
258     #undef XXH_NAMESPACE
259   #endif
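/*!
 * Illustrative sketch (not part of the library): building xxHash into a larger
 * library with prefixed symbols. The `MYLIB_` prefix is a placeholder.
 * @code{.c}
 *    // Compile the implementation with:  cc -DXXH_NAMESPACE=MYLIB_ -c xxhash.c
 *    // The object then exports MYLIB_XXH32, MYLIB_XXH64, MYLIB_XXH3_64bits, ...
 *
 *    // Calling code is unchanged, provided it defines the same macro
 *    // before including this header:
 *    #define XXH_NAMESPACE MYLIB_
 *    #include "xxhash.h"
 *
 *    XXH32_hash_t h = XXH32("abc", 3, 0);   // resolves to MYLIB_XXH32()
 * @endcode
 */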
260 
261   #ifdef XXH_NAMESPACE
262     #define XXH_CAT(A, B) A##B
263     #define XXH_NAME2(A, B) XXH_CAT(A, B)
264     #define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
265     /* XXH32 */
266     #define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
267     #define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
268     #define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
269     #define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
270     #define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
271     #define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
272     #define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
273     #define XXH32_canonicalFromHash \
274       XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
275     #define XXH32_hashFromCanonical \
276       XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
277     /* XXH64 */
278     #define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
279     #define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
280     #define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
281     #define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
282     #define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
283     #define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
284     #define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
285     #define XXH64_canonicalFromHash \
286       XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
287     #define XXH64_hashFromCanonical \
288       XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
289     /* XXH3_64bits */
290     #define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
291     #define XXH3_64bits_withSecret \
292       XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
293     #define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
294     #define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
295     #define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
296     #define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
297     #define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
298     #define XXH3_64bits_reset_withSeed \
299       XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
300     #define XXH3_64bits_reset_withSecret \
301       XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
302     #define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
303     #define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
304     #define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
305     /* XXH3_128bits */
306     #define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
307     #define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
308     #define XXH3_128bits_withSeed \
309       XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
310     #define XXH3_128bits_withSecret \
311       XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
312     #define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
313     #define XXH3_128bits_reset_withSeed \
314       XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
315     #define XXH3_128bits_reset_withSecret \
316       XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
317     #define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
318     #define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
319     #define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
320     #define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
321     #define XXH128_canonicalFromHash \
322       XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
323     #define XXH128_hashFromCanonical \
324       XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
325   #endif
326 
327   /* *************************************
328    *  Version
329    ***************************************/
330   #define XXH_VERSION_MAJOR 0
331   #define XXH_VERSION_MINOR 8
332   #define XXH_VERSION_RELEASE 1
333   #define XXH_VERSION_NUMBER                                   \
334     (XXH_VERSION_MAJOR * 100 * 100 + XXH_VERSION_MINOR * 100 + \
335      XXH_VERSION_RELEASE)
336 
337 /*!
338  * @brief Obtains the xxHash version.
339  *
340  * This is only useful when xxHash is compiled as a shared library, as it is
341  * independent of the version defined in the header.
342  *
343  * @return `XXH_VERSION_NUMBER` as of when the library was compiled.
344  */
345 XXH_PUBLIC_API unsigned XXH_versionNumber(void);
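/*!
 * Illustrative sketch: decoding XXH_versionNumber() with the encoding defined
 * above (major*10000 + minor*100 + release). The function name is hypothetical.
 * @code{.c}
 *    #include <stdio.h>
 *    #include "xxhash.h"
 *
 *    static void print_xxhash_versions(void)
 *    {
 *        unsigned const v = XXH_versionNumber();
 *        printf("library %u.%u.%u, header %u.%u.%u\n",
 *               v / 10000, (v / 100) % 100, v % 100,
 *               XXH_VERSION_MAJOR, XXH_VERSION_MINOR, XXH_VERSION_RELEASE);
 *    }
 * @endcode
 */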
346 
347   /* ****************************
348    *  Definitions
349    ******************************/
350   #include <stddef.h>                                             /* size_t */
351 typedef enum { XXH_OK = 0, XXH_ERROR } XXH_errorcode;
352 
353   /*-**********************************************************************
354    *  32-bit hash
355    ************************************************************************/
356   #if defined(XXH_DOXYGEN)                 /* Don't show <stdint.h> include */
357 /*!
358  * @brief An unsigned 32-bit integer.
359  *
360  * Not necessarily defined to `uint32_t` but functionally equivalent.
361  */
362 typedef uint32_t XXH32_hash_t;
363 
364   #elif !defined(__VMS) &&     \
365       (defined(__cplusplus) || \
366        (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */))
367     #include <stdint.h>
368 typedef uint32_t XXH32_hash_t;
369 
370   #else
371     #include <limits.h>
372     #if UINT_MAX == 0xFFFFFFFFUL
373 typedef unsigned int XXH32_hash_t;
374     #else
375       #if ULONG_MAX == 0xFFFFFFFFUL
376 typedef unsigned long XXH32_hash_t;
377       #else
378         #error "unsupported platform: need a 32-bit type"
379       #endif
380     #endif
381   #endif
382 
383 /*!
384  * @}
385  *
386  * @defgroup xxh32_family XXH32 family
387  * @ingroup public
388  * Contains functions used in the classic 32-bit xxHash algorithm.
389  *
390  * @note
391  *   XXH32 is considered rather weak by today's standards.
392  *   The @ref xxh3_family provides competitive speed for both 32-bit and 64-bit
393  *   systems, and offers true 64/128 bit hash results. It provides a superior
394  *   level of dispersion, and greatly reduces the risks of collisions.
395  *
396  * @see @ref xxh64_family, @ref xxh3_family : Other xxHash families
397  * @see @ref xxh32_impl for implementation details
398  * @{
399 
400  */
401 
402 /*!
403  * @brief Calculates the 32-bit hash of @p input using xxHash32.
404  *
405  * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
406  *
407  * @param input The block of data to be hashed, at least @p length bytes in
408  * size.
409  * @param length The length of @p input, in bytes.
410  * @param seed The 32-bit seed to alter the hash's output predictably.
411  *
412  * @pre
413  *   The memory between @p input and @p input + @p length must be valid,
414  *   readable, contiguous memory. However, if @p length is `0`, @p input may be
415  *   `NULL`. In C++, this also must be *TriviallyCopyable*.
416  *
417  * @return The calculated 32-bit hash value.
418  *
419  * @see
420  *    XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
421  *    Direct equivalents for the other variants of xxHash.
422  * @see
423  *    XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version.
424  */
425 XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t length,
426                                   XXH32_hash_t seed);
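/*!
 * Illustrative sketch: one-shot hashing of a C string. The wrapper name and
 * seed value are arbitrary examples.
 * @code{.c}
 *    #include <string.h>
 *    #include "xxhash.h"
 *
 *    static XXH32_hash_t hash_message(const char* msg)
 *    {
 *        XXH32_hash_t const seed = 0;   // default seed
 *        return XXH32(msg, strlen(msg), seed);
 *    }
 * @endcode
 */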
427 
428 /*!
429  * Streaming functions generate the xxHash value from an incremental input.
430  * This method is slower than single-call functions, due to state management.
431  * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
432  *
433  * An XXH state must first be allocated using `XXH*_createState()`.
434  *
435  * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
436  *
437  * Then, feed the hash state by calling `XXH*_update()` as many times as
438  * necessary.
439  *
440  * The function returns an error code, with 0 meaning OK, and any other value
441  * meaning there is an error.
442  *
443  * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
444  * This function returns the nn-bit hash as an int or long long.
445  *
446  * It's still possible to continue inserting input into the hash state after a
447  * digest, and generate new hash values later on by invoking `XXH*_digest()`.
448  *
449  * When done, release the state using `XXH*_freeState()`.
450  *
451  * Example code for incrementally hashing a file:
452  * @code{.c}
453  *    #include <stdio.h>
454  *    #include <xxhash.h>
455  *    #define BUFFER_SIZE 256
456  *
457  *    // Note: XXH64 and XXH3 use the same interface.
458  *    XXH32_hash_t
459  *    hashFile(FILE* stream)
460  *    {
461 
462  *        XXH32_state_t* state;
463  *        unsigned char buf[BUFFER_SIZE];
464  *        size_t amt;
465  *        XXH32_hash_t hash;
466  *
467  *        state = XXH32_createState();       // Create a state
468  *        assert(state != NULL);             // Error check here
469  *        XXH32_reset(state, 0xbaad5eed);    // Reset state with our seed
470  *        while ((amt = fread(buf, 1, sizeof(buf), stream)) != 0) {
471 
472  *            XXH32_update(state, buf, amt); // Hash the file in chunks
473  *        }
474  *        hash = XXH32_digest(state);        // Finalize the hash
475  *        XXH32_freeState(state);            // Clean up
476  *        return hash;
477  *    }
478  * @endcode
479  */
480 
481 /*!
482  * @typedef struct XXH32_state_s XXH32_state_t
483  * @brief The opaque state struct for the XXH32 streaming API.
484  *
485  * @see XXH32_state_s for details.
486  */
487 typedef struct XXH32_state_s XXH32_state_t;
488 
489 /*!
490  * @brief Allocates an @ref XXH32_state_t.
491  *
492  * Must be freed with XXH32_freeState().
493  * @return An allocated XXH32_state_t on success, `NULL` on failure.
494  */
495 XXH_PUBLIC_API XXH32_state_t *XXH32_createState(void);
496 /*!
497  * @brief Frees an @ref XXH32_state_t.
498  *
499  * Must be allocated with XXH32_createState().
500  * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref
501  * XXH32_createState().
502  * @return XXH_OK.
503  */
504 XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr);
505 /*!
506  * @brief Copies one @ref XXH32_state_t to another.
507  *
508  * @param dst_state The state to copy to.
509  * @param src_state The state to copy from.
510  * @pre
511  *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
512  */
513 XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t       *dst_state,
514                                     const XXH32_state_t *src_state);
515 
516 /*!
517  * @brief Resets an @ref XXH32_state_t to begin a new hash.
518  *
519  * This function resets and seeds a state. Call it before @ref XXH32_update().
520  *
521  * @param statePtr The state struct to reset.
522  * @param seed The 32-bit seed to alter the hash result predictably.
523  *
524  * @pre
525  *   @p statePtr must not be `NULL`.
526  *
527  * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
528  */
529 XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr,
530                                          XXH32_hash_t   seed);
531 
532 /*!
533  * @brief Consumes a block of @p input to an @ref XXH32_state_t.
534  *
535  * Call this to incrementally consume blocks of data.
536  *
537  * @param statePtr The state struct to update.
538  * @param input The block of data to be hashed, at least @p length bytes in
539  * size.
540  * @param length The length of @p input, in bytes.
541  *
542  * @pre
543  *   @p statePtr must not be `NULL`.
544  * @pre
545  *   The memory between @p input and @p input + @p length must be valid,
546  *   readable, contiguous memory. However, if @p length is `0`, @p input may be
547  *   `NULL`. In C++, this also must be *TriviallyCopyable*.
548  *
549  * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
550  */
551 XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *statePtr,
552                                           const void *input, size_t length);
553 
554 /*!
555  * @brief Returns the calculated hash value from an @ref XXH32_state_t.
556  *
557  * @note
558  *   Calling XXH32_digest() will not affect @p statePtr, so you can update,
559  *   digest, and update again.
560  *
561  * @param statePtr The state struct to calculate the hash from.
562  *
563  * @pre
564  *  @p statePtr must not be `NULL`.
565  *
566  * @return The calculated xxHash32 value from that state.
567  */
568 XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *statePtr);
569 
570 /*******   Canonical representation   *******/
571 
572 /*
573  * The default return values from XXH functions are unsigned 32 and 64 bit
574  * integers.
575  * This is the simplest and fastest format for further post-processing.
576  *
577  * However, this leaves open the question of what is the order on the byte
578  * level, since little and big endian conventions will store the same number
579  * differently.
580  *
581  * The canonical representation settles this issue by mandating big-endian
582  * convention, the same convention as human-readable numbers (large digits
583  * first).
584  *
585  * When writing hash values to storage, sending them over a network, or printing
586  * them, it's highly recommended to use the canonical representation to ensure
587  * portability across a wider range of systems, present and future.
588  *
589  * The following functions allow transformation of hash values to and from
590  * canonical format.
591  */
592 
593 /*!
594  * @brief Canonical (big endian) representation of @ref XXH32_hash_t.
595  */
596 typedef struct {
597 
598   unsigned char digest[4];                      /*!< Hash bytes, big endian */
599 
600 } XXH32_canonical_t;
601 
602 /*!
603  * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.
604  *
605  * @param dst The @ref XXH32_canonical_t pointer to be stored to.
606  * @param hash The @ref XXH32_hash_t to be converted.
607  *
608  * @pre
609  *   @p dst must not be `NULL`.
610  */
611 XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t *dst,
612                                             XXH32_hash_t       hash);
613 
614 /*!
615  * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.
616  *
617  * @param src The @ref XXH32_canonical_t to convert.
618  *
619  * @pre
620  *   @p src must not be `NULL`.
621  *
622  * @return The converted hash.
623  */
624 XXH_PUBLIC_API XXH32_hash_t
625 XXH32_hashFromCanonical(const XXH32_canonical_t *src);
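/*!
 * Illustrative sketch: storing a hash in canonical (big endian) form so the
 * bytes on disk are identical on little and big endian machines. `hash` and
 * `fp` are assumed to exist in the surrounding code.
 * @code{.c}
 *    XXH32_canonical_t canonical;
 *    XXH32_canonicalFromHash(&canonical, hash);   // native -> big endian bytes
 *    fwrite(canonical.digest, 1, sizeof(canonical.digest), fp);
 *
 *    // ... later, after fread() into `canonical` ...
 *    XXH32_hash_t const restored = XXH32_hashFromCanonical(&canonical);
 * @endcode
 */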
626 
627   #ifdef __has_attribute
628     #define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
629   #else
630     #define XXH_HAS_ATTRIBUTE(x) 0
631   #endif
632 
633   /* C-language Attributes are added in C23. */
634   #if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && \
635       defined(__has_c_attribute)
636     #define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
637   #else
638     #define XXH_HAS_C_ATTRIBUTE(x) 0
639   #endif
640 
641   #if defined(__cplusplus) && defined(__has_cpp_attribute)
642     #define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
643   #else
644     #define XXH_HAS_CPP_ATTRIBUTE(x) 0
645   #endif
646 
647   /*
648   Define the XXH_FALLTHROUGH macro for annotating switch cases with the
649   'fallthrough' attribute introduced in C++17 and C23.
650   C++17: https://en.cppreference.com/w/cpp/language/attributes/fallthrough
651   C23:   https://en.cppreference.com/w/c/language/attributes/fallthrough
652   */
653   #if XXH_HAS_C_ATTRIBUTE(fallthrough)
654     #define XXH_FALLTHROUGH [[fallthrough]]
655   #elif XXH_HAS_CPP_ATTRIBUTE(fallthrough)
656     #define XXH_FALLTHROUGH [[fallthrough]]
657   #elif XXH_HAS_ATTRIBUTE(__fallthrough__)
658     #define XXH_FALLTHROUGH __attribute__((fallthrough))
659   #else
660     #define XXH_FALLTHROUGH
661   #endif
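/*!
 * Illustrative sketch: XXH_FALLTHROUGH marks an intentional fall-through
 * between switch cases, silencing "implicit fallthrough" warnings on compilers
 * that support the attribute. `acc` and `remaining` are hypothetical.
 * @code{.c}
 *    switch (remaining) {
 *        case 2: acc += 1;
 *                XXH_FALLTHROUGH;
 *        case 1: acc += 1;
 *                XXH_FALLTHROUGH;
 *        default: break;
 *    }
 * @endcode
 */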
662 
663 /*!
664  * @}
665  * @ingroup public
666  * @{
667 
668  */
669 
670   #ifndef XXH_NO_LONG_LONG
671     /*-**********************************************************************
672      *  64-bit hash
673      ************************************************************************/
674     #if defined(XXH_DOXYGEN)                    /* don't include <stdint.h> */
675 /*!
676  * @brief An unsigned 64-bit integer.
677  *
678  * Not necessarily defined to `uint64_t` but functionally equivalent.
679  */
680 typedef uint64_t XXH64_hash_t;
681     #elif !defined(__VMS) &&                                   \
682         (defined(__cplusplus) || (defined(__STDC_VERSION__) && \
683                                   (__STDC_VERSION__ >= 199901L) /* C99 */))
684       #include <stdint.h>
685 typedef uint64_t XXH64_hash_t;
686     #else
687       #include <limits.h>
688       #if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL
689 /* LP64 ABI says uint64_t is unsigned long */
690 typedef unsigned long XXH64_hash_t;
691       #else
692 /* the following type must have a width of 64-bit */
693 typedef unsigned long long XXH64_hash_t;
694       #endif
695     #endif
696 
697 /*!
698  * @}
699  *
700  * @defgroup xxh64_family XXH64 family
701  * @ingroup public
702  * @{
703 
704  * Contains functions used in the classic 64-bit xxHash algorithm.
705  *
706  * @note
707  *   XXH3 provides competitive speed for both 32-bit and 64-bit systems,
708  *   and offers true 64/128 bit hash results. It provides a superior level of
709  *   dispersion, and greatly reduces the risks of collisions.
710  */
711 
712 /*!
713  * @brief Calculates the 64-bit hash of @p input using xxHash64.
714  *
715  * This function usually runs faster on 64-bit systems, but slower on 32-bit
716  * systems (see benchmark).
717  *
718  * @param input The block of data to be hashed, at least @p length bytes in
719  * size.
720  * @param length The length of @p input, in bytes.
721  * @param seed The 64-bit seed to alter the hash's output predictably.
722  *
723  * @pre
724  *   The memory between @p input and @p input + @p length must be valid,
725  *   readable, contiguous memory. However, if @p length is `0`, @p input may be
726  *   `NULL`. In C++, this also must be *TriviallyCopyable*.
727  *
728  * @return The calculated 64-bit hash.
729  *
730  * @see
731  *    XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
732  *    Direct equivalents for the other variants of xxHash.
733  * @see
734  *    XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version.
735  */
736 XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t length,
737                                   XXH64_hash_t seed);
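/*!
 * Illustrative sketch: seeded one-shot hashing of an in-memory record. The
 * record type and wrapper name are hypothetical.
 * @code{.c}
 *    typedef struct { int id; double value; } record_t;
 *
 *    static XXH64_hash_t hash_record(const record_t* r, XXH64_hash_t seed)
 *    {
 *        // Hashes padding bytes too; zero-initialize the struct if that matters.
 *        return XXH64(r, sizeof(*r), seed);
 *    }
 * @endcode
 */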
738 
739 /*******   Streaming   *******/
740 /*!
741  * @brief The opaque state struct for the XXH64 streaming API.
742  *
743  * @see XXH64_state_s for details.
744  */
745 typedef struct XXH64_state_s  XXH64_state_t;             /* incomplete type */
746 XXH_PUBLIC_API XXH64_state_t *XXH64_createState(void);
747 XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t *statePtr);
748 XXH_PUBLIC_API void           XXH64_copyState(XXH64_state_t       *dst_state,
749                                               const XXH64_state_t *src_state);
750 
751 XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t *statePtr,
752                                          XXH64_hash_t   seed);
753 XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t *statePtr,
754                                           const void *input, size_t length);
755 XXH_PUBLIC_API XXH64_hash_t  XXH64_digest(const XXH64_state_t *statePtr);
756 
757 /*******   Canonical representation   *******/
758 typedef struct {
759 
760   unsigned char digest[sizeof(XXH64_hash_t)];
761 
762 } XXH64_canonical_t;
763 
764 XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst,
765                                             XXH64_hash_t       hash);
766 XXH_PUBLIC_API XXH64_hash_t
767 XXH64_hashFromCanonical(const XXH64_canonical_t *src);
768 
769 /*!
770  * @}
771  * ************************************************************************
772  * @defgroup xxh3_family XXH3 family
773  * @ingroup public
774  * @{
775 
776  *
777  * XXH3 is a more recent hash algorithm featuring:
778  *  - Improved speed for both small and large inputs
779  *  - True 64-bit and 128-bit outputs
780  *  - SIMD acceleration
781  *  - Improved 32-bit viability
782  *
783  * Speed analysis methodology is explained here:
784  *
785  *    https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
786  *
787  * Compared to XXH64, expect XXH3 to run approximately
788  * ~2x faster on large inputs and >3x faster on small ones,
789  * though exact differences vary by platform.
790  *
791  * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,
792  * but does not require it.
793  * Any 32-bit and 64-bit targets that can run XXH32 smoothly
794  * can run XXH3 at competitive speeds, even without vector support.
795  * Further details are explained in the implementation.
796  *
797  * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
798  * ZVector and scalar targets. This can be controlled via the XXH_VECTOR macro.
799  *
800  * XXH3 implementation is portable:
801  * it has a generic C90 formulation that can be compiled on any platform,
802  * all implementations generate exactly the same hash value on all platforms.
803  * Starting from v0.8.0, it's also labelled "stable", meaning that
804  * any future version will also generate the same hash value.
805  *
806  * XXH3 offers 2 variants, _64bits and _128bits.
807  *
808  * When only 64 bits are needed, prefer invoking the _64bits variant, as it
809  * reduces the amount of mixing, resulting in faster speed on small inputs.
810  * It's also generally simpler to manipulate a scalar return type than a struct.
811  *
812  * The API supports one-shot hashing, streaming mode, and custom secrets.
813  */
814 
815 /*-**********************************************************************
816  *  XXH3 64-bit variant
817  ************************************************************************/
818 
819 /* XXH3_64bits():
820  * default 64-bit variant, using default secret and default seed of 0.
821  * It's the fastest variant. */
822 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *data, size_t len);
823 
824 /*
825  * XXH3_64bits_withSeed():
826  * This variant generates a custom secret on the fly
827  * based on default secret altered using the `seed` value.
828  * While this operation is decently fast, note that it's not completely free.
829  * Note: seed==0 produces the same results as XXH3_64bits().
830  */
831 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void *data, size_t len,
832                                                  XXH64_hash_t seed);
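/*!
 * Illustrative sketch: the two simplest XXH3 entry points. The input data is
 * an arbitrary example.
 * @code{.c}
 *    const char data[] = "xxh3 sample input";
 *    size_t const len  = sizeof(data) - 1;
 *
 *    XXH64_hash_t const h_default = XXH3_64bits(data, len);   // seed == 0
 *    XXH64_hash_t const h_seeded  = XXH3_64bits_withSeed(data, len, 2021ULL);
 *    // XXH3_64bits_withSeed(data, len, 0) == XXH3_64bits(data, len)
 * @endcode
 */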
833 
834     /*!
835      * The bare minimum size for a custom secret.
836      *
837      * @see
838      *  XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(),
839      *  XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret().
840      */
841     #define XXH3_SECRET_SIZE_MIN 136
842 
843 /*
844  * XXH3_64bits_withSecret():
845  * It's possible to provide any blob of bytes as a "secret" to generate the
846  * hash. This makes it more difficult for an external actor to prepare an
847  * intentional collision. The main condition is that secretSize *must* be large
848  * enough (>= XXH3_SECRET_SIZE_MIN). However, the quality of produced hash
849  * values depends on secret's entropy. Technically, the secret must look like a
850  * bunch of random bytes. Avoid "trivial" or structured data such as repeated
851  * sequences or a text document. Whenever unsure about the "randomness" of the
852  * blob of bytes, consider relabelling it as a "custom seed" instead, and employ
853  * "XXH3_generateSecret()" (see below) to generate a high entropy secret derived
854  * from the custom seed.
855  */
856 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *data, size_t len,
857                                                    const void *secret,
858                                                    size_t      secretSize);
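/*!
 * Illustrative sketch: hashing with a caller-provided secret. The secret is
 * assumed to have been filled with high-entropy bytes beforehand, for instance
 * by XXH3_generateSecret() from the experimental API further below; `data` and
 * `len` are assumed to exist in the surrounding code.
 * @code{.c}
 *    unsigned char secret[XXH3_SECRET_SIZE_MIN];   // >= 136 bytes of
 *                                                  // random-looking content
 *    // ... fill `secret` ...
 *    XXH64_hash_t const h =
 *        XXH3_64bits_withSecret(data, len, secret, sizeof(secret));
 * @endcode
 */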
859 
860 /*******   Streaming   *******/
861 /*
862  * Streaming requires state maintenance.
863  * This operation costs memory and CPU.
864  * As a consequence, streaming is slower than one-shot hashing.
865  * For better performance, prefer one-shot functions whenever applicable.
866  */
867 
868 /*!
869  * @brief The state struct for the XXH3 streaming API.
870  *
871  * @see XXH3_state_s for details.
872  */
873 typedef struct XXH3_state_s  XXH3_state_t;
874 XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void);
875 XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr);
876 XXH_PUBLIC_API void          XXH3_copyState(XXH3_state_t       *dst_state,
877                                             const XXH3_state_t *src_state);
878 
879 /*
880  * XXH3_64bits_reset():
881  * Initialize with default parameters.
882  * digest will be equivalent to `XXH3_64bits()`.
883  */
884 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t *statePtr);
885 /*
886  * XXH3_64bits_reset_withSeed():
887  * Generate a custom secret from `seed`, and store it into `statePtr`.
888  * digest will be equivalent to `XXH3_64bits_withSeed()`.
889  */
890 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t *statePtr,
891                                                         XXH64_hash_t  seed);
892 /*
893  * XXH3_64bits_reset_withSecret():
894  * `secret` is referenced, it _must outlive_ the hash streaming session.
895  * Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`,
896  * and the quality of produced hash values depends on secret's entropy
897  * (secret's content should look like a bunch of random bytes).
898  * When in doubt about the randomness of a candidate `secret`,
899  * consider employing `XXH3_generateSecret()` instead (see below).
900  */
901 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(
902     XXH3_state_t *statePtr, const void *secret, size_t secretSize);
903 
904 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update(XXH3_state_t *statePtr,
905                                                 const void   *input,
906                                                 size_t        length);
907 XXH_PUBLIC_API XXH64_hash_t  XXH3_64bits_digest(const XXH3_state_t *statePtr);
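/*!
 * Illustrative sketch: XXH3 streaming follows the same pattern as the XXH32
 * file-hashing example above, with the XXH3_64bits_* names. `buf`, `amt` and
 * `stream` are assumed to be declared as in that example; error handling is
 * reduced to asserts (<assert.h>) for brevity.
 * @code{.c}
 *    XXH3_state_t* const state = XXH3_createState();
 *    assert(state != NULL);
 *    assert(XXH3_64bits_reset(state) == XXH_OK);
 *
 *    while ((amt = fread(buf, 1, sizeof(buf), stream)) != 0)
 *        XXH3_64bits_update(state, buf, amt);
 *
 *    XXH64_hash_t const hash = XXH3_64bits_digest(state);
 *    XXH3_freeState(state);
 * @endcode
 */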
908 
909 /* note : canonical representation of XXH3 is the same as XXH64
910  * since they both produce XXH64_hash_t values */
911 
912 /*-**********************************************************************
913  *  XXH3 128-bit variant
914  ************************************************************************/
915 
916 /*!
917  * @brief The return value from 128-bit hashes.
918  *
919  * Stored in little endian order, although the fields themselves are in native
920  * endianness.
921  */
922 typedef struct {
923 
924   XXH64_hash_t low64;                     /*!< `value & 0xFFFFFFFFFFFFFFFF` */
925   XXH64_hash_t high64;                                   /*!< `value >> 64` */
926 
927 } XXH128_hash_t;
928 
929 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void *data, size_t len);
930 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void *data, size_t len,
931                                                    XXH64_hash_t seed);
932 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void *data,
933                                                      size_t      len,
934                                                      const void *secret,
935                                                      size_t      secretSize);
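/*!
 * Illustrative sketch: producing a 128-bit hash and printing both halves.
 * `data` and `len` are assumed to exist in the surrounding code.
 * @code{.c}
 *    #include <stdio.h>
 *    #include "xxhash.h"
 *
 *    XXH128_hash_t const h = XXH3_128bits(data, len);
 *    printf("%016llx%016llx\n",
 *           (unsigned long long)h.high64,   // bits 64..127
 *           (unsigned long long)h.low64);   // bits 0..63
 * @endcode
 */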
936 
937 /*******   Streaming   *******/
938 /*
939  * Streaming requires state maintenance.
940  * This operation costs memory and CPU.
941  * As a consequence, streaming is slower than one-shot hashing.
942  * For better performance, prefer one-shot functions whenever applicable.
943  *
944  * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
945  * Use already declared XXH3_createState() and XXH3_freeState().
946  *
947  * All reset and streaming functions have same meaning as their 64-bit
948  * counterpart.
949  */
950 
951 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t *statePtr);
952 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t *statePtr,
953                                                          XXH64_hash_t  seed);
954 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(
955     XXH3_state_t *statePtr, const void *secret, size_t secretSize);
956 
957 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t *statePtr,
958                                                  const void   *input,
959                                                  size_t        length);
960 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest(const XXH3_state_t *statePtr);
961 
962 /* The following helper functions make it possible to compare XXH128_hash_t values.
963  * Since XXH128_hash_t is a structure, this capability is not offered by the
964  * language.
965  * Note: For better performance, these functions can be inlined using
966  * XXH_INLINE_ALL */
967 
968 /*!
969  * XXH128_isEqual():
970  * Return: 1 if `h1` and `h2` are equal, 0 if they are not.
971  */
972 XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
973 
974 /*!
975  * XXH128_cmp():
976  *
977  * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
978  *
979  * return: >0 if *h128_1  > *h128_2
980  *         =0 if *h128_1 == *h128_2
981  *         <0 if *h128_1  < *h128_2
982  */
983 XXH_PUBLIC_API int XXH128_cmp(const void *h128_1, const void *h128_2);
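/*!
 * Illustrative sketch: sorting an array of XXH128_hash_t values and comparing
 * two of them. The array is assumed to be already filled.
 * @code{.c}
 *    #include <stdlib.h>   // qsort
 *
 *    XXH128_hash_t hashes[16];
 *    // ... fill `hashes` ...
 *    qsort(hashes, 16, sizeof(hashes[0]), XXH128_cmp);
 *
 *    int const same = XXH128_isEqual(hashes[0], hashes[1]);   // 1 or 0
 * @endcode
 */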
984 
985 /*******   Canonical representation   *******/
986 typedef struct {
987 
988   unsigned char digest[sizeof(XXH128_hash_t)];
989 
990 } XXH128_canonical_t;
991 
992 XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t *dst,
993                                              XXH128_hash_t       hash);
994 XXH_PUBLIC_API XXH128_hash_t
995 XXH128_hashFromCanonical(const XXH128_canonical_t *src);
996 
997   #endif                                                /* XXH_NO_LONG_LONG */
998 
999 /*!
1000  * @}
1001  */
1002 #endif                                         /* XXHASH_H_5627135585666179 */
1003 
1004 #if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
1005   #define XXHASH_H_STATIC_13879238742
1006 /* ****************************************************************************
1007  * This section contains declarations which are not guaranteed to remain stable.
1008  * They may change in future versions, becoming incompatible with a different
1009  * version of the library.
1010  * These declarations should only be used with static linking.
1011  * Never use them in association with dynamic linking!
1012  *****************************************************************************
1013  */
1014 
1015 /*
1016  * These definitions are only present to allow static allocation
1017  * of XXH states, on stack or in a struct, for example.
1018  * Never **ever** access their members directly.
1019  */
1020 
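/*!
 * Illustrative sketch: with XXH_STATIC_LINKING_ONLY defined before inclusion,
 * a state can live on the stack (or inside another struct) instead of going
 * through XXH*_createState(). The wrapper name is hypothetical; members must
 * still never be accessed directly.
 * @code{.c}
 *    #define XXH_STATIC_LINKING_ONLY
 *    #include "xxhash.h"
 *
 *    static XXH64_hash_t hash_on_stack(const void* data, size_t len)
 *    {
 *        XXH64_state_t state;               // no heap allocation
 *        XXH64_reset(&state, 0);
 *        XXH64_update(&state, data, len);
 *        return XXH64_digest(&state);
 *    }
 * @endcode
 */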
1021 /*!
1022  * @internal
1023  * @brief Structure for XXH32 streaming API.
1024  *
1025  * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
1026  * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
1027  * an opaque type. This allows fields to safely be changed.
1028  *
1029  * Typedef'd to @ref XXH32_state_t.
1030  * Do not access the members of this struct directly.
1031  * @see XXH64_state_s, XXH3_state_s
1032  */
1033 struct XXH32_state_s {
1034 
1035   XXH32_hash_t total_len_32;          /*!< Total length hashed, modulo 2^32 */
1036   XXH32_hash_t large_len;    /*!< Whether the hash is >= 16 (handles @ref
1037                                 total_len_32 overflow) */
1038   XXH32_hash_t v1;                              /*!< First accumulator lane */
1039   XXH32_hash_t v2;                             /*!< Second accumulator lane */
1040   XXH32_hash_t v3;                              /*!< Third accumulator lane */
1041   XXH32_hash_t v4;                             /*!< Fourth accumulator lane */
1042   XXH32_hash_t mem32[4];     /*!< Internal buffer for partial reads. Treated as
1043                                 unsigned char[16]. */
1044   XXH32_hash_t memsize;                   /*!< Amount of data in @ref mem32 */
1045   XXH32_hash_t reserved; /*!< Reserved field. Do not read or write to it, it may
1046                             be removed. */
1047 
1048 };                                            /* typedef'd to XXH32_state_t */
1049 
1050   #ifndef XXH_NO_LONG_LONG       /* defined when there is no 64-bit support */
1051 
1052 /*!
1053  * @internal
1054  * @brief Structure for XXH64 streaming API.
1055  *
1056  * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
1057  * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
1058  * an opaque type. This allows fields to safely be changed.
1059  *
1060  * Typedef'd to @ref XXH64_state_t.
1061  * Do not access the members of this struct directly.
1062  * @see XXH32_state_s, XXH3_state_s
1063  */
1064 struct XXH64_state_s {
1065 
1066   XXH64_hash_t total_len;  /*!< Total length hashed. This is always 64-bit. */
1067   XXH64_hash_t v1;                              /*!< First accumulator lane */
1068   XXH64_hash_t v2;                             /*!< Second accumulator lane */
1069   XXH64_hash_t v3;                              /*!< Third accumulator lane */
1070   XXH64_hash_t v4;                             /*!< Fourth accumulator lane */
1071   XXH64_hash_t mem64[4];   /*!< Internal buffer for partial reads. Treated as
1072                               unsigned char[32]. */
1073   XXH32_hash_t memsize;                   /*!< Amount of data in @ref mem64 */
1074   XXH32_hash_t reserved32;   /*!< Reserved field, needed for padding anyways*/
1075   XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it, it
1076                               may be removed. */
1077 
1078 };                                            /* typedef'd to XXH64_state_t */
1079 
1080     #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 \
1081                                                                     */
1082       #include <stdalign.h>
1083       #define XXH_ALIGN(n) alignas(n)
1084     #elif defined(__cplusplus) && (__cplusplus >= 201103L)      /* >= C++11 */
1085     /* In C++ alignas() is a keyword */
1086       #define XXH_ALIGN(n) alignas(n)
1087     #elif defined(__GNUC__)
1088       #define XXH_ALIGN(n) __attribute__((aligned(n)))
1089     #elif defined(_MSC_VER)
1090       #define XXH_ALIGN(n) __declspec(align(n))
1091     #else
1092       #define XXH_ALIGN(n)                                      /* disabled */
1093     #endif
1094 
1095     /* Old GCC versions only accept the attribute after the type in structures.
1096      */
1097     #if !(defined(__STDC_VERSION__) &&                                        \
1098           (__STDC_VERSION__ >= 201112L))                       /* C11+ */     \
1099         && !(defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \
1100         && defined(__GNUC__)
1101       #define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
1102     #else
1103       #define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
1104     #endif
1105 
1106     /*!
1107      * @brief The size of the internal XXH3 buffer.
1108      *
1109      * This is the optimal update size for incremental hashing.
1110      *
1111      * @see XXH3_64bits_update(), XXH3_128bits_update().
1112      */
1113     #define XXH3_INTERNALBUFFER_SIZE 256
1114 
1115     /*!
1116      * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
1117      *
1118      * This is the size used in @ref XXH3_kSecret and the seeded functions.
1119      *
1120      * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
1121      */
1122     #define XXH3_SECRET_DEFAULT_SIZE 192
1123 
1124 /*!
1125  * @internal
1126  * @brief Structure for XXH3 streaming API.
1127  *
1128  * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
1129  * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.
1130  * Otherwise it is an opaque type.
1131  * Never use this definition in combination with a dynamic library.
1132  * This allows fields to safely be changed in the future.
1133  *
1134  * @note ** This structure has a strict alignment requirement of 64 bytes!! **
1135  * Do not allocate this with `malloc()` or `new`,
1136  * it will not be sufficiently aligned.
1137  * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.
1138  *
1139  * Typedef'd to @ref XXH3_state_t.
1140  * Never access the members of this struct directly.
1141  *
1142  * @see XXH3_INITSTATE() for stack initialization.
1143  * @see XXH3_createState(), XXH3_freeState().
1144  * @see XXH32_state_s, XXH64_state_s
1145  */
1146 struct XXH3_state_s {
1147 
1148   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
1149   /*!< The 8 accumulators. Similar to `vN` in @ref XXH32_state_s::v1 and @ref
1150    * XXH64_state_s */
1151   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
1152   /*!< Used to store a custom secret generated from a seed. */
1153   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
1154   /*!< The internal buffer. @see XXH32_state_s::mem32 */
1155   XXH32_hash_t bufferedSize;
1156   /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
1157   XXH32_hash_t reserved32;
1158   /*!< Reserved field. Needed for padding on 64-bit. */
1159   size_t nbStripesSoFar;
1160   /*!< Number of stripes processed. */
1161   XXH64_hash_t totalLen;
1162   /*!< Total length hashed. 64-bit even on 32-bit targets. */
1163   size_t nbStripesPerBlock;
1164   /*!< Number of stripes per block. */
1165   size_t secretLimit;
1166   /*!< Size of @ref customSecret or @ref extSecret */
1167   XXH64_hash_t seed;
1168   /*!< Seed for _withSeed variants. Must be zero otherwise, @see
1169    * XXH3_INITSTATE() */
1170   XXH64_hash_t reserved64;
1171   /*!< Reserved field. */
1172   const unsigned char *extSecret;
1173   /*!< Reference to an external secret for the _withSecret variants, NULL
1174    *   for other variants. */
1175   /* note: there may be some padding at the end due to alignment on 64 bytes */
1176 
1177 };                                             /* typedef'd to XXH3_state_t */
1178 
1179     #undef XXH_ALIGN_MEMBER
1180 
1181     /*!
1182      * @brief Initializes a stack-allocated `XXH3_state_s`.
1183      *
1184      * When the @ref XXH3_state_t structure is merely emplaced on stack,
1185      * it should be initialized with XXH3_INITSTATE() or a memset()
1186      * in case its first reset uses XXH3_NNbits_reset_withSeed().
1187      * This init can be omitted if the first reset uses default or _withSecret
1188      * mode. This operation isn't necessary when the state is created with
1189      * XXH3_createState(). Note that this doesn't prepare the state for a
1190      * streaming operation, it's still necessary to use XXH3_NNbits_reset*()
1191      * afterwards.
1192      */
1193     #define XXH3_INITSTATE(XXH3_state_ptr) \
1194       { (XXH3_state_ptr)->seed = 0; }
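/*!
 * Illustrative sketch: a stack-allocated XXH3 state initialized with
 * XXH3_INITSTATE() before its first seeded reset. `data` and `len` are assumed
 * to exist in the surrounding code; note the 64-byte alignment requirement
 * documented above (stack allocation of the full struct is fine, a plain
 * malloc() is not).
 * @code{.c}
 *    XXH3_state_t state;                        // needs XXH_STATIC_LINKING_ONLY
 *    XXH3_INITSTATE(&state);
 *    XXH3_64bits_reset_withSeed(&state, 1234);  // first reset uses a seed
 *    XXH3_64bits_update(&state, data, len);
 *    XXH64_hash_t const h = XXH3_64bits_digest(&state);
 * @endcode
 */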
1195 
1196 /* ===   Experimental API   === */
1197 /* Symbols defined below must be considered tied to a specific library version.
1198  */
1199 
1200 /*
1201  * XXH3_generateSecret():
1202  *
1203  * Derive a high-entropy secret from any user-defined content, named customSeed.
1204  * The generated secret can be used in combination with `*_withSecret()`
1205  * functions. The `_withSecret()` variants are useful to provide a higher level
1206  * of protection than 64-bit seed, as it becomes much more difficult for an
1207  * of protection than a 64-bit seed, as it becomes much more difficult for an
1208  *
1209  * The function accepts as input a custom seed of any length and any content,
1210  * and derives from it a high-entropy secret of length XXH3_SECRET_DEFAULT_SIZE
1211  * into an already allocated buffer secretBuffer.
1212  * The generated secret is _always_ XXH3_SECRET_DEFAULT_SIZE bytes long.
1213  *
1214  * The generated secret can then be used with any `*_withSecret()` variant.
1215  * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`,
1216  * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()`
1217  * are part of this list. They all accept a `secret` parameter
1218  * which must be very long for implementation reasons (>= XXH3_SECRET_SIZE_MIN)
1219  * _and_ feature very high entropy (consist of random-looking bytes).
1220  * These conditions can be a high bar to meet, so
1221  * this function can be used to generate a secret of proper quality.
1222  *
1223  * customSeed can be anything. It can have any size, even small ones,
1224  * and its content can be anything, even a stupidly "low entropy" source such as a
1225  * bunch of zeroes. The resulting `secret` will nonetheless provide all expected
1226  * qualities.
1227  *
1228  * Supplying NULL as the customSeed copies the default secret into
1229  * `secretBuffer`. When customSeedSize > 0, supplying NULL as customSeed is
1230  * undefined behavior.
1231  */
1232 XXH_PUBLIC_API void XXH3_generateSecret(void       *secretBuffer,
1233                                         const void *customSeed,
1234                                         size_t      customSeedSize);
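/*!
 * Illustrative sketch: deriving a full-size secret from an arbitrary
 * passphrase, then using it with a _withSecret() variant. The passphrase is a
 * placeholder; `data` and `len` are assumed to exist in the surrounding code.
 * @code{.c}
 *    unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
 *    const char    passphrase[] = "not a password, just a custom seed blob";
 *
 *    XXH3_generateSecret(secret, passphrase, sizeof(passphrase) - 1);
 *    XXH64_hash_t const h =
 *        XXH3_64bits_withSecret(data, len, secret, sizeof(secret));
 * @endcode
 */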
1235 
1236 /* simple short-cut to pre-selected XXH3_128bits variant */
1237 XXH_PUBLIC_API XXH128_hash_t XXH128(const void *data, size_t len,
1238                                     XXH64_hash_t seed);
1239 
1240   #endif                                                /* XXH_NO_LONG_LONG */
1241   #if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
1242     #define XXH_IMPLEMENTATION
1243   #endif
1244 
1245 #endif /* defined(XXH_STATIC_LINKING_ONLY) && \
1246           !defined(XXHASH_H_STATIC_13879238742) */
1247 
1248 /* ======================================================================== */
1249 /* ======================================================================== */
1250 /* ======================================================================== */
1251 
1252 /*-**********************************************************************
1253  * xxHash implementation
1254  *-**********************************************************************
1255  * xxHash's implementation used to be hosted inside xxhash.c.
1256  *
1257  * However, inlining requires implementation to be visible to the compiler,
1258  * hence be included alongside the header.
1259  * Previously, implementation was hosted inside xxhash.c,
1260  * which was then #included when inlining was activated.
1261  * This construction created issues with a few build and install systems,
1262  * as it required xxhash.c to be stored in /include directory.
1263  *
1264  * xxHash implementation is now directly integrated within xxhash.h.
1265  * As a consequence, xxhash.c is no longer needed in /include.
1266  *
1267  * xxhash.c is still available and is still useful.
1268  * In a "normal" setup, when xxhash is not inlined,
1269  * xxhash.h only exposes the prototypes and public symbols,
1270  * while xxhash.c can be built into an object file xxhash.o
1271  * which can then be linked into the final binary.
1272  ************************************************************************/
1273 
1274 #if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) || \
1275      defined(XXH_IMPLEMENTATION)) &&                        \
1276     !defined(XXH_IMPLEM_13a8737387)
1277   #define XXH_IMPLEM_13a8737387
1278 
1279   /* *************************************
1280    *  Tuning parameters
1281    ***************************************/
1282 
1283   /*!
1284    * @defgroup tuning Tuning parameters
1285    * @{
1286 
1287    *
1288    * Various macros to control xxHash's behavior.
1289    */
1290   #ifdef XXH_DOXYGEN
1291     /*!
1292      * @brief Define this to disable 64-bit code.
1293      *
1294      * Useful if only using the @ref xxh32_family and you have a strict C90
1295      * compiler.
1296      */
1297     #define XXH_NO_LONG_LONG
1298     #undef XXH_NO_LONG_LONG                               /* don't actually */
1299     /*!
1300      * @brief Controls how unaligned memory is accessed.
1301      *
1302      * By default, access to unaligned memory is controlled by `memcpy()`, which
1303      * is safe and portable.
1304      *
1305      * Unfortunately, on some target/compiler combinations, the generated
1306      * assembly is sub-optimal.
1307      *
1308      * The switch below allows selection of a different access method
1309      * in the search for improved performance.
1310      *
1311      * @par Possible options:
1312      *
1313      *  - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`
1314      *   @par
1315      *     Use `memcpy()`. Safe and portable. Note that most modern compilers
1316      * will eliminate the function call and treat it as an unaligned access.
1317      *
1318      *  - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))`
1319      *   @par
1320      *     Depends on compiler extensions and is therefore not portable.
1321      *     This method is safe _if_ your compiler supports it,
1322      *     and *generally* as fast or faster than `memcpy`.
1323      *
1324      *  - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast
1325      *  @par
1326      *     Casts directly and dereferences. This method doesn't depend on the
1327      *     compiler, but it violates the C standard as it directly dereferences
1328      * an unaligned pointer. It can generate buggy code on targets which do not
1329      *     support unaligned memory accesses, but in some circumstances, it's
1330      * the only known way to get the most performance.
1331      *
1332      *  - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift
1333      *  @par
1334      *     Also portable. This can generate the best code on old compilers which
1335      * don't inline small `memcpy()` calls, and it might also be faster on
1336      * big-endian systems which lack a native byteswap instruction. However,
1337      * some compilers will emit literal byteshifts even if the target supports
1338      * unaligned access.
1339      *  .
1340      *
1341      * @warning
1342      *   Methods 1 and 2 rely on implementation-defined behavior. Use these with
1343      *   care, as what works on one compiler/platform/optimization level may
1344      * cause another to read garbage data or even crash.
1345      *
1346      * See https://stackoverflow.com/a/32095106/646947 for details.
1347      *
1348      * Prefer these methods in priority order (0 > 3 > 1 > 2)
1349      */
1350     #define XXH_FORCE_MEMORY_ACCESS 0
1351     /*!
1352      * @def XXH_ACCEPT_NULL_INPUT_POINTER
1353      * @brief Whether to add explicit `NULL` checks.
1354      *
1355      * If the input pointer is `NULL` and the length is non-zero, xxHash's
1356      * default behavior is to dereference it, triggering a segfault.
1357      *
1358      * When this macro is enabled, xxHash actively checks the input for a null
1359      * pointer. If it is, the result for null input pointers is the same as a
1360      * pointer. If it is NULL, the result is the same as for a zero-length
1361      * input.
1362     #define XXH_ACCEPT_NULL_INPUT_POINTER 0
1363     /*!
1364      * @def XXH_FORCE_ALIGN_CHECK
1365      * @brief If defined to non-zero, adds a special path for aligned inputs
1366      * (XXH32() and XXH64() only).
1367      *
1368      * This is an important performance trick for architectures without decent
1369      * unaligned memory access performance.
1370      *
1371      * It checks for input alignment, and when conditions are met, uses a "fast
1372      * path" employing direct 32-bit/64-bit reads, resulting in _dramatically
1373      * faster_ read speed.
1374      *
1375      * The check costs one initial branch per hash, which is generally
1376      * negligible, but not zero.
1377      *
1378      * Moreover, it's not useful to generate an additional code path if memory
1379      * access uses the same instruction for both aligned and unaligned
1380      * addresses (e.g. x86 and aarch64).
1381      *
1382      * In these cases, the alignment check can be removed by setting this macro
1383      * to 0. Then the code will always use unaligned memory access. Align check
1384      * is automatically disabled on x86, x64 & arm64, which are platforms known
1385      * to offer good unaligned memory access performance.
1386      *
1387      * This option does not affect XXH3 (only XXH32 and XXH64).
1388      */
1389     #define XXH_FORCE_ALIGN_CHECK 0
1390 
1391     /*!
1392      * @def XXH_NO_INLINE_HINTS
1393      * @brief When non-zero, sets all functions to `static`.
1394      *
1395      * By default, xxHash tries to force the compiler to inline almost all
1396      * internal functions.
1397      *
1398      * This can usually improve performance due to reduced jumping and improved
1399      * constant folding, but significantly increases the size of the binary
1400      * which might not be favorable.
1401      *
1402      * Additionally, sometimes the forced inlining can be detrimental to
1403      * performance, depending on the architecture.
1404      *
1405      * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
1406      * compiler full control on whether to inline or not.
1407      *
1408      * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
1409      * -fno-inline with GCC or Clang, this will automatically be defined.
1410      */
1411     #define XXH_NO_INLINE_HINTS 0
1412 
1413     /*!
1414      * @def XXH_REROLL
1415      * @brief Whether to reroll `XXH32_finalize`.
1416      *
1417      * For performance, `XXH32_finalize` uses an unrolled loop
1418      * in the form of a switch statement.
1419      *
1420      * This is not always desirable, as it generates larger code,
1421      * and, depending on the architecture, may even be slower.
1422      *
1423      * This is automatically defined with `-Os`/`-Oz` on GCC and Clang.
1424      */
1425     #define XXH_REROLL 0
1426 
1427     /*!
1428      * @internal
1429      * @brief Redefines old internal names.
1430      *
1431      * For compatibility with code that uses xxHash's internals before the names
1432      * were changed to improve namespacing. There is no other reason to use
1433      * this.
1434      */
1435     #define XXH_OLD_NAMES
1436     #undef XXH_OLD_NAMES                 /* don't actually use, it is ugly. */
1437   #endif                                                     /* XXH_DOXYGEN */
1438                          /*!
1439                           * @}
1440                           */
1441 
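  /*
   * Illustrative sketch (values are arbitrary examples, not recommendations):
   * the tuning macros documented above are normally set externally, either on
   * the compiler command line or before including this header in the
   * translation unit that provides the implementation, e.g.
   *
   *   #define XXH_FORCE_MEMORY_ACCESS 3   // byteshift loads (method 3 above)
   *   #define XXH_FORCE_ALIGN_CHECK   0   // skip the aligned fast path
   *   #define XXH_INLINE_ALL              // pull the implementation in here
   *   #include "xxhash.h"
   */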
1442   #ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command \
1443                                      line for example */
1444   /* prefer __packed__ structures (method 1) for gcc on armv7+ and mips */
1445     #if !defined(__clang__) &&                                          \
1446         ((defined(__INTEL_COMPILER) && !defined(_WIN32)) ||             \
1447          (defined(__GNUC__) &&                                          \
1448           ((defined(__ARM_ARCH) && __ARM_ARCH >= 7) ||                  \
1449            (defined(__mips__) && (__mips <= 5 || __mips_isa_rev < 6) && \
1450             (!defined(__mips16) || defined(__mips_mips16e2))))))
1451       #define XXH_FORCE_MEMORY_ACCESS 1
1452     #endif
1453   #endif
1454 
1455   #ifndef XXH_ACCEPT_NULL_INPUT_POINTER        /* can be defined externally */
1456     #define XXH_ACCEPT_NULL_INPUT_POINTER 0
1457   #endif
1458 
1459   #ifndef XXH_FORCE_ALIGN_CHECK                /* can be defined externally */
1460     #if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || \
1461         defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64)  /* visual */
1462       #define XXH_FORCE_ALIGN_CHECK 0
1463     #else
1464       #define XXH_FORCE_ALIGN_CHECK 1
1465     #endif
1466   #endif
1467 
1468   #ifndef XXH_NO_INLINE_HINTS
1469     #if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
1470         || defined(__NO_INLINE__)                       /* -O0, -fno-inline */
1471       #define XXH_NO_INLINE_HINTS 1
1472     #else
1473       #define XXH_NO_INLINE_HINTS 0
1474     #endif
1475   #endif
1476 
1477   #ifndef XXH_REROLL
1478     #if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ || \
1479         (defined(__GNUC__) && !defined(__clang__))
1480     /* The if/then loop is preferable to switch/case on gcc (on x64) */
1481       #define XXH_REROLL 1
1482     #else
1483       #define XXH_REROLL 0
1484     #endif
1485   #endif
1486 
1487   /*!
1488    * @defgroup impl Implementation
1489    * @{
1490 
1491    */
1492 
1493   /* *************************************
1494    *  Includes & Memory related functions
1495    ***************************************/
1496   /*
1497    * Modify the local functions below should you wish to use
1498    * different memory routines for malloc() and free()
1499    */
1500   #include <stdlib.h>
1501 
1502 /*!
1503  * @internal
1504  * @brief Modify this function to use a different routine than malloc().
1505  */
1506 static void *XXH_malloc(size_t s) {
1507 
1508   return malloc(s);
1509 
1510 }
1511 
1512 /*!
1513  * @internal
1514  * @brief Modify this function to use a different routine than free().
1515  */
1516 static void XXH_free(void *p) {
1517 
1518   free(p);
1519 
1520 }
1521 
1522   #include <string.h>
1523 
1524 /*!
1525  * @internal
1526  * @brief Modify this function to use a different routine than memcpy().
1527  */
1528 static void *XXH_memcpy(void *dest, const void *src, size_t size) {
1529 
1530   return memcpy(dest, src, size);
1531 
1532 }
1533 
1534   #include <limits.h>                                         /* ULLONG_MAX */
1535 
1536   /* *************************************
1537    *  Compiler Specific Options
1538    ***************************************/
1539   #ifdef _MSC_VER                              /* Visual Studio warning fix */
1540     #pragma warning(disable : 4127) /* disable: C4127: conditional expression \
1541                                        is constant */
1542   #endif
1543 
1544   #if XXH_NO_INLINE_HINTS                         /* disable inlining hints */
1545     #if defined(__GNUC__)
1546       #define XXH_FORCE_INLINE static __attribute__((unused))
1547     #else
1548       #define XXH_FORCE_INLINE static
1549     #endif
1550     #define XXH_NO_INLINE static
1551   /* enable inlining hints */
1552   #elif defined(_MSC_VER)                                  /* Visual Studio */
1553     #define XXH_FORCE_INLINE static __forceinline
1554     #define XXH_NO_INLINE static __declspec(noinline)
1555   #elif defined(__GNUC__)
1556     #define XXH_FORCE_INLINE \
1557       static __inline__ __attribute__((always_inline, unused))
1558     #define XXH_NO_INLINE static __attribute__((noinline))
1559   #elif defined(__cplusplus) || \
1560       (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L))   /* C99 */
1561     #define XXH_FORCE_INLINE static inline
1562     #define XXH_NO_INLINE static
1563   #else
1564     #define XXH_FORCE_INLINE static
1565     #define XXH_NO_INLINE static
1566   #endif
1567 
1568   /* *************************************
1569    *  Debug
1570    ***************************************/
1571   /*!
1572    * @ingroup tuning
1573    * @def XXH_DEBUGLEVEL
1574    * @brief Sets the debugging level.
1575    *
1576    * XXH_DEBUGLEVEL is expected to be defined externally, typically via the
1577    * compiler's command line options. The value must be a number.
1578    */
1579   #ifndef XXH_DEBUGLEVEL
1580     #ifdef DEBUGLEVEL                                   /* backwards compat */
1581       #define XXH_DEBUGLEVEL DEBUGLEVEL
1582     #else
1583       #define XXH_DEBUGLEVEL 0
1584     #endif
1585   #endif
1586 
1587   #if (XXH_DEBUGLEVEL >= 1)
1588     #include <assert.h>          /* note: can still be disabled with NDEBUG */
1589     #define XXH_ASSERT(c) assert(c)
1590   #else
1591     #define XXH_ASSERT(c) ((void)0)
1592   #endif
1593 
1594   /* note: use after variable declarations */
1595   #ifndef XXH_STATIC_ASSERT
1596     #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)   /* C11 */
1597       #include <assert.h>
1598       #define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \
1599         do {                                       \
1600                                                    \
1601           static_assert((c), m);                   \
1602                                                    \
1603         } while (0)
1604 
1605     #elif defined(__cplusplus) && (__cplusplus >= 201103L)         /* C++11 */
1606       #define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \
1607         do {                                       \
1608                                                    \
1609           static_assert((c), m);                   \
1610                                                    \
1611         } while (0)
1612 
1613     #else
1614       #define XXH_STATIC_ASSERT_WITH_MESSAGE(c, m) \
1615         do {                                       \
1616                                                    \
1617           struct xxh_sa {                          \
1618                                                    \
1619             char x[(c) ? 1 : -1];                  \
1620                                                    \
1621           };                                       \
1622                                                    \
1623         } while (0)
1624 
1625     #endif
1626     #define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c), #c)
1627   #endif
1628 
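  /*
   * Usage sketch (illustrative only; the helper name is hypothetical). As
   * noted above, place the assertion after variable declarations, e.g.
   *
   *   void xxh_example_static_check(void) {
   *     int unused = 0;                               // declarations first (C90)
   *     XXH_STATIC_ASSERT(sizeof(XXH32_hash_t) == 4); // compile-time check
   *     (void)unused;
   *   }
   */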
1629   /*!
1630    * @internal
1631    * @def XXH_COMPILER_GUARD(var)
1632    * @brief Used to prevent unwanted optimizations for @p var.
1633    *
1634    * It uses an empty GCC inline assembly statement with a register constraint
1635    * which forces @p var into a general purpose register (eg eax, ebx, ecx
1636    * on x86) and marks it as modified.
1637    *
1638    * This is used in a few places to avoid unwanted autovectorization (e.g.
1639    * XXH32_round()). All vectorization we want is explicit via intrinsics,
1640    * and _usually_ isn't wanted elsewhere.
1641    *
1642    * We also use it to prevent unwanted constant folding for AArch64 in
1643    * XXH3_initCustomSecret_scalar().
1644    */
1645   #ifdef __GNUC__
1646     #define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r"(var))
1647   #else
1648     #define XXH_COMPILER_GUARD(var) ((void)0)
1649   #endif
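  /*
   * Illustrative pattern (hypothetical variable and producer, not part of the
   * library):
   *
   *   unsigned x = compute_something();   // hypothetical producer
   *   XXH_COMPILER_GUARD(x);              // compiler must now assume x changed,
   *                                       // blocking folding/autovectorization
   */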
1650 
1651   /* *************************************
1652    *  Basic Types
1653    ***************************************/
1654   #if !defined(__VMS) &&       \
1655       (defined(__cplusplus) || \
1656        (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */))
1657     #include <stdint.h>
1658 typedef uint8_t xxh_u8;
1659   #else
1660 typedef unsigned char xxh_u8;
1661   #endif
1662 typedef XXH32_hash_t xxh_u32;
1663 
1664   #ifdef XXH_OLD_NAMES
1665     #define BYTE xxh_u8
1666     #define U8 xxh_u8
1667     #define U32 xxh_u32
1668   #endif
1669 
1670 /* ***   Memory access   *** */
1671 
1672 /*!
1673  * @internal
1674  * @fn xxh_u32 XXH_read32(const void* ptr)
1675  * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.
1676  *
1677  * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
1678  *
1679  * @param ptr The pointer to read from.
1680  * @return The 32-bit native endian integer from the bytes at @p ptr.
1681  */
1682 
1683 /*!
1684  * @internal
1685  * @fn xxh_u32 XXH_readLE32(const void* ptr)
1686  * @brief Reads an unaligned 32-bit little endian integer from @p ptr.
1687  *
1688  * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
1689  *
1690  * @param ptr The pointer to read from.
1691  * @return The 32-bit little endian integer from the bytes at @p ptr.
1692  */
1693 
1694 /*!
1695  * @internal
1696  * @fn xxh_u32 XXH_readBE32(const void* ptr)
1697  * @brief Reads an unaligned 32-bit big endian integer from @p ptr.
1698  *
1699  * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
1700  *
1701  * @param ptr The pointer to read from.
1702  * @return The 32-bit big endian integer from the bytes at @p ptr.
1703  */
1704 
1705 /*!
1706  * @internal
1707  * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align)
1708  * @brief Like @ref XXH_readLE32(), but has an option for aligned reads.
1709  *
1710  * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
1711  * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is
1712  * always @ref XXH_alignment::XXH_unaligned.
1713  *
1714  * @param ptr The pointer to read from.
1715  * @param align Whether @p ptr is aligned.
1716  * @pre
1717  *   If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte
1718  *   aligned.
1719  * @return The 32-bit little endian integer from the bytes at @p ptr.
1720  */
1721 
1722   #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3))
1723   /*
1724    * Manual byteshift. Best for old compilers which don't inline memcpy.
1725    * We actually directly use XXH_readLE32 and XXH_readBE32.
1726    */
1727   #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 2))
1728 
1729 /*
1730  * Force direct memory access. Only works on CPUs which support unaligned
1731  * memory access in hardware.
1732  */
1733 static xxh_u32 XXH_read32(const void *memPtr) {
1734 
1735   return *(const xxh_u32 *)memPtr;
1736 
1737 }
1738 
1739   #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 1))
1740 
1741     /*
1742      * __pack instructions are safer but compiler specific, hence potentially
1743      * problematic for some compilers.
1744      *
1745      * Currently only defined for GCC and ICC.
1746      */
1747     #ifdef XXH_OLD_NAMES
1748 typedef union {
1749 
1750   xxh_u32 u32;
1751 
1752 } __attribute__((packed)) unalign;
1753 
1754     #endif
1755 static xxh_u32 XXH_read32(const void *ptr) {
1756 
1757   typedef union {
1758 
1759     xxh_u32 u32;
1760 
1761   } __attribute__((packed)) xxh_unalign;
1762 
1763   return ((const xxh_unalign *)ptr)->u32;
1764 
1765 }
1766 
1767   #else
1768 
1769 /*
1770  * Portable and safe solution. Generally efficient.
1771  * see: https://stackoverflow.com/a/32095106/646947
1772  */
1773 static xxh_u32 XXH_read32(const void *memPtr) {
1774 
1775   xxh_u32 val;
1776   memcpy(&val, memPtr, sizeof(val));
1777   return val;
1778 
1779 }
1780 
1781   #endif                                  /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
1782 
1783   /* ***   Endianness   *** */
1784 
1785   /*!
1786    * @ingroup tuning
1787    * @def XXH_CPU_LITTLE_ENDIAN
1788    * @brief Whether the target is little endian.
1789    *
1790    * Defined to 1 if the target is little endian, or 0 if it is big endian.
1791    * It can be defined externally, for example on the compiler command line.
1792    *
1793    * If it is not defined,
1794    * a runtime check (which is usually constant folded) is used instead.
1795    *
1796    * @note
1797    *   This is not necessarily defined to an integer constant.
1798    *
1799    * @see XXH_isLittleEndian() for the runtime check.
1800    */
1801   #ifndef XXH_CPU_LITTLE_ENDIAN
1802     /*
1803      * Try to detect endianness automatically, to avoid the nonstandard behavior
1804      * in `XXH_isLittleEndian()`
1805      */
1806     #if defined(_WIN32) /* Windows is always little endian */ \
1807         || defined(__LITTLE_ENDIAN__) ||                      \
1808         (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
1809       #define XXH_CPU_LITTLE_ENDIAN 1
1810     #elif defined(__BIG_ENDIAN__) || \
1811         (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
1812       #define XXH_CPU_LITTLE_ENDIAN 0
1813     #else
1814 /*!
1815  * @internal
1816  * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN.
1817  *
1818  * Most compilers will constant fold this.
1819  */
1820 static int XXH_isLittleEndian(void) {
1821 
1822   /*
1823    * Portable and well-defined behavior.
1824    * Don't use static: it is detrimental to performance.
1825    */
1826   const union {
1827 
1828     xxh_u32 u;
1829     xxh_u8  c[4];
1830 
1831   } one = {1};
1832 
1833   return one.c[0];
1834 
1835 }
1836 
1837       #define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian()
1838     #endif
1839   #endif
1840 
1841   /* ****************************************
1842    *  Compiler-specific Functions and Macros
1843    ******************************************/
1844   #define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
1845 
1846   #ifdef __has_builtin
1847     #define XXH_HAS_BUILTIN(x) __has_builtin(x)
1848   #else
1849     #define XXH_HAS_BUILTIN(x) 0
1850   #endif
1851 
1852   /*!
1853    * @internal
1854    * @def XXH_rotl32(x,r)
1855    * @brief 32-bit rotate left.
1856    *
1857    * @param x The 32-bit integer to be rotated.
1858    * @param r The number of bits to rotate.
1859    * @pre
1860    *   @p r > 0 && @p r < 32
1861    * @note
1862    *   @p x and @p r may be evaluated multiple times.
1863    * @return The rotated result.
1864    */
1865   #if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) && \
1866       XXH_HAS_BUILTIN(__builtin_rotateleft64)
1867     #define XXH_rotl32 __builtin_rotateleft32
1868     #define XXH_rotl64 __builtin_rotateleft64
1869   /* Note: although _rotl exists for MinGW (GCC under Windows), performance
1870    * seems poor */
1871   #elif defined(_MSC_VER)
1872     #define XXH_rotl32(x, r) _rotl(x, r)
1873     #define XXH_rotl64(x, r) _rotl64(x, r)
1874   #else
1875     #define XXH_rotl32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))
1876     #define XXH_rotl64(x, r) (((x) << (r)) | ((x) >> (64 - (r))))
1877   #endif
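  /*
   * Worked example: XXH_rotl32(0x80000001, 1) == 0x00000003 -- the top bit
   * wraps around into bit 0. Note that the portable fallback above requires
   * 0 < r < 32 (resp. 64): rotating by 0 would shift by the full width,
   * which is undefined behavior in C.
   */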
1878 
1879   /*!
1880    * @internal
1881    * @fn xxh_u32 XXH_swap32(xxh_u32 x)
1882    * @brief A 32-bit byteswap.
1883    *
1884    * @param x The 32-bit integer to byteswap.
1885    * @return @p x, byteswapped.
1886    */
1887   #if defined(_MSC_VER)                                    /* Visual Studio */
1888     #define XXH_swap32 _byteswap_ulong
1889   #elif XXH_GCC_VERSION >= 403
1890     #define XXH_swap32 __builtin_bswap32
1891   #else
1892 static xxh_u32 XXH_swap32(xxh_u32 x) {
1893 
1894   return ((x << 24) & 0xff000000) | ((x << 8) & 0x00ff0000) |
1895          ((x >> 8) & 0x0000ff00) | ((x >> 24) & 0x000000ff);
1896 
1897 }
1898 
1899   #endif
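  /* Worked example: XXH_swap32(0x01020304) == 0x04030201 (byte order reversed). */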
1900 
1901 /* ***************************
1902  *  Memory reads
1903  *****************************/
1904 
1905 /*!
1906  * @internal
1907  * @brief Enum to indicate whether a pointer is aligned.
1908  */
1909 typedef enum {
1910 
1911   XXH_aligned,                                                 /*!< Aligned */
1912   XXH_unaligned                                     /*!< Possibly unaligned */
1913 
1914 } XXH_alignment;
1915 
1916   /*
1917    * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.
1918    *
1919    * This is ideal for older compilers which don't inline memcpy.
1920    */
1921   #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3))
1922 
1923 XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void *memPtr) {
1924 
1925   const xxh_u8 *bytePtr = (const xxh_u8 *)memPtr;
1926   return bytePtr[0] | ((xxh_u32)bytePtr[1] << 8) | ((xxh_u32)bytePtr[2] << 16) |
1927          ((xxh_u32)bytePtr[3] << 24);
1928 
1929 }
1930 
1931 XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void *memPtr) {
1932 
1933   const xxh_u8 *bytePtr = (const xxh_u8 *)memPtr;
1934   return bytePtr[3] | ((xxh_u32)bytePtr[2] << 8) | ((xxh_u32)bytePtr[1] << 16) |
1935          ((xxh_u32)bytePtr[0] << 24);
1936 
1937 }
1938 
1939   #else
1940 XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void *ptr) {
1941 
1942   return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
1943 
1944 }
1945 
1946 static xxh_u32 XXH_readBE32(const void *ptr) {
1947 
1948   return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
1949 
1950 }
1951 
1952   #endif
1953 
1954 XXH_FORCE_INLINE xxh_u32 XXH_readLE32_align(const void   *ptr,
1955                                             XXH_alignment align) {
1956 
1957   if (align == XXH_unaligned) {
1958 
1959     return XXH_readLE32(ptr);
1960 
1961   } else {
1962 
1963     return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32 *)ptr
1964                                  : XXH_swap32(*(const xxh_u32 *)ptr);
1965 
1966   }
1967 
1968 }
1969 
1970 /* *************************************
1971  *  Misc
1972  ***************************************/
1973 /*! @ingroup public */
1974 XXH_PUBLIC_API unsigned XXH_versionNumber(void) {
1975 
1976   return XXH_VERSION_NUMBER;
1977 
1978 }
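/*
 * Usage sketch (illustrative; assumes the usual xxHash encoding of
 * major*100*100 + minor*100 + release for XXH_VERSION_NUMBER):
 *
 *   unsigned v = XXH_versionNumber();
 *   printf("xxHash %u.%u.%u\n", v / 10000, (v / 100) % 100, v % 100);
 */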
1979 
1980 /* *******************************************************************
1981  *  32-bit hash functions
1982  *********************************************************************/
1983 /*!
1984  * @}
1985  * @defgroup xxh32_impl XXH32 implementation
1986  * @ingroup impl
1987  * @{
1988 
1989  */
1990 /* #define instead of static const, to be used as initializers */
1991   #define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */
1992   #define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */
1993   #define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */
1994   #define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */
1995   #define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */
1996 
1997   #ifdef XXH_OLD_NAMES
1998     #define PRIME32_1 XXH_PRIME32_1
1999     #define PRIME32_2 XXH_PRIME32_2
2000     #define PRIME32_3 XXH_PRIME32_3
2001     #define PRIME32_4 XXH_PRIME32_4
2002     #define PRIME32_5 XXH_PRIME32_5
2003   #endif
2004 
2005 /*!
2006  * @internal
2007  * @brief Normal stripe processing routine.
2008  *
2009  * This shuffles the bits so that any bit from @p input impacts several bits in
2010  * @p acc.
2011  *
2012  * @param acc The accumulator lane.
2013  * @param input The stripe of input to mix.
2014  * @return The mixed accumulator lane.
2015  */
2016 static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) {
2017 
2018   acc += input * XXH_PRIME32_2;
2019   acc = XXH_rotl32(acc, 13);
2020   acc *= XXH_PRIME32_1;
2021   #if (defined(__SSE4_1__) || defined(__aarch64__)) && \
2022       !defined(XXH_ENABLE_AUTOVECTORIZE)
2023   /*
2024    * UGLY HACK:
2025    * A compiler fence is the only thing that prevents GCC and Clang from
2026    * autovectorizing the XXH32 loop (pragmas and attributes don't work for some
2027    * reason) without globally disabling SSE4.1.
2028    *
2029    * The reason we want to avoid vectorization is that, despite working on
2030    * 4 integers at a time, there are multiple factors slowing XXH32 down on
2031    * SSE4:
2032    * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
2033    *   newer chips!) making it slightly slower to multiply four integers at
2034    *   once compared to four integers independently. Even on Sandy/Ivy Bridge,
2035    *   where pmulld was fastest, it is still not worth going into SSE
2036    *   just to multiply unless doing a long operation.
2037    *
2038    * - Four instructions are required to rotate,
2039    *      movqda tmp,  v // not required with VEX encoding
2040    *      pslld  tmp, 13 // tmp <<= 13
2041    *      psrld  v,   19 // x >>= 19
2042    *      por    v,  tmp // x |= tmp
2043    *   compared to one for scalar:
2044    *      roll   v, 13    // reliably fast across the board
2045    *      shldl  v, v, 13 // Sandy Bridge and later prefer this for some reason
2046    *
2047    * - Instruction level parallelism is actually more beneficial here because
2048    *   the SIMD actually serializes this operation: While v1 is rotating, v2
2049    *   can load data, while v3 can multiply. SSE forces them to operate
2050    *   together.
2051    *
2052    * This is also enabled on AArch64, as Clang autovectorizes it incorrectly
2053    * and it is pointless writing a NEON implementation that is basically the
2054    * same speed as scalar for XXH32.
2055    */
2056   XXH_COMPILER_GUARD(acc);
2057   #endif
2058   return acc;
2059 
2060 }
2061 
2062 /*!
2063  * @internal
2064  * @brief Mixes all bits to finalize the hash.
2065  *
2066  * The final mix ensures that all input bits have a chance to impact any bit in
2067  * the output digest, resulting in an unbiased distribution.
2068  *
2069  * @param h32 The hash to avalanche.
2070  * @return The avalanched hash.
2071  */
2072 static xxh_u32 XXH32_avalanche(xxh_u32 h32) {
2073 
2074   h32 ^= h32 >> 15;
2075   h32 *= XXH_PRIME32_2;
2076   h32 ^= h32 >> 13;
2077   h32 *= XXH_PRIME32_3;
2078   h32 ^= h32 >> 16;
2079   return (h32);
2080 
2081 }
2082 
2083   #define XXH_get32bits(p) XXH_readLE32_align(p, align)
2084 
2085   #define XXH_PROCESS1                           \
2086     do {                                         \
2087                                                  \
2088       h32 += (*ptr++) * XXH_PRIME32_5;           \
2089       h32 = XXH_rotl32(h32, 11) * XXH_PRIME32_1; \
2090                                                  \
2091     } while (0)
2092 
2093   #define XXH_PROCESS4                           \
2094     do {                                         \
2095                                                  \
2096       h32 += XXH_get32bits(ptr) * XXH_PRIME32_3; \
2097       ptr += 4;                                  \
2098       h32 = XXH_rotl32(h32, 17) * XXH_PRIME32_4; \
2099                                                  \
2100     } while (0)
2101 
2102 /*!
2103  * @internal
2104  * @brief Processes the last 0-15 bytes of @p ptr.
2105  *
2106  * There may be up to 15 bytes remaining to consume from the input.
2107  * This final stage will digest them to ensure that all input bytes are present
2108  * in the final mix.
2109  *
2110  * @param h32 The hash to finalize.
2111  * @param ptr The pointer to the remaining input.
2112  * @param len The remaining length, modulo 16.
2113  * @param align Whether @p ptr is aligned.
2114  * @return The finalized hash.
2115  */
2116 static xxh_u32 XXH32_finalize(xxh_u32 h32, const xxh_u8 *ptr, size_t len,
2117                               XXH_alignment align) {
2118 
2119   /* Compact rerolled version */
2120   if (XXH_REROLL) {
2121 
2122     len &= 15;
2123     while (len >= 4) {
2124 
2125       XXH_PROCESS4;
2126       len -= 4;
2127 
2128     }
2129 
2130     while (len > 0) {
2131 
2132       XXH_PROCESS1;
2133       --len;
2134 
2135     }
2136 
2137     return XXH32_avalanche(h32);
2138 
2139   } else {
2140 
2141     switch (len & 15) /* or switch(bEnd - p) */ {
2142 
2143       case 12:
2144         XXH_PROCESS4;
2145         XXH_FALLTHROUGH;
2146       case 8:
2147         XXH_PROCESS4;
2148         XXH_FALLTHROUGH;
2149       case 4:
2150         XXH_PROCESS4;
2151         return XXH32_avalanche(h32);
2152 
2153       case 13:
2154         XXH_PROCESS4;
2155         XXH_FALLTHROUGH;
2156       case 9:
2157         XXH_PROCESS4;
2158         XXH_FALLTHROUGH;
2159       case 5:
2160         XXH_PROCESS4;
2161         XXH_PROCESS1;
2162         return XXH32_avalanche(h32);
2163 
2164       case 14:
2165         XXH_PROCESS4;
2166         XXH_FALLTHROUGH;
2167       case 10:
2168         XXH_PROCESS4;
2169         XXH_FALLTHROUGH;
2170       case 6:
2171         XXH_PROCESS4;
2172         XXH_PROCESS1;
2173         XXH_PROCESS1;
2174         return XXH32_avalanche(h32);
2175 
2176       case 15:
2177         XXH_PROCESS4;
2178         XXH_FALLTHROUGH;
2179       case 11:
2180         XXH_PROCESS4;
2181         XXH_FALLTHROUGH;
2182       case 7:
2183         XXH_PROCESS4;
2184         XXH_FALLTHROUGH;
2185       case 3:
2186         XXH_PROCESS1;
2187         XXH_FALLTHROUGH;
2188       case 2:
2189         XXH_PROCESS1;
2190         XXH_FALLTHROUGH;
2191       case 1:
2192         XXH_PROCESS1;
2193         XXH_FALLTHROUGH;
2194       case 0:
2195         return XXH32_avalanche(h32);
2196 
2197     }
2198 
2199     XXH_ASSERT(0);
2200     return h32;                 /* reaching this point is deemed impossible */
2201 
2202   }
2203 
2204 }
2205 
2206   #ifdef XXH_OLD_NAMES
2207     #define PROCESS1 XXH_PROCESS1
2208     #define PROCESS4 XXH_PROCESS4
2209   #else
2210     #undef XXH_PROCESS1
2211     #undef XXH_PROCESS4
2212   #endif
2213 
2214 /*!
2215  * @internal
2216  * @brief The implementation for @ref XXH32().
2217  *
2218  * @param input, len, seed Directly passed from @ref XXH32().
2219  * @param align Whether @p input is aligned.
2220  * @return The calculated hash.
2221  */
2222 XXH_FORCE_INLINE xxh_u32 XXH32_endian_align(const xxh_u8 *input, size_t len,
2223                                             xxh_u32 seed, XXH_alignment align) {
2224 
2225   const xxh_u8 *bEnd = input ? input + len : NULL;
2226   xxh_u32       h32;
2227 
2228   #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
2229       (XXH_ACCEPT_NULL_INPUT_POINTER >= 1)
2230   if (input == NULL) {
2231 
2232     len = 0;
2233     bEnd = input = (const xxh_u8 *)(size_t)16;
2234 
2235   }
2236 
2237   #endif
2238 
2239   if (len >= 16) {
2240 
2241     const xxh_u8 *const limit = bEnd - 15;
2242     xxh_u32             v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
2243     xxh_u32             v2 = seed + XXH_PRIME32_2;
2244     xxh_u32             v3 = seed + 0;
2245     xxh_u32             v4 = seed - XXH_PRIME32_1;
2246 
2247     do {
2248 
2249       v1 = XXH32_round(v1, XXH_get32bits(input));
2250       input += 4;
2251       v2 = XXH32_round(v2, XXH_get32bits(input));
2252       input += 4;
2253       v3 = XXH32_round(v3, XXH_get32bits(input));
2254       input += 4;
2255       v4 = XXH32_round(v4, XXH_get32bits(input));
2256       input += 4;
2257 
2258     } while (input < limit);
2259 
2260     h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) +
2261           XXH_rotl32(v4, 18);
2262 
2263   } else {
2264 
2265     h32 = seed + XXH_PRIME32_5;
2266 
2267   }
2268 
2269   h32 += (xxh_u32)len;
2270 
2271   return XXH32_finalize(h32, input, len & 15, align);
2272 
2273 }
2274 
2275 /*! @ingroup xxh32_family */
2276 XXH_PUBLIC_API XXH32_hash_t XXH32(const void *input, size_t len,
2277                                   XXH32_hash_t seed) {
2278 
2279   #if 0
2280     /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
2281     XXH32_state_t state;
2282     XXH32_reset(&state, seed);
2283     XXH32_update(&state, (const xxh_u8*)input, len);
2284     return XXH32_digest(&state);
2285   #else
2286   if (XXH_FORCE_ALIGN_CHECK) {
2287 
2288     if ((((size_t)input) & 3) ==
2289         0) {        /* Input is 4-byte aligned, leverage the speed benefit */
2290       return XXH32_endian_align((const xxh_u8 *)input, len, seed, XXH_aligned);
2291 
2292     }
2293 
2294   }
2295 
2296   return XXH32_endian_align((const xxh_u8 *)input, len, seed, XXH_unaligned);
2297   #endif
2298 
2299 }
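/*
 * One-shot usage sketch (illustrative only; function and variable names are
 * hypothetical):
 *
 *   #include <stdio.h>
 *   #include <string.h>
 *
 *   void xxh32_oneshot_example(void) {
 *     const char  *msg = "hello world";
 *     XXH32_hash_t h   = XXH32(msg, strlen(msg), 0);   // seed = 0
 *     printf("XXH32 = %08x\n", (unsigned)h);
 *   }
 */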
2300 
2301 /*******   Hash streaming   *******/
2302 /*!
2303  * @ingroup xxh32_family
2304  */
2305 XXH_PUBLIC_API XXH32_state_t *XXH32_createState(void) {
2306 
2307   return (XXH32_state_t *)XXH_malloc(sizeof(XXH32_state_t));
2308 
2309 }
2310 
2311 /*! @ingroup xxh32_family */
2312 XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t *statePtr) {
2313 
2314   XXH_free(statePtr);
2315   return XXH_OK;
2316 
2317 }
2318 
2319 /*! @ingroup xxh32_family */
2320 XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t       *dstState,
2321                                     const XXH32_state_t *srcState) {
2322 
2323   memcpy(dstState, srcState, sizeof(*dstState));
2324 
2325 }
2326 
2327 /*! @ingroup xxh32_family */
2328 XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t *statePtr,
2329                                          XXH32_hash_t   seed) {
2330 
2331   XXH32_state_t state; /* using a local state to memcpy() in order to avoid
2332                           strict-aliasing warnings */
2333   memset(&state, 0, sizeof(state));
2334   state.v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
2335   state.v2 = seed + XXH_PRIME32_2;
2336   state.v3 = seed + 0;
2337   state.v4 = seed - XXH_PRIME32_1;
2338   /* do not write into reserved, planned to be removed in a future version */
2339   memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved));
2340   return XXH_OK;
2341 
2342 }
2343 
2344 /*! @ingroup xxh32_family */
2345 XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t *state,
2346                                           const void *input, size_t len) {
2347 
2348   if (input == NULL)
2349   #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
2350       (XXH_ACCEPT_NULL_INPUT_POINTER >= 1)
2351     return XXH_OK;
2352   #else
2353     return XXH_ERROR;
2354   #endif
2355 
2356   {
2357 
2358     const xxh_u8       *p = (const xxh_u8 *)input;
2359     const xxh_u8 *const bEnd = p + len;
2360 
2361     state->total_len_32 += (XXH32_hash_t)len;
2362     state->large_len |=
2363         (XXH32_hash_t)((len >= 16) | (state->total_len_32 >= 16));
2364 
2365     if (state->memsize + len < 16) {                  /* fill in tmp buffer */
2366       XXH_memcpy((xxh_u8 *)(state->mem32) + state->memsize, input, len);
2367       state->memsize += (XXH32_hash_t)len;
2368       return XXH_OK;
2369 
2370     }
2371 
2372     if (state->memsize) {            /* some data left from previous update */
2373       XXH_memcpy((xxh_u8 *)(state->mem32) + state->memsize, input,
2374                  16 - state->memsize);
2375       {
2376 
2377         const xxh_u32 *p32 = state->mem32;
2378         state->v1 = XXH32_round(state->v1, XXH_readLE32(p32));
2379         p32++;
2380         state->v2 = XXH32_round(state->v2, XXH_readLE32(p32));
2381         p32++;
2382         state->v3 = XXH32_round(state->v3, XXH_readLE32(p32));
2383         p32++;
2384         state->v4 = XXH32_round(state->v4, XXH_readLE32(p32));
2385 
2386       }
2387 
2388       p += 16 - state->memsize;
2389       state->memsize = 0;
2390 
2391     }
2392 
2393     if (p <= bEnd - 16) {
2394 
2395       const xxh_u8 *const limit = bEnd - 16;
2396       xxh_u32             v1 = state->v1;
2397       xxh_u32             v2 = state->v2;
2398       xxh_u32             v3 = state->v3;
2399       xxh_u32             v4 = state->v4;
2400 
2401       do {
2402 
2403         v1 = XXH32_round(v1, XXH_readLE32(p));
2404         p += 4;
2405         v2 = XXH32_round(v2, XXH_readLE32(p));
2406         p += 4;
2407         v3 = XXH32_round(v3, XXH_readLE32(p));
2408         p += 4;
2409         v4 = XXH32_round(v4, XXH_readLE32(p));
2410         p += 4;
2411 
2412       } while (p <= limit);
2413 
2414       state->v1 = v1;
2415       state->v2 = v2;
2416       state->v3 = v3;
2417       state->v4 = v4;
2418 
2419     }
2420 
2421     if (p < bEnd) {
2422 
2423       XXH_memcpy(state->mem32, p, (size_t)(bEnd - p));
2424       state->memsize = (unsigned)(bEnd - p);
2425 
2426     }
2427 
2428   }
2429 
2430   return XXH_OK;
2431 
2432 }
2433 
2434 /*! @ingroup xxh32_family */
2435 XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t *state) {
2436 
2437   xxh_u32 h32;
2438 
2439   if (state->large_len) {
2440 
2441     h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) +
2442           XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
2443 
2444   } else {
2445 
2446     h32 = state->v3 /* == seed */ + XXH_PRIME32_5;
2447 
2448   }
2449 
2450   h32 += state->total_len_32;
2451 
2452   return XXH32_finalize(h32, (const xxh_u8 *)state->mem32, state->memsize,
2453                         XXH_aligned);
2454 
2455 }
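/*
 * Streaming usage sketch (illustrative only; names and the chunk size are
 * hypothetical): feed the data in chunks, then digest.
 *
 *   XXH32_hash_t xxh32_stream_example(const void *data, size_t len) {
 *     XXH32_state_t *st   = XXH32_createState();
 *     XXH32_hash_t   hash = 0;
 *     if (st != NULL) {
 *       const unsigned char *p = (const unsigned char *)data;
 *       size_t               chunk;
 *       XXH32_reset(st, 0);                          // seed = 0
 *       for (; len > 0; p += chunk, len -= chunk) {
 *         chunk = len < 4096 ? len : 4096;           // arbitrary chunk size
 *         XXH32_update(st, p, chunk);
 *       }
 *       hash = XXH32_digest(st);
 *       XXH32_freeState(st);
 *     }
 *     return hash;
 *   }
 */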
2456 
2457 /*******   Canonical representation   *******/
2458 
2459 /*!
2460  * @ingroup xxh32_family
2461  * The default return values from XXH functions are unsigned 32 and 64 bit
2462  * integers.
2463  *
2464  * The canonical representation uses big endian convention, the same convention
2465  * as human-readable numbers (large digits first).
2466  *
2467  * This way, hash values can be written into a file or buffer, remaining
2468  * comparable across different systems.
2469  *
2470  * The following functions allow transformation of hash values to and from their
2471  * canonical format.
2472  */
2473 XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t *dst,
2474                                             XXH32_hash_t       hash) {
2475 
2476   XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
2477   if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
2478   memcpy(dst, &hash, sizeof(*dst));
2479 
2480 }
2481 
2482 /*! @ingroup xxh32_family */
2483 XXH_PUBLIC_API XXH32_hash_t
2484 XXH32_hashFromCanonical(const XXH32_canonical_t *src) {
2485 
2486   return XXH_readBE32(src);
2487 
2488 }
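/*
 * Canonical round-trip sketch (illustrative only; the helper name is
 * hypothetical): serialize a hash in big endian order, e.g. to store it in a
 * file, and read it back portably.
 *
 *   void xxh32_canonical_example(XXH32_hash_t hash) {
 *     XXH32_canonical_t canon;
 *     XXH32_hash_t      back;
 *     XXH32_canonicalFromHash(&canon, hash);   // canonical bytes, big endian
 *     // ... store the 4 canonical bytes anywhere portable ...
 *     back = XXH32_hashFromCanonical(&canon);
 *     XXH_ASSERT(back == hash);
 *     (void)back;
 *   }
 */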
2489 
2490   #ifndef XXH_NO_LONG_LONG
2491 
2492 /* *******************************************************************
2493  *  64-bit hash functions
2494  *********************************************************************/
2495 /*!
2496  * @}
2497  * @ingroup impl
2498  * @{
2499 
2500  */
2501 /*******   Memory access   *******/
2502 
2503 typedef XXH64_hash_t xxh_u64;
2504 
2505     #ifdef XXH_OLD_NAMES
2506       #define U64 xxh_u64
2507     #endif
2508 
2509     #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3))
2510     /*
2511      * Manual byteshift. Best for old compilers which don't inline memcpy.
2512      * We actually directly use XXH_readLE64 and XXH_readBE64.
2513      */
2514     #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 2))
2515 
2516 /* Force direct memory access. Only works on CPUs which support unaligned
2517  * memory access in hardware */
2518 static xxh_u64 XXH_read64(const void *memPtr) {
2519 
2520   return *(const xxh_u64 *)memPtr;
2521 
2522 }
2523 
2524     #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 1))
2525 
2526       /*
2527        * __pack instructions are safer, but compiler specific, hence potentially
2528        * problematic for some compilers.
2529        *
2530        * Currently only defined for GCC and ICC.
2531        */
2532       #ifdef XXH_OLD_NAMES
2533 typedef union {
2534 
2535   xxh_u32 u32;
2536   xxh_u64 u64;
2537 
2538 } __attribute__((packed)) unalign64;
2539 
2540       #endif
2541 static xxh_u64 XXH_read64(const void *ptr) {
2542 
2543   typedef union {
2544 
2545     xxh_u32 u32;
2546     xxh_u64 u64;
2547 
2548   } __attribute__((packed)) xxh_unalign64;
2549 
2550   return ((const xxh_unalign64 *)ptr)->u64;
2551 
2552 }
2553 
2554     #else
2555 
2556 /*
2557  * Portable and safe solution. Generally efficient.
2558  * see: https://stackoverflow.com/a/32095106/646947
2559  */
2560 static xxh_u64 XXH_read64(const void *memPtr) {
2561 
2562   xxh_u64 val;
2563   memcpy(&val, memPtr, sizeof(val));
2564   return val;
2565 
2566 }
2567 
2568     #endif                                /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
2569 
2570     #if defined(_MSC_VER)                                  /* Visual Studio */
2571       #define XXH_swap64 _byteswap_uint64
2572     #elif XXH_GCC_VERSION >= 403
2573       #define XXH_swap64 __builtin_bswap64
2574     #else
2575 static xxh_u64 XXH_swap64(xxh_u64 x) {
2576 
2577   return ((x << 56) & 0xff00000000000000ULL) |
2578          ((x << 40) & 0x00ff000000000000ULL) |
2579          ((x << 24) & 0x0000ff0000000000ULL) |
2580          ((x << 8) & 0x000000ff00000000ULL) |
2581          ((x >> 8) & 0x00000000ff000000ULL) |
2582          ((x >> 24) & 0x0000000000ff0000ULL) |
2583          ((x >> 40) & 0x000000000000ff00ULL) |
2584          ((x >> 56) & 0x00000000000000ffULL);
2585 
2586 }
2587 
2588     #endif
2589 
2590     /* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */
2591     #if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 3))
2592 
2593 XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void *memPtr) {
2594 
2595   const xxh_u8 *bytePtr = (const xxh_u8 *)memPtr;
2596   return bytePtr[0] | ((xxh_u64)bytePtr[1] << 8) | ((xxh_u64)bytePtr[2] << 16) |
2597          ((xxh_u64)bytePtr[3] << 24) | ((xxh_u64)bytePtr[4] << 32) |
2598          ((xxh_u64)bytePtr[5] << 40) | ((xxh_u64)bytePtr[6] << 48) |
2599          ((xxh_u64)bytePtr[7] << 56);
2600 
2601 }
2602 
2603 XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void *memPtr) {
2604 
2605   const xxh_u8 *bytePtr = (const xxh_u8 *)memPtr;
2606   return bytePtr[7] | ((xxh_u64)bytePtr[6] << 8) | ((xxh_u64)bytePtr[5] << 16) |
2607          ((xxh_u64)bytePtr[4] << 24) | ((xxh_u64)bytePtr[3] << 32) |
2608          ((xxh_u64)bytePtr[2] << 40) | ((xxh_u64)bytePtr[1] << 48) |
2609          ((xxh_u64)bytePtr[0] << 56);
2610 
2611 }
2612 
2613     #else
2614 XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void *ptr) {
2615 
2616   return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
2617 
2618 }
2619 
2620 static xxh_u64 XXH_readBE64(const void *ptr) {
2621 
2622   return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
2623 
2624 }
2625 
2626     #endif
2627 
2628 XXH_FORCE_INLINE xxh_u64 XXH_readLE64_align(const void   *ptr,
2629                                             XXH_alignment align) {
2630 
2631   if (align == XXH_unaligned)
2632     return XXH_readLE64(ptr);
2633   else
2634     return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64 *)ptr
2635                                  : XXH_swap64(*(const xxh_u64 *)ptr);
2636 
2637 }
2638 
2639     /*******   xxh64   *******/
2640     /*!
2641      * @}
2642      * @defgroup xxh64_impl XXH64 implementation
2643      * @ingroup impl
2644      * @{
2645 
2646      */
2647     /* #define rather that static const, to be used as initializers */
2648     #define XXH_PRIME64_1                                                                         \
2649       0x9E3779B185EBCA87ULL /*!<                                                                  \
2650                                0b1001111000110111011110011011000110000101111010111100101010000111 \
2651                              */
2652     #define XXH_PRIME64_2                                                                         \
2653       0xC2B2AE3D27D4EB4FULL /*!<                                                                  \
2654                                0b1100001010110010101011100011110100100111110101001110101101001111 \
2655                              */
2656     #define XXH_PRIME64_3                                                                         \
2657       0x165667B19E3779F9ULL /*!<                                                                  \
2658                                0b0001011001010110011001111011000110011110001101110111100111111001 \
2659                              */
2660     #define XXH_PRIME64_4                                                                         \
2661       0x85EBCA77C2B2AE63ULL /*!<                                                                  \
2662                                0b1000010111101011110010100111011111000010101100101010111001100011 \
2663                              */
2664     #define XXH_PRIME64_5                                                                         \
2665       0x27D4EB2F165667C5ULL /*!<                                                                  \
2666                                0b0010011111010100111010110010111100010110010101100110011111000101 \
2667                              */
2668 
2669     #ifdef XXH_OLD_NAMES
2670       #define PRIME64_1 XXH_PRIME64_1
2671       #define PRIME64_2 XXH_PRIME64_2
2672       #define PRIME64_3 XXH_PRIME64_3
2673       #define PRIME64_4 XXH_PRIME64_4
2674       #define PRIME64_5 XXH_PRIME64_5
2675     #endif
2676 
2677 static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) {
2678 
2679   acc += input * XXH_PRIME64_2;
2680   acc = XXH_rotl64(acc, 31);
2681   acc *= XXH_PRIME64_1;
2682   return acc;
2683 
2684 }
2685 
2686 static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) {
2687 
2688   val = XXH64_round(0, val);
2689   acc ^= val;
2690   acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4;
2691   return acc;
2692 
2693 }
2694 
2695 static xxh_u64 XXH64_avalanche(xxh_u64 h64) {
2696 
2697   h64 ^= h64 >> 33;
2698   h64 *= XXH_PRIME64_2;
2699   h64 ^= h64 >> 29;
2700   h64 *= XXH_PRIME64_3;
2701   h64 ^= h64 >> 32;
2702   return h64;
2703 
2704 }
2705 
2706     #define XXH_get64bits(p) XXH_readLE64_align(p, align)
2707 
2708 static xxh_u64 XXH64_finalize(xxh_u64 h64, const xxh_u8 *ptr, size_t len,
2709                               XXH_alignment align) {
2710 
2711   len &= 31;
2712   while (len >= 8) {
2713 
2714     xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
2715     ptr += 8;
2716     h64 ^= k1;
2717     h64 = XXH_rotl64(h64, 27) * XXH_PRIME64_1 + XXH_PRIME64_4;
2718     len -= 8;
2719 
2720   }
2721 
2722   if (len >= 4) {
2723 
2724     h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
2725     ptr += 4;
2726     h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
2727     len -= 4;
2728 
2729   }
2730 
2731   while (len > 0) {
2732 
2733     h64 ^= (*ptr++) * XXH_PRIME64_5;
2734     h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1;
2735     --len;
2736 
2737   }
2738 
2739   return XXH64_avalanche(h64);
2740 
2741 }
2742 
2743     #ifdef XXH_OLD_NAMES
2744       #define PROCESS1_64 XXH_PROCESS1_64
2745       #define PROCESS4_64 XXH_PROCESS4_64
2746       #define PROCESS8_64 XXH_PROCESS8_64
2747     #else
2748       #undef XXH_PROCESS1_64
2749       #undef XXH_PROCESS4_64
2750       #undef XXH_PROCESS8_64
2751     #endif
2752 
2753 XXH_FORCE_INLINE xxh_u64 XXH64_endian_align(const xxh_u8 *input, size_t len,
2754                                             xxh_u64 seed, XXH_alignment align) {
2755 
2756   const xxh_u8 *bEnd = input ? input + len : NULL;
2757   xxh_u64       h64;
2758 
2759     #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
2760         (XXH_ACCEPT_NULL_INPUT_POINTER >= 1)
2761   if (input == NULL) {
2762 
2763     len = 0;
2764     bEnd = input = (const xxh_u8 *)(size_t)32;
2765 
2766   }
2767 
2768     #endif
2769 
2770   if (len >= 32) {
2771 
2772     const xxh_u8 *const limit = bEnd - 32;
2773     xxh_u64             v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
2774     xxh_u64             v2 = seed + XXH_PRIME64_2;
2775     xxh_u64             v3 = seed + 0;
2776     xxh_u64             v4 = seed - XXH_PRIME64_1;
2777 
2778     do {
2779 
2780       v1 = XXH64_round(v1, XXH_get64bits(input));
2781       input += 8;
2782       v2 = XXH64_round(v2, XXH_get64bits(input));
2783       input += 8;
2784       v3 = XXH64_round(v3, XXH_get64bits(input));
2785       input += 8;
2786       v4 = XXH64_round(v4, XXH_get64bits(input));
2787       input += 8;
2788 
2789     } while (input <= limit);
2790 
2791     h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) +
2792           XXH_rotl64(v4, 18);
2793     h64 = XXH64_mergeRound(h64, v1);
2794     h64 = XXH64_mergeRound(h64, v2);
2795     h64 = XXH64_mergeRound(h64, v3);
2796     h64 = XXH64_mergeRound(h64, v4);
2797 
2798   } else {
2799 
2800     h64 = seed + XXH_PRIME64_5;
2801 
2802   }
2803 
2804   h64 += (xxh_u64)len;
2805 
2806   return XXH64_finalize(h64, input, len, align);
2807 
2808 }
2809 
2810 /*! @ingroup xxh64_family */
2811 XXH_PUBLIC_API XXH64_hash_t XXH64(const void *input, size_t len,
2812                                   XXH64_hash_t seed) {
2813 
2814     #if 0
2815     /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
2816     XXH64_state_t state;
2817     XXH64_reset(&state, seed);
2818     XXH64_update(&state, (const xxh_u8*)input, len);
2819     return XXH64_digest(&state);
2820     #else
2821   if (XXH_FORCE_ALIGN_CHECK) {
2822 
2823     if ((((size_t)input) & 7) ==
2824         0) {        /* Input is aligned, let's leverage the speed advantage */
2825       return XXH64_endian_align((const xxh_u8 *)input, len, seed, XXH_aligned);
2826 
2827     }
2828 
2829   }
2830 
2831   return XXH64_endian_align((const xxh_u8 *)input, len, seed, XXH_unaligned);
2832 
2833     #endif
2834 
2835 }
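/*
 * Seeded usage sketch (illustrative only; `buf`, `len` and the seed values are
 * arbitrary placeholders): different seeds give independent 64-bit
 * fingerprints of the same input, e.g. one seed per hash table instance.
 *
 *   XXH64_hash_t h1 = XXH64(buf, len, 0x9E3779B185EBCA87ULL);
 *   XXH64_hash_t h2 = XXH64(buf, len, 0xC2B2AE3D27D4EB4FULL);
 */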
2836 
2837 /*******   Hash Streaming   *******/
2838 
2839 /*! @ingroup xxh64_family*/
2840 XXH_PUBLIC_API XXH64_state_t *XXH64_createState(void) {
2841 
2842   return (XXH64_state_t *)XXH_malloc(sizeof(XXH64_state_t));
2843 
2844 }
2845 
2846 /*! @ingroup xxh64_family */
2847 XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t *statePtr) {
2848 
2849   XXH_free(statePtr);
2850   return XXH_OK;
2851 
2852 }
2853 
2854 /*! @ingroup xxh64_family */
2855 XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t       *dstState,
2856                                     const XXH64_state_t *srcState) {
2857 
2858   memcpy(dstState, srcState, sizeof(*dstState));
2859 
2860 }
2861 
2862 /*! @ingroup xxh64_family */
2863 XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t *statePtr,
2864                                          XXH64_hash_t   seed) {
2865 
2866   XXH64_state_t state; /* use a local state to memcpy() in order to avoid
2867                           strict-aliasing warnings */
2868   memset(&state, 0, sizeof(state));
2869   state.v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
2870   state.v2 = seed + XXH_PRIME64_2;
2871   state.v3 = seed + 0;
2872   state.v4 = seed - XXH_PRIME64_1;
2873   /* do not write into reserved64, might be removed in a future version */
2874   memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64));
2875   return XXH_OK;
2876 
2877 }
2878 
2879 /*! @ingroup xxh64_family */
2880 XXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t *state,
2881                                           const void *input, size_t len) {
2882 
2883   if (input == NULL)
2884     #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
2885         (XXH_ACCEPT_NULL_INPUT_POINTER >= 1)
2886     return XXH_OK;
2887     #else
2888     return XXH_ERROR;
2889     #endif
2890 
2891   {
2892 
2893     const xxh_u8       *p = (const xxh_u8 *)input;
2894     const xxh_u8 *const bEnd = p + len;
2895 
2896     state->total_len += len;
2897 
2898     if (state->memsize + len < 32) {                  /* fill in tmp buffer */
2899       XXH_memcpy(((xxh_u8 *)state->mem64) + state->memsize, input, len);
2900       state->memsize += (xxh_u32)len;
2901       return XXH_OK;
2902 
2903     }
2904 
2905     if (state->memsize) {                             /* tmp buffer is full */
2906       XXH_memcpy(((xxh_u8 *)state->mem64) + state->memsize, input,
2907                  32 - state->memsize);
2908       state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64 + 0));
2909       state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64 + 1));
2910       state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64 + 2));
2911       state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64 + 3));
2912       p += 32 - state->memsize;
2913       state->memsize = 0;
2914 
2915     }
2916 
2917     if (p + 32 <= bEnd) {
2918 
2919       const xxh_u8 *const limit = bEnd - 32;
2920       xxh_u64             v1 = state->v1;
2921       xxh_u64             v2 = state->v2;
2922       xxh_u64             v3 = state->v3;
2923       xxh_u64             v4 = state->v4;
2924 
2925       do {
2926 
2927         v1 = XXH64_round(v1, XXH_readLE64(p));
2928         p += 8;
2929         v2 = XXH64_round(v2, XXH_readLE64(p));
2930         p += 8;
2931         v3 = XXH64_round(v3, XXH_readLE64(p));
2932         p += 8;
2933         v4 = XXH64_round(v4, XXH_readLE64(p));
2934         p += 8;
2935 
2936       } while (p <= limit);
2937 
2938       state->v1 = v1;
2939       state->v2 = v2;
2940       state->v3 = v3;
2941       state->v4 = v4;
2942 
2943     }
2944 
2945     if (p < bEnd) {
2946 
2947       XXH_memcpy(state->mem64, p, (size_t)(bEnd - p));
2948       state->memsize = (unsigned)(bEnd - p);
2949 
2950     }
2951 
2952   }
2953 
2954   return XXH_OK;
2955 
2956 }
2957 
2958 /*! @ingroup xxh64_family */
2959 XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t *state) {
2960 
2961   xxh_u64 h64;
2962 
2963   if (state->total_len >= 32) {
2964 
2965     xxh_u64 const v1 = state->v1;
2966     xxh_u64 const v2 = state->v2;
2967     xxh_u64 const v3 = state->v3;
2968     xxh_u64 const v4 = state->v4;
2969 
2970     h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) +
2971           XXH_rotl64(v4, 18);
2972     h64 = XXH64_mergeRound(h64, v1);
2973     h64 = XXH64_mergeRound(h64, v2);
2974     h64 = XXH64_mergeRound(h64, v3);
2975     h64 = XXH64_mergeRound(h64, v4);
2976 
2977   } else {
2978 
2979     h64 = state->v3 /*seed*/ + XXH_PRIME64_5;
2980 
2981   }
2982 
2983   h64 += (xxh_u64)state->total_len;
2984 
2985   return XXH64_finalize(h64, (const xxh_u8 *)state->mem64,
2986                         (size_t)state->total_len, XXH_aligned);
2987 
2988 }
2989 
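/*
 * A minimal streaming-usage sketch for the three functions above, assuming a
 * state obtained from the public XXH64_createState()/XXH64_freeState() pair;
 * chunk1/chunk2 and their lengths are placeholder buffers:
 *
 *   XXH64_state_t *s = XXH64_createState();
 *   XXH64_reset(s, 0);                        // seed = 0
 *   XXH64_update(s, chunk1, chunk1_len);      // feed data in arbitrary pieces
 *   XXH64_update(s, chunk2, chunk2_len);
 *   XXH64_hash_t h = XXH64_digest(s);         // same value as a one-shot XXH64()
 *   XXH64_freeState(s);
 */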
2990 /******* Canonical representation   *******/
2991 
2992 /*! @ingroup xxh64_family */
2993 XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t *dst,
2994                                             XXH64_hash_t       hash) {
2995 
2996   XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
2997   if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
2998   memcpy(dst, &hash, sizeof(*dst));
2999 
3000 }
3001 
3002 /*! @ingroup xxh64_family */
3003 XXH_PUBLIC_API XXH64_hash_t
3004 XXH64_hashFromCanonical(const XXH64_canonical_t *src) {
3005 
3006   return XXH_readBE64(src);
3007 
3008 }
3009 
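/*
 * An illustrative round-trip sketch through the canonical (big-endian) form,
 * e.g. for storing digests in a portable on-disk format; buf/len are
 * placeholder inputs and XXH64() is the public one-shot function:
 *
 *   XXH64_hash_t      h = XXH64(buf, len, 0);
 *   XXH64_canonical_t c;
 *   XXH64_canonicalFromHash(&c, h);           // c holds big-endian bytes
 *   XXH64_hash_t      h2 = XXH64_hashFromCanonical(&c);
 *   // h2 == h regardless of the host's endianness
 */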
3010     #ifndef XXH_NO_XXH3
3011 
3012     /* *********************************************************************
3013      *  XXH3
3014      *  New generation hash designed for speed on small keys and vectorization
3015      ************************************************************************ */
3016     /*!
3017      * @}
3018      * @defgroup xxh3_impl XXH3 implementation
3019      * @ingroup impl
3020      * @{
3021 
3022      */
3023 
3024     /* ===   Compiler specifics   === */
3025 
3026       #if ((defined(sun) || defined(__sun)) &&                                \
3027            __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested \
3028                            with GCC 5.5 */
3029         #define XXH_RESTRICT                                     /* disable */
3030       #elif defined(__STDC_VERSION__) && \
3031           __STDC_VERSION__ >= 199901L                             /* >= C99 */
3032         #define XXH_RESTRICT restrict
3033       #else
3034       /* Note: it might be useful to define __restrict or __restrict__ for
3035        * some C++ compilers */
3036         #define XXH_RESTRICT                                     /* disable */
3037       #endif
3038 
3039       #if (defined(__GNUC__) && (__GNUC__ >= 3)) ||                   \
3040           (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || \
3041           defined(__clang__)
3042         #define XXH_likely(x) __builtin_expect(x, 1)
3043         #define XXH_unlikely(x) __builtin_expect(x, 0)
3044       #else
3045         #define XXH_likely(x) (x)
3046         #define XXH_unlikely(x) (x)
3047       #endif
3048 
3049       #if defined(__GNUC__)
3050         #if defined(__AVX2__)
3051           #include <immintrin.h>
3052         #elif defined(__SSE2__)
3053           #include <emmintrin.h>
3054         #elif defined(__ARM_NEON__) || defined(__ARM_NEON)
3055           #define inline __inline__               /* circumvent a clang bug */
3056           #include <arm_neon.h>
3057           #undef inline
3058         #endif
3059       #elif defined(_MSC_VER)
3060         #include <intrin.h>
3061       #endif
3062 
3063       /*
3064        * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
3065        * remaining a true 64-bit/128-bit hash function.
3066        *
3067        * This is done by prioritizing a subset of 64-bit operations that can be
3068        * emulated without too many steps on the average 32-bit machine.
3069        *
3070        * For example, these two lines seem similar, and run equally fast on
3071        * 64-bit:
3072        *
3073        *   xxh_u64 x;
3074        *   x ^= (x >> 47); // good
3075        *   x ^= (x >> 13); // bad
3076        *
3077        * However, to a 32-bit machine, there is a major difference.
3078        *
3079        * x ^= (x >> 47) looks like this:
3080        *
3081        *   x.lo ^= (x.hi >> (47 - 32));
3082        *
3083        * while x ^= (x >> 13) looks like this:
3084        *
3085        *   // note: funnel shifts are not usually cheap.
3086        *   x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
3087        *   x.hi ^= (x.hi >> 13);
3088        *
3089        * The first one is significantly faster than the second, simply because
3090        * the shift is larger than 32. This means:
3091        *  - All the bits we need are in the upper 32 bits, so we can ignore the
3092        * lower 32 bits in the shift.
3093        *  - The shift result will always fit in the lower 32 bits, and
3094        * therefore, we can ignore the upper 32 bits in the xor.
3095        *
3096        * Thanks to this optimization, XXH3 only requires these features to be
3097        * efficient:
3098        *
3099        *  - Usable unaligned access
3100        *  - A 32-bit or 64-bit ALU
3101        *      - If 32-bit, a decent ADC instruction
3102        *  - A 32 or 64-bit multiply with a 64-bit result
3103        *  - For the 128-bit variant, a decent byteswap helps short inputs.
3104        *
3105        * The first two are already required by XXH32, and almost all 32-bit and
3106        * 64-bit platforms which can run XXH32 can run XXH3 efficiently.
3107        *
3108        * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is
3109        * one notable exception.
3110        *
3111        * First of all, Thumb-1 lacks support for the UMULL instruction which
3112        * performs the important long multiply. This means numerous __aeabi_lmul
3113        * calls.
3114        *
3115        * Second of all, the 8 functional registers are just not enough.
3116        * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic
3117        * need Lo registers, and this shuffling results in thousands more MOVs
3118        * than A32.
3119        *
3120        * A32 and T32 don't have this limitation. They can access all 14
3121        * registers, do a 32->64 multiply with UMULL, and the flexible operand
3122        * allowing free shifts is helpful, too.
3123        *
3124        * Therefore, we do a quick sanity check.
3125        *
3126        * If compiling Thumb-1 for a target which supports ARM instructions, we
3127        * will emit a warning, as it is not a "sane" platform to compile for.
3128        *
3129        * Usually, if this happens, it is because of an accident and you probably
3130        * need to specify -march, as you likely meant to compile for a newer
3131        * architecture.
3132        *
3133        * Credit: large sections of the vectorial and asm source code paths
3134        *         have been contributed by @easyaspi314
3135        */
3136       #if defined(__thumb__) && !defined(__thumb2__) && \
3137           defined(__ARM_ARCH_ISA_ARM)
3138         #warning "XXH3 is highly inefficient without ARM or Thumb-2."
3139       #endif
3140 
3141     /* ==========================================
3142      * Vectorization detection
3143      * ========================================== */
3144 
3145       #ifdef XXH_DOXYGEN
3146         /*!
3147          * @ingroup tuning
3148          * @brief Overrides the vectorization implementation chosen for XXH3.
3149          *
3150          * Can be defined to 0 to disable SIMD or any of the values mentioned in
3151          * @ref XXH_VECTOR_TYPE.
3152          *
3153          * If this is not defined, it uses predefined macros to determine the
3154          * best implementation.
3155          */
3156         #define XXH_VECTOR XXH_SCALAR
3157 /*!
3158  * @ingroup tuning
3159  * @brief Possible values for @ref XXH_VECTOR.
3160  *
3161  * Note that these are actually implemented as macros.
3162  *
3163  * If this is not defined, it is detected automatically.
3164  * @ref XXH_X86DISPATCH overrides this.
3165  */
3166 enum XXH_VECTOR_TYPE /* fake enum */ {
3167 
3168   XXH_SCALAR = 0,                              /*!< Portable scalar version */
3169   XXH_SSE2 = 1,   /*!<
3170                    * SSE2 for Pentium 4, Opteron, all x86_64.
3171                    *
3172                    * @note SSE2 is also guaranteed on Windows 10, macOS, and
3173                    * Android x86.
3174                    */
3175   XXH_AVX2 = 2,                         /*!< AVX2 for Haswell and Bulldozer */
3176   XXH_AVX512 = 3,                       /*!< AVX512 for Skylake and Icelake */
3177   XXH_NEON = 4,                  /*!< NEON for most ARMv7-A and all AArch64 */
3178   XXH_VSX = 5,                 /*!< VSX and ZVector for POWER8/z13 (64-bit) */
3179 
3180 };
3181 
3182         /*!
3183          * @ingroup tuning
3184          * @brief Selects the minimum alignment for XXH3's accumulators.
3185          *
3186          * When using SIMD, this should match the alignment required for said
3187          * vector type, so, for example, 32 for AVX2.
3188          *
3189          * Default: Auto detected.
3190          */
3191         #define XXH_ACC_ALIGN 8
3192       #endif
3193 
3194       /* Actual definition */
3195       #ifndef XXH_DOXYGEN
3196         #define XXH_SCALAR 0
3197         #define XXH_SSE2 1
3198         #define XXH_AVX2 2
3199         #define XXH_AVX512 3
3200         #define XXH_NEON 4
3201         #define XXH_VSX 5
3202       #endif
3203 
3204       #ifndef XXH_VECTOR                  /* can be defined on command line */
3205         #if defined(__AVX512F__)
3206           #define XXH_VECTOR XXH_AVX512
3207         #elif defined(__AVX2__)
3208           #define XXH_VECTOR XXH_AVX2
3209         #elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || \
3210             (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
3211           #define XXH_VECTOR XXH_SSE2
3212         #elif defined(__GNUC__) /* msvc support maybe later */               \
3213             && (defined(__ARM_NEON__) || defined(__ARM_NEON)) &&             \
3214             (defined(                                                        \
3215                  __LITTLE_ENDIAN__) /* We only support little endian NEON */ \
3216              || (defined(__BYTE_ORDER__) &&                                  \
3217                  __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
3218           #define XXH_VECTOR XXH_NEON
3219         #elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) || \
3220             (defined(__s390x__) && defined(__VEC__)) &&             \
3221                 defined(__GNUC__)                           /* TODO: IBM XL */
3222           #define XXH_VECTOR XXH_VSX
3223         #else
3224           #define XXH_VECTOR XXH_SCALAR
3225         #endif
3226       #endif
3227 
3228       /*
3229        * Controls the alignment of the accumulator,
3230        * for compatibility with aligned vector loads, which are usually faster.
3231        */
3232       #ifndef XXH_ACC_ALIGN
3233         #if defined(XXH_X86DISPATCH)
3234           #define XXH_ACC_ALIGN 64         /* for compatibility with avx512 */
3235         #elif XXH_VECTOR == XXH_SCALAR                            /* scalar */
3236           #define XXH_ACC_ALIGN 8
3237         #elif XXH_VECTOR == XXH_SSE2                                /* sse2 */
3238           #define XXH_ACC_ALIGN 16
3239         #elif XXH_VECTOR == XXH_AVX2                                /* avx2 */
3240           #define XXH_ACC_ALIGN 32
3241         #elif XXH_VECTOR == XXH_NEON                                /* neon */
3242           #define XXH_ACC_ALIGN 16
3243         #elif XXH_VECTOR == XXH_VSX                                  /* vsx */
3244           #define XXH_ACC_ALIGN 16
3245         #elif XXH_VECTOR == XXH_AVX512                            /* avx512 */
3246           #define XXH_ACC_ALIGN 64
3247         #endif
3248       #endif
3249 
3250       #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 || \
3251           XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
3252         #define XXH_SEC_ALIGN XXH_ACC_ALIGN
3253       #else
3254         #define XXH_SEC_ALIGN 8
3255       #endif
3256 
3257       /*
3258        * UGLY HACK:
3259        * GCC usually generates the best code with -O3 for xxHash.
3260        *
3261        * However, when targeting AVX2, it is overzealous in its unrolling
3262        * resulting in code roughly 3/4 the speed of Clang.
3263        *
3264        * There are other issues, such as GCC splitting _mm256_loadu_si256 into
3265        * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization
3266        * which only applies to Sandy and Ivy Bridge... which don't even support
3267        * AVX2.
3268        *
3269        * That is why when compiling the AVX2 version, it is recommended to use
3270        * either -O2 -mavx2 -march=haswell or -O2 -mavx2
3271        * -mno-avx256-split-unaligned-load for decent performance, or to use
3272        * Clang instead.
3273        *
3274        * Fortunately, we can control the first one with a pragma that forces GCC
3275        * into -O2, but the other one we can't control without "failed to inline
3276        * always inline function due to target mismatch" warnings.
3277        */
3278       #if XXH_VECTOR == XXH_AVX2                      /* AVX2 */           \
3279           && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
3280           && defined(__OPTIMIZE__) &&                                      \
3281           !defined(__OPTIMIZE_SIZE__)                /* respect -O0 and -Os */
3282         #pragma GCC push_options
3283         #pragma GCC optimize("-O2")
3284       #endif
3285 
3286       #if XXH_VECTOR == XXH_NEON
3287         /*
3288          * NEON's setup for vmlal_u32 is a little more complicated than it is on
3289          * SSE2, AVX2, and VSX.
3290          *
3291          * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an
3292          * upcast.
3293          *
3294          * To do the same operation, the 128-bit 'Q' register needs to be split
3295          * into two 64-bit 'D' registers, performing this operation::
3296          *
3297          *   [                a                 |                 b                ]
3298          *       |              '---------. .--------'                |
3299          *       |                         x                          |
3300          *       |              .---------' '--------.                |
3301          *   [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[    a >> 32     |     b >> 32    ]
3302          *
3303          * Due to significant changes in aarch64, the fastest method for aarch64
3304          * is completely different than the fastest method for ARMv7-A.
3305          *
3306          * ARMv7-A treats D registers as unions overlaying Q registers, so
3307          * modifying D11 will modify the high half of Q5. This is similar to how
3308          * modifying AH will only affect bits 8-15 of AX on x86.
3309          *
3310          * VZIP takes two registers, and puts even lanes in one register and odd
3311          * lanes in the other.
3312          *
3313          * On ARMv7-A, this strangely modifies both parameters in place instead
3314          * of taking the usual 3-operand form.
3315          *
3316          * Therefore, if we want to do this, we can simply use a D-form VZIP.32
3317          * on the lower and upper halves of the Q register to end up with the
3318          * high and low halves where we want - all in one instruction.
3319          *
3320          *   vzip.32   d10, d11       @ d10 = { d10[0], d11[0] };  d11 = { d10[1], d11[1] }
3323          *
3324          * Unfortunately we need inline assembly for this: instructions
3325          * modifying two registers at once are not possible in GCC or Clang's IR,
3326          * so they have to create a copy.
3327          *
3328          * aarch64 requires a different approach.
3329          *
3330          * In order to make it easier to write a decent compiler for aarch64,
3331          * many quirks were removed, such as conditional execution.
3332          *
3333          * NEON was also affected by this.
3334          *
3335          * aarch64 cannot access the high bits of a Q-form register, and writes
3336          * to a D-form register zero the high bits, similar to how writes to
3337          * W-form scalar registers (or DWORD registers on x86_64) work.
3338          *
3339          * The formerly free vget_high intrinsics now require a vext (with a few
3340          * exceptions)
3341          *
3342          * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the
3343          * equivalent of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to
3344          * only modify one operand.
3345          *
3346          * The equivalent of the VZIP.32 on the lower and upper halves would be
3347          * this mess:
3348          *
3349          *   ext     v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
3350          *   zip1    v1.2s, v0.2s, v2.2s     // v1 = { v0[0], v2[0] }
3351          *   zip2    v0.2s, v0.2s, v1.2s     // v0 = { v0[1], v2[1] }
3352          *
3353          * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64
3354          * (SHRN):
3355          *
3356          *   shrn    v1.2s, v0.2d, #32  // v1 = (uint32x2_t)(v0 >> 32);
3357          *   xtn     v0.2s, v0.2d       // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
3358          *
3359          * This is available on ARMv7-A, but is less efficient than a single
3360          * VZIP.32.
3361          */
3362 
3363         /*!
3364          * Function-like macro:
3365          * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t
3366          * &outHi)
3367          * {
3368 
3369          *     outLo = (uint32x2_t)(in & 0xFFFFFFFF);
3370          *     outHi = (uint32x2_t)(in >> 32);
3371          *     in = UNDEFINED;
3372          * }
3373          */
3374         #if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
3375             && defined(__GNUC__) && !defined(__aarch64__) &&   \
3376             !defined(__arm64__)
3377           #define XXH_SPLIT_IN_PLACE(in, outLo, outHi)                                                   \
3378             do {                                                                                         \
3379                                                                                                          \
3380               /* Undocumented GCC/Clang operand modifier: %e0 = lower D half,                            \
3381                * %f0 = upper D half */                                                                   \
3382               /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486             \
3383                */                                                                                        \
3384               /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 \
3385                */                                                                                        \
3386               __asm__("vzip.32  %e0, %f0" : "+w"(in));                                                   \
3387               (outLo) = vget_low_u32(vreinterpretq_u32_u64(in));                                         \
3388               (outHi) = vget_high_u32(vreinterpretq_u32_u64(in));                                        \
3389                                                                                                          \
3390             } while (0)
3391 
3392         #else
3393           #define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
3394             do {                                       \
3395                                                        \
3396               (outLo) = vmovn_u64(in);                 \
3397               (outHi) = vshrn_n_u64((in), 32);         \
3398                                                        \
3399             } while (0)
3400 
3401         #endif
3402       #endif                                      /* XXH_VECTOR == XXH_NEON */
3403 
3404       /*
3405        * VSX and Z Vector helpers.
3406        *
3407        * This is very messy, and any pull requests to clean this up are welcome.
3408        *
3409        * There are a lot of problems with supporting VSX and s390x, due to
3410        * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
3411        */
3412       #if XXH_VECTOR == XXH_VSX
3413         #if defined(__s390x__)
3414           #include <s390intrin.h>
3415         #else
3416           /* gcc's altivec.h can have the unwanted consequence of
3417            * unconditionally #defining the bool, vector, and pixel keywords, with bad
3418            * consequences for programs already using these keywords for other
3419            * purposes. The paragraph defining these macros is skipped when
3420            * __APPLE_ALTIVEC__ is defined.
3421            * __APPLE_ALTIVEC__ is _generally_ defined automatically by the
3422            * compiler, but it seems that, in some cases, it isn't. Force the
3423            * build macro to be defined, so that keywords are not altered.
3424            */
3425           #if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__)
3426             #define __APPLE_ALTIVEC__
3427           #endif
3428           #include <altivec.h>
3429         #endif
3430 
3431 typedef __vector unsigned long long xxh_u64x2;
3432 typedef __vector unsigned char      xxh_u8x16;
3433 typedef __vector unsigned           xxh_u32x4;
3434 
3435         #ifndef XXH_VSX_BE
3436           #if defined(__BIG_ENDIAN__) ||  \
3437               (defined(__BYTE_ORDER__) && \
3438                __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
3439             #define XXH_VSX_BE 1
3440           #elif defined(__VEC_ELEMENT_REG_ORDER__) && \
3441               __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
3442             #warning \
3443                 "-maltivec=be is not recommended. Please use native endianness."
3444             #define XXH_VSX_BE 1
3445           #else
3446             #define XXH_VSX_BE 0
3447           #endif
3448         #endif                                      /* !defined(XXH_VSX_BE) */
3449 
3450         #if XXH_VSX_BE
3451           #if defined(__POWER9_VECTOR__) || \
3452               (defined(__clang__) && defined(__s390x__))
3453             #define XXH_vec_revb vec_revb
3454           #else
3455 /*!
3456  * A polyfill for POWER9's vec_revb().
3457  */
3458 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) {
3459 
3460   xxh_u8x16 const vByteSwap = {0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
3461                                0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08};
3462   return vec_perm(val, val, vByteSwap);
3463 
3464 }
3465 
3466           #endif
3467         #endif                                                /* XXH_VSX_BE */
3468 
3469 /*!
3470  * Performs an unaligned vector load and byte swaps it on big endian.
3471  */
3472 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) {
3473 
3474   xxh_u64x2 ret;
3475   memcpy(&ret, ptr, sizeof(xxh_u64x2));
3476         #if XXH_VSX_BE
3477   ret = XXH_vec_revb(ret);
3478         #endif
3479   return ret;
3480 
3481 }
3482 
3483         /*
3484          * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
3485          *
3486          * These intrinsics weren't added until GCC 8, despite existing for a
3487          * while, and they are endian dependent. Also, their meanings swap
3488          * depending on the version.
3489          */
3490         #if defined(__s390x__)
3491         /* s390x is always big endian, no issue on this platform */
3492           #define XXH_vec_mulo vec_mulo
3493           #define XXH_vec_mule vec_mule
3494         #elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)
3495         /* Clang has a better way to control this, we can just use the builtin
3496          * which doesn't swap. */
3497           #define XXH_vec_mulo __builtin_altivec_vmulouw
3498           #define XXH_vec_mule __builtin_altivec_vmuleuw
3499         #else
3500 /* gcc needs inline assembly */
3501 /* Adapted from
3502  * https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
3503 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) {
3504 
3505   xxh_u64x2 result;
3506   __asm__("vmulouw %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
3507   return result;
3508 
3509 }
3510 
3511 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) {
3512 
3513   xxh_u64x2 result;
3514   __asm__("vmuleuw %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
3515   return result;
3516 
3517 }
3518 
3519         #endif                                /* XXH_vec_mulo, XXH_vec_mule */
3520       #endif                                       /* XXH_VECTOR == XXH_VSX */
3521 
3522       /* prefetch
3523        * can be disabled, by declaring XXH_NO_PREFETCH build macro */
3524       #if defined(XXH_NO_PREFETCH)
3525         #define XXH_PREFETCH(ptr) (void)(ptr)                   /* disabled */
3526       #else
3527         #if defined(_MSC_VER) && \
3528             (defined(_M_X64) ||  \
3529              defined(            \
3530                  _M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */
3531           #include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
3532           #define XXH_PREFETCH(ptr) \
3533             _mm_prefetch((const char *)(ptr), _MM_HINT_T0)
3534         #elif defined(__GNUC__) && \
3535             ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)))
3536           #define XXH_PREFETCH(ptr) \
3537             __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
3538         #else
3539           #define XXH_PREFETCH(ptr) (void)(ptr)                 /* disabled */
3540         #endif
3541       #endif                                             /* XXH_NO_PREFETCH */
3542 
3543     /* ==========================================
3544      * XXH3 default settings
3545      * ========================================== */
3546 
3547       #define XXH_SECRET_DEFAULT_SIZE 192   /* minimum XXH3_SECRET_SIZE_MIN */
3548 
3549       #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
3550         #error "default keyset is not large enough"
3551       #endif
3552 
3553 /*! Pseudorandom secret taken directly from FARSH. */
3554 XXH_ALIGN(64)
3555 static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
3556 
3557     0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c,
3558     0xf7, 0x21, 0xad, 0x1c, 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb,
3559     0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, 0xcb, 0x79, 0xe6, 0x4e,
3560     0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
3561     0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6,
3562     0x81, 0x3a, 0x26, 0x4c, 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb,
3563     0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, 0x71, 0x64, 0x48, 0x97,
3564     0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
3565     0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7,
3566     0xc7, 0x0b, 0x4f, 0x1d, 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31,
3567     0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, 0xea, 0xc5, 0xac, 0x83,
3568     0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
3569     0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26,
3570     0x29, 0xd4, 0x68, 0x9e, 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc,
3571     0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, 0x45, 0xcb, 0x3a, 0x8f,
3572     0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
3573 
3574 };
3575 
3576       #ifdef XXH_OLD_NAMES
3577         #define kSecret XXH3_kSecret
3578       #endif
3579 
3580       #ifdef XXH_DOXYGEN
3581 /*!
3582  * @brief Calculates a 32-bit to 64-bit long multiply.
3583  *
3584  * Implemented as a macro.
3585  *
3586  * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it
3587  * doesn't need to (but it shouldn't need to anyways, it is about 7 instructions
3588  * to do a 64x64 multiply...). Since we know that this will _always_ emit
3589  * `MULL`, we use that instead of the normal method.
3590  *
3591  * If you are compiling for platforms like Thumb-1 and don't have a better
3592  * option, you may also want to write your own long multiply routine here.
3593  *
3594  * @param x, y Numbers to be multiplied
3595  * @return 64-bit product of the low 32 bits of @p x and @p y.
3596  */
3597 XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y) {
3598 
3599   return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
3600 
3601 }
3602 
3603       #elif defined(_MSC_VER) && defined(_M_IX86)
3604         #include <intrin.h>
3605         #define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
3606       #else
3607       /*
3608        * Downcast + upcast is usually better than masking on older compilers
3609        * like GCC 4.2 (especially 32-bit ones), all without affecting newer
3610        * compilers.
3611        *
3612        * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both
3613        * operands and perform a full 64x64 multiply -- entirely redundant on
3614        * 32-bit.
3615        */
3616         #define XXH_mult32to64(x, y) \
3617           ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
3618       #endif
3619 
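      /*
       * Quick sanity check, for illustration only: both forms above compute
       * the full 64-bit product of the low 32 bits of each operand, e.g.
       *
       *   XXH_mult32to64(0xFFFFFFFF, 0xFFFFFFFF) == 0xFFFFFFFE00000001ULL
       *
       * since (2^32 - 1)^2 = 2^64 - 2^33 + 1.
       */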
3620 /*!
3621  * @brief Calculates a 64->128-bit long multiply.
3622  *
3623  * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar
3624  * version.
3625  *
3626  * @param lhs, rhs The 64-bit integers to be multiplied
3627  * @return The 128-bit result represented in an @ref XXH128_hash_t.
3628  */
3629 static XXH128_hash_t XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) {
3630 
3631       /*
3632        * GCC/Clang __uint128_t method.
3633        *
3634        * On most 64-bit targets, GCC and Clang define a __uint128_t type.
3635        * This is usually the best way as it usually uses a native long 64-bit
3636        * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
3637        *
3638        * Usually.
3639        *
3640        * However, on some 32-bit platforms, Clang (and emscripten) define this
3641        * type despite not having the arithmetic for it. This results in a laggy
3642        * compiler builtin call which calculates a full 128-bit multiply.
3643        * In that case it is best to use the portable one.
3644        * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
3645        */
3646       #if defined(__GNUC__) && !defined(__wasm__) && \
3647               defined(__SIZEOF_INT128__) ||          \
3648           (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
3649 
3650   __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
3651   XXH128_hash_t     r128;
3652   r128.low64 = (xxh_u64)(product);
3653   r128.high64 = (xxh_u64)(product >> 64);
3654   return r128;
3655 
3656         /*
3657          * MSVC for x64's _umul128 method.
3658          *
3659          * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64
3660          * *HighProduct);
3661          *
3662          * This compiles to single operand MUL on x64.
3663          */
3664       #elif defined(_M_X64) || defined(_M_IA64)
3665 
3666         #ifndef _MSC_VER
3667           #pragma intrinsic(_umul128)
3668         #endif
3669   xxh_u64       product_high;
3670   xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
3671   XXH128_hash_t r128;
3672   r128.low64 = product_low;
3673   r128.high64 = product_high;
3674   return r128;
3675 
3676       #else
3677   /*
3678    * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
3679    *
3680    * This is a fast and simple grade school multiply, which is shown below
3681    * with base 10 arithmetic instead of base 0x100000000.
3682    *
3683    *           9 3 // D2 lhs = 93
3684    *         x 7 5 // D2 rhs = 75
3685    *     ----------
3686    *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
3687    *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
3688    *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
3689    *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
3690    *     ---------
3691    *         2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
3692    *     + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
3693    *     ---------
3694    *       6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
3695    *
3696    * The reasons for adding the products like this are:
3697    *  1. It avoids manual carry tracking. Just like how
3698    *     (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
3699    *     This avoids a lot of complexity.
3700    *
3701    *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
3702    *     instruction available in ARM's Digital Signal Processing extension
3703    *     in 32-bit ARMv6 and later, which is shown below:
3704    *
3705    *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
3706    *         {
3707 
3708    *             xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
3709    *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
3710    *             *RdHi = (xxh_u32)(product >> 32);
3711    *         }
3712    *
3713    *     This instruction was designed for efficient long multiplication, and
3714    *     allows this to be calculated in only 4 instructions at speeds
3715    *     comparable to some 64-bit ALUs.
3716    *
3717    *  3. It isn't terrible on other platforms. Usually this will be a couple
3718    *     of 32-bit ADD/ADCs.
3719    */
3720 
3721   /* First calculate all of the cross products. */
3722   xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
3723   xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
3724   xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
3725   xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);
3726 
3727   /* Now add the products together. These will never overflow. */
3728   xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
3729   xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
3730   xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
3731 
3732   XXH128_hash_t r128;
3733   r128.low64 = lower;
3734   r128.high64 = upper;
3735   return r128;
3736       #endif
3737 
3738 }
3739 
3740 /*!
3741  * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it.
3742  *
3743  * The reason for the separate function is to prevent passing too many structs
3744  * around by value. This will hopefully inline the multiply, but we don't force
3745  * it.
3746  *
3747  * @param lhs, rhs The 64-bit integers to multiply
3748  * @return The low 64 bits of the product XOR'd by the high 64 bits.
3749  * @see XXH_mult64to128()
3750  */
3751 static xxh_u64 XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) {
3752 
3753   XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
3754   return product.low64 ^ product.high64;
3755 
3756 }
3757 
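/*
 * Illustrative example of the fold above: for lhs = rhs = 2^32, the 128-bit
 * product is exactly 2^64, so low64 == 0 and high64 == 1, and therefore
 * XXH3_mul128_fold64(1ULL << 32, 1ULL << 32) == 1.
 */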
3758 /*! Seems to produce slightly better code on GCC for some reason. */
3759 XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) {
3760 
3761   XXH_ASSERT(0 <= shift && shift < 64);
3762   return v64 ^ (v64 >> shift);
3763 
3764 }
3765 
3766 /*
3767  * This is a fast avalanche stage,
3768  * suitable when input bits are already partially mixed
3769  */
3770 static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) {
3771 
3772   h64 = XXH_xorshift64(h64, 37);
3773   h64 *= 0x165667919E3779F9ULL;
3774   h64 = XXH_xorshift64(h64, 32);
3775   return h64;
3776 
3777 }
3778 
3779 /*
3780  * This is a stronger avalanche,
3781  * inspired by Pelle Evensen's rrmxmx
3782  * preferable when input has not been previously mixed
3783  */
3784 static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) {
3785 
3786   /* this mix is inspired by Pelle Evensen's rrmxmx */
3787   h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);
3788   h64 *= 0x9FB21C651E98DF25ULL;
3789   h64 ^= (h64 >> 35) + len;
3790   h64 *= 0x9FB21C651E98DF25ULL;
3791   return XXH_xorshift64(h64, 28);
3792 
3793 }
3794 
3795 /* ==========================================
3796  * Short keys
3797  * ==========================================
3798  * One of the shortcomings of XXH32 and XXH64 was that their performance was
3799  * sub-optimal on short lengths. They used an iterative algorithm which strongly
3800  * favored lengths that were a multiple of 4 or 8.
3801  *
3802  * Instead of iterating over individual inputs, we use a set of single shot
3803  * functions which piece together a range of lengths and operate in constant
3804  * time.
3805  *
3806  * Additionally, the number of multiplies has been significantly reduced. This
3807  * reduces latency, especially when emulating 64-bit multiplies on 32-bit.
3808  *
3809  * Depending on the platform, this may or may not be faster than XXH32, but it
3810  * is almost guaranteed to be faster than XXH64.
3811  */
3812 
3813 /*
3814  * At very short lengths, there isn't enough input to fully hide secrets, or use
3815  * the entire secret.
3816  *
3817  * There is also only a limited amount of mixing we can do before significantly
3818  * impacting performance.
3819  *
3820  * Therefore, we use different sections of the secret and always mix two secret
3821  * samples with an XOR. This should have no effect on performance on the
3822  * seedless or withSeed variants because everything _should_ be constant folded
3823  * by modern compilers.
3824  *
3825  * The XOR mixing hides individual parts of the secret and increases entropy.
3826  *
3827  * This adds an extra layer of strength for custom secrets.
3828  */
3829 XXH_FORCE_INLINE XXH64_hash_t XXH3_len_1to3_64b(const xxh_u8 *input, size_t len,
3830                                                 const xxh_u8 *secret,
3831                                                 XXH64_hash_t  seed) {
3832 
3833   XXH_ASSERT(input != NULL);
3834   XXH_ASSERT(1 <= len && len <= 3);
3835   XXH_ASSERT(secret != NULL);
3836   /*
3837    * len = 1: combined = { input[0], 0x01, input[0], input[0] }
3838    * len = 2: combined = { input[1], 0x02, input[0], input[1] }
3839    * len = 3: combined = { input[2], 0x03, input[0], input[1] }
3840    */
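  /*
   * Worked example, for illustration only: with len = 2 and input =
   * {0x61, 0x62} ("ab"), c1 = 0x61, c2 = input[1] = 0x62, c3 = input[1] = 0x62,
   * so combined = (0x61 << 16) | (0x62 << 24) | 0x62 | (2 << 8) = 0x62610262.
   */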
3841   {
3842 
3843     xxh_u8 const  c1 = input[0];
3844     xxh_u8 const  c2 = input[len >> 1];
3845     xxh_u8 const  c3 = input[len - 1];
3846     xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) |
3847                              ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
3848     xxh_u64 const bitflip =
3849         (XXH_readLE32(secret) ^ XXH_readLE32(secret + 4)) + seed;
3850     xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;
3851     return XXH64_avalanche(keyed);
3852 
3853   }
3854 
3855 }
3856 
3857 XXH_FORCE_INLINE XXH64_hash_t XXH3_len_4to8_64b(const xxh_u8 *input, size_t len,
3858                                                 const xxh_u8 *secret,
3859                                                 XXH64_hash_t  seed) {
3860 
3861   XXH_ASSERT(input != NULL);
3862   XXH_ASSERT(secret != NULL);
3863   XXH_ASSERT(4 <= len && len <= 8);
3864   seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
3865   {
3866 
3867     xxh_u32 const input1 = XXH_readLE32(input);
3868     xxh_u32 const input2 = XXH_readLE32(input + len - 4);
3869     xxh_u64 const bitflip =
3870         (XXH_readLE64(secret + 8) ^ XXH_readLE64(secret + 16)) - seed;
3871     xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);
3872     xxh_u64 const keyed = input64 ^ bitflip;
3873     return XXH3_rrmxmx(keyed, len);
3874 
3875   }
3876 
3877 }
3878 
3879 XXH_FORCE_INLINE XXH64_hash_t XXH3_len_9to16_64b(const xxh_u8 *input,
3880                                                  size_t        len,
3881                                                  const xxh_u8 *secret,
3882                                                  XXH64_hash_t  seed) {
3883 
3884   XXH_ASSERT(input != NULL);
3885   XXH_ASSERT(secret != NULL);
3886   XXH_ASSERT(9 <= len && len <= 16);
3887   {
3888 
3889     xxh_u64 const bitflip1 =
3890         (XXH_readLE64(secret + 24) ^ XXH_readLE64(secret + 32)) + seed;
3891     xxh_u64 const bitflip2 =
3892         (XXH_readLE64(secret + 40) ^ XXH_readLE64(secret + 48)) - seed;
3893     xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1;
3894     xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;
3895     xxh_u64 const acc = len + XXH_swap64(input_lo) + input_hi +
3896                         XXH3_mul128_fold64(input_lo, input_hi);
3897     return XXH3_avalanche(acc);
3898 
3899   }
3900 
3901 }
3902 
3903 XXH_FORCE_INLINE XXH64_hash_t XXH3_len_0to16_64b(const xxh_u8 *input,
3904                                                  size_t        len,
3905                                                  const xxh_u8 *secret,
3906                                                  XXH64_hash_t  seed) {
3907 
3908   XXH_ASSERT(len <= 16);
3909   {
3910 
3911     if (XXH_likely(len > 8))
3912       return XXH3_len_9to16_64b(input, len, secret, seed);
3913     if (XXH_likely(len >= 4))
3914       return XXH3_len_4to8_64b(input, len, secret, seed);
3915     if (len) return XXH3_len_1to3_64b(input, len, secret, seed);
3916     return XXH64_avalanche(
3917         seed ^ (XXH_readLE64(secret + 56) ^ XXH_readLE64(secret + 64)));
3918 
3919   }
3920 
3921 }
3922 
3923 /*
3924  * DISCLAIMER: There are known *seed-dependent* multicollisions here due to
3925  * multiplication by zero, affecting hashes of lengths 17 to 240.
3926  *
3927  * However, they are very unlikely.
3928  *
3929  * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
3930  * unseeded non-cryptographic hashes, it does not attempt to defend itself
3931  * against specially crafted inputs, only random inputs.
3932  *
3933  * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
3934  * cancelling out the secret is taken an arbitrary number of times (addressed
3935  * in XXH3_accumulate_512), this collision is very unlikely with random inputs
3936  * and/or proper seeding:
3937  *
3938  * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
3939  * function that is only called up to 16 times per hash with up to 240 bytes of
3940  * input.
3941  *
3942  * This is not too bad for a non-cryptographic hash function, especially with
3943  * only 64 bit outputs.
3944  *
3945  * The 128-bit variant (which trades some speed for strength) is NOT affected
3946  * by this, although it is always a good idea to use a proper seed if you care
3947  * about strength.
3948  */
3949 XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8 *XXH_RESTRICT input,
3950                                      const xxh_u8 *XXH_RESTRICT secret,
3951                                      xxh_u64                    seed64) {
3952 
3953       #if defined(__GNUC__) && !defined(__clang__)  /* GCC, not Clang */      \
3954           && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */          \
3955           && !defined(XXH_ENABLE_AUTOVECTORIZE)     /* Define to disable like \
3956                                                        XXH32 hack */
3957   /*
3958    * UGLY HACK:
3959    * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
3960    * slower code.
3961    *
3962    * By forcing seed64 into a register, we disrupt the cost model and
3963    * cause it to scalarize. See `XXH32_round()`
3964    *
3965    * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
3966    * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
3967    * GCC 9.2, despite both emitting scalar code.
3968    *
3969    * GCC generates much better scalar code than Clang for the rest of XXH3,
3970    * which is why finding a more optimal codepath is an interest.
3971    */
3972   XXH_COMPILER_GUARD(seed64);
3973       #endif
3974   {
3975 
3976     xxh_u64 const input_lo = XXH_readLE64(input);
3977     xxh_u64 const input_hi = XXH_readLE64(input + 8);
3978     return XXH3_mul128_fold64(input_lo ^ (XXH_readLE64(secret) + seed64),
3979                               input_hi ^ (XXH_readLE64(secret + 8) - seed64));
3980 
3981   }
3982 
3983 }
3984 
3985 /* For mid range keys, XXH3 uses a Mum-hash variant. */
3986 XXH_FORCE_INLINE XXH64_hash_t XXH3_len_17to128_64b(
3987     const xxh_u8 *XXH_RESTRICT input, size_t len,
3988     const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) {
3989 
3990   XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
3991   (void)secretSize;
3992   XXH_ASSERT(16 < len && len <= 128);
3993 
3994   {
3995 
3996     xxh_u64 acc = len * XXH_PRIME64_1;
3997     if (len > 32) {
3998 
3999       if (len > 64) {
4000 
4001         if (len > 96) {
4002 
4003           acc += XXH3_mix16B(input + 48, secret + 96, seed);
4004           acc += XXH3_mix16B(input + len - 64, secret + 112, seed);
4005 
4006         }
4007 
4008         acc += XXH3_mix16B(input + 32, secret + 64, seed);
4009         acc += XXH3_mix16B(input + len - 48, secret + 80, seed);
4010 
4011       }
4012 
4013       acc += XXH3_mix16B(input + 16, secret + 32, seed);
4014       acc += XXH3_mix16B(input + len - 32, secret + 48, seed);
4015 
4016     }
4017 
4018     acc += XXH3_mix16B(input + 0, secret + 0, seed);
4019     acc += XXH3_mix16B(input + len - 16, secret + 16, seed);
4020 
4021     return XXH3_avalanche(acc);
4022 
4023   }
4024 
4025 }
4026 
4027       #define XXH3_MIDSIZE_MAX 240
4028 
4029 XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b(
4030     const xxh_u8 *XXH_RESTRICT input, size_t len,
4031     const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) {
4032 
4033   XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
4034   (void)secretSize;
4035   XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
4036 
4037       #define XXH3_MIDSIZE_STARTOFFSET 3
4038       #define XXH3_MIDSIZE_LASTOFFSET 17
4039 
4040   {
4041 
4042     xxh_u64   acc = len * XXH_PRIME64_1;
4043     int const nbRounds = (int)len / 16;
4044     int       i;
4045     for (i = 0; i < 8; i++) {
4046 
4047       acc += XXH3_mix16B(input + (16 * i), secret + (16 * i), seed);
4048 
4049     }
4050 
4051     acc = XXH3_avalanche(acc);
4052     XXH_ASSERT(nbRounds >= 8);
4053       #if defined(__clang__)                                /* Clang */ \
4054           && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
4055           && !defined(XXH_ENABLE_AUTOVECTORIZE)        /* Define to disable */
4056         /*
4057          * UGLY HACK:
4058          * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
4059          * Everywhere else, it uses scalar code.
4060          *
4061          * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
4062          * would still be slower than UMAAL (see XXH_mult64to128).
4063          *
4064          * Unfortunately, Clang doesn't handle the long multiplies properly and
4065          * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
4066          * scalarized into an ugly mess of VMOV.32 instructions.
4067          *
4068          * This mess is difficult to avoid without turning autovectorization
4069          * off completely, but they are usually relatively minor and/or not
4070          * worth it to fix.
4071          *
4072          * This loop is the easiest to fix, as unlike XXH32, this pragma
4073          * _actually works_ because it is a loop vectorization instead of an
4074          * SLP vectorization.
4075          */
4076         #pragma clang loop vectorize(disable)
4077       #endif
4078     for (i = 8; i < nbRounds; i++) {
4079 
4080       acc +=
4081           XXH3_mix16B(input + (16 * i),
4082                       secret + (16 * (i - 8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
4083 
4084     }
4085 
4086     /* last bytes */
4087     acc += XXH3_mix16B(input + len - 16,
4088                        secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET,
4089                        seed);
4090     return XXH3_avalanche(acc);
4091 
4092   }
4093 
4094 }
4095 
4096     /* =======     Long Keys     ======= */
4097 
4098       #define XXH_STRIPE_LEN 64
4099       #define XXH_SECRET_CONSUME_RATE \
4100         8               /* nb of secret bytes consumed at each accumulation */
4101       #define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))
4102 
4103       #ifdef XXH_OLD_NAMES
4104         #define STRIPE_LEN XXH_STRIPE_LEN
4105         #define ACC_NB XXH_ACC_NB
4106       #endif
4107 
4108 XXH_FORCE_INLINE void XXH_writeLE64(void *dst, xxh_u64 v64) {
4109 
4110   if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
4111   memcpy(dst, &v64, sizeof(v64));
4112 
4113 }
4114 
4115       /* Several intrinsic functions below are supposed to accept __int64 as
4116        * argument, as documented in
4117        * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .
4118        * However, several environments do not define __int64 type,
4119        * requiring a workaround.
4120        */
4121       #if !defined(__VMS) &&                                     \
4122           (defined(__cplusplus) || (defined(__STDC_VERSION__) && \
4123                                     (__STDC_VERSION__ >= 199901L) /* C99 */))
4124 typedef int64_t xxh_i64;
4125       #else
4126 /* the following type must have a width of 64-bit */
4127 typedef long long xxh_i64;
4128       #endif
4129 
4130     /*
4131      * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the
4132      * most optimized.
4133      *
4134      * It is a hardened version of UMAC, based off of FARSH's implementation.
4135      *
4136      * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
4137      * implementations, and it is ridiculously fast.
4138      *
4139      * We harden it by mixing the original input to the accumulators as well as
4140      * the product.
4141      *
4142      * This means that in the (relatively likely) case of a multiply by zero,
4143      * the original input is preserved.
4144      *
4145      * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
4146      * cross-pollination, as otherwise the upper and lower halves would be
4147      * essentially independent.
4148      *
4149      * This doesn't matter on 64-bit hashes since they all get merged together
4150      * in the end, so we skip the extra step.
4151      *
4152      * Both XXH3_64bits and XXH3_128bits use this subroutine.
4153      */
4154 
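    /*
     * Rough scalar sketch of what one accumulation does per 8-byte lane of
     * the 64-byte stripe (illustration only; the SIMD paths selected below
     * implement the same operation lane-wise):
     *
     *   xxh_u64 const data_val = XXH_readLE64(input + 8 * lane);
     *   xxh_u64 const data_key = data_val ^ XXH_readLE64(secret + 8 * lane);
     *   acc[lane ^ 1] += data_val;                          // add swapped input
     *   acc[lane] += XXH_mult32to64(data_key & 0xFFFFFFFF,  // UMAC-style
     *                               data_key >> 32);        // 32x32->64 product
     */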
4155       #if (XXH_VECTOR == XXH_AVX512) || \
4156           (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0)
4157 
4158         #ifndef XXH_TARGET_AVX512
4159           #define XXH_TARGET_AVX512             /* disable attribute target */
4160         #endif
4161 
4162 XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_accumulate_512_avx512(
4163     void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
4164     const void *XXH_RESTRICT secret) {
4165 
4166   __m512i *const xacc = (__m512i *)acc;
4167   XXH_ASSERT((((size_t)acc) & 63) == 0);
4168   XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
4169 
4170   {
4171 
4172     /* data_vec    = input[0]; */
4173     __m512i const data_vec = _mm512_loadu_si512(input);
4174     /* key_vec     = secret[0]; */
4175     __m512i const key_vec = _mm512_loadu_si512(secret);
4176     /* data_key    = data_vec ^ key_vec; */
4177     __m512i const data_key = _mm512_xor_si512(data_vec, key_vec);
4178     /* data_key_lo = data_key >> 32; */
4179     __m512i const data_key_lo =
4180         _mm512_shuffle_epi32(data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
4181     /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
4182     __m512i const product = _mm512_mul_epu32(data_key, data_key_lo);
4183     /* xacc[0] += swap(data_vec); */
4184     __m512i const data_swap =
4185         _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2));
4186     __m512i const sum = _mm512_add_epi64(*xacc, data_swap);
4187     /* xacc[0] += product; */
4188     *xacc = _mm512_add_epi64(product, sum);
4189 
4190   }
4191 
4192 }
4193 
4194 /*
4195  * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
4196  *
4197  * Multiplication isn't perfect, as explained by Google in HighwayHash:
4198  *
4199  *  // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
4200  *  // varying degrees. In descending order of goodness, bytes
4201  *  // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
4202  *  // As expected, the upper and lower bytes are much worse.
4203  *
4204  * Source:
4205  * https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291
4206  *
4207  * Since our algorithm uses a pseudorandom secret to add some variance into the
4208  * mix, we don't need to (or want to) mix as often or as much as HighwayHash
4209  * does.
4210  *
4211  * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid
4212  * extraction.
4213  *
4214  * Both XXH3_64bits and XXH3_128bits use this subroutine.
4215  */
4216 
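/*
 * Rough scalar sketch of the scramble applied to each 64-bit lane
 * (illustration only; the SIMD code below performs the same steps in
 * parallel):
 *
 *   xxh_u64 acc64 = acc[lane];
 *   acc64 ^= acc64 >> 47;                      // xorshift
 *   acc64 ^= XXH_readLE64(secret + 8 * lane);  // mix in the secret
 *   acc64 *= XXH_PRIME32_1;                    // odd 32-bit prime multiply
 *   acc[lane] = acc64;
 */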
4217 XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_scrambleAcc_avx512(
4218     void *XXH_RESTRICT acc, const void *XXH_RESTRICT secret) {
4219 
4220   XXH_ASSERT((((size_t)acc) & 63) == 0);
4221   XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
4222   {
4223 
4224     __m512i *const xacc = (__m512i *)acc;
4225     const __m512i  prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
4226 
4227     /* xacc[0] ^= (xacc[0] >> 47) */
4228     __m512i const acc_vec = *xacc;
4229     __m512i const shifted = _mm512_srli_epi64(acc_vec, 47);
4230     __m512i const data_vec = _mm512_xor_si512(acc_vec, shifted);
4231     /* xacc[0] ^= secret; */
4232     __m512i const key_vec = _mm512_loadu_si512(secret);
4233     __m512i const data_key = _mm512_xor_si512(data_vec, key_vec);
4234 
4235     /* xacc[0] *= XXH_PRIME32_1; */
4236     __m512i const data_key_hi =
4237         _mm512_shuffle_epi32(data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
4238     __m512i const prod_lo = _mm512_mul_epu32(data_key, prime32);
4239     __m512i const prod_hi = _mm512_mul_epu32(data_key_hi, prime32);
4240     *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
4241 
4242   }
4243 
4244 }
4245 
4246 XXH_FORCE_INLINE XXH_TARGET_AVX512 void XXH3_initCustomSecret_avx512(
4247     void *XXH_RESTRICT customSecret, xxh_u64 seed64) {
4248 
4249   XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);
4250   XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);
4251   XXH_ASSERT(((size_t)customSecret & 63) == 0);
4252   (void)(&XXH_writeLE64);
4253   {
4254 
4255     int const     nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
4256     __m512i const seed = _mm512_mask_set1_epi64(
4257         _mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64));
4258 
4259     const __m512i *const src = (const __m512i *)((const void *)XXH3_kSecret);
4260     __m512i *const       dest = (__m512i *)customSecret;
4261     int                  i;
4262     XXH_ASSERT(((size_t)src & 63) == 0);               /* control alignment */
4263     XXH_ASSERT(((size_t)dest & 63) == 0);
4264     for (i = 0; i < nbRounds; ++i) {
4265 
4266       /* GCC has a bug: _mm512_stream_load_si512 accepts 'void*', not 'void
4267        * const*', so this would warn "discards 'const' qualifier". */
4268       union {
4269 
4270         const __m512i *cp;
4271         void          *p;
4272 
4273       } remote_const_void;
4274 
4275       remote_const_void.cp = src + i;
4276       dest[i] =
4277           _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed);
4278 
4279     }
4280 
4281   }
4282 
4283 }
4284 
4285       #endif
4286 
4287       #if (XXH_VECTOR == XXH_AVX2) || \
4288           (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0)
4289 
4290         #ifndef XXH_TARGET_AVX2
4291           #define XXH_TARGET_AVX2               /* disable attribute target */
4292         #endif
4293 
4294 XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_accumulate_512_avx2(
4295     void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
4296     const void *XXH_RESTRICT secret) {
4297 
4298   XXH_ASSERT((((size_t)acc) & 31) == 0);
4299   {
4300 
4301     __m256i *const xacc = (__m256i *)acc;
4302     /* Unaligned. This is mainly for pointer arithmetic, and because
4303      * _mm256_loadu_si256 requires a const __m256i * pointer for some reason.
4304      */
4305     const __m256i *const xinput = (const __m256i *)input;
4306     /* Unaligned. This is mainly for pointer arithmetic, and because
4307      * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
4308     const __m256i *const xsecret = (const __m256i *)secret;
4309 
4310     size_t i;
4311     for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) {
4312 
4313       /* data_vec    = xinput[i]; */
4314       __m256i const data_vec = _mm256_loadu_si256(xinput + i);
4315       /* key_vec     = xsecret[i]; */
4316       __m256i const key_vec = _mm256_loadu_si256(xsecret + i);
4317       /* data_key    = data_vec ^ key_vec; */
4318       __m256i const data_key = _mm256_xor_si256(data_vec, key_vec);
4319       /* data_key_lo = data_key >> 32; */
4320       __m256i const data_key_lo =
4321           _mm256_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
4322       /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
4323       __m256i const product = _mm256_mul_epu32(data_key, data_key_lo);
4324       /* xacc[i] += swap(data_vec); */
4325       __m256i const data_swap =
4326           _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
4327       __m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
4328       /* xacc[i] += product; */
4329       xacc[i] = _mm256_add_epi64(product, sum);
4330 
4331     }
4332 
4333   }
4334 
4335 }
4336 
4337 XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_scrambleAcc_avx2(
4338     void *XXH_RESTRICT acc, const void *XXH_RESTRICT secret) {
4339 
4340   XXH_ASSERT((((size_t)acc) & 31) == 0);
4341   {
4342 
4343     __m256i *const xacc = (__m256i *)acc;
4344     /* Unaligned. This is mainly for pointer arithmetic, and because
4345      * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
4346     const __m256i *const xsecret = (const __m256i *)secret;
4347     const __m256i        prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);
4348 
4349     size_t i;
4350     for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) {
4351 
4352       /* xacc[i] ^= (xacc[i] >> 47) */
4353       __m256i const acc_vec = xacc[i];
4354       __m256i const shifted = _mm256_srli_epi64(acc_vec, 47);
4355       __m256i const data_vec = _mm256_xor_si256(acc_vec, shifted);
4356       /* xacc[i] ^= xsecret; */
4357       __m256i const key_vec = _mm256_loadu_si256(xsecret + i);
4358       __m256i const data_key = _mm256_xor_si256(data_vec, key_vec);
4359 
4360       /* xacc[i] *= XXH_PRIME32_1; */
4361       __m256i const data_key_hi =
4362           _mm256_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
4363       __m256i const prod_lo = _mm256_mul_epu32(data_key, prime32);
4364       __m256i const prod_hi = _mm256_mul_epu32(data_key_hi, prime32);
4365       xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
4366 
4367     }
4368 
4369   }
4370 
4371 }
4372 
4373 XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(
4374     void *XXH_RESTRICT customSecret, xxh_u64 seed64) {
4375 
4376   XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);
4377   XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);
4378   XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);
4379   (void)(&XXH_writeLE64);
4380   XXH_PREFETCH(customSecret);
4381   {
4382 
4383     __m256i const seed =
4384         _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64,
4385                           (xxh_i64)(0U - seed64), (xxh_i64)seed64);
4386 
4387     const __m256i *const src = (const __m256i *)((const void *)XXH3_kSecret);
4388     __m256i             *dest = (__m256i *)customSecret;
4389 
4390         #if defined(__GNUC__) || defined(__clang__)
4391     /*
4392      * On GCC & Clang, marking 'dest' as modified causes the compiler to:
4393      *   - not extract the secret from SSE registers in the internal loop
4394      *   - use fewer registers, and avoid pushing these registers onto the stack
4395      */
4396     XXH_COMPILER_GUARD(dest);
4397         #endif
4398     XXH_ASSERT(((size_t)src & 31) == 0);               /* control alignment */
4399     XXH_ASSERT(((size_t)dest & 31) == 0);
4400 
4401     /* GCC at -O2 needs the loop to be unrolled manually */
4402     dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src + 0), seed);
4403     dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src + 1), seed);
4404     dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src + 2), seed);
4405     dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src + 3), seed);
4406     dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src + 4), seed);
4407     dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src + 5), seed);
4408 
4409   }
4410 
4411 }
4412 
4413       #endif
4414 
4415       /* x86dispatch always generates SSE2 */
4416       #if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)
4417 
4418         #ifndef XXH_TARGET_SSE2
4419           #define XXH_TARGET_SSE2               /* disable attribute target */
4420         #endif
4421 
4422 XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_accumulate_512_sse2(
4423     void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
4424     const void *XXH_RESTRICT secret) {
4425 
4426   /* SSE2 is just a half-scale version of the AVX2 version. */
4427   XXH_ASSERT((((size_t)acc) & 15) == 0);
4428   {
4429 
4430     __m128i *const xacc = (__m128i *)acc;
4431     /* Unaligned. This is mainly for pointer arithmetic, and because
4432      * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
4433     const __m128i *const xinput = (const __m128i *)input;
4434     /* Unaligned. This is mainly for pointer arithmetic, and because
4435      * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
4436     const __m128i *const xsecret = (const __m128i *)secret;
4437 
4438     size_t i;
4439     for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {
4440 
4441       /* data_vec    = xinput[i]; */
4442       __m128i const data_vec = _mm_loadu_si128(xinput + i);
4443       /* key_vec     = xsecret[i]; */
4444       __m128i const key_vec = _mm_loadu_si128(xsecret + i);
4445       /* data_key    = data_vec ^ key_vec; */
4446       __m128i const data_key = _mm_xor_si128(data_vec, key_vec);
4447       /* data_key_lo = data_key >> 32; */
4448       __m128i const data_key_lo =
4449           _mm_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
4450       /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
4451       __m128i const product = _mm_mul_epu32(data_key, data_key_lo);
4452       /* xacc[i] += swap(data_vec); */
4453       __m128i const data_swap =
4454           _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
4455       __m128i const sum = _mm_add_epi64(xacc[i], data_swap);
4456       /* xacc[i] += product; */
4457       xacc[i] = _mm_add_epi64(product, sum);
4458 
4459     }
4460 
4461   }
4462 
4463 }
4464 
4465 XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_scrambleAcc_sse2(
4466     void *XXH_RESTRICT acc, const void *XXH_RESTRICT secret) {
4467 
4468   XXH_ASSERT((((size_t)acc) & 15) == 0);
4469   {
4470 
4471     __m128i *const xacc = (__m128i *)acc;
4472     /* Unaligned. This is mainly for pointer arithmetic, and because
4473      * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
4474     const __m128i *const xsecret = (const __m128i *)secret;
4475     const __m128i        prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);
4476 
4477     size_t i;
4478     for (i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {
4479 
4480       /* xacc[i] ^= (xacc[i] >> 47) */
4481       __m128i const acc_vec = xacc[i];
4482       __m128i const shifted = _mm_srli_epi64(acc_vec, 47);
4483       __m128i const data_vec = _mm_xor_si128(acc_vec, shifted);
4484       /* xacc[i] ^= xsecret[i]; */
4485       __m128i const key_vec = _mm_loadu_si128(xsecret + i);
4486       __m128i const data_key = _mm_xor_si128(data_vec, key_vec);
4487 
4488       /* xacc[i] *= XXH_PRIME32_1; */
4489       __m128i const data_key_hi =
4490           _mm_shuffle_epi32(data_key, _MM_SHUFFLE(0, 3, 0, 1));
4491       __m128i const prod_lo = _mm_mul_epu32(data_key, prime32);
4492       __m128i const prod_hi = _mm_mul_epu32(data_key_hi, prime32);
4493       xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
4494 
4495     }
4496 
4497   }
4498 
4499 }
4500 
4501 XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(
4502     void *XXH_RESTRICT customSecret, xxh_u64 seed64) {
4503 
4504   XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
4505   (void)(&XXH_writeLE64);
4506   {
4507 
4508     int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);
4509 
4510         #if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
4511     /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */
4512     XXH_ALIGN(16)
4513     const xxh_i64 seed64x2[2] = {(xxh_i64)seed64, (xxh_i64)(0U - seed64)};
4514     __m128i const seed = _mm_load_si128((__m128i const *)seed64x2);
4515         #else
4516     __m128i const seed =
4517         _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64);
4518         #endif
4519     int i;
4520 
4521     const void *const src16 = XXH3_kSecret;
4522     __m128i          *dst16 = (__m128i *)customSecret;
4523         #if defined(__GNUC__) || defined(__clang__)
4524     /*
4525      * On GCC & Clang, marking 'dst16' as modified causes the compiler to:
4526      *   - not extract the secret from SSE registers in the internal loop
4527      *   - use fewer registers, and avoid pushing these registers onto the stack
4528      */
4529     XXH_COMPILER_GUARD(dst16);
4530         #endif
4531     XXH_ASSERT(((size_t)src16 & 15) == 0);             /* control alignment */
4532     XXH_ASSERT(((size_t)dst16 & 15) == 0);
4533 
4534     for (i = 0; i < nbRounds; ++i) {
4535 
4536       dst16[i] =
4537           _mm_add_epi64(_mm_load_si128((const __m128i *)src16 + i), seed);
4538 
4539     }
4540 
4541   }
4542 
4543 }
4544 
4545       #endif
4546 
4547       #if (XXH_VECTOR == XXH_NEON)
4548 
4549 XXH_FORCE_INLINE void XXH3_accumulate_512_neon(
4550     void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
4551     const void *XXH_RESTRICT secret) {
4552 
4553   XXH_ASSERT((((size_t)acc) & 15) == 0);
4554   {
4555 
4556     uint64x2_t *const xacc = (uint64x2_t *)acc;
4557     /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7.
4558      */
4559     uint8_t const *const xinput = (const uint8_t *)input;
4560     uint8_t const *const xsecret = (const uint8_t *)secret;
4561 
4562     size_t i;
4563     for (i = 0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) {
4564 
4565       /* data_vec = xinput[i]; */
4566       uint8x16_t data_vec = vld1q_u8(xinput + (i * 16));
4567       /* key_vec  = xsecret[i];  */
4568       uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
4569       uint64x2_t data_key;
4570       uint32x2_t data_key_lo, data_key_hi;
4571       /* xacc[i] += swap(data_vec); */
4572       uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
4573       uint64x2_t const swapped = vextq_u64(data64, data64, 1);
4574       xacc[i] = vaddq_u64(xacc[i], swapped);
4575       /* data_key = data_vec ^ key_vec; */
4576       data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
4577       /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
4578        * data_key_hi = (uint32x2_t) (data_key >> 32);
4579        * data_key = UNDEFINED; */
4580       XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
4581       /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
4582       xacc[i] = vmlal_u32(xacc[i], data_key_lo, data_key_hi);
4583 
4584     }
4585 
4586   }
4587 
4588 }
4589 
4590 XXH_FORCE_INLINE void XXH3_scrambleAcc_neon(void *XXH_RESTRICT       acc,
4591                                             const void *XXH_RESTRICT secret) {
4592 
4593   XXH_ASSERT((((size_t)acc) & 15) == 0);
4594 
4595   {
4596 
4597     uint64x2_t    *xacc = (uint64x2_t *)acc;
4598     uint8_t const *xsecret = (uint8_t const *)secret;
4599     uint32x2_t     prime = vdup_n_u32(XXH_PRIME32_1);
4600 
4601     size_t i;
4602     for (i = 0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) {
4603 
4604       /* xacc[i] ^= (xacc[i] >> 47); */
4605       uint64x2_t acc_vec = xacc[i];
4606       uint64x2_t shifted = vshrq_n_u64(acc_vec, 47);
4607       uint64x2_t data_vec = veorq_u64(acc_vec, shifted);
4608 
4609       /* xacc[i] ^= xsecret[i]; */
4610       uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
4611       uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec));
4612 
4613       /* xacc[i] *= XXH_PRIME32_1 */
4614       uint32x2_t data_key_lo, data_key_hi;
4615       /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF);
4616        * data_key_hi = (uint32x2_t) (xacc[i] >> 32);
4617        * xacc[i] = UNDEFINED; */
4618       XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
4619       { /*
4620          * prod_hi = (data_key >> 32) * XXH_PRIME32_1;
4621          *
4622          * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will
4623          * incorrectly "optimize" this:
4624          *   tmp     = vmul_u32(vmovn_u64(a), vmovn_u64(b));
4625          *   shifted = vshll_n_u32(tmp, 32);
4626          * to this:
4627          *   tmp     = "vmulq_u64"(a, b); // no such thing!
4628          *   shifted = vshlq_n_u64(tmp, 32);
4629          *
4630          * However, unlike SSE, Clang lacks a 64-bit multiply routine
4631          * for NEON, and it scalarizes two 64-bit multiplies instead.
4632          *
4633          * vmull_u32 has the same timing as vmul_u32, and it avoids
4634          * this bug completely.
4635          * See https://bugs.llvm.org/show_bug.cgi?id=39967
4636          */
4637         uint64x2_t prod_hi = vmull_u32(data_key_hi, prime);
4638         /* xacc[i] = prod_hi << 32; */
4639         xacc[i] = vshlq_n_u64(prod_hi, 32);
4640         /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */
4641         xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime);
4642 
4643       }
4644 
4645     }
4646 
4647   }
4648 
4649 }
4650 
4651       #endif
4652 
4653       #if (XXH_VECTOR == XXH_VSX)
4654 
4655 XXH_FORCE_INLINE void XXH3_accumulate_512_vsx(void *XXH_RESTRICT       acc,
4656                                               const void *XXH_RESTRICT input,
4657                                               const void *XXH_RESTRICT secret) {
4658 
4659   xxh_u64x2 *const       xacc = (xxh_u64x2 *)acc;       /* presumed aligned */
4660   xxh_u64x2 const *const xinput =
4661       (xxh_u64x2 const *)input;                 /* no alignment restriction */
4662   xxh_u64x2 const *const xsecret =
4663       (xxh_u64x2 const *)secret;                /* no alignment restriction */
4664   xxh_u64x2 const v32 = {32, 32};
4665   size_t          i;
4666   for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
4667 
4668     /* data_vec = xinput[i]; */
4669     xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);
4670     /* key_vec = xsecret[i]; */
4671     xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
4672     xxh_u64x2 const data_key = data_vec ^ key_vec;
4673     /* shuffled = (data_key << 32) | (data_key >> 32); */
4674     xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
4675     /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled &
4676      * 0xFFFFFFFF); */
4677     xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
4678     xacc[i] += product;
4679 
4680           /* swap high and low halves */
4681         #ifdef __s390x__
4682     xacc[i] += vec_permi(data_vec, data_vec, 2);
4683         #else
4684     xacc[i] += vec_xxpermdi(data_vec, data_vec, 2);
4685         #endif
4686 
4687   }
4688 
4689 }
4690 
4691 XXH_FORCE_INLINE void XXH3_scrambleAcc_vsx(void *XXH_RESTRICT       acc,
4692                                            const void *XXH_RESTRICT secret) {
4693 
4694   XXH_ASSERT((((size_t)acc) & 15) == 0);
4695 
4696   {
4697 
4698     xxh_u64x2 *const       xacc = (xxh_u64x2 *)acc;
4699     const xxh_u64x2 *const xsecret = (const xxh_u64x2 *)secret;
4700     /* constants */
4701     xxh_u64x2 const v32 = {32, 32};
4702     xxh_u64x2 const v47 = {47, 47};
4703     xxh_u32x4 const prime = {XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1,
4704                              XXH_PRIME32_1};
4705     size_t          i;
4706     for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
4707 
4708       /* xacc[i] ^= (xacc[i] >> 47); */
4709       xxh_u64x2 const acc_vec = xacc[i];
4710       xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
4711 
4712       /* xacc[i] ^= xsecret[i]; */
4713       xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
4714       xxh_u64x2 const data_key = data_vec ^ key_vec;
4715 
4716       /* xacc[i] *= XXH_PRIME32_1 */
4717       /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime &
4718        * 0xFFFFFFFF);  */
4719       xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime);
4720       /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32);  */
4721       xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime);
4722       xacc[i] = prod_odd + (prod_even << v32);
4723 
4724     }
4725 
4726   }
4727 
4728 }
4729 
4730       #endif
4731 
4732 /* scalar variants - universal */
4733 
4734 XXH_FORCE_INLINE void XXH3_accumulate_512_scalar(
4735     void *XXH_RESTRICT acc, const void *XXH_RESTRICT input,
4736     const void *XXH_RESTRICT secret) {
4737 
4738   xxh_u64 *const      xacc = (xxh_u64 *)acc;            /* presumed aligned */
4739   const xxh_u8 *const xinput =
4740       (const xxh_u8 *)input;                    /* no alignment restriction */
4741   const xxh_u8 *const xsecret =
4742       (const xxh_u8 *)secret;                   /* no alignment restriction */
4743   size_t i;
4744   XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN - 1)) == 0);
4745   for (i = 0; i < XXH_ACC_NB; i++) {
4746 
4747     xxh_u64 const data_val = XXH_readLE64(xinput + 8 * i);
4748     xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i * 8);
4749     xacc[i ^ 1] += data_val;                         /* swap adjacent lanes */
4750     xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
4751 
4752   }
4753 
4754 }
4755 
4756 XXH_FORCE_INLINE void XXH3_scrambleAcc_scalar(void *XXH_RESTRICT       acc,
4757                                               const void *XXH_RESTRICT secret) {
4758 
4759   xxh_u64 *const      xacc = (xxh_u64 *)acc;            /* presumed aligned */
4760   const xxh_u8 *const xsecret =
4761       (const xxh_u8 *)secret;                   /* no alignment restriction */
4762   size_t i;
4763   XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN - 1)) == 0);
4764   for (i = 0; i < XXH_ACC_NB; i++) {
4765 
4766     xxh_u64 const key64 = XXH_readLE64(xsecret + 8 * i);
4767     xxh_u64       acc64 = xacc[i];
4768     acc64 = XXH_xorshift64(acc64, 47);
4769     acc64 ^= key64;
4770     acc64 *= XXH_PRIME32_1;
4771     xacc[i] = acc64;
4772 
4773   }
4774 
4775 }
4776 
4777 XXH_FORCE_INLINE void XXH3_initCustomSecret_scalar(
4778     void *XXH_RESTRICT customSecret, xxh_u64 seed64) {
4779 
4780   /*
4781    * We need a separate pointer for the hack below,
4782    * which requires a non-const pointer.
4783    * Any decent compiler will optimize this out otherwise.
4784    */
4785   const xxh_u8 *kSecretPtr = XXH3_kSecret;
4786   XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
4787 
4788       #if defined(__clang__) && defined(__aarch64__)
4789   /*
4790    * UGLY HACK:
4791    * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are
4792    * placed sequentially, in order, at the top of the unrolled loop.
4793    *
4794    * While MOVK is great for generating constants (2 cycles for a 64-bit
4795    * constant compared to 4 cycles for LDR), long MOVK chains stall the
4796    * integer pipelines:
4797    *   I   L   S
4798    * MOVK
4799    * MOVK
4800    * MOVK
4801    * MOVK
4802    * ADD
4803    * SUB      STR
4804    *          STR
4805    * By forcing loads from memory (as the asm line causes Clang to assume
4806    * that kSecretPtr has been changed), the pipelines are used more
4807    * efficiently:
4808    *   I   L   S
4809    *      LDR
4810    *  ADD LDR
4811    *  SUB     STR
4812    *          STR
4813    * XXH3_64bits_withSeed, len == 256, Snapdragon 835
4814    *   without hack: 2654.4 MB/s
4815    *   with hack:    3202.9 MB/s
4816    */
4817   XXH_COMPILER_GUARD(kSecretPtr);
4818       #endif
4819   /*
4820    * Note: in debug mode, this overrides the asm optimization
4821    * and Clang will emit MOVK chains again.
4822    */
4823   XXH_ASSERT(kSecretPtr == XXH3_kSecret);
4824 
4825   {
4826 
4827     int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
4828     int       i;
4829     for (i = 0; i < nbRounds; i++) {
4830 
4831       /*
4832        * The asm hack causes Clang to assume that kSecretPtr aliases with
4833        * customSecret, and on aarch64, this prevented LDP from merging two
4834        * loads together for free. Putting the loads together before the stores
4835        * properly generates LDP.
4836        */
4837       xxh_u64 lo = XXH_readLE64(kSecretPtr + 16 * i) + seed64;
4838       xxh_u64 hi = XXH_readLE64(kSecretPtr + 16 * i + 8) - seed64;
4839       XXH_writeLE64((xxh_u8 *)customSecret + 16 * i, lo);
4840       XXH_writeLE64((xxh_u8 *)customSecret + 16 * i + 8, hi);
4841 
4842     }
4843 
4844   }
4845 
4846 }
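/*
 * Illustrative note (not part of the original header): for every 16-byte
 * chunk of the default secret, the derived secret is
 *
 *   customSecret[16*i ..]     = kSecret[16*i ..]     + seed64   (low  8 bytes)
 *   customSecret[16*i + 8 ..] = kSecret[16*i + 8 ..] - seed64   (high 8 bytes)
 *
 * which matches the +seed / -seed lane pattern used by the SIMD
 * initCustomSecret variants above (e.g. the 0xAA lane mask in AVX-512).
 */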
4847 
4848 typedef void (*XXH3_f_accumulate_512)(void *XXH_RESTRICT, const void *,
4849                                       const void *);
4850 typedef void (*XXH3_f_scrambleAcc)(void *XXH_RESTRICT, const void *);
4851 typedef void (*XXH3_f_initCustomSecret)(void *XXH_RESTRICT, xxh_u64);
4852 
4853       #if (XXH_VECTOR == XXH_AVX512)
4854 
4855         #define XXH3_accumulate_512 XXH3_accumulate_512_avx512
4856         #define XXH3_scrambleAcc XXH3_scrambleAcc_avx512
4857         #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
4858 
4859       #elif (XXH_VECTOR == XXH_AVX2)
4860 
4861         #define XXH3_accumulate_512 XXH3_accumulate_512_avx2
4862         #define XXH3_scrambleAcc XXH3_scrambleAcc_avx2
4863         #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
4864 
4865       #elif (XXH_VECTOR == XXH_SSE2)
4866 
4867         #define XXH3_accumulate_512 XXH3_accumulate_512_sse2
4868         #define XXH3_scrambleAcc XXH3_scrambleAcc_sse2
4869         #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
4870 
4871       #elif (XXH_VECTOR == XXH_NEON)
4872 
4873         #define XXH3_accumulate_512 XXH3_accumulate_512_neon
4874         #define XXH3_scrambleAcc XXH3_scrambleAcc_neon
4875         #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
4876 
4877       #elif (XXH_VECTOR == XXH_VSX)
4878 
4879         #define XXH3_accumulate_512 XXH3_accumulate_512_vsx
4880         #define XXH3_scrambleAcc XXH3_scrambleAcc_vsx
4881         #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
4882 
4883       #else                                                       /* scalar */
4884 
4885         #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
4886         #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
4887         #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
4888 
4889       #endif
4890 
4891       #ifndef XXH_PREFETCH_DIST
4892         #ifdef __clang__
4893           #define XXH_PREFETCH_DIST 320
4894         #else
4895           #if (XXH_VECTOR == XXH_AVX512)
4896             #define XXH_PREFETCH_DIST 512
4897           #else
4898             #define XXH_PREFETCH_DIST 384
4899           #endif
4900         #endif                                                 /* __clang__ */
4901       #endif                                           /* XXH_PREFETCH_DIST */
4902 
4903 /*
4904  * XXH3_accumulate()
4905  * Loops over XXH3_accumulate_512().
4906  * Assumption: nbStripes will not overflow the secret size
4907  */
4908 XXH_FORCE_INLINE void XXH3_accumulate(xxh_u64 *XXH_RESTRICT      acc,
4909                                       const xxh_u8 *XXH_RESTRICT input,
4910                                       const xxh_u8 *XXH_RESTRICT secret,
4911                                       size_t                     nbStripes,
4912                                       XXH3_f_accumulate_512      f_acc512) {
4913 
4914   size_t n;
4915   for (n = 0; n < nbStripes; n++) {
4916 
4917     const xxh_u8 *const in = input + n * XXH_STRIPE_LEN;
4918     XXH_PREFETCH(in + XXH_PREFETCH_DIST);
4919     f_acc512(acc, in, secret + n * XXH_SECRET_CONSUME_RATE);
4920 
4921   }
4922 
4923 }
4924 
4925 XXH_FORCE_INLINE void XXH3_hashLong_internal_loop(
4926     xxh_u64 *XXH_RESTRICT acc, const xxh_u8 *XXH_RESTRICT input, size_t len,
4927     const xxh_u8 *XXH_RESTRICT secret, size_t secretSize,
4928     XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) {
4929 
4930   size_t const nbStripesPerBlock =
4931       (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
4932   size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock;
4933   size_t const nb_blocks = (len - 1) / block_len;
4934 
4935   size_t n;
4936 
4937   XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
4938 
4939   for (n = 0; n < nb_blocks; n++) {
4940 
4941     XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock,
4942                     f_acc512);
4943     f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);
4944 
4945   }
4946 
4947   /* last partial block */
4948   XXH_ASSERT(len > XXH_STRIPE_LEN);
4949   {
4950 
4951     size_t const nbStripes =
4952         ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
4953     XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
4954     XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes,
4955                     f_acc512);
4956 
4957     /* last stripe */
4958     {
4959 
4960       const xxh_u8 *const p = input + len - XXH_STRIPE_LEN;
4961       #define XXH_SECRET_LASTACC_START                                       \
4962         7 /* not aligned on 8, last secret is different from acc & scrambler \
4963            */
4964       f_acc512(acc, p,
4965                secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
4966 
4967     }
4968 
4969   }
4970 
4971 }
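/*
 * Illustrative numbers (assuming the usual constants in this file:
 * XXH_STRIPE_LEN == 64, XXH_SECRET_CONSUME_RATE == 8, and the default
 * 192-byte secret): nbStripesPerBlock = (192 - 64) / 8 = 16, so
 * block_len = 64 * 16 = 1024. Long inputs are therefore processed in
 * 1024-byte blocks, with one scramble of the accumulators after each full
 * block, followed by a final partial block and a specially-keyed last stripe.
 */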
4972 
4973 XXH_FORCE_INLINE xxh_u64 XXH3_mix2Accs(const xxh_u64 *XXH_RESTRICT acc,
4974                                        const xxh_u8 *XXH_RESTRICT  secret) {
4975 
4976   return XXH3_mul128_fold64(acc[0] ^ XXH_readLE64(secret),
4977                             acc[1] ^ XXH_readLE64(secret + 8));
4978 
4979 }
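/*
 * Illustrative sketch: XXH3_mul128_fold64(a, b), defined earlier in this
 * file, computes the full 128-bit product a * b and folds it by XORing the
 * high and low 64-bit halves. So the mix above is roughly
 *
 *   fold64((acc[0] ^ secret_lo) * (acc[1] ^ secret_hi))
 *
 * where secret_lo / secret_hi are hypothetical names for the two 8-byte
 * little-endian reads of the secret.
 */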
4980 
4981 static XXH64_hash_t XXH3_mergeAccs(const xxh_u64 *XXH_RESTRICT acc,
4982                                    const xxh_u8 *XXH_RESTRICT  secret,
4983                                    xxh_u64                     start) {
4984 
4985   xxh_u64 result64 = start;
4986   size_t  i = 0;
4987 
4988   for (i = 0; i < 4; i++) {
4989 
4990     result64 += XXH3_mix2Accs(acc + 2 * i, secret + 16 * i);
4991       #if defined(__clang__)                                /* Clang */ \
4992           && (defined(__arm__) || defined(__thumb__))       /* ARMv7 */ \
4993           && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
4994           && !defined(XXH_ENABLE_AUTOVECTORIZE)        /* Define to disable */
4995     /*
4996      * UGLY HACK:
4997      * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
4998      * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.
4999      * XXH3_64bits, len == 256, Snapdragon 835:
5000      *   without hack: 2063.7 MB/s
5001      *   with hack:    2560.7 MB/s
5002      */
5003     XXH_COMPILER_GUARD(result64);
5004       #endif
5005 
5006   }
5007 
5008   return XXH3_avalanche(result64);
5009 
5010 }
5011 
5012       #define XXH3_INIT_ACC                                              \
5013         {                                                                \
5014                                                                          \
5015           XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3,    \
5016               XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 \
5017                                                                          \
5018         }
5019 
5020 XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_internal(
5021     const void *XXH_RESTRICT input, size_t len, const void *XXH_RESTRICT secret,
5022     size_t secretSize, XXH3_f_accumulate_512 f_acc512,
5023     XXH3_f_scrambleAcc f_scramble) {
5024 
5025   XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
5026 
5027   XXH3_hashLong_internal_loop(acc, (const xxh_u8 *)input, len,
5028                               (const xxh_u8 *)secret, secretSize, f_acc512,
5029                               f_scramble);
5030 
5031   /* converge into final hash */
5032   XXH_STATIC_ASSERT(sizeof(acc) == 64);
5033       /* do not align on 8, so that the secret is different from the accumulator
5034        */
5035       #define XXH_SECRET_MERGEACCS_START 11
5036   XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
5037   return XXH3_mergeAccs(acc,
5038                         (const xxh_u8 *)secret + XXH_SECRET_MERGEACCS_START,
5039                         (xxh_u64)len * XXH_PRIME64_1);
5040 
5041 }
5042 
5043 /*
5044  * It's important for performance that XXH3_hashLong is not inlined.
5045  */
5046 XXH_NO_INLINE XXH64_hash_t XXH3_hashLong_64b_withSecret(
5047     const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
5048     const xxh_u8 *XXH_RESTRICT secret, size_t secretLen) {
5049 
5050   (void)seed64;
5051   return XXH3_hashLong_64b_internal(input, len, secret, secretLen,
5052                                     XXH3_accumulate_512, XXH3_scrambleAcc);
5053 
5054 }
5055 
5056 /*
5057  * It's important for performance that XXH3_hashLong is not inlined.
5058  * Since the function is not inlined, the compiler may not be able to understand
5059  * that, in some scenarios, its `secret` argument is actually a compile-time
5060  * constant. This variant hard-codes the default secret, which lets the compiler
5061  * detect the constant and streamline the generated code for better performance.
5062  */
5063 XXH_NO_INLINE XXH64_hash_t XXH3_hashLong_64b_default(
5064     const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
5065     const xxh_u8 *XXH_RESTRICT secret, size_t secretLen) {
5066 
5067   (void)seed64;
5068   (void)secret;
5069   (void)secretLen;
5070   return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret,
5071                                     sizeof(XXH3_kSecret), XXH3_accumulate_512,
5072                                     XXH3_scrambleAcc);
5073 
5074 }
5075 
5076 /*
5077  * XXH3_hashLong_64b_withSeed():
5078  * Generate a custom key based on alteration of default XXH3_kSecret with the
5079  * seed, and then use this key for long mode hashing.
5080  *
5081  * This operation is decently fast but nonetheless costs a little bit of time.
5082  * Try to avoid it whenever possible (typically when seed==0).
5083  *
5084  * It's important for performance that XXH3_hashLong is not inlined. Not sure
5085  * why (uop cache maybe?), but the difference is large and easily measurable.
5086  */
5087 XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_withSeed_internal(
5088     const void *input, size_t len, XXH64_hash_t seed,
5089     XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble,
5090     XXH3_f_initCustomSecret f_initSec) {
5091 
5092   if (seed == 0)
5093     return XXH3_hashLong_64b_internal(
5094         input, len, XXH3_kSecret, sizeof(XXH3_kSecret), f_acc512, f_scramble);
5095   {
5096 
5097     XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
5098     f_initSec(secret, seed);
5099     return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
5100                                       f_acc512, f_scramble);
5101 
5102   }
5103 
5104 }
5105 
5106 /*
5107  * It's important for performance that XXH3_hashLong is not inlined.
5108  */
5109 XXH_NO_INLINE XXH64_hash_t XXH3_hashLong_64b_withSeed(const void   *input,
5110                                                       size_t        len,
5111                                                       XXH64_hash_t  seed,
5112                                                       const xxh_u8 *secret,
5113                                                       size_t        secretLen) {
5114 
5115   (void)secret;
5116   (void)secretLen;
5117   return XXH3_hashLong_64b_withSeed_internal(
5118       input, len, seed, XXH3_accumulate_512, XXH3_scrambleAcc,
5119       XXH3_initCustomSecret);
5120 
5121 }
5122 
5123 typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void *XXH_RESTRICT, size_t,
5124                                           XXH64_hash_t,
5125                                           const xxh_u8 *XXH_RESTRICT, size_t);
5126 
5127 XXH_FORCE_INLINE XXH64_hash_t
5128 XXH3_64bits_internal(const void *XXH_RESTRICT input, size_t len,
5129                      XXH64_hash_t seed64, const void *XXH_RESTRICT secret,
5130                      size_t secretLen, XXH3_hashLong64_f f_hashLong) {
5131 
5132   XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
5133   /*
5134    * If any action is to be taken when the `secretLen` condition is not respected,
5135    * it should be done here.
5136    * For now, it's a contract pre-condition.
5137    * Adding a check and a branch here would cost performance at every hash.
5138    * Also, note that the function signature doesn't offer room to return an error.
5139    */
5140   if (len <= 16)
5141     return XXH3_len_0to16_64b((const xxh_u8 *)input, len,
5142                               (const xxh_u8 *)secret, seed64);
5143   if (len <= 128)
5144     return XXH3_len_17to128_64b((const xxh_u8 *)input, len,
5145                                 (const xxh_u8 *)secret, secretLen, seed64);
5146   if (len <= XXH3_MIDSIZE_MAX)
5147     return XXH3_len_129to240_64b((const xxh_u8 *)input, len,
5148                                  (const xxh_u8 *)secret, secretLen, seed64);
5149   return f_hashLong(input, len, seed64, (const xxh_u8 *)secret, secretLen);
5150 
5151 }
5152 
5153 /* ===   Public entry point   === */
5154 
5155 /*! @ingroup xxh3_family */
5156 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void *input, size_t len) {
5157 
5158   return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret),
5159                               XXH3_hashLong_64b_default);
5160 
5161 }
5162 
5163 /*! @ingroup xxh3_family */
5164 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void *input,
5165                                                    size_t      len,
5166                                                    const void *secret,
5167                                                    size_t      secretSize) {
5168 
5169   return XXH3_64bits_internal(input, len, 0, secret, secretSize,
5170                               XXH3_hashLong_64b_withSecret);
5171 
5172 }
5173 
5174 /*! @ingroup xxh3_family */
5175 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void *input, size_t len,
5176                                                  XXH64_hash_t seed) {
5177 
5178   return XXH3_64bits_internal(input, len, seed, XXH3_kSecret,
5179                               sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
5180 
5181 }
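/*
 * Illustrative one-shot usage (a minimal sketch, not part of the original
 * header; assumes the implementation is compiled in, e.g. via XXH_INLINE_ALL
 * or by linking the library):
 *
 *   const char msg[] = "hello world";
 *   XXH64_hash_t h0 = XXH3_64bits(msg, sizeof(msg) - 1);
 *   XXH64_hash_t h1 = XXH3_64bits_withSeed(msg, sizeof(msg) - 1, 42);
 *   // For the _withSecret() variant, the secret must be at least
 *   // XXH3_SECRET_SIZE_MIN bytes, e.g. filled by XXH3_generateSecret() below.
 */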
5182 
5183 /* ===   XXH3 streaming   === */
5184 
5185 /*
5186  * Malloc's a pointer that is always aligned to align.
5187  *
5188  * This must be freed with `XXH_alignedFree()`.
5189  *
5190  * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte
5191  * alignment on 32-bit. This isn't enough for the 32-byte aligned loads in AVX2,
5192  * nor, on 32-bit, for the 16-byte aligned loads in SSE2 and NEON.
5193  *
5194  * This underalignment previously caused a rather obvious crash which went
5195  * completely unnoticed due to XXH3_createState() not actually being tested.
5196  * Credit to RedSpah for noticing this bug.
5197  *
5198  * The alignment is done manually: functions like posix_memalign or _mm_malloc
5199  * are avoided because, to maintain portability, we would have to write a
5200  * fallback like this anyway, and besides, testing for the existence of library
5201  * functions without relying on external build tools is impossible.
5202  *
5203  * The method is simple: Overallocate, manually align, and store the offset
5204  * to the original behind the returned pointer.
5205  *
5206  * Align must be a power of 2 and 8 <= align <= 128.
5207  */
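/*
 * Worked example (illustrative): with align == 64, if XXH_malloc() returns a
 * base ending in 0x...18 (base & 63 == 24), then offset == 64 - 24 == 40,
 * ptr == base + 40 is 64-byte aligned, and ptr[-1] stores 40 so that
 * XXH_alignedFree() can walk back to the original allocation. If base is
 * already aligned, offset == align and a full align-sized gap is used.
 */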
5208 static void *XXH_alignedMalloc(size_t s, size_t align) {
5209 
5210   XXH_ASSERT(align <= 128 && align >= 8);                    /* range check */
5211   XXH_ASSERT((align & (align - 1)) == 0);                     /* power of 2 */
5212   XXH_ASSERT(s != 0 && s < (s + align));                  /* empty/overflow */
5213   {  /* Overallocate to make room for manual realignment and an offset byte */
5214     xxh_u8 *base = (xxh_u8 *)XXH_malloc(s + align);
5215     if (base != NULL) {
5216 
5217       /*
5218        * Get the offset needed to align this pointer.
5219        *
5220        * Even if the returned pointer is aligned, there will always be
5221        * at least one byte to store the offset to the original pointer.
5222        */
5223       size_t offset = align - ((size_t)base & (align - 1)); /* base % align */
5224       /* Add the offset for the now-aligned pointer */
5225       xxh_u8 *ptr = base + offset;
5226 
5227       XXH_ASSERT((size_t)ptr % align == 0);
5228 
5229       /* Store the offset immediately before the returned pointer. */
5230       ptr[-1] = (xxh_u8)offset;
5231       return ptr;
5232 
5233     }
5234 
5235     return NULL;
5236 
5237   }
5238 
5239 }
5240 
5241 /*
5242  * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
5243  * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.
5244  */
5245 static void XXH_alignedFree(void *p) {
5246 
5247   if (p != NULL) {
5248 
5249     xxh_u8 *ptr = (xxh_u8 *)p;
5250     /* Get the offset byte we added in XXH_alignedMalloc. */
5251     xxh_u8 offset = ptr[-1];
5252     /* Free the original malloc'd pointer */
5253     xxh_u8 *base = ptr - offset;
5254     XXH_free(base);
5255 
5256   }
5257 
5258 }
5259 
5260 /*! @ingroup xxh3_family */
5261 XXH_PUBLIC_API XXH3_state_t *XXH3_createState(void) {
5262 
5263   XXH3_state_t *const state =
5264       (XXH3_state_t *)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
5265   if (state == NULL) return NULL;
5266   XXH3_INITSTATE(state);
5267   return state;
5268 
5269 }
5270 
5271 /*! @ingroup xxh3_family */
5272 XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t *statePtr) {
5273 
5274   XXH_alignedFree(statePtr);
5275   return XXH_OK;
5276 
5277 }
5278 
5279 /*! @ingroup xxh3_family */
5280 XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t       *dst_state,
5281                                    const XXH3_state_t *src_state) {
5282 
5283   memcpy(dst_state, src_state, sizeof(*dst_state));
5284 
5285 }
5286 
5287 static void XXH3_reset_internal(XXH3_state_t *statePtr, XXH64_hash_t seed,
5288                                 const void *secret, size_t secretSize) {
5289 
5290   size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
5291   size_t const initLength =
5292       offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
5293   XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);
5294   XXH_ASSERT(statePtr != NULL);
5295   /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */
5296   memset((char *)statePtr + initStart, 0, initLength);
5297   statePtr->acc[0] = XXH_PRIME32_3;
5298   statePtr->acc[1] = XXH_PRIME64_1;
5299   statePtr->acc[2] = XXH_PRIME64_2;
5300   statePtr->acc[3] = XXH_PRIME64_3;
5301   statePtr->acc[4] = XXH_PRIME64_4;
5302   statePtr->acc[5] = XXH_PRIME32_2;
5303   statePtr->acc[6] = XXH_PRIME64_5;
5304   statePtr->acc[7] = XXH_PRIME32_1;
5305   statePtr->seed = seed;
5306   statePtr->extSecret = (const unsigned char *)secret;
5307   XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
5308   statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
5309   statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
5310 
5311 }
5312 
5313 /*! @ingroup xxh3_family */
5314 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t *statePtr) {
5315 
5316   if (statePtr == NULL) return XXH_ERROR;
5317   XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
5318   return XXH_OK;
5319 
5320 }
5321 
5322 /*! @ingroup xxh3_family */
5323 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(
5324     XXH3_state_t *statePtr, const void *secret, size_t secretSize) {
5325 
5326   if (statePtr == NULL) return XXH_ERROR;
5327   XXH3_reset_internal(statePtr, 0, secret, secretSize);
5328   if (secret == NULL) return XXH_ERROR;
5329   if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
5330   return XXH_OK;
5331 
5332 }
5333 
5334 /*! @ingroup xxh3_family */
5335 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t *statePtr,
5336                                                         XXH64_hash_t  seed) {
5337 
5338   if (statePtr == NULL) return XXH_ERROR;
5339   if (seed == 0) return XXH3_64bits_reset(statePtr);
5340   if (seed != statePtr->seed)
5341     XXH3_initCustomSecret(statePtr->customSecret, seed);
5342   XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
5343   return XXH_OK;
5344 
5345 }
5346 
5347 /* Note: when XXH3_consumeStripes() is invoked, the caller must guarantee
5348  * that at least one more byte remains to be consumed from input,
5349  * so that the function can blindly consume all stripes using the "normal"
5350  * secret segment */
5352 XXH_FORCE_INLINE void XXH3_consumeStripes(
5353     xxh_u64 *XXH_RESTRICT acc, size_t *XXH_RESTRICT nbStripesSoFarPtr,
5354     size_t nbStripesPerBlock, const xxh_u8 *XXH_RESTRICT input,
5355     size_t nbStripes, const xxh_u8 *XXH_RESTRICT secret, size_t secretLimit,
5356     XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) {
5357 
5358   XXH_ASSERT(nbStripes <=
5359              nbStripesPerBlock); /* can handle max 1 scramble per invocation */
5360   XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
5361   if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) {
5362 
5363     /* need a scrambling operation */
5364     size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr;
5365     size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock;
5366     XXH3_accumulate(acc, input,
5367                     secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE,
5368                     nbStripesToEndofBlock, f_acc512);
5369     f_scramble(acc, secret + secretLimit);
5370     XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret,
5371                     nbStripesAfterBlock, f_acc512);
5372     *nbStripesSoFarPtr = nbStripesAfterBlock;
5373 
5374   } else {
5375 
5376     XXH3_accumulate(acc, input,
5377                     secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE,
5378                     nbStripes, f_acc512);
5379     *nbStripesSoFarPtr += nbStripes;
5380 
5381   }
5382 
5383 }
5384 
5385 /*
5386  * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
5387  */
5388 XXH_FORCE_INLINE XXH_errorcode XXH3_update(XXH3_state_t *state,
5389                                            const xxh_u8 *input, size_t len,
5390                                            XXH3_f_accumulate_512 f_acc512,
5391                                            XXH3_f_scrambleAcc    f_scramble) {
5392 
5393   if (input == NULL)
5394       #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && \
5395           (XXH_ACCEPT_NULL_INPUT_POINTER >= 1)
5396     return XXH_OK;
5397       #else
5398     return XXH_ERROR;
5399       #endif
5400 
5401   {
5402 
5403     const xxh_u8 *const        bEnd = input + len;
5404     const unsigned char *const secret =
5405         (state->extSecret == NULL) ? state->customSecret : state->extSecret;
5406 
5407     state->totalLen += len;
5408     XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
5409 
5410     if (state->bufferedSize + len <=
5411         XXH3_INTERNALBUFFER_SIZE) {                   /* fill in tmp buffer */
5412       XXH_memcpy(state->buffer + state->bufferedSize, input, len);
5413       state->bufferedSize += (XXH32_hash_t)len;
5414       return XXH_OK;
5415 
5416     }
5417 
5418         /* total input is now > XXH3_INTERNALBUFFER_SIZE */
5419 
5420       #define XXH3_INTERNALBUFFER_STRIPES \
5421         (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
5422     XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN ==
5423                       0);                                 /* clean multiple */
5424 
5425     /*
5426      * Internal buffer is partially filled (always, except at beginning)
5427      * Complete it, then consume it.
5428      */
5429     if (state->bufferedSize) {
5430 
5431       size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
5432       XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
5433       input += loadSize;
5434       XXH3_consumeStripes(state->acc, &state->nbStripesSoFar,
5435                           state->nbStripesPerBlock, state->buffer,
5436                           XXH3_INTERNALBUFFER_STRIPES, secret,
5437                           state->secretLimit, f_acc512, f_scramble);
5438       state->bufferedSize = 0;
5439 
5440     }
5441 
5442     XXH_ASSERT(input < bEnd);
5443 
5444     /* Consume input by a multiple of internal buffer size */
5445     if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
5446 
5447       const xxh_u8 *const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
5448       do {
5449 
5450         XXH3_consumeStripes(state->acc, &state->nbStripesSoFar,
5451                             state->nbStripesPerBlock, input,
5452                             XXH3_INTERNALBUFFER_STRIPES, secret,
5453                             state->secretLimit, f_acc512, f_scramble);
5454         input += XXH3_INTERNALBUFFER_SIZE;
5455 
5456       } while (input < limit);
5457 
5458       /* for last partial stripe */
5459       memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN,
5460              input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
5461 
5462     }
5463 
5464     XXH_ASSERT(input < bEnd);
5465 
5466     /* Some remaining input (always) : buffer it */
5467     XXH_memcpy(state->buffer, input, (size_t)(bEnd - input));
5468     state->bufferedSize = (XXH32_hash_t)(bEnd - input);
5469 
5470   }
5471 
5472   return XXH_OK;
5473 
5474 }
5475 
5476 /*! @ingroup xxh3_family */
5477 XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update(XXH3_state_t *state,
5478                                                 const void *input, size_t len) {
5479 
5480   return XXH3_update(state, (const xxh_u8 *)input, len, XXH3_accumulate_512,
5481                      XXH3_scrambleAcc);
5482 
5483 }
5484 
5485 XXH_FORCE_INLINE void XXH3_digest_long(XXH64_hash_t        *acc,
5486                                        const XXH3_state_t  *state,
5487                                        const unsigned char *secret) {
5488 
5489   /*
5490    * Digest on a local copy. This way, the state remains unaltered, and it can
5491    * continue ingesting more input afterwards.
5492    */
5493   memcpy(acc, state->acc, sizeof(state->acc));
5494   if (state->bufferedSize >= XXH_STRIPE_LEN) {
5495 
5496     size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
5497     size_t       nbStripesSoFar = state->nbStripesSoFar;
5498     XXH3_consumeStripes(acc, &nbStripesSoFar, state->nbStripesPerBlock,
5499                         state->buffer, nbStripes, secret, state->secretLimit,
5500                         XXH3_accumulate_512, XXH3_scrambleAcc);
5501     /* last stripe */
5502     XXH3_accumulate_512(acc,
5503                         state->buffer + state->bufferedSize - XXH_STRIPE_LEN,
5504                         secret + state->secretLimit - XXH_SECRET_LASTACC_START);
5505 
5506   } else {                                 /* bufferedSize < XXH_STRIPE_LEN */
5507 
5508     xxh_u8       lastStripe[XXH_STRIPE_LEN];
5509     size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
5510     XXH_ASSERT(state->bufferedSize >
5511                0);                   /* there is always some input buffered */
5512     memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize,
5513            catchupSize);
5514     memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
5515     XXH3_accumulate_512(acc, lastStripe,
5516                         secret + state->secretLimit - XXH_SECRET_LASTACC_START);
5517 
5518   }
5519 
5520 }
5521 
5522 /*! @ingroup xxh3_family */
5523 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest(const XXH3_state_t *state) {
5524 
5525   const unsigned char *const secret =
5526       (state->extSecret == NULL) ? state->customSecret : state->extSecret;
5527   if (state->totalLen > XXH3_MIDSIZE_MAX) {
5528 
5529     XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
5530     XXH3_digest_long(acc, state, secret);
5531     return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
5532                           (xxh_u64)state->totalLen * XXH_PRIME64_1);
5533 
5534   }
5535 
5536   /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
5537   if (state->seed)
5538     return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen,
5539                                 state->seed);
5540   return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
5541                                 secret, state->secretLimit + XXH_STRIPE_LEN);
5542 
5543 }
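/*
 * Illustrative streaming usage (a minimal sketch, not part of the original
 * header; error checking omitted):
 *
 *   XXH3_state_t *st = XXH3_createState();
 *   XXH3_64bits_reset_withSeed(st, 42);
 *   XXH3_64bits_update(st, chunk1, chunk1Size);
 *   XXH3_64bits_update(st, chunk2, chunk2Size);
 *   XXH64_hash_t h = XXH3_64bits_digest(st);   // state can keep ingesting
 *   XXH3_freeState(st);
 *
 * chunk1/chunk2 and their sizes are placeholders for caller-provided data.
 */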
5544 
5545       #define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
5546 
5547 /*! @ingroup xxh3_family */
5548 XXH_PUBLIC_API void XXH3_generateSecret(void       *secretBuffer,
5549                                         const void *customSeed,
5550                                         size_t      customSeedSize) {
5551 
5552   XXH_ASSERT(secretBuffer != NULL);
5553   if (customSeedSize == 0) {
5554 
5555     memcpy(secretBuffer, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
5556     return;
5557 
5558   }
5559 
5560   XXH_ASSERT(customSeed != NULL);
5561 
5562   {
5563 
5564     size_t const       segmentSize = sizeof(XXH128_hash_t);
5565     size_t const       nbSegments = XXH_SECRET_DEFAULT_SIZE / segmentSize;
5566     XXH128_canonical_t scrambler;
5567     XXH64_hash_t       seeds[12];
5568     size_t             segnb;
5569     XXH_ASSERT(nbSegments == 12);
5570     XXH_ASSERT(segmentSize * nbSegments ==
5571                XXH_SECRET_DEFAULT_SIZE);                  /* exact multiple */
5572     XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
5573 
5574     /*
5575      * Copy customSeed to seeds[], truncating or repeating as necessary.
5576      */
5577     {
5578 
5579       size_t toFill = XXH_MIN(customSeedSize, sizeof(seeds));
5580       size_t filled = toFill;
5581       memcpy(seeds, customSeed, toFill);
5582       while (filled < sizeof(seeds)) {
5583 
5584         toFill = XXH_MIN(filled, sizeof(seeds) - filled);
5585         memcpy((char *)seeds + filled, seeds, toFill);
5586         filled += toFill;
5587 
5588       }
5589 
5590     }
5591 
5592     /* generate secret */
5593     memcpy(secretBuffer, &scrambler, sizeof(scrambler));
5594     for (segnb = 1; segnb < nbSegments; segnb++) {
5595 
5596       size_t const       segmentStart = segnb * segmentSize;
5597       XXH128_canonical_t segment;
5598       XXH128_canonicalFromHash(&segment,
5599                                XXH128(&scrambler, sizeof(scrambler),
5600                                       XXH_readLE64(seeds + segnb) + segnb));
5601       memcpy((char *)secretBuffer + segmentStart, &segment, sizeof(segment));
5602 
5603     }
5604 
5605   }
5606 
5607 }
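
/*
 * Illustrative sketch (not part of the library): deriving a custom secret
 * from arbitrary seed material, then hashing with it. The 192-byte buffer
 * size is an assumption matching XXH_SECRET_DEFAULT_SIZE above; `data` and
 * `size` stand for the caller's input.
 *
 *    unsigned char secret[192];
 *    const char    seedMaterial[] = "any content, any length";
 *
 *    XXH3_generateSecret(secret, seedMaterial, sizeof(seedMaterial) - 1);
 *    XXH64_hash_t const h64 = XXH3_64bits_withSecret(data, size,
 *                                                    secret, sizeof(secret));
 *    (XXH3_128bits_withSecret() accepts the same secret buffer.)
 */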
5608 
5609 /* ==========================================
5610  * XXH3 128 bits (a.k.a XXH128)
5611  * ==========================================
5612  * XXH3's 128-bit variant has better mixing and strength than the 64-bit
5613  * variant, even without counting the significantly larger output size.
5614  *
5615  * For example, extra steps are taken to avoid the seed-dependent collisions
5616  * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
5617  *
5618  * This strength naturally comes at the cost of some speed, especially on short
5619  * lengths. Note that longer hashes are about as fast as the 64-bit version
5620  * due to it using only a slight modification of the 64-bit loop.
5621  *
5622  * XXH128 is also more oriented towards 64-bit machines. It is still extremely
5623  * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
5624  */
5625 
5626 XXH_FORCE_INLINE XXH128_hash_t XXH3_len_1to3_128b(const xxh_u8 *input,
5627                                                   size_t        len,
5628                                                   const xxh_u8 *secret,
5629                                                   XXH64_hash_t  seed) {
5630 
5631   /* A doubled version of 1to3_64b with different constants. */
5632   XXH_ASSERT(input != NULL);
5633   XXH_ASSERT(1 <= len && len <= 3);
5634   XXH_ASSERT(secret != NULL);
5635   /*
5636    * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
5637    * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
5638    * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
5639    */
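  /*
   * Worked example (added for clarity): for the 2-byte input { 0x41, 0x42 },
   * c1 = 0x41 and c2 = c3 = 0x42, so
   *   combinedl = (0x41 << 16) | (0x42 << 24) | 0x42 | (2 << 8) = 0x42410242,
   * whose bytes from least to most significant are { 0x42, 0x02, 0x41, 0x42 },
   * matching the len = 2 line above.
   */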
5640   {
5641 
5642     xxh_u8 const  c1 = input[0];
5643     xxh_u8 const  c2 = input[len >> 1];
5644     xxh_u8 const  c3 = input[len - 1];
5645     xxh_u32 const combinedl = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) |
5646                               ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
5647     xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
5648     xxh_u64 const bitflipl =
5649         (XXH_readLE32(secret) ^ XXH_readLE32(secret + 4)) + seed;
5650     xxh_u64 const bitfliph =
5651         (XXH_readLE32(secret + 8) ^ XXH_readLE32(secret + 12)) - seed;
5652     xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl;
5653     xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;
5654     XXH128_hash_t h128;
5655     h128.low64 = XXH64_avalanche(keyed_lo);
5656     h128.high64 = XXH64_avalanche(keyed_hi);
5657     return h128;
5658 
5659   }
5660 
5661 }
5662 
5663 XXH_FORCE_INLINE XXH128_hash_t XXH3_len_4to8_128b(const xxh_u8 *input,
5664                                                   size_t        len,
5665                                                   const xxh_u8 *secret,
5666                                                   XXH64_hash_t  seed) {
5667 
5668   XXH_ASSERT(input != NULL);
5669   XXH_ASSERT(secret != NULL);
5670   XXH_ASSERT(4 <= len && len <= 8);
5671   seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
5672   {
5673 
5674     xxh_u32 const input_lo = XXH_readLE32(input);
5675     xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
5676     xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
5677     xxh_u64 const bitflip =
5678         (XXH_readLE64(secret + 16) ^ XXH_readLE64(secret + 24)) + seed;
5679     xxh_u64 const keyed = input_64 ^ bitflip;
5680 
5681     /* Shift len left so the value added to XXH_PRIME64_1 stays even, keeping
5682      * the multiplier odd and thus avoiding a multiply by an even number. */
5683     XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));
5684 
5685     m128.high64 += (m128.low64 << 1);
5686     m128.low64 ^= (m128.high64 >> 3);
5687 
5688     m128.low64 = XXH_xorshift64(m128.low64, 35);
5689     m128.low64 *= 0x9FB21C651E98DF25ULL;
5690     m128.low64 = XXH_xorshift64(m128.low64, 28);
5691     m128.high64 = XXH3_avalanche(m128.high64);
5692     return m128;
5693 
5694   }
5695 
5696 }
5697 
5698 XXH_FORCE_INLINE XXH128_hash_t XXH3_len_9to16_128b(const xxh_u8 *input,
5699                                                    size_t        len,
5700                                                    const xxh_u8 *secret,
5701                                                    XXH64_hash_t  seed) {
5702 
5703   XXH_ASSERT(input != NULL);
5704   XXH_ASSERT(secret != NULL);
5705   XXH_ASSERT(9 <= len && len <= 16);
5706   {
5707 
5708     xxh_u64 const bitflipl =
5709         (XXH_readLE64(secret + 32) ^ XXH_readLE64(secret + 40)) - seed;
5710     xxh_u64 const bitfliph =
5711         (XXH_readLE64(secret + 48) ^ XXH_readLE64(secret + 56)) + seed;
5712     xxh_u64 const input_lo = XXH_readLE64(input);
5713     xxh_u64       input_hi = XXH_readLE64(input + len - 8);
5714     XXH128_hash_t m128 =
5715         XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);
5716     /*
5717      * Put len in the middle of m128 to ensure that the length gets mixed to
5718      * both the low and high bits in the 128x64 multiply below.
5719      */
5720     m128.low64 += (xxh_u64)(len - 1) << 54;
5721     input_hi ^= bitfliph;
5722     /*
5723      * Add the high 32 bits of input_hi to the high 32 bits of m128, then
5724      * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
5725      * the high 64 bits of m128.
5726      *
5727      * The best approach to this operation is different on 32-bit and 64-bit.
5728      */
5729     if (sizeof(void *) < sizeof(xxh_u64)) {                       /* 32-bit */
5730       /*
5731        * 32-bit optimized version, which is more readable.
5732        *
5733        * On 32-bit, it removes an ADC and delays a dependency between the two
5734        * halves of m128.high64, but it generates an extra mask on 64-bit.
5735        */
5736       m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) +
5737                      XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
5738 
5739     } else {
5740 
5741       /*
5742        * 64-bit optimized (albeit more confusing) version.
5743        *
5744        * Uses some properties of addition and multiplication to remove the mask:
5745        *
5746        * Let:
5747        *    a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
5748        *    b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
5749        *    c = XXH_PRIME32_2
5750        *
5751        *    a + (b * c)
5752        * Inverse Property: x + y - x == y
5753        *    a + (b * (1 + c - 1))
5754        * Distributive Property: x * (y + z) == (x * y) + (x * z)
5755        *    a + (b * 1) + (b * (c - 1))
5756        * Identity Property: x * 1 == x
5757        *    a + b + (b * (c - 1))
5758        *
5759        * Substitute a, b, and c:
5760        *    input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 -
5761        * 1))
5762        *
5763        * Since input_hi.hi + input_hi.lo == input_hi, we get this:
5764        *    input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
5765        */
5766       m128.high64 +=
5767           input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1);
5768 
5769     }
5770 
5771     /* m128 ^= XXH_swap64(m128 >> 64); */
5772     m128.low64 ^= XXH_swap64(m128.high64);
5773 
5774     {                      /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
5775       XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2);
5776       h128.high64 += m128.high64 * XXH_PRIME64_2;
5777 
5778       h128.low64 = XXH3_avalanche(h128.low64);
5779       h128.high64 = XXH3_avalanche(h128.high64);
5780       return h128;
5781 
5782     }
5783 
5784   }
5785 
5786 }
5787 
5788 /*
5789  * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
5790  */
5791 XXH_FORCE_INLINE XXH128_hash_t XXH3_len_0to16_128b(const xxh_u8 *input,
5792                                                    size_t        len,
5793                                                    const xxh_u8 *secret,
5794                                                    XXH64_hash_t  seed) {
5795 
5796   XXH_ASSERT(len <= 16);
5797   {
5798 
5799     if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
5800     if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
5801     if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
5802     {
5803 
5804       XXH128_hash_t h128;
5805       xxh_u64 const bitflipl =
5806           XXH_readLE64(secret + 64) ^ XXH_readLE64(secret + 72);
5807       xxh_u64 const bitfliph =
5808           XXH_readLE64(secret + 80) ^ XXH_readLE64(secret + 88);
5809       h128.low64 = XXH64_avalanche(seed ^ bitflipl);
5810       h128.high64 = XXH64_avalanche(seed ^ bitfliph);
5811       return h128;
5812 
5813     }
5814 
5815   }
5816 
5817 }
5818 
5819 /*
5820  * A bit slower than XXH3_mix16B, but handles multiply by zero better.
5821  */
5822 XXH_FORCE_INLINE XXH128_hash_t XXH128_mix32B(XXH128_hash_t acc,
5823                                              const xxh_u8 *input_1,
5824                                              const xxh_u8 *input_2,
5825                                              const xxh_u8 *secret,
5826                                              XXH64_hash_t  seed) {
5827 
5828   acc.low64 += XXH3_mix16B(input_1, secret + 0, seed);
5829   acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
5830   acc.high64 += XXH3_mix16B(input_2, secret + 16, seed);
5831   acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
5832   return acc;
5833 
5834 }
5835 
5836 XXH_FORCE_INLINE XXH128_hash_t XXH3_len_17to128_128b(
5837     const xxh_u8 *XXH_RESTRICT input, size_t len,
5838     const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) {
5839 
5840   XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
5841   (void)secretSize;
5842   XXH_ASSERT(16 < len && len <= 128);
5843 
5844   {
5845 
5846     XXH128_hash_t acc;
5847     acc.low64 = len * XXH_PRIME64_1;
5848     acc.high64 = 0;
5849     if (len > 32) {
5850 
5851       if (len > 64) {
5852 
5853         if (len > 96) {
5854 
5855           acc = XXH128_mix32B(acc, input + 48, input + len - 64, secret + 96,
5856                               seed);
5857 
5858         }
5859 
5860         acc =
5861             XXH128_mix32B(acc, input + 32, input + len - 48, secret + 64, seed);
5862 
5863       }
5864 
5865       acc = XXH128_mix32B(acc, input + 16, input + len - 32, secret + 32, seed);
5866 
5867     }
5868 
5869     acc = XXH128_mix32B(acc, input, input + len - 16, secret, seed);
5870     {
5871 
5872       XXH128_hash_t h128;
5873       h128.low64 = acc.low64 + acc.high64;
5874       h128.high64 = (acc.low64 * XXH_PRIME64_1) + (acc.high64 * XXH_PRIME64_4) +
5875                     ((len - seed) * XXH_PRIME64_2);
5876       h128.low64 = XXH3_avalanche(h128.low64);
5877       h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
5878       return h128;
5879 
5880     }
5881 
5882   }
5883 
5884 }
5885 
5886 XXH_NO_INLINE XXH128_hash_t XXH3_len_129to240_128b(
5887     const xxh_u8 *XXH_RESTRICT input, size_t len,
5888     const xxh_u8 *XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) {
5889 
5890   XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
5891   (void)secretSize;
5892   XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
5893 
5894   {
5895 
5896     XXH128_hash_t acc;
5897     int const     nbRounds = (int)len / 32;
5898     int           i;
5899     acc.low64 = len * XXH_PRIME64_1;
5900     acc.high64 = 0;
5901     for (i = 0; i < 4; i++) {
5902 
5903       acc = XXH128_mix32B(acc, input + (32 * i), input + (32 * i) + 16,
5904                           secret + (32 * i), seed);
5905 
5906     }
5907 
5908     acc.low64 = XXH3_avalanche(acc.low64);
5909     acc.high64 = XXH3_avalanche(acc.high64);
5910     XXH_ASSERT(nbRounds >= 4);
5911     for (i = 4; i < nbRounds; i++) {
5912 
5913       acc = XXH128_mix32B(acc, input + (32 * i), input + (32 * i) + 16,
5914                           secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)),
5915                           seed);
5916 
5917     }
5918 
5919     /* last bytes */
5920     acc = XXH128_mix32B(
5921         acc, input + len - 16, input + len - 32,
5922         secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
5923         0ULL - seed);
5924 
5925     {
5926 
5927       XXH128_hash_t h128;
5928       h128.low64 = acc.low64 + acc.high64;
5929       h128.high64 = (acc.low64 * XXH_PRIME64_1) + (acc.high64 * XXH_PRIME64_4) +
5930                     ((len - seed) * XXH_PRIME64_2);
5931       h128.low64 = XXH3_avalanche(h128.low64);
5932       h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
5933       return h128;
5934 
5935     }
5936 
5937   }
5938 
5939 }
5940 
5941 XXH_FORCE_INLINE XXH128_hash_t XXH3_hashLong_128b_internal(
5942     const void *XXH_RESTRICT input, size_t len,
5943     const xxh_u8 *XXH_RESTRICT secret, size_t secretSize,
5944     XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble) {
5945 
5946   XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
5947 
5948   XXH3_hashLong_internal_loop(acc, (const xxh_u8 *)input, len, secret,
5949                               secretSize, f_acc512, f_scramble);
5950 
5951   /* converge into final hash */
5952   XXH_STATIC_ASSERT(sizeof(acc) == 64);
5953   XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
5954   {
5955 
5956     XXH128_hash_t h128;
5957     h128.low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
5958                                 (xxh_u64)len * XXH_PRIME64_1);
5959     h128.high64 = XXH3_mergeAccs(
5960         acc, secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
5961         ~((xxh_u64)len * XXH_PRIME64_2));
5962     return h128;
5963 
5964   }
5965 
5966 }
5967 
5968 /*
5969  * It's important for performance that XXH3_hashLong is not inlined.
5970  */
5971 XXH_NO_INLINE XXH128_hash_t XXH3_hashLong_128b_default(
5972     const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
5973     const void *XXH_RESTRICT secret, size_t secretLen) {
5974 
5975   (void)seed64;
5976   (void)secret;
5977   (void)secretLen;
5978   return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret,
5979                                      sizeof(XXH3_kSecret), XXH3_accumulate_512,
5980                                      XXH3_scrambleAcc);
5981 
5982 }
5983 
5984 /*
5985  * It's important for performance that XXH3_hashLong is not inlined.
5986  */
5987 XXH_NO_INLINE XXH128_hash_t XXH3_hashLong_128b_withSecret(
5988     const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
5989     const void *XXH_RESTRICT secret, size_t secretLen) {
5990 
5991   (void)seed64;
5992   return XXH3_hashLong_128b_internal(input, len, (const xxh_u8 *)secret,
5993                                      secretLen, XXH3_accumulate_512,
5994                                      XXH3_scrambleAcc);
5995 
5996 }
5997 
5998 XXH_FORCE_INLINE XXH128_hash_t XXH3_hashLong_128b_withSeed_internal(
5999     const void *XXH_RESTRICT input, size_t len, XXH64_hash_t seed64,
6000     XXH3_f_accumulate_512 f_acc512, XXH3_f_scrambleAcc f_scramble,
6001     XXH3_f_initCustomSecret f_initSec) {
6002 
6003   if (seed64 == 0)
6004     return XXH3_hashLong_128b_internal(
6005         input, len, XXH3_kSecret, sizeof(XXH3_kSecret), f_acc512, f_scramble);
6006   {
6007 
6008     XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
6009     f_initSec(secret, seed64);
6010     return XXH3_hashLong_128b_internal(input, len, (const xxh_u8 *)secret,
6011                                        sizeof(secret), f_acc512, f_scramble);
6012 
6013   }
6014 
6015 }
6016 
6017 /*
6018  * It's important for performance that XXH3_hashLong is not inlined.
6019  */
6020 XXH_NO_INLINE XXH128_hash_t
6021 XXH3_hashLong_128b_withSeed(const void *input, size_t len, XXH64_hash_t seed64,
6022                             const void *XXH_RESTRICT secret, size_t secretLen) {
6023 
6024   (void)secret;
6025   (void)secretLen;
6026   return XXH3_hashLong_128b_withSeed_internal(
6027       input, len, seed64, XXH3_accumulate_512, XXH3_scrambleAcc,
6028       XXH3_initCustomSecret);
6029 
6030 }
6031 
6032 typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void *XXH_RESTRICT, size_t,
6033                                             XXH64_hash_t,
6034                                             const void *XXH_RESTRICT, size_t);
6035 
6036 XXH_FORCE_INLINE XXH128_hash_t
6037 XXH3_128bits_internal(const void *input, size_t len, XXH64_hash_t seed64,
6038                       const void *XXH_RESTRICT secret, size_t secretLen,
6039                       XXH3_hashLong128_f f_hl128) {
6040 
6041   XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
6042   /*
6043    * If an action is to be taken if `secret` conditions are not respected,
6044    * it should be done here.
6045    * For now, it's a contract pre-condition.
6046    * Adding a check and a branch here would cost performance at every hash.
6047    */
6048   if (len <= 16)
6049     return XXH3_len_0to16_128b((const xxh_u8 *)input, len,
6050                                (const xxh_u8 *)secret, seed64);
6051   if (len <= 128)
6052     return XXH3_len_17to128_128b((const xxh_u8 *)input, len,
6053                                  (const xxh_u8 *)secret, secretLen, seed64);
6054   if (len <= XXH3_MIDSIZE_MAX)
6055     return XXH3_len_129to240_128b((const xxh_u8 *)input, len,
6056                                   (const xxh_u8 *)secret, secretLen, seed64);
6057   return f_hl128(input, len, seed64, secret, secretLen);
6058 
6059 }
6060 
6061 /* ===   Public XXH128 API   === */
6062 
6063 /*! @ingroup xxh3_family */
6064 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void *input, size_t len) {
6065 
6066   return XXH3_128bits_internal(input, len, 0, XXH3_kSecret,
6067                                sizeof(XXH3_kSecret),
6068                                XXH3_hashLong_128b_default);
6069 
6070 }
6071 
6072 /*! @ingroup xxh3_family */
6073 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void *input,
6074                                                      size_t      len,
6075                                                      const void *secret,
6076                                                      size_t      secretSize) {
6077 
6078   return XXH3_128bits_internal(input, len, 0, (const xxh_u8 *)secret,
6079                                secretSize, XXH3_hashLong_128b_withSecret);
6080 
6081 }
6082 
6083 /*! @ingroup xxh3_family */
6084 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void  *input,
6085                                                    size_t       len,
6086                                                    XXH64_hash_t seed) {
6087 
6088   return XXH3_128bits_internal(input, len, seed, XXH3_kSecret,
6089                                sizeof(XXH3_kSecret),
6090                                XXH3_hashLong_128b_withSeed);
6091 
6092 }
6093 
6094 /*! @ingroup xxh3_family */
6095 XXH_PUBLIC_API XXH128_hash_t XXH128(const void *input, size_t len,
6096                                     XXH64_hash_t seed) {
6097 
6098   return XXH3_128bits_withSeed(input, len, seed);
6099 
6100 }
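
/*
 * Illustrative sketch (not part of the library): one-shot 128-bit hashing
 * with the convenience wrapper above, printing both halves of the result.
 *
 *    #include <stdio.h>
 *    #include "xxhash.h"
 *
 *    const char          data[] = "sample input";
 *    XXH128_hash_t const h = XXH128(data, sizeof(data) - 1, 0);
 *    printf("%016llx%016llx\n", (unsigned long long)h.high64,
 *                               (unsigned long long)h.low64);
 */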
6101 
6102 /* ===   XXH3 128-bit streaming   === */
6103 
6104 /*
6105  * All the functions are actually the same as in the 64-bit streaming variant.
6106  * The only difference is the finalization routine.
6107  */
6108 
6109 /*! @ingroup xxh3_family */
6110 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t *statePtr) {
6111 
6112   if (statePtr == NULL) return XXH_ERROR;
6113   XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
6114   return XXH_OK;
6115 
6116 }
6117 
6118 /*! @ingroup xxh3_family */
6119 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(
6120     XXH3_state_t *statePtr, const void *secret, size_t secretSize) {
6121 
6122   if (statePtr == NULL) return XXH_ERROR;
6123   XXH3_reset_internal(statePtr, 0, secret, secretSize);
6124   if (secret == NULL) return XXH_ERROR;
6125   if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
6126   return XXH_OK;
6127 
6128 }
6129 
6130 /*! @ingroup xxh3_family */
6131 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t *statePtr,
6132                                                          XXH64_hash_t  seed) {
6133 
6134   if (statePtr == NULL) return XXH_ERROR;
6135   if (seed == 0) return XXH3_128bits_reset(statePtr);
6136   if (seed != statePtr->seed)
6137     XXH3_initCustomSecret(statePtr->customSecret, seed);
6138   XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
6139   return XXH_OK;
6140 
6141 }
6142 
6143 /*! @ingroup xxh3_family */
6144 XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update(XXH3_state_t *state,
6145                                                  const void   *input,
6146                                                  size_t        len) {
6147 
6148   return XXH3_update(state, (const xxh_u8 *)input, len, XXH3_accumulate_512,
6149                      XXH3_scrambleAcc);
6150 
6151 }
6152 
6153 /*! @ingroup xxh3_family */
6154 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest(const XXH3_state_t *state) {
6155 
6156   const unsigned char *const secret =
6157       (state->extSecret == NULL) ? state->customSecret : state->extSecret;
6158   if (state->totalLen > XXH3_MIDSIZE_MAX) {
6159 
6160     XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
6161     XXH3_digest_long(acc, state, secret);
6162     XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >=
6163                sizeof(acc) + XXH_SECRET_MERGEACCS_START);
6164     {
6165 
6166       XXH128_hash_t h128;
6167       h128.low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START,
6168                                   (xxh_u64)state->totalLen * XXH_PRIME64_1);
6169       h128.high64 =
6170           XXH3_mergeAccs(acc,
6171                          secret + state->secretLimit + XXH_STRIPE_LEN -
6172                              sizeof(acc) - XXH_SECRET_MERGEACCS_START,
6173                          ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
6174       return h128;
6175 
6176     }
6177 
6178   }
6179 
6180   /* len <= XXH3_MIDSIZE_MAX : short code */
6181   if (state->seed)
6182     return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen,
6183                                  state->seed);
6184   return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
6185                                  secret, state->secretLimit + XXH_STRIPE_LEN);
6186 
6187 }
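
/*
 * Illustrative sketch (not part of the library): 128-bit streaming usage.
 * It mirrors the 64-bit streaming API; only the digest type differs.
 * buf1/buf2 and their sizes stand for the caller's data.
 *
 *    XXH3_state_t *state = XXH3_createState();
 *    if (state != NULL && XXH3_128bits_reset_withSeed(state, 1234) == XXH_OK) {
 *      XXH3_128bits_update(state, buf1, buf1Size);
 *      XXH3_128bits_update(state, buf2, buf2Size);
 *      XXH128_hash_t const h = XXH3_128bits_digest(state);
 *      (same value as XXH3_128bits_withSeed() over the concatenated input)
 *    }
 *    XXH3_freeState(state);
 */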
6188 
6189     /* 128-bit utility functions */
6190 
6191       #include <string.h>                                 /* memcmp, memcpy */
6192 
6193 /* return : 1 if equal, 0 if different */
6194 /*! @ingroup xxh3_family */
6195 XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) {
6196 
6197   /* note : XXH128_hash_t is compact; it has no padding bytes */
6198   return !(memcmp(&h1, &h2, sizeof(h1)));
6199 
6200 }
6201 
6202 /* This prototype is compatible with stdlib's qsort().
6203  * return : >0 if *h128_1  > *h128_2
6204  *          <0 if *h128_1  < *h128_2
6205  *          =0 if *h128_1 == *h128_2  */
6206 /*! @ingroup xxh3_family */
6207 XXH_PUBLIC_API int XXH128_cmp(const void *h128_1, const void *h128_2) {
6208 
6209   XXH128_hash_t const h1 = *(const XXH128_hash_t *)h128_1;
6210   XXH128_hash_t const h2 = *(const XXH128_hash_t *)h128_2;
6211   int const           hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
6212   /* note : bets that, in most cases, hash values differ in their high 64 bits */
6213   if (hcmp) return hcmp;
6214   return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
6215 
6216 }
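
/*
 * Illustrative sketch (not part of the library): XXH128_cmp() plugs directly
 * into qsort(), and XXH128_isEqual() compares two values directly. `hashes`
 * and `count` stand for the caller's data.
 *
 *    #include <stdlib.h>
 *
 *    qsort(hashes, count, sizeof(hashes[0]), XXH128_cmp);
 *    if (count >= 2 && XXH128_isEqual(hashes[0], hashes[1])) {
 *      (the first two entries of the sorted array are identical)
 *    }
 */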
6217 
6218 /*======   Canonical representation   ======*/
6219 /*! @ingroup xxh3_family */
6220 XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t *dst,
6221                                              XXH128_hash_t       hash) {
6222 
6223   XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
6224   if (XXH_CPU_LITTLE_ENDIAN) {
6225 
6226     hash.high64 = XXH_swap64(hash.high64);
6227     hash.low64 = XXH_swap64(hash.low64);
6228 
6229   }
6230 
6231   memcpy(dst, &hash.high64, sizeof(hash.high64));
6232   memcpy((char *)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
6233 
6234 }
6235 
6236 /*! @ingroup xxh3_family */
6237 XXH_PUBLIC_API XXH128_hash_t
6238 XXH128_hashFromCanonical(const XXH128_canonical_t *src) {
6239 
6240   XXH128_hash_t h;
6241   h.high64 = XXH_readBE64(src);
6242   h.low64 = XXH_readBE64(src->digest + 8);
6243   return h;
6244 
6245 }
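
/*
 * Illustrative sketch (not part of the library): round-tripping a hash
 * through its canonical (big-endian) byte representation, e.g. for storage
 * or network transmission. `data` and `size` stand for the caller's input.
 *
 *    XXH128_hash_t const h1 = XXH128(data, size, 0);
 *    XXH128_canonical_t  canonical;
 *    XXH128_hash_t       h2;
 *
 *    XXH128_canonicalFromHash(&canonical, h1);   (canonical.digest: 16 bytes)
 *    h2 = XXH128_hashFromCanonical(&canonical);
 *    (XXH128_isEqual(h1, h2) holds on any platform, regardless of endianness)
 */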
6246 
6247       /* Pop our optimization override from above */
6248       #if XXH_VECTOR == XXH_AVX2                      /* AVX2 */           \
6249           && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
6250           && defined(__OPTIMIZE__) &&                                      \
6251           !defined(__OPTIMIZE_SIZE__)                /* respect -O0 and -Os */
6252         #pragma GCC pop_options
6253       #endif
6254 
6255     #endif                                              /* XXH_NO_LONG_LONG */
6256 
6257   #endif                                                     /* XXH_NO_XXH3 */
6258 
6259 /*!
6260  * @}
6261  */
6262 #endif                                                /* XXH_IMPLEMENTATION */
6263 
6264 #if defined(__cplusplus)
6265 
6266 }
6267 
6268 #endif
6269 
6270