// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
/*
   xxHash - Extremely Fast Hash algorithm
   Header File
   Copyright (C) 2012-2016, Yann Collet.

   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:

   * Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
   * Redistributions in binary form must reproduce the above
   copyright notice, this list of conditions and the following disclaimer
   in the documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   You can contact the author at :
   - xxHash source repository : https://github.com/Cyan4973/xxHash
*/

// This is a fork of a preview version of xxHash, as RocksDB depends on
// this preview version of XXH3. To allow this to coexist with the
// standard xxHash, including in the "unity" build where all source files
// and headers go into a single translation unit, here "XXH" has been
// replaced with "XXPH" for XX Preview Hash.

#ifndef XXPHASH_H_5627135585666179
#define XXPHASH_H_5627135585666179 1

/* BEGIN RocksDB customizations */
#ifndef XXPH_STATIC_LINKING_ONLY
// Access experimental APIs
#define XXPH_STATIC_LINKING_ONLY 1
#endif
#define XXPH_NAMESPACE ROCKSDB_
#define XXPH_INLINE_ALL
#include <cstring>
/* END RocksDB customizations */

// clang-format off
#if defined (__cplusplus)
extern "C" {
#endif


/* ****************************
*  Definitions
******************************/
#include <stddef.h>   /* size_t */
typedef enum { XXPH_OK=0, XXPH_ERROR } XXPH_errorcode;


/* ****************************
*  API modifier
******************************/
/** XXPH_INLINE_ALL (and XXPH_PRIVATE_API)
 *  This build macro includes xxhash functions in `static` mode
 *  in order to inline them, and remove their symbol from the public list.
 *  Inlining offers great performance improvement on small keys,
 *  and dramatic ones when length is expressed as a compile-time constant.
 *  See https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html .
 *  Methodology :
 *     #define XXPH_INLINE_ALL
 *     #include "xxhash.h"
 *  `xxhash.c` is automatically included.
 *  It's not useful to compile and link it as a separate object.
 */
#if defined(XXPH_INLINE_ALL) || defined(XXPH_PRIVATE_API)
#  ifndef XXPH_STATIC_LINKING_ONLY
#    define XXPH_STATIC_LINKING_ONLY
#  endif
#  if defined(__GNUC__)
#    define XXPH_PUBLIC_API static __inline __attribute__((unused))
#  elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
#    define XXPH_PUBLIC_API static inline
#  elif defined(_MSC_VER)
#    define XXPH_PUBLIC_API static __inline
#  else
     /* this version may generate warnings for unused static functions */
#    define XXPH_PUBLIC_API static
#  endif
#else
#  if defined(WIN32) && defined(_MSC_VER) && (defined(XXPH_IMPORT) || defined(XXPH_EXPORT))
#    ifdef XXPH_EXPORT
#      define XXPH_PUBLIC_API __declspec(dllexport)
#    elif XXPH_IMPORT
#      define XXPH_PUBLIC_API __declspec(dllimport)
#    endif
#  else
#    define XXPH_PUBLIC_API   /* do nothing */
#  endif
#endif /* XXPH_INLINE_ALL || XXPH_PRIVATE_API */

/*! XXPH_NAMESPACE, aka Namespace Emulation :
 *
 * If you want to include _and expose_ xxHash functions from within your own library,
 * but also want to avoid symbol collisions with other libraries which may also include xxHash,
 *
 * you can use XXPH_NAMESPACE, to automatically prefix any public symbol from xxhash library
 * with the value of XXPH_NAMESPACE (therefore, avoid NULL and numeric values).
 *
 * Note that no change is required within the calling program as long as it includes `xxhash.h` :
 * regular symbol name will be automatically translated by this header.
 */
#ifdef XXPH_NAMESPACE
#  define XXPH_CAT(A,B) A##B
#  define XXPH_NAME2(A,B) XXPH_CAT(A,B)
#  define XXPH_versionNumber XXPH_NAME2(XXPH_NAMESPACE, XXPH_versionNumber)
#endif
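
/*
 * For illustration : with the RocksDB customization `#define XXPH_NAMESPACE ROCKSDB_`
 * above, the XXPH_NAME2() mapping makes a public symbol such as
 * XXPH_versionNumber() link as ROCKSDB_XXPH_versionNumber, while callers
 * keep writing the unprefixed name :
 *
 *     unsigned v = XXPH_versionNumber();   // expands to ROCKSDB_XXPH_versionNumber()
 */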


/* *************************************
*  Version
***************************************/
#define XXPH_VERSION_MAJOR    0
#define XXPH_VERSION_MINOR    7
#define XXPH_VERSION_RELEASE  2
#define XXPH_VERSION_NUMBER  (XXPH_VERSION_MAJOR *100*100 + XXPH_VERSION_MINOR *100 + XXPH_VERSION_RELEASE)
XXPH_PUBLIC_API unsigned XXPH_versionNumber (void);
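
/*
 * Worked example : with the values above (major 0, minor 7, release 2),
 * XXPH_VERSION_NUMBER evaluates to 0*10000 + 7*100 + 2 == 702, so callers
 * can compare versions with a plain integer check :
 *
 *     if (XXPH_versionNumber() >= 702) { ... }
 */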


/*-**********************************************************************
*  32-bit hash
************************************************************************/
#if !defined (__VMS) \
  && (defined (__cplusplus) \
  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
#  include <stdint.h>
   typedef uint32_t XXPH32_hash_t;
#else
#  include <limits.h>
#  if UINT_MAX == 0xFFFFFFFFUL
     typedef unsigned int XXPH32_hash_t;
#  else
#    if ULONG_MAX == 0xFFFFFFFFUL
       typedef unsigned long XXPH32_hash_t;
#    else
#      error "unsupported platform : need a 32-bit type"
#    endif
#  endif
#endif

#ifndef XXPH_NO_LONG_LONG
/*-**********************************************************************
*  64-bit hash
************************************************************************/
#if !defined (__VMS) \
  && (defined (__cplusplus) \
  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
#  include <stdint.h>
   typedef uint64_t XXPH64_hash_t;
#else
   /* the following type must have a width of 64-bit */
   typedef unsigned long long XXPH64_hash_t;
#endif

#endif /* XXPH_NO_LONG_LONG */



#ifdef XXPH_STATIC_LINKING_ONLY

/* ================================================================================================
   This section contains declarations which are not guaranteed to remain stable.
   They may change in future versions, becoming incompatible with a different version of the library.
   These declarations should only be used with static linking.
   Never use them in association with dynamic linking !
=================================================================================================== */


/*-**********************************************************************
*  XXPH3
*  New experimental hash
************************************************************************/
#ifndef XXPH_NO_LONG_LONG

/* ============================================
 * XXPH3 is a new hash algorithm,
 * featuring improved speed performance for both small and large inputs.
 * See full speed analysis at : http://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
 * In general, expect XXPH3 to run roughly 2x faster on large inputs,
 * and >3x faster on small ones, though exact differences depend on the platform.
 *
 * The algorithm is portable : it will generate the same hash on all platforms.
 * It benefits greatly from vectorization units, but does not require them.
 *
 * XXPH3 offers 2 variants, _64bits and _128bits.
 * When only 64 bits are needed, prefer calling the _64bits variant :
 * it reduces the amount of mixing, resulting in faster speed on small inputs.
 * It's also generally simpler to manipulate a scalar return type than a struct.
 *
 * The XXPH3 algorithm is still considered experimental.
 * Produced results can still change between versions.
 * Results produced by v0.7.x are not comparable with results from v0.7.y.
 * It's nonetheless possible to use XXPH3 for ephemeral data (local sessions),
 * but avoid storing values in long-term storage for later reads.
 *
 * The API supports one-shot hashing, streaming mode, and custom secrets.
 *
 * There are still a number of open questions that the community can influence during the experimental period.
 * I'm trying to list a few of them below, though don't consider this list complete.
 *
 * - 128-bit output type : currently defined as a structure of two 64-bit fields.
 *   That's because 128-bit values do not exist in the C standard.
 *   Note that this means that, at byte level, the result is not identical across platforms of different endianness.
 *   However, at field level, results are identical on all platforms.
 *   The canonical representation solves the issue of identical byte-level representation across platforms,
 *   which is necessary for serialization.
 *   Q1 : Would there be a better representation for a 128-bit hash result ?
 *   Q2 : Are the names of the inner 64-bit fields important ? Should they be changed ?
 *
 * - Prototype XXPH128() : XXPH128() uses the same arguments as XXPH64(), for consistency.
 *   It means it maps to XXPH3_128bits_withSeed().
 *   This variant is slightly slower than XXPH3_128bits(),
 *   because the seed is now part of the algorithm, and can't be simplified.
 *   Is that a good idea ?
 *
 * - Seed type for XXPH128() : currently, it's a single 64-bit value, like the 64-bit variant.
 *   It could be argued that it's more logical to offer a 128-bit seed input parameter for a 128-bit hash.
 *   But a 128-bit seed is more difficult to use, since it requires passing a structure instead of a scalar value.
 *   Such a variant could either replace the current one, or become an additional one.
 *   Farmhash, for example, offers both variants (the 128-bit seed variant is called `doubleSeed`).
 *   Follow-up question : if both 64-bit and 128-bit seeds are allowed, which variant should be called XXPH128 ?
 *
 * - Result for len==0 : Currently, the result of hashing a zero-length input is always `0`.
 *   It seems okay as a return value when using the "default" secret and seed.
 *   But is it still fine to return `0` when the secret or seed are non-default ?
 *   Are there use cases which could depend on generating a different hash result for zero-length input when the secret is different ?
 *
 * - Consistency (1) : Streaming XXPH128 uses an XXPH3 state, which is the same state as XXPH3_64bits().
 *   It means a 128-bit streaming loop must invoke the following symbols :
 *   XXPH3_createState(), XXPH3_128bits_reset(), XXPH3_128bits_update() (loop), XXPH3_128bits_digest(), XXPH3_freeState().
 *   Is that consistent enough ?
 *
 * - Consistency (2) : The canonical representation of `XXPH3_64bits` is provided by existing functions
 *   XXPH64_canonicalFromHash(), and the reverse operation XXPH64_hashFromCanonical().
 *   As a mirror, canonical functions for XXPH128_hash_t results generated by `XXPH3_128bits`
 *   are XXPH128_canonicalFromHash() and XXPH128_hashFromCanonical().
 *   This means `XXPH3` doesn't appear in the names, because canonical functions operate on a type,
 *   independently of which algorithm was used to generate that type.
 *   Is that consistent enough ?
 */

#ifdef XXPH_NAMESPACE
#  define XXPH3_64bits XXPH_NAME2(XXPH_NAMESPACE, XXPH3_64bits)
#  define XXPH3_64bits_withSecret XXPH_NAME2(XXPH_NAMESPACE, XXPH3_64bits_withSecret)
#  define XXPH3_64bits_withSeed XXPH_NAME2(XXPH_NAMESPACE, XXPH3_64bits_withSeed)
#endif

/* XXPH3_64bits() :
 * default 64-bit variant, using default secret and default seed of 0.
 * It's the fastest variant. */
XXPH_PUBLIC_API XXPH64_hash_t XXPH3_64bits(const void* data, size_t len);

/* XXPH3_64bits_withSecret() :
 * It's possible to provide any blob of bytes as a "secret" to generate the hash.
 * This makes it more difficult for an external actor to prepare an intentional collision.
 * The secret *must* be large enough (>= XXPH3_SECRET_SIZE_MIN).
 * It should consist of random bytes.
 * Avoid repeating the same character, or sequences of bytes,
 * and especially avoid swathes of \0.
 * Failure to respect these conditions will result in a poor quality hash.
 */
#define XXPH3_SECRET_SIZE_MIN 136
XXPH_PUBLIC_API XXPH64_hash_t XXPH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);

/* XXPH3_64bits_withSeed() :
 * This variant generates a custom secret on the fly,
 * based on the default secret, altered using the `seed` value.
 * While this operation is decently fast, note that it's not completely free.
 * note : seed==0 produces the same results as XXPH3_64bits() */
XXPH_PUBLIC_API XXPH64_hash_t XXPH3_64bits_withSeed(const void* data, size_t len, XXPH64_hash_t seed);
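
/*
 * Usage sketch (illustrative only) : the three one-shot variants above differ
 * only in where the secret comes from. `buf`, `n`, `customSecret` and
 * `secretSize` are hypothetical caller-side names; the secret buffer must
 * hold at least XXPH3_SECRET_SIZE_MIN (136) bytes of high-entropy data.
 *
 *     XXPH64_hash_t h1 = XXPH3_64bits(buf, n);                // default secret, seed 0
 *     XXPH64_hash_t h2 = XXPH3_64bits_withSeed(buf, n, 42);   // secret derived from seed
 *                                                             // (seed 0 would equal h1)
 *     XXPH64_hash_t h3 = XXPH3_64bits_withSecret(buf, n, customSecret, secretSize);
 */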

#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)   /* C11+ */
#  include <stdalign.h>
#  define XXPH_ALIGN(n)  alignas(n)
#elif defined(__GNUC__)
#  define XXPH_ALIGN(n)  __attribute__ ((aligned(n)))
#elif defined(_MSC_VER)
#  define XXPH_ALIGN(n)  __declspec(align(n))
#else
#  define XXPH_ALIGN(n)  /* disabled */
#endif

#define XXPH3_SECRET_DEFAULT_SIZE 192   /* minimum XXPH3_SECRET_SIZE_MIN */

#endif  /* XXPH_NO_LONG_LONG */


/*-**********************************************************************
*  XXPH_INLINE_ALL
************************************************************************/
#if defined(XXPH_INLINE_ALL) || defined(XXPH_PRIVATE_API)

/* === RocksDB modification: was #include here but permanently inlining === */

typedef struct {
    XXPH64_hash_t low64;
    XXPH64_hash_t high64;
} XXPH128_hash_t;

/* *************************************
*  Tuning parameters
***************************************/
/*!XXPH_FORCE_MEMORY_ACCESS :
 * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
 * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
 * The switch below allows selecting a different access method for improved performance.
 * Method 0 (default) : use `memcpy()`. Safe and portable.
 * Method 1 : `__packed` statement. It depends on compiler extensions (ie, not portable).
 *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
 * Method 2 : direct access. This method doesn't depend on the compiler, but it violates the C standard.
 *            It can generate buggy code on targets which do not support unaligned memory accesses.
 *            But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
 *            See http://stackoverflow.com/a/32095106/646947 for details.
 * Prefer these methods in priority order (0 > 1 > 2)
 */
#ifndef XXPH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
#  if !defined(__clang__) && defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && (__ARM_ARCH == 6)
#    define XXPH_FORCE_MEMORY_ACCESS 2
#  elif !defined(__clang__) && ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
  (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)))
#    define XXPH_FORCE_MEMORY_ACCESS 1
#  endif
#endif

/*!XXPH_ACCEPT_NULL_INPUT_POINTER :
 * If the input pointer is NULL, xxHash's default behavior is to dereference it, triggering a segfault.
 * When this macro is enabled, xxHash actively checks the input for a null pointer.
 * If it is NULL, the result is the same as for a zero-length input.
 */
#ifndef XXPH_ACCEPT_NULL_INPUT_POINTER   /* can be defined externally */
#  define XXPH_ACCEPT_NULL_INPUT_POINTER 0
#endif

/*!XXPH_FORCE_ALIGN_CHECK :
 * This is a minor performance trick, only useful with lots of very small keys.
 * It means : check for aligned/unaligned input.
 * The check costs one initial branch per hash;
 * set it to 0 when the input is guaranteed to be aligned,
 * or when alignment doesn't matter for performance.
 */
#ifndef XXPH_FORCE_ALIGN_CHECK   /* can be defined externally */
#  if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
#    define XXPH_FORCE_ALIGN_CHECK 0
#  else
#    define XXPH_FORCE_ALIGN_CHECK 1
#  endif
#endif

/*!XXPH_REROLL:
 * Whether to reroll XXPH32_finalize, and XXPH64_finalize,
 * instead of using an unrolled jump table/if statement loop.
 *
 * This is automatically defined on -Os/-Oz on GCC and Clang. */
#ifndef XXPH_REROLL
#  if defined(__OPTIMIZE_SIZE__)
#    define XXPH_REROLL 1
#  else
#    define XXPH_REROLL 0
#  endif
#endif

#include <limits.h>   /* ULLONG_MAX */

#ifndef XXPH_STATIC_LINKING_ONLY
#define XXPH_STATIC_LINKING_ONLY
#endif

/* BEGIN RocksDB customizations */
#include "port/lang.h" /* for FALLTHROUGH_INTENDED, inserted as appropriate */
/* END RocksDB customizations */

/* *************************************
*  Compiler Specific Options
***************************************/
#ifdef _MSC_VER    /* Visual Studio */
#  pragma warning(disable : 4127)   /* disable: C4127: conditional expression is constant */
#  define XXPH_FORCE_INLINE static __forceinline
#  define XXPH_NO_INLINE static __declspec(noinline)
#else
#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
#    ifdef __GNUC__
#      define XXPH_FORCE_INLINE static inline __attribute__((always_inline))
#      define XXPH_NO_INLINE static __attribute__((noinline))
#    else
#      define XXPH_FORCE_INLINE static inline
#      define XXPH_NO_INLINE static
#    endif
#  else
#    define XXPH_FORCE_INLINE static
#    define XXPH_NO_INLINE static
#  endif /* __STDC_VERSION__ */
#endif


/* *************************************
*  Debug
***************************************/
/* DEBUGLEVEL is expected to be defined externally,
 * typically through compiler command line.
 * Value must be a number. */
#ifndef DEBUGLEVEL
#  define DEBUGLEVEL 0
#endif

#if (DEBUGLEVEL>=1)
#  include <assert.h>   /* note : can still be disabled with NDEBUG */
#  define XXPH_ASSERT(c)   assert(c)
#else
#  define XXPH_ASSERT(c)   ((void)0)
#endif

/* note : use after variable declarations */
#define XXPH_STATIC_ASSERT(c)  { enum { XXPH_sa = 1/(int)(!!(c)) }; }


/* *************************************
*  Basic Types
***************************************/
#if !defined (__VMS) \
  && (defined (__cplusplus) \
  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
#  include <stdint.h>
   typedef uint8_t xxh_u8;
#else
   typedef unsigned char xxh_u8;
#endif
typedef XXPH32_hash_t xxh_u32;


/* ===   Memory access   === */

#if (defined(XXPH_FORCE_MEMORY_ACCESS) && (XXPH_FORCE_MEMORY_ACCESS==2))

/* Force direct memory access. Only works on CPUs which support unaligned memory access in hardware */
static xxh_u32 XXPH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }

#elif (defined(XXPH_FORCE_MEMORY_ACCESS) && (XXPH_FORCE_MEMORY_ACCESS==1))

/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
/* currently only defined for gcc and icc */
typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
static xxh_u32 XXPH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }

#else

/* portable and safe solution. Generally efficient.
 * see : http://stackoverflow.com/a/32095106/646947
 */
static xxh_u32 XXPH_read32(const void* memPtr)
{
    xxh_u32 val;
    memcpy(&val, memPtr, sizeof(val));
    return val;
}

#endif   /* XXPH_FORCE_DIRECT_MEMORY_ACCESS */

/* ===   Endianness   === */

/* XXPH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */
#ifndef XXPH_CPU_LITTLE_ENDIAN
#  if defined(_WIN32) /* Windows is always little endian */ \
     || defined(__LITTLE_ENDIAN__) \
     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#    define XXPH_CPU_LITTLE_ENDIAN 1
#  elif defined(__BIG_ENDIAN__) \
     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#    define XXPH_CPU_LITTLE_ENDIAN 0
#  else
static int XXPH_isLittleEndian(void)
{
    const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 };   /* don't use static : performance detrimental */
    return one.c[0];
}
#    define XXPH_CPU_LITTLE_ENDIAN   XXPH_isLittleEndian()
#  endif
#endif



/* ****************************************
*  Compiler-specific Functions and Macros
******************************************/
#define XXPH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)

#ifndef __has_builtin
#  define __has_builtin(x) 0
#endif

#if !defined(NO_CLANG_BUILTIN) && __has_builtin(__builtin_rotateleft32) && __has_builtin(__builtin_rotateleft64)
#  define XXPH_rotl32 __builtin_rotateleft32
#  define XXPH_rotl64 __builtin_rotateleft64
/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */
#elif defined(_MSC_VER)
#  define XXPH_rotl32(x,r) _rotl(x,r)
#  define XXPH_rotl64(x,r) _rotl64(x,r)
#else
#  define XXPH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
#  define XXPH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
#endif

#if defined(_MSC_VER)     /* Visual Studio */
#  define XXPH_swap32 _byteswap_ulong
#elif XXPH_GCC_VERSION >= 403
#  define XXPH_swap32 __builtin_bswap32
#else
static xxh_u32 XXPH_swap32 (xxh_u32 x)
{
    return  ((x << 24) & 0xff000000 ) |
            ((x <<  8) & 0x00ff0000 ) |
            ((x >>  8) & 0x0000ff00 ) |
            ((x >> 24) & 0x000000ff );
}
#endif


/* ***************************
*  Memory reads
*****************************/
typedef enum { XXPH_aligned, XXPH_unaligned } XXPH_alignment;

XXPH_FORCE_INLINE xxh_u32 XXPH_readLE32(const void* ptr)
{
    return XXPH_CPU_LITTLE_ENDIAN ? XXPH_read32(ptr) : XXPH_swap32(XXPH_read32(ptr));
}

XXPH_FORCE_INLINE xxh_u32
XXPH_readLE32_align(const void* ptr, XXPH_alignment align)
{
    if (align==XXPH_unaligned) {
        return XXPH_readLE32(ptr);
    } else {
        return XXPH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXPH_swap32(*(const xxh_u32*)ptr);
    }
}


/* *************************************
*  Misc
***************************************/
XXPH_PUBLIC_API unsigned XXPH_versionNumber (void) { return XXPH_VERSION_NUMBER; }


static const xxh_u32 PRIME32_1 = 0x9E3779B1U;   /* 0b10011110001101110111100110110001 */
static const xxh_u32 PRIME32_2 = 0x85EBCA77U;   /* 0b10000101111010111100101001110111 */
static const xxh_u32 PRIME32_3 = 0xC2B2AE3DU;   /* 0b11000010101100101010111000111101 */
static const xxh_u32 PRIME32_4 = 0x27D4EB2FU;   /* 0b00100111110101001110101100101111 */
static const xxh_u32 PRIME32_5 = 0x165667B1U;   /* 0b00010110010101100110011110110001 */

#ifndef XXPH_NO_LONG_LONG

/* *******************************************************************
*  64-bit hash functions
*********************************************************************/

/*======   Memory access   ======*/

typedef XXPH64_hash_t xxh_u64;

#if (defined(XXPH_FORCE_MEMORY_ACCESS) && (XXPH_FORCE_MEMORY_ACCESS==2))

/* Force direct memory access. Only works on CPUs which support unaligned memory access in hardware */
static xxh_u64 XXPH_read64(const void* memPtr) { return *(const xxh_u64*) memPtr; }

#elif (defined(XXPH_FORCE_MEMORY_ACCESS) && (XXPH_FORCE_MEMORY_ACCESS==1))

/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
/* currently only defined for gcc and icc */
typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;
static xxh_u64 XXPH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; }

#else

/* portable and safe solution. Generally efficient.
 * see : http://stackoverflow.com/a/32095106/646947
 */
static xxh_u64 XXPH_read64(const void* memPtr)
{
    xxh_u64 val;
    memcpy(&val, memPtr, sizeof(val));
    return val;
}

#endif   /* XXPH_FORCE_DIRECT_MEMORY_ACCESS */

#if defined(_MSC_VER)     /* Visual Studio */
#  define XXPH_swap64 _byteswap_uint64
#elif XXPH_GCC_VERSION >= 403
#  define XXPH_swap64 __builtin_bswap64
#else
static xxh_u64 XXPH_swap64 (xxh_u64 x)
{
    return  ((x << 56) & 0xff00000000000000ULL) |
            ((x << 40) & 0x00ff000000000000ULL) |
            ((x << 24) & 0x0000ff0000000000ULL) |
            ((x <<  8) & 0x000000ff00000000ULL) |
            ((x >>  8) & 0x00000000ff000000ULL) |
            ((x >> 24) & 0x0000000000ff0000ULL) |
            ((x >> 40) & 0x000000000000ff00ULL) |
            ((x >> 56) & 0x00000000000000ffULL);
}
#endif

XXPH_FORCE_INLINE xxh_u64 XXPH_readLE64(const void* ptr)
{
    return XXPH_CPU_LITTLE_ENDIAN ? XXPH_read64(ptr) : XXPH_swap64(XXPH_read64(ptr));
}

XXPH_FORCE_INLINE xxh_u64
XXPH_readLE64_align(const void* ptr, XXPH_alignment align)
{
    if (align==XXPH_unaligned)
        return XXPH_readLE64(ptr);
    else
        return XXPH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXPH_swap64(*(const xxh_u64*)ptr);
}


/*======   xxh64   ======*/

static const xxh_u64 PRIME64_1 = 0x9E3779B185EBCA87ULL;   /* 0b1001111000110111011110011011000110000101111010111100101010000111 */
static const xxh_u64 PRIME64_2 = 0xC2B2AE3D27D4EB4FULL;   /* 0b1100001010110010101011100011110100100111110101001110101101001111 */
static const xxh_u64 PRIME64_3 = 0x165667B19E3779F9ULL;   /* 0b0001011001010110011001111011000110011110001101110111100111111001 */
static const xxh_u64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL;   /* 0b1000010111101011110010100111011111000010101100101010111001100011 */
static const xxh_u64 PRIME64_5 = 0x27D4EB2F165667C5ULL;   /* 0b0010011111010100111010110010111100010110010101100110011111000101 */


/* *********************************************************************
*  XXPH3
*  New generation hash designed for speed on small keys and vectorization
************************************************************************ */

/*======== Was #include "xxh3.h", now inlined below ==========*/

/*
   xxHash - Extremely Fast Hash algorithm
   Development source file for `xxh3`
   Copyright (C) 2019-present, Yann Collet.

   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:

   * Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
   * Redistributions in binary form must reproduce the above
   copyright notice, this list of conditions and the following disclaimer
   in the documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   You can contact the author at :
   - xxHash source repository : https://github.com/Cyan4973/xxHash
*/

/* RocksDB Note: This file contains a preview release (xxhash repository
   version 0.7.2) of XXPH3 that is unlikely to be compatible with the final
   version of XXPH3. We have therefore renamed this XXPH3 ("preview"), for
   clarity so that we can continue to use this version even after
   integrating a newer incompatible version.
*/

/* ===   Dependencies   === */

#undef XXPH_INLINE_ALL   /* in case it's already defined */
#define XXPH_INLINE_ALL


/* ===   Compiler specifics   === */

#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
#  define XXPH_RESTRICT   restrict
#else
/* note : it might be useful to define __restrict or __restrict__ for some C++ compilers */
#  define XXPH_RESTRICT   /* disable */
#endif

#if defined(__GNUC__)
#  if defined(__AVX2__)
#    include <immintrin.h>
#  elif defined(__SSE2__)
#    include <emmintrin.h>
#  elif defined(__ARM_NEON__) || defined(__ARM_NEON)
#    define inline __inline__   /* clang bug */
#    include <arm_neon.h>
#    undef inline
#  endif
#elif defined(_MSC_VER)
#  include <intrin.h>
#endif

/*
 * Sanity check.
 *
 * XXPH3 only requires these features to be efficient:
 *
 *  - Usable unaligned access
 *  - A 32-bit or 64-bit ALU
 *      - If 32-bit, a decent ADC instruction
 *  - A 32 or 64-bit multiply with a 64-bit result
 *
 * Almost all 32-bit and 64-bit targets meet this, except for Thumb-1, the
 * classic 16-bit only subset of ARM's instruction set.
 *
 * First of all, Thumb-1 lacks support for the UMULL instruction which
 * performs the important long multiply. This means numerous __aeabi_lmul
 * calls.
 *
 * Second of all, the 8 functional registers are just not enough.
 * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
 * Lo registers, and this shuffling results in thousands more MOVs than A32.
 *
 * A32 and T32 don't have this limitation. They can access all 14 registers,
 * do a 32->64 multiply with UMULL, and the flexible operand is helpful too.
 *
 * If compiling Thumb-1 for a target which supports ARM instructions, we
 * will give a warning.
 *
 * Usually, if this happens, it is because of an accident and you probably
 * need to specify -march, as you probably meant to compile for a newer
 * architecture.
 */
#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
#  warning "XXPH3 is highly inefficient without ARM or Thumb-2."
#endif

/* ==========================================
 * Vectorization detection
 * ========================================== */
#define XXPH_SCALAR 0
#define XXPH_SSE2   1
#define XXPH_AVX2   2
#define XXPH_NEON   3
#define XXPH_VSX    4

#ifndef XXPH_VECTOR    /* can be defined on command line */
#  if defined(__AVX2__)
#    define XXPH_VECTOR XXPH_AVX2
#  elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
#    define XXPH_VECTOR XXPH_SSE2
#  elif defined(__GNUC__) /* msvc support maybe later */ \
    && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
    && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
    || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
#    define XXPH_VECTOR XXPH_NEON
#  elif defined(__PPC64__) && defined(__POWER8_VECTOR__) && defined(__GNUC__)
#    define XXPH_VECTOR XXPH_VSX
#  else
#    define XXPH_VECTOR XXPH_SCALAR
#  endif
#endif

/* control alignment of accumulator,
 * for compatibility with fast vector loads */
#ifndef XXPH_ACC_ALIGN
#  if XXPH_VECTOR == 0    /* scalar */
#    define XXPH_ACC_ALIGN 8
#  elif XXPH_VECTOR == 1  /* sse2 */
#    define XXPH_ACC_ALIGN 16
#  elif XXPH_VECTOR == 2  /* avx2 */
#    define XXPH_ACC_ALIGN 32
#  elif XXPH_VECTOR == 3  /* neon */
#    define XXPH_ACC_ALIGN 16
#  elif XXPH_VECTOR == 4  /* vsx */
#    define XXPH_ACC_ALIGN 16
#  endif
#endif

/* xxh_u64 XXPH_mult32to64(xxh_u32 a, xxh_u64 b) { return (xxh_u64)a * (xxh_u64)b; } */
#if defined(_MSC_VER) && defined(_M_IX86)
#  include <intrin.h>
#  define XXPH_mult32to64(x, y) __emulu(x, y)
#else
#  define XXPH_mult32to64(x, y) ((xxh_u64)((x) & 0xFFFFFFFF) * (xxh_u64)((y) & 0xFFFFFFFF))
#endif

/* VSX stuff. It's a lot because VSX support is mediocre across compilers and
 * there is a lot of mischief with endianness. */
#if XXPH_VECTOR == XXPH_VSX
#  include <altivec.h>
#  undef vector
typedef __vector unsigned long long U64x2;
typedef __vector unsigned char U8x16;
typedef __vector unsigned U32x4;

#ifndef XXPH_VSX_BE
#  if defined(__BIG_ENDIAN__) \
     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#    define XXPH_VSX_BE 1
#  elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
#    warning "-maltivec=be is not recommended. Please use native endianness."
#    define XXPH_VSX_BE 1
#  else
#    define XXPH_VSX_BE 0
#  endif
#endif

/* We need some helpers for big endian mode. */
#if XXPH_VSX_BE
/* A wrapper for POWER9's vec_revb. */
#  ifdef __POWER9_VECTOR__
#    define XXPH_vec_revb vec_revb
#  else
XXPH_FORCE_INLINE U64x2 XXPH_vec_revb(U64x2 val)
{
    U8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
                              0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
    return vec_perm(val, val, vByteSwap);
}
#  endif

/* Power8 Crypto gives us vpermxor which is very handy for
 * PPC64EB.
 *
 * U8x16 vpermxor(U8x16 a, U8x16 b, U8x16 mask)
 * {
 *     U8x16 ret;
 *     for (int i = 0; i < 16; i++) {
 *         ret[i] = a[mask[i] & 0xF] ^ b[mask[i] >> 4];
 *     }
 *     return ret;
 * }
 *
 * Because both of the main loops load the key, swap, and xor it with input,
 * we can combine the key swap into this instruction.
 */
#  ifdef vec_permxor
#    define XXPH_vec_permxor vec_permxor
#  else
#    define XXPH_vec_permxor __builtin_crypto_vpermxor
#  endif
#endif  /* XXPH_VSX_BE */
/*
 * Because we reinterpret the multiply, there are endian memes: vec_mulo actually becomes
 * vec_mule.
 *
 * Additionally, the intrinsic wasn't added until GCC 8, despite existing for a while.
 * Clang has an easy way to control this, we can just use the builtin which doesn't swap.
 * GCC needs inline assembly. */
#if __has_builtin(__builtin_altivec_vmuleuw)
#  define XXPH_vec_mulo __builtin_altivec_vmulouw
#  define XXPH_vec_mule __builtin_altivec_vmuleuw
#else
/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
XXPH_FORCE_INLINE U64x2 XXPH_vec_mulo(U32x4 a, U32x4 b) {
    U64x2 result;
    __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
    return result;
}
XXPH_FORCE_INLINE U64x2 XXPH_vec_mule(U32x4 a, U32x4 b) {
    U64x2 result;
    __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
    return result;
}
#endif /* __has_builtin(__builtin_altivec_vmuleuw) */
#endif /* XXPH_VECTOR == XXPH_VSX */

/* prefetch
 * can be disabled, by declaring XXPH_NO_PREFETCH build macro */
#if defined(XXPH_NO_PREFETCH)
#  define XXPH_PREFETCH(ptr)  (void)(ptr)   /* disabled */
#else
#  if defined(_MSC_VER) && \
      (defined(_M_X64) || \
       defined(_M_IX86))   /* _mm_prefetch() is not defined outside of x86/x64 */
#    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
#    define XXPH_PREFETCH(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
#    define XXPH_PREFETCH(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
#  else
#    define XXPH_PREFETCH(ptr)  (void)(ptr)   /* disabled */
#  endif
#endif  /* XXPH_NO_PREFETCH */


/* ==========================================
 * XXPH3 default settings
 * ========================================== */

#define XXPH_SECRET_DEFAULT_SIZE 192   /* minimum XXPH3_SECRET_SIZE_MIN */

#if (XXPH_SECRET_DEFAULT_SIZE < XXPH3_SECRET_SIZE_MIN)
#  error "default keyset is not large enough"
#endif

XXPH_ALIGN(64) static const xxh_u8 kSecret[XXPH_SECRET_DEFAULT_SIZE] = {
    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,

    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
};

/*
 * GCC for x86 has a tendency to use SSE in this loop. While it
 * successfully avoids swapping (as MUL overwrites EAX and EDX), it
 * slows it down because instead of free register swap shifts, it
 * must use pshufd and punpckl/hd.
 *
 * To prevent this, we use this attribute to shut off SSE.
 */
#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
__attribute__((__target__("no-sse")))
#endif
static XXPH128_hash_t
XXPH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
{
    /*
     * GCC/Clang __uint128_t method.
     *
     * On most 64-bit targets, GCC and Clang define a __uint128_t type.
     * This is usually the best way as it usually uses a native long 64-bit
     * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
     *
     * Usually.
     *
     * Even on 32-bit platforms, however, Clang (and emscripten) define this
     * type despite not having the arithmetic for it. This results in a
     * laggy compiler builtin call which calculates a full 128-bit multiply.
     * In that case it is best to use the portable one.
     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
     */
#if defined(__GNUC__) && !defined(__wasm__) \
    && defined(__SIZEOF_INT128__) \
    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)

    __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs;
    XXPH128_hash_t const r128 = { (xxh_u64)(product), (xxh_u64)(product >> 64) };
    return r128;

    /*
     * MSVC for x64's _umul128 method.
     *
     * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
     *
     * This compiles to single operand MUL on x64.
     */
#elif defined(_M_X64) || defined(_M_IA64)

#ifndef _MSC_VER
#  pragma intrinsic(_umul128)
#endif
    xxh_u64 product_high;
    xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
    XXPH128_hash_t const r128 = { product_low, product_high };
    return r128;

#else
    /*
     * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
     *
     * This is a fast and simple grade school multiply, which is shown
     * below with base 10 arithmetic instead of base 0x100000000.
     *
     *           9 3 // D2 lhs = 93
     *         x 7 5 // D2 rhs = 75
     *     ----------
     *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10)
     *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10)
     *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10)
     *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10)
     *     ---------
     *         2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21
     *     + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63
     *     ---------
     *       6 9 7 5
     *
     * The reasons for adding the products like this are:
     *  1. It avoids manual carry tracking. Just like how
     *     (9 * 9) + 9 + 9 = 99, the same applies with this for
     *     UINT64_MAX. This avoids a lot of complexity.
     *
     *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
     *     instruction available in ARMv6+ A32/T32, which is shown below:
     *
     *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
     *         {
     *             xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
     *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
     *             *RdHi = (xxh_u32)(product >> 32);
     *         }
     *
     *     This instruction was designed for efficient long multiplication,
     *     and allows this to be calculated in only 4 instructions which
     *     is comparable to some 64-bit ALUs.
     *
     *  3. It isn't terrible on other platforms. Usually this will be
     *     a couple of 32-bit ADD/ADCs.
     */

    /* First calculate all of the cross products. */
    xxh_u64 const lo_lo = XXPH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
    xxh_u64 const hi_lo = XXPH_mult32to64(lhs >> 32,        rhs & 0xFFFFFFFF);
    xxh_u64 const lo_hi = XXPH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
    xxh_u64 const hi_hi = XXPH_mult32to64(lhs >> 32,        rhs >> 32);

    /* Now add the products together. These will never overflow. */
    xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
    xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32)        + hi_hi;
    xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);

    XXPH128_hash_t r128 = { lower, upper };
    return r128;
#endif
}

/*
 * We want to keep the attribute here because a target switch
 * disables inlining.
 *
 * Does a 64-bit to 128-bit multiply, then XOR folds it.
 * The reason for the separate function is to prevent passing
 * too many structs around by value. This will hopefully inline
 * the multiply, but we don't force it.
 */
#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
__attribute__((__target__("no-sse")))
#endif
static xxh_u64
XXPH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
{
    XXPH128_hash_t product = XXPH_mult64to128(lhs, rhs);
    return product.low64 ^ product.high64;
}


static XXPH64_hash_t XXPH3_avalanche(xxh_u64 h64)
{
    h64 ^= h64 >> 37;
    h64 *= PRIME64_3;
    h64 ^= h64 >> 32;
    return h64;
}


/* ==========================================
 * Short keys
 * ========================================== */

XXPH_FORCE_INLINE XXPH64_hash_t
XXPH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXPH64_hash_t seed)
{
    XXPH_ASSERT(input != NULL);
    XXPH_ASSERT(1 <= len && len <= 3);
    XXPH_ASSERT(secret != NULL);
    {   xxh_u8 const c1 = input[0];
        xxh_u8 const c2 = input[len >> 1];
        xxh_u8 const c3 = input[len - 1];
        xxh_u32 const combined = ((xxh_u32)c1) | (((xxh_u32)c2) << 8) | (((xxh_u32)c3) << 16) | (((xxh_u32)len) << 24);
        xxh_u64 const keyed = (xxh_u64)combined ^ (XXPH_readLE32(secret) + seed);
        xxh_u64 const mixed = keyed * PRIME64_1;
        return XXPH3_avalanche(mixed);
    }
}

XXPH_FORCE_INLINE XXPH64_hash_t
XXPH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXPH64_hash_t seed)
{
    XXPH_ASSERT(input != NULL);
    XXPH_ASSERT(secret != NULL);
    XXPH_ASSERT(4 <= len && len <= 8);
    {   xxh_u32 const input_lo = XXPH_readLE32(input);
        xxh_u32 const input_hi = XXPH_readLE32(input + len - 4);
        xxh_u64 const input_64 = input_lo | ((xxh_u64)input_hi << 32);
        xxh_u64 const keyed = input_64 ^ (XXPH_readLE64(secret) + seed);
        xxh_u64 const mix64 = len + ((keyed ^ (keyed >> 51)) * PRIME32_1);
        return XXPH3_avalanche((mix64 ^ (mix64 >> 47)) * PRIME64_2);
    }
}

XXPH_FORCE_INLINE XXPH64_hash_t
XXPH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXPH64_hash_t seed)
{
    XXPH_ASSERT(input != NULL);
    XXPH_ASSERT(secret != NULL);
    XXPH_ASSERT(9 <= len && len <= 16);
    {   xxh_u64 const input_lo = XXPH_readLE64(input)           ^ (XXPH_readLE64(secret)     + seed);
        xxh_u64 const input_hi = XXPH_readLE64(input + len - 8) ^ (XXPH_readLE64(secret + 8) - seed);
        xxh_u64 const acc = len + (input_lo + input_hi) + XXPH3_mul128_fold64(input_lo, input_hi);
        return XXPH3_avalanche(acc);
    }
}

XXPH_FORCE_INLINE XXPH64_hash_t
XXPH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXPH64_hash_t seed)
{
    XXPH_ASSERT(len <= 16);
    {   if (len > 8) return XXPH3_len_9to16_64b(input, len, secret, seed);
        if (len >= 4) return XXPH3_len_4to8_64b(input, len, secret, seed);
        if (len) return XXPH3_len_1to3_64b(input, len, secret, seed);
        /*
         * RocksDB modification from XXPH3 preview: zero result for empty
         * string can be problematic for multiplication-based algorithms.
         * Return a hash of the seed instead.
         */
        return XXPH3_mul128_fold64(seed + XXPH_readLE64(secret), PRIME64_2);
    }
}


/* ===    Long Keys    === */

#define STRIPE_LEN 64
#define XXPH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
#define ACC_NB (STRIPE_LEN / sizeof(xxh_u64))

typedef enum { XXPH3_acc_64bits, XXPH3_acc_128bits } XXPH3_accWidth_e;

XXPH_FORCE_INLINE void
XXPH3_accumulate_512(      void* XXPH_RESTRICT acc,
                     const void* XXPH_RESTRICT input,
                     const void* XXPH_RESTRICT secret,
                     XXPH3_accWidth_e accWidth)
{
#if (XXPH_VECTOR == XXPH_AVX2)

    XXPH_ASSERT((((size_t)acc) & 31) == 0);
    {   XXPH_ALIGN(32) __m256i* const xacc  = (__m256i *) acc;
        const __m256i* const xinput  = (const __m256i *) input;   /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */
        const __m256i* const xsecret = (const __m256i *) secret;  /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
            __m256i const data_vec = _mm256_loadu_si256 (xinput+i);
            __m256i const key_vec  = _mm256_loadu_si256 (xsecret+i);
            __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);   /* uint32 dk[8] = {d0^k0, d1^k1, d2^k2, d3^k3, ...} */
            __m256i const product  = _mm256_mul_epu32 (data_key, _mm256_shuffle_epi32 (data_key, 0x31));   /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */
            if (accWidth == XXPH3_acc_128bits) {
                __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
                __m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
                xacc[i] = _mm256_add_epi64(product, sum);
            } else {   /* XXPH3_acc_64bits */
                __m256i const sum = _mm256_add_epi64(xacc[i], data_vec);
                xacc[i] = _mm256_add_epi64(product, sum);
            }
    }   }

#elif (XXPH_VECTOR == XXPH_SSE2)

    XXPH_ASSERT((((size_t)acc) & 15) == 0);
    {   XXPH_ALIGN(16) __m128i* const xacc  = (__m128i *) acc;
        const __m128i* const xinput  = (const __m128i *) input;   /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */
        const __m128i* const xsecret = (const __m128i *) secret;  /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
            __m128i const data_vec = _mm_loadu_si128 (xinput+i);
            __m128i const key_vec  = _mm_loadu_si128 (xsecret+i);
            __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);   /* uint32 dk[4] = {d0^k0, d1^k1, d2^k2, d3^k3} */
            __m128i const product  = _mm_mul_epu32 (data_key, _mm_shuffle_epi32 (data_key, 0x31));   /* uint64 mul[2] = {dk0*dk1, dk2*dk3} */
            if (accWidth == XXPH3_acc_128bits) {
                __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
                __m128i const sum = _mm_add_epi64(xacc[i], data_swap);
                xacc[i] = _mm_add_epi64(product, sum);
            } else {   /* XXPH3_acc_64bits */
                __m128i const sum = _mm_add_epi64(xacc[i], data_vec);
                xacc[i] = _mm_add_epi64(product, sum);
            }
    }   }

#elif (XXPH_VECTOR == XXPH_NEON)

    XXPH_ASSERT((((size_t)acc) & 15) == 0);
    {
        XXPH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
        /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
        uint8_t const* const xinput  = (const uint8_t *) input;
        uint8_t const* const xsecret = (const uint8_t *) secret;

        size_t i;
        for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
#if !defined(__aarch64__) && !defined(__arm64__) && defined(__GNUC__)   /* ARM32-specific hack */
            /* vzip on ARMv7 Clang generates a lot of vmovs (technically vorrs) without this.
             * vzip on 32-bit ARM NEON will overwrite the original register, and I think that Clang
             * assumes I don't want to destroy it and tries to make a copy. This slows down the code
             * a lot.
             * aarch64 not only uses an entirely different syntax, but it requires three
             * instructions...
             *    ext    v1.16B, v0.16B, #8    // select high bits because aarch64 can't address them directly
             *    zip1   v3.2s, v0.2s, v1.2s   // first zip
             *    zip2   v2.2s, v0.2s, v1.2s   // second zip
             * ...to do what ARM does in one:
             *    vzip.32 d0, d1               // Interleave high and low bits and overwrite. */

1231 | /* data_vec = xsecret[i]; */ | |
1232 | uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16)); | |
1233 | /* key_vec = xsecret[i]; */ | |
1234 | uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16)); | |
1235 | /* data_key = data_vec ^ key_vec; */ | |
1236 | uint32x4_t data_key; | |
1237 | ||
1238 | if (accWidth == XXPH3_acc_64bits) { | |
1239 | /* Add first to prevent register swaps */ | |
1240 | /* xacc[i] += data_vec; */ | |
1241 | xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec)); | |
1242 | } else { /* XXPH3_acc_128bits */ | |
1243 | /* xacc[i] += swap(data_vec); */ | |
1244 | /* can probably be optimized better */ | |
1245 | uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); | |
1246 | uint64x2_t const swapped= vextq_u64(data64, data64, 1); | |
1247 | xacc[i] = vaddq_u64 (xacc[i], swapped); | |
1248 | } | |
1249 | ||
1250 | data_key = vreinterpretq_u32_u8(veorq_u8(data_vec, key_vec)); | |
1251 | ||
1252 | /* Here's the magic. We use the quirkiness of vzip to shuffle data_key in place. | |
1253 | * shuffle: data_key[0, 1, 2, 3] = data_key[0, 2, 1, 3] */ | |
1254 | __asm__("vzip.32 %e0, %f0" : "+w" (data_key)); | |
1255 | /* xacc[i] += (uint64x2_t) data_key[0, 1] * (uint64x2_t) data_key[2, 3]; */ | |
1256 | xacc[i] = vmlal_u32(xacc[i], vget_low_u32(data_key), vget_high_u32(data_key)); | |
1257 | ||
1258 | #else | |
1259 | /* On aarch64, vshrn/vmovn seems to be equivalent to, if not faster than, the vzip method. */ | |
1260 | ||
1261 | /* data_vec = xsecret[i]; */ | |
1262 | uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16)); | |
1263 | /* key_vec = xsecret[i]; */ | |
1264 | uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16)); | |
1265 | /* data_key = data_vec ^ key_vec; */ | |
1266 | uint64x2_t const data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec)); | |
1267 | /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); */ | |
1268 | uint32x2_t const data_key_lo = vmovn_u64 (data_key); | |
1269 | /* data_key_hi = (uint32x2_t) (data_key >> 32); */ | |
1270 | uint32x2_t const data_key_hi = vshrn_n_u64 (data_key, 32); | |
1271 | if (accWidth == XXPH3_acc_64bits) { | |
1272 | /* xacc[i] += data_vec; */ | |
1273 | xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec)); | |
1274 | } else { /* XXPH3_acc_128bits */ | |
1275 | /* xacc[i] += swap(data_vec); */ | |
1276 | uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); | |
1277 | uint64x2_t const swapped= vextq_u64(data64, data64, 1); | |
1278 | xacc[i] = vaddq_u64 (xacc[i], swapped); | |
1279 | } | |
1280 | /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ | |
1281 | xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi); | |
1282 | ||
1283 | #endif | |
1284 | } | |
1285 | } | |
1286 | ||
1287 | #elif (XXPH_VECTOR == XXPH_VSX) && /* work around a compiler bug */ (__GNUC__ > 5) | |
1288 | U64x2* const xacc = (U64x2*) acc; /* presumed aligned */ | |
1289 | U64x2 const* const xinput = (U64x2 const*) input; /* no alignment restriction */ | |
1290 | U64x2 const* const xsecret = (U64x2 const*) secret; /* no alignment restriction */ | |
1291 | U64x2 const v32 = { 32, 32 }; | |
1292 | #if XXPH_VSX_BE | |
1293 | U8x16 const vXorSwap = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70, | |
1294 | 0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 }; | |
1295 | #endif | |
1296 | size_t i; | |
1297 | for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) { | |
1298 | /* data_vec = xinput[i]; */ | |
1299 | /* key_vec = xsecret[i]; */ | |
1300 | #if XXPH_VSX_BE | |
1301 | /* byteswap */ | |
1302 | U64x2 const data_vec = XXPH_vec_revb(vec_vsx_ld(0, xinput + i)); | |
1303 | U64x2 const key_raw = vec_vsx_ld(0, xsecret + i); | |
1304 | /* See comment above. data_key = data_vec ^ swap(xsecret[i]); */ | |
1305 | U64x2 const data_key = (U64x2)XXPH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap); | |
1306 | #else | |
1307 | U64x2 const data_vec = vec_vsx_ld(0, xinput + i); | |
1308 | U64x2 const key_vec = vec_vsx_ld(0, xsecret + i); | |
1309 | U64x2 const data_key = data_vec ^ key_vec; | |
1310 | #endif | |
1311 | /* shuffled = (data_key << 32) | (data_key >> 32); */ | |
1312 | U32x4 const shuffled = (U32x4)vec_rl(data_key, v32); | |
1313 | /* product = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)shuffled & 0xFFFFFFFF); */ | |
1314 | U64x2 const product = XXPH_vec_mulo((U32x4)data_key, shuffled); | |
1315 | xacc[i] += product; | |
1316 | ||
1317 | if (accWidth == XXPH3_acc_64bits) { | |
1318 | xacc[i] += data_vec; | |
1319 | } else { /* XXPH3_acc_128bits */ | |
1320 | /* swap high and low halves */ | |
1321 | U64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2); | |
1322 | xacc[i] += data_swapped; | |
1323 | } | |
1324 | } | |
1325 | ||
1326 | #else /* scalar variant of Accumulator - universal */ | |
1327 | ||
1328 | XXPH_ALIGN(XXPH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */ | |
1329 | const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */ | |
1330 | const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ | |
1331 | size_t i; | |
1332 | XXPH_ASSERT(((size_t)acc & (XXPH_ACC_ALIGN-1)) == 0); | |
1333 | for (i=0; i < ACC_NB; i++) { | |
1334 | xxh_u64 const data_val = XXPH_readLE64(xinput + 8*i); | |
1335 | xxh_u64 const data_key = data_val ^ XXPH_readLE64(xsecret + i*8); | |
1336 | ||
1337 | if (accWidth == XXPH3_acc_64bits) { | |
1338 | xacc[i] += data_val; | |
1339 | } else { | |
1340 | xacc[i ^ 1] += data_val; /* swap adjacent lanes */ | |
1341 | } | |
1342 | xacc[i] += XXPH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); | |
1343 | } | |
1344 | #endif | |
1345 | } | |
1346 | ||
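/*
 * In scalar terms, one 64-byte stripe in 64-bit mode does, per u64 lane i:
 *     dk      = input64[i] ^ secret64[i];
 *     acc[i] += input64[i] + (dk & 0xFFFFFFFF) * (dk >> 32);
 * The AVX2/SSE2/NEON/VSX paths above compute exactly this, 2 or 4 lanes at
 * a time.
 */
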
XXPH_FORCE_INLINE void
XXPH3_scrambleAcc(void* XXPH_RESTRICT acc, const void* XXPH_RESTRICT secret)
{
#if (XXPH_VECTOR == XXPH_AVX2)

    XXPH_ASSERT((((size_t)acc) & 31) == 0);
    {   XXPH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
        const __m256i* const xsecret = (const __m256i *) secret;   /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this argument type */
        const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1);

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
            /* xacc[i] ^= (xacc[i] >> 47) */
            __m256i const acc_vec  = xacc[i];
            __m256i const shifted  = _mm256_srli_epi64 (acc_vec, 47);
            __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted);
            /* xacc[i] ^= xsecret; */
            __m256i const key_vec  = _mm256_loadu_si256 (xsecret+i);
            __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);

            /* xacc[i] *= PRIME32_1; */
            __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, 0x31);
            __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32);
            __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32);
            xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
        }
    }

#elif (XXPH_VECTOR == XXPH_SSE2)

    XXPH_ASSERT((((size_t)acc) & 15) == 0);
    {   XXPH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
        const __m128i* const xsecret = (const __m128i *) secret;   /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this argument type */
        const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1);

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
            /* xacc[i] ^= (xacc[i] >> 47) */
            __m128i const acc_vec  = xacc[i];
            __m128i const shifted  = _mm_srli_epi64 (acc_vec, 47);
            __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted);
            /* xacc[i] ^= xsecret; */
            __m128i const key_vec  = _mm_loadu_si128 (xsecret+i);
            __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);

            /* xacc[i] *= PRIME32_1; */
            __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, 0x31);
            __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32);
            __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32);
            xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
        }
    }

#elif (XXPH_VECTOR == XXPH_NEON)

    XXPH_ASSERT((((size_t)acc) & 15) == 0);

    {   uint64x2_t* const xacc = (uint64x2_t*) acc;
        uint8_t const* const xsecret = (uint8_t const*) secret;
        uint32x2_t const prime = vdup_n_u32 (PRIME32_1);

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) {
            /* data_vec = xacc[i] ^ (xacc[i] >> 47); */
            uint64x2_t const acc_vec  = xacc[i];
            uint64x2_t const shifted  = vshrq_n_u64 (acc_vec, 47);
            uint64x2_t const data_vec = veorq_u64 (acc_vec, shifted);

            /* key_vec = xsecret[i]; */
            uint32x4_t const key_vec  = vreinterpretq_u32_u8(vld1q_u8(xsecret + (i * 16)));
            /* data_key = data_vec ^ key_vec; */
            uint32x4_t const data_key = veorq_u32 (vreinterpretq_u32_u64(data_vec), key_vec);
            /* shuffled = { data_key[0, 2], data_key[1, 3] }; */
            uint32x2x2_t const shuffled = vzip_u32 (vget_low_u32(data_key), vget_high_u32(data_key));

            /* data_key *= PRIME32_1 */

            /* prod_hi = (data_key >> 32) * PRIME32_1; */
            uint64x2_t const prod_hi = vmull_u32 (shuffled.val[1], prime);
            /* xacc[i] = prod_hi << 32; */
            xacc[i] = vshlq_n_u64(prod_hi, 32);
            /* xacc[i] += (data_key & 0xFFFFFFFF) * PRIME32_1; */
            xacc[i] = vmlal_u32(xacc[i], shuffled.val[0], prime);
    }   }

#elif (XXPH_VECTOR == XXPH_VSX) && /* work around a compiler bug */ (__GNUC__ > 5)

    U64x2* const xacc = (U64x2*) acc;
    const U64x2* const xsecret = (const U64x2*) secret;
    /* constants */
    U64x2 const v32 = { 32, 32 };
    U64x2 const v47 = { 47, 47 };
    U32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 };
    size_t i;
#if XXPH_VSX_BE
    /* endian swap */
    U8x16 const vXorSwap = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
                             0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
#endif
    for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
        U64x2 const acc_vec  = xacc[i];
        U64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
        /* key_vec = xsecret[i]; */
#if XXPH_VSX_BE
        /* byteswap, then xor (see vXorSwap above) */
        U64x2 const key_raw  = vec_vsx_ld(0, xsecret + i);
        U64x2 const data_key = (U64x2)XXPH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap);
#else
        U64x2 const key_vec  = vec_vsx_ld(0, xsecret + i);
        U64x2 const data_key = data_vec ^ key_vec;
#endif

        /* data_key *= PRIME32_1 */

        /* prod_lo = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)prime & 0xFFFFFFFF); */
        U64x2 const prod_even = XXPH_vec_mule((U32x4)data_key, prime);
        /* prod_hi = ((U64x2)data_key >> 32) * ((U64x2)prime >> 32); */
        U64x2 const prod_odd  = XXPH_vec_mulo((U32x4)data_key, prime);
        xacc[i] = prod_odd + (prod_even << v32);
    }

#else   /* scalar variant of Scrambler - universal */

    XXPH_ALIGN(XXPH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc;   /* presumed aligned on a 32-byte boundary; a little hint for the auto-vectorizer */
    const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
    size_t i;
    XXPH_ASSERT((((size_t)acc) & (XXPH_ACC_ALIGN-1)) == 0);
    for (i=0; i < ACC_NB; i++) {
        xxh_u64 const key64 = XXPH_readLE64(xsecret + 8*i);
        xxh_u64 acc64 = xacc[i];
        acc64 ^= acc64 >> 47;
        acc64 ^= key64;
        acc64 *= PRIME32_1;
        xacc[i] = acc64;
    }

#endif
}

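/*
 * The scramble runs once per block: each lane goes through a 47-bit
 * xorshift, is xored with fresh secret bytes, then multiplied by PRIME32_1.
 * This periodic remixing keeps the accumulators from degenerating over very
 * long inputs.
 */
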
#define XXPH_PREFETCH_DIST 384

/* assumption : nbStripes will not overflow secret size */
XXPH_FORCE_INLINE void
XXPH3_accumulate(      xxh_u64* XXPH_RESTRICT acc,
                 const xxh_u8* XXPH_RESTRICT input,
                 const xxh_u8* XXPH_RESTRICT secret,
                       size_t nbStripes,
                 XXPH3_accWidth_e accWidth)
{
    size_t n;
    for (n = 0; n < nbStripes; n++ ) {
        const xxh_u8* const in = input + n*STRIPE_LEN;
        XXPH_PREFETCH(in + XXPH_PREFETCH_DIST);
        XXPH3_accumulate_512(acc,
                             in,
                             secret + n*XXPH_SECRET_CONSUME_RATE,
                             accWidth);
    }
}

/* note : clang auto-vectorizes well in SSE2 mode _if_ this function is `static`,
 *        and doesn't auto-vectorize it at all if it is `FORCE_INLINE`.
 *        However, it auto-vectorizes the AVX2 variant better if it is `FORCE_INLINE`.
 *        Pretty much every other mode and compiler prefers `FORCE_INLINE`.
 */

#if defined(__clang__) && (XXPH_VECTOR==0) && !defined(__AVX2__) && !defined(__arm__) && !defined(__thumb__)
static void
#else
XXPH_FORCE_INLINE void
#endif
XXPH3_hashLong_internal_loop( xxh_u64* XXPH_RESTRICT acc,
                       const xxh_u8* XXPH_RESTRICT input, size_t len,
                       const xxh_u8* XXPH_RESTRICT secret, size_t secretSize,
                             XXPH3_accWidth_e accWidth)
{
    size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXPH_SECRET_CONSUME_RATE;
    size_t const block_len = STRIPE_LEN * nb_rounds;
    size_t const nb_blocks = len / block_len;

    size_t n;

    XXPH_ASSERT(secretSize >= XXPH3_SECRET_SIZE_MIN);

    for (n = 0; n < nb_blocks; n++) {
        XXPH3_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth);
        XXPH3_scrambleAcc(acc, secret + secretSize - STRIPE_LEN);
    }

    /* last partial block */
    XXPH_ASSERT(len > STRIPE_LEN);
    {   size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN;
        XXPH_ASSERT(nbStripes <= (secretSize / XXPH_SECRET_CONSUME_RATE));
        XXPH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth);

        /* last stripe */
        if (len & (STRIPE_LEN - 1)) {
            const xxh_u8* const p = input + len - STRIPE_LEN;
#define XXPH_SECRET_LASTACC_START 7   /* do not align on 8, so that secret is different from scrambler */
            XXPH3_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXPH_SECRET_LASTACC_START, accWidth);
    }   }
}

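/*
 * Worked example (assuming the 192-byte default kSecret defined earlier in
 * this header): nb_rounds = (192 - 64) / 8 = 16 stripes per block, so
 * block_len = 16 * 64 = 1024 bytes and the accumulators are scrambled once
 * per KiB of input consumed.
 */
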
XXPH_FORCE_INLINE xxh_u64
XXPH3_mix2Accs(const xxh_u64* XXPH_RESTRICT acc, const xxh_u8* XXPH_RESTRICT secret)
{
    return XXPH3_mul128_fold64(
               acc[0] ^ XXPH_readLE64(secret),
               acc[1] ^ XXPH_readLE64(secret+8) );
}

static XXPH64_hash_t
XXPH3_mergeAccs(const xxh_u64* XXPH_RESTRICT acc, const xxh_u8* XXPH_RESTRICT secret, xxh_u64 start)
{
    xxh_u64 result64 = start;

    result64 += XXPH3_mix2Accs(acc+0, secret + 0);
    result64 += XXPH3_mix2Accs(acc+2, secret + 16);
    result64 += XXPH3_mix2Accs(acc+4, secret + 32);
    result64 += XXPH3_mix2Accs(acc+6, secret + 48);

    return XXPH3_avalanche(result64);
}

#define XXPH3_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \
                         PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 }

XXPH_FORCE_INLINE XXPH64_hash_t
XXPH3_hashLong_internal(const xxh_u8* XXPH_RESTRICT input, size_t len,
                        const xxh_u8* XXPH_RESTRICT secret, size_t secretSize)
{
    XXPH_ALIGN(XXPH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXPH3_INIT_ACC;

    XXPH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXPH3_acc_64bits);

    /* converge into final hash */
    XXPH_STATIC_ASSERT(sizeof(acc) == 64);
#define XXPH_SECRET_MERGEACCS_START 11   /* do not align on 8, so that secret is different from accumulator */
    XXPH_ASSERT(secretSize >= sizeof(acc) + XXPH_SECRET_MERGEACCS_START);
    return XXPH3_mergeAccs(acc, secret + XXPH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1);
}


XXPH_NO_INLINE XXPH64_hash_t   /* It's important for performance that XXPH3_hashLong is not inlined. Not sure why (uop cache maybe?), but the difference is large and easily measurable */
XXPH3_hashLong_64b_defaultSecret(const xxh_u8* XXPH_RESTRICT input, size_t len)
{
    return XXPH3_hashLong_internal(input, len, kSecret, sizeof(kSecret));
}

XXPH_NO_INLINE XXPH64_hash_t   /* It's important for performance that XXPH3_hashLong is not inlined. Not sure why (uop cache maybe?), but the difference is large and easily measurable */
XXPH3_hashLong_64b_withSecret(const xxh_u8* XXPH_RESTRICT input, size_t len,
                              const xxh_u8* XXPH_RESTRICT secret, size_t secretSize)
{
    return XXPH3_hashLong_internal(input, len, secret, secretSize);
}


XXPH_FORCE_INLINE void XXPH_writeLE64(void* dst, xxh_u64 v64)
{
    if (!XXPH_CPU_LITTLE_ENDIAN) v64 = XXPH_swap64(v64);
    memcpy(dst, &v64, sizeof(v64));
}

/* XXPH3_initCustomSecret() :
 * destination `customSecret` is presumed allocated and same size as `kSecret`.
 */
XXPH_FORCE_INLINE void XXPH3_initCustomSecret(xxh_u8* customSecret, xxh_u64 seed64)
{
    int const nbRounds = XXPH_SECRET_DEFAULT_SIZE / 16;
    int i;

    XXPH_STATIC_ASSERT((XXPH_SECRET_DEFAULT_SIZE & 15) == 0);

    for (i=0; i < nbRounds; i++) {
        XXPH_writeLE64(customSecret + 16*i,     XXPH_readLE64(kSecret + 16*i)     + seed64);
        XXPH_writeLE64(customSecret + 16*i + 8, XXPH_readLE64(kSecret + 16*i + 8) - seed64);
    }
}
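
/*
 * Illustrative example: with seed64 == 1, each 16-byte round of customSecret
 * holds (kSecret word + 1) followed by (kSecret word - 1), across all
 * XXPH_SECRET_DEFAULT_SIZE / 16 rounds, so every secret byte depends on the
 * seed while the size and layout of kSecret are preserved.
 */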


/* XXPH3_hashLong_64b_withSeed() :
 * Generate a custom key based on the default kSecret altered by the seed,
 * and then use this key for long mode hashing.
 * This operation is decently fast but nonetheless costs a little bit of time.
 * Try to avoid it whenever possible (typically when seed==0).
 */
XXPH_NO_INLINE XXPH64_hash_t   /* It's important for performance that XXPH3_hashLong is not inlined. Not sure why (uop cache maybe?), but the difference is large and easily measurable */
XXPH3_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXPH64_hash_t seed)
{
    XXPH_ALIGN(8) xxh_u8 secret[XXPH_SECRET_DEFAULT_SIZE];
    if (seed==0) return XXPH3_hashLong_64b_defaultSecret(input, len);
    XXPH3_initCustomSecret(secret, seed);
    return XXPH3_hashLong_internal(input, len, secret, sizeof(secret));
}


XXPH_FORCE_INLINE xxh_u64 XXPH3_mix16B(const xxh_u8* XXPH_RESTRICT input,
                                       const xxh_u8* XXPH_RESTRICT secret, xxh_u64 seed64)
{
    xxh_u64 const input_lo = XXPH_readLE64(input);
    xxh_u64 const input_hi = XXPH_readLE64(input+8);
    return XXPH3_mul128_fold64(
               input_lo ^ (XXPH_readLE64(secret)   + seed64),
               input_hi ^ (XXPH_readLE64(secret+8) - seed64) );
}


XXPH_FORCE_INLINE XXPH64_hash_t
XXPH3_len_17to128_64b(const xxh_u8* XXPH_RESTRICT input, size_t len,
                      const xxh_u8* XXPH_RESTRICT secret, size_t secretSize,
                      XXPH64_hash_t seed)
{
    XXPH_ASSERT(secretSize >= XXPH3_SECRET_SIZE_MIN); (void)secretSize;
    XXPH_ASSERT(16 < len && len <= 128);

    {   xxh_u64 acc = len * PRIME64_1;
        if (len > 32) {
            if (len > 64) {
                if (len > 96) {
                    acc += XXPH3_mix16B(input+48, secret+96, seed);
                    acc += XXPH3_mix16B(input+len-64, secret+112, seed);
                }
                acc += XXPH3_mix16B(input+32, secret+64, seed);
                acc += XXPH3_mix16B(input+len-48, secret+80, seed);
            }
            acc += XXPH3_mix16B(input+16, secret+32, seed);
            acc += XXPH3_mix16B(input+len-32, secret+48, seed);
        }
        acc += XXPH3_mix16B(input+0, secret+0, seed);
        acc += XXPH3_mix16B(input+len-16, secret+16, seed);

        return XXPH3_avalanche(acc);
    }
}
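
/*
 * Illustrative trace: for len = 100 the mixed 16-byte window pairs are
 * ([48,64), [36,52)), ([32,48), [52,68)), ([16,32), [68,84)) and
 * ([0,16), [84,100)), each mixed against its own fixed 16-byte slice of the
 * secret, so the whole input is covered with overlap and no length-dependent
 * branching beyond the three tests above.
 */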

#define XXPH3_MIDSIZE_MAX 240

XXPH_NO_INLINE XXPH64_hash_t
XXPH3_len_129to240_64b(const xxh_u8* XXPH_RESTRICT input, size_t len,
                       const xxh_u8* XXPH_RESTRICT secret, size_t secretSize,
                       XXPH64_hash_t seed)
{
    XXPH_ASSERT(secretSize >= XXPH3_SECRET_SIZE_MIN); (void)secretSize;
    XXPH_ASSERT(128 < len && len <= XXPH3_MIDSIZE_MAX);

#define XXPH3_MIDSIZE_STARTOFFSET 3
#define XXPH3_MIDSIZE_LASTOFFSET  17

    {   xxh_u64 acc = len * PRIME64_1;
        int const nbRounds = (int)len / 16;
        int i;
        for (i=0; i<8; i++) {
            acc += XXPH3_mix16B(input+(16*i), secret+(16*i), seed);
        }
        acc = XXPH3_avalanche(acc);
        XXPH_ASSERT(nbRounds >= 8);
        for (i=8 ; i < nbRounds; i++) {
            acc += XXPH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXPH3_MIDSIZE_STARTOFFSET, seed);
        }
        /* last bytes */
        acc += XXPH3_mix16B(input + len - 16, secret + XXPH3_SECRET_SIZE_MIN - XXPH3_MIDSIZE_LASTOFFSET, seed);
        return XXPH3_avalanche(acc);
    }
}

/* === Public entry point === */

XXPH_PUBLIC_API XXPH64_hash_t XXPH3_64bits(const void* input, size_t len)
{
    if (len <= 16) return XXPH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0);
    if (len <= 128) return XXPH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
    if (len <= XXPH3_MIDSIZE_MAX) return XXPH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
    return XXPH3_hashLong_64b_defaultSecret((const xxh_u8*)input, len);
}

XXPH_PUBLIC_API XXPH64_hash_t
XXPH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
{
    XXPH_ASSERT(secretSize >= XXPH3_SECRET_SIZE_MIN);
    /* If an action must be taken when the `secret` preconditions are not met,
     * it should be done here.
     * For now, it's a contract pre-condition.
     * Adding a check and a branch here would cost performance at every hash. */
    if (len <= 16) return XXPH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
    if (len <= 128) return XXPH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
    if (len <= XXPH3_MIDSIZE_MAX) return XXPH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
    return XXPH3_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
}

XXPH_PUBLIC_API XXPH64_hash_t
XXPH3_64bits_withSeed(const void* input, size_t len, XXPH64_hash_t seed)
{
    if (len <= 16) return XXPH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed);
    if (len <= 128) return XXPH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
    if (len <= XXPH3_MIDSIZE_MAX) return XXPH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
    return XXPH3_hashLong_64b_withSeed((const xxh_u8*)input, len, seed);
}
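
/*
 * Usage sketch (illustrative only; not part of the original header). Length
 * dispatch is handled internally, so a caller just picks an entry point.
 * With XXPH_NAMESPACE defined above, the usual xxHash aliasing makes these
 * names resolve to their ROCKSDB_-prefixed symbols.
 */
#if 0
static XXPH64_hash_t hash_block(const void* buf, size_t n)
{
    return XXPH3_64bits(buf, n);                  /* seed 0, built-in kSecret */
}

static XXPH64_hash_t hash_block_seeded(const void* buf, size_t n,
                                       XXPH64_hash_t seed)
{
    return XXPH3_64bits_withSeed(buf, n, seed);   /* non-zero seed derives a custom secret for long inputs */
}
#endif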

/* === XXPH3 streaming === */

/* RocksDB Note: unused & removed due to bug in preview version */

/*======== END #include "xxh3.h", now inlined above ==========*/

#endif  /* XXPH_NO_LONG_LONG */

/* === END RocksDB modification of permanently inlining === */

#endif  /* defined(XXPH_INLINE_ALL) || defined(XXPH_PRIVATE_API) */

#endif  /* XXPH_STATIC_LINKING_ONLY */

#if defined (__cplusplus)
}
#endif

#endif  /* XXPHASH_H_5627135585666179 */