1 // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
6 xxHash - Extremely Fast Hash algorithm
7 Development source file for `xxh3`
8 Copyright (C) 2019-present, Yann Collet.
10 BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are
16 * Redistributions of source code must retain the above copyright
17 notice, this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above
19 copyright notice, this list of conditions and the following disclaimer
20 in the documentation and/or other materials provided with the
23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 You can contact the author at :
36 - xxHash source repository : https://github.com/Cyan4973/xxHash
39 /* RocksDB Note: This file contains a preview release (xxhash repository
40 version 0.7.2) of XXH3 that is unlikely to be compatible with the final
41 version of XXH3. We have therefore renamed this XXH3p ("preview"), for
42 clarity so that we can continue to use this version even after
43 integrating a newer incompatible version.
47 This file is separated for development purposes.
48 It will be integrated into `xxhash.c` when development phase is complete.
55 /* === Dependencies === */
57 #undef XXH_INLINE_ALL /* in case it's already defined */
58 #define XXH_INLINE_ALL
62 /* === Compiler specifics === */
64 #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */
65 # define XXH_RESTRICT restrict
67 /* note : it might be useful to define __restrict or __restrict__ for some C++ compilers */
68 # define XXH_RESTRICT /* disable */
72 # if defined(__AVX2__)
73 # include <immintrin.h>
74 # elif defined(__SSE2__)
75 # include <emmintrin.h>
76 # elif defined(__ARM_NEON__) || defined(__ARM_NEON)
77 # define inline __inline__ /* clang bug */
78 # include <arm_neon.h>
81 #elif defined(_MSC_VER)
88 * XXH3 only requires these features to be efficient:
90 * - Usable unaligned access
91 * - A 32-bit or 64-bit ALU
92 * - If 32-bit, a decent ADC instruction
93 * - A 32 or 64-bit multiply with a 64-bit result
95 * Almost all 32-bit and 64-bit targets meet this, except for Thumb-1, the
96 * classic 16-bit only subset of ARM's instruction set.
98 * First of all, Thumb-1 lacks support for the UMULL instruction which
99 * performs the important long multiply. This means numerous __aeabi_lmul
102 * Second of all, the 8 functional registers are just not enough.
103 * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
104 * Lo registers, and this shuffling results in thousands more MOVs than A32.
106 * A32 and T32 don't have this limitation. They can access all 14 registers,
107 * do a 32->64 multiply with UMULL, and the flexible operand is helpful too.
109 * If compiling Thumb-1 for a target which supports ARM instructions, we
110 * will give a warning.
112 * Usually, if this happens, it is because of an accident and you probably
113 * need to specify -march, as you probably meant to compile for a newer
116 #if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
117 # warning "XXH3 is highly inefficient without ARM or Thumb-2."
120 /* ==========================================
121 * Vectorization detection
122 * ========================================== */
129 #ifndef XXH_VECTOR /* can be defined on command line */
130 # if defined(__AVX2__)
131 # define XXH_VECTOR XXH_AVX2
132 # elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
133 # define XXH_VECTOR XXH_SSE2
134 # elif defined(__GNUC__) /* msvc support maybe later */ \
135 && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
136 && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
137 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
138 # define XXH_VECTOR XXH_NEON
139 # elif defined(__PPC64__) && defined(__POWER8_VECTOR__) && defined(__GNUC__)
140 # define XXH_VECTOR XXH_VSX
142 # define XXH_VECTOR XXH_SCALAR
146 /* control alignment of accumulator,
147 * for compatibility with fast vector loads */
148 #ifndef XXH_ACC_ALIGN
149 # if XXH_VECTOR == 0 /* scalar */
150 # define XXH_ACC_ALIGN 8
151 # elif XXH_VECTOR == 1 /* sse2 */
152 # define XXH_ACC_ALIGN 16
153 # elif XXH_VECTOR == 2 /* avx2 */
154 # define XXH_ACC_ALIGN 32
155 # elif XXH_VECTOR == 3 /* neon */
156 # define XXH_ACC_ALIGN 16
157 # elif XXH_VECTOR == 4 /* vsx */
158 # define XXH_ACC_ALIGN 16
162 /* xxh_u64 XXH_mult32to64(xxh_u32 a, xxh_u64 b) { return (xxh_u64)a * (xxh_u64)b; } */
163 #if defined(_MSC_VER) && defined(_M_IX86)
165 # define XXH_mult32to64(x, y) __emulu(x, y)
167 # define XXH_mult32to64(x, y) ((xxh_u64)((x) & 0xFFFFFFFF) * (xxh_u64)((y) & 0xFFFFFFFF))
170 /* VSX stuff. It's a lot because VSX support is mediocre across compilers and
171 * there is a lot of mischief with endianness. */
172 #if XXH_VECTOR == XXH_VSX
173 # include <altivec.h>
175 typedef __vector
unsigned long long U64x2
;
176 typedef __vector
unsigned char U8x16
;
177 typedef __vector
unsigned U32x4
;
180 # if defined(__BIG_ENDIAN__) \
181 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
182 # define XXH_VSX_BE 1
183 # elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
184 # warning "-maltivec=be is not recommended. Please use native endianness."
185 # define XXH_VSX_BE 1
187 # define XXH_VSX_BE 0
191 /* We need some helpers for big endian mode. */
193 /* A wrapper for POWER9's vec_revb. */
194 # ifdef __POWER9_VECTOR__
195 # define XXH_vec_revb vec_revb
197 XXH_FORCE_INLINE U64x2
XXH_vec_revb(U64x2 val
)
199 U8x16
const vByteSwap
= { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
200 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
201 return vec_perm(val
, val
, vByteSwap
);
205 /* Power8 Crypto gives us vpermxor which is very handy for
208 * U8x16 vpermxor(U8x16 a, U8x16 b, U8x16 mask)
211 * for (int i = 0; i < 16; i++) {
212 * ret[i] = a[mask[i] & 0xF] ^ b[mask[i] >> 4];
217 * Because both of the main loops load the key, swap, and xor it with input,
218 * we can combine the key swap into this instruction.
221 # define XXH_vec_permxor vec_permxor
223 # define XXH_vec_permxor __builtin_crypto_vpermxor
225 #endif /* XXH_VSX_BE */
227 * Because we reinterpret the multiply, there are endian memes: vec_mulo actually becomes
230 * Additionally, the intrinsic wasn't added until GCC 8, despite existing for a while.
231 * Clang has an easy way to control this, we can just use the builtin which doesn't swap.
232 * GCC needs inline assembly. */
233 #if __has_builtin(__builtin_altivec_vmuleuw)
234 # define XXH_vec_mulo __builtin_altivec_vmulouw
235 # define XXH_vec_mule __builtin_altivec_vmuleuw
237 /* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
238 XXH_FORCE_INLINE U64x2
XXH_vec_mulo(U32x4 a
, U32x4 b
) {
240 __asm__("vmulouw %0, %1, %2" : "=v" (result
) : "v" (a
), "v" (b
));
243 XXH_FORCE_INLINE U64x2
XXH_vec_mule(U32x4 a
, U32x4 b
) {
245 __asm__("vmuleuw %0, %1, %2" : "=v" (result
) : "v" (a
), "v" (b
));
248 #endif /* __has_builtin(__builtin_altivec_vmuleuw) */
249 #endif /* XXH_VECTOR == XXH_VSX */
252 * can be disabled, by declaring XXH_NO_PREFETCH build macro */
253 #if defined(XXH_NO_PREFETCH)
254 # define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
256 #if defined(_MSC_VER) && \
257 (defined(_M_X64) || \
258 defined(_M_IX86)) /* _mm_prefetch() is not defined outside of x86/x64 */
259 # include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
260 # define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
261 # elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
262 # define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
264 # define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
266 #endif /* XXH_NO_PREFETCH */
269 /* ==========================================
270 * XXH3 default settings
271 * ========================================== */
273 #define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3p_SECRET_SIZE_MIN */
275 #if (XXH_SECRET_DEFAULT_SIZE < XXH3p_SECRET_SIZE_MIN)
276 # error "default keyset is not large enough"
279 XXH_ALIGN(64) static const xxh_u8 kSecret
[XXH_SECRET_DEFAULT_SIZE
] = {
280 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
281 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
282 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
283 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
284 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
285 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
286 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
287 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
289 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
290 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
291 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
292 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
296 * GCC for x86 has a tendency to use SSE in this loop. While it
297 * successfully avoids swapping (as MUL overwrites EAX and EDX), it
298 * slows it down because instead of free register swap shifts, it
299 * must use pshufd and punpckl/hd.
301 * To prevent this, we use this attribute to shut off SSE.
303 #if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
304 __attribute__((__target__("no-sse")))
307 XXH_mult64to128(xxh_u64 lhs
, xxh_u64 rhs
)
310 * GCC/Clang __uint128_t method.
312 * On most 64-bit targets, GCC and Clang define a __uint128_t type.
313 * This is usually the best way as it usually uses a native long 64-bit
314 * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
318 * Despite being a 32-bit platform, Clang (and emscripten) define this
319 * type despite not having the arithmetic for it. This results in a
320 * laggy compiler builtin call which calculates a full 128-bit multiply.
321 * In that case it is best to use the portable one.
322 * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
324 #if defined(__GNUC__) && !defined(__wasm__) \
325 && defined(__SIZEOF_INT128__) \
326 || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
328 __uint128_t product
= (__uint128_t
)lhs
* (__uint128_t
)rhs
;
329 XXH128_hash_t
const r128
= { (xxh_u64
)(product
), (xxh_u64
)(product
>> 64) };
333 * MSVC for x64's _umul128 method.
335 * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
337 * This compiles to single operand MUL on x64.
339 #elif defined(_M_X64) || defined(_M_IA64)
342 # pragma intrinsic(_umul128)
344 xxh_u64 product_high
;
345 xxh_u64
const product_low
= _umul128(lhs
, rhs
, &product_high
);
346 XXH128_hash_t
const r128
= { product_low
, product_high
};
351 * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
353 * This is a fast and simple grade school multiply, which is shown
354 * below with base 10 arithmetic instead of base 0x100000000.
357 * x 7 5 // D2 rhs = 75
359 * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10)
360 * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10)
361 * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10)
362 * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10)
364 * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21
365 * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63
369 * The reasons for adding the products like this are:
370 * 1. It avoids manual carry tracking. Just like how
371 * (9 * 9) + 9 + 9 = 99, the same applies with this for
372 * UINT64_MAX. This avoids a lot of complexity.
374 * 2. It hints for, and on Clang, compiles to, the powerful UMAAL
375 * instruction available in ARMv6+ A32/T32, which is shown below:
377 * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
379 * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
380 * *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
381 * *RdHi = (xxh_u32)(product >> 32);
384 * This instruction was designed for efficient long multiplication,
385 * and allows this to be calculated in only 4 instructions which
386 * is comparable to some 64-bit ALUs.
388 * 3. It isn't terrible on other platforms. Usually this will be
389 * a couple of 32-bit ADD/ADCs.
392 /* First calculate all of the cross products. */
393 xxh_u64
const lo_lo
= XXH_mult32to64(lhs
& 0xFFFFFFFF, rhs
& 0xFFFFFFFF);
394 xxh_u64
const hi_lo
= XXH_mult32to64(lhs
>> 32, rhs
& 0xFFFFFFFF);
395 xxh_u64
const lo_hi
= XXH_mult32to64(lhs
& 0xFFFFFFFF, rhs
>> 32);
396 xxh_u64
const hi_hi
= XXH_mult32to64(lhs
>> 32, rhs
>> 32);
398 /* Now add the products together. These will never overflow. */
399 xxh_u64
const cross
= (lo_lo
>> 32) + (hi_lo
& 0xFFFFFFFF) + lo_hi
;
400 xxh_u64
const upper
= (hi_lo
>> 32) + (cross
>> 32) + hi_hi
;
401 xxh_u64
const lower
= (cross
<< 32) | (lo_lo
& 0xFFFFFFFF);
403 XXH128_hash_t r128
= { lower
, upper
};
409 * We want to keep the attribute here because a target switch
412 * Does a 64-bit to 128-bit multiply, then XOR folds it.
413 * The reason for the separate function is to prevent passing
414 * too many structs around by value. This will hopefully inline
415 * the multiply, but we don't force it.
417 #if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
418 __attribute__((__target__("no-sse")))
421 XXH3p_mul128_fold64(xxh_u64 lhs
, xxh_u64 rhs
)
423 XXH128_hash_t product
= XXH_mult64to128(lhs
, rhs
);
424 return product
.low64
^ product
.high64
;
428 static XXH64_hash_t
XXH3p_avalanche(xxh_u64 h64
)
437 /* ==========================================
439 * ========================================== */
441 XXH_FORCE_INLINE XXH64_hash_t
442 XXH3p_len_1to3_64b(const xxh_u8
* input
, size_t len
, const xxh_u8
* secret
, XXH64_hash_t seed
)
444 XXH_ASSERT(input
!= NULL
);
445 XXH_ASSERT(1 <= len
&& len
<= 3);
446 XXH_ASSERT(secret
!= NULL
);
447 { xxh_u8
const c1
= input
[0];
448 xxh_u8
const c2
= input
[len
>> 1];
449 xxh_u8
const c3
= input
[len
- 1];
450 xxh_u32
const combined
= ((xxh_u32
)c1
) | (((xxh_u32
)c2
) << 8) | (((xxh_u32
)c3
) << 16) | (((xxh_u32
)len
) << 24);
451 xxh_u64
const keyed
= (xxh_u64
)combined
^ (XXH_readLE32(secret
) + seed
);
452 xxh_u64
const mixed
= keyed
* PRIME64_1
;
453 return XXH3p_avalanche(mixed
);
457 XXH_FORCE_INLINE XXH64_hash_t
458 XXH3p_len_4to8_64b(const xxh_u8
* input
, size_t len
, const xxh_u8
* secret
, XXH64_hash_t seed
)
460 XXH_ASSERT(input
!= NULL
);
461 XXH_ASSERT(secret
!= NULL
);
462 XXH_ASSERT(4 <= len
&& len
<= 8);
463 { xxh_u32
const input_lo
= XXH_readLE32(input
);
464 xxh_u32
const input_hi
= XXH_readLE32(input
+ len
- 4);
465 xxh_u64
const input_64
= input_lo
| ((xxh_u64
)input_hi
<< 32);
466 xxh_u64
const keyed
= input_64
^ (XXH_readLE64(secret
) + seed
);
467 xxh_u64
const mix64
= len
+ ((keyed
^ (keyed
>> 51)) * PRIME32_1
);
468 return XXH3p_avalanche((mix64
^ (mix64
>> 47)) * PRIME64_2
);
472 XXH_FORCE_INLINE XXH64_hash_t
473 XXH3p_len_9to16_64b(const xxh_u8
* input
, size_t len
, const xxh_u8
* secret
, XXH64_hash_t seed
)
475 XXH_ASSERT(input
!= NULL
);
476 XXH_ASSERT(secret
!= NULL
);
477 XXH_ASSERT(9 <= len
&& len
<= 16);
478 { xxh_u64
const input_lo
= XXH_readLE64(input
) ^ (XXH_readLE64(secret
) + seed
);
479 xxh_u64
const input_hi
= XXH_readLE64(input
+ len
- 8) ^ (XXH_readLE64(secret
+ 8) - seed
);
480 xxh_u64
const acc
= len
+ (input_lo
+ input_hi
) + XXH3p_mul128_fold64(input_lo
, input_hi
);
481 return XXH3p_avalanche(acc
);
485 XXH_FORCE_INLINE XXH64_hash_t
486 XXH3p_len_0to16_64b(const xxh_u8
* input
, size_t len
, const xxh_u8
* secret
, XXH64_hash_t seed
)
488 XXH_ASSERT(len
<= 16);
489 { if (len
> 8) return XXH3p_len_9to16_64b(input
, len
, secret
, seed
);
490 if (len
>= 4) return XXH3p_len_4to8_64b(input
, len
, secret
, seed
);
491 if (len
) return XXH3p_len_1to3_64b(input
, len
, secret
, seed
);
493 * RocksDB modification from XXH3 preview: zero result for empty
494 * string can be problematic for multiplication-based algorithms.
495 * Return a hash of the seed instead.
497 return XXH3p_mul128_fold64(seed
+ XXH_readLE64(secret
), PRIME64_2
);
502 /* === Long Keys === */
504 #define STRIPE_LEN 64
505 #define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */
506 #define ACC_NB (STRIPE_LEN / sizeof(xxh_u64))
/* Selects whether the accumulator loop produces state for a 64-bit result
 * or for the full 128-bit result (which also mixes swapped input lanes). */
typedef enum { XXH3p_acc_64bits, XXH3p_acc_128bits } XXH3p_accWidth_e;
510 XXH_FORCE_INLINE
void
511 XXH3p_accumulate_512( void* XXH_RESTRICT acc
,
512 const void* XXH_RESTRICT input
,
513 const void* XXH_RESTRICT secret
,
514 XXH3p_accWidth_e accWidth
)
516 #if (XXH_VECTOR == XXH_AVX2)
518 XXH_ASSERT((((size_t)acc
) & 31) == 0);
519 { XXH_ALIGN(32) __m256i
* const xacc
= (__m256i
*) acc
;
520 const __m256i
* const xinput
= (const __m256i
*) input
; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */
521 const __m256i
* const xsecret
= (const __m256i
*) secret
; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */
524 for (i
=0; i
< STRIPE_LEN
/sizeof(__m256i
); i
++) {
525 __m256i
const data_vec
= _mm256_loadu_si256 (xinput
+i
);
526 __m256i
const key_vec
= _mm256_loadu_si256 (xsecret
+i
);
527 __m256i
const data_key
= _mm256_xor_si256 (data_vec
, key_vec
); /* uint32 dk[8] = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */
528 __m256i
const product
= _mm256_mul_epu32 (data_key
, _mm256_shuffle_epi32 (data_key
, 0x31)); /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */
529 if (accWidth
== XXH3p_acc_128bits
) {
530 __m256i
const data_swap
= _mm256_shuffle_epi32(data_vec
, _MM_SHUFFLE(1,0,3,2));
531 __m256i
const sum
= _mm256_add_epi64(xacc
[i
], data_swap
);
532 xacc
[i
] = _mm256_add_epi64(product
, sum
);
533 } else { /* XXH3p_acc_64bits */
534 __m256i
const sum
= _mm256_add_epi64(xacc
[i
], data_vec
);
535 xacc
[i
] = _mm256_add_epi64(product
, sum
);
539 #elif (XXH_VECTOR == XXH_SSE2)
541 XXH_ASSERT((((size_t)acc
) & 15) == 0);
542 { XXH_ALIGN(16) __m128i
* const xacc
= (__m128i
*) acc
;
543 const __m128i
* const xinput
= (const __m128i
*) input
; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */
544 const __m128i
* const xsecret
= (const __m128i
*) secret
; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */
547 for (i
=0; i
< STRIPE_LEN
/sizeof(__m128i
); i
++) {
548 __m128i
const data_vec
= _mm_loadu_si128 (xinput
+i
);
549 __m128i
const key_vec
= _mm_loadu_si128 (xsecret
+i
);
550 __m128i
const data_key
= _mm_xor_si128 (data_vec
, key_vec
); /* uint32 dk[8] = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */
551 __m128i
const product
= _mm_mul_epu32 (data_key
, _mm_shuffle_epi32 (data_key
, 0x31)); /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */
552 if (accWidth
== XXH3p_acc_128bits
) {
553 __m128i
const data_swap
= _mm_shuffle_epi32(data_vec
, _MM_SHUFFLE(1,0,3,2));
554 __m128i
const sum
= _mm_add_epi64(xacc
[i
], data_swap
);
555 xacc
[i
] = _mm_add_epi64(product
, sum
);
556 } else { /* XXH3p_acc_64bits */
557 __m128i
const sum
= _mm_add_epi64(xacc
[i
], data_vec
);
558 xacc
[i
] = _mm_add_epi64(product
, sum
);
562 #elif (XXH_VECTOR == XXH_NEON)
564 XXH_ASSERT((((size_t)acc
) & 15) == 0);
566 XXH_ALIGN(16) uint64x2_t
* const xacc
= (uint64x2_t
*) acc
;
567 /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
568 uint8_t const* const xinput
= (const uint8_t *) input
;
569 uint8_t const* const xsecret
= (const uint8_t *) secret
;
572 for (i
=0; i
< STRIPE_LEN
/ sizeof(uint64x2_t
); i
++) {
573 #if !defined(__aarch64__) && !defined(__arm64__) && defined(__GNUC__) /* ARM32-specific hack */
574 /* vzip on ARMv7 Clang generates a lot of vmovs (technically vorrs) without this.
575 * vzip on 32-bit ARM NEON will overwrite the original register, and I think that Clang
576 * assumes I don't want to destroy it and tries to make a copy. This slows down the code
578 * aarch64 not only uses an entirely different syntax, but it requires three
580 * ext v1.16B, v0.16B, #8 // select high bits because aarch64 can't address them directly
581 * zip1 v3.2s, v0.2s, v1.2s // first zip
582 * zip2 v2.2s, v0.2s, v1.2s // second zip
583 * ...to do what ARM does in one:
584 * vzip.32 d0, d1 // Interleave high and low bits and overwrite. */
586 /* data_vec = xsecret[i]; */
587 uint8x16_t
const data_vec
= vld1q_u8(xinput
+ (i
* 16));
588 /* key_vec = xsecret[i]; */
589 uint8x16_t
const key_vec
= vld1q_u8(xsecret
+ (i
* 16));
590 /* data_key = data_vec ^ key_vec; */
593 if (accWidth
== XXH3p_acc_64bits
) {
594 /* Add first to prevent register swaps */
595 /* xacc[i] += data_vec; */
596 xacc
[i
] = vaddq_u64 (xacc
[i
], vreinterpretq_u64_u8(data_vec
));
597 } else { /* XXH3p_acc_128bits */
598 /* xacc[i] += swap(data_vec); */
599 /* can probably be optimized better */
600 uint64x2_t
const data64
= vreinterpretq_u64_u8(data_vec
);
601 uint64x2_t
const swapped
= vextq_u64(data64
, data64
, 1);
602 xacc
[i
] = vaddq_u64 (xacc
[i
], swapped
);
605 data_key
= vreinterpretq_u32_u8(veorq_u8(data_vec
, key_vec
));
607 /* Here's the magic. We use the quirkiness of vzip to shuffle data_key in place.
608 * shuffle: data_key[0, 1, 2, 3] = data_key[0, 2, 1, 3] */
609 __asm__("vzip.32 %e0, %f0" : "+w" (data_key
));
610 /* xacc[i] += (uint64x2_t) data_key[0, 1] * (uint64x2_t) data_key[2, 3]; */
611 xacc
[i
] = vmlal_u32(xacc
[i
], vget_low_u32(data_key
), vget_high_u32(data_key
));
614 /* On aarch64, vshrn/vmovn seems to be equivalent to, if not faster than, the vzip method. */
616 /* data_vec = xsecret[i]; */
617 uint8x16_t
const data_vec
= vld1q_u8(xinput
+ (i
* 16));
618 /* key_vec = xsecret[i]; */
619 uint8x16_t
const key_vec
= vld1q_u8(xsecret
+ (i
* 16));
620 /* data_key = data_vec ^ key_vec; */
621 uint64x2_t
const data_key
= vreinterpretq_u64_u8(veorq_u8(data_vec
, key_vec
));
622 /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); */
623 uint32x2_t
const data_key_lo
= vmovn_u64 (data_key
);
624 /* data_key_hi = (uint32x2_t) (data_key >> 32); */
625 uint32x2_t
const data_key_hi
= vshrn_n_u64 (data_key
, 32);
626 if (accWidth
== XXH3p_acc_64bits
) {
627 /* xacc[i] += data_vec; */
628 xacc
[i
] = vaddq_u64 (xacc
[i
], vreinterpretq_u64_u8(data_vec
));
629 } else { /* XXH3p_acc_128bits */
630 /* xacc[i] += swap(data_vec); */
631 uint64x2_t
const data64
= vreinterpretq_u64_u8(data_vec
);
632 uint64x2_t
const swapped
= vextq_u64(data64
, data64
, 1);
633 xacc
[i
] = vaddq_u64 (xacc
[i
], swapped
);
635 /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
636 xacc
[i
] = vmlal_u32 (xacc
[i
], data_key_lo
, data_key_hi
);
642 #elif (XXH_VECTOR == XXH_VSX) && /* work around a compiler bug */ (__GNUC__ > 5)
643 U64x2
* const xacc
= (U64x2
*) acc
; /* presumed aligned */
644 U64x2
const* const xinput
= (U64x2
const*) input
; /* no alignment restriction */
645 U64x2
const* const xsecret
= (U64x2
const*) secret
; /* no alignment restriction */
646 U64x2
const v32
= { 32, 32 };
648 U8x16
const vXorSwap
= { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
649 0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
652 for (i
= 0; i
< STRIPE_LEN
/ sizeof(U64x2
); i
++) {
653 /* data_vec = xinput[i]; */
654 /* key_vec = xsecret[i]; */
657 U64x2
const data_vec
= XXH_vec_revb(vec_vsx_ld(0, xinput
+ i
));
658 U64x2
const key_raw
= vec_vsx_ld(0, xsecret
+ i
);
659 /* See comment above. data_key = data_vec ^ swap(xsecret[i]); */
660 U64x2
const data_key
= (U64x2
)XXH_vec_permxor((U8x16
)data_vec
, (U8x16
)key_raw
, vXorSwap
);
662 U64x2
const data_vec
= vec_vsx_ld(0, xinput
+ i
);
663 U64x2
const key_vec
= vec_vsx_ld(0, xsecret
+ i
);
664 U64x2
const data_key
= data_vec
^ key_vec
;
666 /* shuffled = (data_key << 32) | (data_key >> 32); */
667 U32x4
const shuffled
= (U32x4
)vec_rl(data_key
, v32
);
668 /* product = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)shuffled & 0xFFFFFFFF); */
669 U64x2
const product
= XXH_vec_mulo((U32x4
)data_key
, shuffled
);
672 if (accWidth
== XXH3p_acc_64bits
) {
674 } else { /* XXH3p_acc_128bits */
675 /* swap high and low halves */
676 U64x2
const data_swapped
= vec_xxpermdi(data_vec
, data_vec
, 2);
677 xacc
[i
] += data_swapped
;
681 #else /* scalar variant of Accumulator - universal */
683 XXH_ALIGN(XXH_ACC_ALIGN
) xxh_u64
* const xacc
= (xxh_u64
*) acc
; /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */
684 const xxh_u8
* const xinput
= (const xxh_u8
*) input
; /* no alignment restriction */
685 const xxh_u8
* const xsecret
= (const xxh_u8
*) secret
; /* no alignment restriction */
687 XXH_ASSERT(((size_t)acc
& (XXH_ACC_ALIGN
-1)) == 0);
688 for (i
=0; i
< ACC_NB
; i
++) {
689 xxh_u64
const data_val
= XXH_readLE64(xinput
+ 8*i
);
690 xxh_u64
const data_key
= data_val
^ XXH_readLE64(xsecret
+ i
*8);
692 if (accWidth
== XXH3p_acc_64bits
) {
695 xacc
[i
^ 1] += data_val
; /* swap adjacent lanes */
697 xacc
[i
] += XXH_mult32to64(data_key
& 0xFFFFFFFF, data_key
>> 32);
702 XXH_FORCE_INLINE
void
703 XXH3p_scrambleAcc(void* XXH_RESTRICT acc
, const void* XXH_RESTRICT secret
)
705 #if (XXH_VECTOR == XXH_AVX2)
707 XXH_ASSERT((((size_t)acc
) & 31) == 0);
708 { XXH_ALIGN(32) __m256i
* const xacc
= (__m256i
*) acc
;
709 const __m256i
* const xsecret
= (const __m256i
*) secret
; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this argument type */
710 const __m256i prime32
= _mm256_set1_epi32((int)PRIME32_1
);
713 for (i
=0; i
< STRIPE_LEN
/sizeof(__m256i
); i
++) {
714 /* xacc[i] ^= (xacc[i] >> 47) */
715 __m256i
const acc_vec
= xacc
[i
];
716 __m256i
const shifted
= _mm256_srli_epi64 (acc_vec
, 47);
717 __m256i
const data_vec
= _mm256_xor_si256 (acc_vec
, shifted
);
718 /* xacc[i] ^= xsecret; */
719 __m256i
const key_vec
= _mm256_loadu_si256 (xsecret
+i
);
720 __m256i
const data_key
= _mm256_xor_si256 (data_vec
, key_vec
);
722 /* xacc[i] *= PRIME32_1; */
723 __m256i
const data_key_hi
= _mm256_shuffle_epi32 (data_key
, 0x31);
724 __m256i
const prod_lo
= _mm256_mul_epu32 (data_key
, prime32
);
725 __m256i
const prod_hi
= _mm256_mul_epu32 (data_key_hi
, prime32
);
726 xacc
[i
] = _mm256_add_epi64(prod_lo
, _mm256_slli_epi64(prod_hi
, 32));
730 #elif (XXH_VECTOR == XXH_SSE2)
732 XXH_ASSERT((((size_t)acc
) & 15) == 0);
733 { XXH_ALIGN(16) __m128i
* const xacc
= (__m128i
*) acc
;
734 const __m128i
* const xsecret
= (const __m128i
*) secret
; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this argument type */
735 const __m128i prime32
= _mm_set1_epi32((int)PRIME32_1
);
738 for (i
=0; i
< STRIPE_LEN
/sizeof(__m128i
); i
++) {
739 /* xacc[i] ^= (xacc[i] >> 47) */
740 __m128i
const acc_vec
= xacc
[i
];
741 __m128i
const shifted
= _mm_srli_epi64 (acc_vec
, 47);
742 __m128i
const data_vec
= _mm_xor_si128 (acc_vec
, shifted
);
743 /* xacc[i] ^= xsecret; */
744 __m128i
const key_vec
= _mm_loadu_si128 (xsecret
+i
);
745 __m128i
const data_key
= _mm_xor_si128 (data_vec
, key_vec
);
747 /* xacc[i] *= PRIME32_1; */
748 __m128i
const data_key_hi
= _mm_shuffle_epi32 (data_key
, 0x31);
749 __m128i
const prod_lo
= _mm_mul_epu32 (data_key
, prime32
);
750 __m128i
const prod_hi
= _mm_mul_epu32 (data_key_hi
, prime32
);
751 xacc
[i
] = _mm_add_epi64(prod_lo
, _mm_slli_epi64(prod_hi
, 32));
755 #elif (XXH_VECTOR == XXH_NEON)
757 XXH_ASSERT((((size_t)acc
) & 15) == 0);
759 { uint64x2_t
* const xacc
= (uint64x2_t
*) acc
;
760 uint8_t const* const xsecret
= (uint8_t const*) secret
;
761 uint32x2_t
const prime
= vdup_n_u32 (PRIME32_1
);
764 for (i
=0; i
< STRIPE_LEN
/sizeof(uint64x2_t
); i
++) {
765 /* data_vec = xacc[i] ^ (xacc[i] >> 47); */
766 uint64x2_t
const acc_vec
= xacc
[i
];
767 uint64x2_t
const shifted
= vshrq_n_u64 (acc_vec
, 47);
768 uint64x2_t
const data_vec
= veorq_u64 (acc_vec
, shifted
);
770 /* key_vec = xsecret[i]; */
771 uint32x4_t
const key_vec
= vreinterpretq_u32_u8(vld1q_u8(xsecret
+ (i
* 16)));
772 /* data_key = data_vec ^ key_vec; */
773 uint32x4_t
const data_key
= veorq_u32 (vreinterpretq_u32_u64(data_vec
), key_vec
);
774 /* shuffled = { data_key[0, 2], data_key[1, 3] }; */
775 uint32x2x2_t
const shuffled
= vzip_u32 (vget_low_u32(data_key
), vget_high_u32(data_key
));
777 /* data_key *= PRIME32_1 */
779 /* prod_hi = (data_key >> 32) * PRIME32_1; */
780 uint64x2_t
const prod_hi
= vmull_u32 (shuffled
.val
[1], prime
);
781 /* xacc[i] = prod_hi << 32; */
782 xacc
[i
] = vshlq_n_u64(prod_hi
, 32);
783 /* xacc[i] += (prod_hi & 0xFFFFFFFF) * PRIME32_1; */
784 xacc
[i
] = vmlal_u32(xacc
[i
], shuffled
.val
[0], prime
);
787 #elif (XXH_VECTOR == XXH_VSX) && /* work around a compiler bug */ (__GNUC__ > 5)
789 U64x2
* const xacc
= (U64x2
*) acc
;
790 const U64x2
* const xsecret
= (const U64x2
*) secret
;
792 U64x2
const v32
= { 32, 32 };
793 U64x2
const v47
= { 47, 47 };
794 U32x4
const prime
= { PRIME32_1
, PRIME32_1
, PRIME32_1
, PRIME32_1
};
798 U8x16
const vXorSwap
= { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
799 0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
801 for (i
= 0; i
< STRIPE_LEN
/ sizeof(U64x2
); i
++) {
802 U64x2
const acc_vec
= xacc
[i
];
803 U64x2
const data_vec
= acc_vec
^ (acc_vec
>> v47
);
804 /* key_vec = xsecret[i]; */
806 /* swap bytes words */
807 U64x2
const key_raw
= vec_vsx_ld(0, xsecret
+ i
);
808 U64x2
const data_key
= (U64x2
)XXH_vec_permxor((U8x16
)data_vec
, (U8x16
)key_raw
, vXorSwap
);
810 U64x2
const key_vec
= vec_vsx_ld(0, xsecret
+ i
);
811 U64x2
const data_key
= data_vec
^ key_vec
;
814 /* data_key *= PRIME32_1 */
816 /* prod_lo = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)prime & 0xFFFFFFFF); */
817 U64x2
const prod_even
= XXH_vec_mule((U32x4
)data_key
, prime
);
818 /* prod_hi = ((U64x2)data_key >> 32) * ((U64x2)prime >> 32); */
819 U64x2
const prod_odd
= XXH_vec_mulo((U32x4
)data_key
, prime
);
820 xacc
[i
] = prod_odd
+ (prod_even
<< v32
);
823 #else /* scalar variant of Scrambler - universal */
825 XXH_ALIGN(XXH_ACC_ALIGN
) xxh_u64
* const xacc
= (xxh_u64
*) acc
; /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */
826 const xxh_u8
* const xsecret
= (const xxh_u8
*) secret
; /* no alignment restriction */
828 XXH_ASSERT((((size_t)acc
) & (XXH_ACC_ALIGN
-1)) == 0);
829 for (i
=0; i
< ACC_NB
; i
++) {
830 xxh_u64
const key64
= XXH_readLE64(xsecret
+ 8*i
);
831 xxh_u64 acc64
= xacc
[i
];
832 acc64
^= acc64
>> 47;
841 #define XXH_PREFETCH_DIST 384
843 /* assumption : nbStripes will not overflow secret size */
844 XXH_FORCE_INLINE
void
845 XXH3p_accumulate( xxh_u64
* XXH_RESTRICT acc
,
846 const xxh_u8
* XXH_RESTRICT input
,
847 const xxh_u8
* XXH_RESTRICT secret
,
849 XXH3p_accWidth_e accWidth
)
852 for (n
= 0; n
< nbStripes
; n
++ ) {
853 const xxh_u8
* const in
= input
+ n
*STRIPE_LEN
;
854 XXH_PREFETCH(in
+ XXH_PREFETCH_DIST
);
855 XXH3p_accumulate_512(acc
,
857 secret
+ n
*XXH_SECRET_CONSUME_RATE
,
862 /* note : clang auto-vectorizes well in SS2 mode _if_ this function is `static`,
863 * and doesn't auto-vectorize it at all if it is `FORCE_INLINE`.
864 * However, it auto-vectorizes better AVX2 if it is `FORCE_INLINE`
865 * Pretty much every other modes and compilers prefer `FORCE_INLINE`.
868 #if defined(__clang__) && (XXH_VECTOR==0) && !defined(__AVX2__) && !defined(__arm__) && !defined(__thumb__)
871 XXH_FORCE_INLINE
void
873 XXH3p_hashLong_internal_loop( xxh_u64
* XXH_RESTRICT acc
,
874 const xxh_u8
* XXH_RESTRICT input
, size_t len
,
875 const xxh_u8
* XXH_RESTRICT secret
, size_t secretSize
,
876 XXH3p_accWidth_e accWidth
)
878 size_t const nb_rounds
= (secretSize
- STRIPE_LEN
) / XXH_SECRET_CONSUME_RATE
;
879 size_t const block_len
= STRIPE_LEN
* nb_rounds
;
880 size_t const nb_blocks
= len
/ block_len
;
884 XXH_ASSERT(secretSize
>= XXH3p_SECRET_SIZE_MIN
);
886 for (n
= 0; n
< nb_blocks
; n
++) {
887 XXH3p_accumulate(acc
, input
+ n
*block_len
, secret
, nb_rounds
, accWidth
);
888 XXH3p_scrambleAcc(acc
, secret
+ secretSize
- STRIPE_LEN
);
891 /* last partial block */
892 XXH_ASSERT(len
> STRIPE_LEN
);
893 { size_t const nbStripes
= (len
- (block_len
* nb_blocks
)) / STRIPE_LEN
;
894 XXH_ASSERT(nbStripes
<= (secretSize
/ XXH_SECRET_CONSUME_RATE
));
895 XXH3p_accumulate(acc
, input
+ nb_blocks
*block_len
, secret
, nbStripes
, accWidth
);
898 if (len
& (STRIPE_LEN
- 1)) {
899 const xxh_u8
* const p
= input
+ len
- STRIPE_LEN
;
900 #define XXH_SECRET_LASTACC_START 7 /* do not align on 8, so that secret is different from scrambler */
901 XXH3p_accumulate_512(acc
, p
, secret
+ secretSize
- STRIPE_LEN
- XXH_SECRET_LASTACC_START
, accWidth
);
905 XXH_FORCE_INLINE xxh_u64
906 XXH3p_mix2Accs(const xxh_u64
* XXH_RESTRICT acc
, const xxh_u8
* XXH_RESTRICT secret
)
908 return XXH3p_mul128_fold64(
909 acc
[0] ^ XXH_readLE64(secret
),
910 acc
[1] ^ XXH_readLE64(secret
+8) );
914 XXH3p_mergeAccs(const xxh_u64
* XXH_RESTRICT acc
, const xxh_u8
* XXH_RESTRICT secret
, xxh_u64 start
)
916 xxh_u64 result64
= start
;
918 result64
+= XXH3p_mix2Accs(acc
+0, secret
+ 0);
919 result64
+= XXH3p_mix2Accs(acc
+2, secret
+ 16);
920 result64
+= XXH3p_mix2Accs(acc
+4, secret
+ 32);
921 result64
+= XXH3p_mix2Accs(acc
+6, secret
+ 48);
923 return XXH3p_avalanche(result64
);
926 #define XXH3p_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \
927 PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 };
929 XXH_FORCE_INLINE XXH64_hash_t
930 XXH3p_hashLong_internal(const xxh_u8
* XXH_RESTRICT input
, size_t len
,
931 const xxh_u8
* XXH_RESTRICT secret
, size_t secretSize
)
933 XXH_ALIGN(XXH_ACC_ALIGN
) xxh_u64 acc
[ACC_NB
] = XXH3p_INIT_ACC
;
935 XXH3p_hashLong_internal_loop(acc
, input
, len
, secret
, secretSize
, XXH3p_acc_64bits
);
937 /* converge into final hash */
938 XXH_STATIC_ASSERT(sizeof(acc
) == 64);
939 #define XXH_SECRET_MERGEACCS_START 11 /* do not align on 8, so that secret is different from accumulator */
940 XXH_ASSERT(secretSize
>= sizeof(acc
) + XXH_SECRET_MERGEACCS_START
);
941 return XXH3p_mergeAccs(acc
, secret
+ XXH_SECRET_MERGEACCS_START
, (xxh_u64
)len
* PRIME64_1
);
945 XXH_NO_INLINE XXH64_hash_t
/* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
946 XXH3p_hashLong_64b_defaultSecret(const xxh_u8
* XXH_RESTRICT input
, size_t len
)
948 return XXH3p_hashLong_internal(input
, len
, kSecret
, sizeof(kSecret
));
951 XXH_NO_INLINE XXH64_hash_t
/* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
952 XXH3p_hashLong_64b_withSecret(const xxh_u8
* XXH_RESTRICT input
, size_t len
,
953 const xxh_u8
* XXH_RESTRICT secret
, size_t secretSize
)
955 return XXH3p_hashLong_internal(input
, len
, secret
, secretSize
);
959 XXH_FORCE_INLINE
void XXH_writeLE64(void* dst
, xxh_u64 v64
)
961 if (!XXH_CPU_LITTLE_ENDIAN
) v64
= XXH_swap64(v64
);
962 memcpy(dst
, &v64
, sizeof(v64
));
965 /* XXH3p_initCustomSecret() :
966 * destination `customSecret` is presumed allocated and same size as `kSecret`.
968 XXH_FORCE_INLINE
void XXH3p_initCustomSecret(xxh_u8
* customSecret
, xxh_u64 seed64
)
970 int const nbRounds
= XXH_SECRET_DEFAULT_SIZE
/ 16;
973 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE
& 15) == 0);
975 for (i
=0; i
< nbRounds
; i
++) {
976 XXH_writeLE64(customSecret
+ 16*i
, XXH_readLE64(kSecret
+ 16*i
) + seed64
);
977 XXH_writeLE64(customSecret
+ 16*i
+ 8, XXH_readLE64(kSecret
+ 16*i
+ 8) - seed64
);
982 /* XXH3p_hashLong_64b_withSeed() :
983 * Generate a custom key,
984 * based on alteration of default kSecret with the seed,
985 * and then use this key for long mode hashing.
986 * This operation is decently fast but nonetheless costs a little bit of time.
987 * Try to avoid it whenever possible (typically when seed==0).
989 XXH_NO_INLINE XXH64_hash_t
/* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
990 XXH3p_hashLong_64b_withSeed(const xxh_u8
* input
, size_t len
, XXH64_hash_t seed
)
992 XXH_ALIGN(8) xxh_u8 secret
[XXH_SECRET_DEFAULT_SIZE
];
993 if (seed
==0) return XXH3p_hashLong_64b_defaultSecret(input
, len
);
994 XXH3p_initCustomSecret(secret
, seed
);
995 return XXH3p_hashLong_internal(input
, len
, secret
, sizeof(secret
));
999 XXH_FORCE_INLINE xxh_u64
XXH3p_mix16B(const xxh_u8
* XXH_RESTRICT input
,
1000 const xxh_u8
* XXH_RESTRICT secret
, xxh_u64 seed64
)
1002 xxh_u64
const input_lo
= XXH_readLE64(input
);
1003 xxh_u64
const input_hi
= XXH_readLE64(input
+8);
1004 return XXH3p_mul128_fold64(
1005 input_lo
^ (XXH_readLE64(secret
) + seed64
),
1006 input_hi
^ (XXH_readLE64(secret
+8) - seed64
) );
1010 XXH_FORCE_INLINE XXH64_hash_t
1011 XXH3p_len_17to128_64b(const xxh_u8
* XXH_RESTRICT input
, size_t len
,
1012 const xxh_u8
* XXH_RESTRICT secret
, size_t secretSize
,
1015 XXH_ASSERT(secretSize
>= XXH3p_SECRET_SIZE_MIN
); (void)secretSize
;
1016 XXH_ASSERT(16 < len
&& len
<= 128);
1018 { xxh_u64 acc
= len
* PRIME64_1
;
1022 acc
+= XXH3p_mix16B(input
+48, secret
+96, seed
);
1023 acc
+= XXH3p_mix16B(input
+len
-64, secret
+112, seed
);
1025 acc
+= XXH3p_mix16B(input
+32, secret
+64, seed
);
1026 acc
+= XXH3p_mix16B(input
+len
-48, secret
+80, seed
);
1028 acc
+= XXH3p_mix16B(input
+16, secret
+32, seed
);
1029 acc
+= XXH3p_mix16B(input
+len
-32, secret
+48, seed
);
1031 acc
+= XXH3p_mix16B(input
+0, secret
+0, seed
);
1032 acc
+= XXH3p_mix16B(input
+len
-16, secret
+16, seed
);
1034 return XXH3p_avalanche(acc
);
1038 #define XXH3p_MIDSIZE_MAX 240
1040 XXH_NO_INLINE XXH64_hash_t
1041 XXH3p_len_129to240_64b(const xxh_u8
* XXH_RESTRICT input
, size_t len
,
1042 const xxh_u8
* XXH_RESTRICT secret
, size_t secretSize
,
1045 XXH_ASSERT(secretSize
>= XXH3p_SECRET_SIZE_MIN
); (void)secretSize
;
1046 XXH_ASSERT(128 < len
&& len
<= XXH3p_MIDSIZE_MAX
);
1048 #define XXH3p_MIDSIZE_STARTOFFSET 3
1049 #define XXH3p_MIDSIZE_LASTOFFSET 17
1051 { xxh_u64 acc
= len
* PRIME64_1
;
1052 int const nbRounds
= (int)len
/ 16;
1054 for (i
=0; i
<8; i
++) {
1055 acc
+= XXH3p_mix16B(input
+(16*i
), secret
+(16*i
), seed
);
1057 acc
= XXH3p_avalanche(acc
);
1058 XXH_ASSERT(nbRounds
>= 8);
1059 for (i
=8 ; i
< nbRounds
; i
++) {
1060 acc
+= XXH3p_mix16B(input
+(16*i
), secret
+(16*(i
-8)) + XXH3p_MIDSIZE_STARTOFFSET
, seed
);
1063 acc
+= XXH3p_mix16B(input
+ len
- 16, secret
+ XXH3p_SECRET_SIZE_MIN
- XXH3p_MIDSIZE_LASTOFFSET
, seed
);
1064 return XXH3p_avalanche(acc
);
1068 /* === Public entry point === */
1070 XXH_PUBLIC_API XXH64_hash_t
XXH3p_64bits(const void* input
, size_t len
)
1072 if (len
<= 16) return XXH3p_len_0to16_64b((const xxh_u8
*)input
, len
, kSecret
, 0);
1073 if (len
<= 128) return XXH3p_len_17to128_64b((const xxh_u8
*)input
, len
, kSecret
, sizeof(kSecret
), 0);
1074 if (len
<= XXH3p_MIDSIZE_MAX
) return XXH3p_len_129to240_64b((const xxh_u8
*)input
, len
, kSecret
, sizeof(kSecret
), 0);
1075 return XXH3p_hashLong_64b_defaultSecret((const xxh_u8
*)input
, len
);
1078 XXH_PUBLIC_API XXH64_hash_t
1079 XXH3p_64bits_withSecret(const void* input
, size_t len
, const void* secret
, size_t secretSize
)
1081 XXH_ASSERT(secretSize
>= XXH3p_SECRET_SIZE_MIN
);
1082 /* if an action must be taken should `secret` conditions not be respected,
1083 * it should be done here.
1084 * For now, it's a contract pre-condition.
1085 * Adding a check and a branch here would cost performance at every hash */
1086 if (len
<= 16) return XXH3p_len_0to16_64b((const xxh_u8
*)input
, len
, (const xxh_u8
*)secret
, 0);
1087 if (len
<= 128) return XXH3p_len_17to128_64b((const xxh_u8
*)input
, len
, (const xxh_u8
*)secret
, secretSize
, 0);
1088 if (len
<= XXH3p_MIDSIZE_MAX
) return XXH3p_len_129to240_64b((const xxh_u8
*)input
, len
, (const xxh_u8
*)secret
, secretSize
, 0);
1089 return XXH3p_hashLong_64b_withSecret((const xxh_u8
*)input
, len
, (const xxh_u8
*)secret
, secretSize
);
1092 XXH_PUBLIC_API XXH64_hash_t
1093 XXH3p_64bits_withSeed(const void* input
, size_t len
, XXH64_hash_t seed
)
1095 if (len
<= 16) return XXH3p_len_0to16_64b((const xxh_u8
*)input
, len
, kSecret
, seed
);
1096 if (len
<= 128) return XXH3p_len_17to128_64b((const xxh_u8
*)input
, len
, kSecret
, sizeof(kSecret
), seed
);
1097 if (len
<= XXH3p_MIDSIZE_MAX
) return XXH3p_len_129to240_64b((const xxh_u8
*)input
, len
, kSecret
, sizeof(kSecret
), seed
);
1098 return XXH3p_hashLong_64b_withSeed((const xxh_u8
*)input
, len
, seed
);
1101 /* === XXH3 streaming === */
1103 /* RocksDB Note: unused & removed due to bug in preview version */
1105 /* ==========================================
1106 * XXH3 128 bits (=> XXH128)
1107 * ========================================== */
1109 XXH_FORCE_INLINE XXH128_hash_t
1110 XXH3p_len_1to3_128b(const xxh_u8
* input
, size_t len
, const xxh_u8
* secret
, XXH64_hash_t seed
)
1112 XXH_ASSERT(input
!= NULL
);
1113 XXH_ASSERT(1 <= len
&& len
<= 3);
1114 XXH_ASSERT(secret
!= NULL
);
1115 { xxh_u8
const c1
= input
[0];
1116 xxh_u8
const c2
= input
[len
>> 1];
1117 xxh_u8
const c3
= input
[len
- 1];
1118 xxh_u32
const combinedl
= ((xxh_u32
)c1
) + (((xxh_u32
)c2
) << 8) + (((xxh_u32
)c3
) << 16) + (((xxh_u32
)len
) << 24);
1119 xxh_u32
const combinedh
= XXH_swap32(combinedl
);
1120 xxh_u64
const keyed_lo
= (xxh_u64
)combinedl
^ (XXH_readLE32(secret
) + seed
);
1121 xxh_u64
const keyed_hi
= (xxh_u64
)combinedh
^ (XXH_readLE32(secret
+4) - seed
);
1122 xxh_u64
const mixedl
= keyed_lo
* PRIME64_1
;
1123 xxh_u64
const mixedh
= keyed_hi
* PRIME64_5
;
1124 XXH128_hash_t
const h128
= { XXH3p_avalanche(mixedl
) /*low64*/, XXH3p_avalanche(mixedh
) /*high64*/ };
1130 XXH_FORCE_INLINE XXH128_hash_t
1131 XXH3p_len_4to8_128b(const xxh_u8
* input
, size_t len
, const xxh_u8
* secret
, XXH64_hash_t seed
)
1133 XXH_ASSERT(input
!= NULL
);
1134 XXH_ASSERT(secret
!= NULL
);
1135 XXH_ASSERT(4 <= len
&& len
<= 8);
1136 { xxh_u32
const input_lo
= XXH_readLE32(input
);
1137 xxh_u32
const input_hi
= XXH_readLE32(input
+ len
- 4);
1138 xxh_u64
const input_64_lo
= input_lo
+ ((xxh_u64
)input_hi
<< 32);
1139 xxh_u64
const input_64_hi
= XXH_swap64(input_64_lo
);
1140 xxh_u64
const keyed_lo
= input_64_lo
^ (XXH_readLE64(secret
) + seed
);
1141 xxh_u64
const keyed_hi
= input_64_hi
^ (XXH_readLE64(secret
+ 8) - seed
);
1142 xxh_u64
const mix64l1
= len
+ ((keyed_lo
^ (keyed_lo
>> 51)) * PRIME32_1
);
1143 xxh_u64
const mix64l2
= (mix64l1
^ (mix64l1
>> 47)) * PRIME64_2
;
1144 xxh_u64
const mix64h1
= ((keyed_hi
^ (keyed_hi
>> 47)) * PRIME64_1
) - len
;
1145 xxh_u64
const mix64h2
= (mix64h1
^ (mix64h1
>> 43)) * PRIME64_4
;
1146 { XXH128_hash_t
const h128
= { XXH3p_avalanche(mix64l2
) /*low64*/, XXH3p_avalanche(mix64h2
) /*high64*/ };
1151 XXH_FORCE_INLINE XXH128_hash_t
1152 XXH3p_len_9to16_128b(const xxh_u8
* input
, size_t len
, const xxh_u8
* secret
, XXH64_hash_t seed
)
1154 XXH_ASSERT(input
!= NULL
);
1155 XXH_ASSERT(secret
!= NULL
);
1156 XXH_ASSERT(9 <= len
&& len
<= 16);
1157 { xxh_u64
const input_lo
= XXH_readLE64(input
) ^ (XXH_readLE64(secret
) + seed
);
1158 xxh_u64
const input_hi
= XXH_readLE64(input
+ len
- 8) ^ (XXH_readLE64(secret
+8) - seed
);
1159 XXH128_hash_t m128
= XXH_mult64to128(input_lo
^ input_hi
, PRIME64_1
);
1160 xxh_u64
const lenContrib
= XXH_mult32to64(len
, PRIME32_5
);
1161 m128
.low64
+= lenContrib
;
1162 m128
.high64
+= input_hi
* PRIME64_1
;
1163 m128
.low64
^= (m128
.high64
>> 32);
1164 { XXH128_hash_t h128
= XXH_mult64to128(m128
.low64
, PRIME64_2
);
1165 h128
.high64
+= m128
.high64
* PRIME64_2
;
1166 h128
.low64
= XXH3p_avalanche(h128
.low64
);
1167 h128
.high64
= XXH3p_avalanche(h128
.high64
);
1172 /* Assumption : `secret` size is >= 16
1173 * Note : it should be >= XXH3p_SECRET_SIZE_MIN anyway */
1174 XXH_FORCE_INLINE XXH128_hash_t
1175 XXH3p_len_0to16_128b(const xxh_u8
* input
, size_t len
, const xxh_u8
* secret
, XXH64_hash_t seed
)
1177 XXH_ASSERT(len
<= 16);
1178 { if (len
> 8) return XXH3p_len_9to16_128b(input
, len
, secret
, seed
);
1179 if (len
>= 4) return XXH3p_len_4to8_128b(input
, len
, secret
, seed
);
1180 if (len
) return XXH3p_len_1to3_128b(input
, len
, secret
, seed
);
1181 { XXH128_hash_t
const h128
= { 0, 0 };
1186 XXH_FORCE_INLINE XXH128_hash_t
1187 XXH3p_hashLong_128b_internal(const xxh_u8
* XXH_RESTRICT input
, size_t len
,
1188 const xxh_u8
* XXH_RESTRICT secret
, size_t secretSize
)
1190 XXH_ALIGN(XXH_ACC_ALIGN
) xxh_u64 acc
[ACC_NB
] = XXH3p_INIT_ACC
;
1192 XXH3p_hashLong_internal_loop(acc
, input
, len
, secret
, secretSize
, XXH3p_acc_128bits
);
1194 /* converge into final hash */
1195 XXH_STATIC_ASSERT(sizeof(acc
) == 64);
1196 XXH_ASSERT(secretSize
>= sizeof(acc
) + XXH_SECRET_MERGEACCS_START
);
1197 { xxh_u64
const low64
= XXH3p_mergeAccs(acc
, secret
+ XXH_SECRET_MERGEACCS_START
, (xxh_u64
)len
* PRIME64_1
);
1198 xxh_u64
const high64
= XXH3p_mergeAccs(acc
, secret
+ secretSize
- sizeof(acc
) - XXH_SECRET_MERGEACCS_START
, ~((xxh_u64
)len
* PRIME64_2
));
1199 XXH128_hash_t
const h128
= { low64
, high64
};
1204 XXH_NO_INLINE XXH128_hash_t
/* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
1205 XXH3p_hashLong_128b_defaultSecret(const xxh_u8
* input
, size_t len
)
1207 return XXH3p_hashLong_128b_internal(input
, len
, kSecret
, sizeof(kSecret
));
1210 XXH_NO_INLINE XXH128_hash_t
/* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
1211 XXH3p_hashLong_128b_withSecret(const xxh_u8
* input
, size_t len
,
1212 const xxh_u8
* secret
, size_t secretSize
)
1214 return XXH3p_hashLong_128b_internal(input
, len
, secret
, secretSize
);
1217 XXH_NO_INLINE XXH128_hash_t
/* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
1218 XXH3p_hashLong_128b_withSeed(const xxh_u8
* input
, size_t len
, XXH64_hash_t seed
)
1220 XXH_ALIGN(8) xxh_u8 secret
[XXH_SECRET_DEFAULT_SIZE
];
1221 if (seed
== 0) return XXH3p_hashLong_128b_defaultSecret(input
, len
);
1222 XXH3p_initCustomSecret(secret
, seed
);
1223 return XXH3p_hashLong_128b_internal(input
, len
, secret
, sizeof(secret
));
1227 XXH_FORCE_INLINE XXH128_hash_t
1228 XXH128_mix32B(XXH128_hash_t acc
, const xxh_u8
* input_1
, const xxh_u8
* input_2
, const xxh_u8
* secret
, XXH64_hash_t seed
)
1230 acc
.low64
+= XXH3p_mix16B (input_1
, secret
+0, seed
);
1231 acc
.low64
^= XXH_readLE64(input_2
) + XXH_readLE64(input_2
+ 8);
1232 acc
.high64
+= XXH3p_mix16B (input_2
, secret
+16, seed
);
1233 acc
.high64
^= XXH_readLE64(input_1
) + XXH_readLE64(input_1
+ 8);
1237 XXH_NO_INLINE XXH128_hash_t
1238 XXH3p_len_129to240_128b(const xxh_u8
* XXH_RESTRICT input
, size_t len
,
1239 const xxh_u8
* XXH_RESTRICT secret
, size_t secretSize
,
1242 XXH_ASSERT(secretSize
>= XXH3p_SECRET_SIZE_MIN
); (void)secretSize
;
1243 XXH_ASSERT(128 < len
&& len
<= XXH3p_MIDSIZE_MAX
);
1245 { XXH128_hash_t acc
;
1246 int const nbRounds
= (int)len
/ 32;
1248 acc
.low64
= len
* PRIME64_1
;
1250 for (i
=0; i
<4; i
++) {
1251 acc
= XXH128_mix32B(acc
, input
+(32*i
), input
+(32*i
)+16, secret
+(32*i
), seed
);
1253 acc
.low64
= XXH3p_avalanche(acc
.low64
);
1254 acc
.high64
= XXH3p_avalanche(acc
.high64
);
1255 XXH_ASSERT(nbRounds
>= 4);
1256 for (i
=4 ; i
< nbRounds
; i
++) {
1257 acc
= XXH128_mix32B(acc
, input
+(32*i
), input
+(32*i
)+16, secret
+XXH3p_MIDSIZE_STARTOFFSET
+(32*(i
-4)), seed
);
1260 acc
= XXH128_mix32B(acc
, input
+ len
- 16, input
+ len
- 32, secret
+ XXH3p_SECRET_SIZE_MIN
- XXH3p_MIDSIZE_LASTOFFSET
- 16, 0ULL - seed
);
1262 { xxh_u64
const low64
= acc
.low64
+ acc
.high64
;
1263 xxh_u64
const high64
= (acc
.low64
* PRIME64_1
) + (acc
.high64
* PRIME64_4
) + ((len
- seed
) * PRIME64_2
);
1264 XXH128_hash_t
const h128
= { XXH3p_avalanche(low64
), (XXH64_hash_t
)0 - XXH3p_avalanche(high64
) };
1271 XXH_FORCE_INLINE XXH128_hash_t
1272 XXH3p_len_17to128_128b(const xxh_u8
* XXH_RESTRICT input
, size_t len
,
1273 const xxh_u8
* XXH_RESTRICT secret
, size_t secretSize
,
1276 XXH_ASSERT(secretSize
>= XXH3p_SECRET_SIZE_MIN
); (void)secretSize
;
1277 XXH_ASSERT(16 < len
&& len
<= 128);
1279 { XXH128_hash_t acc
;
1280 acc
.low64
= len
* PRIME64_1
;
1285 acc
= XXH128_mix32B(acc
, input
+48, input
+len
-64, secret
+96, seed
);
1287 acc
= XXH128_mix32B(acc
, input
+32, input
+len
-48, secret
+64, seed
);
1289 acc
= XXH128_mix32B(acc
, input
+16, input
+len
-32, secret
+32, seed
);
1291 acc
= XXH128_mix32B(acc
, input
, input
+len
-16, secret
, seed
);
1292 { xxh_u64
const low64
= acc
.low64
+ acc
.high64
;
1293 xxh_u64
const high64
= (acc
.low64
* PRIME64_1
) + (acc
.high64
* PRIME64_4
) + ((len
- seed
) * PRIME64_2
);
1294 XXH128_hash_t
const h128
= { XXH3p_avalanche(low64
), (XXH64_hash_t
)0 - XXH3p_avalanche(high64
) };
1300 XXH_PUBLIC_API XXH128_hash_t
XXH3p_128bits(const void* input
, size_t len
)
1302 if (len
<= 16) return XXH3p_len_0to16_128b((const xxh_u8
*)input
, len
, kSecret
, 0);
1303 if (len
<= 128) return XXH3p_len_17to128_128b((const xxh_u8
*)input
, len
, kSecret
, sizeof(kSecret
), 0);
1304 if (len
<= XXH3p_MIDSIZE_MAX
) return XXH3p_len_129to240_128b((const xxh_u8
*)input
, len
, kSecret
, sizeof(kSecret
), 0);
1305 return XXH3p_hashLong_128b_defaultSecret((const xxh_u8
*)input
, len
);
1308 XXH_PUBLIC_API XXH128_hash_t
1309 XXH3p_128bits_withSecret(const void* input
, size_t len
, const void* secret
, size_t secretSize
)
1311 XXH_ASSERT(secretSize
>= XXH3p_SECRET_SIZE_MIN
);
1312 /* if an action must be taken should `secret` conditions not be respected,
1313 * it should be done here.
1314 * For now, it's a contract pre-condition.
1315 * Adding a check and a branch here would cost performance at every hash */
1316 if (len
<= 16) return XXH3p_len_0to16_128b((const xxh_u8
*)input
, len
, (const xxh_u8
*)secret
, 0);
1317 if (len
<= 128) return XXH3p_len_17to128_128b((const xxh_u8
*)input
, len
, (const xxh_u8
*)secret
, secretSize
, 0);
1318 if (len
<= XXH3p_MIDSIZE_MAX
) return XXH3p_len_129to240_128b((const xxh_u8
*)input
, len
, (const xxh_u8
*)secret
, secretSize
, 0);
1319 return XXH3p_hashLong_128b_withSecret((const xxh_u8
*)input
, len
, (const xxh_u8
*)secret
, secretSize
);
1322 XXH_PUBLIC_API XXH128_hash_t
1323 XXH3p_128bits_withSeed(const void* input
, size_t len
, XXH64_hash_t seed
)
1325 if (len
<= 16) return XXH3p_len_0to16_128b((const xxh_u8
*)input
, len
, kSecret
, seed
);
1326 if (len
<= 128) return XXH3p_len_17to128_128b((const xxh_u8
*)input
, len
, kSecret
, sizeof(kSecret
), seed
);
1327 if (len
<= XXH3p_MIDSIZE_MAX
) return XXH3p_len_129to240_128b((const xxh_u8
*)input
, len
, kSecret
, sizeof(kSecret
), seed
);
1328 return XXH3p_hashLong_128b_withSeed((const xxh_u8
*)input
, len
, seed
);
1331 XXH_PUBLIC_API XXH128_hash_t
1332 XXH128(const void* input
, size_t len
, XXH64_hash_t seed
)
1334 return XXH3p_128bits_withSeed(input
, len
, seed
);
1338 /* === XXH3 128-bit streaming === */
1340 /* RocksDB Note: unused & removed due to bug in preview version */
1342 /* 128-bit utility functions */
1344 #include <string.h> /* memcmp */
1346 /* return : 1 is equal, 0 if different */
1347 XXH_PUBLIC_API
int XXH128_isEqual(XXH128_hash_t h1
, XXH128_hash_t h2
)
1349 /* note : XXH128_hash_t is compact, it has no padding byte */
1350 return !(memcmp(&h1
, &h2
, sizeof(h1
)));
1353 /* This prototype is compatible with stdlib's qsort().
1354 * return : >0 if *h128_1 > *h128_2
1355 * <0 if *h128_1 < *h128_2
1356 * =0 if *h128_1 == *h128_2 */
1357 XXH_PUBLIC_API
int XXH128_cmp(const void* h128_1
, const void* h128_2
)
1359 XXH128_hash_t
const h1
= *(const XXH128_hash_t
*)h128_1
;
1360 XXH128_hash_t
const h2
= *(const XXH128_hash_t
*)h128_2
;
1361 int const hcmp
= (h1
.high64
> h2
.high64
) - (h2
.high64
> h1
.high64
);
1362 /* note : bets that, in most cases, hash values are different */
1363 if (hcmp
) return hcmp
;
1364 return (h1
.low64
> h2
.low64
) - (h2
.low64
> h1
.low64
);
1368 /*====== Canonical representation ======*/
1370 XXH128_canonicalFromHash(XXH128_canonical_t
* dst
, XXH128_hash_t hash
)
1372 XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t
) == sizeof(XXH128_hash_t
));
1373 if (XXH_CPU_LITTLE_ENDIAN
) {
1374 hash
.high64
= XXH_swap64(hash
.high64
);
1375 hash
.low64
= XXH_swap64(hash
.low64
);
1377 memcpy(dst
, &hash
.high64
, sizeof(hash
.high64
));
1378 memcpy((char*)dst
+ sizeof(hash
.high64
), &hash
.low64
, sizeof(hash
.low64
));
1381 XXH_PUBLIC_API XXH128_hash_t
1382 XXH128_hashFromCanonical(const XXH128_canonical_t
* src
)
1385 h
.high64
= XXH_readBE64(src
);
1386 h
.low64
= XXH_readBE64(src
->digest
+ 8);
1392 #endif /* XXH3p_H */