1 // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 /*
6 xxHash - Extremely Fast Hash algorithm
7 Development source file for `xxh3`
8 Copyright (C) 2019-present, Yann Collet.
9
10 BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
11
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are
14 met:
15
16 * Redistributions of source code must retain the above copyright
17 notice, this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above
19 copyright notice, this list of conditions and the following disclaimer
20 in the documentation and/or other materials provided with the
21 distribution.
22
23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34
35 You can contact the author at :
36 - xxHash source repository : https://github.com/Cyan4973/xxHash
37 */
38
39 /* RocksDB Note: This file contains a preview release (xxhash repository
40 version 0.7.2) of XXH3 that is unlikely to be compatible with the final
41    version of XXH3. We have therefore renamed it XXH3p ("preview") for
42    clarity, so that we can continue to use this version even after
43    integrating a newer, incompatible version.
44 */
45
46 /* Note :
47 This file is separated for development purposes.
48    It will be integrated into `xxhash.c` when the development phase is complete.
49 */
50
51 #ifndef XXH3p_H
52 #define XXH3p_H
53
54
55 /* === Dependencies === */
56
57 #undef XXH_INLINE_ALL /* in case it's already defined */
58 #define XXH_INLINE_ALL
59 #include "xxhash.h"
60
61
62 /* === Compiler specifics === */
63
64 #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */
65 # define XXH_RESTRICT restrict
66 #else
67 /* note : it might be useful to define __restrict or __restrict__ for some C++ compilers */
68 # define XXH_RESTRICT /* disable */
69 #endif
70
71 #if defined(__GNUC__)
72 # if defined(__AVX2__)
73 # include <immintrin.h>
74 # elif defined(__SSE2__)
75 # include <emmintrin.h>
76 # elif defined(__ARM_NEON__) || defined(__ARM_NEON)
77 # define inline __inline__ /* clang bug */
78 # include <arm_neon.h>
79 # undef inline
80 # endif
81 #elif defined(_MSC_VER)
82 # include <intrin.h>
83 #endif
84
85 /*
86 * Sanity check.
87 *
88 * XXH3 only requires these features to be efficient:
89 *
90 * - Usable unaligned access
91 * - A 32-bit or 64-bit ALU
92 * - If 32-bit, a decent ADC instruction
93 * - A 32 or 64-bit multiply with a 64-bit result
94 *
95 * Almost all 32-bit and 64-bit targets meet this, except for Thumb-1, the
96 * classic 16-bit only subset of ARM's instruction set.
97 *
98 * First of all, Thumb-1 lacks support for the UMULL instruction which
99 * performs the important long multiply. This means numerous __aeabi_lmul
100 * calls.
101 *
102 * Second of all, the 8 functional registers are just not enough.
103 * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
104 * Lo registers, and this shuffling results in thousands more MOVs than A32.
105 *
106 * A32 and T32 don't have this limitation. They can access all 14 registers,
107 * do a 32->64 multiply with UMULL, and the flexible operand is helpful too.
108 *
109 * If compiling Thumb-1 for a target which supports ARM instructions, we
110 * will give a warning.
111 *
112 * Usually, if this happens, it is because of an accident and you probably
113  * need to specify -march, as you probably meant to compile for a newer
114 * architecture.
115 */
116 #if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
117 # warning "XXH3 is highly inefficient without ARM or Thumb-2."
118 #endif
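/*
 * Illustration (editorial sketch; XXH_example_mull is a hypothetical name,
 * not part of xxHash): the "long multiply" discussed above is a plain
 * 32x32->64 product,
 *
 *     xxh_u64 XXH_example_mull(xxh_u32 a, xxh_u32 b)
 *     {
 *         return (xxh_u64)a * (xxh_u64)b;
 *     }
 *
 * which is a single UMULL on A32/T32 but a runtime __aeabi_lmul call on
 * Thumb-1.
 */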
119
120 /* ==========================================
121 * Vectorization detection
122 * ========================================== */
123 #define XXH_SCALAR 0
124 #define XXH_SSE2 1
125 #define XXH_AVX2 2
126 #define XXH_NEON 3
127 #define XXH_VSX 4
128
129 #ifndef XXH_VECTOR /* can be defined on command line */
130 # if defined(__AVX2__)
131 # define XXH_VECTOR XXH_AVX2
132 # elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
133 # define XXH_VECTOR XXH_SSE2
134 # elif defined(__GNUC__) /* msvc support maybe later */ \
135 && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
136 && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
137 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
138 # define XXH_VECTOR XXH_NEON
139 # elif defined(__PPC64__) && defined(__POWER8_VECTOR__) && defined(__GNUC__)
140 # define XXH_VECTOR XXH_VSX
141 # else
142 # define XXH_VECTOR XXH_SCALAR
143 # endif
144 #endif
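/*
 * Note (illustrative): XXH_VECTOR may also be forced from the build command
 * line when the detection above is not wanted, for instance to benchmark the
 * scalar code path on an SSE2-capable machine:
 *
 *     cc -O3 -DXXH_VECTOR=0 ...    (0 == XXH_SCALAR, per the values above)
 */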
145
146 /* control alignment of accumulator,
147 * for compatibility with fast vector loads */
148 #ifndef XXH_ACC_ALIGN
149 # if XXH_VECTOR == 0 /* scalar */
150 # define XXH_ACC_ALIGN 8
151 # elif XXH_VECTOR == 1 /* sse2 */
152 # define XXH_ACC_ALIGN 16
153 # elif XXH_VECTOR == 2 /* avx2 */
154 # define XXH_ACC_ALIGN 32
155 # elif XXH_VECTOR == 3 /* neon */
156 # define XXH_ACC_ALIGN 16
157 # elif XXH_VECTOR == 4 /* vsx */
158 # define XXH_ACC_ALIGN 16
159 # endif
160 #endif
161
162 /* xxh_u64 XXH_mult32to64(xxh_u32 a, xxh_u64 b) { return (xxh_u64)a * (xxh_u64)b; } */
163 #if defined(_MSC_VER) && defined(_M_IX86)
164 # include <intrin.h>
165 # define XXH_mult32to64(x, y) __emulu(x, y)
166 #else
167 # define XXH_mult32to64(x, y) ((xxh_u64)((x) & 0xFFFFFFFF) * (xxh_u64)((y) & 0xFFFFFFFF))
168 #endif
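/*
 * Illustration: both definitions above compute the full 64-bit product of two
 * 32-bit operands (the masks simply discard any upper bits of wider inputs),
 * for example:
 *
 *     XXH_mult32to64(0xFFFFFFFF, 0xFFFFFFFF) == 0xFFFFFFFE00000001ULL
 */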
169
170 /* VSX stuff. It's a lot because VSX support is mediocre across compilers and
171 * there is a lot of mischief with endianness. */
172 #if XXH_VECTOR == XXH_VSX
173 # include <altivec.h>
174 # undef vector
175 typedef __vector unsigned long long U64x2;
176 typedef __vector unsigned char U8x16;
177 typedef __vector unsigned U32x4;
178
179 #ifndef XXH_VSX_BE
180 # if defined(__BIG_ENDIAN__) \
181 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
182 # define XXH_VSX_BE 1
183 # elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
184 # warning "-maltivec=be is not recommended. Please use native endianness."
185 # define XXH_VSX_BE 1
186 # else
187 # define XXH_VSX_BE 0
188 # endif
189 #endif
190
191 /* We need some helpers for big endian mode. */
192 #if XXH_VSX_BE
193 /* A wrapper for POWER9's vec_revb. */
194 # ifdef __POWER9_VECTOR__
195 # define XXH_vec_revb vec_revb
196 # else
197 XXH_FORCE_INLINE U64x2 XXH_vec_revb(U64x2 val)
198 {
199 U8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
200 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
201 return vec_perm(val, val, vByteSwap);
202 }
203 # endif
204
205 /* Power8 Crypto gives us vpermxor which is very handy for
206 * PPC64EB.
207 *
208 * U8x16 vpermxor(U8x16 a, U8x16 b, U8x16 mask)
209 * {
210 * U8x16 ret;
211 * for (int i = 0; i < 16; i++) {
212 * ret[i] = a[mask[i] & 0xF] ^ b[mask[i] >> 4];
213 * }
214 * return ret;
215 * }
216 *
217 * Because both of the main loops load the key, swap, and xor it with input,
218 * we can combine the key swap into this instruction.
219 */
220 # ifdef vec_permxor
221 # define XXH_vec_permxor vec_permxor
222 # else
223 # define XXH_vec_permxor __builtin_crypto_vpermxor
224 # endif
225 #endif /* XXH_VSX_BE */
226 /*
227 * Because we reinterpret the multiply, there are endian memes: vec_mulo actually becomes
228 * vec_mule.
229 *
230 * Additionally, the intrinsic wasn't added until GCC 8, despite existing for a while.
231  * Clang has an easy way to control this: we can just use the builtin, which doesn't swap.
232 * GCC needs inline assembly. */
233 #if __has_builtin(__builtin_altivec_vmuleuw)
234 # define XXH_vec_mulo __builtin_altivec_vmulouw
235 # define XXH_vec_mule __builtin_altivec_vmuleuw
236 #else
237 /* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
238 XXH_FORCE_INLINE U64x2 XXH_vec_mulo(U32x4 a, U32x4 b) {
239 U64x2 result;
240 __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
241 return result;
242 }
243 XXH_FORCE_INLINE U64x2 XXH_vec_mule(U32x4 a, U32x4 b) {
244 U64x2 result;
245 __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
246 return result;
247 }
248 #endif /* __has_builtin(__builtin_altivec_vmuleuw) */
249 #endif /* XXH_VECTOR == XXH_VSX */
250
251 /* prefetch
252  * can be disabled by defining the XXH_NO_PREFETCH build macro */
253 #if defined(XXH_NO_PREFETCH)
254 # define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
255 #else
256 #if defined(_MSC_VER) && \
257 (defined(_M_X64) || \
258 defined(_M_IX86)) /* _mm_prefetch() is not defined outside of x86/x64 */
259 # include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
260 # define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
261 # elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
262 # define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
263 # else
264 # define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
265 # endif
266 #endif /* XXH_NO_PREFETCH */
267
268
269 /* ==========================================
270 * XXH3 default settings
271 * ========================================== */
272
273 #define XXH_SECRET_DEFAULT_SIZE 192   /* must be >= XXH3p_SECRET_SIZE_MIN */
274
275 #if (XXH_SECRET_DEFAULT_SIZE < XXH3p_SECRET_SIZE_MIN)
276 # error "default keyset is not large enough"
277 #endif
278
279 XXH_ALIGN(64) static const xxh_u8 kSecret[XXH_SECRET_DEFAULT_SIZE] = {
280 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
281 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
282 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
283 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
284 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
285 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
286 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
287 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
288
289 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
290 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
291 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
292 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
293 };
294
295 /*
296 * GCC for x86 has a tendency to use SSE in this loop. While it
297 * successfully avoids swapping (as MUL overwrites EAX and EDX), it
298 * slows it down because instead of free register swap shifts, it
299 * must use pshufd and punpckl/hd.
300 *
301 * To prevent this, we use this attribute to shut off SSE.
302 */
303 #if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
304 __attribute__((__target__("no-sse")))
305 #endif
306 static XXH128_hash_t
307 XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
308 {
309 /*
310 * GCC/Clang __uint128_t method.
311 *
312 * On most 64-bit targets, GCC and Clang define a __uint128_t type.
313  * This is usually the best way, as it typically compiles to a native long
314  * 64-bit multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
315 *
316 * Usually.
317 *
318  * However, on some 32-bit platforms, Clang (and emscripten) define this
319  * type despite not having the native arithmetic for it. This results in a
320  * slow compiler builtin call which calculates a full 128-bit multiply.
321  * In that case it is best to use the portable one.
322 * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
323 */
324 #if defined(__GNUC__) && !defined(__wasm__) \
325 && defined(__SIZEOF_INT128__) \
326 || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
327
328 __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs;
329 XXH128_hash_t const r128 = { (xxh_u64)(product), (xxh_u64)(product >> 64) };
330 return r128;
331
332 /*
333 * MSVC for x64's _umul128 method.
334 *
335 * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
336 *
337 * This compiles to single operand MUL on x64.
338 */
339 #elif defined(_M_X64) || defined(_M_IA64)
340
341 #ifndef _MSC_VER
342 # pragma intrinsic(_umul128)
343 #endif
344 xxh_u64 product_high;
345 xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
346 XXH128_hash_t const r128 = { product_low, product_high };
347 return r128;
348
349 #else
350 /*
351 * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
352 *
353 * This is a fast and simple grade school multiply, which is shown
354 * below with base 10 arithmetic instead of base 0x100000000.
355 *
356 * 9 3 // D2 lhs = 93
357 * x 7 5 // D2 rhs = 75
358 * ----------
359 * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10)
360 * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10)
361 * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10)
362 * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10)
363 * ---------
364 * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21
365 * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63
366 * ---------
367 * 6 9 7 5
368 *
369 * The reasons for adding the products like this are:
370 * 1. It avoids manual carry tracking. Just like how
371 * (9 * 9) + 9 + 9 = 99, the same applies with this for
372 * UINT64_MAX. This avoids a lot of complexity.
373 *
374 * 2. It hints for, and on Clang, compiles to, the powerful UMAAL
375 * instruction available in ARMv6+ A32/T32, which is shown below:
376 *
377 * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
378 * {
379 * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
380 * *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
381 * *RdHi = (xxh_u32)(product >> 32);
382 * }
383 *
384 * This instruction was designed for efficient long multiplication,
385  *    and allows this to be calculated in only 4 instructions, which
386  *    is comparable to some 64-bit ALUs.
387 *
388 * 3. It isn't terrible on other platforms. Usually this will be
389 * a couple of 32-bit ADD/ADCs.
390 */
391
392 /* First calculate all of the cross products. */
393 xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
394 xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
395 xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
396 xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);
397
398 /* Now add the products together. These will never overflow. */
399 xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
400 xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
401 xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
402
403 XXH128_hash_t r128 = { lower, upper };
404 return r128;
405 #endif
406 }
407
408 /*
409 * We want to keep the attribute here because a target switch
410 * disables inlining.
411 *
412 * Does a 64-bit to 128-bit multiply, then XOR folds it.
413 * The reason for the separate function is to prevent passing
414 * too many structs around by value. This will hopefully inline
415 * the multiply, but we don't force it.
416 */
417 #if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
418 __attribute__((__target__("no-sse")))
419 #endif
420 static xxh_u64
421 XXH3p_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
422 {
423 XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
424 return product.low64 ^ product.high64;
425 }
426
427
428 static XXH64_hash_t XXH3p_avalanche(xxh_u64 h64)
429 {
430 h64 ^= h64 >> 37;
431 h64 *= PRIME64_3;
432 h64 ^= h64 >> 32;
433 return h64;
434 }
435
436
437 /* ==========================================
438 * Short keys
439 * ========================================== */
440
441 XXH_FORCE_INLINE XXH64_hash_t
442 XXH3p_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
443 {
444 XXH_ASSERT(input != NULL);
445 XXH_ASSERT(1 <= len && len <= 3);
446 XXH_ASSERT(secret != NULL);
447 { xxh_u8 const c1 = input[0];
448 xxh_u8 const c2 = input[len >> 1];
449 xxh_u8 const c3 = input[len - 1];
450 xxh_u32 const combined = ((xxh_u32)c1) | (((xxh_u32)c2) << 8) | (((xxh_u32)c3) << 16) | (((xxh_u32)len) << 24);
451 xxh_u64 const keyed = (xxh_u64)combined ^ (XXH_readLE32(secret) + seed);
452 xxh_u64 const mixed = keyed * PRIME64_1;
453 return XXH3p_avalanche(mixed);
454 }
455 }
456
457 XXH_FORCE_INLINE XXH64_hash_t
458 XXH3p_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
459 {
460 XXH_ASSERT(input != NULL);
461 XXH_ASSERT(secret != NULL);
462 XXH_ASSERT(4 <= len && len <= 8);
463 { xxh_u32 const input_lo = XXH_readLE32(input);
464 xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
465 xxh_u64 const input_64 = input_lo | ((xxh_u64)input_hi << 32);
466 xxh_u64 const keyed = input_64 ^ (XXH_readLE64(secret) + seed);
467 xxh_u64 const mix64 = len + ((keyed ^ (keyed >> 51)) * PRIME32_1);
468 return XXH3p_avalanche((mix64 ^ (mix64 >> 47)) * PRIME64_2);
469 }
470 }
471
472 XXH_FORCE_INLINE XXH64_hash_t
473 XXH3p_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
474 {
475 XXH_ASSERT(input != NULL);
476 XXH_ASSERT(secret != NULL);
477 XXH_ASSERT(9 <= len && len <= 16);
478 { xxh_u64 const input_lo = XXH_readLE64(input) ^ (XXH_readLE64(secret) + seed);
479 xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ (XXH_readLE64(secret + 8) - seed);
480 xxh_u64 const acc = len + (input_lo + input_hi) + XXH3p_mul128_fold64(input_lo, input_hi);
481 return XXH3p_avalanche(acc);
482 }
483 }
484
485 XXH_FORCE_INLINE XXH64_hash_t
486 XXH3p_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
487 {
488 XXH_ASSERT(len <= 16);
489 { if (len > 8) return XXH3p_len_9to16_64b(input, len, secret, seed);
490 if (len >= 4) return XXH3p_len_4to8_64b(input, len, secret, seed);
491 if (len) return XXH3p_len_1to3_64b(input, len, secret, seed);
492 /*
493 * RocksDB modification from XXH3 preview: zero result for empty
494 * string can be problematic for multiplication-based algorithms.
495 * Return a hash of the seed instead.
496 */
497 return XXH3p_mul128_fold64(seed + XXH_readLE64(secret), PRIME64_2);
498 }
499 }
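/*
 * Consequence of the RocksDB modification above (derived from the code, not
 * from published test vectors): a zero-length input now evaluates to
 *
 *     XXH3p_mul128_fold64(seed + XXH_readLE64(secret), PRIME64_2)
 *
 * so the result depends on the seed and the first 8 bytes of the secret.
 */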
500
501
502 /* === Long Keys === */
503
504 #define STRIPE_LEN 64
505 #define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */
506 #define ACC_NB (STRIPE_LEN / sizeof(xxh_u64))
507
508 typedef enum { XXH3p_acc_64bits, XXH3p_acc_128bits } XXH3p_accWidth_e;
509
510 XXH_FORCE_INLINE void
511 XXH3p_accumulate_512( void* XXH_RESTRICT acc,
512 const void* XXH_RESTRICT input,
513 const void* XXH_RESTRICT secret,
514 XXH3p_accWidth_e accWidth)
515 {
516 #if (XXH_VECTOR == XXH_AVX2)
517
518 XXH_ASSERT((((size_t)acc) & 31) == 0);
519 { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc;
520 const __m256i* const xinput = (const __m256i *) input; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */
521 const __m256i* const xsecret = (const __m256i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */
522
523 size_t i;
524 for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
525 __m256i const data_vec = _mm256_loadu_si256 (xinput+i);
526 __m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
527             __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);                                  /* uint32 dk[8]  = {d0^k0, d1^k1, d2^k2, d3^k3, ...} */
528 __m256i const product = _mm256_mul_epu32 (data_key, _mm256_shuffle_epi32 (data_key, 0x31)); /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */
529 if (accWidth == XXH3p_acc_128bits) {
530 __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
531 __m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
532 xacc[i] = _mm256_add_epi64(product, sum);
533 } else { /* XXH3p_acc_64bits */
534 __m256i const sum = _mm256_add_epi64(xacc[i], data_vec);
535 xacc[i] = _mm256_add_epi64(product, sum);
536 }
537 } }
538
539 #elif (XXH_VECTOR == XXH_SSE2)
540
541 XXH_ASSERT((((size_t)acc) & 15) == 0);
542 { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc;
543 const __m128i* const xinput = (const __m128i *) input; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */
544 const __m128i* const xsecret = (const __m128i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */
545
546 size_t i;
547 for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
548 __m128i const data_vec = _mm_loadu_si128 (xinput+i);
549 __m128i const key_vec = _mm_loadu_si128 (xsecret+i);
550             __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);                                  /* uint32 dk[4]  = {d0^k0, d1^k1, d2^k2, d3^k3} */
551             __m128i const product  = _mm_mul_epu32 (data_key, _mm_shuffle_epi32 (data_key, 0x31));  /* uint64 mul[2] = {dk0*dk1, dk2*dk3} */
552 if (accWidth == XXH3p_acc_128bits) {
553 __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
554 __m128i const sum = _mm_add_epi64(xacc[i], data_swap);
555 xacc[i] = _mm_add_epi64(product, sum);
556 } else { /* XXH3p_acc_64bits */
557 __m128i const sum = _mm_add_epi64(xacc[i], data_vec);
558 xacc[i] = _mm_add_epi64(product, sum);
559 }
560 } }
561
562 #elif (XXH_VECTOR == XXH_NEON)
563
564 XXH_ASSERT((((size_t)acc) & 15) == 0);
565 {
566 XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
567 /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
568 uint8_t const* const xinput = (const uint8_t *) input;
569 uint8_t const* const xsecret = (const uint8_t *) secret;
570
571 size_t i;
572 for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
573 #if !defined(__aarch64__) && !defined(__arm64__) && defined(__GNUC__) /* ARM32-specific hack */
574 /* vzip on ARMv7 Clang generates a lot of vmovs (technically vorrs) without this.
575 * vzip on 32-bit ARM NEON will overwrite the original register, and I think that Clang
576 * assumes I don't want to destroy it and tries to make a copy. This slows down the code
577 * a lot.
578 * aarch64 not only uses an entirely different syntax, but it requires three
579 * instructions...
580 * ext v1.16B, v0.16B, #8 // select high bits because aarch64 can't address them directly
581 * zip1 v3.2s, v0.2s, v1.2s // first zip
582 * zip2 v2.2s, v0.2s, v1.2s // second zip
583 * ...to do what ARM does in one:
584 * vzip.32 d0, d1 // Interleave high and low bits and overwrite. */
585
586         /* data_vec = xinput[i]; */
587 uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16));
588 /* key_vec = xsecret[i]; */
589 uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16));
590 /* data_key = data_vec ^ key_vec; */
591 uint32x4_t data_key;
592
593 if (accWidth == XXH3p_acc_64bits) {
594 /* Add first to prevent register swaps */
595 /* xacc[i] += data_vec; */
596 xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec));
597 } else { /* XXH3p_acc_128bits */
598 /* xacc[i] += swap(data_vec); */
599 /* can probably be optimized better */
600 uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
601 uint64x2_t const swapped= vextq_u64(data64, data64, 1);
602 xacc[i] = vaddq_u64 (xacc[i], swapped);
603 }
604
605 data_key = vreinterpretq_u32_u8(veorq_u8(data_vec, key_vec));
606
607 /* Here's the magic. We use the quirkiness of vzip to shuffle data_key in place.
608 * shuffle: data_key[0, 1, 2, 3] = data_key[0, 2, 1, 3] */
609 __asm__("vzip.32 %e0, %f0" : "+w" (data_key));
610 /* xacc[i] += (uint64x2_t) data_key[0, 1] * (uint64x2_t) data_key[2, 3]; */
611 xacc[i] = vmlal_u32(xacc[i], vget_low_u32(data_key), vget_high_u32(data_key));
612
613 #else
614 /* On aarch64, vshrn/vmovn seems to be equivalent to, if not faster than, the vzip method. */
615
616         /* data_vec = xinput[i]; */
617 uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16));
618 /* key_vec = xsecret[i]; */
619 uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16));
620 /* data_key = data_vec ^ key_vec; */
621 uint64x2_t const data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
622 /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); */
623 uint32x2_t const data_key_lo = vmovn_u64 (data_key);
624 /* data_key_hi = (uint32x2_t) (data_key >> 32); */
625 uint32x2_t const data_key_hi = vshrn_n_u64 (data_key, 32);
626 if (accWidth == XXH3p_acc_64bits) {
627 /* xacc[i] += data_vec; */
628 xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec));
629 } else { /* XXH3p_acc_128bits */
630 /* xacc[i] += swap(data_vec); */
631 uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
632 uint64x2_t const swapped= vextq_u64(data64, data64, 1);
633 xacc[i] = vaddq_u64 (xacc[i], swapped);
634 }
635 /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
636 xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);
637
638 #endif
639 }
640 }
641
642 #elif (XXH_VECTOR == XXH_VSX) && /* work around a compiler bug */ (__GNUC__ > 5)
643 U64x2* const xacc = (U64x2*) acc; /* presumed aligned */
644 U64x2 const* const xinput = (U64x2 const*) input; /* no alignment restriction */
645 U64x2 const* const xsecret = (U64x2 const*) secret; /* no alignment restriction */
646 U64x2 const v32 = { 32, 32 };
647 #if XXH_VSX_BE
648 U8x16 const vXorSwap = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
649 0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
650 #endif
651 size_t i;
652 for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
653 /* data_vec = xinput[i]; */
654 /* key_vec = xsecret[i]; */
655 #if XXH_VSX_BE
656 /* byteswap */
657 U64x2 const data_vec = XXH_vec_revb(vec_vsx_ld(0, xinput + i));
658 U64x2 const key_raw = vec_vsx_ld(0, xsecret + i);
659 /* See comment above. data_key = data_vec ^ swap(xsecret[i]); */
660 U64x2 const data_key = (U64x2)XXH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap);
661 #else
662 U64x2 const data_vec = vec_vsx_ld(0, xinput + i);
663 U64x2 const key_vec = vec_vsx_ld(0, xsecret + i);
664 U64x2 const data_key = data_vec ^ key_vec;
665 #endif
666 /* shuffled = (data_key << 32) | (data_key >> 32); */
667 U32x4 const shuffled = (U32x4)vec_rl(data_key, v32);
668 /* product = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)shuffled & 0xFFFFFFFF); */
669 U64x2 const product = XXH_vec_mulo((U32x4)data_key, shuffled);
670 xacc[i] += product;
671
672 if (accWidth == XXH3p_acc_64bits) {
673 xacc[i] += data_vec;
674 } else { /* XXH3p_acc_128bits */
675 /* swap high and low halves */
676 U64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2);
677 xacc[i] += data_swapped;
678 }
679 }
680
681 #else /* scalar variant of Accumulator - universal */
682
683       XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc;    /* presumed aligned on XXH_ACC_ALIGN-byte boundaries, a little hint for the auto-vectorizer */
684 const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */
685 const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */
686 size_t i;
687 XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
688 for (i=0; i < ACC_NB; i++) {
689 xxh_u64 const data_val = XXH_readLE64(xinput + 8*i);
690 xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8);
691
692 if (accWidth == XXH3p_acc_64bits) {
693 xacc[i] += data_val;
694 } else {
695 xacc[i ^ 1] += data_val; /* swap adjacent lanes */
696 }
697 xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
698 }
699 #endif
700 }
701
702 XXH_FORCE_INLINE void
703 XXH3p_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
704 {
705 #if (XXH_VECTOR == XXH_AVX2)
706
707 XXH_ASSERT((((size_t)acc) & 31) == 0);
708 { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
709 const __m256i* const xsecret = (const __m256i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this argument type */
710 const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1);
711
712 size_t i;
713 for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
714 /* xacc[i] ^= (xacc[i] >> 47) */
715 __m256i const acc_vec = xacc[i];
716 __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47);
717 __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted);
718 /* xacc[i] ^= xsecret; */
719 __m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
720 __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);
721
722 /* xacc[i] *= PRIME32_1; */
723 __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, 0x31);
724 __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32);
725 __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32);
726 xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
727 }
728 }
729
730 #elif (XXH_VECTOR == XXH_SSE2)
731
732 XXH_ASSERT((((size_t)acc) & 15) == 0);
733 { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
734 const __m128i* const xsecret = (const __m128i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this argument type */
735 const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1);
736
737 size_t i;
738 for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
739 /* xacc[i] ^= (xacc[i] >> 47) */
740 __m128i const acc_vec = xacc[i];
741 __m128i const shifted = _mm_srli_epi64 (acc_vec, 47);
742 __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted);
743 /* xacc[i] ^= xsecret; */
744 __m128i const key_vec = _mm_loadu_si128 (xsecret+i);
745 __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);
746
747 /* xacc[i] *= PRIME32_1; */
748 __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, 0x31);
749 __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32);
750 __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32);
751 xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
752 }
753 }
754
755 #elif (XXH_VECTOR == XXH_NEON)
756
757 XXH_ASSERT((((size_t)acc) & 15) == 0);
758
759 { uint64x2_t* const xacc = (uint64x2_t*) acc;
760 uint8_t const* const xsecret = (uint8_t const*) secret;
761 uint32x2_t const prime = vdup_n_u32 (PRIME32_1);
762
763 size_t i;
764 for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) {
765 /* data_vec = xacc[i] ^ (xacc[i] >> 47); */
766 uint64x2_t const acc_vec = xacc[i];
767 uint64x2_t const shifted = vshrq_n_u64 (acc_vec, 47);
768 uint64x2_t const data_vec = veorq_u64 (acc_vec, shifted);
769
770 /* key_vec = xsecret[i]; */
771 uint32x4_t const key_vec = vreinterpretq_u32_u8(vld1q_u8(xsecret + (i * 16)));
772 /* data_key = data_vec ^ key_vec; */
773 uint32x4_t const data_key = veorq_u32 (vreinterpretq_u32_u64(data_vec), key_vec);
774 /* shuffled = { data_key[0, 2], data_key[1, 3] }; */
775 uint32x2x2_t const shuffled = vzip_u32 (vget_low_u32(data_key), vget_high_u32(data_key));
776
777 /* data_key *= PRIME32_1 */
778
779 /* prod_hi = (data_key >> 32) * PRIME32_1; */
780 uint64x2_t const prod_hi = vmull_u32 (shuffled.val[1], prime);
781 /* xacc[i] = prod_hi << 32; */
782 xacc[i] = vshlq_n_u64(prod_hi, 32);
783 /* xacc[i] += (prod_hi & 0xFFFFFFFF) * PRIME32_1; */
784 xacc[i] = vmlal_u32(xacc[i], shuffled.val[0], prime);
785 } }
786
787 #elif (XXH_VECTOR == XXH_VSX) && /* work around a compiler bug */ (__GNUC__ > 5)
788
789 U64x2* const xacc = (U64x2*) acc;
790 const U64x2* const xsecret = (const U64x2*) secret;
791 /* constants */
792 U64x2 const v32 = { 32, 32 };
793 U64x2 const v47 = { 47, 47 };
794 U32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 };
795 size_t i;
796 #if XXH_VSX_BE
797 /* endian swap */
798 U8x16 const vXorSwap = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
799 0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
800 #endif
801 for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
802 U64x2 const acc_vec = xacc[i];
803 U64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
804 /* key_vec = xsecret[i]; */
805 #if XXH_VSX_BE
806         /* as above : data_key = data_vec ^ swap(xsecret[i]) */
807 U64x2 const key_raw = vec_vsx_ld(0, xsecret + i);
808 U64x2 const data_key = (U64x2)XXH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap);
809 #else
810 U64x2 const key_vec = vec_vsx_ld(0, xsecret + i);
811 U64x2 const data_key = data_vec ^ key_vec;
812 #endif
813
814 /* data_key *= PRIME32_1 */
815
816 /* prod_lo = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)prime & 0xFFFFFFFF); */
817 U64x2 const prod_even = XXH_vec_mule((U32x4)data_key, prime);
818 /* prod_hi = ((U64x2)data_key >> 32) * ((U64x2)prime >> 32); */
819 U64x2 const prod_odd = XXH_vec_mulo((U32x4)data_key, prime);
820 xacc[i] = prod_odd + (prod_even << v32);
821 }
822
823 #else /* scalar variant of Scrambler - universal */
824
825       XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc;    /* presumed aligned on XXH_ACC_ALIGN-byte boundaries, a little hint for the auto-vectorizer */
826 const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */
827 size_t i;
828 XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
829 for (i=0; i < ACC_NB; i++) {
830 xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i);
831 xxh_u64 acc64 = xacc[i];
832 acc64 ^= acc64 >> 47;
833 acc64 ^= key64;
834 acc64 *= PRIME32_1;
835 xacc[i] = acc64;
836 }
837
838 #endif
839 }
840
841 #define XXH_PREFETCH_DIST 384
842
843 /* assumption : nbStripes will not overflow secret size */
844 XXH_FORCE_INLINE void
845 XXH3p_accumulate( xxh_u64* XXH_RESTRICT acc,
846 const xxh_u8* XXH_RESTRICT input,
847 const xxh_u8* XXH_RESTRICT secret,
848 size_t nbStripes,
849 XXH3p_accWidth_e accWidth)
850 {
851 size_t n;
852 for (n = 0; n < nbStripes; n++ ) {
853 const xxh_u8* const in = input + n*STRIPE_LEN;
854 XXH_PREFETCH(in + XXH_PREFETCH_DIST);
855 XXH3p_accumulate_512(acc,
856 in,
857 secret + n*XXH_SECRET_CONSUME_RATE,
858 accWidth);
859 }
860 }
861
862 /* note : clang auto-vectorizes well in SSE2 mode _if_ this function is `static`,
863  *        and doesn't auto-vectorize it at all if it is `FORCE_INLINE`.
864  *        However, it auto-vectorizes AVX2 better if it is `FORCE_INLINE`.
865  *        Pretty much every other mode and compiler prefers `FORCE_INLINE`.
866  */
867
868 #if defined(__clang__) && (XXH_VECTOR==0) && !defined(__AVX2__) && !defined(__arm__) && !defined(__thumb__)
869 static void
870 #else
871 XXH_FORCE_INLINE void
872 #endif
873 XXH3p_hashLong_internal_loop( xxh_u64* XXH_RESTRICT acc,
874 const xxh_u8* XXH_RESTRICT input, size_t len,
875 const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
876 XXH3p_accWidth_e accWidth)
877 {
878 size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
879 size_t const block_len = STRIPE_LEN * nb_rounds;
880 size_t const nb_blocks = len / block_len;
881
882 size_t n;
883
884 XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);
885
886 for (n = 0; n < nb_blocks; n++) {
887 XXH3p_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth);
888 XXH3p_scrambleAcc(acc, secret + secretSize - STRIPE_LEN);
889 }
890
891 /* last partial block */
892 XXH_ASSERT(len > STRIPE_LEN);
893 { size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN;
894 XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
895 XXH3p_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth);
896
897 /* last stripe */
898 if (len & (STRIPE_LEN - 1)) {
899 const xxh_u8* const p = input + len - STRIPE_LEN;
900 #define XXH_SECRET_LASTACC_START 7 /* do not align on 8, so that secret is different from scrambler */
901 XXH3p_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth);
902 } }
903 }
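/*
 * Worked example (for orientation, derived from the constants above): with the
 * default 192-byte secret, nb_rounds = (192 - 64) / 8 = 16 stripes per block,
 * so block_len = 16 * 64 = 1024 bytes. Within a block the secret is read at
 * offsets 0, 8, ..., 120 (64 bytes per stripe), and each scramble uses the
 * last 64 bytes of the secret (offset 128).
 */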
904
905 XXH_FORCE_INLINE xxh_u64
906 XXH3p_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
907 {
908 return XXH3p_mul128_fold64(
909 acc[0] ^ XXH_readLE64(secret),
910 acc[1] ^ XXH_readLE64(secret+8) );
911 }
912
913 static XXH64_hash_t
914 XXH3p_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
915 {
916 xxh_u64 result64 = start;
917
918 result64 += XXH3p_mix2Accs(acc+0, secret + 0);
919 result64 += XXH3p_mix2Accs(acc+2, secret + 16);
920 result64 += XXH3p_mix2Accs(acc+4, secret + 32);
921 result64 += XXH3p_mix2Accs(acc+6, secret + 48);
922
923 return XXH3p_avalanche(result64);
924 }
925
926 #define XXH3p_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \
927 PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 };
928
929 XXH_FORCE_INLINE XXH64_hash_t
930 XXH3p_hashLong_internal(const xxh_u8* XXH_RESTRICT input, size_t len,
931 const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
932 {
933 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3p_INIT_ACC;
934
935 XXH3p_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3p_acc_64bits);
936
937 /* converge into final hash */
938 XXH_STATIC_ASSERT(sizeof(acc) == 64);
939 #define XXH_SECRET_MERGEACCS_START 11 /* do not align on 8, so that secret is different from accumulator */
940 XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
941 return XXH3p_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1);
942 }
943
944
945 XXH_NO_INLINE XXH64_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
946 XXH3p_hashLong_64b_defaultSecret(const xxh_u8* XXH_RESTRICT input, size_t len)
947 {
948 return XXH3p_hashLong_internal(input, len, kSecret, sizeof(kSecret));
949 }
950
951 XXH_NO_INLINE XXH64_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
952 XXH3p_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len,
953 const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
954 {
955 return XXH3p_hashLong_internal(input, len, secret, secretSize);
956 }
957
958
959 XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
960 {
961 if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
962 memcpy(dst, &v64, sizeof(v64));
963 }
964
965 /* XXH3p_initCustomSecret() :
966 * destination `customSecret` is presumed allocated and same size as `kSecret`.
967 */
968 XXH_FORCE_INLINE void XXH3p_initCustomSecret(xxh_u8* customSecret, xxh_u64 seed64)
969 {
970 int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
971 int i;
972
973 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
974
975 for (i=0; i < nbRounds; i++) {
976 XXH_writeLE64(customSecret + 16*i, XXH_readLE64(kSecret + 16*i) + seed64);
977 XXH_writeLE64(customSecret + 16*i + 8, XXH_readLE64(kSecret + 16*i + 8) - seed64);
978 }
979 }
980
981
982 /* XXH3p_hashLong_64b_withSeed() :
983  * Generate a custom key based on an alteration of the default kSecret with
984  * the seed, and then use this key for long-mode hashing.
985  * This operation is decently fast but nonetheless costs a little bit of time.
986  * Try to avoid it whenever possible: with seed==0 the default-secret path is
987  * taken and no custom key is generated.
988 */
989 XXH_NO_INLINE XXH64_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
990 XXH3p_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
991 {
992 XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
993 if (seed==0) return XXH3p_hashLong_64b_defaultSecret(input, len);
994 XXH3p_initCustomSecret(secret, seed);
995 return XXH3p_hashLong_internal(input, len, secret, sizeof(secret));
996 }
997
998
999 XXH_FORCE_INLINE xxh_u64 XXH3p_mix16B(const xxh_u8* XXH_RESTRICT input,
1000 const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
1001 {
1002 xxh_u64 const input_lo = XXH_readLE64(input);
1003 xxh_u64 const input_hi = XXH_readLE64(input+8);
1004 return XXH3p_mul128_fold64(
1005 input_lo ^ (XXH_readLE64(secret) + seed64),
1006 input_hi ^ (XXH_readLE64(secret+8) - seed64) );
1007 }
1008
1009
1010 XXH_FORCE_INLINE XXH64_hash_t
1011 XXH3p_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
1012 const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
1013 XXH64_hash_t seed)
1014 {
1015 XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
1016 XXH_ASSERT(16 < len && len <= 128);
1017
1018 { xxh_u64 acc = len * PRIME64_1;
1019 if (len > 32) {
1020 if (len > 64) {
1021 if (len > 96) {
1022 acc += XXH3p_mix16B(input+48, secret+96, seed);
1023 acc += XXH3p_mix16B(input+len-64, secret+112, seed);
1024 }
1025 acc += XXH3p_mix16B(input+32, secret+64, seed);
1026 acc += XXH3p_mix16B(input+len-48, secret+80, seed);
1027 }
1028 acc += XXH3p_mix16B(input+16, secret+32, seed);
1029 acc += XXH3p_mix16B(input+len-32, secret+48, seed);
1030 }
1031 acc += XXH3p_mix16B(input+0, secret+0, seed);
1032 acc += XXH3p_mix16B(input+len-16, secret+16, seed);
1033
1034 return XXH3p_avalanche(acc);
1035 }
1036 }
1037
1038 #define XXH3p_MIDSIZE_MAX 240
1039
1040 XXH_NO_INLINE XXH64_hash_t
1041 XXH3p_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
1042 const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
1043 XXH64_hash_t seed)
1044 {
1045 XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
1046 XXH_ASSERT(128 < len && len <= XXH3p_MIDSIZE_MAX);
1047
1048 #define XXH3p_MIDSIZE_STARTOFFSET 3
1049 #define XXH3p_MIDSIZE_LASTOFFSET 17
1050
1051 { xxh_u64 acc = len * PRIME64_1;
1052 int const nbRounds = (int)len / 16;
1053 int i;
1054 for (i=0; i<8; i++) {
1055 acc += XXH3p_mix16B(input+(16*i), secret+(16*i), seed);
1056 }
1057 acc = XXH3p_avalanche(acc);
1058 XXH_ASSERT(nbRounds >= 8);
1059 for (i=8 ; i < nbRounds; i++) {
1060 acc += XXH3p_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3p_MIDSIZE_STARTOFFSET, seed);
1061 }
1062 /* last bytes */
1063 acc += XXH3p_mix16B(input + len - 16, secret + XXH3p_SECRET_SIZE_MIN - XXH3p_MIDSIZE_LASTOFFSET, seed);
1064 return XXH3p_avalanche(acc);
1065 }
1066 }
1067
1068 /* === Public entry point === */
1069
1070 XXH_PUBLIC_API XXH64_hash_t XXH3p_64bits(const void* input, size_t len)
1071 {
1072 if (len <= 16) return XXH3p_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0);
1073 if (len <= 128) return XXH3p_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
1074 if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
1075 return XXH3p_hashLong_64b_defaultSecret((const xxh_u8*)input, len);
1076 }
1077
1078 XXH_PUBLIC_API XXH64_hash_t
1079 XXH3p_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
1080 {
1081 XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);
1082 /* if an action must be taken should `secret` conditions not be respected,
1083 * it should be done here.
1084 * For now, it's a contract pre-condition.
1085 * Adding a check and a branch here would cost performance at every hash */
1086 if (len <= 16) return XXH3p_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
1087 if (len <= 128) return XXH3p_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
1088 if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
1089 return XXH3p_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
1090 }
1091
1092 XXH_PUBLIC_API XXH64_hash_t
1093 XXH3p_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
1094 {
1095 if (len <= 16) return XXH3p_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed);
1096 if (len <= 128) return XXH3p_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
1097 if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
1098 return XXH3p_hashLong_64b_withSeed((const xxh_u8*)input, len, seed);
1099 }
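/*
 * Usage sketch (illustrative only; `buf`, `bufLen`, `mySecret` and
 * `mySecretSize` are placeholders, not part of this header):
 *
 *     XXH64_hash_t h0 = XXH3p_64bits(buf, bufLen);
 *     XXH64_hash_t h1 = XXH3p_64bits_withSeed(buf, bufLen, 42);
 *     XXH64_hash_t h2 = XXH3p_64bits_withSecret(buf, bufLen, mySecret, mySecretSize);
 *
 * A non-zero seed or a custom secret (of at least XXH3p_SECRET_SIZE_MIN bytes)
 * changes the result; seed==0 is equivalent to the default-secret variant.
 */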
1100
1101 /* === XXH3 streaming === */
1102
1103 /* RocksDB Note: unused & removed due to bug in preview version */
1104
1105 /* ==========================================
1106 * XXH3 128 bits (=> XXH128)
1107 * ========================================== */
1108
1109 XXH_FORCE_INLINE XXH128_hash_t
1110 XXH3p_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
1111 {
1112 XXH_ASSERT(input != NULL);
1113 XXH_ASSERT(1 <= len && len <= 3);
1114 XXH_ASSERT(secret != NULL);
1115 { xxh_u8 const c1 = input[0];
1116 xxh_u8 const c2 = input[len >> 1];
1117 xxh_u8 const c3 = input[len - 1];
1118 xxh_u32 const combinedl = ((xxh_u32)c1) + (((xxh_u32)c2) << 8) + (((xxh_u32)c3) << 16) + (((xxh_u32)len) << 24);
1119 xxh_u32 const combinedh = XXH_swap32(combinedl);
1120 xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ (XXH_readLE32(secret) + seed);
1121 xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ (XXH_readLE32(secret+4) - seed);
1122 xxh_u64 const mixedl = keyed_lo * PRIME64_1;
1123 xxh_u64 const mixedh = keyed_hi * PRIME64_5;
1124 XXH128_hash_t const h128 = { XXH3p_avalanche(mixedl) /*low64*/, XXH3p_avalanche(mixedh) /*high64*/ };
1125 return h128;
1126 }
1127 }
1128
1129
1130 XXH_FORCE_INLINE XXH128_hash_t
1131 XXH3p_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
1132 {
1133 XXH_ASSERT(input != NULL);
1134 XXH_ASSERT(secret != NULL);
1135 XXH_ASSERT(4 <= len && len <= 8);
1136 { xxh_u32 const input_lo = XXH_readLE32(input);
1137 xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
1138 xxh_u64 const input_64_lo = input_lo + ((xxh_u64)input_hi << 32);
1139 xxh_u64 const input_64_hi = XXH_swap64(input_64_lo);
1140 xxh_u64 const keyed_lo = input_64_lo ^ (XXH_readLE64(secret) + seed);
1141 xxh_u64 const keyed_hi = input_64_hi ^ (XXH_readLE64(secret + 8) - seed);
1142 xxh_u64 const mix64l1 = len + ((keyed_lo ^ (keyed_lo >> 51)) * PRIME32_1);
1143 xxh_u64 const mix64l2 = (mix64l1 ^ (mix64l1 >> 47)) * PRIME64_2;
1144 xxh_u64 const mix64h1 = ((keyed_hi ^ (keyed_hi >> 47)) * PRIME64_1) - len;
1145 xxh_u64 const mix64h2 = (mix64h1 ^ (mix64h1 >> 43)) * PRIME64_4;
1146 { XXH128_hash_t const h128 = { XXH3p_avalanche(mix64l2) /*low64*/, XXH3p_avalanche(mix64h2) /*high64*/ };
1147 return h128;
1148 } }
1149 }
1150
1151 XXH_FORCE_INLINE XXH128_hash_t
1152 XXH3p_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
1153 {
1154 XXH_ASSERT(input != NULL);
1155 XXH_ASSERT(secret != NULL);
1156 XXH_ASSERT(9 <= len && len <= 16);
1157 { xxh_u64 const input_lo = XXH_readLE64(input) ^ (XXH_readLE64(secret) + seed);
1158 xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ (XXH_readLE64(secret+8) - seed);
1159 XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi, PRIME64_1);
1160 xxh_u64 const lenContrib = XXH_mult32to64(len, PRIME32_5);
1161 m128.low64 += lenContrib;
1162 m128.high64 += input_hi * PRIME64_1;
1163 m128.low64 ^= (m128.high64 >> 32);
1164 { XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2);
1165 h128.high64 += m128.high64 * PRIME64_2;
1166 h128.low64 = XXH3p_avalanche(h128.low64);
1167 h128.high64 = XXH3p_avalanche(h128.high64);
1168 return h128;
1169 } }
1170 }
1171
1172 /* Assumption : `secret` size is >= 16
1173 * Note : it should be >= XXH3p_SECRET_SIZE_MIN anyway */
1174 XXH_FORCE_INLINE XXH128_hash_t
1175 XXH3p_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
1176 {
1177 XXH_ASSERT(len <= 16);
1178 { if (len > 8) return XXH3p_len_9to16_128b(input, len, secret, seed);
1179 if (len >= 4) return XXH3p_len_4to8_128b(input, len, secret, seed);
1180 if (len) return XXH3p_len_1to3_128b(input, len, secret, seed);
1181 { XXH128_hash_t const h128 = { 0, 0 };
1182 return h128;
1183 } }
1184 }
1185
1186 XXH_FORCE_INLINE XXH128_hash_t
1187 XXH3p_hashLong_128b_internal(const xxh_u8* XXH_RESTRICT input, size_t len,
1188 const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
1189 {
1190 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3p_INIT_ACC;
1191
1192 XXH3p_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3p_acc_128bits);
1193
1194 /* converge into final hash */
1195 XXH_STATIC_ASSERT(sizeof(acc) == 64);
1196 XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
1197 { xxh_u64 const low64 = XXH3p_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1);
1198 xxh_u64 const high64 = XXH3p_mergeAccs(acc, secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((xxh_u64)len * PRIME64_2));
1199 XXH128_hash_t const h128 = { low64, high64 };
1200 return h128;
1201 }
1202 }
1203
1204 XXH_NO_INLINE XXH128_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
1205 XXH3p_hashLong_128b_defaultSecret(const xxh_u8* input, size_t len)
1206 {
1207 return XXH3p_hashLong_128b_internal(input, len, kSecret, sizeof(kSecret));
1208 }
1209
1210 XXH_NO_INLINE XXH128_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
1211 XXH3p_hashLong_128b_withSecret(const xxh_u8* input, size_t len,
1212 const xxh_u8* secret, size_t secretSize)
1213 {
1214 return XXH3p_hashLong_128b_internal(input, len, secret, secretSize);
1215 }
1216
1217 XXH_NO_INLINE XXH128_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
1218 XXH3p_hashLong_128b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
1219 {
1220 XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
1221 if (seed == 0) return XXH3p_hashLong_128b_defaultSecret(input, len);
1222 XXH3p_initCustomSecret(secret, seed);
1223 return XXH3p_hashLong_128b_internal(input, len, secret, sizeof(secret));
1224 }
1225
1226
1227 XXH_FORCE_INLINE XXH128_hash_t
1228 XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, const xxh_u8* secret, XXH64_hash_t seed)
1229 {
1230 acc.low64 += XXH3p_mix16B (input_1, secret+0, seed);
1231 acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
1232 acc.high64 += XXH3p_mix16B (input_2, secret+16, seed);
1233 acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
1234 return acc;
1235 }
1236
1237 XXH_NO_INLINE XXH128_hash_t
1238 XXH3p_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
1239 const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
1240 XXH64_hash_t seed)
1241 {
1242 XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
1243 XXH_ASSERT(128 < len && len <= XXH3p_MIDSIZE_MAX);
1244
1245 { XXH128_hash_t acc;
1246 int const nbRounds = (int)len / 32;
1247 int i;
1248 acc.low64 = len * PRIME64_1;
1249 acc.high64 = 0;
1250 for (i=0; i<4; i++) {
1251 acc = XXH128_mix32B(acc, input+(32*i), input+(32*i)+16, secret+(32*i), seed);
1252 }
1253 acc.low64 = XXH3p_avalanche(acc.low64);
1254 acc.high64 = XXH3p_avalanche(acc.high64);
1255 XXH_ASSERT(nbRounds >= 4);
1256 for (i=4 ; i < nbRounds; i++) {
1257 acc = XXH128_mix32B(acc, input+(32*i), input+(32*i)+16, secret+XXH3p_MIDSIZE_STARTOFFSET+(32*(i-4)), seed);
1258 }
1259 /* last bytes */
1260 acc = XXH128_mix32B(acc, input + len - 16, input + len - 32, secret + XXH3p_SECRET_SIZE_MIN - XXH3p_MIDSIZE_LASTOFFSET - 16, 0ULL - seed);
1261
1262 { xxh_u64 const low64 = acc.low64 + acc.high64;
1263 xxh_u64 const high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + ((len - seed) * PRIME64_2);
1264 XXH128_hash_t const h128 = { XXH3p_avalanche(low64), (XXH64_hash_t)0 - XXH3p_avalanche(high64) };
1265 return h128;
1266 }
1267 }
1268 }
1269
1270
1271 XXH_FORCE_INLINE XXH128_hash_t
1272 XXH3p_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
1273 const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
1274 XXH64_hash_t seed)
1275 {
1276 XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
1277 XXH_ASSERT(16 < len && len <= 128);
1278
1279 { XXH128_hash_t acc;
1280 acc.low64 = len * PRIME64_1;
1281 acc.high64 = 0;
1282 if (len > 32) {
1283 if (len > 64) {
1284 if (len > 96) {
1285 acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
1286 }
1287 acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
1288 }
1289 acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
1290 }
1291 acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
1292 { xxh_u64 const low64 = acc.low64 + acc.high64;
1293 xxh_u64 const high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + ((len - seed) * PRIME64_2);
1294 XXH128_hash_t const h128 = { XXH3p_avalanche(low64), (XXH64_hash_t)0 - XXH3p_avalanche(high64) };
1295 return h128;
1296 }
1297 }
1298 }
1299
1300 XXH_PUBLIC_API XXH128_hash_t XXH3p_128bits(const void* input, size_t len)
1301 {
1302 if (len <= 16) return XXH3p_len_0to16_128b((const xxh_u8*)input, len, kSecret, 0);
1303 if (len <= 128) return XXH3p_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
1304 if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
1305 return XXH3p_hashLong_128b_defaultSecret((const xxh_u8*)input, len);
1306 }
1307
1308 XXH_PUBLIC_API XXH128_hash_t
1309 XXH3p_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
1310 {
1311 XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);
1312 /* if an action must be taken should `secret` conditions not be respected,
1313 * it should be done here.
1314 * For now, it's a contract pre-condition.
1315 * Adding a check and a branch here would cost performance at every hash */
1316 if (len <= 16) return XXH3p_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
1317 if (len <= 128) return XXH3p_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
1318 if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
1319 return XXH3p_hashLong_128b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
1320 }
1321
1322 XXH_PUBLIC_API XXH128_hash_t
1323 XXH3p_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
1324 {
1325 if (len <= 16) return XXH3p_len_0to16_128b((const xxh_u8*)input, len, kSecret, seed);
1326 if (len <= 128) return XXH3p_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
1327 if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
1328 return XXH3p_hashLong_128b_withSeed((const xxh_u8*)input, len, seed);
1329 }
1330
1331 XXH_PUBLIC_API XXH128_hash_t
1332 XXH128(const void* input, size_t len, XXH64_hash_t seed)
1333 {
1334 return XXH3p_128bits_withSeed(input, len, seed);
1335 }
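/*
 * Usage sketch (illustrative; `buf` and `bufLen` are placeholders): XXH128()
 * is simply the seeded 128-bit entry point, returning both halves by value.
 *
 *     XXH128_hash_t h = XXH128(buf, bufLen, 0);
 *     // h.low64 and h.high64 hold the two 64-bit halves of the result.
 */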
1336
1337
1338 /* === XXH3 128-bit streaming === */
1339
1340 /* RocksDB Note: unused & removed due to bug in preview version */
1341
1342 /* 128-bit utility functions */
1343
1344 #include <string.h> /* memcmp */
1345
1346 /* return : 1 if equal, 0 if different */
1347 XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
1348 {
1349 /* note : XXH128_hash_t is compact, it has no padding byte */
1350 return !(memcmp(&h1, &h2, sizeof(h1)));
1351 }
1352
1353 /* This prototype is compatible with stdlib's qsort().
1354 * return : >0 if *h128_1 > *h128_2
1355 * <0 if *h128_1 < *h128_2
1356 * =0 if *h128_1 == *h128_2 */
1357 XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
1358 {
1359 XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
1360 XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
1361 int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
1362 /* note : bets that, in most cases, hash values are different */
1363 if (hcmp) return hcmp;
1364 return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
1365 }
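/*
 * Usage sketch (illustrative): because XXH128_cmp() matches the qsort()
 * comparator prototype, an array of hashes can be sorted directly
 * (requires <stdlib.h> in the caller):
 *
 *     XXH128_hash_t hashes[16];
 *     // ... fill hashes ...
 *     qsort(hashes, 16, sizeof(hashes[0]), XXH128_cmp);
 */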
1366
1367
1368 /*====== Canonical representation ======*/
1369 XXH_PUBLIC_API void
1370 XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
1371 {
1372 XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
1373 if (XXH_CPU_LITTLE_ENDIAN) {
1374 hash.high64 = XXH_swap64(hash.high64);
1375 hash.low64 = XXH_swap64(hash.low64);
1376 }
1377 memcpy(dst, &hash.high64, sizeof(hash.high64));
1378 memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
1379 }
1380
1381 XXH_PUBLIC_API XXH128_hash_t
1382 XXH128_hashFromCanonical(const XXH128_canonical_t* src)
1383 {
1384 XXH128_hash_t h;
1385 h.high64 = XXH_readBE64(src);
1386 h.low64 = XXH_readBE64(src->digest + 8);
1387 return h;
1388 }
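/*
 * Usage sketch (illustrative; `h` is a placeholder XXH128_hash_t): the
 * canonical form is a fixed big-endian byte layout suitable for storage or
 * transmission; a round trip restores the original value.
 *
 *     XXH128_canonical_t canon;
 *     XXH128_canonicalFromHash(&canon, h);
 *     XXH128_hash_t back = XXH128_hashFromCanonical(&canon);
 *     // XXH128_isEqual(h, back) == 1
 */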
1389
1390
1391
1392 #endif /* XXH3p_H */