// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
/*
   xxHash - Extremely Fast Hash algorithm
   Development source file for `xxh3`
   Copyright (C) 2019-present, Yann Collet.

   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:

       * Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above
   copyright notice, this list of conditions and the following disclaimer
   in the documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   You can contact the author at:
     - xxHash source repository: https://github.com/Cyan4973/xxHash
*/

/* RocksDB Note: This file contains a preview release (xxhash repository
   version 0.7.2) of XXH3 that is unlikely to be compatible with the final
   version of XXH3. We have therefore renamed it XXH3p ("preview") for
   clarity, so that we can continue to use this version even after
   integrating a newer, incompatible version.
*/

/* Note:
   This file is separated for development purposes.
   It will be integrated into `xxhash.c` when the development phase is complete.
*/

#ifndef XXH3p_H
#define XXH3p_H


/* === Dependencies === */

#undef XXH_INLINE_ALL /* in case it's already defined */
#define XXH_INLINE_ALL
#include "xxhash.h"


/* === Compiler specifics === */

#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */
# define XXH_RESTRICT restrict
#else
/* note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
# define XXH_RESTRICT /* disable */
#endif

#if defined(__GNUC__)
# if defined(__AVX2__)
#  include <immintrin.h>
# elif defined(__SSE2__)
#  include <emmintrin.h>
# elif defined(__ARM_NEON__) || defined(__ARM_NEON)
#  define inline __inline__ /* clang bug */
#  include <arm_neon.h>
#  undef inline
# endif
#elif defined(_MSC_VER)
# include <intrin.h>
#endif

/*
 * Sanity check.
 *
 * XXH3 only requires these features to be efficient:
 *
 *  - Usable unaligned access
 *  - A 32-bit or 64-bit ALU
 *      - If 32-bit, a decent ADC instruction
 *  - A 32-bit or 64-bit multiply with a 64-bit result
 *
 * Almost all 32-bit and 64-bit targets meet this, except for Thumb-1, the
 * classic 16-bit only subset of ARM's instruction set.
 *
 * First of all, Thumb-1 lacks support for the UMULL instruction which
 * performs the important long multiply. This means numerous __aeabi_lmul
 * calls.
 *
 * Second of all, the 8 functional registers are just not enough.
 * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
 * Lo registers, and this shuffling results in thousands more MOVs than A32.
 *
 * A32 and T32 don't have this limitation. They can access all 14 registers,
 * do a 32->64 multiply with UMULL, and the flexible operand is helpful too.
 *
 * If compiling Thumb-1 for a target which supports ARM instructions, we
 * will give a warning.
 *
 * Usually, if this happens, it is because of an accident and you probably
 * need to specify -march, as you probably meant to compile for a newer
 * architecture.
 */
#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
# warning "XXH3 is highly inefficient without ARM or Thumb-2."
#endif

/* ==========================================
 * Vectorization detection
 * ========================================== */
#define XXH_SCALAR 0
#define XXH_SSE2 1
#define XXH_AVX2 2
#define XXH_NEON 3
#define XXH_VSX 4

#ifndef XXH_VECTOR /* can be defined on command line */
# if defined(__AVX2__)
#  define XXH_VECTOR XXH_AVX2
# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
#  define XXH_VECTOR XXH_SSE2
# elif defined(__GNUC__) /* msvc support maybe later */ \
    && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
    && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
        || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
#  define XXH_VECTOR XXH_NEON
# elif defined(__PPC64__) && defined(__POWER8_VECTOR__) && defined(__GNUC__)
#  define XXH_VECTOR XXH_VSX
# else
#  define XXH_VECTOR XXH_SCALAR
# endif
#endif

/* control alignment of accumulator,
 * for compatibility with fast vector loads */
#ifndef XXH_ACC_ALIGN
# if XXH_VECTOR == 0 /* scalar */
#  define XXH_ACC_ALIGN 8
# elif XXH_VECTOR == 1 /* sse2 */
#  define XXH_ACC_ALIGN 16
# elif XXH_VECTOR == 2 /* avx2 */
#  define XXH_ACC_ALIGN 32
# elif XXH_VECTOR == 3 /* neon */
#  define XXH_ACC_ALIGN 16
# elif XXH_VECTOR == 4 /* vsx */
#  define XXH_ACC_ALIGN 16
# endif
#endif

/* xxh_u64 XXH_mult32to64(xxh_u32 a, xxh_u64 b) { return (xxh_u64)a * (xxh_u64)b; } */
#if defined(_MSC_VER) && defined(_M_IX86)
# include <intrin.h>
# define XXH_mult32to64(x, y) __emulu(x, y)
#else
# define XXH_mult32to64(x, y) ((xxh_u64)((x) & 0xFFFFFFFF) * (xxh_u64)((y) & 0xFFFFFFFF))
#endif

/* VSX stuff. It's a lot because VSX support is mediocre across compilers and
 * there is a lot of mischief with endianness. */
#if XXH_VECTOR == XXH_VSX
# include <altivec.h>
# undef vector
typedef __vector unsigned long long U64x2;
typedef __vector unsigned char U8x16;
typedef __vector unsigned U32x4;

#ifndef XXH_VSX_BE
# if defined(__BIG_ENDIAN__) \
    || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#  define XXH_VSX_BE 1
# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
#  warning "-maltivec=be is not recommended. Please use native endianness."
#  define XXH_VSX_BE 1
# else
#  define XXH_VSX_BE 0
# endif
#endif

/* We need some helpers for big endian mode. */
#if XXH_VSX_BE
/* A wrapper for POWER9's vec_revb. */
# ifdef __POWER9_VECTOR__
#  define XXH_vec_revb vec_revb
# else
XXH_FORCE_INLINE U64x2 XXH_vec_revb(U64x2 val)
{
    U8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
                              0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
    return vec_perm(val, val, vByteSwap);
}
# endif

/* Power8 Crypto gives us vpermxor which is very handy for
 * PPC64EB.
 *
 * U8x16 vpermxor(U8x16 a, U8x16 b, U8x16 mask)
 * {
 *     U8x16 ret;
 *     for (int i = 0; i < 16; i++) {
 *         ret[i] = a[mask[i] & 0xF] ^ b[mask[i] >> 4];
 *     }
 *     return ret;
 * }
 *
 * Because both of the main loops load the key, swap, and xor it with input,
 * we can combine the key swap into this instruction.
 */
# ifdef vec_permxor
#  define XXH_vec_permxor vec_permxor
# else
#  define XXH_vec_permxor __builtin_crypto_vpermxor
# endif
#endif /* XXH_VSX_BE */
/*
 * Because we reinterpret the multiply, there are endian memes: vec_mulo actually becomes
 * vec_mule.
 *
 * Additionally, the intrinsic wasn't added until GCC 8, despite existing for a while.
 * Clang has an easy way to control this, we can just use the builtin which doesn't swap.
 * GCC needs inline assembly. */
#ifndef __has_builtin
# define __has_builtin(x) 0 /* compilers without __has_builtin() (e.g. older GCC) take the inline-asm path */
#endif
#if __has_builtin(__builtin_altivec_vmuleuw)
# define XXH_vec_mulo __builtin_altivec_vmulouw
# define XXH_vec_mule __builtin_altivec_vmuleuw
#else
/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
XXH_FORCE_INLINE U64x2 XXH_vec_mulo(U32x4 a, U32x4 b) {
    U64x2 result;
    __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
    return result;
}
XXH_FORCE_INLINE U64x2 XXH_vec_mule(U32x4 a, U32x4 b) {
    U64x2 result;
    __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
    return result;
}
#endif /* __has_builtin(__builtin_altivec_vmuleuw) */
#endif /* XXH_VECTOR == XXH_VSX */

/* prefetch
 * can be disabled by defining the XXH_NO_PREFETCH build macro */
#if defined(XXH_NO_PREFETCH)
# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
#else
# if defined(_MSC_VER) && \
     (defined(_M_X64) || \
      defined(_M_IX86)) /* _mm_prefetch() is not defined outside of x86/x64 */
#  include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
#  define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
#  define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
# else
#  define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
# endif
#endif /* XXH_NO_PREFETCH */


/* ==========================================
 * XXH3 default settings
 * ========================================== */

#define XXH_SECRET_DEFAULT_SIZE 192 /* must be >= XXH3p_SECRET_SIZE_MIN */

#if (XXH_SECRET_DEFAULT_SIZE < XXH3p_SECRET_SIZE_MIN)
# error "default keyset is not large enough"
#endif

XXH_ALIGN(64) static const xxh_u8 kSecret[XXH_SECRET_DEFAULT_SIZE] = {
    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,

    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
};

/*
 * GCC for x86 has a tendency to use SSE in this loop. While it
 * successfully avoids swapping (as MUL overwrites EAX and EDX), it
 * slows it down because instead of free register swap shifts, it
 * must use pshufd and punpckl/hd.
 *
 * To prevent this, we use this attribute to shut off SSE.
 */
#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
__attribute__((__target__("no-sse")))
#endif
static XXH128_hash_t
XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
{
    /*
     * GCC/Clang __uint128_t method.
     *
     * On most 64-bit targets, GCC and Clang define a __uint128_t type.
     * This is usually the best way as it usually uses a native long 64-bit
     * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
     *
     * Usually.
     *
     * On some 32-bit platforms, Clang (and Emscripten) define this type
     * despite not having the native arithmetic for it. This results in a
     * laggy compiler builtin call which calculates a full 128-bit multiply.
     * In that case it is best to use the portable one.
     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
     */
#if defined(__GNUC__) && !defined(__wasm__) \
    && defined(__SIZEOF_INT128__) \
    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)

    __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs;
    XXH128_hash_t const r128 = { (xxh_u64)(product), (xxh_u64)(product >> 64) };
    return r128;

    /*
     * MSVC for x64's _umul128 method.
     *
     * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
     *
     * This compiles to single operand MUL on x64.
     */
#elif defined(_M_X64) || defined(_M_IA64)

#ifndef _MSC_VER
#   pragma intrinsic(_umul128)
#endif
    xxh_u64 product_high;
    xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
    XXH128_hash_t const r128 = { product_low, product_high };
    return r128;

#else
    /*
     * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
     *
     * This is a fast and simple grade school multiply, which is shown
     * below with base 10 arithmetic instead of base 0x100000000.
     *
     *           9 3 // D2 lhs = 93
     *         x 7 5 // D2 rhs = 75
     *     ----------
     *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10)
     *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10)
     *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10)
     *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10)
     *     ---------
     *         2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21
     *     + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63
     *     ---------
     *       6 9 7 5
     *
     * The reasons for adding the products like this are:
     *  1. It avoids manual carry tracking. Just like how
     *     (9 * 9) + 9 + 9 = 99, the same applies with this for
     *     UINT64_MAX. This avoids a lot of complexity.
     *
     *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
     *     instruction available in ARMv6+ A32/T32, which is shown below:
     *
     *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
     *         {
     *             xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
     *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
     *             *RdHi = (xxh_u32)(product >> 32);
     *         }
     *
     *     This instruction was designed for efficient long multiplication,
     *     and allows this to be calculated in only 4 instructions which
     *     is comparable to some 64-bit ALUs.
     *
     *  3. It isn't terrible on other platforms. Usually this will be
     *     a couple of 32-bit ADD/ADCs.
     */

    /* First calculate all of the cross products. */
    xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
    xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
    xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
    xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);

    /* Now add the products together. These will never overflow. */
    xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
    xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
    xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);

    XXH128_hash_t r128 = { lower, upper };
    return r128;
#endif
}

/*
 * We want to keep the attribute here because a target switch
 * disables inlining.
 *
 * Does a 64-bit to 128-bit multiply, then XOR folds it.
 * The reason for the separate function is to prevent passing
 * too many structs around by value. This will hopefully inline
 * the multiply, but we don't force it.
 */
#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
__attribute__((__target__("no-sse")))
#endif
static xxh_u64
XXH3p_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
{
    XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
    return product.low64 ^ product.high64;
}


static XXH64_hash_t XXH3p_avalanche(xxh_u64 h64)
{
    h64 ^= h64 >> 37;
    h64 *= PRIME64_3;
    h64 ^= h64 >> 32;
    return h64;
}


/* ==========================================
 * Short keys
 * ========================================== */

XXH_FORCE_INLINE XXH64_hash_t
XXH3p_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(1 <= len && len <= 3);
    XXH_ASSERT(secret != NULL);
    {   xxh_u8 const c1 = input[0];
        xxh_u8 const c2 = input[len >> 1];
        xxh_u8 const c3 = input[len - 1];
        xxh_u32 const combined = ((xxh_u32)c1) | (((xxh_u32)c2) << 8) | (((xxh_u32)c3) << 16) | (((xxh_u32)len) << 24);
        xxh_u64 const keyed = (xxh_u64)combined ^ (XXH_readLE32(secret) + seed);
        xxh_u64 const mixed = keyed * PRIME64_1;
        return XXH3p_avalanche(mixed);
    }
}

XXH_FORCE_INLINE XXH64_hash_t
XXH3p_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(4 <= len && len <= 8);
    {   xxh_u32 const input_lo = XXH_readLE32(input);
        xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
        xxh_u64 const input_64 = input_lo | ((xxh_u64)input_hi << 32);
        xxh_u64 const keyed = input_64 ^ (XXH_readLE64(secret) + seed);
        xxh_u64 const mix64 = len + ((keyed ^ (keyed >> 51)) * PRIME32_1);
        return XXH3p_avalanche((mix64 ^ (mix64 >> 47)) * PRIME64_2);
    }
}

XXH_FORCE_INLINE XXH64_hash_t
XXH3p_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(9 <= len && len <= 16);
    {   xxh_u64 const input_lo = XXH_readLE64(input) ^ (XXH_readLE64(secret) + seed);
        xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ (XXH_readLE64(secret + 8) - seed);
        xxh_u64 const acc = len + (input_lo + input_hi) + XXH3p_mul128_fold64(input_lo, input_hi);
        return XXH3p_avalanche(acc);
    }
}

XXH_FORCE_INLINE XXH64_hash_t
XXH3p_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(len <= 16);
    {   if (len > 8) return XXH3p_len_9to16_64b(input, len, secret, seed);
        if (len >= 4) return XXH3p_len_4to8_64b(input, len, secret, seed);
        if (len) return XXH3p_len_1to3_64b(input, len, secret, seed);
        /*
         * RocksDB modification from XXH3 preview: zero result for empty
         * string can be problematic for multiplication-based algorithms.
         * Return a hash of the seed instead.
         */
        return XXH3p_mul128_fold64(seed + XXH_readLE64(secret), PRIME64_2);
    }
}


/* === Long Keys === */

#define STRIPE_LEN 64
#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */
#define ACC_NB (STRIPE_LEN / sizeof(xxh_u64))
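
/*
 * Sizing note (illustration only; the numbers follow from the constants
 * above): each stripe covers 64 input bytes, processed as
 * ACC_NB = STRIPE_LEN / sizeof(xxh_u64) = 8 lanes of 64-bit accumulators,
 * while the secret pointer advances by only XXH_SECRET_CONSUME_RATE = 8
 * bytes per stripe, so consecutive stripes see overlapping secret windows.
 */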

typedef enum { XXH3p_acc_64bits, XXH3p_acc_128bits } XXH3p_accWidth_e;

XXH_FORCE_INLINE void
XXH3p_accumulate_512( void* XXH_RESTRICT acc,
                      const void* XXH_RESTRICT input,
                      const void* XXH_RESTRICT secret,
                      XXH3p_accWidth_e accWidth)
{
#if (XXH_VECTOR == XXH_AVX2)

    XXH_ASSERT((((size_t)acc) & 31) == 0);
    {   XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc;
        const __m256i* const xinput = (const __m256i *) input;   /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */
        const __m256i* const xsecret = (const __m256i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
            __m256i const data_vec = _mm256_loadu_si256 (xinput+i);
            __m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
            __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);  /* uint32 dk[8] = {d0^k0, d1^k1, d2^k2, d3^k3, ...} */
            __m256i const product = _mm256_mul_epu32 (data_key, _mm256_shuffle_epi32 (data_key, 0x31)); /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */
            if (accWidth == XXH3p_acc_128bits) {
                __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
                __m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
                xacc[i] = _mm256_add_epi64(product, sum);
            } else { /* XXH3p_acc_64bits */
                __m256i const sum = _mm256_add_epi64(xacc[i], data_vec);
                xacc[i] = _mm256_add_epi64(product, sum);
            }
    }   }

#elif (XXH_VECTOR == XXH_SSE2)

    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {   XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc;
        const __m128i* const xinput = (const __m128i *) input;   /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */
        const __m128i* const xsecret = (const __m128i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
            __m128i const data_vec = _mm_loadu_si128 (xinput+i);
            __m128i const key_vec = _mm_loadu_si128 (xsecret+i);
            __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);  /* uint32 dk[4] = {d0^k0, d1^k1, d2^k2, d3^k3} */
            __m128i const product = _mm_mul_epu32 (data_key, _mm_shuffle_epi32 (data_key, 0x31)); /* uint64 mul[2] = {dk0*dk1, dk2*dk3} */
            if (accWidth == XXH3p_acc_128bits) {
                __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
                __m128i const sum = _mm_add_epi64(xacc[i], data_swap);
                xacc[i] = _mm_add_epi64(product, sum);
            } else { /* XXH3p_acc_64bits */
                __m128i const sum = _mm_add_epi64(xacc[i], data_vec);
                xacc[i] = _mm_add_epi64(product, sum);
            }
    }   }

#elif (XXH_VECTOR == XXH_NEON)

    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {
        XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
        /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
        uint8_t const* const xinput = (const uint8_t *) input;
        uint8_t const* const xsecret = (const uint8_t *) secret;

        size_t i;
        for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
#if !defined(__aarch64__) && !defined(__arm64__) && defined(__GNUC__) /* ARM32-specific hack */
            /* vzip on ARMv7 Clang generates a lot of vmovs (technically vorrs) without this.
             * vzip on 32-bit ARM NEON will overwrite the original register, and I think that Clang
             * assumes I don't want to destroy it and tries to make a copy. This slows down the code
             * a lot.
             * aarch64 not only uses an entirely different syntax, but it requires three
             * instructions...
             *     ext     v1.16B, v0.16B, #8  // select high bits because aarch64 can't address them directly
             *     zip1    v3.2s, v0.2s, v1.2s // first zip
             *     zip2    v2.2s, v0.2s, v1.2s // second zip
             * ...to do what ARM does in one:
             *     vzip.32 d0, d1              // Interleave high and low bits and overwrite. */

            /* data_vec = xinput[i]; */
            uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16));
            /* key_vec = xsecret[i]; */
            uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16));
            /* data_key = data_vec ^ key_vec; */
            uint32x4_t data_key;

            if (accWidth == XXH3p_acc_64bits) {
                /* Add first to prevent register swaps */
                /* xacc[i] += data_vec; */
                xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec));
            } else { /* XXH3p_acc_128bits */
                /* xacc[i] += swap(data_vec); */
                /* can probably be optimized better */
                uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
                uint64x2_t const swapped = vextq_u64(data64, data64, 1);
                xacc[i] = vaddq_u64 (xacc[i], swapped);
            }

            data_key = vreinterpretq_u32_u8(veorq_u8(data_vec, key_vec));

            /* Here's the magic. We use the quirkiness of vzip to shuffle data_key in place.
             * shuffle: data_key[0, 1, 2, 3] = data_key[0, 2, 1, 3] */
            __asm__("vzip.32 %e0, %f0" : "+w" (data_key));
            /* xacc[i] += (uint64x2_t) data_key[0, 1] * (uint64x2_t) data_key[2, 3]; */
            xacc[i] = vmlal_u32(xacc[i], vget_low_u32(data_key), vget_high_u32(data_key));

#else
            /* On aarch64, vshrn/vmovn seems to be equivalent to, if not faster than, the vzip method. */

            /* data_vec = xinput[i]; */
            uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16));
            /* key_vec = xsecret[i]; */
            uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16));
            /* data_key = data_vec ^ key_vec; */
            uint64x2_t const data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
            /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); */
            uint32x2_t const data_key_lo = vmovn_u64 (data_key);
            /* data_key_hi = (uint32x2_t) (data_key >> 32); */
            uint32x2_t const data_key_hi = vshrn_n_u64 (data_key, 32);
            if (accWidth == XXH3p_acc_64bits) {
                /* xacc[i] += data_vec; */
                xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec));
            } else { /* XXH3p_acc_128bits */
                /* xacc[i] += swap(data_vec); */
                uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
                uint64x2_t const swapped = vextq_u64(data64, data64, 1);
                xacc[i] = vaddq_u64 (xacc[i], swapped);
            }
            /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
            xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);

#endif
        }
    }

#elif (XXH_VECTOR == XXH_VSX) && /* work around a compiler bug */ (__GNUC__ > 5)
    U64x2* const xacc = (U64x2*) acc;                   /* presumed aligned */
    U64x2 const* const xinput = (U64x2 const*) input;   /* no alignment restriction */
    U64x2 const* const xsecret = (U64x2 const*) secret; /* no alignment restriction */
    U64x2 const v32 = { 32, 32 };
#if XXH_VSX_BE
    U8x16 const vXorSwap = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
                             0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
#endif
    size_t i;
    for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
        /* data_vec = xinput[i]; */
        /* key_vec = xsecret[i]; */
#if XXH_VSX_BE
        /* byteswap */
        U64x2 const data_vec = XXH_vec_revb(vec_vsx_ld(0, xinput + i));
        U64x2 const key_raw = vec_vsx_ld(0, xsecret + i);
        /* See comment above. data_key = data_vec ^ swap(xsecret[i]); */
        U64x2 const data_key = (U64x2)XXH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap);
#else
        U64x2 const data_vec = vec_vsx_ld(0, xinput + i);
        U64x2 const key_vec = vec_vsx_ld(0, xsecret + i);
        U64x2 const data_key = data_vec ^ key_vec;
#endif
        /* shuffled = (data_key << 32) | (data_key >> 32); */
        U32x4 const shuffled = (U32x4)vec_rl(data_key, v32);
        /* product = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)shuffled & 0xFFFFFFFF); */
        U64x2 const product = XXH_vec_mulo((U32x4)data_key, shuffled);
        xacc[i] += product;

        if (accWidth == XXH3p_acc_64bits) {
            xacc[i] += data_vec;
        } else { /* XXH3p_acc_128bits */
            /* swap high and low halves */
            U64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2);
            xacc[i] += data_swapped;
        }
    }

#else /* scalar variant of Accumulator - universal */

    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned on 32-byte boundaries, little hint for the auto-vectorizer */
    const xxh_u8* const xinput = (const xxh_u8*) input;   /* no alignment restriction */
    const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */
    size_t i;
    XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
    for (i=0; i < ACC_NB; i++) {
        xxh_u64 const data_val = XXH_readLE64(xinput + 8*i);
        xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8);

        if (accWidth == XXH3p_acc_64bits) {
            xacc[i] += data_val;
        } else {
            xacc[i ^ 1] += data_val; /* swap adjacent lanes */
        }
        xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
    }
#endif
}

XXH_FORCE_INLINE void
XXH3p_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
#if (XXH_VECTOR == XXH_AVX2)

    XXH_ASSERT((((size_t)acc) & 31) == 0);
    {   XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
        const __m256i* const xsecret = (const __m256i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this argument type */
        const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1);

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
            /* xacc[i] ^= (xacc[i] >> 47) */
            __m256i const acc_vec = xacc[i];
            __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47);
            __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted);
            /* xacc[i] ^= xsecret; */
            __m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
            __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);

            /* xacc[i] *= PRIME32_1; */
            __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, 0x31);
            __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32);
            __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32);
            xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
        }
    }

#elif (XXH_VECTOR == XXH_SSE2)

    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {   XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
        const __m128i* const xsecret = (const __m128i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this argument type */
        const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1);

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
            /* xacc[i] ^= (xacc[i] >> 47) */
            __m128i const acc_vec = xacc[i];
            __m128i const shifted = _mm_srli_epi64 (acc_vec, 47);
            __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted);
            /* xacc[i] ^= xsecret; */
            __m128i const key_vec = _mm_loadu_si128 (xsecret+i);
            __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);

            /* xacc[i] *= PRIME32_1; */
            __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, 0x31);
            __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32);
            __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32);
            xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
        }
    }

#elif (XXH_VECTOR == XXH_NEON)

    XXH_ASSERT((((size_t)acc) & 15) == 0);

    {   uint64x2_t* const xacc = (uint64x2_t*) acc;
        uint8_t const* const xsecret = (uint8_t const*) secret;
        uint32x2_t const prime = vdup_n_u32 (PRIME32_1);

        size_t i;
        for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) {
            /* data_vec = xacc[i] ^ (xacc[i] >> 47); */
            uint64x2_t const acc_vec = xacc[i];
            uint64x2_t const shifted = vshrq_n_u64 (acc_vec, 47);
            uint64x2_t const data_vec = veorq_u64 (acc_vec, shifted);

            /* key_vec = xsecret[i]; */
            uint32x4_t const key_vec = vreinterpretq_u32_u8(vld1q_u8(xsecret + (i * 16)));
            /* data_key = data_vec ^ key_vec; */
            uint32x4_t const data_key = veorq_u32 (vreinterpretq_u32_u64(data_vec), key_vec);
            /* shuffled = { data_key[0, 2], data_key[1, 3] }; */
            uint32x2x2_t const shuffled = vzip_u32 (vget_low_u32(data_key), vget_high_u32(data_key));

            /* data_key *= PRIME32_1 */

            /* prod_hi = (data_key >> 32) * PRIME32_1; */
            uint64x2_t const prod_hi = vmull_u32 (shuffled.val[1], prime);
            /* xacc[i] = prod_hi << 32; */
            xacc[i] = vshlq_n_u64(prod_hi, 32);
            /* xacc[i] += (data_key & 0xFFFFFFFF) * PRIME32_1; */
            xacc[i] = vmlal_u32(xacc[i], shuffled.val[0], prime);
    }   }

#elif (XXH_VECTOR == XXH_VSX) && /* work around a compiler bug */ (__GNUC__ > 5)

    U64x2* const xacc = (U64x2*) acc;
    const U64x2* const xsecret = (const U64x2*) secret;
    /* constants */
    U64x2 const v32 = { 32, 32 };
    U64x2 const v47 = { 47, 47 };
    U32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 };
    size_t i;
#if XXH_VSX_BE
    /* endian swap */
    U8x16 const vXorSwap = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
                             0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
#endif
    for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
        U64x2 const acc_vec = xacc[i];
        U64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
        /* key_vec = xsecret[i]; */
#if XXH_VSX_BE
        /* byteswap the key (see comment above) */
        U64x2 const key_raw = vec_vsx_ld(0, xsecret + i);
        U64x2 const data_key = (U64x2)XXH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap);
#else
        U64x2 const key_vec = vec_vsx_ld(0, xsecret + i);
        U64x2 const data_key = data_vec ^ key_vec;
#endif

        /* data_key *= PRIME32_1 */

        /* prod_lo = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)prime & 0xFFFFFFFF); */
        U64x2 const prod_even = XXH_vec_mule((U32x4)data_key, prime);
        /* prod_hi = ((U64x2)data_key >> 32) * ((U64x2)prime >> 32); */
        U64x2 const prod_odd = XXH_vec_mulo((U32x4)data_key, prime);
        xacc[i] = prod_odd + (prod_even << v32);
    }

#else /* scalar variant of Scrambler - universal */

    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned on 32-byte boundaries, little hint for the auto-vectorizer */
    const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */
    size_t i;
    XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
    for (i=0; i < ACC_NB; i++) {
        xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i);
        xxh_u64 acc64 = xacc[i];
        acc64 ^= acc64 >> 47;
        acc64 ^= key64;
        acc64 *= PRIME32_1;
        xacc[i] = acc64;
    }

#endif
}

#define XXH_PREFETCH_DIST 384
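
/*
 * Illustrative note (derived from the constants above): with STRIPE_LEN == 64,
 * a prefetch distance of 384 bytes means the accumulate loop below prefetches
 * input 384 / 64 = 6 stripes ahead of the stripe it is currently processing.
 */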

/* assumption : nbStripes will not overflow secret size */
XXH_FORCE_INLINE void
XXH3p_accumulate( xxh_u64* XXH_RESTRICT acc,
                  const xxh_u8* XXH_RESTRICT input,
                  const xxh_u8* XXH_RESTRICT secret,
                  size_t nbStripes,
                  XXH3p_accWidth_e accWidth)
{
    size_t n;
    for (n = 0; n < nbStripes; n++ ) {
        const xxh_u8* const in = input + n*STRIPE_LEN;
        XXH_PREFETCH(in + XXH_PREFETCH_DIST);
        XXH3p_accumulate_512(acc,
                             in,
                             secret + n*XXH_SECRET_CONSUME_RATE,
                             accWidth);
    }
}

/* note : clang auto-vectorizes well in SSE2 mode _if_ this function is `static`,
 * and doesn't auto-vectorize it at all if it is `FORCE_INLINE`.
 * However, it auto-vectorizes AVX2 better if it is `FORCE_INLINE`.
 * Pretty much every other mode and compiler prefers `FORCE_INLINE`.
 */

#if defined(__clang__) && (XXH_VECTOR==0) && !defined(__AVX2__) && !defined(__arm__) && !defined(__thumb__)
static void
#else
XXH_FORCE_INLINE void
#endif
XXH3p_hashLong_internal_loop( xxh_u64* XXH_RESTRICT acc,
                              const xxh_u8* XXH_RESTRICT input, size_t len,
                              const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                              XXH3p_accWidth_e accWidth)
{
    size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
    size_t const block_len = STRIPE_LEN * nb_rounds;
    size_t const nb_blocks = len / block_len;

    size_t n;

    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);

    for (n = 0; n < nb_blocks; n++) {
        XXH3p_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth);
        XXH3p_scrambleAcc(acc, secret + secretSize - STRIPE_LEN);
    }

    /* last partial block */
    XXH_ASSERT(len > STRIPE_LEN);
    {   size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN;
        XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
        XXH3p_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth);

        /* last stripe */
        if (len & (STRIPE_LEN - 1)) {
            const xxh_u8* const p = input + len - STRIPE_LEN;
#define XXH_SECRET_LASTACC_START 7 /* do not align on 8, so that secret is different from scrambler */
            XXH3p_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth);
    }   }
}
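
/*
 * Worked example (illustration only; the numbers follow from the defaults
 * above): with the default 192-byte secret, nb_rounds = (192 - 64) / 8 = 16
 * stripes per block, so block_len = 64 * 16 = 1024 bytes. A 10000-byte input
 * is thus processed as 9 full blocks (each followed by a scramble), then
 * 12 full stripes of the partial block, then one final stripe re-reading the
 * last 64 bytes of input against a shifted region of the secret.
 */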

XXH_FORCE_INLINE xxh_u64
XXH3p_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
{
    return XXH3p_mul128_fold64(
               acc[0] ^ XXH_readLE64(secret),
               acc[1] ^ XXH_readLE64(secret+8) );
}

static XXH64_hash_t
XXH3p_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
{
    xxh_u64 result64 = start;

    result64 += XXH3p_mix2Accs(acc+0, secret + 0);
    result64 += XXH3p_mix2Accs(acc+2, secret + 16);
    result64 += XXH3p_mix2Accs(acc+4, secret + 32);
    result64 += XXH3p_mix2Accs(acc+6, secret + 48);

    return XXH3p_avalanche(result64);
}

#define XXH3p_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \
                         PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 }

XXH_FORCE_INLINE XXH64_hash_t
XXH3p_hashLong_internal(const xxh_u8* XXH_RESTRICT input, size_t len,
                        const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
{
    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3p_INIT_ACC;

    XXH3p_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3p_acc_64bits);

    /* converge into final hash */
    XXH_STATIC_ASSERT(sizeof(acc) == 64);
#define XXH_SECRET_MERGEACCS_START 11 /* do not align on 8, so that secret is different from accumulator */
    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
    return XXH3p_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1);
}


XXH_NO_INLINE XXH64_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe?), but the difference is large and easily measurable */
XXH3p_hashLong_64b_defaultSecret(const xxh_u8* XXH_RESTRICT input, size_t len)
{
    return XXH3p_hashLong_internal(input, len, kSecret, sizeof(kSecret));
}

XXH_NO_INLINE XXH64_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe?), but the difference is large and easily measurable */
XXH3p_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len,
                              const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
{
    return XXH3p_hashLong_internal(input, len, secret, secretSize);
}


XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
{
    if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
    memcpy(dst, &v64, sizeof(v64));
}

/* XXH3p_initCustomSecret() :
 * destination `customSecret` is presumed allocated and the same size as `kSecret`.
 */
XXH_FORCE_INLINE void XXH3p_initCustomSecret(xxh_u8* customSecret, xxh_u64 seed64)
{
    int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
    int i;

    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);

    for (i=0; i < nbRounds; i++) {
        XXH_writeLE64(customSecret + 16*i, XXH_readLE64(kSecret + 16*i) + seed64);
        XXH_writeLE64(customSecret + 16*i + 8, XXH_readLE64(kSecret + 16*i + 8) - seed64);
    }
}


/* XXH3p_hashLong_64b_withSeed() :
 * Generate a custom key based on an alteration of the default kSecret with the seed,
 * and then use this key for long mode hashing.
 * This operation is decently fast but nonetheless costs a little bit of time.
 * Try to avoid it whenever possible (typically when seed==0).
 */
XXH_NO_INLINE XXH64_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe?), but the difference is large and easily measurable */
XXH3p_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
{
    XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
    if (seed==0) return XXH3p_hashLong_64b_defaultSecret(input, len);
    XXH3p_initCustomSecret(secret, seed);
    return XXH3p_hashLong_internal(input, len, secret, sizeof(secret));
}


XXH_FORCE_INLINE xxh_u64 XXH3p_mix16B(const xxh_u8* XXH_RESTRICT input,
                                      const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
{
    xxh_u64 const input_lo = XXH_readLE64(input);
    xxh_u64 const input_hi = XXH_readLE64(input+8);
    return XXH3p_mul128_fold64(
               input_lo ^ (XXH_readLE64(secret) + seed64),
               input_hi ^ (XXH_readLE64(secret+8) - seed64) );
}


XXH_FORCE_INLINE XXH64_hash_t
XXH3p_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                      XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(16 < len && len <= 128);

    {   xxh_u64 acc = len * PRIME64_1;
        if (len > 32) {
            if (len > 64) {
                if (len > 96) {
                    acc += XXH3p_mix16B(input+48, secret+96, seed);
                    acc += XXH3p_mix16B(input+len-64, secret+112, seed);
                }
                acc += XXH3p_mix16B(input+32, secret+64, seed);
                acc += XXH3p_mix16B(input+len-48, secret+80, seed);
            }
            acc += XXH3p_mix16B(input+16, secret+32, seed);
            acc += XXH3p_mix16B(input+len-32, secret+48, seed);
        }
        acc += XXH3p_mix16B(input+0, secret+0, seed);
        acc += XXH3p_mix16B(input+len-16, secret+16, seed);

        return XXH3p_avalanche(acc);
    }
}

#define XXH3p_MIDSIZE_MAX 240

XXH_NO_INLINE XXH64_hash_t
XXH3p_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                       XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(128 < len && len <= XXH3p_MIDSIZE_MAX);

#define XXH3p_MIDSIZE_STARTOFFSET 3
#define XXH3p_MIDSIZE_LASTOFFSET 17

    {   xxh_u64 acc = len * PRIME64_1;
        int const nbRounds = (int)len / 16;
        int i;
        for (i=0; i<8; i++) {
            acc += XXH3p_mix16B(input+(16*i), secret+(16*i), seed);
        }
        acc = XXH3p_avalanche(acc);
        XXH_ASSERT(nbRounds >= 8);
        for (i=8 ; i < nbRounds; i++) {
            acc += XXH3p_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3p_MIDSIZE_STARTOFFSET, seed);
        }
        /* last bytes */
        acc += XXH3p_mix16B(input + len - 16, secret + XXH3p_SECRET_SIZE_MIN - XXH3p_MIDSIZE_LASTOFFSET, seed);
        return XXH3p_avalanche(acc);
    }
}

/* === Public entry point === */

XXH_PUBLIC_API XXH64_hash_t XXH3p_64bits(const void* input, size_t len)
{
    if (len <= 16) return XXH3p_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0);
    if (len <= 128) return XXH3p_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
    if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
    return XXH3p_hashLong_64b_defaultSecret((const xxh_u8*)input, len);
}

XXH_PUBLIC_API XXH64_hash_t
XXH3p_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
{
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);
    /* If an action must be taken should `secret` conditions not be respected,
     * it should be done here.
     * For now, it's a contract pre-condition.
     * Adding a check and a branch here would cost performance at every hash. */
    if (len <= 16) return XXH3p_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
    if (len <= 128) return XXH3p_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
    if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
    return XXH3p_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
}

XXH_PUBLIC_API XXH64_hash_t
XXH3p_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
{
    if (len <= 16) return XXH3p_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed);
    if (len <= 128) return XXH3p_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
    if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
    return XXH3p_hashLong_64b_withSeed((const xxh_u8*)input, len, seed);
}
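
/*
 * Usage sketch (illustrative only, not part of this header; the include path
 * is hypothetical and depends on where this file lives in your tree):
 *
 *     #include "util/xxhash.h"   // hypothetical header exposing XXH3p_64bits
 *     #include <stdio.h>
 *     #include <string.h>
 *
 *     int main(void) {
 *         const char* msg = "hello world";
 *         XXH64_hash_t h0 = XXH3p_64bits(msg, strlen(msg));            // seed 0 fast path
 *         XXH64_hash_t h1 = XXH3p_64bits_withSeed(msg, strlen(msg), 42);
 *         printf("%016llx %016llx\n",
 *                (unsigned long long)h0, (unsigned long long)h1);
 *         return 0;
 *     }
 */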

/* === XXH3 streaming === */

/* RocksDB Note: the streaming API is unused here and was removed due to a bug
 * in this preview version. */

1105 | /* ========================================== | |
1106 | * XXH3 128 bits (=> XXH128) | |
1107 | * ========================================== */ | |
1108 | ||
1109 | XXH_FORCE_INLINE XXH128_hash_t | |
1110 | XXH3p_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) | |
1111 | { | |
1112 | XXH_ASSERT(input != NULL); | |
1113 | XXH_ASSERT(1 <= len && len <= 3); | |
1114 | XXH_ASSERT(secret != NULL); | |
1115 | { xxh_u8 const c1 = input[0]; | |
1116 | xxh_u8 const c2 = input[len >> 1]; | |
1117 | xxh_u8 const c3 = input[len - 1]; | |
1118 | xxh_u32 const combinedl = ((xxh_u32)c1) + (((xxh_u32)c2) << 8) + (((xxh_u32)c3) << 16) + (((xxh_u32)len) << 24); | |
1119 | xxh_u32 const combinedh = XXH_swap32(combinedl); | |
1120 | xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ (XXH_readLE32(secret) + seed); | |
1121 | xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ (XXH_readLE32(secret+4) - seed); | |
1122 | xxh_u64 const mixedl = keyed_lo * PRIME64_1; | |
1123 | xxh_u64 const mixedh = keyed_hi * PRIME64_5; | |
1124 | XXH128_hash_t const h128 = { XXH3p_avalanche(mixedl) /*low64*/, XXH3p_avalanche(mixedh) /*high64*/ }; | |
1125 | return h128; | |
1126 | } | |
1127 | } | |
1128 | ||
1129 | ||
1130 | XXH_FORCE_INLINE XXH128_hash_t | |
1131 | XXH3p_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) | |
1132 | { | |
1133 | XXH_ASSERT(input != NULL); | |
1134 | XXH_ASSERT(secret != NULL); | |
1135 | XXH_ASSERT(4 <= len && len <= 8); | |
1136 | { xxh_u32 const input_lo = XXH_readLE32(input); | |
1137 | xxh_u32 const input_hi = XXH_readLE32(input + len - 4); | |
1138 | xxh_u64 const input_64_lo = input_lo + ((xxh_u64)input_hi << 32); | |
1139 | xxh_u64 const input_64_hi = XXH_swap64(input_64_lo); | |
1140 | xxh_u64 const keyed_lo = input_64_lo ^ (XXH_readLE64(secret) + seed); | |
1141 | xxh_u64 const keyed_hi = input_64_hi ^ (XXH_readLE64(secret + 8) - seed); | |
1142 | xxh_u64 const mix64l1 = len + ((keyed_lo ^ (keyed_lo >> 51)) * PRIME32_1); | |
1143 | xxh_u64 const mix64l2 = (mix64l1 ^ (mix64l1 >> 47)) * PRIME64_2; | |
1144 | xxh_u64 const mix64h1 = ((keyed_hi ^ (keyed_hi >> 47)) * PRIME64_1) - len; | |
1145 | xxh_u64 const mix64h2 = (mix64h1 ^ (mix64h1 >> 43)) * PRIME64_4; | |
1146 | { XXH128_hash_t const h128 = { XXH3p_avalanche(mix64l2) /*low64*/, XXH3p_avalanche(mix64h2) /*high64*/ }; | |
1147 | return h128; | |
1148 | } } | |
1149 | } | |
1150 | ||
1151 | XXH_FORCE_INLINE XXH128_hash_t | |
1152 | XXH3p_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) | |
1153 | { | |
1154 | XXH_ASSERT(input != NULL); | |
1155 | XXH_ASSERT(secret != NULL); | |
1156 | XXH_ASSERT(9 <= len && len <= 16); | |
1157 | { xxh_u64 const input_lo = XXH_readLE64(input) ^ (XXH_readLE64(secret) + seed); | |
1158 | xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ (XXH_readLE64(secret+8) - seed); | |
1159 | XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi, PRIME64_1); | |
1160 | xxh_u64 const lenContrib = XXH_mult32to64(len, PRIME32_5); | |
1161 | m128.low64 += lenContrib; | |
1162 | m128.high64 += input_hi * PRIME64_1; | |
1163 | m128.low64 ^= (m128.high64 >> 32); | |
1164 | { XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2); | |
1165 | h128.high64 += m128.high64 * PRIME64_2; | |
1166 | h128.low64 = XXH3p_avalanche(h128.low64); | |
1167 | h128.high64 = XXH3p_avalanche(h128.high64); | |
1168 | return h128; | |
1169 | } } | |
1170 | } | |
1171 | ||
1172 | /* Assumption : `secret` size is >= 16 | |
1173 | * Note : it should be >= XXH3p_SECRET_SIZE_MIN anyway */ | |
1174 | XXH_FORCE_INLINE XXH128_hash_t | |
1175 | XXH3p_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) | |
1176 | { | |
1177 | XXH_ASSERT(len <= 16); | |
1178 | { if (len > 8) return XXH3p_len_9to16_128b(input, len, secret, seed); | |
1179 | if (len >= 4) return XXH3p_len_4to8_128b(input, len, secret, seed); | |
1180 | if (len) return XXH3p_len_1to3_128b(input, len, secret, seed); | |
1181 | { XXH128_hash_t const h128 = { 0, 0 }; | |
1182 | return h128; | |
1183 | } } | |
1184 | } | |
1185 | ||
1186 | XXH_FORCE_INLINE XXH128_hash_t | |
1187 | XXH3p_hashLong_128b_internal(const xxh_u8* XXH_RESTRICT input, size_t len, | |
1188 | const xxh_u8* XXH_RESTRICT secret, size_t secretSize) | |
1189 | { | |
1190 | XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3p_INIT_ACC; | |
1191 | ||
1192 | XXH3p_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3p_acc_128bits); | |
1193 | ||
1194 | /* converge into final hash */ | |
1195 | XXH_STATIC_ASSERT(sizeof(acc) == 64); | |
1196 | XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); | |
1197 | { xxh_u64 const low64 = XXH3p_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1); | |
1198 | xxh_u64 const high64 = XXH3p_mergeAccs(acc, secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((xxh_u64)len * PRIME64_2)); | |
1199 | XXH128_hash_t const h128 = { low64, high64 }; | |
1200 | return h128; | |
1201 | } | |
1202 | } | |

/* It's important for performance that XXH3p_hashLong is not inlined.
 * Not sure why (uop cache maybe?), but the difference is large and easily measurable. */
XXH_NO_INLINE XXH128_hash_t
XXH3p_hashLong_128b_defaultSecret(const xxh_u8* input, size_t len)
{
    return XXH3p_hashLong_128b_internal(input, len, kSecret, sizeof(kSecret));
}

XXH_NO_INLINE XXH128_hash_t  /* see note above : XXH3p_hashLong must not be inlined */
XXH3p_hashLong_128b_withSecret(const xxh_u8* input, size_t len,
                               const xxh_u8* secret, size_t secretSize)
{
    return XXH3p_hashLong_128b_internal(input, len, secret, secretSize);
}

XXH_NO_INLINE XXH128_hash_t  /* see note above : XXH3p_hashLong must not be inlined */
XXH3p_hashLong_128b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
{
    XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
    if (seed == 0) return XXH3p_hashLong_128b_defaultSecret(input, len);
    XXH3p_initCustomSecret(secret, seed);
    return XXH3p_hashLong_128b_internal(input, len, secret, sizeof(secret));
}


XXH_FORCE_INLINE XXH128_hash_t
XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, const xxh_u8* secret, XXH64_hash_t seed)
{
    acc.low64  += XXH3p_mix16B (input_1, secret+0, seed);
    acc.low64  ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
    acc.high64 += XXH3p_mix16B (input_2, secret+16, seed);
    acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
    return acc;
}

XXH_NO_INLINE XXH128_hash_t
XXH3p_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                        const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                        XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(128 < len && len <= XXH3p_MIDSIZE_MAX);

    {   XXH128_hash_t acc;
        int const nbRounds = (int)len / 32;
        int i;
        acc.low64 = len * PRIME64_1;
        acc.high64 = 0;
        for (i=0; i<4; i++) {
            acc = XXH128_mix32B(acc, input+(32*i), input+(32*i)+16, secret+(32*i), seed);
        }
        acc.low64 = XXH3p_avalanche(acc.low64);
        acc.high64 = XXH3p_avalanche(acc.high64);
        XXH_ASSERT(nbRounds >= 4);
        for (i=4 ; i < nbRounds; i++) {
            acc = XXH128_mix32B(acc, input+(32*i), input+(32*i)+16, secret+XXH3p_MIDSIZE_STARTOFFSET+(32*(i-4)), seed);
        }
        /* last bytes */
        acc = XXH128_mix32B(acc, input + len - 16, input + len - 32, secret + XXH3p_SECRET_SIZE_MIN - XXH3p_MIDSIZE_LASTOFFSET - 16, 0ULL - seed);

        {   xxh_u64 const low64 = acc.low64 + acc.high64;
            xxh_u64 const high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + ((len - seed) * PRIME64_2);
            XXH128_hash_t const h128 = { XXH3p_avalanche(low64), (XXH64_hash_t)0 - XXH3p_avalanche(high64) };
            return h128;
        }
    }
}


XXH_FORCE_INLINE XXH128_hash_t
XXH3p_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                       XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(16 < len && len <= 128);

    {   XXH128_hash_t acc;
        acc.low64 = len * PRIME64_1;
        acc.high64 = 0;
        if (len > 32) {
            if (len > 64) {
                if (len > 96) {
                    acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
                }
                acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
            }
            acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
        }
        acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
        {   xxh_u64 const low64 = acc.low64 + acc.high64;
            xxh_u64 const high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + ((len - seed) * PRIME64_2);
            XXH128_hash_t const h128 = { XXH3p_avalanche(low64), (XXH64_hash_t)0 - XXH3p_avalanche(high64) };
            return h128;
        }
    }
}

XXH_PUBLIC_API XXH128_hash_t XXH3p_128bits(const void* input, size_t len)
{
    if (len <= 16) return XXH3p_len_0to16_128b((const xxh_u8*)input, len, kSecret, 0);
    if (len <= 128) return XXH3p_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
    if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
    return XXH3p_hashLong_128b_defaultSecret((const xxh_u8*)input, len);
}
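
/* Illustrative usage sketch (not part of the library) : one-shot, unseeded
 * 128-bit hashing of an in-memory buffer with the default secret. The buffer
 * contents and the include path are assumptions for the example only.
 *
 *     #include <stdio.h>
 *     #include "xxh3p.h"   // include path is an assumption
 *
 *     int main(void)
 *     {
 *         const char data[] = "example payload";
 *         XXH128_hash_t const h = XXH3p_128bits(data, sizeof(data) - 1);
 *         printf("%016llx%016llx\n",
 *                (unsigned long long)h.high64, (unsigned long long)h.low64);
 *         return 0;
 *     }
 */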

XXH_PUBLIC_API XXH128_hash_t
XXH3p_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
{
    XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);
    /* If an action must be taken when the `secret` conditions are not respected,
     * it should be done here.
     * For now, it's a contract pre-condition.
     * Adding a check and a branch here would cost performance at every hash. */
    if (len <= 16) return XXH3p_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
    if (len <= 128) return XXH3p_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
    if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
    return XXH3p_hashLong_128b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
}
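
/* Illustrative usage sketch (not part of the library) : hashing with a
 * caller-provided secret. The secret bytes below are arbitrary placeholders;
 * a real application should supply high-entropy content of at least
 * XXH3p_SECRET_SIZE_MIN bytes, as the contract above requires.
 *
 *     #include <string.h>
 *
 *     const char data[] = "example payload";
 *     unsigned char customSecret[XXH3p_SECRET_SIZE_MIN];
 *     memset(customSecret, 0x5a, sizeof(customSecret));   // placeholder entropy
 *     XXH128_hash_t const h = XXH3p_128bits_withSecret(
 *         data, sizeof(data) - 1, customSecret, sizeof(customSecret));
 */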

XXH_PUBLIC_API XXH128_hash_t
XXH3p_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
{
    if (len <= 16) return XXH3p_len_0to16_128b((const xxh_u8*)input, len, kSecret, seed);
    if (len <= 128) return XXH3p_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
    if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
    return XXH3p_hashLong_128b_withSeed((const xxh_u8*)input, len, seed);
}

XXH_PUBLIC_API XXH128_hash_t
XXH128(const void* input, size_t len, XXH64_hash_t seed)
{
    return XXH3p_128bits_withSeed(input, len, seed);
}
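
/* Illustrative usage sketch (not part of the library) : XXH128() is the
 * seeded convenience entry point; passing seed 0 produces the same result
 * as the unseeded XXH3p_128bits(). The seed value below is arbitrary.
 *
 *     const char data[] = "example payload";
 *     XXH64_hash_t const seed = 0x9e3779b97f4a7c15ULL;   // arbitrary example seed
 *     XXH128_hash_t const h = XXH128(data, sizeof(data) - 1, seed);
 */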


/* === XXH3 128-bit streaming === */

/* RocksDB Note: unused & removed due to bug in preview version */

/* 128-bit utility functions */

#include <string.h>   /* memcmp */

/* return : 1 if equal, 0 if different */
XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
{
    /* note : XXH128_hash_t is compact, it has no padding byte */
    return !(memcmp(&h1, &h2, sizeof(h1)));
}

/* This prototype is compatible with stdlib's qsort().
 * return : >0 if *h128_1 > *h128_2
 *          <0 if *h128_1 < *h128_2
 *          =0 if *h128_1 == *h128_2 */
XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
{
    XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
    XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
    int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
    /* note : bets that, in most cases, hash values already differ in high64 */
    if (hcmp) return hcmp;
    return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
}
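
/* Illustrative usage sketch (not part of the library) : because XXH128_cmp
 * matches the qsort() comparator signature, an array of hashes can be
 * sorted directly. The array size and contents here are hypothetical.
 *
 *     #include <stdlib.h>
 *
 *     XXH128_hash_t hashes[256];
 *     // ... fill hashes[] ...
 *     qsort(hashes, 256, sizeof(hashes[0]), XXH128_cmp);
 */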


/*====== Canonical representation ======*/
XXH_PUBLIC_API void
XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
{
    XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
    if (XXH_CPU_LITTLE_ENDIAN) {
        hash.high64 = XXH_swap64(hash.high64);
        hash.low64  = XXH_swap64(hash.low64);
    }
    memcpy(dst, &hash.high64, sizeof(hash.high64));
    memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
}

XXH_PUBLIC_API XXH128_hash_t
XXH128_hashFromCanonical(const XXH128_canonical_t* src)
{
    XXH128_hash_t h;
    h.high64 = XXH_readBE64(src);
    h.low64  = XXH_readBE64(src->digest + 8);
    return h;
}
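
/* Illustrative usage sketch (not part of the library) : the canonical form
 * is a fixed big-endian byte layout, suitable for storage or transmission
 * across machines; a round-trip through it preserves the hash value exactly.
 *
 *     XXH128_hash_t const h = XXH128("abc", 3, 0);
 *     XXH128_canonical_t canon;
 *     XXH128_canonicalFromHash(&canon, h);        // serialize (big-endian)
 *     {   XXH128_hash_t const back = XXH128_hashFromCanonical(&canon);
 *         XXH_ASSERT(XXH128_isEqual(h, back));    // round-trip always holds
 *     }
 */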


#endif  /* XXH3p_H */