1 // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 /*
6 xxHash - Extremely Fast Hash algorithm
7 Development source file for `xxh3`
8 Copyright (C) 2019-present, Yann Collet.
9
10 BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
11
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are
14 met:
15
16 * Redistributions of source code must retain the above copyright
17 notice, this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above
19 copyright notice, this list of conditions and the following disclaimer
20 in the documentation and/or other materials provided with the
21 distribution.
22
23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34
35 You can contact the author at :
36 - xxHash source repository : https://github.com/Cyan4973/xxHash
37 */
38
39 /* RocksDB Note: This file contains a preview release (xxhash repository
40 version 0.7.2) of XXH3 that is unlikely to be compatible with the final
41    version of XXH3. We have therefore renamed it XXH3p ("preview") for
42    clarity, so that we can continue to use this version even after
43    integrating a newer, incompatible version.
44 */
45
46 /* Note :
47 This file is separated for development purposes.
48    It will be integrated into `xxhash.c` when the development phase is complete.
49 */
50
51 #ifndef XXH3p_H
52 #define XXH3p_H
53
54
55 /* === Dependencies === */
56
57 #undef XXH_INLINE_ALL /* in case it's already defined */
58 #define XXH_INLINE_ALL
59 #include "xxhash.h"
60
61
62 /* === Compiler specifics === */
63
64 #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */
65 # define XXH_RESTRICT restrict
66 #else
67 /* note : it might be useful to define __restrict or __restrict__ for some C++ compilers */
68 # define XXH_RESTRICT /* disable */
69 #endif
70
71 #if defined(__GNUC__)
72 # if defined(__AVX2__)
73 # include <immintrin.h>
74 # elif defined(__SSE2__)
75 # include <emmintrin.h>
76 # elif defined(__ARM_NEON__) || defined(__ARM_NEON)
77 # define inline __inline__ /* clang bug */
78 # include <arm_neon.h>
79 # undef inline
80 # endif
81 #elif defined(_MSC_VER)
82 # include <intrin.h>
83 #endif
84
85 /*
86 * Sanity check.
87 *
88 * XXH3 only requires these features to be efficient:
89 *
90 * - Usable unaligned access
91 * - A 32-bit or 64-bit ALU
92 * - If 32-bit, a decent ADC instruction
93 * - A 32 or 64-bit multiply with a 64-bit result
94 *
95 * Almost all 32-bit and 64-bit targets meet this, except for Thumb-1, the
96 * classic 16-bit only subset of ARM's instruction set.
97 *
98 * First of all, Thumb-1 lacks support for the UMULL instruction which
99 * performs the important long multiply. This means numerous __aeabi_lmul
100 * calls.
101 *
102 * Second of all, the 8 functional registers are just not enough.
103 * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
104 * Lo registers, and this shuffling results in thousands more MOVs than A32.
105 *
106 * A32 and T32 don't have this limitation. They can access all 14 registers,
107 * do a 32->64 multiply with UMULL, and the flexible operand is helpful too.
108 *
109 * If compiling Thumb-1 for a target which supports ARM instructions, we
110 * will give a warning.
111 *
112 * Usually, if this happens, it is because of an accident and you probably
113  * need to specify -march, as you probably meant to compile for a newer
114 * architecture.
115 */
116 #if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
117 # warning "XXH3 is highly inefficient without ARM or Thumb-2."
118 #endif
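/*
 * Illustration (editorial sketch; XXH_example_mull is a hypothetical name,
 * not part of xxHash): the "long multiply" discussed above is a plain
 * 32x32->64 product,
 *
 *     xxh_u64 XXH_example_mull(xxh_u32 a, xxh_u32 b)
 *     {
 *         return (xxh_u64)a * (xxh_u64)b;
 *     }
 *
 * which is a single UMULL on A32/T32 but a runtime __aeabi_lmul call on
 * Thumb-1.
 */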
119
120 /* ==========================================
121 * Vectorization detection
122 * ========================================== */
123 #define XXH_SCALAR 0
124 #define XXH_SSE2 1
125 #define XXH_AVX2 2
126 #define XXH_NEON 3
127 #define XXH_VSX 4
128
129 #ifndef XXH_VECTOR /* can be defined on command line */
130 # if defined(__AVX2__)
131 # define XXH_VECTOR XXH_AVX2
132 # elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
133 # define XXH_VECTOR XXH_SSE2
134 # elif defined(__GNUC__) /* msvc support maybe later */ \
135 && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
136 && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
137 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
138 # define XXH_VECTOR XXH_NEON
139 # elif defined(__PPC64__) && defined(__POWER8_VECTOR__) && defined(__GNUC__)
140 # define XXH_VECTOR XXH_VSX
141 # else
142 # define XXH_VECTOR XXH_SCALAR
143 # endif
144 #endif
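/*
 * Note (illustrative): XXH_VECTOR may also be forced from the build command
 * line when the detection above is not wanted, for instance to benchmark the
 * scalar code path on an SSE2-capable machine:
 *
 *     cc -O3 -DXXH_VECTOR=0 ...    (0 == XXH_SCALAR, per the values above)
 */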
145
146 /* control alignment of accumulator,
147 * for compatibility with fast vector loads */
148 #ifndef XXH_ACC_ALIGN
149 # if XXH_VECTOR == 0 /* scalar */
150 # define XXH_ACC_ALIGN 8
151 # elif XXH_VECTOR == 1 /* sse2 */
152 # define XXH_ACC_ALIGN 16
153 # elif XXH_VECTOR == 2 /* avx2 */
154 # define XXH_ACC_ALIGN 32
155 # elif XXH_VECTOR == 3 /* neon */
156 # define XXH_ACC_ALIGN 16
157 # elif XXH_VECTOR == 4 /* vsx */
158 # define XXH_ACC_ALIGN 16
159 # endif
160 #endif
161
162 /* xxh_u64 XXH_mult32to64(xxh_u32 a, xxh_u64 b) { return (xxh_u64)a * (xxh_u64)b; } */
163 #if defined(_MSC_VER) && defined(_M_IX86)
164 # include <intrin.h>
165 # define XXH_mult32to64(x, y) __emulu(x, y)
166 #else
167 # define XXH_mult32to64(x, y) ((xxh_u64)((x) & 0xFFFFFFFF) * (xxh_u64)((y) & 0xFFFFFFFF))
168 #endif
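/*
 * Illustration: both definitions above compute the full 64-bit product of two
 * 32-bit operands (the masks simply discard any upper bits of wider inputs),
 * for example:
 *
 *     XXH_mult32to64(0xFFFFFFFF, 0xFFFFFFFF) == 0xFFFFFFFE00000001ULL
 */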
169
170 /* VSX stuff. It's a lot because VSX support is mediocre across compilers and
171 * there is a lot of mischief with endianness. */
172 #if XXH_VECTOR == XXH_VSX
173 # include <altivec.h>
174 # undef vector
175 typedef __vector unsigned long long U64x2;
176 typedef __vector unsigned char U8x16;
177 typedef __vector unsigned U32x4;
178
179 #ifndef XXH_VSX_BE
180 # if defined(__BIG_ENDIAN__) \
181 || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
182 # define XXH_VSX_BE 1
183 # elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
184 # warning "-maltivec=be is not recommended. Please use native endianness."
185 # define XXH_VSX_BE 1
186 # else
187 # define XXH_VSX_BE 0
188 # endif
189 #endif
190
191 /* We need some helpers for big endian mode. */
192 #if XXH_VSX_BE
193 /* A wrapper for POWER9's vec_revb. */
194 # ifdef __POWER9_VECTOR__
195 # define XXH_vec_revb vec_revb
196 # else
197 XXH_FORCE_INLINE U64x2 XXH_vec_revb(U64x2 val)
198 {
199 U8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
200 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
201 return vec_perm(val, val, vByteSwap);
202 }
203 # endif
204
205 /* Power8 Crypto gives us vpermxor which is very handy for
206 * PPC64EB.
207 *
208 * U8x16 vpermxor(U8x16 a, U8x16 b, U8x16 mask)
209 * {
210 * U8x16 ret;
211 * for (int i = 0; i < 16; i++) {
212 * ret[i] = a[mask[i] & 0xF] ^ b[mask[i] >> 4];
213 * }
214 * return ret;
215 * }
216 *
217 * Because both of the main loops load the key, swap, and xor it with input,
218 * we can combine the key swap into this instruction.
219 */
220 # ifdef vec_permxor
221 # define XXH_vec_permxor vec_permxor
222 # else
223 # define XXH_vec_permxor __builtin_crypto_vpermxor
224 # endif
225 #endif /* XXH_VSX_BE */
226 /*
227 * Because we reinterpret the multiply, there are endian memes: vec_mulo actually becomes
228 * vec_mule.
229 *
230 * Additionally, the intrinsic wasn't added until GCC 8, despite existing for a while.
231  * Clang has an easy way to control this: we can just use the builtin, which doesn't swap.
232 * GCC needs inline assembly. */
233 #if __has_builtin(__builtin_altivec_vmuleuw)
234 # define XXH_vec_mulo __builtin_altivec_vmulouw
235 # define XXH_vec_mule __builtin_altivec_vmuleuw
236 #else
237 /* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
238 XXH_FORCE_INLINE U64x2 XXH_vec_mulo(U32x4 a, U32x4 b) {
239 U64x2 result;
240 __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
241 return result;
242 }
243 XXH_FORCE_INLINE U64x2 XXH_vec_mule(U32x4 a, U32x4 b) {
244 U64x2 result;
245 __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
246 return result;
247 }
248 #endif /* __has_builtin(__builtin_altivec_vmuleuw) */
249 #endif /* XXH_VECTOR == XXH_VSX */
250
251 /* prefetch
252  * can be disabled by defining the XXH_NO_PREFETCH build macro */
253 #if defined(XXH_NO_PREFETCH)
254 # define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
255 #else
256 #if defined(_MSC_VER) && \
257 (defined(_M_X64) || \
258 defined(_M_IX86)) /* _mm_prefetch() is not defined outside of x86/x64 */
259 # include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
260 # define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
261 # elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
262 # define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
263 # else
264 # define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
265 # endif
266 #endif /* XXH_NO_PREFETCH */
267
268
269 /* ==========================================
270 * XXH3 default settings
271 * ========================================== */
272
273 #define XXH_SECRET_DEFAULT_SIZE 192   /* must be >= XXH3p_SECRET_SIZE_MIN */
274
275 #if (XXH_SECRET_DEFAULT_SIZE < XXH3p_SECRET_SIZE_MIN)
276 # error "default keyset is not large enough"
277 #endif
278
279 XXH_ALIGN(64) static const xxh_u8 kSecret[XXH_SECRET_DEFAULT_SIZE] = {
280 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
281 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
282 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
283 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
284 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
285 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
286 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
287 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
288
289 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
290 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
291 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
292 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
293 };
294
295 /*
296 * GCC for x86 has a tendency to use SSE in this loop. While it
297 * successfully avoids swapping (as MUL overwrites EAX and EDX), it
298 * slows it down because instead of free register swap shifts, it
299 * must use pshufd and punpckl/hd.
300 *
301 * To prevent this, we use this attribute to shut off SSE.
302 */
303 #if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
304 __attribute__((__target__("no-sse")))
305 #endif
306 static XXH128_hash_t
307 XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
308 {
309 /*
310 * GCC/Clang __uint128_t method.
311 *
312 * On most 64-bit targets, GCC and Clang define a __uint128_t type.
313  * This is usually the best way, as it typically compiles to a native long
314  * 64-bit multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
315 *
316 * Usually.
317 *
318  * However, on some 32-bit platforms, Clang (and emscripten) define this
319  * type despite not having the native arithmetic for it. This results in a
320  * slow compiler builtin call which calculates a full 128-bit multiply.
321  * In that case it is best to use the portable one.
322 * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
323 */
324 #if defined(__GNUC__) && !defined(__wasm__) \
325 && defined(__SIZEOF_INT128__) \
326 || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
327
328 __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs;
329 XXH128_hash_t const r128 = { (xxh_u64)(product), (xxh_u64)(product >> 64) };
330 return r128;
331
332 /*
333 * MSVC for x64's _umul128 method.
334 *
335 * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
336 *
337 * This compiles to single operand MUL on x64.
338 */
339 #elif defined(_M_X64) || defined(_M_IA64)
340
341 #ifndef _MSC_VER
342 # pragma intrinsic(_umul128)
343 #endif
344 xxh_u64 product_high;
345 xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
346 XXH128_hash_t const r128 = { product_low, product_high };
347 return r128;
348
349 #else
350 /*
351 * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
352 *
353 * This is a fast and simple grade school multiply, which is shown
354 * below with base 10 arithmetic instead of base 0x100000000.
355 *
356 * 9 3 // D2 lhs = 93
357 * x 7 5 // D2 rhs = 75
358 * ----------
359 * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10)
360 * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10)
361 * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10)
362 * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10)
363 * ---------
364 * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21
365 * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63
366 * ---------
367 * 6 9 7 5
368 *
369 * The reasons for adding the products like this are:
370 * 1. It avoids manual carry tracking. Just like how
371 * (9 * 9) + 9 + 9 = 99, the same applies with this for
372 * UINT64_MAX. This avoids a lot of complexity.
373 *
374 * 2. It hints for, and on Clang, compiles to, the powerful UMAAL
375 * instruction available in ARMv6+ A32/T32, which is shown below:
376 *
377 * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
378 * {
379 * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
380 * *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
381 * *RdHi = (xxh_u32)(product >> 32);
382 * }
383 *
384 * This instruction was designed for efficient long multiplication,
385  *    and allows this to be calculated in only 4 instructions, which
386  *    is comparable to some 64-bit ALUs.
387 *
388 * 3. It isn't terrible on other platforms. Usually this will be
389 * a couple of 32-bit ADD/ADCs.
390 */
391
392 /* First calculate all of the cross products. */
393 xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
394 xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
395 xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
396 xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);
397
398 /* Now add the products together. These will never overflow. */
399 xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
400 xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
401 xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
402
403 XXH128_hash_t r128 = { lower, upper };
404 return r128;
405 #endif
406 }
407
408 /*
409 * We want to keep the attribute here because a target switch
410 * disables inlining.
411 *
412 * Does a 64-bit to 128-bit multiply, then XOR folds it.
413 * The reason for the separate function is to prevent passing
414 * too many structs around by value. This will hopefully inline
415 * the multiply, but we don't force it.
416 */
417 #if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
418 __attribute__((__target__("no-sse")))
419 #endif
420 static xxh_u64
421 XXH3p_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
422 {
423 XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
424 return product.low64 ^ product.high64;
425 }
426
427
428 static XXH64_hash_t XXH3p_avalanche(xxh_u64 h64)
429 {
430 h64 ^= h64 >> 37;
431 h64 *= PRIME64_3;
432 h64 ^= h64 >> 32;
433 return h64;
434 }
435
436
437 /* ==========================================
438 * Short keys
439 * ========================================== */
440
441 XXH_FORCE_INLINE XXH64_hash_t
442 XXH3p_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
443 {
444 XXH_ASSERT(input != NULL);
445 XXH_ASSERT(1 <= len && len <= 3);
446 XXH_ASSERT(secret != NULL);
447 { xxh_u8 const c1 = input[0];
448 xxh_u8 const c2 = input[len >> 1];
449 xxh_u8 const c3 = input[len - 1];
450 xxh_u32 const combined = ((xxh_u32)c1) | (((xxh_u32)c2) << 8) | (((xxh_u32)c3) << 16) | (((xxh_u32)len) << 24);
451 xxh_u64 const keyed = (xxh_u64)combined ^ (XXH_readLE32(secret) + seed);
452 xxh_u64 const mixed = keyed * PRIME64_1;
453 return XXH3p_avalanche(mixed);
454 }
455 }
456
457 XXH_FORCE_INLINE XXH64_hash_t
458 XXH3p_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
459 {
460 XXH_ASSERT(input != NULL);
461 XXH_ASSERT(secret != NULL);
462 XXH_ASSERT(4 <= len && len <= 8);
463 { xxh_u32 const input_lo = XXH_readLE32(input);
464 xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
465 xxh_u64 const input_64 = input_lo | ((xxh_u64)input_hi << 32);
466 xxh_u64 const keyed = input_64 ^ (XXH_readLE64(secret) + seed);
467 xxh_u64 const mix64 = len + ((keyed ^ (keyed >> 51)) * PRIME32_1);
468 return XXH3p_avalanche((mix64 ^ (mix64 >> 47)) * PRIME64_2);
469 }
470 }
471
472 XXH_FORCE_INLINE XXH64_hash_t
473 XXH3p_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
474 {
475 XXH_ASSERT(input != NULL);
476 XXH_ASSERT(secret != NULL);
477 XXH_ASSERT(9 <= len && len <= 16);
478 { xxh_u64 const input_lo = XXH_readLE64(input) ^ (XXH_readLE64(secret) + seed);
479 xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ (XXH_readLE64(secret + 8) - seed);
480 xxh_u64 const acc = len + (input_lo + input_hi) + XXH3p_mul128_fold64(input_lo, input_hi);
481 return XXH3p_avalanche(acc);
482 }
483 }
484
485 XXH_FORCE_INLINE XXH64_hash_t
486 XXH3p_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
487 {
488 XXH_ASSERT(len <= 16);
489 { if (len > 8) return XXH3p_len_9to16_64b(input, len, secret, seed);
490 if (len >= 4) return XXH3p_len_4to8_64b(input, len, secret, seed);
491 if (len) return XXH3p_len_1to3_64b(input, len, secret, seed);
492 /*
493 * RocksDB modification from XXH3 preview: zero result for empty
494 * string can be problematic for multiplication-based algorithms.
495 * Return a hash of the seed instead.
496 */
497 return XXH3p_mul128_fold64(seed + XXH_readLE64(secret), PRIME64_2);
498 }
499 }
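/*
 * Consequence of the RocksDB modification above (derived from the code, not
 * from published test vectors): a zero-length input now evaluates to
 *
 *     XXH3p_mul128_fold64(seed + XXH_readLE64(secret), PRIME64_2)
 *
 * so the result depends on the seed and the first 8 bytes of the secret.
 */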
500
501
502 /* === Long Keys === */
503
504 #define STRIPE_LEN 64
505 #define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */
506 #define ACC_NB (STRIPE_LEN / sizeof(xxh_u64))
507
508 typedef enum { XXH3p_acc_64bits, XXH3p_acc_128bits } XXH3p_accWidth_e;
509
510 XXH_FORCE_INLINE void
511 XXH3p_accumulate_512( void* XXH_RESTRICT acc,
512 const void* XXH_RESTRICT input,
513 const void* XXH_RESTRICT secret,
514 XXH3p_accWidth_e accWidth)
515 {
516 #if (XXH_VECTOR == XXH_AVX2)
517
518 XXH_ASSERT((((size_t)acc) & 31) == 0);
519 { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc;
520 const __m256i* const xinput = (const __m256i *) input; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */
521 const __m256i* const xsecret = (const __m256i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */
522
523 size_t i;
524 for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
525 __m256i const data_vec = _mm256_loadu_si256 (xinput+i);
526 __m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
527             __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);                                  /* uint32 dk[8]  = {d0^k0, d1^k1, d2^k2, d3^k3, ...} */
528 __m256i const product = _mm256_mul_epu32 (data_key, _mm256_shuffle_epi32 (data_key, 0x31)); /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */
529 if (accWidth == XXH3p_acc_128bits) {
530 __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
531 __m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
532 xacc[i] = _mm256_add_epi64(product, sum);
533 } else { /* XXH3p_acc_64bits */
534 __m256i const sum = _mm256_add_epi64(xacc[i], data_vec);
535 xacc[i] = _mm256_add_epi64(product, sum);
536 }
537 } }
538
539 #elif (XXH_VECTOR == XXH_SSE2)
540
541 XXH_ASSERT((((size_t)acc) & 15) == 0);
542 { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc;
543 const __m128i* const xinput = (const __m128i *) input; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */
544 const __m128i* const xsecret = (const __m128i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */
545
546 size_t i;
547 for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
548 __m128i const data_vec = _mm_loadu_si128 (xinput+i);
549 __m128i const key_vec = _mm_loadu_si128 (xsecret+i);
550             __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);                                  /* uint32 dk[4]  = {d0^k0, d1^k1, d2^k2, d3^k3} */
551             __m128i const product  = _mm_mul_epu32 (data_key, _mm_shuffle_epi32 (data_key, 0x31));  /* uint64 mul[2] = {dk0*dk1, dk2*dk3} */
552 if (accWidth == XXH3p_acc_128bits) {
553 __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
554 __m128i const sum = _mm_add_epi64(xacc[i], data_swap);
555 xacc[i] = _mm_add_epi64(product, sum);
556 } else { /* XXH3p_acc_64bits */
557 __m128i const sum = _mm_add_epi64(xacc[i], data_vec);
558 xacc[i] = _mm_add_epi64(product, sum);
559 }
560 } }
561
562 #elif (XXH_VECTOR == XXH_NEON)
563
564 XXH_ASSERT((((size_t)acc) & 15) == 0);
565 {
566 XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
567 /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
568 uint8_t const* const xinput = (const uint8_t *) input;
569 uint8_t const* const xsecret = (const uint8_t *) secret;
570
571 size_t i;
572 for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
573 #if !defined(__aarch64__) && !defined(__arm64__) && defined(__GNUC__) /* ARM32-specific hack */
574 /* vzip on ARMv7 Clang generates a lot of vmovs (technically vorrs) without this.
575 * vzip on 32-bit ARM NEON will overwrite the original register, and I think that Clang
576 * assumes I don't want to destroy it and tries to make a copy. This slows down the code
577 * a lot.
578 * aarch64 not only uses an entirely different syntax, but it requires three
579 * instructions...
580 * ext v1.16B, v0.16B, #8 // select high bits because aarch64 can't address them directly
581 * zip1 v3.2s, v0.2s, v1.2s // first zip
582 * zip2 v2.2s, v0.2s, v1.2s // second zip
583 * ...to do what ARM does in one:
584 * vzip.32 d0, d1 // Interleave high and low bits and overwrite. */
585
586         /* data_vec = xinput[i]; */
587 uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16));
588 /* key_vec = xsecret[i]; */
589 uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16));
590 /* data_key = data_vec ^ key_vec; */
591 uint32x4_t data_key;
592
593 if (accWidth == XXH3p_acc_64bits) {
594 /* Add first to prevent register swaps */
595 /* xacc[i] += data_vec; */
596 xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec));
597 } else { /* XXH3p_acc_128bits */
598 /* xacc[i] += swap(data_vec); */
599 /* can probably be optimized better */
600 uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
601 uint64x2_t const swapped= vextq_u64(data64, data64, 1);
602 xacc[i] = vaddq_u64 (xacc[i], swapped);
603 }
604
605 data_key = vreinterpretq_u32_u8(veorq_u8(data_vec, key_vec));
606
607 /* Here's the magic. We use the quirkiness of vzip to shuffle data_key in place.
608 * shuffle: data_key[0, 1, 2, 3] = data_key[0, 2, 1, 3] */
609 __asm__("vzip.32 %e0, %f0" : "+w" (data_key));
610 /* xacc[i] += (uint64x2_t) data_key[0, 1] * (uint64x2_t) data_key[2, 3]; */
611 xacc[i] = vmlal_u32(xacc[i], vget_low_u32(data_key), vget_high_u32(data_key));
612
613 #else
614 /* On aarch64, vshrn/vmovn seems to be equivalent to, if not faster than, the vzip method. */
615
616         /* data_vec = xinput[i]; */
617 uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16));
618 /* key_vec = xsecret[i]; */
619 uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16));
620 /* data_key = data_vec ^ key_vec; */
621 uint64x2_t const data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
622 /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); */
623 uint32x2_t const data_key_lo = vmovn_u64 (data_key);
624 /* data_key_hi = (uint32x2_t) (data_key >> 32); */
625 uint32x2_t const data_key_hi = vshrn_n_u64 (data_key, 32);
626 if (accWidth == XXH3p_acc_64bits) {
627 /* xacc[i] += data_vec; */
628 xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec));
629 } else { /* XXH3p_acc_128bits */
630 /* xacc[i] += swap(data_vec); */
631 uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
632 uint64x2_t const swapped= vextq_u64(data64, data64, 1);
633 xacc[i] = vaddq_u64 (xacc[i], swapped);
634 }
635 /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
636 xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);
637
638 #endif
639 }
640 }
641
642 #elif (XXH_VECTOR == XXH_VSX) && /* work around a compiler bug */ (__GNUC__ > 5)
643 U64x2* const xacc = (U64x2*) acc; /* presumed aligned */
644 U64x2 const* const xinput = (U64x2 const*) input; /* no alignment restriction */
645 U64x2 const* const xsecret = (U64x2 const*) secret; /* no alignment restriction */
646 U64x2 const v32 = { 32, 32 };
647 #if XXH_VSX_BE
648 U8x16 const vXorSwap = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
649 0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
650 #endif
651 size_t i;
652 for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
653 /* data_vec = xinput[i]; */
654 /* key_vec = xsecret[i]; */
655 #if XXH_VSX_BE
656 /* byteswap */
657 U64x2 const data_vec = XXH_vec_revb(vec_vsx_ld(0, xinput + i));
658 U64x2 const key_raw = vec_vsx_ld(0, xsecret + i);
659 /* See comment above. data_key = data_vec ^ swap(xsecret[i]); */
660 U64x2 const data_key = (U64x2)XXH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap);
661 #else
662 U64x2 const data_vec = vec_vsx_ld(0, xinput + i);
663 U64x2 const key_vec = vec_vsx_ld(0, xsecret + i);
664 U64x2 const data_key = data_vec ^ key_vec;
665 #endif
666 /* shuffled = (data_key << 32) | (data_key >> 32); */
667 U32x4 const shuffled = (U32x4)vec_rl(data_key, v32);
668 /* product = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)shuffled & 0xFFFFFFFF); */
669 U64x2 const product = XXH_vec_mulo((U32x4)data_key, shuffled);
670 xacc[i] += product;
671
672 if (accWidth == XXH3p_acc_64bits) {
673 xacc[i] += data_vec;
674 } else { /* XXH3p_acc_128bits */
675 /* swap high and low halves */
676 U64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2);
677 xacc[i] += data_swapped;
678 }
679 }
680
681 #else /* scalar variant of Accumulator - universal */
682
683       XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc;    /* presumed aligned on XXH_ACC_ALIGN-byte boundaries, a little hint for the auto-vectorizer */
684 const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */
685 const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */
686 size_t i;
687 XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
688 for (i=0; i < ACC_NB; i++) {
689 xxh_u64 const data_val = XXH_readLE64(xinput + 8*i);
690 xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8);
691
692 if (accWidth == XXH3p_acc_64bits) {
693 xacc[i] += data_val;
694 } else {
695 xacc[i ^ 1] += data_val; /* swap adjacent lanes */
696 }
697 xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
698 }
699 #endif
700 }
701
702 XXH_FORCE_INLINE void
703 XXH3p_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
704 {
705 #if (XXH_VECTOR == XXH_AVX2)
706
707 XXH_ASSERT((((size_t)acc) & 31) == 0);
708 { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
709 const __m256i* const xsecret = (const __m256i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this argument type */
710 const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1);
711
712 size_t i;
713 for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
714 /* xacc[i] ^= (xacc[i] >> 47) */
715 __m256i const acc_vec = xacc[i];
716 __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47);
717 __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted);
718 /* xacc[i] ^= xsecret; */
719 __m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
720 __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);
721
722 /* xacc[i] *= PRIME32_1; */
723 __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, 0x31);
724 __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32);
725 __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32);
726 xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
727 }
728 }
729
730 #elif (XXH_VECTOR == XXH_SSE2)
731
732 XXH_ASSERT((((size_t)acc) & 15) == 0);
733 { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
734 const __m128i* const xsecret = (const __m128i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this argument type */
735 const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1);
736
737 size_t i;
738 for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
739 /* xacc[i] ^= (xacc[i] >> 47) */
740 __m128i const acc_vec = xacc[i];
741 __m128i const shifted = _mm_srli_epi64 (acc_vec, 47);
742 __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted);
743 /* xacc[i] ^= xsecret; */
744 __m128i const key_vec = _mm_loadu_si128 (xsecret+i);
745 __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);
746
747 /* xacc[i] *= PRIME32_1; */
748 __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, 0x31);
749 __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32);
750 __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32);
751 xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
752 }
753 }
754
755 #elif (XXH_VECTOR == XXH_NEON)
756
757 XXH_ASSERT((((size_t)acc) & 15) == 0);
758
759 { uint64x2_t* const xacc = (uint64x2_t*) acc;
760 uint8_t const* const xsecret = (uint8_t const*) secret;
761 uint32x2_t const prime = vdup_n_u32 (PRIME32_1);
762
763 size_t i;
764 for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) {
765 /* data_vec = xacc[i] ^ (xacc[i] >> 47); */
766 uint64x2_t const acc_vec = xacc[i];
767 uint64x2_t const shifted = vshrq_n_u64 (acc_vec, 47);
768 uint64x2_t const data_vec = veorq_u64 (acc_vec, shifted);
769
770 /* key_vec = xsecret[i]; */
771 uint32x4_t const key_vec = vreinterpretq_u32_u8(vld1q_u8(xsecret + (i * 16)));
772 /* data_key = data_vec ^ key_vec; */
773 uint32x4_t const data_key = veorq_u32 (vreinterpretq_u32_u64(data_vec), key_vec);
774 /* shuffled = { data_key[0, 2], data_key[1, 3] }; */
775 uint32x2x2_t const shuffled = vzip_u32 (vget_low_u32(data_key), vget_high_u32(data_key));
776
777 /* data_key *= PRIME32_1 */
778
779 /* prod_hi = (data_key >> 32) * PRIME32_1; */
780 uint64x2_t const prod_hi = vmull_u32 (shuffled.val[1], prime);
781 /* xacc[i] = prod_hi << 32; */
782 xacc[i] = vshlq_n_u64(prod_hi, 32);
783 /* xacc[i] += (prod_hi & 0xFFFFFFFF) * PRIME32_1; */
784 xacc[i] = vmlal_u32(xacc[i], shuffled.val[0], prime);
785 } }
786
787 #elif (XXH_VECTOR == XXH_VSX) && /* work around a compiler bug */ (__GNUC__ > 5)
788
789 U64x2* const xacc = (U64x2*) acc;
790 const U64x2* const xsecret = (const U64x2*) secret;
791 /* constants */
792 U64x2 const v32 = { 32, 32 };
793 U64x2 const v47 = { 47, 47 };
794 U32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 };
795 size_t i;
796 #if XXH_VSX_BE
797 /* endian swap */
798 U8x16 const vXorSwap = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
799 0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
800 #endif
801 for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
802 U64x2 const acc_vec = xacc[i];
803 U64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
804 /* key_vec = xsecret[i]; */
805 #if XXH_VSX_BE
806         /* as above : data_key = data_vec ^ swap(xsecret[i]) */
807 U64x2 const key_raw = vec_vsx_ld(0, xsecret + i);
808 U64x2 const data_key = (U64x2)XXH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap);
809 #else
810 U64x2 const key_vec = vec_vsx_ld(0, xsecret + i);
811 U64x2 const data_key = data_vec ^ key_vec;
812 #endif
813
814 /* data_key *= PRIME32_1 */
815
816 /* prod_lo = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)prime & 0xFFFFFFFF); */
817 U64x2 const prod_even = XXH_vec_mule((U32x4)data_key, prime);
818 /* prod_hi = ((U64x2)data_key >> 32) * ((U64x2)prime >> 32); */
819 U64x2 const prod_odd = XXH_vec_mulo((U32x4)data_key, prime);
820 xacc[i] = prod_odd + (prod_even << v32);
821 }
822
823 #else /* scalar variant of Scrambler - universal */
824
825       XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc;    /* presumed aligned on XXH_ACC_ALIGN-byte boundaries, a little hint for the auto-vectorizer */
826 const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */
827 size_t i;
828 XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
829 for (i=0; i < ACC_NB; i++) {
830 xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i);
831 xxh_u64 acc64 = xacc[i];
832 acc64 ^= acc64 >> 47;
833 acc64 ^= key64;
834 acc64 *= PRIME32_1;
835 xacc[i] = acc64;
836 }
837
838 #endif
839 }
840
841 #define XXH_PREFETCH_DIST 384
842
843 /* assumption : nbStripes will not overflow secret size */
844 XXH_FORCE_INLINE void
845 XXH3p_accumulate( xxh_u64* XXH_RESTRICT acc,
846 const xxh_u8* XXH_RESTRICT input,
847 const xxh_u8* XXH_RESTRICT secret,
848 size_t nbStripes,
849 XXH3p_accWidth_e accWidth)
850 {
851 size_t n;
852 for (n = 0; n < nbStripes; n++ ) {
853 const xxh_u8* const in = input + n*STRIPE_LEN;
854 XXH_PREFETCH(in + XXH_PREFETCH_DIST);
855 XXH3p_accumulate_512(acc,
856 in,
857 secret + n*XXH_SECRET_CONSUME_RATE,
858 accWidth);
859 }
860 }
861
862 /* note : clang auto-vectorizes well in SSE2 mode _if_ this function is `static`,
863  *        and doesn't auto-vectorize it at all if it is `FORCE_INLINE`.
864  *        However, it auto-vectorizes AVX2 better if it is `FORCE_INLINE`.
865  *        Pretty much every other mode and compiler prefers `FORCE_INLINE`.
866  */
867
868 #if defined(__clang__) && (XXH_VECTOR==0) && !defined(__AVX2__) && !defined(__arm__) && !defined(__thumb__)
869 static void
870 #else
871 XXH_FORCE_INLINE void
872 #endif
873 XXH3p_hashLong_internal_loop( xxh_u64* XXH_RESTRICT acc,
874 const xxh_u8* XXH_RESTRICT input, size_t len,
875 const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
876 XXH3p_accWidth_e accWidth)
877 {
878 size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
879 size_t const block_len = STRIPE_LEN * nb_rounds;
880 size_t const nb_blocks = len / block_len;
881
882 size_t n;
883
884 XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);
885
886 for (n = 0; n < nb_blocks; n++) {
887 XXH3p_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth);
888 XXH3p_scrambleAcc(acc, secret + secretSize - STRIPE_LEN);
889 }
890
891 /* last partial block */
892 XXH_ASSERT(len > STRIPE_LEN);
893 { size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN;
894 XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
895 XXH3p_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth);
896
897 /* last stripe */
898 if (len & (STRIPE_LEN - 1)) {
899 const xxh_u8* const p = input + len - STRIPE_LEN;
900 #define XXH_SECRET_LASTACC_START 7 /* do not align on 8, so that secret is different from scrambler */
901 XXH3p_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth);
902 } }
903 }
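/*
 * Worked example (for orientation, derived from the constants above): with the
 * default 192-byte secret, nb_rounds = (192 - 64) / 8 = 16 stripes per block,
 * so block_len = 16 * 64 = 1024 bytes. Within a block the secret is read at
 * offsets 0, 8, ..., 120 (64 bytes per stripe), and each scramble uses the
 * last 64 bytes of the secret (offset 128).
 */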
904
905 XXH_FORCE_INLINE xxh_u64
906 XXH3p_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
907 {
908 return XXH3p_mul128_fold64(
909 acc[0] ^ XXH_readLE64(secret),
910 acc[1] ^ XXH_readLE64(secret+8) );
911 }
912
913 static XXH64_hash_t
914 XXH3p_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
915 {
916 xxh_u64 result64 = start;
917
918 result64 += XXH3p_mix2Accs(acc+0, secret + 0);
919 result64 += XXH3p_mix2Accs(acc+2, secret + 16);
920 result64 += XXH3p_mix2Accs(acc+4, secret + 32);
921 result64 += XXH3p_mix2Accs(acc+6, secret + 48);
922
923 return XXH3p_avalanche(result64);
924 }
925
926 #define XXH3p_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \
927 PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 };
928
929 XXH_FORCE_INLINE XXH64_hash_t
930 XXH3p_hashLong_internal(const xxh_u8* XXH_RESTRICT input, size_t len,
931 const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
932 {
933 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3p_INIT_ACC;
934
935 XXH3p_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3p_acc_64bits);
936
937 /* converge into final hash */
938 XXH_STATIC_ASSERT(sizeof(acc) == 64);
939 #define XXH_SECRET_MERGEACCS_START 11 /* do not align on 8, so that secret is different from accumulator */
940 XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
941 return XXH3p_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1);
942 }
943
944
945 XXH_NO_INLINE XXH64_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
946 XXH3p_hashLong_64b_defaultSecret(const xxh_u8* XXH_RESTRICT input, size_t len)
947 {
948 return XXH3p_hashLong_internal(input, len, kSecret, sizeof(kSecret));
949 }
950
951 XXH_NO_INLINE XXH64_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
952 XXH3p_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len,
953 const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
954 {
955 return XXH3p_hashLong_internal(input, len, secret, secretSize);
956 }
957
958
959 XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
960 {
961 if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
962 memcpy(dst, &v64, sizeof(v64));
963 }
964
965 /* XXH3p_initCustomSecret() :
966 * destination `customSecret` is presumed allocated and same size as `kSecret`.
967 */
968 XXH_FORCE_INLINE void XXH3p_initCustomSecret(xxh_u8* customSecret, xxh_u64 seed64)
969 {
970 int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
971 int i;
972
973 XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
974
975 for (i=0; i < nbRounds; i++) {
976 XXH_writeLE64(customSecret + 16*i, XXH_readLE64(kSecret + 16*i) + seed64);
977 XXH_writeLE64(customSecret + 16*i + 8, XXH_readLE64(kSecret + 16*i + 8) - seed64);
978 }
979 }
980
981
982 /* XXH3p_hashLong_64b_withSeed() :
983  * Generate a custom key based on an alteration of the default kSecret with
984  * the seed, and then use this key for long-mode hashing.
985  * This operation is decently fast but nonetheless costs a little bit of time.
986  * Try to avoid it whenever possible: with seed==0 the default-secret path is
987  * taken and no custom key is generated.
988 */
989 XXH_NO_INLINE XXH64_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
990 XXH3p_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
991 {
992 XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
993 if (seed==0) return XXH3p_hashLong_64b_defaultSecret(input, len);
994 XXH3p_initCustomSecret(secret, seed);
995 return XXH3p_hashLong_internal(input, len, secret, sizeof(secret));
996 }
997
998
999 XXH_FORCE_INLINE xxh_u64 XXH3p_mix16B(const xxh_u8* XXH_RESTRICT input,
1000 const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
1001 {
1002 xxh_u64 const input_lo = XXH_readLE64(input);
1003 xxh_u64 const input_hi = XXH_readLE64(input+8);
1004 return XXH3p_mul128_fold64(
1005 input_lo ^ (XXH_readLE64(secret) + seed64),
1006 input_hi ^ (XXH_readLE64(secret+8) - seed64) );
1007 }
1008
1009
1010 XXH_FORCE_INLINE XXH64_hash_t
1011 XXH3p_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
1012 const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
1013 XXH64_hash_t seed)
1014 {
1015 XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
1016 XXH_ASSERT(16 < len && len <= 128);
1017
1018 { xxh_u64 acc = len * PRIME64_1;
1019 if (len > 32) {
1020 if (len > 64) {
1021 if (len > 96) {
1022 acc += XXH3p_mix16B(input+48, secret+96, seed);
1023 acc += XXH3p_mix16B(input+len-64, secret+112, seed);
1024 }
1025 acc += XXH3p_mix16B(input+32, secret+64, seed);
1026 acc += XXH3p_mix16B(input+len-48, secret+80, seed);
1027 }
1028 acc += XXH3p_mix16B(input+16, secret+32, seed);
1029 acc += XXH3p_mix16B(input+len-32, secret+48, seed);
1030 }
1031 acc += XXH3p_mix16B(input+0, secret+0, seed);
1032 acc += XXH3p_mix16B(input+len-16, secret+16, seed);
1033
1034 return XXH3p_avalanche(acc);
1035 }
1036 }
1037
1038 #define XXH3p_MIDSIZE_MAX 240
1039
1040 XXH_NO_INLINE XXH64_hash_t
1041 XXH3p_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
1042 const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
1043 XXH64_hash_t seed)
1044 {
1045 XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
1046 XXH_ASSERT(128 < len && len <= XXH3p_MIDSIZE_MAX);
1047
1048 #define XXH3p_MIDSIZE_STARTOFFSET 3
1049 #define XXH3p_MIDSIZE_LASTOFFSET 17
1050
1051 { xxh_u64 acc = len * PRIME64_1;
1052 int const nbRounds = (int)len / 16;
1053 int i;
1054 for (i=0; i<8; i++) {
1055 acc += XXH3p_mix16B(input+(16*i), secret+(16*i), seed);
1056 }
1057 acc = XXH3p_avalanche(acc);
1058 XXH_ASSERT(nbRounds >= 8);
1059 for (i=8 ; i < nbRounds; i++) {
1060 acc += XXH3p_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3p_MIDSIZE_STARTOFFSET, seed);
1061 }
1062 /* last bytes */
1063 acc += XXH3p_mix16B(input + len - 16, secret + XXH3p_SECRET_SIZE_MIN - XXH3p_MIDSIZE_LASTOFFSET, seed);
1064 return XXH3p_avalanche(acc);
1065 }
1066 }
1067
1068 /* === Public entry point === */
1069
1070 XXH_PUBLIC_API XXH64_hash_t XXH3p_64bits(const void* input, size_t len)
1071 {
1072 if (len <= 16) return XXH3p_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0);
1073 if (len <= 128) return XXH3p_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
1074 if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
1075 return XXH3p_hashLong_64b_defaultSecret((const xxh_u8*)input, len);
1076 }
1077
1078 XXH_PUBLIC_API XXH64_hash_t
1079 XXH3p_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
1080 {
1081 XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);
1082 /* if an action must be taken should `secret` conditions not be respected,
1083 * it should be done here.
1084 * For now, it's a contract pre-condition.
1085 * Adding a check and a branch here would cost performance at every hash */
1086 if (len <= 16) return XXH3p_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
1087 if (len <= 128) return XXH3p_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
1088 if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
1089 return XXH3p_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
1090 }
1091
1092 XXH_PUBLIC_API XXH64_hash_t
1093 XXH3p_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
1094 {
1095 if (len <= 16) return XXH3p_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed);
1096 if (len <= 128) return XXH3p_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
1097 if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
1098 return XXH3p_hashLong_64b_withSeed((const xxh_u8*)input, len, seed);
1099 }
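/*
 * Usage sketch (illustrative only; `buf`, `bufLen`, `mySecret` and
 * `mySecretSize` are placeholders, not part of this header):
 *
 *     XXH64_hash_t h0 = XXH3p_64bits(buf, bufLen);
 *     XXH64_hash_t h1 = XXH3p_64bits_withSeed(buf, bufLen, 42);
 *     XXH64_hash_t h2 = XXH3p_64bits_withSecret(buf, bufLen, mySecret, mySecretSize);
 *
 * A non-zero seed or a custom secret (of at least XXH3p_SECRET_SIZE_MIN bytes)
 * changes the result; seed==0 is equivalent to the default-secret variant.
 */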
1100
1101 /* === XXH3 streaming === */
1102
1103 /* RocksDB Note: unused & removed due to bug in preview version */
1104
1105 /* ==========================================
1106 * XXH3 128 bits (=> XXH128)
1107 * ========================================== */
1108
1109 XXH_FORCE_INLINE XXH128_hash_t
1110 XXH3p_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
1111 {
1112 XXH_ASSERT(input != NULL);
1113 XXH_ASSERT(1 <= len && len <= 3);
1114 XXH_ASSERT(secret != NULL);
1115 { xxh_u8 const c1 = input[0];
1116 xxh_u8 const c2 = input[len >> 1];
1117 xxh_u8 const c3 = input[len - 1];
1118 xxh_u32 const combinedl = ((xxh_u32)c1) + (((xxh_u32)c2) << 8) + (((xxh_u32)c3) << 16) + (((xxh_u32)len) << 24);
1119 xxh_u32 const combinedh = XXH_swap32(combinedl);
1120 xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ (XXH_readLE32(secret) + seed);
1121 xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ (XXH_readLE32(secret+4) - seed);
1122 xxh_u64 const mixedl = keyed_lo * PRIME64_1;
1123 xxh_u64 const mixedh = keyed_hi * PRIME64_5;
1124 XXH128_hash_t const h128 = { XXH3p_avalanche(mixedl) /*low64*/, XXH3p_avalanche(mixedh) /*high64*/ };
1125 return h128;
1126 }
1127 }
1128
1129
1130 XXH_FORCE_INLINE XXH128_hash_t
1131 XXH3p_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
1132 {
1133 XXH_ASSERT(input != NULL);
1134 XXH_ASSERT(secret != NULL);
1135 XXH_ASSERT(4 <= len && len <= 8);
1136 { xxh_u32 const input_lo = XXH_readLE32(input);
1137 xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
1138 xxh_u64 const input_64_lo = input_lo + ((xxh_u64)input_hi << 32);
1139 xxh_u64 const input_64_hi = XXH_swap64(input_64_lo);
1140 xxh_u64 const keyed_lo = input_64_lo ^ (XXH_readLE64(secret) + seed);
1141 xxh_u64 const keyed_hi = input_64_hi ^ (XXH_readLE64(secret + 8) - seed);
1142 xxh_u64 const mix64l1 = len + ((keyed_lo ^ (keyed_lo >> 51)) * PRIME32_1);
1143 xxh_u64 const mix64l2 = (mix64l1 ^ (mix64l1 >> 47)) * PRIME64_2;
1144 xxh_u64 const mix64h1 = ((keyed_hi ^ (keyed_hi >> 47)) * PRIME64_1) - len;
1145 xxh_u64 const mix64h2 = (mix64h1 ^ (mix64h1 >> 43)) * PRIME64_4;
1146 { XXH128_hash_t const h128 = { XXH3p_avalanche(mix64l2) /*low64*/, XXH3p_avalanche(mix64h2) /*high64*/ };
1147 return h128;
1148 } }
1149 }
1150
1151 XXH_FORCE_INLINE XXH128_hash_t
1152 XXH3p_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
1153 {
1154 XXH_ASSERT(input != NULL);
1155 XXH_ASSERT(secret != NULL);
1156 XXH_ASSERT(9 <= len && len <= 16);
1157 { xxh_u64 const input_lo = XXH_readLE64(input) ^ (XXH_readLE64(secret) + seed);
1158 xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ (XXH_readLE64(secret+8) - seed);
1159 XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi, PRIME64_1);
1160 xxh_u64 const lenContrib = XXH_mult32to64(len, PRIME32_5);
1161 m128.low64 += lenContrib;
1162 m128.high64 += input_hi * PRIME64_1;
1163 m128.low64 ^= (m128.high64 >> 32);
1164 { XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2);
1165 h128.high64 += m128.high64 * PRIME64_2;
1166 h128.low64 = XXH3p_avalanche(h128.low64);
1167 h128.high64 = XXH3p_avalanche(h128.high64);
1168 return h128;
1169 } }
1170 }
1171
1172 /* Assumption : `secret` size is >= 16
1173 * Note : it should be >= XXH3p_SECRET_SIZE_MIN anyway */
1174 XXH_FORCE_INLINE XXH128_hash_t
1175 XXH3p_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
1176 {
1177 XXH_ASSERT(len <= 16);
1178 { if (len > 8) return XXH3p_len_9to16_128b(input, len, secret, seed);
1179 if (len >= 4) return XXH3p_len_4to8_128b(input, len, secret, seed);
1180 if (len) return XXH3p_len_1to3_128b(input, len, secret, seed);
1181 { XXH128_hash_t const h128 = { 0, 0 };
1182 return h128;
1183 } }
1184 }
1185
1186 XXH_FORCE_INLINE XXH128_hash_t
1187 XXH3p_hashLong_128b_internal(const xxh_u8* XXH_RESTRICT input, size_t len,
1188 const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
1189 {
1190 XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3p_INIT_ACC;
1191
1192 XXH3p_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3p_acc_128bits);
1193
1194 /* converge into final hash */
1195 XXH_STATIC_ASSERT(sizeof(acc) == 64);
1196 XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
1197 { xxh_u64 const low64 = XXH3p_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1);
1198 xxh_u64 const high64 = XXH3p_mergeAccs(acc, secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((xxh_u64)len * PRIME64_2));
1199 XXH128_hash_t const h128 = { low64, high64 };
1200 return h128;
1201 }
1202 }
1203
1204 XXH_NO_INLINE XXH128_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
1205 XXH3p_hashLong_128b_defaultSecret(const xxh_u8* input, size_t len)
1206 {
1207 return XXH3p_hashLong_128b_internal(input, len, kSecret, sizeof(kSecret));
1208 }
1209
1210 XXH_NO_INLINE XXH128_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
1211 XXH3p_hashLong_128b_withSecret(const xxh_u8* input, size_t len,
1212 const xxh_u8* secret, size_t secretSize)
1213 {
1214 return XXH3p_hashLong_128b_internal(input, len, secret, secretSize);
1215 }
1216
1217 XXH_NO_INLINE XXH128_hash_t /* It's important for performance that XXH3p_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
1218 XXH3p_hashLong_128b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
1219 {
1220 XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
1221 if (seed == 0) return XXH3p_hashLong_128b_defaultSecret(input, len);
1222 XXH3p_initCustomSecret(secret, seed);
1223 return XXH3p_hashLong_128b_internal(input, len, secret, sizeof(secret));
1224 }
1225
1226
1227 XXH_FORCE_INLINE XXH128_hash_t
1228 XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, const xxh_u8* secret, XXH64_hash_t seed)
1229 {
1230 acc.low64 += XXH3p_mix16B (input_1, secret+0, seed);
1231 acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
1232 acc.high64 += XXH3p_mix16B (input_2, secret+16, seed);
1233 acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
1234 return acc;
1235 }
1236
1237 XXH_NO_INLINE XXH128_hash_t
1238 XXH3p_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
1239 const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
1240 XXH64_hash_t seed)
1241 {
1242 XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
1243 XXH_ASSERT(128 < len && len <= XXH3p_MIDSIZE_MAX);
1244
1245 { XXH128_hash_t acc;
1246 int const nbRounds = (int)len / 32;
1247 int i;
1248 acc.low64 = len * PRIME64_1;
1249 acc.high64 = 0;
1250 for (i=0; i<4; i++) {
1251 acc = XXH128_mix32B(acc, input+(32*i), input+(32*i)+16, secret+(32*i), seed);
1252 }
1253 acc.low64 = XXH3p_avalanche(acc.low64);
1254 acc.high64 = XXH3p_avalanche(acc.high64);
1255 XXH_ASSERT(nbRounds >= 4);
1256 for (i=4 ; i < nbRounds; i++) {
1257 acc = XXH128_mix32B(acc, input+(32*i), input+(32*i)+16, secret+XXH3p_MIDSIZE_STARTOFFSET+(32*(i-4)), seed);
1258 }
1259 /* last bytes */
1260 acc = XXH128_mix32B(acc, input + len - 16, input + len - 32, secret + XXH3p_SECRET_SIZE_MIN - XXH3p_MIDSIZE_LASTOFFSET - 16, 0ULL - seed);
1261
1262 { xxh_u64 const low64 = acc.low64 + acc.high64;
1263 xxh_u64 const high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + ((len - seed) * PRIME64_2);
1264 XXH128_hash_t const h128 = { XXH3p_avalanche(low64), (XXH64_hash_t)0 - XXH3p_avalanche(high64) };
1265 return h128;
1266 }
1267 }
1268 }
1269
1270
1271 XXH_FORCE_INLINE XXH128_hash_t
1272 XXH3p_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
1273 const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
1274 XXH64_hash_t seed)
1275 {
1276 XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN); (void)secretSize;
1277 XXH_ASSERT(16 < len && len <= 128);
1278
1279 { XXH128_hash_t acc;
1280 acc.low64 = len * PRIME64_1;
1281 acc.high64 = 0;
1282 if (len > 32) {
1283 if (len > 64) {
1284 if (len > 96) {
1285 acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
1286 }
1287 acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
1288 }
1289 acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
1290 }
1291 acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
1292 { xxh_u64 const low64 = acc.low64 + acc.high64;
1293 xxh_u64 const high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + ((len - seed) * PRIME64_2);
1294 XXH128_hash_t const h128 = { XXH3p_avalanche(low64), (XXH64_hash_t)0 - XXH3p_avalanche(high64) };
1295 return h128;
1296 }
1297 }
1298 }
1299
1300 XXH_PUBLIC_API XXH128_hash_t XXH3p_128bits(const void* input, size_t len)
1301 {
1302 if (len <= 16) return XXH3p_len_0to16_128b((const xxh_u8*)input, len, kSecret, 0);
1303 if (len <= 128) return XXH3p_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
1304 if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
1305 return XXH3p_hashLong_128b_defaultSecret((const xxh_u8*)input, len);
1306 }
1307
1308 XXH_PUBLIC_API XXH128_hash_t
1309 XXH3p_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
1310 {
1311 XXH_ASSERT(secretSize >= XXH3p_SECRET_SIZE_MIN);
1312 /* if an action must be taken should `secret` conditions not be respected,
1313 * it should be done here.
1314 * For now, it's a contract pre-condition.
1315 * Adding a check and a branch here would cost performance at every hash */
1316 if (len <= 16) return XXH3p_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
1317 if (len <= 128) return XXH3p_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
1318 if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
1319 return XXH3p_hashLong_128b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
1320 }
1321
1322 XXH_PUBLIC_API XXH128_hash_t
1323 XXH3p_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
1324 {
1325 if (len <= 16) return XXH3p_len_0to16_128b((const xxh_u8*)input, len, kSecret, seed);
1326 if (len <= 128) return XXH3p_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
1327 if (len <= XXH3p_MIDSIZE_MAX) return XXH3p_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
1328 return XXH3p_hashLong_128b_withSeed((const xxh_u8*)input, len, seed);
1329 }
1330
1331 XXH_PUBLIC_API XXH128_hash_t
1332 XXH128(const void* input, size_t len, XXH64_hash_t seed)
1333 {
1334 return XXH3p_128bits_withSeed(input, len, seed);
1335 }
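/*
 * Usage sketch (illustrative; `buf` and `bufLen` are placeholders): XXH128()
 * is simply the seeded 128-bit entry point, returning both halves by value.
 *
 *     XXH128_hash_t h = XXH128(buf, bufLen, 0);
 *     // h.low64 and h.high64 hold the two 64-bit halves of the result.
 */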
1336
1337
1338 /* === XXH3 128-bit streaming === */
1339
1340 /* RocksDB Note: unused & removed due to bug in preview version */
1341
1342 /* 128-bit utility functions */
1343
1344 #include <string.h> /* memcmp */
1345
1346 /* return : 1 if equal, 0 if different */
1347 XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
1348 {
1349 /* note : XXH128_hash_t is compact, it has no padding byte */
1350 return !(memcmp(&h1, &h2, sizeof(h1)));
1351 }
1352
1353 /* This prototype is compatible with stdlib's qsort().
1354 * return : >0 if *h128_1 > *h128_2
1355 * <0 if *h128_1 < *h128_2
1356 * =0 if *h128_1 == *h128_2 */
1357 XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
1358 {
1359 XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
1360 XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
1361 int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
1362 /* note : bets that, in most cases, hash values are different */
1363 if (hcmp) return hcmp;
1364 return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
1365 }
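/*
 * Usage sketch (illustrative): because XXH128_cmp() matches the qsort()
 * comparator prototype, an array of hashes can be sorted directly
 * (requires <stdlib.h> in the caller):
 *
 *     XXH128_hash_t hashes[16];
 *     // ... fill hashes ...
 *     qsort(hashes, 16, sizeof(hashes[0]), XXH128_cmp);
 */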
1366
1367
1368 /*====== Canonical representation ======*/
1369 XXH_PUBLIC_API void
1370 XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
1371 {
1372 XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
1373 if (XXH_CPU_LITTLE_ENDIAN) {
1374 hash.high64 = XXH_swap64(hash.high64);
1375 hash.low64 = XXH_swap64(hash.low64);
1376 }
1377 memcpy(dst, &hash.high64, sizeof(hash.high64));
1378 memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
1379 }
1380
1381 XXH_PUBLIC_API XXH128_hash_t
1382 XXH128_hashFromCanonical(const XXH128_canonical_t* src)
1383 {
1384 XXH128_hash_t h;
1385 h.high64 = XXH_readBE64(src);
1386 h.low64 = XXH_readBE64(src->digest + 8);
1387 return h;
1388 }
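/*
 * Usage sketch (illustrative; `h` is a placeholder XXH128_hash_t): the
 * canonical form is a fixed big-endian byte layout suitable for storage or
 * transmission; a round trip restores the original value.
 *
 *     XXH128_canonical_t canon;
 *     XXH128_canonicalFromHash(&canon, h);
 *     XXH128_hash_t back = XXH128_hashFromCanonical(&canon);
 *     // XXH128_isEqual(h, back) == 1
 */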
1389
1390
1391
1392 #endif /* XXH3p_H */