// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#include <cmath>

#include "test_util/testharness.h"
#include "util/bloom_impl.h"
#include "util/coding.h"
#include "util/hash.h"
#include "util/ribbon_impl.h"
#include "util/stop_watch.h"

#ifndef GFLAGS
uint32_t FLAGS_thoroughness = 5;
bool FLAGS_find_occ = false;
double FLAGS_find_next_factor = 1.414;
double FLAGS_find_success = 0.95;
double FLAGS_find_delta_start = 0.01;
double FLAGS_find_delta_end = 0.0001;
double FLAGS_find_delta_shrink = 0.99;
uint32_t FLAGS_find_min_slots = 128;
uint32_t FLAGS_find_max_slots = 12800000;
#else
#include "util/gflags_compat.h"
using GFLAGS_NAMESPACE::ParseCommandLineFlags;
// Using 500 is a good test when you have time to be thorough.
// Default is for general RocksDB regression test runs.
DEFINE_uint32(thoroughness, 5, "iterations per configuration");

// Options for FindOccupancyForSuccessRate, which is more of a tool
// than a test.
DEFINE_bool(find_occ, false,
            "whether to run the FindOccupancyForSuccessRate tool");
DEFINE_double(find_next_factor, 1.414,
              "factor to next num_slots for FindOccupancyForSuccessRate");
DEFINE_double(find_success, 0.95,
              "target success rate for FindOccupancyForSuccessRate");
DEFINE_double(find_delta_start, 0.01, " for FindOccupancyForSuccessRate");
DEFINE_double(find_delta_end, 0.0001, " for FindOccupancyForSuccessRate");
DEFINE_double(find_delta_shrink, 0.99, " for FindOccupancyForSuccessRate");
DEFINE_uint32(find_min_slots, 128,
              "minimum number of slots for FindOccupancyForSuccessRate");
DEFINE_uint32(find_max_slots, 12800000,
              "maximum number of slots for FindOccupancyForSuccessRate");
#endif  // GFLAGS

template <typename TypesAndSettings>
class RibbonTypeParamTest : public ::testing::Test {};

class RibbonTest : public ::testing::Test {};

namespace {

// Different ways of generating keys for testing

// Generate semi-sequential keys
struct StandardKeyGen {
  StandardKeyGen(const std::string& prefix, uint64_t id)
      : id_(id), str_(prefix) {
    ROCKSDB_NAMESPACE::PutFixed64(&str_, /*placeholder*/ 0);
  }

  // Prefix increment (only the prefix form is required)
  StandardKeyGen& operator++() {
    ++id_;
    return *this;
  }

  StandardKeyGen& operator+=(uint64_t i) {
    id_ += i;
    return *this;
  }

  const std::string& operator*() {
    // Use multiplication to mix things up a little in the key
    ROCKSDB_NAMESPACE::EncodeFixed64(&str_[str_.size() - 8],
                                     id_ * uint64_t{0x1500000001});
    return str_;
  }

  bool operator==(const StandardKeyGen& other) {
    // Same prefix is assumed
    return id_ == other.id_;
  }
  bool operator!=(const StandardKeyGen& other) {
    // Same prefix is assumed
    return id_ != other.id_;
  }

  uint64_t id_;
  std::string str_;
};

// Generate small sequential keys that can misbehave with sequential seeds
// as in https://github.com/Cyan4973/xxHash/issues/469.
// These keys are only heuristically unique, but that's OK with 64 bits,
// for testing purposes.
struct SmallKeyGen {
  SmallKeyGen(const std::string& prefix, uint64_t id) : id_(id) {
    // Hash the prefix for a heuristically unique offset
    id_ += ROCKSDB_NAMESPACE::GetSliceHash64(prefix);
    ROCKSDB_NAMESPACE::PutFixed64(&str_, id_);
  }

  // Prefix increment (only the prefix form is required)
  SmallKeyGen& operator++() {
    ++id_;
    return *this;
  }

  SmallKeyGen& operator+=(uint64_t i) {
    id_ += i;
    return *this;
  }

  const std::string& operator*() {
    ROCKSDB_NAMESPACE::EncodeFixed64(&str_[str_.size() - 8], id_);
    return str_;
  }

  bool operator==(const SmallKeyGen& other) { return id_ == other.id_; }
  bool operator!=(const SmallKeyGen& other) { return id_ != other.id_; }

  uint64_t id_;
  std::string str_;
};

template <typename KeyGen>
struct Hash32KeyGenWrapper : public KeyGen {
  Hash32KeyGenWrapper(const std::string& prefix, uint64_t id)
      : KeyGen(prefix, id) {}
  uint32_t operator*() {
    auto& key = *static_cast<KeyGen&>(*this);
    // unseeded
    return ROCKSDB_NAMESPACE::GetSliceHash(key);
  }
};

template <typename KeyGen>
struct Hash64KeyGenWrapper : public KeyGen {
  Hash64KeyGenWrapper(const std::string& prefix, uint64_t id)
      : KeyGen(prefix, id) {}
  uint64_t operator*() {
    auto& key = *static_cast<KeyGen&>(*this);
    // unseeded
    return ROCKSDB_NAMESPACE::GetSliceHash64(key);
  }
};

}  // namespace

using ROCKSDB_NAMESPACE::ribbon::ExpectedCollisionFpRate;
using ROCKSDB_NAMESPACE::ribbon::StandardHasher;
using ROCKSDB_NAMESPACE::ribbon::StandardRehasherAdapter;

struct DefaultTypesAndSettings {
  using CoeffRow = ROCKSDB_NAMESPACE::Unsigned128;
  using ResultRow = uint8_t;
  using Index = uint32_t;
  using Hash = uint64_t;
  using Seed = uint32_t;
  using Key = ROCKSDB_NAMESPACE::Slice;
  static constexpr bool kIsFilter = true;
  static constexpr bool kFirstCoeffAlwaysOne = true;
  static constexpr bool kUseSmash = false;
  static constexpr bool kAllowZeroStarts = false;
  static Hash HashFn(const Key& key, uint64_t raw_seed) {
    // This preview version (0.7.2) of XXH3 (a.k.a. XXH3p) does not pass
    // the SmallKeyGen tests below without some seed premixing from
    // StandardHasher. See https://github.com/Cyan4973/xxHash/issues/469
    return ROCKSDB_NAMESPACE::Hash64(key.data(), key.size(), raw_seed);
  }
  // For testing
  using KeyGen = StandardKeyGen;
};

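// Each of the following configurations overrides one or a few members of
// DefaultTypesAndSettings, so the typed tests below cover a matrix of
// coefficient widths, result widths, hash/seed sizes, and key generators.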
using TypesAndSettings_Coeff128 = DefaultTypesAndSettings;
struct TypesAndSettings_Coeff128Smash : public DefaultTypesAndSettings {
  static constexpr bool kUseSmash = true;
};
struct TypesAndSettings_Coeff64 : public DefaultTypesAndSettings {
  using CoeffRow = uint64_t;
};
struct TypesAndSettings_Coeff64Smash1 : public DefaultTypesAndSettings {
  using CoeffRow = uint64_t;
  static constexpr bool kUseSmash = true;
};
struct TypesAndSettings_Coeff64Smash0 : public TypesAndSettings_Coeff64Smash1 {
  static constexpr bool kFirstCoeffAlwaysOne = false;
};
struct TypesAndSettings_Result16 : public DefaultTypesAndSettings {
  using ResultRow = uint16_t;
};
struct TypesAndSettings_Result32 : public DefaultTypesAndSettings {
  using ResultRow = uint32_t;
};
struct TypesAndSettings_IndexSizeT : public DefaultTypesAndSettings {
  using Index = size_t;
};
struct TypesAndSettings_Hash32 : public DefaultTypesAndSettings {
  using Hash = uint32_t;
  static Hash HashFn(const Key& key, Hash raw_seed) {
    // This MurmurHash1 function does not pass tests below without the
    // seed premixing from StandardHasher. In fact, it needs more than
    // just a multiplication mixer on the ordinal seed.
    return ROCKSDB_NAMESPACE::Hash(key.data(), key.size(), raw_seed);
  }
};
struct TypesAndSettings_Hash32_Result16 : public TypesAndSettings_Hash32 {
  using ResultRow = uint16_t;
};
struct TypesAndSettings_KeyString : public DefaultTypesAndSettings {
  using Key = std::string;
};
struct TypesAndSettings_Seed8 : public DefaultTypesAndSettings {
  // This is not a generally recommended configuration. With the configured
  // hash function, it would fail with SmallKeyGen due to insufficient
  // independence among the seeds.
  using Seed = uint8_t;
};
struct TypesAndSettings_NoAlwaysOne : public DefaultTypesAndSettings {
  static constexpr bool kFirstCoeffAlwaysOne = false;
};
struct TypesAndSettings_AllowZeroStarts : public DefaultTypesAndSettings {
  static constexpr bool kAllowZeroStarts = true;
};
struct TypesAndSettings_Seed64 : public DefaultTypesAndSettings {
  using Seed = uint64_t;
};
struct TypesAndSettings_Rehasher
    : public StandardRehasherAdapter<DefaultTypesAndSettings> {
  using KeyGen = Hash64KeyGenWrapper<StandardKeyGen>;
};
struct TypesAndSettings_Rehasher_Result16 : public TypesAndSettings_Rehasher {
  using ResultRow = uint16_t;
};
struct TypesAndSettings_Rehasher_Result32 : public TypesAndSettings_Rehasher {
  using ResultRow = uint32_t;
};
struct TypesAndSettings_Rehasher_Seed64
    : public StandardRehasherAdapter<TypesAndSettings_Seed64> {
  using KeyGen = Hash64KeyGenWrapper<StandardKeyGen>;
  // Note: 64-bit seed with Rehasher gives slightly better average reseeds
};
struct TypesAndSettings_Rehasher32
    : public StandardRehasherAdapter<TypesAndSettings_Hash32> {
  using KeyGen = Hash32KeyGenWrapper<StandardKeyGen>;
};
struct TypesAndSettings_Rehasher32_Coeff64
    : public TypesAndSettings_Rehasher32 {
  using CoeffRow = uint64_t;
};
struct TypesAndSettings_SmallKeyGen : public DefaultTypesAndSettings {
  // SmallKeyGen stresses the independence of different hash seeds
  using KeyGen = SmallKeyGen;
};
struct TypesAndSettings_Hash32_SmallKeyGen : public TypesAndSettings_Hash32 {
  // SmallKeyGen stresses the independence of different hash seeds
  using KeyGen = SmallKeyGen;
};

using TestTypesAndSettings = ::testing::Types<
    TypesAndSettings_Coeff128, TypesAndSettings_Coeff128Smash,
    TypesAndSettings_Coeff64, TypesAndSettings_Coeff64Smash0,
    TypesAndSettings_Coeff64Smash1, TypesAndSettings_Result16,
    TypesAndSettings_Result32, TypesAndSettings_IndexSizeT,
    TypesAndSettings_Hash32, TypesAndSettings_Hash32_Result16,
    TypesAndSettings_KeyString, TypesAndSettings_Seed8,
    TypesAndSettings_NoAlwaysOne, TypesAndSettings_AllowZeroStarts,
    TypesAndSettings_Seed64, TypesAndSettings_Rehasher,
    TypesAndSettings_Rehasher_Result16, TypesAndSettings_Rehasher_Result32,
    TypesAndSettings_Rehasher_Seed64, TypesAndSettings_Rehasher32,
    TypesAndSettings_Rehasher32_Coeff64, TypesAndSettings_SmallKeyGen,
    TypesAndSettings_Hash32_SmallKeyGen>;
TYPED_TEST_CASE(RibbonTypeParamTest, TestTypesAndSettings);

namespace {

// For testing Poisson-distributed (or similar) statistics, get value for
// `stddevs_allowed` standard deviations above expected mean
// `expected_count`.
// (Poisson approximates Binomial only if probability of a trial being
// in the count is low.)
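// For example, with expected_count = 100 and stddevs_allowed = 5.0, the
// upper bound below evaluates to 100 + 5.0 * sqrt(100) + 1 = 151.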
uint64_t PoissonUpperBound(double expected_count, double stddevs_allowed) {
  return static_cast<uint64_t>(
      expected_count + stddevs_allowed * std::sqrt(expected_count) + 1.0);
}

uint64_t PoissonLowerBound(double expected_count, double stddevs_allowed) {
  return static_cast<uint64_t>(std::max(
      0.0, expected_count - stddevs_allowed * std::sqrt(expected_count)));
}

uint64_t FrequentPoissonUpperBound(double expected_count) {
  // Allow up to 5.0 standard deviations for frequently checked statistics
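  // (wider than the "infrequent" bounds below so that statistics checked
  // many times per test run are unlikely to cause flaky failures)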
  return PoissonUpperBound(expected_count, 5.0);
}

uint64_t FrequentPoissonLowerBound(double expected_count) {
  return PoissonLowerBound(expected_count, 5.0);
}

uint64_t InfrequentPoissonUpperBound(double expected_count) {
  // Allow up to 3 standard deviations for infrequently checked statistics
  return PoissonUpperBound(expected_count, 3.0);
}

uint64_t InfrequentPoissonLowerBound(double expected_count) {
  return PoissonLowerBound(expected_count, 3.0);
}

}  // namespace

TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
  IMPORT_RIBBON_IMPL_TYPES(TypeParam);
  using KeyGen = typename TypeParam::KeyGen;

  // For testing FP rate etc.
  constexpr Index kNumToCheck = 100000;

  const auto log2_thoroughness =
      static_cast<Hash>(ROCKSDB_NAMESPACE::FloorLog2(FLAGS_thoroughness));

  // With overhead of just 2%, expect ~50% encoding success per
  // seed with ~5k keys on 64-bit ribbon, or ~150k keys on 128-bit ribbon.
  const double kFactor = 1.02;
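  // E.g. num_to_add = 2500 keys -> num_slots = 2500 * 1.02 = 2550.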

  uint64_t total_reseeds = 0;
  uint64_t total_single_failures = 0;
  uint64_t total_batch_successes = 0;
  uint64_t total_fp_count = 0;
  uint64_t total_added = 0;

  uint64_t soln_query_nanos = 0;
  uint64_t soln_query_count = 0;
  uint64_t bloom_query_nanos = 0;
  uint64_t isoln_query_nanos = 0;
  uint64_t isoln_query_count = 0;

  // Take different samples if you change thoroughness
  ROCKSDB_NAMESPACE::Random32 rnd(FLAGS_thoroughness);

  for (uint32_t i = 0; i < FLAGS_thoroughness; ++i) {
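    // Baseline key counts target the ~50% solve success noted above for
    // kFactor: 130000 keys for 128-bit CoeffRow; for 64-bit, 5500 with
    // smash, else 2500.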
    uint32_t num_to_add =
        sizeof(CoeffRow) == 16 ? 130000 : TypeParam::kUseSmash ? 5500 : 2500;

    // Use different values between that number and 50% of that number
    num_to_add -= rnd.Uniformish(num_to_add / 2);

    total_added += num_to_add;

    // Most of the time, test the Interleaved solution storage, but when
    // we do, we have to make num_slots a multiple of kCoeffBits. So
    // sometimes we want to test without that limitation.
    bool test_interleaved = (i % 7) != 6;

    Index num_slots = static_cast<Index>(num_to_add * kFactor);
    if (test_interleaved) {
      // Round to supported number of slots
      num_slots = InterleavedSoln::RoundUpNumSlots(num_slots);
      // Re-adjust num_to_add to get as close as possible to kFactor
      num_to_add = static_cast<uint32_t>(num_slots / kFactor);
    }

    std::string prefix;
    ROCKSDB_NAMESPACE::PutFixed32(&prefix, rnd.Next());

    // Batch that must be added
    std::string added_str = prefix + "added";
    KeyGen keys_begin(added_str, 0);
    KeyGen keys_end(added_str, num_to_add);

    // A couple more that will probably be added
    KeyGen one_more(prefix + "more", 1);
    KeyGen two_more(prefix + "more", 2);

    // Batch that may or may not be added
    const Index kBatchSize =
        sizeof(CoeffRow) == 16 ? 300 : TypeParam::kUseSmash ? 20 : 10;
    std::string batch_str = prefix + "batch";
    KeyGen batch_begin(batch_str, 0);
    KeyGen batch_end(batch_str, kBatchSize);

    // Batch never (successfully) added, but used for querying FP rate
    std::string not_str = prefix + "not";
    KeyGen other_keys_begin(not_str, 0);
    KeyGen other_keys_end(not_str, kNumToCheck);

    // Vary bytes for InterleavedSoln to use number of solution columns
    // from 0 to max allowed by ResultRow type (and used by SimpleSoln).
    // Specifically include 0 and max, and otherwise skew toward max.
    uint32_t max_ibytes = static_cast<uint32_t>(sizeof(ResultRow) * num_slots);
    size_t ibytes;
    if (i == 0) {
      ibytes = 0;
    } else if (i == 1) {
      ibytes = max_ibytes;
    } else {
      // Skewed
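      // (the max of two uniform draws is skewed toward large values: its
      // CDF is the square of the uniform CDF)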
      ibytes =
          std::max(rnd.Uniformish(max_ibytes), rnd.Uniformish(max_ibytes));
    }
    std::unique_ptr<char[]> idata(new char[ibytes]);
    InterleavedSoln isoln(idata.get(), ibytes);

    SimpleSoln soln;
    Hasher hasher;
    bool first_single;
    bool second_single;
    bool batch_success;
    {
      Banding banding;
      // Traditional solve for a fixed set.
      ASSERT_TRUE(
          banding.ResetAndFindSeedToSolve(num_slots, keys_begin, keys_end));

      // Now to test backtracking, starting with guaranteed fail. By using
      // the keys that will be used to test FP rate, we are then doing an
      // extra check that after backtracking there are no remnants (e.g. in
      // result side of banding) of these entries.
      Index occupied_count = banding.GetOccupiedCount();
      banding.EnsureBacktrackSize(kNumToCheck);
      EXPECT_FALSE(
          banding.AddRangeOrRollBack(other_keys_begin, other_keys_end));
      EXPECT_EQ(occupied_count, banding.GetOccupiedCount());

      // Check that we still have a good chance of adding a couple more
      // individually
      first_single = banding.Add(*one_more);
      second_single = banding.Add(*two_more);
      Index more_added = (first_single ? 1 : 0) + (second_single ? 1 : 0);
      total_single_failures += 2U - more_added;

      // Or as a batch
      batch_success = banding.AddRangeOrRollBack(batch_begin, batch_end);
      if (batch_success) {
        more_added += kBatchSize;
        ++total_batch_successes;
      }
      EXPECT_LE(banding.GetOccupiedCount(), occupied_count + more_added);

      // Also verify that redundant adds are OK (no effect)
      ASSERT_TRUE(
          banding.AddRange(keys_begin, KeyGen(added_str, num_to_add / 8)));
      EXPECT_LE(banding.GetOccupiedCount(), occupied_count + more_added);

      // Now back-substitution
      soln.BackSubstFrom(banding);
      if (test_interleaved) {
        isoln.BackSubstFrom(banding);
      }

      Seed reseeds = banding.GetOrdinalSeed();
      total_reseeds += reseeds;

      EXPECT_LE(reseeds, 8 + log2_thoroughness);
      if (reseeds > log2_thoroughness + 1) {
        fprintf(
            stderr, "%s high reseeds at %u, %u/%u: %u\n",
            reseeds > log2_thoroughness + 8 ? "ERROR Extremely" : "Somewhat",
            static_cast<unsigned>(i), static_cast<unsigned>(num_to_add),
            static_cast<unsigned>(num_slots), static_cast<unsigned>(reseeds));
      }
      hasher.SetOrdinalSeed(reseeds);
    }
    // soln and hasher now independent of Banding object

    // Verify keys added
    KeyGen cur = keys_begin;
    while (cur != keys_end) {
      ASSERT_TRUE(soln.FilterQuery(*cur, hasher));
      ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*cur, hasher));
      ++cur;
    }
    // We (maybe) snuck these in!
    if (first_single) {
      ASSERT_TRUE(soln.FilterQuery(*one_more, hasher));
      ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*one_more, hasher));
    }
    if (second_single) {
      ASSERT_TRUE(soln.FilterQuery(*two_more, hasher));
      ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*two_more, hasher));
    }
    if (batch_success) {
      cur = batch_begin;
      while (cur != batch_end) {
        ASSERT_TRUE(soln.FilterQuery(*cur, hasher));
        ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*cur, hasher));
        ++cur;
      }
    }

    // Check FP rate (depends only on number of result bits == solution
    // columns)
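    // For SimpleSoln this is pow(0.5, 8 * sizeof(ResultRow)), e.g. with
    // ResultRow = uint8_t a base rate of 2^-8 ~= 0.39%, as also used in the
    // final whole-test check below.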
    Index fp_count = 0;
    cur = other_keys_begin;
    {
      ROCKSDB_NAMESPACE::StopWatchNano timer(ROCKSDB_NAMESPACE::Env::Default(),
                                             true);
      while (cur != other_keys_end) {
        bool fp = soln.FilterQuery(*cur, hasher);
        fp_count += fp ? 1 : 0;
        ++cur;
      }
      soln_query_nanos += timer.ElapsedNanos();
      soln_query_count += kNumToCheck;
    }
    {
      double expected_fp_count = soln.ExpectedFpRate() * kNumToCheck;
      // For expected FP rate, also include false positives due to collisions
      // in Hash value. (Negligible for 64-bit, can matter for 32-bit.)
      double correction =
          kNumToCheck * ExpectedCollisionFpRate(hasher, num_to_add);
      EXPECT_LE(fp_count,
                FrequentPoissonUpperBound(expected_fp_count + correction));
      EXPECT_GE(fp_count,
                FrequentPoissonLowerBound(expected_fp_count + correction));
    }
    total_fp_count += fp_count;

    // And also check FP rate for isoln
    if (test_interleaved) {
      Index ifp_count = 0;
      cur = other_keys_begin;
      ROCKSDB_NAMESPACE::StopWatchNano timer(ROCKSDB_NAMESPACE::Env::Default(),
                                             true);
      while (cur != other_keys_end) {
        ifp_count += isoln.FilterQuery(*cur, hasher) ? 1 : 0;
        ++cur;
      }
      isoln_query_nanos += timer.ElapsedNanos();
      isoln_query_count += kNumToCheck;
      {
        double expected_fp_count = isoln.ExpectedFpRate() * kNumToCheck;
        // For expected FP rate, also include false positives due to collisions
        // in Hash value. (Negligible for 64-bit, can matter for 32-bit.)
        double correction =
            kNumToCheck * ExpectedCollisionFpRate(hasher, num_to_add);
        EXPECT_LE(ifp_count,
                  FrequentPoissonUpperBound(expected_fp_count + correction));
        EXPECT_GE(ifp_count,
                  FrequentPoissonLowerBound(expected_fp_count + correction));
      }
      // Since the bits used in isoln are a subset of the bits used in soln,
      // it cannot have fewer FPs
      EXPECT_GE(ifp_count, fp_count);
    }

    // And compare to Bloom time, for fun
    if (ibytes >= /* minimum Bloom impl bytes*/ 64) {
      Index bfp_count = 0;
      cur = other_keys_begin;
      ROCKSDB_NAMESPACE::StopWatchNano timer(ROCKSDB_NAMESPACE::Env::Default(),
                                             true);
      while (cur != other_keys_end) {
        uint64_t h = hasher.GetHash(*cur);
        uint32_t h1 = ROCKSDB_NAMESPACE::Lower32of64(h);
        uint32_t h2 = sizeof(Hash) >= 8 ? ROCKSDB_NAMESPACE::Upper32of64(h)
                                        : h1 * 0x9e3779b9;
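        // (0x9e3779b9, the 32-bit golden ratio constant, is a common odd
        // multiplier for deriving a second hash when only 32 hash bits are
        // available)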
        bfp_count += ROCKSDB_NAMESPACE::FastLocalBloomImpl::HashMayMatch(
                         h1, h2, static_cast<uint32_t>(ibytes), 6, idata.get())
                         ? 1
                         : 0;
        ++cur;
      }
      bloom_query_nanos += timer.ElapsedNanos();
      // ensure bfp_count is used
      ASSERT_LT(bfp_count, kNumToCheck);
    }
  }

  // "outside" == key not in original set so either negative or false positive
  fprintf(stderr, "Simple outside query, hot, incl hashing, ns/key: %g\n",
          1.0 * soln_query_nanos / soln_query_count);
  fprintf(stderr, "Interleaved outside query, hot, incl hashing, ns/key: %g\n",
          1.0 * isoln_query_nanos / isoln_query_count);
  fprintf(stderr, "Bloom outside query, hot, incl hashing, ns/key: %g\n",
          1.0 * bloom_query_nanos / soln_query_count);

  {
    double average_reseeds = 1.0 * total_reseeds / FLAGS_thoroughness;
    fprintf(stderr, "Average re-seeds: %g\n", average_reseeds);
    // Values above were chosen to target around a 50% encoding success rate
    // per seed (an average of 1.0 re-seeds) or slightly better, but 1.15 is
    // also close enough.
    EXPECT_LE(total_reseeds,
              InfrequentPoissonUpperBound(1.15 * FLAGS_thoroughness));
    // Would use 0.85 here instead of 0.75, but
    // TypesAndSettings_Hash32_SmallKeyGen can "beat the odds" because of
    // sequential keys with a small, cheap hash function. We accept that
    // there are surely inputs that are somewhat bad for this setup, but
    // these somewhat good inputs are probably more likely.
    EXPECT_GE(total_reseeds,
              InfrequentPoissonLowerBound(0.75 * FLAGS_thoroughness));
  }

  {
    uint64_t total_singles = 2 * FLAGS_thoroughness;
    double single_failure_rate = 1.0 * total_single_failures / total_singles;
    fprintf(stderr, "Add'l single, failure rate: %g\n", single_failure_rate);
    // A rough bound (one sided) based on nothing in particular
    double expected_single_failures =
        1.0 * total_singles /
        (sizeof(CoeffRow) == 16 ? 128 : TypeParam::kUseSmash ? 64 : 32);
    EXPECT_LE(total_single_failures,
              InfrequentPoissonUpperBound(expected_single_failures));
  }

  {
    // Counting successes here for Poisson to approximate the Binomial
    // distribution.
    // A rough bound (one sided) based on nothing in particular.
    double expected_batch_successes = 1.0 * FLAGS_thoroughness / 2;
    uint64_t lower_bound =
        InfrequentPoissonLowerBound(expected_batch_successes);
    fprintf(stderr, "Add'l batch, success rate: %g (>= %g)\n",
            1.0 * total_batch_successes / FLAGS_thoroughness,
            1.0 * lower_bound / FLAGS_thoroughness);
    EXPECT_GE(total_batch_successes, lower_bound);
  }

  {
    uint64_t total_checked = uint64_t{kNumToCheck} * FLAGS_thoroughness;
    double expected_total_fp_count =
        total_checked * std::pow(0.5, 8U * sizeof(ResultRow));
    // For expected FP rate, also include false positives due to collisions
    // in Hash value. (Negligible for 64-bit, can matter for 32-bit.)
    double average_added = 1.0 * total_added / FLAGS_thoroughness;
    expected_total_fp_count +=
        total_checked * ExpectedCollisionFpRate(Hasher(), average_added);

    uint64_t upper_bound =
        InfrequentPoissonUpperBound(expected_total_fp_count);
    uint64_t lower_bound =
        InfrequentPoissonLowerBound(expected_total_fp_count);
    fprintf(stderr, "Average FP rate: %g (~= %g, <= %g, >= %g)\n",
            1.0 * total_fp_count / total_checked,
            expected_total_fp_count / total_checked,
            1.0 * upper_bound / total_checked,
            1.0 * lower_bound / total_checked);
    EXPECT_LE(total_fp_count, upper_bound);
    EXPECT_GE(total_fp_count, lower_bound);
  }
}

TYPED_TEST(RibbonTypeParamTest, Extremes) {
  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
  IMPORT_RIBBON_IMPL_TYPES(TypeParam);
  using KeyGen = typename TypeParam::KeyGen;

  size_t bytes = 128 * 1024;
  std::unique_ptr<char[]> buf(new char[bytes]);
  InterleavedSoln isoln(buf.get(), bytes);
  SimpleSoln soln;
  Hasher hasher;
  Banding banding;

  // ########################################
  // Add zero keys to minimal number of slots
  KeyGen begin_and_end("foo", 123);
  ASSERT_TRUE(banding.ResetAndFindSeedToSolve(
      /*slots*/ kCoeffBits, begin_and_end, begin_and_end, /*first seed*/ 0,
      /* seed mask*/ 0));

  soln.BackSubstFrom(banding);
  isoln.BackSubstFrom(banding);

  // Because there's plenty of memory, we expect the interleaved solution to
  // use maximum supported columns (same as simple solution)
  ASSERT_EQ(isoln.GetUpperNumColumns(), 8U * sizeof(ResultRow));
  ASSERT_EQ(isoln.GetUpperStartBlock(), 0U);

  // Somewhat oddly, we expect the same FP rate as if we had essentially
  // filled up the slots.
  constexpr Index kNumToCheck = 100000;
  KeyGen other_keys_begin("not", 0);
  KeyGen other_keys_end("not", kNumToCheck);

  Index fp_count = 0;
  KeyGen cur = other_keys_begin;
  while (cur != other_keys_end) {
    bool isoln_query_result = isoln.FilterQuery(*cur, hasher);
    bool soln_query_result = soln.FilterQuery(*cur, hasher);
    // Solutions are equivalent
    ASSERT_EQ(isoln_query_result, soln_query_result);
    // And in fact we only expect an FP when ResultRow is 0
    // CHANGE: no longer true because of filling some unused slots
    // with pseudorandom values.
    // ASSERT_EQ(soln_query_result, hasher.GetResultRowFromHash(
    //                                  hasher.GetHash(*cur)) == ResultRow{0});
    fp_count += soln_query_result ? 1 : 0;
    ++cur;
  }
  {
    ASSERT_EQ(isoln.ExpectedFpRate(), soln.ExpectedFpRate());
    double expected_fp_count = isoln.ExpectedFpRate() * kNumToCheck;
    EXPECT_LE(fp_count, InfrequentPoissonUpperBound(expected_fp_count));
    EXPECT_GE(fp_count, InfrequentPoissonLowerBound(expected_fp_count));
  }

  // ######################################################
  // Use zero bytes for interleaved solution (key(s) added)

  // Add one key
  KeyGen key_begin("added", 0);
  KeyGen key_end("added", 1);
  ASSERT_TRUE(banding.ResetAndFindSeedToSolve(
      /*slots*/ kCoeffBits, key_begin, key_end, /*first seed*/ 0,
      /* seed mask*/ 0));

  InterleavedSoln isoln2(nullptr, /*bytes*/ 0);

  isoln2.BackSubstFrom(banding);

  ASSERT_EQ(isoln2.GetUpperNumColumns(), 0U);
  ASSERT_EQ(isoln2.GetUpperStartBlock(), 0U);

  // All queries return true
  ASSERT_TRUE(isoln2.FilterQuery(*other_keys_begin, hasher));
  ASSERT_EQ(isoln2.ExpectedFpRate(), 1.0);
}

TEST(RibbonTest, AllowZeroStarts) {
  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings_AllowZeroStarts);
  IMPORT_RIBBON_IMPL_TYPES(TypesAndSettings_AllowZeroStarts);
  using KeyGen = StandardKeyGen;

  InterleavedSoln isoln(nullptr, /*bytes*/ 0);
  SimpleSoln soln;
  Hasher hasher;
  Banding banding;

  KeyGen begin("foo", 0);
  KeyGen end("foo", 1);
  // Can't add 1 entry
  ASSERT_FALSE(banding.ResetAndFindSeedToSolve(/*slots*/ 0, begin, end));

  KeyGen begin_and_end("foo", 123);
  // Can add 0 entries
  ASSERT_TRUE(banding.ResetAndFindSeedToSolve(/*slots*/ 0, begin_and_end,
                                              begin_and_end));

  Seed reseeds = banding.GetOrdinalSeed();
  ASSERT_EQ(reseeds, 0U);
  hasher.SetOrdinalSeed(reseeds);

  // Can construct 0-slot solutions
  isoln.BackSubstFrom(banding);
  soln.BackSubstFrom(banding);

  // Should always return false
  ASSERT_FALSE(isoln.FilterQuery(*begin, hasher));
  ASSERT_FALSE(soln.FilterQuery(*begin, hasher));

  // And report that in FP rate
  ASSERT_EQ(isoln.ExpectedFpRate(), 0.0);
  ASSERT_EQ(soln.ExpectedFpRate(), 0.0);
}

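// Ordinal seeds are the small sequential values (0, 1, 2, ...) tried when
// solving; StandardHasher premixes each into a wider "raw" seed before use.
// This test checks documented properties of that mapping, including that
// ordinal seed 0 maps to raw seed 0 and that the mapping is one-to-one on a
// low-bit prefix.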
TEST(RibbonTest, RawAndOrdinalSeeds) {
  StandardHasher<TypesAndSettings_Seed64> hasher64;
  StandardHasher<DefaultTypesAndSettings> hasher64_32;
  StandardHasher<TypesAndSettings_Hash32> hasher32;
  StandardHasher<TypesAndSettings_Seed8> hasher8;

  for (uint32_t limit : {0xffU, 0xffffU}) {
    std::vector<bool> seen(limit + 1);
    for (uint32_t i = 0; i < limit; ++i) {
      hasher64.SetOrdinalSeed(i);
      auto raw64 = hasher64.GetRawSeed();
      hasher32.SetOrdinalSeed(i);
      auto raw32 = hasher32.GetRawSeed();
      hasher8.SetOrdinalSeed(static_cast<uint8_t>(i));
      auto raw8 = hasher8.GetRawSeed();
      {
        hasher64_32.SetOrdinalSeed(i);
        auto raw64_32 = hasher64_32.GetRawSeed();
        ASSERT_EQ(raw64_32, raw32);  // Same size seed
      }
      if (i == 0) {
        // Documented that ordinal seed 0 == raw seed 0
        ASSERT_EQ(raw64, 0U);
        ASSERT_EQ(raw32, 0U);
        ASSERT_EQ(raw8, 0U);
      } else {
        // Extremely likely that upper bits are set
        ASSERT_GT(raw64, raw32);
        ASSERT_GT(raw32, raw8);
      }
      // Hashers agree on lower bits
      ASSERT_EQ(static_cast<uint32_t>(raw64), raw32);
      ASSERT_EQ(static_cast<uint8_t>(raw32), raw8);

      // The translation is one-to-one for this size prefix
      uint32_t v = static_cast<uint32_t>(raw32 & limit);
      ASSERT_EQ(raw64 & limit, v);
      ASSERT_FALSE(seen[v]);
      seen[v] = true;
    }
  }
}

namespace {

struct PhsfInputGen {
  PhsfInputGen(const std::string& prefix, uint64_t id) : id_(id) {
    val_.first = prefix;
    ROCKSDB_NAMESPACE::PutFixed64(&val_.first, /*placeholder*/ 0);
  }

  // Prefix increment (only the prefix form is required)
  PhsfInputGen& operator++() {
    ++id_;
    return *this;
  }

  const std::pair<std::string, uint8_t>& operator*() {
    // Use multiplication to mix things up a little in the key
    ROCKSDB_NAMESPACE::EncodeFixed64(&val_.first[val_.first.size() - 8],
                                     id_ * uint64_t{0x1500000001});
    // Occasionally repeat values etc.
    val_.second = static_cast<uint8_t>(id_ * 7 / 8);
    return val_;
  }

  const std::pair<std::string, uint8_t>* operator->() { return &**this; }

  bool operator==(const PhsfInputGen& other) {
    // Same prefix is assumed
    return id_ == other.id_;
  }
  bool operator!=(const PhsfInputGen& other) {
    // Same prefix is assumed
    return id_ != other.id_;
  }

  uint64_t id_;
  std::pair<std::string, uint8_t> val_;
};

struct PhsfTypesAndSettings : public DefaultTypesAndSettings {
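  // kIsFilter = false configures Ribbon as a PHSF (perfect hash static
  // function): queries return the ResultRow value associated with each
  // added key (see PhsfQuery below) rather than a membership bit.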
  static constexpr bool kIsFilter = false;
};
}  // namespace

TEST(RibbonTest, PhsfBasic) {
  IMPORT_RIBBON_TYPES_AND_SETTINGS(PhsfTypesAndSettings);
  IMPORT_RIBBON_IMPL_TYPES(PhsfTypesAndSettings);

  Index num_slots = 12800;
  Index num_to_add = static_cast<Index>(num_slots / 1.02);

  PhsfInputGen begin("in", 0);
  PhsfInputGen end("in", num_to_add);

  std::unique_ptr<char[]> idata(new char[/*bytes*/ num_slots]);
  InterleavedSoln isoln(idata.get(), /*bytes*/ num_slots);
  SimpleSoln soln;
  Hasher hasher;

  {
    Banding banding;
    ASSERT_TRUE(banding.ResetAndFindSeedToSolve(num_slots, begin, end));

    soln.BackSubstFrom(banding);
    isoln.BackSubstFrom(banding);

    hasher.SetOrdinalSeed(banding.GetOrdinalSeed());
  }

  for (PhsfInputGen cur = begin; cur != end; ++cur) {
    ASSERT_EQ(cur->second, soln.PhsfQuery(cur->first, hasher));
    ASSERT_EQ(cur->second, isoln.PhsfQuery(cur->first, hasher));
  }
}

// Not a real test, but a tool used to build GetNumSlotsFor95PctSuccess
TYPED_TEST(RibbonTypeParamTest, FindOccupancyForSuccessRate) {
  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
  IMPORT_RIBBON_IMPL_TYPES(TypeParam);
  using KeyGen = typename TypeParam::KeyGen;

  if (!FLAGS_find_occ) {
    fprintf(stderr, "Tool disabled during unit test runs\n");
    return;
  }

  KeyGen cur("blah", 0);

  Banding banding;
  Index num_slots = InterleavedSoln::RoundUpNumSlots(FLAGS_find_min_slots);
  while (num_slots < FLAGS_find_max_slots) {
    double factor = 0.95;
    double delta = FLAGS_find_delta_start;
    while (delta > FLAGS_find_delta_end) {
      Index num_to_add = static_cast<Index>(factor * num_slots);
      KeyGen end = cur;
      end += num_to_add;
      bool success =
          banding.ResetAndFindSeedToSolve(num_slots, cur, end, 0, 0);
      cur = end;  // fresh keys
      if (success) {
        factor += delta * (1.0 - FLAGS_find_success);
        factor = std::min(factor, 1.0);
      } else {
        factor -= delta * FLAGS_find_success;
        factor = std::max(factor, 0.0);
      }
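      // Stochastic-approximation-style update: a success raises factor by
      // delta * (1 - p) and a failure lowers it by delta * p, so the update
      // is balanced (stationary) exactly where the solve success rate equals
      // p = FLAGS_find_success, and shrinking delta settles factor there.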
      delta *= FLAGS_find_delta_shrink;
      fprintf(stderr,
              "slots: %u log2_slots: %g target_success: %g ->overhead: %g\r",
              static_cast<unsigned>(num_slots),
              std::log(num_slots * 1.0) / std::log(2.0), FLAGS_find_success,
              1.0 / factor);
    }
    fprintf(stderr, "\n");

    num_slots = std::max(
        num_slots + 1, static_cast<Index>(num_slots * FLAGS_find_next_factor));
    num_slots = InterleavedSoln::RoundUpNumSlots(num_slots);
  }
}

// TODO: unit tests for configuration APIs
// TODO: unit tests for small filter FP rates

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
#ifdef GFLAGS
  ParseCommandLineFlags(&argc, &argv, true);
#endif  // GFLAGS
  return RUN_ALL_TESTS();
}