]>
Commit | Line | Data |
---|---|---|
20effc67 TL |
1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. |
2 | // This source code is licensed under both the GPLv2 (found in the | |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
5 | ||
6 | #include <cmath> | |
7 | ||
8 | #include "test_util/testharness.h" | |
9 | #include "util/bloom_impl.h" | |
10 | #include "util/coding.h" | |
11 | #include "util/hash.h" | |
12 | #include "util/ribbon_impl.h" | |
13 | #include "util/stop_watch.h" | |
14 | ||
#ifndef GFLAGS
// Fallback definitions when built without gflags: fixed defaults, no
// command-line overrides possible.
uint32_t FLAGS_thoroughness = 5;
bool FLAGS_find_occ = false;
double FLAGS_find_next_factor = 1.414;
double FLAGS_find_success = 0.95;
double FLAGS_find_delta_start = 0.01;
double FLAGS_find_delta_end = 0.0001;
double FLAGS_find_delta_shrink = 0.99;
uint32_t FLAGS_find_min_slots = 128;
uint32_t FLAGS_find_max_slots = 12800000;
#else
#include "util/gflags_compat.h"
using GFLAGS_NAMESPACE::ParseCommandLineFlags;
// Using 500 is a good test when you have time to be thorough.
// Default is for general RocksDB regression test runs.
DEFINE_uint32(thoroughness, 5, "iterations per configuration");

// Options for FindOccupancyForSuccessRate, which is more of a tool
// than a test.
DEFINE_bool(find_occ, false,
            "whether to run the FindOccupancyForSuccessRate tool");
// FIX: help text previously said "target success rate" (copy-paste from
// find_success); this flag is the multiplier applied to get the next
// number of slots to try.
DEFINE_double(find_next_factor, 1.414,
              "slot count multiplier to next size for "
              "FindOccupancyForSuccessRate");
DEFINE_double(find_success, 0.95,
              "target success rate for FindOccupancyForSuccessRate");
DEFINE_double(find_delta_start, 0.01, " for FindOccupancyForSuccessRate");
DEFINE_double(find_delta_end, 0.0001, " for FindOccupancyForSuccessRate");
DEFINE_double(find_delta_shrink, 0.99, " for FindOccupancyForSuccessRate");
// FIX: min/max flags previously carried identical descriptions.
DEFINE_uint32(find_min_slots, 128,
              "minimum number of slots for FindOccupancyForSuccessRate");
DEFINE_uint32(find_max_slots, 12800000,
              "maximum number of slots for FindOccupancyForSuccessRate");
#endif  // GFLAGS
48 | ||
// Type-parameterized fixture: instantiated once per TypesAndSettings
// configuration listed in TestTypesAndSettings below.
template <typename TypesAndSettings>
class RibbonTypeParamTest : public ::testing::Test {};

// Plain fixture for tests that choose a single fixed configuration.
class RibbonTest : public ::testing::Test {};
53 | ||
54 | namespace { | |
55 | ||
56 | // Different ways of generating keys for testing | |
57 | ||
58 | // Generate semi-sequential keys | |
59 | struct StandardKeyGen { | |
60 | StandardKeyGen(const std::string& prefix, uint64_t id) | |
61 | : id_(id), str_(prefix) { | |
62 | ROCKSDB_NAMESPACE::PutFixed64(&str_, /*placeholder*/ 0); | |
63 | } | |
64 | ||
65 | // Prefix (only one required) | |
66 | StandardKeyGen& operator++() { | |
67 | ++id_; | |
68 | return *this; | |
69 | } | |
70 | ||
71 | StandardKeyGen& operator+=(uint64_t i) { | |
72 | id_ += i; | |
73 | return *this; | |
74 | } | |
75 | ||
76 | const std::string& operator*() { | |
77 | // Use multiplication to mix things up a little in the key | |
78 | ROCKSDB_NAMESPACE::EncodeFixed64(&str_[str_.size() - 8], | |
79 | id_ * uint64_t{0x1500000001}); | |
80 | return str_; | |
81 | } | |
82 | ||
83 | bool operator==(const StandardKeyGen& other) { | |
84 | // Same prefix is assumed | |
85 | return id_ == other.id_; | |
86 | } | |
87 | bool operator!=(const StandardKeyGen& other) { | |
88 | // Same prefix is assumed | |
89 | return id_ != other.id_; | |
90 | } | |
91 | ||
92 | uint64_t id_; | |
93 | std::string str_; | |
94 | }; | |
95 | ||
96 | // Generate small sequential keys, that can misbehave with sequential seeds | |
97 | // as in https://github.com/Cyan4973/xxHash/issues/469. | |
98 | // These keys are only heuristically unique, but that's OK with 64 bits, | |
99 | // for testing purposes. | |
100 | struct SmallKeyGen { | |
101 | SmallKeyGen(const std::string& prefix, uint64_t id) : id_(id) { | |
102 | // Hash the prefix for a heuristically unique offset | |
103 | id_ += ROCKSDB_NAMESPACE::GetSliceHash64(prefix); | |
104 | ROCKSDB_NAMESPACE::PutFixed64(&str_, id_); | |
105 | } | |
106 | ||
107 | // Prefix (only one required) | |
108 | SmallKeyGen& operator++() { | |
109 | ++id_; | |
110 | return *this; | |
111 | } | |
112 | ||
113 | SmallKeyGen& operator+=(uint64_t i) { | |
114 | id_ += i; | |
115 | return *this; | |
116 | } | |
117 | ||
118 | const std::string& operator*() { | |
119 | ROCKSDB_NAMESPACE::EncodeFixed64(&str_[str_.size() - 8], id_); | |
120 | return str_; | |
121 | } | |
122 | ||
123 | bool operator==(const SmallKeyGen& other) { return id_ == other.id_; } | |
124 | bool operator!=(const SmallKeyGen& other) { return id_ != other.id_; } | |
125 | ||
126 | uint64_t id_; | |
127 | std::string str_; | |
128 | }; | |
129 | ||
130 | template <typename KeyGen> | |
131 | struct Hash32KeyGenWrapper : public KeyGen { | |
132 | Hash32KeyGenWrapper(const std::string& prefix, uint64_t id) | |
133 | : KeyGen(prefix, id) {} | |
134 | uint32_t operator*() { | |
135 | auto& key = *static_cast<KeyGen&>(*this); | |
136 | // unseeded | |
137 | return ROCKSDB_NAMESPACE::GetSliceHash(key); | |
138 | } | |
139 | }; | |
140 | ||
141 | template <typename KeyGen> | |
142 | struct Hash64KeyGenWrapper : public KeyGen { | |
143 | Hash64KeyGenWrapper(const std::string& prefix, uint64_t id) | |
144 | : KeyGen(prefix, id) {} | |
145 | uint64_t operator*() { | |
146 | auto& key = *static_cast<KeyGen&>(*this); | |
147 | // unseeded | |
148 | return ROCKSDB_NAMESPACE::GetSliceHash64(key); | |
149 | } | |
150 | }; | |
151 | ||
152 | } // namespace | |
153 | ||
154 | using ROCKSDB_NAMESPACE::ribbon::ExpectedCollisionFpRate; | |
155 | using ROCKSDB_NAMESPACE::ribbon::StandardHasher; | |
156 | using ROCKSDB_NAMESPACE::ribbon::StandardRehasherAdapter; | |
157 | ||
// Baseline Ribbon configuration used (and selectively overridden) by the
// variants below: 128-bit coefficient rows, 8-bit result rows, filter mode,
// 64-bit hash with 32-bit seed.
struct DefaultTypesAndSettings {
  using CoeffRow = ROCKSDB_NAMESPACE::Unsigned128;
  using ResultRow = uint8_t;
  using Index = uint32_t;
  using Hash = uint64_t;
  using Seed = uint32_t;
  using Key = ROCKSDB_NAMESPACE::Slice;
  static constexpr bool kIsFilter = true;
  static constexpr bool kFirstCoeffAlwaysOne = true;
  static constexpr bool kUseSmash = false;
  static constexpr bool kAllowZeroStarts = false;
  static Hash HashFn(const Key& key, uint64_t raw_seed) {
    // This version 0.7.2 preview of XXH3 (a.k.a. XXH3p) function does
    // not pass SmallKeyGen tests below without some seed premixing from
    // StandardHasher. See https://github.com/Cyan4973/xxHash/issues/469
    return ROCKSDB_NAMESPACE::Hash64(key.data(), key.size(), raw_seed);
  }
  // For testing
  using KeyGen = StandardKeyGen;
};
178 | ||
// ---- Configuration variants for the type-parameterized tests ----
// Each struct overrides exactly the members named, inheriting the rest
// from DefaultTypesAndSettings (or from another variant).

using TypesAndSettings_Coeff128 = DefaultTypesAndSettings;
struct TypesAndSettings_Coeff128Smash : public DefaultTypesAndSettings {
  static constexpr bool kUseSmash = true;
};
struct TypesAndSettings_Coeff64 : public DefaultTypesAndSettings {
  using CoeffRow = uint64_t;
};
struct TypesAndSettings_Coeff64Smash1 : public DefaultTypesAndSettings {
  using CoeffRow = uint64_t;
  static constexpr bool kUseSmash = true;
};
struct TypesAndSettings_Coeff64Smash0 : public TypesAndSettings_Coeff64Smash1 {
  static constexpr bool kFirstCoeffAlwaysOne = false;
};
struct TypesAndSettings_Result16 : public DefaultTypesAndSettings {
  using ResultRow = uint16_t;
};
struct TypesAndSettings_Result32 : public DefaultTypesAndSettings {
  using ResultRow = uint32_t;
};
struct TypesAndSettings_IndexSizeT : public DefaultTypesAndSettings {
  using Index = size_t;
};
struct TypesAndSettings_Hash32 : public DefaultTypesAndSettings {
  using Hash = uint32_t;
  static Hash HashFn(const Key& key, Hash raw_seed) {
    // This MurmurHash1 function does not pass tests below without the
    // seed premixing from StandardHasher. In fact, it needs more than
    // just a multiplication mixer on the ordinal seed.
    return ROCKSDB_NAMESPACE::Hash(key.data(), key.size(), raw_seed);
  }
};
struct TypesAndSettings_Hash32_Result16 : public TypesAndSettings_Hash32 {
  using ResultRow = uint16_t;
};
struct TypesAndSettings_KeyString : public DefaultTypesAndSettings {
  using Key = std::string;
};
struct TypesAndSettings_Seed8 : public DefaultTypesAndSettings {
  // This is not a generally recommended configuration. With the configured
  // hash function, it would fail with SmallKeyGen due to insufficient
  // independence among the seeds.
  using Seed = uint8_t;
};
struct TypesAndSettings_NoAlwaysOne : public DefaultTypesAndSettings {
  static constexpr bool kFirstCoeffAlwaysOne = false;
};
struct TypesAndSettings_AllowZeroStarts : public DefaultTypesAndSettings {
  static constexpr bool kAllowZeroStarts = true;
};
struct TypesAndSettings_Seed64 : public DefaultTypesAndSettings {
  using Seed = uint64_t;
};
// Rehasher variants: keys are pre-hashed values, re-hashed by the adapter
struct TypesAndSettings_Rehasher
    : public StandardRehasherAdapter<DefaultTypesAndSettings> {
  using KeyGen = Hash64KeyGenWrapper<StandardKeyGen>;
};
struct TypesAndSettings_Rehasher_Result16 : public TypesAndSettings_Rehasher {
  using ResultRow = uint16_t;
};
struct TypesAndSettings_Rehasher_Result32 : public TypesAndSettings_Rehasher {
  using ResultRow = uint32_t;
};
struct TypesAndSettings_Rehasher_Seed64
    : public StandardRehasherAdapter<TypesAndSettings_Seed64> {
  using KeyGen = Hash64KeyGenWrapper<StandardKeyGen>;
  // Note: 64-bit seed with Rehasher gives slightly better average reseeds
};
struct TypesAndSettings_Rehasher32
    : public StandardRehasherAdapter<TypesAndSettings_Hash32> {
  using KeyGen = Hash32KeyGenWrapper<StandardKeyGen>;
};
struct TypesAndSettings_Rehasher32_Coeff64
    : public TypesAndSettings_Rehasher32 {
  using CoeffRow = uint64_t;
};
struct TypesAndSettings_SmallKeyGen : public DefaultTypesAndSettings {
  // SmallKeyGen stresses the independence of different hash seeds
  using KeyGen = SmallKeyGen;
};
struct TypesAndSettings_Hash32_SmallKeyGen : public TypesAndSettings_Hash32 {
  // SmallKeyGen stresses the independence of different hash seeds
  using KeyGen = SmallKeyGen;
};
263 | ||
// All configurations exercised by the RibbonTypeParamTest tests
using TestTypesAndSettings = ::testing::Types<
    TypesAndSettings_Coeff128, TypesAndSettings_Coeff128Smash,
    TypesAndSettings_Coeff64, TypesAndSettings_Coeff64Smash0,
    TypesAndSettings_Coeff64Smash1, TypesAndSettings_Result16,
    TypesAndSettings_Result32, TypesAndSettings_IndexSizeT,
    TypesAndSettings_Hash32, TypesAndSettings_Hash32_Result16,
    TypesAndSettings_KeyString, TypesAndSettings_Seed8,
    TypesAndSettings_NoAlwaysOne, TypesAndSettings_AllowZeroStarts,
    TypesAndSettings_Seed64, TypesAndSettings_Rehasher,
    TypesAndSettings_Rehasher_Result16, TypesAndSettings_Rehasher_Result32,
    TypesAndSettings_Rehasher_Seed64, TypesAndSettings_Rehasher32,
    TypesAndSettings_Rehasher32_Coeff64, TypesAndSettings_SmallKeyGen,
    TypesAndSettings_Hash32_SmallKeyGen>;
// NOTE(review): TYPED_TEST_CASE is the legacy gtest spelling of
// TYPED_TEST_SUITE; kept as-is for compatibility with the bundled gtest.
TYPED_TEST_CASE(RibbonTypeParamTest, TestTypesAndSettings);
278 | ||
namespace {

// For testing Poisson-distributed (or similar) statistics: value at
// `stddevs_allowed` standard deviations above the expected mean
// `expected_count`. (Poisson approximates Binomial only if the probability
// of a trial being in the count is low.)
uint64_t PoissonUpperBound(double expected_count, double stddevs_allowed) {
  const double slack = stddevs_allowed * std::sqrt(expected_count);
  return static_cast<uint64_t>(expected_count + slack + 1.0);
}

// Corresponding lower bound, clamped at zero.
uint64_t PoissonLowerBound(double expected_count, double stddevs_allowed) {
  const double slack = stddevs_allowed * std::sqrt(expected_count);
  return static_cast<uint64_t>(std::max(0.0, expected_count - slack));
}

// Allow up to 5.0 standard deviations for frequently checked statistics
uint64_t FrequentPoissonUpperBound(double expected_count) {
  return PoissonUpperBound(expected_count, /*stddevs_allowed=*/5.0);
}

uint64_t FrequentPoissonLowerBound(double expected_count) {
  return PoissonLowerBound(expected_count, /*stddevs_allowed=*/5.0);
}

// Allow up to 3 standard deviations for infrequently checked statistics
uint64_t InfrequentPoissonUpperBound(double expected_count) {
  return PoissonUpperBound(expected_count, /*stddevs_allowed=*/3.0);
}

uint64_t InfrequentPoissonLowerBound(double expected_count) {
  return PoissonLowerBound(expected_count, /*stddevs_allowed=*/3.0);
}

}  // namespace
315 | ||
// Main stress test per configuration: repeatedly builds filters near the
// compactness limit, exercises backtracking (rollback), back-substitution
// into both solution representations, and checks FP rates and reseed counts
// against Poisson bounds.
TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
  IMPORT_RIBBON_IMPL_TYPES(TypeParam);
  using KeyGen = typename TypeParam::KeyGen;

  // For testing FP rate etc.
  constexpr Index kNumToCheck = 100000;

  const auto log2_thoroughness =
      static_cast<Hash>(ROCKSDB_NAMESPACE::FloorLog2(FLAGS_thoroughness));

  // With overhead of just 2%, expect ~50% encoding success per
  // seed with ~5k keys on 64-bit ribbon, or ~150k keys on 128-bit ribbon.
  const double kFactor = 1.02;

  // Aggregated statistics across iterations, verified at the end
  uint64_t total_reseeds = 0;
  uint64_t total_single_failures = 0;
  uint64_t total_batch_successes = 0;
  uint64_t total_fp_count = 0;
  uint64_t total_added = 0;

  // Rough query timing, reported (not asserted) at the end
  uint64_t soln_query_nanos = 0;
  uint64_t soln_query_count = 0;
  uint64_t bloom_query_nanos = 0;
  uint64_t isoln_query_nanos = 0;
  uint64_t isoln_query_count = 0;

  // Take different samples if you change thoroughness
  ROCKSDB_NAMESPACE::Random32 rnd(FLAGS_thoroughness);

  for (uint32_t i = 0; i < FLAGS_thoroughness; ++i) {
    uint32_t num_to_add =
        sizeof(CoeffRow) == 16 ? 130000 : TypeParam::kUseSmash ? 5500 : 2500;

    // Use different values between that number and 50% of that number
    num_to_add -= rnd.Uniformish(num_to_add / 2);

    total_added += num_to_add;

    // Most of the time, test the Interleaved solution storage, but when
    // we do we have to make num_slots a multiple of kCoeffBits. So
    // sometimes we want to test without that limitation.
    bool test_interleaved = (i % 7) != 6;

    Index num_slots = static_cast<Index>(num_to_add * kFactor);
    if (test_interleaved) {
      // Round to supported number of slots
      num_slots = InterleavedSoln::RoundUpNumSlots(num_slots);
      // Re-adjust num_to_add to get as close as possible to kFactor
      num_to_add = static_cast<uint32_t>(num_slots / kFactor);
    }

    // Randomized per-iteration key prefix so iterations are independent
    std::string prefix;
    ROCKSDB_NAMESPACE::PutFixed32(&prefix, rnd.Next());

    // Batch that must be added
    std::string added_str = prefix + "added";
    KeyGen keys_begin(added_str, 0);
    KeyGen keys_end(added_str, num_to_add);

    // A couple more that will probably be added
    KeyGen one_more(prefix + "more", 1);
    KeyGen two_more(prefix + "more", 2);

    // Batch that may or may not be added
    const Index kBatchSize =
        sizeof(CoeffRow) == 16 ? 300 : TypeParam::kUseSmash ? 20 : 10;
    std::string batch_str = prefix + "batch";
    KeyGen batch_begin(batch_str, 0);
    KeyGen batch_end(batch_str, kBatchSize);

    // Batch never (successfully) added, but used for querying FP rate
    std::string not_str = prefix + "not";
    KeyGen other_keys_begin(not_str, 0);
    KeyGen other_keys_end(not_str, kNumToCheck);

    // Vary bytes for InterleavedSoln to use number of solution columns
    // from 0 to max allowed by ResultRow type (and used by SimpleSoln).
    // Specifically include 0 and max, and otherwise skew toward max.
    uint32_t max_ibytes = static_cast<uint32_t>(sizeof(ResultRow) * num_slots);
    size_t ibytes;
    if (i == 0) {
      ibytes = 0;
    } else if (i == 1) {
      ibytes = max_ibytes;
    } else {
      // Skewed (max of two uniform draws biases toward max_ibytes)
      ibytes = std::max(rnd.Uniformish(max_ibytes), rnd.Uniformish(max_ibytes));
    }
    std::unique_ptr<char[]> idata(new char[ibytes]);
    InterleavedSoln isoln(idata.get(), ibytes);

    SimpleSoln soln;
    Hasher hasher;
    bool first_single;
    bool second_single;
    bool batch_success;
    {
      Banding banding;
      // Traditional solve for a fixed set.
      ASSERT_TRUE(
          banding.ResetAndFindSeedToSolve(num_slots, keys_begin, keys_end));

      // Now to test backtracking, starting with guaranteed fail. By using
      // the keys that will be used to test FP rate, we are then doing an
      // extra check that after backtracking there are no remnants (e.g. in
      // result side of banding) of these entries.
      Index occupied_count = banding.GetOccupiedCount();
      banding.EnsureBacktrackSize(kNumToCheck);
      EXPECT_FALSE(
          banding.AddRangeOrRollBack(other_keys_begin, other_keys_end));
      EXPECT_EQ(occupied_count, banding.GetOccupiedCount());

      // Check that we still have a good chance of adding a couple more
      // individually
      first_single = banding.Add(*one_more);
      second_single = banding.Add(*two_more);
      Index more_added = (first_single ? 1 : 0) + (second_single ? 1 : 0);
      total_single_failures += 2U - more_added;

      // Or as a batch
      batch_success = banding.AddRangeOrRollBack(batch_begin, batch_end);
      if (batch_success) {
        more_added += kBatchSize;
        ++total_batch_successes;
      }
      EXPECT_LE(banding.GetOccupiedCount(), occupied_count + more_added);

      // Also verify that redundant adds are OK (no effect)
      ASSERT_TRUE(
          banding.AddRange(keys_begin, KeyGen(added_str, num_to_add / 8)));
      EXPECT_LE(banding.GetOccupiedCount(), occupied_count + more_added);

      // Now back-substitution
      soln.BackSubstFrom(banding);
      if (test_interleaved) {
        isoln.BackSubstFrom(banding);
      }

      Seed reseeds = banding.GetOrdinalSeed();
      total_reseeds += reseeds;

      EXPECT_LE(reseeds, 8 + log2_thoroughness);
      if (reseeds > log2_thoroughness + 1) {
        fprintf(
            stderr, "%s high reseeds at %u, %u/%u: %u\n",
            reseeds > log2_thoroughness + 8 ? "ERROR Extremely" : "Somewhat",
            static_cast<unsigned>(i), static_cast<unsigned>(num_to_add),
            static_cast<unsigned>(num_slots), static_cast<unsigned>(reseeds));
      }
      hasher.SetOrdinalSeed(reseeds);
    }
    // soln and hasher now independent of Banding object

    // Verify keys added
    KeyGen cur = keys_begin;
    while (cur != keys_end) {
      ASSERT_TRUE(soln.FilterQuery(*cur, hasher));
      ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*cur, hasher));
      ++cur;
    }
    // We (maybe) snuck these in!
    if (first_single) {
      ASSERT_TRUE(soln.FilterQuery(*one_more, hasher));
      ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*one_more, hasher));
    }
    if (second_single) {
      ASSERT_TRUE(soln.FilterQuery(*two_more, hasher));
      ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*two_more, hasher));
    }
    if (batch_success) {
      cur = batch_begin;
      while (cur != batch_end) {
        ASSERT_TRUE(soln.FilterQuery(*cur, hasher));
        ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*cur, hasher));
        ++cur;
      }
    }

    // Check FP rate (depends only on number of result bits == solution columns)
    Index fp_count = 0;
    cur = other_keys_begin;
    {
      ROCKSDB_NAMESPACE::StopWatchNano timer(ROCKSDB_NAMESPACE::Env::Default(),
                                             true);
      while (cur != other_keys_end) {
        bool fp = soln.FilterQuery(*cur, hasher);
        fp_count += fp ? 1 : 0;
        ++cur;
      }
      soln_query_nanos += timer.ElapsedNanos();
      soln_query_count += kNumToCheck;
    }
    {
      double expected_fp_count = soln.ExpectedFpRate() * kNumToCheck;
      // For expected FP rate, also include false positives due to collisions
      // in Hash value. (Negligible for 64-bit, can matter for 32-bit.)
      double correction =
          kNumToCheck * ExpectedCollisionFpRate(hasher, num_to_add);
      EXPECT_LE(fp_count,
                FrequentPoissonUpperBound(expected_fp_count + correction));
      EXPECT_GE(fp_count,
                FrequentPoissonLowerBound(expected_fp_count + correction));
    }
    total_fp_count += fp_count;

    // And also check FP rate for isoln
    if (test_interleaved) {
      Index ifp_count = 0;
      cur = other_keys_begin;
      ROCKSDB_NAMESPACE::StopWatchNano timer(ROCKSDB_NAMESPACE::Env::Default(),
                                             true);
      while (cur != other_keys_end) {
        ifp_count += isoln.FilterQuery(*cur, hasher) ? 1 : 0;
        ++cur;
      }
      isoln_query_nanos += timer.ElapsedNanos();
      isoln_query_count += kNumToCheck;
      {
        double expected_fp_count = isoln.ExpectedFpRate() * kNumToCheck;
        // For expected FP rate, also include false positives due to collisions
        // in Hash value. (Negligible for 64-bit, can matter for 32-bit.)
        double correction =
            kNumToCheck * ExpectedCollisionFpRate(hasher, num_to_add);
        EXPECT_LE(ifp_count,
                  FrequentPoissonUpperBound(expected_fp_count + correction));
        EXPECT_GE(ifp_count,
                  FrequentPoissonLowerBound(expected_fp_count + correction));
      }
      // Since the bits used in isoln are a subset of the bits used in soln,
      // it cannot have fewer FPs
      EXPECT_GE(ifp_count, fp_count);
    }

    // And compare to Bloom time, for fun
    if (ibytes >= /* minimum Bloom impl bytes*/ 64) {
      Index bfp_count = 0;
      cur = other_keys_begin;
      ROCKSDB_NAMESPACE::StopWatchNano timer(ROCKSDB_NAMESPACE::Env::Default(),
                                             true);
      while (cur != other_keys_end) {
        uint64_t h = hasher.GetHash(*cur);
        uint32_t h1 = ROCKSDB_NAMESPACE::Lower32of64(h);
        // Derive a second 32-bit hash when Hash itself is only 32 bits
        uint32_t h2 = sizeof(Hash) >= 8 ? ROCKSDB_NAMESPACE::Upper32of64(h)
                                        : h1 * 0x9e3779b9;
        bfp_count += ROCKSDB_NAMESPACE::FastLocalBloomImpl::HashMayMatch(
                         h1, h2, static_cast<uint32_t>(ibytes), 6, idata.get())
                         ? 1
                         : 0;
        ++cur;
      }
      bloom_query_nanos += timer.ElapsedNanos();
      // ensure bfp_count is used
      ASSERT_LT(bfp_count, kNumToCheck);
    }
  }

  // "outside" == key not in original set so either negative or false positive
  fprintf(stderr, "Simple outside query, hot, incl hashing, ns/key: %g\n",
          1.0 * soln_query_nanos / soln_query_count);
  fprintf(stderr, "Interleaved outside query, hot, incl hashing, ns/key: %g\n",
          1.0 * isoln_query_nanos / isoln_query_count);
  // NOTE(review): denominator is soln_query_count, but bloom queries only
  // run in iterations with ibytes >= 64, so this can understate Bloom
  // ns/key — presumably acceptable for an informational printout; confirm.
  fprintf(stderr, "Bloom outside query, hot, incl hashing, ns/key: %g\n",
          1.0 * bloom_query_nanos / soln_query_count);

  {
    double average_reseeds = 1.0 * total_reseeds / FLAGS_thoroughness;
    fprintf(stderr, "Average re-seeds: %g\n", average_reseeds);
    // Values above were chosen to target around 50% chance of encoding success
    // rate (average of 1.0 re-seeds) or slightly better. But 1.15 is also close
    // enough.
    EXPECT_LE(total_reseeds,
              InfrequentPoissonUpperBound(1.15 * FLAGS_thoroughness));
    // Would use 0.85 here instead of 0.75, but
    // TypesAndSettings_Hash32_SmallKeyGen can "beat the odds" because of
    // sequential keys with a small, cheap hash function. We accept that
    // there are surely inputs that are somewhat bad for this setup, but
    // these somewhat good inputs are probably more likely.
    EXPECT_GE(total_reseeds,
              InfrequentPoissonLowerBound(0.75 * FLAGS_thoroughness));
  }

  {
    uint64_t total_singles = 2 * FLAGS_thoroughness;
    double single_failure_rate = 1.0 * total_single_failures / total_singles;
    fprintf(stderr, "Add'l single, failure rate: %g\n", single_failure_rate);
    // A rough bound (one sided) based on nothing in particular
    double expected_single_failures =
        1.0 * total_singles /
        (sizeof(CoeffRow) == 16 ? 128 : TypeParam::kUseSmash ? 64 : 32);
    EXPECT_LE(total_single_failures,
              InfrequentPoissonUpperBound(expected_single_failures));
  }

  {
    // Counting successes here for Poisson to approximate the Binomial
    // distribution.
    // A rough bound (one sided) based on nothing in particular.
    double expected_batch_successes = 1.0 * FLAGS_thoroughness / 2;
    uint64_t lower_bound =
        InfrequentPoissonLowerBound(expected_batch_successes);
    fprintf(stderr, "Add'l batch, success rate: %g (>= %g)\n",
            1.0 * total_batch_successes / FLAGS_thoroughness,
            1.0 * lower_bound / FLAGS_thoroughness);
    EXPECT_GE(total_batch_successes, lower_bound);
  }

  {
    uint64_t total_checked = uint64_t{kNumToCheck} * FLAGS_thoroughness;
    // Baseline expected FP rate is 2^-(bits per ResultRow)
    double expected_total_fp_count =
        total_checked * std::pow(0.5, 8U * sizeof(ResultRow));
    // For expected FP rate, also include false positives due to collisions
    // in Hash value. (Negligible for 64-bit, can matter for 32-bit.)
    double average_added = 1.0 * total_added / FLAGS_thoroughness;
    expected_total_fp_count +=
        total_checked * ExpectedCollisionFpRate(Hasher(), average_added);

    uint64_t upper_bound = InfrequentPoissonUpperBound(expected_total_fp_count);
    uint64_t lower_bound = InfrequentPoissonLowerBound(expected_total_fp_count);
    fprintf(stderr, "Average FP rate: %g (~= %g, <= %g, >= %g)\n",
            1.0 * total_fp_count / total_checked,
            expected_total_fp_count / total_checked,
            1.0 * upper_bound / total_checked,
            1.0 * lower_bound / total_checked);
    EXPECT_LE(total_fp_count, upper_bound);
    EXPECT_GE(total_fp_count, lower_bound);
  }
}
644 | ||
// Edge cases: zero keys into the minimum slot count, and an interleaved
// solution given zero bytes of storage.
TYPED_TEST(RibbonTypeParamTest, Extremes) {
  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
  IMPORT_RIBBON_IMPL_TYPES(TypeParam);
  using KeyGen = typename TypeParam::KeyGen;

  size_t bytes = 128 * 1024;
  std::unique_ptr<char[]> buf(new char[bytes]);
  InterleavedSoln isoln(buf.get(), bytes);
  SimpleSoln soln;
  Hasher hasher;
  Banding banding;

  // ########################################
  // Add zero keys to minimal number of slots
  KeyGen begin_and_end("foo", 123);
  ASSERT_TRUE(banding.ResetAndFindSeedToSolve(
      /*slots*/ kCoeffBits, begin_and_end, begin_and_end, /*first seed*/ 0,
      /* seed mask*/ 0));

  soln.BackSubstFrom(banding);
  isoln.BackSubstFrom(banding);

  // Because there's plenty of memory, we expect the interleaved solution to
  // use maximum supported columns (same as simple solution)
  ASSERT_EQ(isoln.GetUpperNumColumns(), 8U * sizeof(ResultRow));
  ASSERT_EQ(isoln.GetUpperStartBlock(), 0U);

  // Somewhat oddly, we expect same FP rate as if we had essentially filled
  // up the slots.
  constexpr Index kNumToCheck = 100000;
  KeyGen other_keys_begin("not", 0);
  KeyGen other_keys_end("not", kNumToCheck);

  Index fp_count = 0;
  KeyGen cur = other_keys_begin;
  while (cur != other_keys_end) {
    bool isoln_query_result = isoln.FilterQuery(*cur, hasher);
    bool soln_query_result = soln.FilterQuery(*cur, hasher);
    // Solutions are equivalent
    ASSERT_EQ(isoln_query_result, soln_query_result);
    // And in fact we only expect an FP when ResultRow is 0
    // CHANGE: no longer true because of filling some unused slots
    // with pseudorandom values.
    // ASSERT_EQ(soln_query_result, hasher.GetResultRowFromHash(
    //                                  hasher.GetHash(*cur)) == ResultRow{0});
    fp_count += soln_query_result ? 1 : 0;
    ++cur;
  }
  {
    ASSERT_EQ(isoln.ExpectedFpRate(), soln.ExpectedFpRate());
    double expected_fp_count = isoln.ExpectedFpRate() * kNumToCheck;
    EXPECT_LE(fp_count, InfrequentPoissonUpperBound(expected_fp_count));
    EXPECT_GE(fp_count, InfrequentPoissonLowerBound(expected_fp_count));
  }

  // ######################################################
  // Use zero bytes for interleaved solution (key(s) added)

  // Add one key
  KeyGen key_begin("added", 0);
  KeyGen key_end("added", 1);
  ASSERT_TRUE(banding.ResetAndFindSeedToSolve(
      /*slots*/ kCoeffBits, key_begin, key_end, /*first seed*/ 0,
      /* seed mask*/ 0));

  InterleavedSoln isoln2(nullptr, /*bytes*/ 0);

  isoln2.BackSubstFrom(banding);

  ASSERT_EQ(isoln2.GetUpperNumColumns(), 0U);
  ASSERT_EQ(isoln2.GetUpperStartBlock(), 0U);

  // All queries return true (zero columns => no information => FP rate 1)
  ASSERT_TRUE(isoln2.FilterQuery(*other_keys_begin, hasher));
  ASSERT_EQ(isoln2.ExpectedFpRate(), 1.0);
}
721 | ||
// With kAllowZeroStarts, a banding with zero slots must accept zero
// entries (but not one), and the resulting solutions must reject all
// queries with an expected FP rate of 0.
TEST(RibbonTest, AllowZeroStarts) {
  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings_AllowZeroStarts);
  IMPORT_RIBBON_IMPL_TYPES(TypesAndSettings_AllowZeroStarts);
  using KeyGen = StandardKeyGen;

  InterleavedSoln isoln(nullptr, /*bytes*/ 0);
  SimpleSoln soln;
  Hasher hasher;
  Banding banding;

  KeyGen begin("foo", 0);
  KeyGen end("foo", 1);
  // Can't add 1 entry
  ASSERT_FALSE(banding.ResetAndFindSeedToSolve(/*slots*/ 0, begin, end));

  KeyGen begin_and_end("foo", 123);
  // Can add 0 entries
  ASSERT_TRUE(banding.ResetAndFindSeedToSolve(/*slots*/ 0, begin_and_end,
                                              begin_and_end));

  Seed reseeds = banding.GetOrdinalSeed();
  ASSERT_EQ(reseeds, 0U);
  hasher.SetOrdinalSeed(reseeds);

  // Can construct 0-slot solutions
  isoln.BackSubstFrom(banding);
  soln.BackSubstFrom(banding);

  // Should always return false
  ASSERT_FALSE(isoln.FilterQuery(*begin, hasher));
  ASSERT_FALSE(soln.FilterQuery(*begin, hasher));

  // And report that in FP rate
  ASSERT_EQ(isoln.ExpectedFpRate(), 0.0);
  ASSERT_EQ(soln.ExpectedFpRate(), 0.0);
}
758 | ||
759 | TEST(RibbonTest, RawAndOrdinalSeeds) { | |
760 | StandardHasher<TypesAndSettings_Seed64> hasher64; | |
761 | StandardHasher<DefaultTypesAndSettings> hasher64_32; | |
762 | StandardHasher<TypesAndSettings_Hash32> hasher32; | |
763 | StandardHasher<TypesAndSettings_Seed8> hasher8; | |
764 | ||
765 | for (uint32_t limit : {0xffU, 0xffffU}) { | |
766 | std::vector<bool> seen(limit + 1); | |
767 | for (uint32_t i = 0; i < limit; ++i) { | |
768 | hasher64.SetOrdinalSeed(i); | |
769 | auto raw64 = hasher64.GetRawSeed(); | |
770 | hasher32.SetOrdinalSeed(i); | |
771 | auto raw32 = hasher32.GetRawSeed(); | |
772 | hasher8.SetOrdinalSeed(static_cast<uint8_t>(i)); | |
773 | auto raw8 = hasher8.GetRawSeed(); | |
774 | { | |
775 | hasher64_32.SetOrdinalSeed(i); | |
776 | auto raw64_32 = hasher64_32.GetRawSeed(); | |
777 | ASSERT_EQ(raw64_32, raw32); // Same size seed | |
778 | } | |
779 | if (i == 0) { | |
780 | // Documented that ordinal seed 0 == raw seed 0 | |
781 | ASSERT_EQ(raw64, 0U); | |
782 | ASSERT_EQ(raw32, 0U); | |
783 | ASSERT_EQ(raw8, 0U); | |
784 | } else { | |
785 | // Extremely likely that upper bits are set | |
786 | ASSERT_GT(raw64, raw32); | |
787 | ASSERT_GT(raw32, raw8); | |
788 | } | |
789 | // Hashers agree on lower bits | |
790 | ASSERT_EQ(static_cast<uint32_t>(raw64), raw32); | |
791 | ASSERT_EQ(static_cast<uint8_t>(raw32), raw8); | |
792 | ||
793 | // The translation is one-to-one for this size prefix | |
794 | uint32_t v = static_cast<uint32_t>(raw32 & limit); | |
795 | ASSERT_EQ(raw64 & limit, v); | |
796 | ASSERT_FALSE(seen[v]); | |
797 | seen[v] = true; | |
798 | } | |
799 | } | |
800 | } | |
801 | ||
802 | namespace { | |
803 | ||
804 | struct PhsfInputGen { | |
805 | PhsfInputGen(const std::string& prefix, uint64_t id) : id_(id) { | |
806 | val_.first = prefix; | |
807 | ROCKSDB_NAMESPACE::PutFixed64(&val_.first, /*placeholder*/ 0); | |
808 | } | |
809 | ||
810 | // Prefix (only one required) | |
811 | PhsfInputGen& operator++() { | |
812 | ++id_; | |
813 | return *this; | |
814 | } | |
815 | ||
816 | const std::pair<std::string, uint8_t>& operator*() { | |
817 | // Use multiplication to mix things up a little in the key | |
818 | ROCKSDB_NAMESPACE::EncodeFixed64(&val_.first[val_.first.size() - 8], | |
819 | id_ * uint64_t{0x1500000001}); | |
820 | // Occasionally repeat values etc. | |
821 | val_.second = static_cast<uint8_t>(id_ * 7 / 8); | |
822 | return val_; | |
823 | } | |
824 | ||
825 | const std::pair<std::string, uint8_t>* operator->() { return &**this; } | |
826 | ||
827 | bool operator==(const PhsfInputGen& other) { | |
828 | // Same prefix is assumed | |
829 | return id_ == other.id_; | |
830 | } | |
831 | bool operator!=(const PhsfInputGen& other) { | |
832 | // Same prefix is assumed | |
833 | return id_ != other.id_; | |
834 | } | |
835 | ||
836 | uint64_t id_; | |
837 | std::pair<std::string, uint8_t> val_; | |
838 | }; | |
839 | ||
840 | struct PhsfTypesAndSettings : public DefaultTypesAndSettings { | |
841 | static constexpr bool kIsFilter = false; | |
842 | }; | |
843 | } // namespace | |
844 | ||
845 | TEST(RibbonTest, PhsfBasic) { | |
846 | IMPORT_RIBBON_TYPES_AND_SETTINGS(PhsfTypesAndSettings); | |
847 | IMPORT_RIBBON_IMPL_TYPES(PhsfTypesAndSettings); | |
848 | ||
849 | Index num_slots = 12800; | |
850 | Index num_to_add = static_cast<Index>(num_slots / 1.02); | |
851 | ||
852 | PhsfInputGen begin("in", 0); | |
853 | PhsfInputGen end("in", num_to_add); | |
854 | ||
855 | std::unique_ptr<char[]> idata(new char[/*bytes*/ num_slots]); | |
856 | InterleavedSoln isoln(idata.get(), /*bytes*/ num_slots); | |
857 | SimpleSoln soln; | |
858 | Hasher hasher; | |
859 | ||
860 | { | |
861 | Banding banding; | |
862 | ASSERT_TRUE(banding.ResetAndFindSeedToSolve(num_slots, begin, end)); | |
863 | ||
864 | soln.BackSubstFrom(banding); | |
865 | isoln.BackSubstFrom(banding); | |
866 | ||
867 | hasher.SetOrdinalSeed(banding.GetOrdinalSeed()); | |
868 | } | |
869 | ||
870 | for (PhsfInputGen cur = begin; cur != end; ++cur) { | |
871 | ASSERT_EQ(cur->second, soln.PhsfQuery(cur->first, hasher)); | |
872 | ASSERT_EQ(cur->second, isoln.PhsfQuery(cur->first, hasher)); | |
873 | } | |
874 | } | |
875 | ||
// Not a real test, but a tool used to build GetNumSlotsFor95PctSuccess
TYPED_TEST(RibbonTypeParamTest, FindOccupancyForSuccessRate) {
  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
  IMPORT_RIBBON_IMPL_TYPES(TypeParam);
  using KeyGen = typename TypeParam::KeyGen;

  // Off by default; enable with --find_occ (see flag definitions at top).
  if (!FLAGS_find_occ) {
    fprintf(stderr, "Tool disabled during unit test runs\n");
    return;
  }

  KeyGen cur("blah", 0);

  Banding banding;
  // Sweep slot counts from find_min_slots up to find_max_slots, growing by
  // find_next_factor each round (rounded to a valid interleaved slot count).
  Index num_slots = InterleavedSoln::RoundUpNumSlots(FLAGS_find_min_slots);
  while (num_slots < FLAGS_find_max_slots) {
    // `factor` is the occupancy fraction (keys / slots) being probed;
    // starts at a hard-coded 0.95 guess. `delta` is the (shrinking) step
    // size of the adaptive search below.
    double factor = 0.95;
    double delta = FLAGS_find_delta_start;
    while (delta > FLAGS_find_delta_end) {
      Index num_to_add = static_cast<Index>(factor * num_slots);
      KeyGen end = cur;
      end += num_to_add;
      // Fixed first seed 0 and seed mask 0: a single solve attempt, so
      // `success` samples the solve probability at this occupancy.
      bool success = banding.ResetAndFindSeedToSolve(num_slots, cur, end, 0, 0);
      cur = end;  // fresh keys
      // Asymmetric step: up by delta*(1-p) on success, down by delta*p on
      // failure (p = find_success). The drift is zero exactly when the
      // solve success rate equals p, so `factor` converges toward the
      // occupancy achieving the target success rate.
      if (success) {
        factor += delta * (1.0 - FLAGS_find_success);
        factor = std::min(factor, 1.0);
      } else {
        factor -= delta * FLAGS_find_success;
        factor = std::max(factor, 0.0);
      }
      delta *= FLAGS_find_delta_shrink;
      // Progress line ('\r' rewrites in place); overhead = slots per key.
      fprintf(stderr,
              "slots: %u log2_slots: %g target_success: %g ->overhead: %g\r",
              static_cast<unsigned>(num_slots),
              std::log(num_slots * 1.0) / std::log(2.0), FLAGS_find_success,
              1.0 / factor);
    }
    fprintf(stderr, "\n");

    // The max() with num_slots + 1 guarantees forward progress even if
    // find_next_factor is <= 1.
    num_slots = std::max(
        num_slots + 1, static_cast<Index>(num_slots * FLAGS_find_next_factor));
    num_slots = InterleavedSoln::RoundUpNumSlots(num_slots);
  }
}
921 | ||
922 | // TODO: unit tests for configuration APIs | |
923 | // TODO: unit tests for small filter FP rates | |
924 | ||
// Test entry point: initialize gtest first so it strips its own flags from
// argv, then (when built with gflags) parse the tool flags defined above.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
#ifdef GFLAGS
  ParseCommandLineFlags(&argc, &argv, true);
#endif  // GFLAGS
  return RUN_ALL_TESTS();
}