// Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Implementation details of various Bloom filter implementations used in
// RocksDB. (DynamicBloom is in a separate file for now because it
// supports concurrent writes.)

#pragma once
#include <stddef.h>
#include <stdint.h>

#include <cmath>

#include "port/port.h"  // for PREFETCH
#include "rocksdb/slice.h"
#include "util/hash.h"

#ifdef HAVE_AVX2
#include <immintrin.h>
#endif

namespace ROCKSDB_NAMESPACE {

class BloomMath {
 public:
  // False positive rate of a standard Bloom filter, for given ratio of
  // filter memory bits to added keys, and number of probes per operation.
  // (The false positive rate is effectively independent of scale, assuming
  // the implementation scales OK.)
  static double StandardFpRate(double bits_per_key, int num_probes) {
    // Standard very-good-estimate formula. See
    // https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
    return std::pow(1.0 - std::exp(-num_probes / bits_per_key), num_probes);
  }
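  // Worked example (illustrative, values approximate, not from the original
  // source): StandardFpRate(10.0, 6) evaluates (1 - e^(-6/10))^6, which is
  // roughly 0.0084, i.e. about a 0.84% false positive rate for an unblocked
  // Bloom filter at 10 bits/key with 6 probes.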

  // False positive rate of a "blocked"/"sharded"/"cache-local" Bloom filter,
  // for given ratio of filter memory bits to added keys, number of probes per
  // operation (all within the given block or cache line size), and block or
  // cache line size.
  static double CacheLocalFpRate(double bits_per_key, int num_probes,
                                 int cache_line_bits) {
    double keys_per_cache_line = cache_line_bits / bits_per_key;
    // A reasonable estimate is the average of the FP rates for one standard
    // deviation above and below the mean bucket occupancy. See
    // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#the-math
    double keys_stddev = std::sqrt(keys_per_cache_line);
    double crowded_fp = StandardFpRate(
        cache_line_bits / (keys_per_cache_line + keys_stddev), num_probes);
    double uncrowded_fp = StandardFpRate(
        cache_line_bits / (keys_per_cache_line - keys_stddev), num_probes);
    return (crowded_fp + uncrowded_fp) / 2;
  }
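  // Worked example (illustrative, values approximate, not from the original
  // source): at 10 bits/key, 6 probes, and 512-bit cache lines,
  // keys_per_cache_line = 51.2 and keys_stddev is about 7.2, so the crowded
  // estimate uses about 8.8 bits/key (FP roughly 1.5%) and the uncrowded
  // estimate uses about 11.6 bits/key (FP roughly 0.43%); their average is
  // roughly 0.95%, consistent with the ~0.95% cache-local figure quoted in
  // the FastLocalBloomImpl comment below.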

  // False positive rate of querying a new item against `num_keys` items, all
  // hashed to `fingerprint_bits` bits. (This assumes the fingerprint hashes
  // themselves are stored losslessly. See Section 4 of
  // http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf)
  static double FingerprintFpRate(size_t num_keys, int fingerprint_bits) {
    double inv_fingerprint_space = std::pow(0.5, fingerprint_bits);
    // Base estimate assumes each key maps to a unique fingerprint.
    // Could be > 1 in extreme cases.
    double base_estimate = num_keys * inv_fingerprint_space;
    // To account for potential overlap, we choose between two formulas
    if (base_estimate > 0.0001) {
      // A very good formula assuming we don't construct a floating point
      // number extremely close to 1. Always produces a probability < 1.
      return 1.0 - std::exp(-base_estimate);
    } else {
      // A very good formula when base_estimate is far below 1. (Subtract
      // away the integral-approximated sum that some key has same hash as
      // one coming before it in a list.)
      return base_estimate - (base_estimate * base_estimate * 0.5);
    }
  }
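  // Worked example (illustrative, values approximate, not from the original
  // source): FingerprintFpRate(1000000, 32) has base_estimate of about
  // 1e6 / 2^32 ~= 2.3e-4, which is > 0.0001, so the 1 - exp(-base_estimate)
  // branch applies and the result is roughly 0.023%. Around 40 million keys
  // this term grows to roughly 0.93%, which is why a 32-bit hash starts to
  // dominate a ~1% cache-local filter near that scale (see the
  // FastLocalBloomImpl comment below).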

  // Returns the probability of either of two independent(-ish) events
  // happening, given their probabilities. (This is useful for combining
  // results from StandardFpRate or CacheLocalFpRate with FingerprintFpRate
  // for a hash-efficient Bloom filter's FP rate. See Section 4 of
  // http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf)
  static double IndependentProbabilitySum(double rate1, double rate2) {
    // Use formula that avoids floating point extremely close to 1 if
    // rates are extremely small.
    return rate1 + rate2 - (rate1 * rate2);
  }
};

// A fast, flexible, and accurate cache-local Bloom implementation with
// SIMD-optimized query performance (currently using AVX2 on Intel). Write
// performance and non-SIMD read are very good, benefiting from FastRange32
// used in place of % and single-cycle multiplication on recent processors.
//
// Most other SIMD Bloom implementations sacrifice flexibility and/or
// accuracy by requiring num_probes to be a power of two and restricting
// where each probe can occur in a cache line. This implementation sacrifices
// SIMD-optimization for add (might still be possible, especially with AVX512)
// in favor of allowing any num_probes, not crossing cache line boundary,
// and accuracy close to theoretical best accuracy for a cache-local Bloom.
// E.g. theoretical best for 10 bits/key, num_probes=6, and 512-bit bucket
// (Intel cache line size) is 0.9535% FP rate. This implementation yields
// about 0.957%. (Compare to LegacyLocalityBloomImpl<false> at 1.138%, or
// about 0.951% for 1024-bit buckets, cache line size for some ARM CPUs.)
//
// This implementation can use a 32-bit hash (let h2 be h1 * 0x9e3779b9) or
// a 64-bit hash (split into two uint32s). With many millions of keys, the
// false positive rate associated with using a 32-bit hash can dominate the
// false positive rate of the underlying filter. At 10 bits/key setting, the
// inflection point is about 40 million keys, so 32-bit hash is a bad idea
// with 10s of millions of keys or more.
//
// Despite accepting a 64-bit hash, this implementation uses 32-bit fastrange
// to pick a cache line, which can be faster than 64-bit in some cases.
// This only hurts accuracy as you get into 10s of GB for a single filter,
// and accuracy abruptly breaks down at 256GB (2^32 cache lines). Switch to
// 64-bit fastrange if you need filters so big. ;)
//
// Using only a 32-bit input hash within each cache line has negligible
// impact for any reasonable cache line / bucket size, for arbitrary filter
// size, and potentially saves intermediate data size in some cases vs.
// tracking full 64 bits. (Even in an implementation using 64-bit arithmetic
// to generate indices, I might do the same, as a single multiplication
// suffices to generate a sufficiently mixed 64 bits from 32 bits.)
//
// This implementation is currently tied to Intel cache line size, 64 bytes ==
// 512 bits. If there's sufficient demand for other cache line sizes, this is
// a pretty good implementation to extend, but slight performance enhancements
// are possible with an alternate implementation (probably not very compatible
// with SIMD):
// (1) Use rotation in addition to multiplication for remixing
// (like murmur hash). (Using multiplication alone *slightly* hurts accuracy
// because lower bits never depend on original upper bits.)
// (2) Extract more than one bit index from each re-mix. (Only if rotation
// or similar is part of remix, because otherwise you're making the
// multiplication-only problem worse.)
// (3) Re-mix full 64 bit hash, to get maximum number of bit indices per
// re-mix.
//
class FastLocalBloomImpl {
 public:
  // NOTE: this has only been validated to enough accuracy for producing
  // reasonable warnings / user feedback, not for making functional decisions.
  static double EstimatedFpRate(size_t keys, size_t bytes, int num_probes,
                                int hash_bits) {
    return BloomMath::IndependentProbabilitySum(
        BloomMath::CacheLocalFpRate(8.0 * bytes / keys, num_probes,
                                    /*cache line bits*/ 512),
        BloomMath::FingerprintFpRate(keys, hash_bits));
  }
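  // Illustrative usage (values approximate, not from the original source):
  // with 1 million keys, 1.25 million bytes of filter (10 bits/key),
  // num_probes = 6, and a 64-bit hash, EstimatedFpRate(1000000, 1250000, 6, 64)
  // comes out to roughly 0.0095 (about 0.95%), dominated by the cache-local
  // filter term, since the 64-bit fingerprint term is negligible at this
  // scale.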

  static inline int ChooseNumProbes(int millibits_per_key) {
    // Since this implementation can (with AVX2) make up to 8 probes
    // for the same cost, we pick the most accurate num_probes, based
    // on actual tests of the implementation. Note that for higher
    // bits/key, the best choice for cache-local Bloom can be notably
    // smaller than standard Bloom, e.g. 9 instead of 11 @ 16 b/k.
    if (millibits_per_key <= 2080) {
      return 1;
    } else if (millibits_per_key <= 3580) {
      return 2;
    } else if (millibits_per_key <= 5100) {
      return 3;
    } else if (millibits_per_key <= 6640) {
      return 4;
    } else if (millibits_per_key <= 8300) {
      return 5;
    } else if (millibits_per_key <= 10070) {
      return 6;
    } else if (millibits_per_key <= 11720) {
      return 7;
    } else if (millibits_per_key <= 14001) {
      // Would be something like <= 13800 but sacrificing *slightly* for
      // more settings using <= 8 probes.
      return 8;
    } else if (millibits_per_key <= 16050) {
      return 9;
    } else if (millibits_per_key <= 18300) {
      return 10;
    } else if (millibits_per_key <= 22001) {
      return 11;
    } else if (millibits_per_key <= 25501) {
      return 12;
    } else if (millibits_per_key > 50000) {
      // Top out at 24 probes (three sets of 8)
      return 24;
    } else {
      // Roughly optimal choices for remaining range
      // e.g.
      // 28000 -> 12, 28001 -> 13
      // 50000 -> 23, 50001 -> 24
      return (millibits_per_key - 1) / 2000 - 1;
    }
  }

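  // Note (added explanation, not from the original source): in AddHash,
  // PrepareHash, and HashMayMatch below, FastRange32 maps the 32-bit hash
  // onto the number of cache lines (len_bytes / 64), essentially a
  // multiply-and-shift used in place of %, and the << 6 turns the chosen
  // cache-line index back into a byte offset.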
  static inline void AddHash(uint32_t h1, uint32_t h2, uint32_t len_bytes,
                             int num_probes, char *data) {
    uint32_t bytes_to_cache_line = FastRange32(len_bytes >> 6, h1) << 6;
    AddHashPrepared(h2, num_probes, data + bytes_to_cache_line);
  }

  static inline void AddHashPrepared(uint32_t h2, int num_probes,
                                     char *data_at_cache_line) {
    uint32_t h = h2;
    for (int i = 0; i < num_probes; ++i, h *= uint32_t{0x9e3779b9}) {
      // 9-bit address within 512 bit cache line
      int bitpos = h >> (32 - 9);
      data_at_cache_line[bitpos >> 3] |= (uint8_t{1} << (bitpos & 7));
    }
  }
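  // Illustrative trace (added example, not from the original source): with
  // h2 = 0x12345678 the first probe uses the top 9 bits, so
  // bitpos = 0x12345678 >> 23 = 36, i.e. byte 4, bit 4 of the cache line;
  // h is then multiplied by 0x9e3779b9 to derive the next probe's position.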

  static inline void PrepareHash(uint32_t h1, uint32_t len_bytes,
                                 const char *data,
                                 uint32_t /*out*/ *byte_offset) {
    uint32_t bytes_to_cache_line = FastRange32(len_bytes >> 6, h1) << 6;
    PREFETCH(data + bytes_to_cache_line, 0 /* rw */, 1 /* locality */);
    PREFETCH(data + bytes_to_cache_line + 63, 0 /* rw */, 1 /* locality */);
    *byte_offset = bytes_to_cache_line;
  }

  static inline bool HashMayMatch(uint32_t h1, uint32_t h2, uint32_t len_bytes,
                                  int num_probes, const char *data) {
    uint32_t bytes_to_cache_line = FastRange32(len_bytes >> 6, h1) << 6;
    return HashMayMatchPrepared(h2, num_probes, data + bytes_to_cache_line);
  }

  static inline bool HashMayMatchPrepared(uint32_t h2, int num_probes,
                                          const char *data_at_cache_line) {
    uint32_t h = h2;
#ifdef HAVE_AVX2
    int rem_probes = num_probes;

    // NOTE: For better performance for num_probes in {1, 2, 9, 10, 17, 18,
    // etc.} one can insert specialized code for rem_probes <= 2, bypassing
    // the SIMD code in those cases. There is a detectable but minor overhead
    // applied to other values of num_probes (when not statically determined),
    // but smoother performance curve vs. num_probes. But for now, when
    // in doubt, don't add unnecessary code.

    // Powers of 32-bit golden ratio, mod 2**32.
    const __m256i multipliers =
        _mm256_setr_epi32(0x00000001, 0x9e3779b9, 0xe35e67b1, 0x734297e9,
                          0x35fbe861, 0xdeb7c719, 0x448b211, 0x3459b749);

    for (;;) {
      // Eight copies of hash
      __m256i hash_vector = _mm256_set1_epi32(h);

      // Same effect as repeated multiplication by 0x9e3779b9 thanks to
      // associativity of multiplication.
      hash_vector = _mm256_mullo_epi32(hash_vector, multipliers);

      // Now the top 9 bits of each of the eight 32-bit values in
      // hash_vector are bit addresses for probes within the cache line.
      // While the platform-independent code uses byte addressing (6 bits
      // to pick a byte + 3 bits to pick a bit within a byte), here we work
      // with 32-bit words (4 bits to pick a word + 5 bits to pick a bit
      // within a word) because that works well with AVX2 and is equivalent
      // under little-endian.

      // Shift each right by 28 bits to get 4-bit word addresses.
      const __m256i word_addresses = _mm256_srli_epi32(hash_vector, 28);

      // Gather 32-bit values spread over 512 bits by 4-bit address. In
      // essence, we are dereferencing eight pointers within the cache
      // line.
      //
      // Option 1: AVX2 gather (seems to be a little slow - understandable)
      // const __m256i value_vector =
      //     _mm256_i32gather_epi32(static_cast<const int *>(data_at_cache_line),
      //                            word_addresses,
      //                            /*bytes / i32*/ 4);
      // END Option 1
      // Potentially unaligned as we're not *always* cache-aligned -> loadu
      const __m256i *mm_data =
          reinterpret_cast<const __m256i *>(data_at_cache_line);
      __m256i lower = _mm256_loadu_si256(mm_data);
      __m256i upper = _mm256_loadu_si256(mm_data + 1);
      // Option 2: AVX512VL permute hack
      // Only negligibly faster than Option 3, so not yet worth supporting
      // const __m256i value_vector =
      //     _mm256_permutex2var_epi32(lower, word_addresses, upper);
      // END Option 2
      // Option 3: AVX2 permute+blend hack
      // Use lowest three bits to order probing values, as if all from same
      // 256 bit piece.
      lower = _mm256_permutevar8x32_epi32(lower, word_addresses);
      upper = _mm256_permutevar8x32_epi32(upper, word_addresses);
      // Just top 1 bit of address, to select between lower and upper.
      const __m256i upper_lower_selector = _mm256_srai_epi32(hash_vector, 31);
      // Finally: the next 8 probed 32-bit values, in probing sequence order.
      const __m256i value_vector =
          _mm256_blendv_epi8(lower, upper, upper_lower_selector);
      // END Option 3

      // We might not need to probe all 8, so build a mask for selecting only
      // what we need. (The k_selector(s) could be pre-computed but that
      // doesn't seem to make a noticeable performance difference.)
      const __m256i zero_to_seven = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
      // Subtract rem_probes from each of those constants
      __m256i k_selector =
          _mm256_sub_epi32(zero_to_seven, _mm256_set1_epi32(rem_probes));
      // Negative after subtract -> use/select
      // Keep only high bit (logical shift right each by 31).
      k_selector = _mm256_srli_epi32(k_selector, 31);

      // Strip off the 4 bit word address (shift left)
      __m256i bit_addresses = _mm256_slli_epi32(hash_vector, 4);
      // And keep only 5-bit (32 - 27) bit-within-32-bit-word addresses.
      bit_addresses = _mm256_srli_epi32(bit_addresses, 27);
      // Build a bit mask
      const __m256i bit_mask = _mm256_sllv_epi32(k_selector, bit_addresses);

      // Like ((~value_vector) & bit_mask) == 0
      bool match = _mm256_testc_si256(value_vector, bit_mask) != 0;

      // This check first so that it's easy for branch predictor to optimize
      // num_probes <= 8 case, making it free of unpredictable branches.
      if (rem_probes <= 8) {
        return match;
      } else if (!match) {
        return false;
      }
      // otherwise
      // Need another iteration. 0xab25f4c1 == golden ratio to the 8th power
      h *= 0xab25f4c1;
      rem_probes -= 8;
    }
#else
    for (int i = 0; i < num_probes; ++i, h *= uint32_t{0x9e3779b9}) {
      // 9-bit address within 512 bit cache line
      int bitpos = h >> (32 - 9);
      if ((data_at_cache_line[bitpos >> 3] & (char(1) << (bitpos & 7))) == 0) {
        return false;
      }
    }
    return true;
#endif
  }
};

// A legacy Bloom filter implementation with no locality of probes (slow).
// It uses double hashing to generate a sequence of hash values.
// Asymptotic analysis is in [Kirsch,Mitzenmacher 2006], but known to have
// subtle accuracy flaws for practical sizes [Dillinger,Manolios 2004].
//
// DO NOT REUSE
//
class LegacyNoLocalityBloomImpl {
 public:
  static inline int ChooseNumProbes(int bits_per_key) {
    // We intentionally round down to reduce probing cost a little bit
    int num_probes = static_cast<int>(bits_per_key * 0.69);  // 0.69 =~ ln(2)
    if (num_probes < 1) num_probes = 1;
    if (num_probes > 30) num_probes = 30;
    return num_probes;
  }
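  // Example (added illustration, not from the original source): at a typical
  // 10 bits/key, 10 * 0.69 = 6.9, which truncates to 6 probes; the clamps
  // only matter at extreme settings (around 1 bit/key, or roughly 45 and
  // more bits/key).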

  static inline void AddHash(uint32_t h, uint32_t total_bits, int num_probes,
                             char *data) {
    const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
    for (int i = 0; i < num_probes; i++) {
      const uint32_t bitpos = h % total_bits;
      data[bitpos / 8] |= (1 << (bitpos % 8));
      h += delta;
    }
  }

  static inline bool HashMayMatch(uint32_t h, uint32_t total_bits,
                                  int num_probes, const char *data) {
    const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
    for (int i = 0; i < num_probes; i++) {
      const uint32_t bitpos = h % total_bits;
      if ((data[bitpos / 8] & (1 << (bitpos % 8))) == 0) {
        return false;
      }
      h += delta;
    }
    return true;
  }
};

// A legacy Bloom filter implementation with probes local to a single
// cache line (fast). Because SST files might be transported between
// platforms, the cache line size is a parameter rather than hard coded.
// (But if specified as a constant parameter, an optimizing compiler
// should take advantage of that.)
//
// When ExtraRotates is false, this implementation is notably deficient in
// accuracy. Specifically, it uses double hashing with a 1/512 chance of the
// increment being zero (when cache line size is 512 bits). Thus, there's a
// 1/512 chance of probing only one index, which we'd expect to incur about
// a 1/2 * 1/512 or absolute 0.1% FP rate penalty. More detail at
// https://github.com/facebook/rocksdb/issues/4120
//
// DO NOT REUSE
//
template <bool ExtraRotates>
class LegacyLocalityBloomImpl {
 private:
  static inline uint32_t GetLine(uint32_t h, uint32_t num_lines) {
    uint32_t offset_h = ExtraRotates ? (h >> 11) | (h << 21) : h;
    return offset_h % num_lines;
  }

 public:
  // NOTE: this has only been validated to enough accuracy for producing
  // reasonable warnings / user feedback, not for making functional decisions.
  static double EstimatedFpRate(size_t keys, size_t bytes, int num_probes) {
    double bits_per_key = 8.0 * bytes / keys;
    double filter_rate = BloomMath::CacheLocalFpRate(bits_per_key, num_probes,
                                                     /*cache line bits*/ 512);
    if (!ExtraRotates) {
      // Good estimate of impact of flaw in index computation.
      // Adds roughly 0.002 around 50 bits/key and 0.001 around 100 bits/key.
      // The + 22 shifts it nicely to fit for lower bits/key.
      filter_rate += 0.1 / (bits_per_key * 0.75 + 22);
    } else {
      // Not yet validated
      assert(false);
    }
    // Always uses 32-bit hash
    double fingerprint_rate = BloomMath::FingerprintFpRate(keys, 32);
    return BloomMath::IndependentProbabilitySum(filter_rate, fingerprint_rate);
  }

  static inline void AddHash(uint32_t h, uint32_t num_lines, int num_probes,
                             char *data, int log2_cache_line_bytes) {
    const int log2_cache_line_bits = log2_cache_line_bytes + 3;

    char *data_at_offset =
        data + (GetLine(h, num_lines) << log2_cache_line_bytes);
    const uint32_t delta = (h >> 17) | (h << 15);
    for (int i = 0; i < num_probes; ++i) {
      // Mask to bit-within-cache-line address
      const uint32_t bitpos = h & ((1 << log2_cache_line_bits) - 1);
      data_at_offset[bitpos / 8] |= (1 << (bitpos % 8));
      if (ExtraRotates) {
        h = (h >> log2_cache_line_bits) | (h << (32 - log2_cache_line_bits));
      }
      h += delta;
    }
  }

  static inline void PrepareHashMayMatch(uint32_t h, uint32_t num_lines,
                                         const char *data,
                                         uint32_t /*out*/ *byte_offset,
                                         int log2_cache_line_bytes) {
    uint32_t b = GetLine(h, num_lines) << log2_cache_line_bytes;
    PREFETCH(data + b, 0 /* rw */, 1 /* locality */);
    PREFETCH(data + b + ((1 << log2_cache_line_bytes) - 1), 0 /* rw */,
             1 /* locality */);
    *byte_offset = b;
  }

  static inline bool HashMayMatch(uint32_t h, uint32_t num_lines,
                                  int num_probes, const char *data,
                                  int log2_cache_line_bytes) {
    uint32_t b = GetLine(h, num_lines) << log2_cache_line_bytes;
    return HashMayMatchPrepared(h, num_probes, data + b, log2_cache_line_bytes);
  }

  static inline bool HashMayMatchPrepared(uint32_t h, int num_probes,
                                          const char *data_at_offset,
                                          int log2_cache_line_bytes) {
    const int log2_cache_line_bits = log2_cache_line_bytes + 3;

    const uint32_t delta = (h >> 17) | (h << 15);
    for (int i = 0; i < num_probes; ++i) {
      // Mask to bit-within-cache-line address
      const uint32_t bitpos = h & ((1 << log2_cache_line_bits) - 1);
      if (((data_at_offset[bitpos / 8]) & (1 << (bitpos % 8))) == 0) {
        return false;
      }
      if (ExtraRotates) {
        h = (h >> log2_cache_line_bits) | (h << (32 - log2_cache_line_bits));
      }
      h += delta;
    }
    return true;
  }
};

}  // namespace ROCKSDB_NAMESPACE