]> git.proxmox.com Git - ceph.git/blame - ceph/src/common/bloom_filter.hpp
import quincy beta 17.1.0
[ceph.git] / ceph / src / common / bloom_filter.hpp
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4/*
5 *******************************************************************
6 * *
7 * Open Bloom Filter *
8 * *
9 * Author: Arash Partow - 2000 *
10 * URL: http://www.partow.net/programming/hashfunctions/index.html *
11 * *
12 * Copyright notice: *
13 * Free use of the Open Bloom Filter Library is permitted under *
14 * the guidelines and in accordance with the most current version *
15 * of the Boost Software License, Version 1.0 *
16 * http://www.opensource.org/licenses/bsl1.0.html *
17 * *
18 *******************************************************************
19*/
20
21
22#ifndef COMMON_BLOOM_FILTER_HPP
23#define COMMON_BLOOM_FILTER_HPP
24
7c673cae 25#include <cmath>
7c673cae 26
7c673cae 27#include "include/encoding.h"
20effc67 28#include "include/mempool.h"
7c673cae 29
20effc67 30static const unsigned char bit_mask[CHAR_BIT] = {
7c673cae
FG
31 0x01, //00000001
32 0x02, //00000010
33 0x04, //00000100
34 0x08, //00001000
35 0x10, //00010000
36 0x20, //00100000
37 0x40, //01000000
38 0x80 //10000000
39};
40
7c673cae
FG
41class bloom_filter
42{
43protected:
44
20effc67
TL
45 using bloom_type = unsigned int;
46 using cell_type = unsigned char;
47 using table_type = mempool::bloom_filter::vector<cell_type>;
7c673cae 48
7c673cae 49 std::vector<bloom_type> salt_; ///< vector of salts
20effc67 50 table_type bit_table_; ///< bit map
7c673cae
FG
51 std::size_t salt_count_; ///< number of salts
52 std::size_t table_size_; ///< bit table size in bytes
53 std::size_t insert_count_; ///< insertion count
54 std::size_t target_element_count_; ///< target number of unique insertions
55 std::size_t random_seed_; ///< random seed
56
57public:
58
59 bloom_filter()
20effc67 60 : salt_count_(0),
7c673cae
FG
61 table_size_(0),
62 insert_count_(0),
63 target_element_count_(0),
64 random_seed_(0)
65 {}
66
67 bloom_filter(const std::size_t& predicted_inserted_element_count,
68 const double& false_positive_probability,
69 const std::size_t& random_seed)
20effc67 70 : insert_count_(0),
7c673cae
FG
71 target_element_count_(predicted_inserted_element_count),
72 random_seed_((random_seed) ? random_seed : 0xA5A5A5A5)
73 {
11fdf7f2 74 ceph_assert(false_positive_probability > 0.0);
20effc67
TL
75 std::tie(salt_count_, table_size_) =
76 find_optimal_parameters(predicted_inserted_element_count,
77 false_positive_probability);
7c673cae
FG
78 init();
79 }
80
81 bloom_filter(const std::size_t& salt_count,
82 std::size_t table_size,
83 const std::size_t& random_seed,
84 std::size_t target_element_count)
20effc67 85 : salt_count_(salt_count),
7c673cae
FG
86 table_size_(table_size),
87 insert_count_(0),
88 target_element_count_(target_element_count),
89 random_seed_((random_seed) ? random_seed : 0xA5A5A5A5)
90 {
91 init();
92 }
93
94 void init() {
95 generate_unique_salt();
20effc67 96 bit_table_.resize(table_size_, static_cast<unsigned char>(0x00));
7c673cae
FG
97 }
98
99 bloom_filter(const bloom_filter& filter)
7c673cae
FG
100 {
101 this->operator=(filter);
102 }
103
104 bloom_filter& operator = (const bloom_filter& filter)
105 {
106 if (this != &filter) {
7c673cae
FG
107 salt_count_ = filter.salt_count_;
108 table_size_ = filter.table_size_;
109 insert_count_ = filter.insert_count_;
110 target_element_count_ = filter.target_element_count_;
111 random_seed_ = filter.random_seed_;
20effc67 112 bit_table_ = filter.bit_table_;
7c673cae
FG
113 salt_ = filter.salt_;
114 }
115 return *this;
116 }
117
20effc67 118 virtual ~bloom_filter() = default;
7c673cae
FG
119
120 inline bool operator!() const
121 {
122 return (0 == table_size_);
123 }
124
125 inline void clear()
126 {
20effc67
TL
127 std::fill(bit_table_.begin(), bit_table_.end(),
128 static_cast<unsigned char>(0x00));
7c673cae
FG
129 insert_count_ = 0;
130 }
131
132 /**
133 * insert a u32 into the set
134 *
135 * NOTE: the internal hash is weak enough that consecutive inputs do
136 * not achieve the desired fpp. Well-mixed values should be used
137 * here (e.g., put rjhash(x) into the filter instead of just x).
138 *
139 * @param val integer value to insert
140 */
141 inline void insert(uint32_t val) {
20effc67
TL
142 for (auto salt : salt_) {
143 auto [bit_index, bit] = compute_indices(hash_ap(val, salt));
7c673cae
FG
144 bit_table_[bit_index >> 3] |= bit_mask[bit];
145 }
146 ++insert_count_;
147 }
148
149 inline void insert(const unsigned char* key_begin, const std::size_t& length)
150 {
20effc67
TL
151 for (auto salt : salt_) {
152 auto [bit_index, bit] = compute_indices(hash_ap(key_begin, length, salt));
7c673cae
FG
153 bit_table_[bit_index >> 3] |= bit_mask[bit];
154 }
155 ++insert_count_;
156 }
157
7c673cae
FG
158 inline void insert(const std::string& key)
159 {
160 insert(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
161 }
162
163 inline void insert(const char* data, const std::size_t& length)
164 {
165 insert(reinterpret_cast<const unsigned char*>(data),length);
166 }
167
168 template<typename InputIterator>
169 inline void insert(const InputIterator begin, const InputIterator end)
170 {
171 InputIterator itr = begin;
172 while (end != itr)
173 {
174 insert(*(itr++));
175 }
176 }
177
178 /**
179 * check if a u32 is contained by set
180 *
181 * NOTE: the internal hash is weak enough that consecutive inputs do
182 * not achieve the desired fpp. Well-mixed values should be used
183 * here (e.g., put rjhash(x) into the filter instead of just x).
184 *
185 * @param val integer value to query
186 * @returns true if value is (probably) in the set, false if it definitely is not
187 */
188 inline virtual bool contains(uint32_t val) const
189 {
20effc67 190 if (table_size_ == 0) {
7c673cae 191 return false;
20effc67
TL
192 }
193 for (auto salt : salt_) {
194 auto [bit_index, bit] = compute_indices(hash_ap(val, salt));
195 if ((bit_table_[bit_index >> 3] & bit_mask[bit]) != bit_mask[bit]) {
7c673cae
FG
196 return false;
197 }
198 }
199 return true;
200 }
201
202 inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
203 {
20effc67 204 if (table_size_ == 0) {
7c673cae 205 return false;
20effc67
TL
206 }
207 for (auto salt : salt_) {
208 auto [bit_index, bit] = compute_indices(hash_ap(key_begin, length, salt));
209 if ((bit_table_[bit_index >> 3] & bit_mask[bit]) != bit_mask[bit]) {
7c673cae
FG
210 return false;
211 }
212 }
213 return true;
214 }
215
7c673cae
FG
216 inline bool contains(const std::string& key) const
217 {
218 return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
219 }
220
221 inline bool contains(const char* data, const std::size_t& length) const
222 {
223 return contains(reinterpret_cast<const unsigned char*>(data),length);
224 }
225
226 template<typename InputIterator>
227 inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
228 {
229 InputIterator itr = begin;
230 while (end != itr)
231 {
232 if (!contains(*itr))
233 {
234 return itr;
235 }
236 ++itr;
237 }
238 return end;
239 }
240
241 template<typename InputIterator>
242 inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
243 {
244 InputIterator itr = begin;
245 while (end != itr)
246 {
247 if (contains(*itr))
248 {
249 return itr;
250 }
251 ++itr;
252 }
253 return end;
254 }
255
256 inline virtual std::size_t size() const
257 {
20effc67 258 return table_size_ * CHAR_BIT;
7c673cae
FG
259 }
260
261 inline std::size_t element_count() const
262 {
263 return insert_count_;
264 }
265
266 inline bool is_full() const
267 {
268 return insert_count_ >= target_element_count_;
269 }
270
271 /*
272 * density of bits set. inconvenient units, but:
273 * .3 = ~50% target insertions
274 * .5 = 100% target insertions, "perfectly full"
275 * .75 = 200% target insertions
276 * 1.0 = all bits set... infinite insertions
277 */
20effc67 278 double density() const;
7c673cae
FG
279
280 virtual inline double approx_unique_element_count() const {
281 // this is not a very good estimate; a better solution should have
282 // some asymptotic behavior as density() approaches 1.0.
283 return (double)target_element_count_ * 2.0 * density();
284 }
285
286 inline double effective_fpp() const
287 {
288 /*
289 Note:
290 The effective false positive probability is calculated using the
291 designated table size and hash function count in conjunction with
292 the current number of inserted elements - not the user defined
293 predicated/expected number of inserted elements.
294 */
295 return std::pow(1.0 - std::exp(-1.0 * salt_.size() * insert_count_ / size()), 1.0 * salt_.size());
296 }
297
298 inline const cell_type* table() const
299 {
20effc67 300 return bit_table_.data();
7c673cae
FG
301 }
302
303protected:
304
20effc67
TL
305 virtual std::pair<size_t /* bit_index */,
306 size_t /* bit */>
307 compute_indices(const bloom_type& hash) const
7c673cae 308 {
20effc67
TL
309 size_t bit_index = hash % (table_size_ << 3);
310 size_t bit = bit_index & 7;
311 return {bit_index, bit};
7c673cae
FG
312 }
313
314 void generate_unique_salt()
315 {
316 /*
317 Note:
318 A distinct hash function need not be implementation-wise
319 distinct. In the current implementation "seeding" a common
320 hash function with different values seems to be adequate.
321 */
322 const unsigned int predef_salt_count = 128;
323 static const bloom_type predef_salt[predef_salt_count] = {
324 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
325 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
326 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
327 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
328 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
329 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
330 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
331 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
332 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
333 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
334 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
335 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
336 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
337 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
338 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
339 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
340 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
341 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
342 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
343 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
344 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
345 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
346 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
347 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
348 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
349 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
350 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
351 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
352 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
353 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
354 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
355 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
356 };
357
358 if (salt_count_ <= predef_salt_count)
359 {
360 std::copy(predef_salt,
361 predef_salt + salt_count_,
362 std::back_inserter(salt_));
363 for (unsigned int i = 0; i < salt_.size(); ++i)
364 {
365 /*
366 Note:
367 This is done to integrate the user defined random seed,
368 so as to allow for the generation of unique bloom filter
369 instances.
370 */
371 salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_;
372 }
373 }
374 else
375 {
376 std::copy(predef_salt,predef_salt + predef_salt_count,
377 std::back_inserter(salt_));
378 srand(static_cast<unsigned int>(random_seed_));
379 while (salt_.size() < salt_count_)
380 {
381 bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
382 if (0 == current_salt)
383 continue;
384 if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
385 {
386 salt_.push_back(current_salt);
387 }
388 }
389 }
390 }
391
20effc67
TL
392 static std::pair<std::size_t /* salt_count */,
393 std::size_t /* table_size */>
394 find_optimal_parameters(std::size_t target_insert_count,
395 double target_fpp)
7c673cae
FG
396 {
397 /*
398 Note:
399 The following will attempt to find the number of hash functions
400 and minimum amount of storage bits required to construct a bloom
401 filter consistent with the user defined false positive probability
402 and estimated element insertion count.
403 */
404
405 double min_m = std::numeric_limits<double>::infinity();
406 double min_k = 0.0;
407 double curr_m = 0.0;
408 double k = 1.0;
409 while (k < 1000.0)
410 {
411 double numerator = (- k * target_insert_count);
412 double denominator = std::log(1.0 - std::pow(target_fpp, 1.0 / k));
413 curr_m = numerator / denominator;
414
415 if (curr_m < min_m)
416 {
417 min_m = curr_m;
418 min_k = k;
419 }
420 k += 1.0;
421 }
422
20effc67 423 size_t salt_count = static_cast<std::size_t>(min_k);
7c673cae 424 size_t t = static_cast<std::size_t>(min_m);
20effc67
TL
425 t += (((t & 7) != 0) ? (CHAR_BIT - (t & 7)) : 0);
426 size_t table_size = t >> 3;
427 return {salt_count, table_size};
7c673cae
FG
428 }
429
430 inline bloom_type hash_ap(uint32_t val, bloom_type hash) const
431 {
432 hash ^= (hash << 7) ^ ((val & 0xff000000) >> 24) * (hash >> 3);
433 hash ^= (~((hash << 11) + (((val & 0xff0000) >> 16) ^ (hash >> 5))));
434 hash ^= (hash << 7) ^ ((val & 0xff00) >> 8) * (hash >> 3);
435 hash ^= (~((hash << 11) + (((val & 0xff)) ^ (hash >> 5))));
436 return hash;
437 }
438
439 inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
440 {
441 const unsigned char* itr = begin;
442
443 while (remaining_length >= 4)
444 {
445 hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
446 hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
447 hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
448 hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
449 remaining_length -= 4;
450 }
451
452 while (remaining_length >= 2)
453 {
454 hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
455 hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
456 remaining_length -= 2;
457 }
458
459 if (remaining_length)
460 {
461 hash ^= (hash << 7) ^ (*itr) * (hash >> 3);
462 }
463
464 return hash;
465 }
466
467public:
9f95a23c
TL
468 void encode(ceph::buffer::list& bl) const;
469 void decode(ceph::buffer::list::const_iterator& bl);
470 void dump(ceph::Formatter *f) const;
7c673cae
FG
471 static void generate_test_instances(std::list<bloom_filter*>& ls);
472};
473WRITE_CLASS_ENCODER(bloom_filter)
474
475
476class compressible_bloom_filter : public bloom_filter
477{
478public:
479
480 compressible_bloom_filter() : bloom_filter() {}
481
482 compressible_bloom_filter(const std::size_t& predicted_element_count,
483 const double& false_positive_probability,
484 const std::size_t& random_seed)
485 : bloom_filter(predicted_element_count, false_positive_probability, random_seed)
486 {
487 size_list.push_back(table_size_);
488 }
489
490 compressible_bloom_filter(const std::size_t& salt_count,
491 std::size_t table_size,
492 const std::size_t& random_seed,
493 std::size_t target_count)
494 : bloom_filter(salt_count, table_size, random_seed, target_count)
495 {
496 size_list.push_back(table_size_);
497 }
498
499 inline std::size_t size() const override
500 {
20effc67 501 return size_list.back() * CHAR_BIT;
7c673cae
FG
502 }
503
504 inline bool compress(const double& target_ratio)
505 {
20effc67 506 if (bit_table_.empty())
7c673cae
FG
507 return false;
508
509 if ((0.0 >= target_ratio) || (target_ratio >= 1.0))
510 {
511 return false;
512 }
513
514 std::size_t original_table_size = size_list.back();
515 std::size_t new_table_size = static_cast<std::size_t>(size_list.back() * target_ratio);
516
517 if ((!new_table_size) || (new_table_size >= original_table_size))
518 {
519 return false;
520 }
521
20effc67
TL
522 table_type tmp(new_table_size);
523 std::copy(bit_table_.begin(), bit_table_.begin() + new_table_size, tmp.begin());
524 auto itr = bit_table_.begin() + new_table_size;
525 auto end = bit_table_.begin() + original_table_size;
526 auto itr_tmp = tmp.begin();
527 auto itr_end = tmp.begin() + new_table_size;
528 while (end != itr) {
7c673cae 529 *(itr_tmp++) |= (*itr++);
20effc67
TL
530 if (itr_tmp == itr_end) {
531 itr_tmp = tmp.begin();
532 }
7c673cae 533 }
20effc67 534 std::swap(bit_table_, tmp);
7c673cae
FG
535 size_list.push_back(new_table_size);
536 table_size_ = new_table_size;
537
538 return true;
539 }
540
541 inline double approx_unique_element_count() const override {
542 // this is not a very good estimate; a better solution should have
543 // some asymptotic behavior as density() approaches 1.0.
544 //
545 // the compress() correction is also bad; it tends to under-estimate.
546 return (double)target_element_count_ * 2.0 * density() * (double)size_list.back() / (double)size_list.front();
547 }
548
549private:
550
20effc67
TL
551 std::pair<size_t /* bit_index */,
552 size_t /* bit */>
553 compute_indices(const bloom_type& hash) const final
7c673cae 554 {
20effc67
TL
555 size_t bit_index = hash;
556 for (auto size : size_list) {
557 bit_index %= size << 3;
7c673cae 558 }
20effc67
TL
559 size_t bit = bit_index & 7;
560 return {bit_index, bit};
7c673cae
FG
561 }
562
563 std::vector<std::size_t> size_list;
564public:
9f95a23c
TL
565 void encode(ceph::bufferlist& bl) const;
566 void decode(ceph::bufferlist::const_iterator& bl);
567 void dump(ceph::Formatter *f) const;
7c673cae
FG
568 static void generate_test_instances(std::list<compressible_bloom_filter*>& ls);
569};
570WRITE_CLASS_ENCODER(compressible_bloom_filter)
571
572#endif
573
574
575/*
576 Note 1:
20effc67 577 If it can be guaranteed that CHAR_BIT will be of the form 2^n then
7c673cae
FG
578 the following optimization can be used:
579
20effc67 580 bit_table_[bit_index >> n] |= bit_mask[bit_index & (CHAR_BIT - 1)];
7c673cae
FG
581
582 Note 2:
583 For performance reasons where possible when allocating memory it should
584 be aligned (aligned_alloc) according to the architecture being used.
585*/