1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
5 *******************************************************************
9 * Author: Arash Partow - 2000 *
10 * URL: http://www.partow.net/programming/hashfunctions/index.html *
13 * Free use of the Open Bloom Filter Library is permitted under *
14 * the guidelines and in accordance with the most current version *
15 * of the Boost Software License, Version 1.0 *
16 * http://www.opensource.org/licenses/bsl1.0.html *
18 *******************************************************************
22 #ifndef COMMON_BLOOM_FILTER_HPP
23 #define COMMON_BLOOM_FILTER_HPP
27 #include "include/encoding.h"
28 #include "include/mempool.h"
30 static const unsigned char bit_mask[CHAR_BIT] = {
45 using bloom_type = unsigned int;
46 using cell_type = unsigned char;
47 using table_type = mempool::bloom_filter::vector<cell_type>;
49 std::vector<bloom_type> salt_; ///< vector of salts
50 table_type bit_table_; ///< bit map
51 std::size_t salt_count_; ///< number of salts
52 std::size_t table_size_; ///< bit table size in bytes
53 std::size_t insert_count_; ///< insertion count
54 std::size_t target_element_count_; ///< target number of unique insertions
55 std::size_t random_seed_; ///< random seed
63 target_element_count_(0),
67 bloom_filter(const std::size_t& predicted_inserted_element_count,
68 const double& false_positive_probability,
69 const std::size_t& random_seed)
71 target_element_count_(predicted_inserted_element_count),
72 random_seed_((random_seed) ? random_seed : 0xA5A5A5A5)
74 ceph_assert(false_positive_probability > 0.0);
75 std::tie(salt_count_, table_size_) =
76 find_optimal_parameters(predicted_inserted_element_count,
77 false_positive_probability);
81 bloom_filter(const std::size_t& salt_count,
82 std::size_t table_size,
83 const std::size_t& random_seed,
84 std::size_t target_element_count)
85 : salt_count_(salt_count),
86 table_size_(table_size),
88 target_element_count_(target_element_count),
89 random_seed_((random_seed) ? random_seed : 0xA5A5A5A5)
95 generate_unique_salt();
96 bit_table_.resize(table_size_, static_cast<unsigned char>(0x00));
99 bloom_filter(const bloom_filter& filter)
101 this->operator=(filter);
104 bloom_filter& operator = (const bloom_filter& filter)
106 if (this != &filter) {
107 salt_count_ = filter.salt_count_;
108 table_size_ = filter.table_size_;
109 insert_count_ = filter.insert_count_;
110 target_element_count_ = filter.target_element_count_;
111 random_seed_ = filter.random_seed_;
112 bit_table_ = filter.bit_table_;
113 salt_ = filter.salt_;
118 virtual ~bloom_filter() = default;
120 inline bool operator!() const
122 return (0 == table_size_);
127 std::fill(bit_table_.begin(), bit_table_.end(),
128 static_cast<unsigned char>(0x00));
133 * insert a u32 into the set
135 * NOTE: the internal hash is weak enough that consecutive inputs do
136 * not achieve the desired fpp. Well-mixed values should be used
137 * here (e.g., put rjhash(x) into the filter instead of just x).
139 * @param val integer value to insert
141 inline void insert(uint32_t val) {
142 for (auto salt : salt_) {
143 auto [bit_index, bit] = compute_indices(hash_ap(val, salt));
144 bit_table_[bit_index >> 3] |= bit_mask[bit];
149 inline void insert(const unsigned char* key_begin, const std::size_t& length)
151 for (auto salt : salt_) {
152 auto [bit_index, bit] = compute_indices(hash_ap(key_begin, length, salt));
153 bit_table_[bit_index >> 3] |= bit_mask[bit];
158 inline void insert(const std::string& key)
160 insert(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
163 inline void insert(const char* data, const std::size_t& length)
165 insert(reinterpret_cast<const unsigned char*>(data),length);
168 template<typename InputIterator>
169 inline void insert(const InputIterator begin, const InputIterator end)
171 InputIterator itr = begin;
179 * check if a u32 is contained by set
181 * NOTE: the internal hash is weak enough that consecutive inputs do
182 * not achieve the desired fpp. Well-mixed values should be used
183 * here (e.g., put rjhash(x) into the filter instead of just x).
185 * @param val integer value to query
186 * @returns true if value is (probably) in the set, false if it definitely is not
188 inline virtual bool contains(uint32_t val) const
190 if (table_size_ == 0) {
193 for (auto salt : salt_) {
194 auto [bit_index, bit] = compute_indices(hash_ap(val, salt));
195 if ((bit_table_[bit_index >> 3] & bit_mask[bit]) != bit_mask[bit]) {
202 inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
204 if (table_size_ == 0) {
207 for (auto salt : salt_) {
208 auto [bit_index, bit] = compute_indices(hash_ap(key_begin, length, salt));
209 if ((bit_table_[bit_index >> 3] & bit_mask[bit]) != bit_mask[bit]) {
216 inline bool contains(const std::string& key) const
218 return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
221 inline bool contains(const char* data, const std::size_t& length) const
223 return contains(reinterpret_cast<const unsigned char*>(data),length);
226 template<typename InputIterator>
227 inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
229 InputIterator itr = begin;
241 template<typename InputIterator>
242 inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
244 InputIterator itr = begin;
256 inline virtual std::size_t size() const
258 return table_size_ * CHAR_BIT;
261 inline std::size_t element_count() const
263 return insert_count_;
266 inline bool is_full() const
268 return insert_count_ >= target_element_count_;
272 * density of bits set. inconvenient units, but:
273 * .3 = ~50% target insertions
274 * .5 = 100% target insertions, "perfectly full"
275 * .75 = 200% target insertions
276 * 1.0 = all bits set... infinite insertions
278 double density() const;
280 virtual inline double approx_unique_element_count() const {
281 // this is not a very good estimate; a better solution should have
282 // some asymptotic behavior as density() approaches 1.0.
283 return (double)target_element_count_ * 2.0 * density();
286 inline double effective_fpp() const
290 The effective false positive probability is calculated using the
291 designated table size and hash function count in conjunction with
292 the current number of inserted elements - not the user defined
293 predicated/expected number of inserted elements.
295 return std::pow(1.0 - std::exp(-1.0 * salt_.size() * insert_count_ / size()), 1.0 * salt_.size());
298 inline const cell_type* table() const
300 return bit_table_.data();
305 virtual std::pair<size_t /* bit_index */,
307 compute_indices(const bloom_type& hash) const
309 size_t bit_index = hash % (table_size_ << 3);
310 size_t bit = bit_index & 7;
311 return {bit_index, bit};
314 void generate_unique_salt()
318 A distinct hash function need not be implementation-wise
319 distinct. In the current implementation "seeding" a common
320 hash function with different values seems to be adequate.
322 const unsigned int predef_salt_count = 128;
323 static const bloom_type predef_salt[predef_salt_count] = {
324 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
325 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
326 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
327 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
328 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
329 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
330 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
331 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
332 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
333 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
334 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
335 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
336 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
337 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
338 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
339 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
340 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
341 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
342 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
343 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
344 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
345 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
346 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
347 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
348 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
349 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
350 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
351 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
352 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
353 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
354 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
355 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
358 if (salt_count_ <= predef_salt_count)
360 std::copy(predef_salt,
361 predef_salt + salt_count_,
362 std::back_inserter(salt_));
363 for (unsigned int i = 0; i < salt_.size(); ++i)
367 This is done to integrate the user defined random seed,
368 so as to allow for the generation of unique bloom filter
371 salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_;
376 std::copy(predef_salt,predef_salt + predef_salt_count,
377 std::back_inserter(salt_));
378 srand(static_cast<unsigned int>(random_seed_));
379 while (salt_.size() < salt_count_)
381 bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
382 if (0 == current_salt)
384 if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
386 salt_.push_back(current_salt);
392 static std::pair<std::size_t /* salt_count */,
393 std::size_t /* table_size */>
394 find_optimal_parameters(std::size_t target_insert_count,
399 The following will attempt to find the number of hash functions
400 and minimum amount of storage bits required to construct a bloom
401 filter consistent with the user defined false positive probability
402 and estimated element insertion count.
405 double min_m = std::numeric_limits<double>::infinity();
411 double numerator = (- k * target_insert_count);
412 double denominator = std::log(1.0 - std::pow(target_fpp, 1.0 / k));
413 curr_m = numerator / denominator;
423 size_t salt_count = static_cast<std::size_t>(min_k);
424 size_t t = static_cast<std::size_t>(min_m);
425 t += (((t & 7) != 0) ? (CHAR_BIT - (t & 7)) : 0);
426 size_t table_size = t >> 3;
427 return {salt_count, table_size};
430 inline bloom_type hash_ap(uint32_t val, bloom_type hash) const
432 hash ^= (hash << 7) ^ ((val & 0xff000000) >> 24) * (hash >> 3);
433 hash ^= (~((hash << 11) + (((val & 0xff0000) >> 16) ^ (hash >> 5))));
434 hash ^= (hash << 7) ^ ((val & 0xff00) >> 8) * (hash >> 3);
435 hash ^= (~((hash << 11) + (((val & 0xff)) ^ (hash >> 5))));
439 inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
441 const unsigned char* itr = begin;
443 while (remaining_length >= 4)
445 hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
446 hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
447 hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
448 hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
449 remaining_length -= 4;
452 while (remaining_length >= 2)
454 hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
455 hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
456 remaining_length -= 2;
459 if (remaining_length)
461 hash ^= (hash << 7) ^ (*itr) * (hash >> 3);
468 void encode(ceph::buffer::list& bl) const;
469 void decode(ceph::buffer::list::const_iterator& bl);
470 void dump(ceph::Formatter *f) const;
471 static void generate_test_instances(std::list<bloom_filter*>& ls);
473 WRITE_CLASS_ENCODER(bloom_filter)
476 class compressible_bloom_filter : public bloom_filter
480 compressible_bloom_filter() : bloom_filter() {}
482 compressible_bloom_filter(const std::size_t& predicted_element_count,
483 const double& false_positive_probability,
484 const std::size_t& random_seed)
485 : bloom_filter(predicted_element_count, false_positive_probability, random_seed)
487 size_list.push_back(table_size_);
490 compressible_bloom_filter(const std::size_t& salt_count,
491 std::size_t table_size,
492 const std::size_t& random_seed,
493 std::size_t target_count)
494 : bloom_filter(salt_count, table_size, random_seed, target_count)
496 size_list.push_back(table_size_);
499 inline std::size_t size() const override
501 return size_list.back() * CHAR_BIT;
504 inline bool compress(const double& target_ratio)
506 if (bit_table_.empty())
509 if ((0.0 >= target_ratio) || (target_ratio >= 1.0))
514 std::size_t original_table_size = size_list.back();
515 std::size_t new_table_size = static_cast<std::size_t>(size_list.back() * target_ratio);
517 if ((!new_table_size) || (new_table_size >= original_table_size))
522 table_type tmp(new_table_size);
523 std::copy(bit_table_.begin(), bit_table_.begin() + new_table_size, tmp.begin());
524 auto itr = bit_table_.begin() + new_table_size;
525 auto end = bit_table_.begin() + original_table_size;
526 auto itr_tmp = tmp.begin();
527 auto itr_end = tmp.begin() + new_table_size;
529 *(itr_tmp++) |= (*itr++);
530 if (itr_tmp == itr_end) {
531 itr_tmp = tmp.begin();
534 std::swap(bit_table_, tmp);
535 size_list.push_back(new_table_size);
536 table_size_ = new_table_size;
541 inline double approx_unique_element_count() const override {
542 // this is not a very good estimate; a better solution should have
543 // some asymptotic behavior as density() approaches 1.0.
545 // the compress() correction is also bad; it tends to under-estimate.
546 return (double)target_element_count_ * 2.0 * density() * (double)size_list.back() / (double)size_list.front();
551 std::pair<size_t /* bit_index */,
553 compute_indices(const bloom_type& hash) const final
555 size_t bit_index = hash;
556 for (auto size : size_list) {
557 bit_index %= size << 3;
559 size_t bit = bit_index & 7;
560 return {bit_index, bit};
563 std::vector<std::size_t> size_list;
565 void encode(ceph::bufferlist& bl) const;
566 void decode(ceph::bufferlist::const_iterator& bl);
567 void dump(ceph::Formatter *f) const;
568 static void generate_test_instances(std::list<compressible_bloom_filter*>& ls);
570 WRITE_CLASS_ENCODER(compressible_bloom_filter)
577 If it can be guaranteed that CHAR_BIT will be of the form 2^n then
578 the following optimization can be used:
580 bit_table_[bit_index >> n] |= bit_mask[bit_index & (CHAR_BIT - 1)];
583 For performance reasons where possible when allocating memory it should
584 be aligned (aligned_alloc) according to the architecture being used.