1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
5 *******************************************************************
9 * Author: Arash Partow - 2000 *
10 * URL: http://www.partow.net/programming/hashfunctions/index.html *
13 * Free use of the Open Bloom Filter Library is permitted under *
14 * the guidelines and in accordance with the most current version *
15 * of the Boost Software License, Version 1.0 *
16 * http://www.opensource.org/licenses/bsl1.0.html *
18 *******************************************************************
22 #ifndef COMMON_BLOOM_FILTER_HPP
23 #define COMMON_BLOOM_FILTER_HPP
27 #include "include/mempool.h"
28 #include "include/encoding.h"
30 static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned)
31 static const unsigned char bit_mask[bits_per_char] = {
42 MEMPOOL_DECLARE_FACTORY(unsigned char, byte, bloom_filter);
48 typedef unsigned int bloom_type;
49 typedef unsigned char cell_type;
51 unsigned char* bit_table_; ///< pointer to bit map
52 std::vector<bloom_type> salt_; ///< vector of salts
53 std::size_t salt_count_; ///< number of salts
54 std::size_t table_size_; ///< bit table size in bytes
55 std::size_t insert_count_; ///< insertion count
56 std::size_t target_element_count_; ///< target number of unique insertions
57 std::size_t random_seed_; ///< random seed
66 target_element_count_(0),
70 bloom_filter(const std::size_t& predicted_inserted_element_count,
71 const double& false_positive_probability,
72 const std::size_t& random_seed)
75 target_element_count_(predicted_inserted_element_count),
76 random_seed_((random_seed) ? random_seed : 0xA5A5A5A5)
78 ceph_assert(false_positive_probability > 0.0);
79 find_optimal_parameters(predicted_inserted_element_count, false_positive_probability,
80 &salt_count_, &table_size_);
84 bloom_filter(const std::size_t& salt_count,
85 std::size_t table_size,
86 const std::size_t& random_seed,
87 std::size_t target_element_count)
89 salt_count_(salt_count),
90 table_size_(table_size),
92 target_element_count_(target_element_count),
93 random_seed_((random_seed) ? random_seed : 0xA5A5A5A5)
99 generate_unique_salt();
101 bit_table_ = mempool::bloom_filter::alloc_byte.allocate(table_size_);
102 std::fill_n(bit_table_, table_size_, 0x00);
108 bloom_filter(const bloom_filter& filter)
111 this->operator=(filter);
114 bloom_filter& operator = (const bloom_filter& filter)
116 if (this != &filter) {
118 mempool::bloom_filter::alloc_byte.deallocate(bit_table_, table_size_);
120 salt_count_ = filter.salt_count_;
121 table_size_ = filter.table_size_;
122 insert_count_ = filter.insert_count_;
123 target_element_count_ = filter.target_element_count_;
124 random_seed_ = filter.random_seed_;
125 bit_table_ = mempool::bloom_filter::alloc_byte.allocate(table_size_);
126 std::copy(filter.bit_table_, filter.bit_table_ + table_size_, bit_table_);
127 salt_ = filter.salt_;
132 virtual ~bloom_filter()
134 mempool::bloom_filter::alloc_byte.deallocate(bit_table_, table_size_);
137 inline bool operator!() const
139 return (0 == table_size_);
145 std::fill_n(bit_table_, table_size_, 0x00);
150 * insert a u32 into the set
152 * NOTE: the internal hash is weak enough that consecutive inputs do
153 * not achieve the desired fpp. Well-mixed values should be used
154 * here (e.g., put rjhash(x) into the filter instead of just x).
156 * @param val integer value to insert
158 inline void insert(uint32_t val) {
159 ceph_assert(bit_table_);
160 std::size_t bit_index = 0;
162 for (std::size_t i = 0; i < salt_.size(); ++i)
164 compute_indices(hash_ap(val,salt_[i]),bit_index,bit);
165 bit_table_[bit_index >> 3] |= bit_mask[bit];
170 inline void insert(const unsigned char* key_begin, const std::size_t& length)
172 ceph_assert(bit_table_);
173 std::size_t bit_index = 0;
175 for (std::size_t i = 0; i < salt_.size(); ++i)
177 compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
178 bit_table_[bit_index >> 3] |= bit_mask[bit];
183 inline void insert(const std::string& key)
185 insert(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
188 inline void insert(const char* data, const std::size_t& length)
190 insert(reinterpret_cast<const unsigned char*>(data),length);
193 template<typename InputIterator>
194 inline void insert(const InputIterator begin, const InputIterator end)
196 InputIterator itr = begin;
204 * check if a u32 is contained by set
206 * NOTE: the internal hash is weak enough that consecutive inputs do
207 * not achieve the desired fpp. Well-mixed values should be used
208 * here (e.g., put rjhash(x) into the filter instead of just x).
210 * @param val integer value to query
211 * @returns true if value is (probably) in the set, false if it definitely is not
213 inline virtual bool contains(uint32_t val) const
217 std::size_t bit_index = 0;
219 for (std::size_t i = 0; i < salt_.size(); ++i)
221 compute_indices(hash_ap(val,salt_[i]),bit_index,bit);
222 if ((bit_table_[bit_index >> 3] & bit_mask[bit]) != bit_mask[bit])
230 inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
234 std::size_t bit_index = 0;
236 for (std::size_t i = 0; i < salt_.size(); ++i)
238 compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
239 if ((bit_table_[bit_index >> 3] & bit_mask[bit]) != bit_mask[bit])
247 inline bool contains(const std::string& key) const
249 return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
252 inline bool contains(const char* data, const std::size_t& length) const
254 return contains(reinterpret_cast<const unsigned char*>(data),length);
257 template<typename InputIterator>
258 inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
260 InputIterator itr = begin;
272 template<typename InputIterator>
273 inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
275 InputIterator itr = begin;
287 inline virtual std::size_t size() const
289 return table_size_ * bits_per_char;
292 inline std::size_t element_count() const
294 return insert_count_;
297 inline bool is_full() const
299 return insert_count_ >= target_element_count_;
303 * density of bits set. inconvenient units, but:
304 * .3 = ~50% target insertions
305 * .5 = 100% target insertions, "perfectly full"
306 * .75 = 200% target insertions
307 * 1.0 = all bits set... infinite insertions
309 inline double density() const
314 uint8_t *p = bit_table_;
315 size_t left = table_size_;
322 return (double)set / (double)(table_size_ << 3);
325 virtual inline double approx_unique_element_count() const {
326 // this is not a very good estimate; a better solution should have
327 // some asymptotic behavior as density() approaches 1.0.
328 return (double)target_element_count_ * 2.0 * density();
331 inline double effective_fpp() const
335 The effective false positive probability is calculated using the
336 designated table size and hash function count in conjunction with
337 the current number of inserted elements - not the user defined
338 predicated/expected number of inserted elements.
340 return std::pow(1.0 - std::exp(-1.0 * salt_.size() * insert_count_ / size()), 1.0 * salt_.size());
343 inline const cell_type* table() const
350 inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
352 bit_index = hash % (table_size_ << 3);
356 void generate_unique_salt()
360 A distinct hash function need not be implementation-wise
361 distinct. In the current implementation "seeding" a common
362 hash function with different values seems to be adequate.
364 const unsigned int predef_salt_count = 128;
365 static const bloom_type predef_salt[predef_salt_count] = {
366 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
367 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
368 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
369 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
370 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
371 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
372 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
373 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
374 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
375 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
376 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
377 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
378 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
379 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
380 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
381 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
382 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
383 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
384 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
385 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
386 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
387 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
388 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
389 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
390 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
391 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
392 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
393 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
394 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
395 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
396 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
397 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
400 if (salt_count_ <= predef_salt_count)
402 std::copy(predef_salt,
403 predef_salt + salt_count_,
404 std::back_inserter(salt_));
405 for (unsigned int i = 0; i < salt_.size(); ++i)
409 This is done to integrate the user defined random seed,
410 so as to allow for the generation of unique bloom filter
413 salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_;
418 std::copy(predef_salt,predef_salt + predef_salt_count,
419 std::back_inserter(salt_));
420 srand(static_cast<unsigned int>(random_seed_));
421 while (salt_.size() < salt_count_)
423 bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
424 if (0 == current_salt)
426 if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
428 salt_.push_back(current_salt);
434 static void find_optimal_parameters(std::size_t target_insert_count,
436 std::size_t *salt_count,
437 std::size_t *table_size)
441 The following will attempt to find the number of hash functions
442 and minimum amount of storage bits required to construct a bloom
443 filter consistent with the user defined false positive probability
444 and estimated element insertion count.
447 double min_m = std::numeric_limits<double>::infinity();
453 double numerator = (- k * target_insert_count);
454 double denominator = std::log(1.0 - std::pow(target_fpp, 1.0 / k));
455 curr_m = numerator / denominator;
465 *salt_count = static_cast<std::size_t>(min_k);
466 size_t t = static_cast<std::size_t>(min_m);
467 t += (((t & 7) != 0) ? (bits_per_char - (t & 7)) : 0);
468 *table_size = t >> 3;
471 inline bloom_type hash_ap(uint32_t val, bloom_type hash) const
473 hash ^= (hash << 7) ^ ((val & 0xff000000) >> 24) * (hash >> 3);
474 hash ^= (~((hash << 11) + (((val & 0xff0000) >> 16) ^ (hash >> 5))));
475 hash ^= (hash << 7) ^ ((val & 0xff00) >> 8) * (hash >> 3);
476 hash ^= (~((hash << 11) + (((val & 0xff)) ^ (hash >> 5))));
480 inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
482 const unsigned char* itr = begin;
484 while (remaining_length >= 4)
486 hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
487 hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
488 hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
489 hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
490 remaining_length -= 4;
493 while (remaining_length >= 2)
495 hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
496 hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
497 remaining_length -= 2;
500 if (remaining_length)
502 hash ^= (hash << 7) ^ (*itr) * (hash >> 3);
509 void encode(ceph::buffer::list& bl) const;
510 void decode(ceph::buffer::list::const_iterator& bl);
511 void dump(ceph::Formatter *f) const;
512 static void generate_test_instances(std::list<bloom_filter*>& ls);
514 WRITE_CLASS_ENCODER(bloom_filter)
517 class compressible_bloom_filter : public bloom_filter
521 compressible_bloom_filter() : bloom_filter() {}
523 compressible_bloom_filter(const std::size_t& predicted_element_count,
524 const double& false_positive_probability,
525 const std::size_t& random_seed)
526 : bloom_filter(predicted_element_count, false_positive_probability, random_seed)
528 size_list.push_back(table_size_);
531 compressible_bloom_filter(const std::size_t& salt_count,
532 std::size_t table_size,
533 const std::size_t& random_seed,
534 std::size_t target_count)
535 : bloom_filter(salt_count, table_size, random_seed, target_count)
537 size_list.push_back(table_size_);
540 inline std::size_t size() const override
542 return size_list.back() * bits_per_char;
545 inline bool compress(const double& target_ratio)
550 if ((0.0 >= target_ratio) || (target_ratio >= 1.0))
555 std::size_t original_table_size = size_list.back();
556 std::size_t new_table_size = static_cast<std::size_t>(size_list.back() * target_ratio);
558 if ((!new_table_size) || (new_table_size >= original_table_size))
563 cell_type* tmp = mempool::bloom_filter::alloc_byte.allocate(new_table_size);
564 std::copy(bit_table_, bit_table_ + (new_table_size), tmp);
565 cell_type* itr = bit_table_ + (new_table_size);
566 cell_type* end = bit_table_ + (original_table_size);
567 cell_type* itr_tmp = tmp;
568 cell_type* itr_end = tmp + (new_table_size);
571 *(itr_tmp++) |= (*itr++);
572 if (itr_tmp == itr_end)
576 mempool::bloom_filter::alloc_byte.deallocate(bit_table_, table_size_);
578 size_list.push_back(new_table_size);
579 table_size_ = new_table_size;
584 inline double approx_unique_element_count() const override {
585 // this is not a very good estimate; a better solution should have
586 // some asymptotic behavior as density() approaches 1.0.
588 // the compress() correction is also bad; it tends to under-estimate.
589 return (double)target_element_count_ * 2.0 * density() * (double)size_list.back() / (double)size_list.front();
594 inline void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const override
597 for (std::size_t i = 0; i < size_list.size(); ++i)
599 bit_index %= size_list[i] << 3;
604 std::vector<std::size_t> size_list;
606 void encode(ceph::bufferlist& bl) const;
607 void decode(ceph::bufferlist::const_iterator& bl);
608 void dump(ceph::Formatter *f) const;
609 static void generate_test_instances(std::list<compressible_bloom_filter*>& ls);
611 WRITE_CLASS_ENCODER(compressible_bloom_filter)
618 If it can be guaranteed that bits_per_char will be of the form 2^n then
619 the following optimization can be used:
621 hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)];
624 For performance reasons where possible when allocating memory it should
625 be aligned (aligned_alloc) according to the architecture being used.