]> git.proxmox.com Git - ceph.git/blame - ceph/src/common/bloom_filter.hpp
update sources to v12.1.0
[ceph.git] / ceph / src / common / bloom_filter.hpp
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4/*
5 *******************************************************************
6 * *
7 * Open Bloom Filter *
8 * *
9 * Author: Arash Partow - 2000 *
10 * URL: http://www.partow.net/programming/hashfunctions/index.html *
11 * *
12 * Copyright notice: *
13 * Free use of the Open Bloom Filter Library is permitted under *
14 * the guidelines and in accordance with the most current version *
15 * of the Boost Software License, Version 1.0 *
16 * http://www.opensource.org/licenses/bsl1.0.html *
17 * *
18 *******************************************************************
19*/
20
21
22#ifndef COMMON_BLOOM_FILTER_HPP
23#define COMMON_BLOOM_FILTER_HPP
24
7c673cae 25#include <cmath>
7c673cae
FG
26
27#include "include/mempool.h"
28#include "include/encoding.h"
29
30static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned)
31static const unsigned char bit_mask[bits_per_char] = {
32 0x01, //00000001
33 0x02, //00000010
34 0x04, //00000100
35 0x08, //00001000
36 0x10, //00010000
37 0x20, //00100000
38 0x40, //01000000
39 0x80 //10000000
40};
41
42MEMPOOL_DECLARE_FACTORY(unsigned char, byte, bloom_filter);
43
44class bloom_filter
45{
46protected:
47
48 typedef unsigned int bloom_type;
49 typedef unsigned char cell_type;
50
51 unsigned char* bit_table_; ///< pointer to bit map
52 std::vector<bloom_type> salt_; ///< vector of salts
53 std::size_t salt_count_; ///< number of salts
54 std::size_t table_size_; ///< bit table size in bytes
55 std::size_t insert_count_; ///< insertion count
56 std::size_t target_element_count_; ///< target number of unique insertions
57 std::size_t random_seed_; ///< random seed
58
59public:
60
61 bloom_filter()
62 : bit_table_(0),
63 salt_count_(0),
64 table_size_(0),
65 insert_count_(0),
66 target_element_count_(0),
67 random_seed_(0)
68 {}
69
70 bloom_filter(const std::size_t& predicted_inserted_element_count,
71 const double& false_positive_probability,
72 const std::size_t& random_seed)
73 : bit_table_(0),
74 insert_count_(0),
75 target_element_count_(predicted_inserted_element_count),
76 random_seed_((random_seed) ? random_seed : 0xA5A5A5A5)
77 {
78 assert(false_positive_probability > 0.0);
79 find_optimal_parameters(predicted_inserted_element_count, false_positive_probability,
80 &salt_count_, &table_size_);
81 init();
82 }
83
84 bloom_filter(const std::size_t& salt_count,
85 std::size_t table_size,
86 const std::size_t& random_seed,
87 std::size_t target_element_count)
88 : bit_table_(0),
89 salt_count_(salt_count),
90 table_size_(table_size),
91 insert_count_(0),
92 target_element_count_(target_element_count),
93 random_seed_((random_seed) ? random_seed : 0xA5A5A5A5)
94 {
95 init();
96 }
97
98 void init() {
99 generate_unique_salt();
100 if (table_size_) {
101 bit_table_ = mempool::bloom_filter::alloc_byte.allocate(table_size_);
102 std::fill_n(bit_table_, table_size_, 0x00);
103 } else {
104 bit_table_ = NULL;
105 }
106 }
107
108 bloom_filter(const bloom_filter& filter)
109 : bit_table_(0)
110 {
111 this->operator=(filter);
112 }
113
114 bloom_filter& operator = (const bloom_filter& filter)
115 {
116 if (this != &filter) {
117 if (bit_table_) {
118 mempool::bloom_filter::alloc_byte.deallocate(bit_table_, table_size_);
119 }
120 salt_count_ = filter.salt_count_;
121 table_size_ = filter.table_size_;
122 insert_count_ = filter.insert_count_;
123 target_element_count_ = filter.target_element_count_;
124 random_seed_ = filter.random_seed_;
125 bit_table_ = mempool::bloom_filter::alloc_byte.allocate(table_size_);
126 std::copy(filter.bit_table_, filter.bit_table_ + table_size_, bit_table_);
127 salt_ = filter.salt_;
128 }
129 return *this;
130 }
131
132 virtual ~bloom_filter()
133 {
134 mempool::bloom_filter::alloc_byte.deallocate(bit_table_, table_size_);
135 }
136
137 inline bool operator!() const
138 {
139 return (0 == table_size_);
140 }
141
142 inline void clear()
143 {
144 if (bit_table_)
145 std::fill_n(bit_table_, table_size_, 0x00);
146 insert_count_ = 0;
147 }
148
149 /**
150 * insert a u32 into the set
151 *
152 * NOTE: the internal hash is weak enough that consecutive inputs do
153 * not achieve the desired fpp. Well-mixed values should be used
154 * here (e.g., put rjhash(x) into the filter instead of just x).
155 *
156 * @param val integer value to insert
157 */
158 inline void insert(uint32_t val) {
159 assert(bit_table_);
160 std::size_t bit_index = 0;
161 std::size_t bit = 0;
162 for (std::size_t i = 0; i < salt_.size(); ++i)
163 {
164 compute_indices(hash_ap(val,salt_[i]),bit_index,bit);
165 bit_table_[bit_index >> 3] |= bit_mask[bit];
166 }
167 ++insert_count_;
168 }
169
170 inline void insert(const unsigned char* key_begin, const std::size_t& length)
171 {
172 assert(bit_table_);
173 std::size_t bit_index = 0;
174 std::size_t bit = 0;
175 for (std::size_t i = 0; i < salt_.size(); ++i)
176 {
177 compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
178 bit_table_[bit_index >> 3] |= bit_mask[bit];
179 }
180 ++insert_count_;
181 }
182
183 template<typename T>
184 inline void insert(const T& t)
185 {
186 // Note: T must be a C++ POD type.
187 insert(reinterpret_cast<const unsigned char*>(&t),sizeof(T));
188 }
189
190 inline void insert(const std::string& key)
191 {
192 insert(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
193 }
194
195 inline void insert(const char* data, const std::size_t& length)
196 {
197 insert(reinterpret_cast<const unsigned char*>(data),length);
198 }
199
200 template<typename InputIterator>
201 inline void insert(const InputIterator begin, const InputIterator end)
202 {
203 InputIterator itr = begin;
204 while (end != itr)
205 {
206 insert(*(itr++));
207 }
208 }
209
210 /**
211 * check if a u32 is contained by set
212 *
213 * NOTE: the internal hash is weak enough that consecutive inputs do
214 * not achieve the desired fpp. Well-mixed values should be used
215 * here (e.g., put rjhash(x) into the filter instead of just x).
216 *
217 * @param val integer value to query
218 * @returns true if value is (probably) in the set, false if it definitely is not
219 */
220 inline virtual bool contains(uint32_t val) const
221 {
222 if (!bit_table_)
223 return false;
224 std::size_t bit_index = 0;
225 std::size_t bit = 0;
226 for (std::size_t i = 0; i < salt_.size(); ++i)
227 {
228 compute_indices(hash_ap(val,salt_[i]),bit_index,bit);
229 if ((bit_table_[bit_index >> 3] & bit_mask[bit]) != bit_mask[bit])
230 {
231 return false;
232 }
233 }
234 return true;
235 }
236
237 inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
238 {
239 if (!bit_table_)
240 return false;
241 std::size_t bit_index = 0;
242 std::size_t bit = 0;
243 for (std::size_t i = 0; i < salt_.size(); ++i)
244 {
245 compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
246 if ((bit_table_[bit_index >> 3] & bit_mask[bit]) != bit_mask[bit])
247 {
248 return false;
249 }
250 }
251 return true;
252 }
253
254 template<typename T>
255 inline bool contains(const T& t) const
256 {
257 return contains(reinterpret_cast<const unsigned char*>(&t),static_cast<std::size_t>(sizeof(T)));
258 }
259
260 inline bool contains(const std::string& key) const
261 {
262 return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
263 }
264
265 inline bool contains(const char* data, const std::size_t& length) const
266 {
267 return contains(reinterpret_cast<const unsigned char*>(data),length);
268 }
269
270 template<typename InputIterator>
271 inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
272 {
273 InputIterator itr = begin;
274 while (end != itr)
275 {
276 if (!contains(*itr))
277 {
278 return itr;
279 }
280 ++itr;
281 }
282 return end;
283 }
284
285 template<typename InputIterator>
286 inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
287 {
288 InputIterator itr = begin;
289 while (end != itr)
290 {
291 if (contains(*itr))
292 {
293 return itr;
294 }
295 ++itr;
296 }
297 return end;
298 }
299
300 inline virtual std::size_t size() const
301 {
302 return table_size_ * bits_per_char;
303 }
304
305 inline std::size_t element_count() const
306 {
307 return insert_count_;
308 }
309
310 inline bool is_full() const
311 {
312 return insert_count_ >= target_element_count_;
313 }
314
315 /*
316 * density of bits set. inconvenient units, but:
317 * .3 = ~50% target insertions
318 * .5 = 100% target insertions, "perfectly full"
319 * .75 = 200% target insertions
320 * 1.0 = all bits set... infinite insertions
321 */
322 inline double density() const
323 {
324 if (!bit_table_)
325 return 0.0;
326 size_t set = 0;
327 uint8_t *p = bit_table_;
328 size_t left = table_size_;
329 while (left-- > 0) {
330 uint8_t c = *p;
331 for (; c; ++set)
332 c &= c - 1;
333 ++p;
334 }
335 return (double)set / (double)(table_size_ << 3);
336 }
337
338 virtual inline double approx_unique_element_count() const {
339 // this is not a very good estimate; a better solution should have
340 // some asymptotic behavior as density() approaches 1.0.
341 return (double)target_element_count_ * 2.0 * density();
342 }
343
344 inline double effective_fpp() const
345 {
346 /*
347 Note:
348 The effective false positive probability is calculated using the
349 designated table size and hash function count in conjunction with
350 the current number of inserted elements - not the user defined
351 predicated/expected number of inserted elements.
352 */
353 return std::pow(1.0 - std::exp(-1.0 * salt_.size() * insert_count_ / size()), 1.0 * salt_.size());
354 }
355
356 inline const cell_type* table() const
357 {
358 return bit_table_;
359 }
360
361protected:
362
363 inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
364 {
365 bit_index = hash % (table_size_ << 3);
366 bit = bit_index & 7;
367 }
368
369 void generate_unique_salt()
370 {
371 /*
372 Note:
373 A distinct hash function need not be implementation-wise
374 distinct. In the current implementation "seeding" a common
375 hash function with different values seems to be adequate.
376 */
377 const unsigned int predef_salt_count = 128;
378 static const bloom_type predef_salt[predef_salt_count] = {
379 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
380 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
381 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
382 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
383 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
384 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
385 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
386 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
387 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
388 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
389 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
390 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
391 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
392 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
393 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
394 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
395 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
396 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
397 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
398 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
399 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
400 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
401 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
402 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
403 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
404 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
405 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
406 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
407 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
408 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
409 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
410 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
411 };
412
413 if (salt_count_ <= predef_salt_count)
414 {
415 std::copy(predef_salt,
416 predef_salt + salt_count_,
417 std::back_inserter(salt_));
418 for (unsigned int i = 0; i < salt_.size(); ++i)
419 {
420 /*
421 Note:
422 This is done to integrate the user defined random seed,
423 so as to allow for the generation of unique bloom filter
424 instances.
425 */
426 salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_;
427 }
428 }
429 else
430 {
431 std::copy(predef_salt,predef_salt + predef_salt_count,
432 std::back_inserter(salt_));
433 srand(static_cast<unsigned int>(random_seed_));
434 while (salt_.size() < salt_count_)
435 {
436 bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
437 if (0 == current_salt)
438 continue;
439 if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
440 {
441 salt_.push_back(current_salt);
442 }
443 }
444 }
445 }
446
447 static void find_optimal_parameters(std::size_t target_insert_count,
448 double target_fpp,
449 std::size_t *salt_count,
450 std::size_t *table_size)
451 {
452 /*
453 Note:
454 The following will attempt to find the number of hash functions
455 and minimum amount of storage bits required to construct a bloom
456 filter consistent with the user defined false positive probability
457 and estimated element insertion count.
458 */
459
460 double min_m = std::numeric_limits<double>::infinity();
461 double min_k = 0.0;
462 double curr_m = 0.0;
463 double k = 1.0;
464 while (k < 1000.0)
465 {
466 double numerator = (- k * target_insert_count);
467 double denominator = std::log(1.0 - std::pow(target_fpp, 1.0 / k));
468 curr_m = numerator / denominator;
469
470 if (curr_m < min_m)
471 {
472 min_m = curr_m;
473 min_k = k;
474 }
475 k += 1.0;
476 }
477
478 *salt_count = static_cast<std::size_t>(min_k);
479 size_t t = static_cast<std::size_t>(min_m);
480 t += (((t & 7) != 0) ? (bits_per_char - (t & 7)) : 0);
481 *table_size = t >> 3;
482 }
483
484 inline bloom_type hash_ap(uint32_t val, bloom_type hash) const
485 {
486 hash ^= (hash << 7) ^ ((val & 0xff000000) >> 24) * (hash >> 3);
487 hash ^= (~((hash << 11) + (((val & 0xff0000) >> 16) ^ (hash >> 5))));
488 hash ^= (hash << 7) ^ ((val & 0xff00) >> 8) * (hash >> 3);
489 hash ^= (~((hash << 11) + (((val & 0xff)) ^ (hash >> 5))));
490 return hash;
491 }
492
493 inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
494 {
495 const unsigned char* itr = begin;
496
497 while (remaining_length >= 4)
498 {
499 hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
500 hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
501 hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
502 hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
503 remaining_length -= 4;
504 }
505
506 while (remaining_length >= 2)
507 {
508 hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
509 hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
510 remaining_length -= 2;
511 }
512
513 if (remaining_length)
514 {
515 hash ^= (hash << 7) ^ (*itr) * (hash >> 3);
516 }
517
518 return hash;
519 }
520
521public:
522 void encode(bufferlist& bl) const;
523 void decode(bufferlist::iterator& bl);
524 void dump(Formatter *f) const;
525 static void generate_test_instances(std::list<bloom_filter*>& ls);
526};
527WRITE_CLASS_ENCODER(bloom_filter)
528
529
530class compressible_bloom_filter : public bloom_filter
531{
532public:
533
534 compressible_bloom_filter() : bloom_filter() {}
535
536 compressible_bloom_filter(const std::size_t& predicted_element_count,
537 const double& false_positive_probability,
538 const std::size_t& random_seed)
539 : bloom_filter(predicted_element_count, false_positive_probability, random_seed)
540 {
541 size_list.push_back(table_size_);
542 }
543
544 compressible_bloom_filter(const std::size_t& salt_count,
545 std::size_t table_size,
546 const std::size_t& random_seed,
547 std::size_t target_count)
548 : bloom_filter(salt_count, table_size, random_seed, target_count)
549 {
550 size_list.push_back(table_size_);
551 }
552
553 inline std::size_t size() const override
554 {
555 return size_list.back() * bits_per_char;
556 }
557
558 inline bool compress(const double& target_ratio)
559 {
560 if (!bit_table_)
561 return false;
562
563 if ((0.0 >= target_ratio) || (target_ratio >= 1.0))
564 {
565 return false;
566 }
567
568 std::size_t original_table_size = size_list.back();
569 std::size_t new_table_size = static_cast<std::size_t>(size_list.back() * target_ratio);
570
571 if ((!new_table_size) || (new_table_size >= original_table_size))
572 {
573 return false;
574 }
575
576 cell_type* tmp = mempool::bloom_filter::alloc_byte.allocate(new_table_size);
577 std::copy(bit_table_, bit_table_ + (new_table_size), tmp);
578 cell_type* itr = bit_table_ + (new_table_size);
579 cell_type* end = bit_table_ + (original_table_size);
580 cell_type* itr_tmp = tmp;
581 cell_type* itr_end = tmp + (new_table_size);
582 while (end != itr)
583 {
584 *(itr_tmp++) |= (*itr++);
585 if (itr_tmp == itr_end)
586 itr_tmp = tmp;
587 }
588
589 mempool::bloom_filter::alloc_byte.deallocate(bit_table_, table_size_);
590 bit_table_ = tmp;
591 size_list.push_back(new_table_size);
592 table_size_ = new_table_size;
593
594 return true;
595 }
596
597 inline double approx_unique_element_count() const override {
598 // this is not a very good estimate; a better solution should have
599 // some asymptotic behavior as density() approaches 1.0.
600 //
601 // the compress() correction is also bad; it tends to under-estimate.
602 return (double)target_element_count_ * 2.0 * density() * (double)size_list.back() / (double)size_list.front();
603 }
604
605private:
606
607 inline void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const override
608 {
609 bit_index = hash;
610 for (std::size_t i = 0; i < size_list.size(); ++i)
611 {
612 bit_index %= size_list[i] << 3;
613 }
614 bit = bit_index & 7;
615 }
616
617 std::vector<std::size_t> size_list;
618public:
619 void encode(bufferlist& bl) const;
620 void decode(bufferlist::iterator& bl);
621 void dump(Formatter *f) const;
622 static void generate_test_instances(std::list<compressible_bloom_filter*>& ls);
623};
624WRITE_CLASS_ENCODER(compressible_bloom_filter)
625
626#endif
627
628
629/*
630 Note 1:
631 If it can be guaranteed that bits_per_char will be of the form 2^n then
632 the following optimization can be used:
633
634 hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)];
635
636 Note 2:
637 For performance reasons where possible when allocating memory it should
638 be aligned (aligned_alloc) according to the architecture being used.
639*/