1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2013 Inktank <info@inktank.com>
8 * LGPL2.1 (see COPYING-LGPL2.1) or later
12 #include <gtest/gtest.h>
14 #include "include/stringify.h"
15 #include "common/bloom_filter.hpp"
17 TEST(BloomFilter
, Basic
) {
18 bloom_filter
bf(10, .1, 1);
22 ASSERT_TRUE(bf
.contains("foo"));
23 ASSERT_TRUE(bf
.contains("bar"));
25 ASSERT_EQ(2U, bf
.element_count());
28 TEST(BloomFilter
, Empty
) {
30 for (int i
=0; i
<100; ++i
) {
31 ASSERT_FALSE(bf
.contains((uint32_t) i
));
32 ASSERT_FALSE(bf
.contains(stringify(i
)));
36 TEST(BloomFilter
, Sweep
) {
37 std::cout
.setf(std::ios_base::fixed
, std::ios_base::floatfield
);
38 std::cout
.precision(5);
39 std::cout
<< "# max\tfpp\tactual\tsize\tB/insert" << std::endl
;
40 for (int ex
= 3; ex
< 12; ex
+= 2) {
41 for (float fpp
= .001; fpp
< .5; fpp
*= 4.0) {
43 bloom_filter
bf(max
, fpp
, 1);
47 ASSERT_TRUE(bf
.contains("foo"));
48 ASSERT_TRUE(bf
.contains("bar"));
50 for (int n
= 0; n
< max
; n
++)
51 bf
.insert("ok" + stringify(n
));
55 for (int n
= 0; n
< test
; n
++)
56 if (bf
.contains("asdf" + stringify(n
)))
59 ASSERT_TRUE(bf
.contains("foo"));
60 ASSERT_TRUE(bf
.contains("bar"));
62 double actual
= (double)hit
/ (double)test
;
67 double byte_per_insert
= (double)bl
.length() / (double)max
;
69 std::cout
<< max
<< "\t" << fpp
<< "\t" << actual
<< "\t" << bl
.length() << "\t" << byte_per_insert
<< std::endl
;
70 ASSERT_TRUE(actual
< fpp
* 10);
76 TEST(BloomFilter
, SweepInt
) {
77 unsigned int seed
= 0;
78 std::cout
.setf(std::ios_base::fixed
, std::ios_base::floatfield
);
79 std::cout
.precision(5);
80 std::cout
<< "# max\tfpp\tactual\tsize\tB/insert\tdensity\tapprox_element_count" << std::endl
;
81 for (int ex
= 3; ex
< 12; ex
+= 2) {
82 for (float fpp
= .001; fpp
< .5; fpp
*= 4.0) {
84 bloom_filter
bf(max
, fpp
, 1);
91 // In Ceph code, the uint32_t input routines to the bloom filter
92 // are used with hash values that are uniformly distributed over
93 // the uint32_t range. To model this behavior in the test, we
94 // pass in values generated by a pseudo-random generator.
95 // To make the test reproducible anyway, use a fixed seed here,
96 // but a different one in each instance.
99 for (int n
= 0; n
< max
; n
++)
100 bf
.insert((uint32_t) rand());
102 int test
= max
* 100;
104 for (int n
= 0; n
< test
; n
++)
105 if (bf
.contains((uint32_t) rand()))
111 double actual
= (double)hit
/ (double)test
;
116 double byte_per_insert
= (double)bl
.length() / (double)max
;
118 std::cout
<< max
<< "\t" << fpp
<< "\t" << actual
<< "\t" << bl
.length() << "\t" << byte_per_insert
119 << "\t" << bf
.density() << "\t" << bf
.approx_unique_element_count() << std::endl
;
120 ASSERT_TRUE(actual
< fpp
* 3);
121 ASSERT_TRUE(actual
> fpp
/ 3);
122 ASSERT_TRUE(bf
.density() > 0.40);
123 ASSERT_TRUE(bf
.density() < 0.60);
129 TEST(BloomFilter
, CompressibleSweep
) {
130 unsigned int seed
= 0;
131 std::cout
.setf(std::ios_base::fixed
, std::ios_base::floatfield
);
132 std::cout
.precision(5);
133 std::cout
<< "# max\tins\test ins\tafter\ttgtfpp\tactual\tsize\tb/elem\n";
136 for (int div
= 1; div
< 10; div
++) {
137 compressible_bloom_filter
bf(max
, fpp
, 1);
139 // See the comment in SweepInt.
142 std::vector
<uint32_t> values
;
144 for (int n
= 0; n
< t
; n
++) {
145 uint32_t val
= (uint32_t) rand();
147 values
.push_back(val
);
150 unsigned est
= bf
.approx_unique_element_count();
152 bf
.compress(1.0 / div
);
154 for (auto val
: values
)
155 ASSERT_TRUE(bf
.contains(val
));
157 int test
= max
* 100;
159 for (int n
= 0; n
< test
; n
++)
160 if (bf
.contains((uint32_t) rand()))
163 double actual
= (double)hit
/ (double)test
;
168 double byte_per_insert
= (double)bl
.length() / (double)max
;
169 unsigned est_after
= bf
.approx_unique_element_count();
176 << "\t" << bl
.length() << "\t" << byte_per_insert
179 ASSERT_TRUE(actual
< fpp
* 2.0);
180 ASSERT_TRUE(actual
> fpp
/ 2.0);
181 ASSERT_TRUE(est_after
< est
* 2);
182 ASSERT_TRUE(est_after
> est
/ 2);
188 TEST(BloomFilter
, BinSweep
) {
189 std::cout
.setf(std::ios_base::fixed
, std::ios_base::floatfield
);
190 std::cout
.precision(5);
191 int total_max
= 16384;
192 float total_fpp
= .01;
193 std::cout
<< "total_inserts " << total_max
<< " target-fpp " << total_fpp
<< std::endl
;
194 for (int bins
= 1; bins
< 16; ++bins
) {
195 int max
= total_max
/ bins
;
196 float fpp
= total_fpp
/ bins
;//pow(total_fpp, bins);
198 std::vector
<std::unique_ptr
<bloom_filter
>> ls
;
200 for (int i
=0; i
<bins
; i
++) {
201 ls
.push_back(std::make_unique
<bloom_filter
>(max
, fpp
, i
));
202 for (int j
=0; j
<max
; j
++) {
203 ls
.back()->insert(10000 * (i
+1) + j
);
205 encode(*ls
.front(), bl
);
209 int test
= max
* 100;
210 for (int i
=0; i
<test
; ++i
) {
211 for (std::vector
<std::unique_ptr
<bloom_filter
>>::iterator j
= ls
.begin(); j
!= ls
.end(); ++j
) {
212 if ((*j
)->contains(i
* 732)) { // note: sequential i does not work here; the intenral int hash is weak!!
219 double actual
= (double)hit
/ (double)test
;
220 std::cout
<< "bins " << bins
<< " bin-max " << max
<< " bin-fpp " << fpp
221 << " actual-fpp " << actual
222 << " total-size " << bl
.length() << std::endl
;
226 // disable these tests; doing dual insertions in consecutive filters
227 // appears to be equivalent to doing a single insertion in a bloom
228 // filter that is twice as big.
231 // test the fpp over a sequence of bloom filters, each with unique
232 // items inserted into it.
234 // we expect: actual_fpp = num_filters * per_filter_fpp
235 TEST(BloomFilter
, Sequence
) {
239 for (int seq
= 2; seq
<= 128; seq
*= 2) {
240 std::vector
<bloom_filter
*> ls
;
241 for (int i
=0; i
<seq
; i
++) {
242 ls
.push_back(new bloom_filter(max
*2, fpp
, i
));
243 for (int j
=0; j
<max
; j
++) {
244 ls
.back()->insert("ok" + stringify(j
) + "_" + stringify(i
));
246 ls
[ls
.size() - 2]->insert("ok" + stringify(j
) + "_" + stringify(i
));
251 int test
= max
* 100;
252 for (int i
=0; i
<test
; ++i
) {
253 for (std::vector
<bloom_filter
*>::iterator j
= ls
.begin(); j
!= ls
.end(); ++j
) {
254 if ((*j
)->contains("bad" + stringify(i
))) {
261 double actual
= (double)hit
/ (double)test
;
262 std::cout
<< "seq " << seq
<< " max " << max
<< " fpp " << fpp
<< " actual " << actual
<< std::endl
;
266 // test the ffp over a sequence of bloom filters, where actual values
267 // are always inserted into a consecutive pair of filters. in order
268 // to have a false positive, we need to falsely match two consecutive
271 // we expect: actual_fpp = num_filters * per_filter_fpp^2
272 TEST(BloomFilter
, SequenceDouble
) {
275 for (int seq
= 2; seq
<= 128; seq
*= 2) {
276 std::vector
<bloom_filter
*> ls
;
277 for (int i
=0; i
<seq
; i
++) {
278 ls
.push_back(new bloom_filter(max
*2, fpp
, i
));
279 for (int j
=0; j
<max
; j
++) {
280 ls
.back()->insert("ok" + stringify(j
) + "_" + stringify(i
));
282 ls
[ls
.size() - 2]->insert("ok" + stringify(j
) + "_" + stringify(i
));
287 int test
= max
* 100;
289 for (int i
=0; i
<test
; ++i
) {
290 for (std::vector
<bloom_filter
*>::iterator j
= ls
.begin(); j
!= ls
.end(); ++j
) {
291 if ((*j
)->contains("bad" + stringify(i
))) {
303 double actual
= (double)hit
/ (double)test
;
304 std::cout
<< "seq " << seq
<< " max " << max
<< " fpp " << fpp
<< " actual " << actual
305 << " expected " << (fpp
*fpp
*(double)seq
) << std::endl
;
311 TEST(BloomFilter
, Assignement
) {
312 bloom_filter
bf1(10, .1, 1), bf2
;
318 ASSERT_TRUE(bf2
.contains("foo"));
319 ASSERT_FALSE(bf2
.contains("bar"));
321 ASSERT_EQ(2U, bf1
.element_count());
322 ASSERT_EQ(1U, bf2
.element_count());