]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2013 Inktank <info@inktank.com> | |
7 | * | |
9f95a23c | 8 | * LGPL-2.1 (see COPYING-LGPL2.1) or later |
7c673cae FG |
9 | */ |
10 | ||
11 | #include <iostream> | |
12 | #include <gtest/gtest.h> | |
13 | ||
14 | #include "include/stringify.h" | |
15 | #include "common/bloom_filter.hpp" | |
16 | ||
17 | TEST(BloomFilter, Basic) { | |
18 | bloom_filter bf(10, .1, 1); | |
19 | bf.insert("foo"); | |
20 | bf.insert("bar"); | |
21 | ||
22 | ASSERT_TRUE(bf.contains("foo")); | |
23 | ASSERT_TRUE(bf.contains("bar")); | |
24 | ||
25 | ASSERT_EQ(2U, bf.element_count()); | |
26 | } | |
27 | ||
28 | TEST(BloomFilter, Empty) { | |
29 | bloom_filter bf; | |
30 | for (int i=0; i<100; ++i) { | |
eafe8130 | 31 | ASSERT_FALSE(bf.contains((uint32_t) i)); |
7c673cae FG |
32 | ASSERT_FALSE(bf.contains(stringify(i))); |
33 | } | |
34 | } | |
35 | ||
36 | TEST(BloomFilter, Sweep) { | |
37 | std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield); | |
38 | std::cout.precision(5); | |
39 | std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl; | |
40 | for (int ex = 3; ex < 12; ex += 2) { | |
41 | for (float fpp = .001; fpp < .5; fpp *= 4.0) { | |
42 | int max = 2 << ex; | |
43 | bloom_filter bf(max, fpp, 1); | |
44 | bf.insert("foo"); | |
45 | bf.insert("bar"); | |
46 | ||
47 | ASSERT_TRUE(bf.contains("foo")); | |
48 | ASSERT_TRUE(bf.contains("bar")); | |
49 | ||
50 | for (int n = 0; n < max; n++) | |
51 | bf.insert("ok" + stringify(n)); | |
52 | ||
53 | int test = max * 100; | |
54 | int hit = 0; | |
55 | for (int n = 0; n < test; n++) | |
56 | if (bf.contains("asdf" + stringify(n))) | |
57 | hit++; | |
58 | ||
59 | ASSERT_TRUE(bf.contains("foo")); | |
60 | ASSERT_TRUE(bf.contains("bar")); | |
61 | ||
62 | double actual = (double)hit / (double)test; | |
63 | ||
64 | bufferlist bl; | |
11fdf7f2 | 65 | encode(bf, bl); |
7c673cae FG |
66 | |
67 | double byte_per_insert = (double)bl.length() / (double)max; | |
68 | ||
69 | std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert << std::endl; | |
70 | ASSERT_TRUE(actual < fpp * 10); | |
71 | ||
72 | } | |
73 | } | |
74 | } | |
75 | ||
76 | TEST(BloomFilter, SweepInt) { | |
eafe8130 | 77 | unsigned int seed = 0; |
7c673cae FG |
78 | std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield); |
79 | std::cout.precision(5); | |
80 | std::cout << "# max\tfpp\tactual\tsize\tB/insert\tdensity\tapprox_element_count" << std::endl; | |
81 | for (int ex = 3; ex < 12; ex += 2) { | |
82 | for (float fpp = .001; fpp < .5; fpp *= 4.0) { | |
83 | int max = 2 << ex; | |
84 | bloom_filter bf(max, fpp, 1); | |
85 | bf.insert("foo"); | |
86 | bf.insert("bar"); | |
87 | ||
88 | ASSERT_TRUE(123); | |
89 | ASSERT_TRUE(456); | |
90 | ||
eafe8130 TL |
91 | // In Ceph code, the uint32_t input routines to the bloom filter |
92 | // are used with hash values that are uniformly distributed over | |
93 | // the uint32_t range. To model this behavior in the test, we | |
94 | // pass in values generated by a pseudo-random generator. | |
95 | // To make the test reproducible anyway, use a fixed seed here, | |
96 | // but a different one in each instance. | |
97 | srand(seed++); | |
98 | ||
7c673cae | 99 | for (int n = 0; n < max; n++) |
eafe8130 | 100 | bf.insert((uint32_t) rand()); |
7c673cae FG |
101 | |
102 | int test = max * 100; | |
103 | int hit = 0; | |
104 | for (int n = 0; n < test; n++) | |
eafe8130 | 105 | if (bf.contains((uint32_t) rand())) |
7c673cae FG |
106 | hit++; |
107 | ||
108 | ASSERT_TRUE(123); | |
109 | ASSERT_TRUE(456); | |
110 | ||
111 | double actual = (double)hit / (double)test; | |
112 | ||
113 | bufferlist bl; | |
11fdf7f2 | 114 | encode(bf, bl); |
7c673cae FG |
115 | |
116 | double byte_per_insert = (double)bl.length() / (double)max; | |
117 | ||
118 | std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert | |
119 | << "\t" << bf.density() << "\t" << bf.approx_unique_element_count() << std::endl; | |
eafe8130 TL |
120 | ASSERT_TRUE(actual < fpp * 3); |
121 | ASSERT_TRUE(actual > fpp / 3); | |
7c673cae FG |
122 | ASSERT_TRUE(bf.density() > 0.40); |
123 | ASSERT_TRUE(bf.density() < 0.60); | |
124 | } | |
125 | } | |
126 | } | |
127 | ||
128 | ||
129 | TEST(BloomFilter, CompressibleSweep) { | |
eafe8130 | 130 | unsigned int seed = 0; |
7c673cae FG |
131 | std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield); |
132 | std::cout.precision(5); | |
133 | std::cout << "# max\tins\test ins\tafter\ttgtfpp\tactual\tsize\tb/elem\n"; | |
134 | float fpp = .01; | |
135 | int max = 1024; | |
136 | for (int div = 1; div < 10; div++) { | |
137 | compressible_bloom_filter bf(max, fpp, 1); | |
eafe8130 TL |
138 | |
139 | // See the comment in SweepInt. | |
140 | srand(seed++); | |
141 | ||
142 | std::vector<uint32_t> values; | |
7c673cae | 143 | int t = max/div; |
eafe8130 TL |
144 | for (int n = 0; n < t; n++) { |
145 | uint32_t val = (uint32_t) rand(); | |
146 | bf.insert(val); | |
147 | values.push_back(val); | |
148 | } | |
7c673cae FG |
149 | |
150 | unsigned est = bf.approx_unique_element_count(); | |
151 | if (div > 1) | |
152 | bf.compress(1.0 / div); | |
153 | ||
eafe8130 TL |
154 | for (auto val : values) |
155 | ASSERT_TRUE(bf.contains(val)); | |
7c673cae FG |
156 | |
157 | int test = max * 100; | |
158 | int hit = 0; | |
159 | for (int n = 0; n < test; n++) | |
eafe8130 | 160 | if (bf.contains((uint32_t) rand())) |
7c673cae FG |
161 | hit++; |
162 | ||
163 | double actual = (double)hit / (double)test; | |
164 | ||
165 | bufferlist bl; | |
11fdf7f2 | 166 | encode(bf, bl); |
7c673cae FG |
167 | |
168 | double byte_per_insert = (double)bl.length() / (double)max; | |
169 | unsigned est_after = bf.approx_unique_element_count(); | |
170 | std::cout << max | |
171 | << "\t" << t | |
172 | << "\t" << est | |
173 | << "\t" << est_after | |
174 | << "\t" << fpp | |
175 | << "\t" << actual | |
176 | << "\t" << bl.length() << "\t" << byte_per_insert | |
177 | << std::endl; | |
178 | ||
179 | ASSERT_TRUE(actual < fpp * 2.0); | |
180 | ASSERT_TRUE(actual > fpp / 2.0); | |
181 | ASSERT_TRUE(est_after < est * 2); | |
182 | ASSERT_TRUE(est_after > est / 2); | |
183 | } | |
184 | } | |
185 | ||
186 | ||
187 | ||
188 | TEST(BloomFilter, BinSweep) { | |
189 | std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield); | |
190 | std::cout.precision(5); | |
191 | int total_max = 16384; | |
192 | float total_fpp = .01; | |
193 | std::cout << "total_inserts " << total_max << " target-fpp " << total_fpp << std::endl; | |
194 | for (int bins = 1; bins < 16; ++bins) { | |
195 | int max = total_max / bins; | |
196 | float fpp = total_fpp / bins;//pow(total_fpp, bins); | |
197 | ||
11fdf7f2 | 198 | std::vector<std::unique_ptr<bloom_filter>> ls; |
7c673cae FG |
199 | bufferlist bl; |
200 | for (int i=0; i<bins; i++) { | |
11fdf7f2 | 201 | ls.push_back(std::make_unique<bloom_filter>(max, fpp, i)); |
7c673cae FG |
202 | for (int j=0; j<max; j++) { |
203 | ls.back()->insert(10000 * (i+1) + j); | |
204 | } | |
11fdf7f2 | 205 | encode(*ls.front(), bl); |
7c673cae FG |
206 | } |
207 | ||
208 | int hit = 0; | |
209 | int test = max * 100; | |
210 | for (int i=0; i<test; ++i) { | |
11fdf7f2 | 211 | for (std::vector<std::unique_ptr<bloom_filter>>::iterator j = ls.begin(); j != ls.end(); ++j) { |
7c673cae FG |
212 | if ((*j)->contains(i * 732)) { // note: sequential i does not work here; the intenral int hash is weak!! |
213 | hit++; | |
214 | break; | |
215 | } | |
216 | } | |
217 | } | |
218 | ||
219 | double actual = (double)hit / (double)test; | |
220 | std::cout << "bins " << bins << " bin-max " << max << " bin-fpp " << fpp | |
221 | << " actual-fpp " << actual | |
222 | << " total-size " << bl.length() << std::endl; | |
223 | } | |
224 | } | |
225 | ||
226 | // disable these tests; doing dual insertions in consecutive filters | |
227 | // appears to be equivalent to doing a single insertion in a bloom | |
228 | // filter that is twice as big. | |
229 | #if 0 | |
230 | ||
231 | // test the fpp over a sequence of bloom filters, each with unique | |
232 | // items inserted into it. | |
233 | // | |
234 | // we expect: actual_fpp = num_filters * per_filter_fpp | |
235 | TEST(BloomFilter, Sequence) { | |
236 | ||
237 | int max = 1024; | |
238 | double fpp = .01; | |
239 | for (int seq = 2; seq <= 128; seq *= 2) { | |
240 | std::vector<bloom_filter*> ls; | |
241 | for (int i=0; i<seq; i++) { | |
242 | ls.push_back(new bloom_filter(max*2, fpp, i)); | |
243 | for (int j=0; j<max; j++) { | |
244 | ls.back()->insert("ok" + stringify(j) + "_" + stringify(i)); | |
245 | if (ls.size() > 1) | |
246 | ls[ls.size() - 2]->insert("ok" + stringify(j) + "_" + stringify(i)); | |
247 | } | |
248 | } | |
249 | ||
250 | int hit = 0; | |
251 | int test = max * 100; | |
252 | for (int i=0; i<test; ++i) { | |
253 | for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) { | |
254 | if ((*j)->contains("bad" + stringify(i))) { | |
255 | hit++; | |
256 | break; | |
257 | } | |
258 | } | |
259 | } | |
260 | ||
261 | double actual = (double)hit / (double)test; | |
262 | std::cout << "seq " << seq << " max " << max << " fpp " << fpp << " actual " << actual << std::endl; | |
263 | } | |
264 | } | |
265 | ||
266 | // test the ffp over a sequence of bloom filters, where actual values | |
267 | // are always inserted into a consecutive pair of filters. in order | |
268 | // to have a false positive, we need to falsely match two consecutive | |
269 | // filters. | |
270 | // | |
271 | // we expect: actual_fpp = num_filters * per_filter_fpp^2 | |
272 | TEST(BloomFilter, SequenceDouble) { | |
273 | int max = 1024; | |
274 | double fpp = .01; | |
275 | for (int seq = 2; seq <= 128; seq *= 2) { | |
276 | std::vector<bloom_filter*> ls; | |
277 | for (int i=0; i<seq; i++) { | |
278 | ls.push_back(new bloom_filter(max*2, fpp, i)); | |
279 | for (int j=0; j<max; j++) { | |
280 | ls.back()->insert("ok" + stringify(j) + "_" + stringify(i)); | |
281 | if (ls.size() > 1) | |
282 | ls[ls.size() - 2]->insert("ok" + stringify(j) + "_" + stringify(i)); | |
283 | } | |
284 | } | |
285 | ||
286 | int hit = 0; | |
287 | int test = max * 100; | |
288 | int run = 0; | |
289 | for (int i=0; i<test; ++i) { | |
290 | for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) { | |
291 | if ((*j)->contains("bad" + stringify(i))) { | |
292 | run++; | |
293 | if (run >= 2) { | |
294 | hit++; | |
295 | break; | |
296 | } | |
297 | } else { | |
298 | run = 0; | |
299 | } | |
300 | } | |
301 | } | |
302 | ||
303 | double actual = (double)hit / (double)test; | |
304 | std::cout << "seq " << seq << " max " << max << " fpp " << fpp << " actual " << actual | |
305 | << " expected " << (fpp*fpp*(double)seq) << std::endl; | |
306 | } | |
307 | } | |
308 | ||
309 | #endif | |
310 | ||
311 | TEST(BloomFilter, Assignement) { | |
312 | bloom_filter bf1(10, .1, 1), bf2; | |
313 | ||
314 | bf1.insert("foo"); | |
315 | bf2 = bf1; | |
316 | bf1.insert("bar"); | |
317 | ||
318 | ASSERT_TRUE(bf2.contains("foo")); | |
319 | ASSERT_FALSE(bf2.contains("bar")); | |
320 | ||
321 | ASSERT_EQ(2U, bf1.element_count()); | |
322 | ASSERT_EQ(1U, bf2.element_count()); | |
323 | } |