]> git.proxmox.com Git - ceph.git/blame - ceph/src/test/common/test_bloom_filter.cc
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / test / common / test_bloom_filter.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2013 Inktank <info@inktank.com>
7 *
9f95a23c 8 * LGPL-2.1 (see COPYING-LGPL2.1) or later
7c673cae
FG
9 */
10
11#include <iostream>
12#include <gtest/gtest.h>
13
14#include "include/stringify.h"
15#include "common/bloom_filter.hpp"
16
17TEST(BloomFilter, Basic) {
18 bloom_filter bf(10, .1, 1);
19 bf.insert("foo");
20 bf.insert("bar");
21
22 ASSERT_TRUE(bf.contains("foo"));
23 ASSERT_TRUE(bf.contains("bar"));
24
25 ASSERT_EQ(2U, bf.element_count());
26}
27
28TEST(BloomFilter, Empty) {
29 bloom_filter bf;
30 for (int i=0; i<100; ++i) {
eafe8130 31 ASSERT_FALSE(bf.contains((uint32_t) i));
7c673cae
FG
32 ASSERT_FALSE(bf.contains(stringify(i)));
33 }
34}
35
36TEST(BloomFilter, Sweep) {
37 std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
38 std::cout.precision(5);
39 std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl;
40 for (int ex = 3; ex < 12; ex += 2) {
41 for (float fpp = .001; fpp < .5; fpp *= 4.0) {
42 int max = 2 << ex;
43 bloom_filter bf(max, fpp, 1);
44 bf.insert("foo");
45 bf.insert("bar");
46
47 ASSERT_TRUE(bf.contains("foo"));
48 ASSERT_TRUE(bf.contains("bar"));
49
50 for (int n = 0; n < max; n++)
51 bf.insert("ok" + stringify(n));
52
53 int test = max * 100;
54 int hit = 0;
55 for (int n = 0; n < test; n++)
56 if (bf.contains("asdf" + stringify(n)))
57 hit++;
58
59 ASSERT_TRUE(bf.contains("foo"));
60 ASSERT_TRUE(bf.contains("bar"));
61
62 double actual = (double)hit / (double)test;
63
64 bufferlist bl;
11fdf7f2 65 encode(bf, bl);
7c673cae
FG
66
67 double byte_per_insert = (double)bl.length() / (double)max;
68
69 std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert << std::endl;
70 ASSERT_TRUE(actual < fpp * 10);
71
72 }
73 }
74}
75
76TEST(BloomFilter, SweepInt) {
eafe8130 77 unsigned int seed = 0;
7c673cae
FG
78 std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
79 std::cout.precision(5);
80 std::cout << "# max\tfpp\tactual\tsize\tB/insert\tdensity\tapprox_element_count" << std::endl;
81 for (int ex = 3; ex < 12; ex += 2) {
82 for (float fpp = .001; fpp < .5; fpp *= 4.0) {
83 int max = 2 << ex;
84 bloom_filter bf(max, fpp, 1);
85 bf.insert("foo");
86 bf.insert("bar");
87
88 ASSERT_TRUE(123);
89 ASSERT_TRUE(456);
90
eafe8130
TL
91 // In Ceph code, the uint32_t input routines to the bloom filter
92 // are used with hash values that are uniformly distributed over
93 // the uint32_t range. To model this behavior in the test, we
94 // pass in values generated by a pseudo-random generator.
95 // To make the test reproducible anyway, use a fixed seed here,
96 // but a different one in each instance.
97 srand(seed++);
98
7c673cae 99 for (int n = 0; n < max; n++)
eafe8130 100 bf.insert((uint32_t) rand());
7c673cae
FG
101
102 int test = max * 100;
103 int hit = 0;
104 for (int n = 0; n < test; n++)
eafe8130 105 if (bf.contains((uint32_t) rand()))
7c673cae
FG
106 hit++;
107
108 ASSERT_TRUE(123);
109 ASSERT_TRUE(456);
110
111 double actual = (double)hit / (double)test;
112
113 bufferlist bl;
11fdf7f2 114 encode(bf, bl);
7c673cae
FG
115
116 double byte_per_insert = (double)bl.length() / (double)max;
117
118 std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert
119 << "\t" << bf.density() << "\t" << bf.approx_unique_element_count() << std::endl;
eafe8130
TL
120 ASSERT_TRUE(actual < fpp * 3);
121 ASSERT_TRUE(actual > fpp / 3);
7c673cae
FG
122 ASSERT_TRUE(bf.density() > 0.40);
123 ASSERT_TRUE(bf.density() < 0.60);
124 }
125 }
126}
127
128
129TEST(BloomFilter, CompressibleSweep) {
eafe8130 130 unsigned int seed = 0;
7c673cae
FG
131 std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
132 std::cout.precision(5);
133 std::cout << "# max\tins\test ins\tafter\ttgtfpp\tactual\tsize\tb/elem\n";
134 float fpp = .01;
135 int max = 1024;
136 for (int div = 1; div < 10; div++) {
137 compressible_bloom_filter bf(max, fpp, 1);
eafe8130
TL
138
139 // See the comment in SweepInt.
140 srand(seed++);
141
142 std::vector<uint32_t> values;
7c673cae 143 int t = max/div;
eafe8130
TL
144 for (int n = 0; n < t; n++) {
145 uint32_t val = (uint32_t) rand();
146 bf.insert(val);
147 values.push_back(val);
148 }
7c673cae
FG
149
150 unsigned est = bf.approx_unique_element_count();
151 if (div > 1)
152 bf.compress(1.0 / div);
153
eafe8130
TL
154 for (auto val : values)
155 ASSERT_TRUE(bf.contains(val));
7c673cae
FG
156
157 int test = max * 100;
158 int hit = 0;
159 for (int n = 0; n < test; n++)
eafe8130 160 if (bf.contains((uint32_t) rand()))
7c673cae
FG
161 hit++;
162
163 double actual = (double)hit / (double)test;
164
165 bufferlist bl;
11fdf7f2 166 encode(bf, bl);
7c673cae
FG
167
168 double byte_per_insert = (double)bl.length() / (double)max;
169 unsigned est_after = bf.approx_unique_element_count();
170 std::cout << max
171 << "\t" << t
172 << "\t" << est
173 << "\t" << est_after
174 << "\t" << fpp
175 << "\t" << actual
176 << "\t" << bl.length() << "\t" << byte_per_insert
177 << std::endl;
178
179 ASSERT_TRUE(actual < fpp * 2.0);
180 ASSERT_TRUE(actual > fpp / 2.0);
181 ASSERT_TRUE(est_after < est * 2);
182 ASSERT_TRUE(est_after > est / 2);
183 }
184}
185
186
187
188TEST(BloomFilter, BinSweep) {
189 std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
190 std::cout.precision(5);
191 int total_max = 16384;
192 float total_fpp = .01;
193 std::cout << "total_inserts " << total_max << " target-fpp " << total_fpp << std::endl;
194 for (int bins = 1; bins < 16; ++bins) {
195 int max = total_max / bins;
196 float fpp = total_fpp / bins;//pow(total_fpp, bins);
197
11fdf7f2 198 std::vector<std::unique_ptr<bloom_filter>> ls;
7c673cae
FG
199 bufferlist bl;
200 for (int i=0; i<bins; i++) {
11fdf7f2 201 ls.push_back(std::make_unique<bloom_filter>(max, fpp, i));
7c673cae
FG
202 for (int j=0; j<max; j++) {
203 ls.back()->insert(10000 * (i+1) + j);
204 }
11fdf7f2 205 encode(*ls.front(), bl);
7c673cae
FG
206 }
207
208 int hit = 0;
209 int test = max * 100;
210 for (int i=0; i<test; ++i) {
11fdf7f2 211 for (std::vector<std::unique_ptr<bloom_filter>>::iterator j = ls.begin(); j != ls.end(); ++j) {
7c673cae
FG
212 if ((*j)->contains(i * 732)) { // note: sequential i does not work here; the intenral int hash is weak!!
213 hit++;
214 break;
215 }
216 }
217 }
218
219 double actual = (double)hit / (double)test;
220 std::cout << "bins " << bins << " bin-max " << max << " bin-fpp " << fpp
221 << " actual-fpp " << actual
222 << " total-size " << bl.length() << std::endl;
223 }
224}
225
226// disable these tests; doing dual insertions in consecutive filters
227// appears to be equivalent to doing a single insertion in a bloom
228// filter that is twice as big.
229#if 0
230
231// test the fpp over a sequence of bloom filters, each with unique
232// items inserted into it.
233//
234// we expect: actual_fpp = num_filters * per_filter_fpp
235TEST(BloomFilter, Sequence) {
236
237 int max = 1024;
238 double fpp = .01;
239 for (int seq = 2; seq <= 128; seq *= 2) {
240 std::vector<bloom_filter*> ls;
241 for (int i=0; i<seq; i++) {
242 ls.push_back(new bloom_filter(max*2, fpp, i));
243 for (int j=0; j<max; j++) {
244 ls.back()->insert("ok" + stringify(j) + "_" + stringify(i));
245 if (ls.size() > 1)
246 ls[ls.size() - 2]->insert("ok" + stringify(j) + "_" + stringify(i));
247 }
248 }
249
250 int hit = 0;
251 int test = max * 100;
252 for (int i=0; i<test; ++i) {
253 for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) {
254 if ((*j)->contains("bad" + stringify(i))) {
255 hit++;
256 break;
257 }
258 }
259 }
260
261 double actual = (double)hit / (double)test;
262 std::cout << "seq " << seq << " max " << max << " fpp " << fpp << " actual " << actual << std::endl;
263 }
264}
265
266// test the ffp over a sequence of bloom filters, where actual values
267// are always inserted into a consecutive pair of filters. in order
268// to have a false positive, we need to falsely match two consecutive
269// filters.
270//
271// we expect: actual_fpp = num_filters * per_filter_fpp^2
272TEST(BloomFilter, SequenceDouble) {
273 int max = 1024;
274 double fpp = .01;
275 for (int seq = 2; seq <= 128; seq *= 2) {
276 std::vector<bloom_filter*> ls;
277 for (int i=0; i<seq; i++) {
278 ls.push_back(new bloom_filter(max*2, fpp, i));
279 for (int j=0; j<max; j++) {
280 ls.back()->insert("ok" + stringify(j) + "_" + stringify(i));
281 if (ls.size() > 1)
282 ls[ls.size() - 2]->insert("ok" + stringify(j) + "_" + stringify(i));
283 }
284 }
285
286 int hit = 0;
287 int test = max * 100;
288 int run = 0;
289 for (int i=0; i<test; ++i) {
290 for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) {
291 if ((*j)->contains("bad" + stringify(i))) {
292 run++;
293 if (run >= 2) {
294 hit++;
295 break;
296 }
297 } else {
298 run = 0;
299 }
300 }
301 }
302
303 double actual = (double)hit / (double)test;
304 std::cout << "seq " << seq << " max " << max << " fpp " << fpp << " actual " << actual
305 << " expected " << (fpp*fpp*(double)seq) << std::endl;
306 }
307}
308
309#endif
310
311TEST(BloomFilter, Assignement) {
312 bloom_filter bf1(10, .1, 1), bf2;
313
314 bf1.insert("foo");
315 bf2 = bf1;
316 bf1.insert("bar");
317
318 ASSERT_TRUE(bf2.contains("foo"));
319 ASSERT_FALSE(bf2.contains("bar"));
320
321 ASSERT_EQ(2U, bf1.element_count());
322 ASSERT_EQ(1U, bf2.element_count());
323}