]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/table/block_test.cc
update sources to ceph Nautilus 14.2.1
[ceph.git] / ceph / src / rocksdb / table / block_test.cc
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 #include <stdio.h>
7 #include <algorithm>
8 #include <set>
9 #include <string>
10 #include <unordered_set>
11 #include <utility>
12 #include <vector>
13
14 #include "db/dbformat.h"
15 #include "db/write_batch_internal.h"
16 #include "db/memtable.h"
17 #include "rocksdb/db.h"
18 #include "rocksdb/env.h"
19 #include "rocksdb/iterator.h"
20 #include "rocksdb/table.h"
21 #include "rocksdb/slice_transform.h"
22 #include "table/block.h"
23 #include "table/block_builder.h"
24 #include "table/format.h"
25 #include "util/random.h"
26 #include "util/testharness.h"
27 #include "util/testutil.h"
28
29 namespace rocksdb {
30
31 static std::string RandomString(Random* rnd, int len) {
32 std::string r;
33 test::RandomString(rnd, len, &r);
34 return r;
35 }
36 std::string GenerateKey(int primary_key, int secondary_key, int padding_size,
37 Random *rnd) {
38 char buf[50];
39 char *p = &buf[0];
40 snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key);
41 std::string k(p);
42 if (padding_size) {
43 k += RandomString(rnd, padding_size);
44 }
45
46 return k;
47 }
48
49 // Generate random key value pairs.
50 // The generated key will be sorted. You can tune the parameters to generated
51 // different kinds of test key/value pairs for different scenario.
52 void GenerateRandomKVs(std::vector<std::string> *keys,
53 std::vector<std::string> *values, const int from,
54 const int len, const int step = 1,
55 const int padding_size = 0,
56 const int keys_share_prefix = 1) {
57 Random rnd(302);
58
59 // generate different prefix
60 for (int i = from; i < from + len; i += step) {
61 // generating keys that shares the prefix
62 for (int j = 0; j < keys_share_prefix; ++j) {
63 keys->emplace_back(GenerateKey(i, j, padding_size, &rnd));
64
65 // 100 bytes values
66 values->emplace_back(RandomString(&rnd, 100));
67 }
68 }
69 }
70
71 // Same as GenerateRandomKVs but the values are BlockHandle
72 void GenerateRandomKBHs(std::vector<std::string> *keys,
73 std::vector<BlockHandle> *values, const int from,
74 const int len, const int step = 1,
75 const int padding_size = 0,
76 const int keys_share_prefix = 1) {
77 Random rnd(302);
78 uint64_t offset = 0;
79
80 // generate different prefix
81 for (int i = from; i < from + len; i += step) {
82 // generate keys that shares the prefix
83 for (int j = 0; j < keys_share_prefix; ++j) {
84 keys->emplace_back(GenerateKey(i, j, padding_size, &rnd));
85
86 uint64_t size = rnd.Uniform(1024 * 16);
87 BlockHandle handle(offset, size);
88 offset += size + kBlockTrailerSize;
89 values->emplace_back(handle);
90 }
91 }
92 }
93
94 class BlockTest : public testing::Test {};
95
96 // block test
97 TEST_F(BlockTest, SimpleTest) {
98 Random rnd(301);
99 Options options = Options();
100 std::unique_ptr<InternalKeyComparator> ic;
101 ic.reset(new test::PlainInternalKeyComparator(options.comparator));
102
103 std::vector<std::string> keys;
104 std::vector<std::string> values;
105 BlockBuilder builder(16);
106 int num_records = 100000;
107
108 GenerateRandomKVs(&keys, &values, 0, num_records);
109 // add a bunch of records to a block
110 for (int i = 0; i < num_records; i++) {
111 builder.Add(keys[i], values[i]);
112 }
113
114 // read serialized contents of the block
115 Slice rawblock = builder.Finish();
116
117 // create block reader
118 BlockContents contents;
119 contents.data = rawblock;
120 contents.cachable = false;
121 Block reader(std::move(contents), kDisableGlobalSequenceNumber);
122
123 // read contents of block sequentially
124 int count = 0;
125 InternalIterator *iter =
126 reader.NewIterator<DataBlockIter>(options.comparator, options.comparator);
127 for (iter->SeekToFirst();iter->Valid(); count++, iter->Next()) {
128
129 // read kv from block
130 Slice k = iter->key();
131 Slice v = iter->value();
132
133 // compare with lookaside array
134 ASSERT_EQ(k.ToString().compare(keys[count]), 0);
135 ASSERT_EQ(v.ToString().compare(values[count]), 0);
136 }
137 delete iter;
138
139 // read block contents randomly
140 iter =
141 reader.NewIterator<DataBlockIter>(options.comparator, options.comparator);
142 for (int i = 0; i < num_records; i++) {
143
144 // find a random key in the lookaside array
145 int index = rnd.Uniform(num_records);
146 Slice k(keys[index]);
147
148 // search in block for this key
149 iter->Seek(k);
150 ASSERT_TRUE(iter->Valid());
151 Slice v = iter->value();
152 ASSERT_EQ(v.ToString().compare(values[index]), 0);
153 }
154 delete iter;
155 }
156
157 TEST_F(BlockTest, ValueDeltaEncodingTest) {
158 Random rnd(301);
159 Options options = Options();
160 std::unique_ptr<InternalKeyComparator> ic;
161 ic.reset(new test::PlainInternalKeyComparator(options.comparator));
162
163 std::vector<std::string> keys;
164 std::vector<BlockHandle> values;
165 const bool kUseDeltaEncoding = true;
166 const bool kUseValueDeltaEncoding = true;
167 BlockBuilder builder(16, kUseDeltaEncoding, kUseValueDeltaEncoding);
168 int num_records = 100;
169
170 GenerateRandomKBHs(&keys, &values, 0, num_records);
171 // add a bunch of records to a block
172 BlockHandle last_encoded_handle;
173 for (int i = 0; i < num_records; i++) {
174 auto block_handle = values[i];
175 std::string handle_encoding;
176 block_handle.EncodeTo(&handle_encoding);
177 std::string handle_delta_encoding;
178 PutVarsignedint64(&handle_delta_encoding,
179 block_handle.size() - last_encoded_handle.size());
180 last_encoded_handle = block_handle;
181 const Slice handle_delta_encoding_slice(handle_delta_encoding);
182 builder.Add(keys[i], handle_encoding, &handle_delta_encoding_slice);
183 }
184
185 // read serialized contents of the block
186 Slice rawblock = builder.Finish();
187
188 // create block reader
189 BlockContents contents;
190 contents.data = rawblock;
191 contents.cachable = false;
192 Block reader(std::move(contents), kDisableGlobalSequenceNumber);
193
194 const bool kTotalOrderSeek = true;
195 const bool kIncludesSeq = true;
196 const bool kValueIsFull = !kUseValueDeltaEncoding;
197 IndexBlockIter *kNullIter = nullptr;
198 Statistics *kNullStats = nullptr;
199 // read contents of block sequentially
200 int count = 0;
201 InternalIteratorBase<BlockHandle> *iter = reader.NewIterator<IndexBlockIter>(
202 options.comparator, options.comparator, kNullIter, kNullStats,
203 kTotalOrderSeek, kIncludesSeq, kValueIsFull);
204 for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) {
205 // read kv from block
206 Slice k = iter->key();
207 BlockHandle handle = iter->value();
208
209 // compare with lookaside array
210 ASSERT_EQ(k.ToString().compare(keys[count]), 0);
211
212 ASSERT_EQ(values[count].offset(), handle.offset());
213 ASSERT_EQ(values[count].size(), handle.size());
214 }
215 delete iter;
216
217 // read block contents randomly
218 iter = reader.NewIterator<IndexBlockIter>(
219 options.comparator, options.comparator, kNullIter, kNullStats,
220 kTotalOrderSeek, kIncludesSeq, kValueIsFull);
221 for (int i = 0; i < num_records; i++) {
222 // find a random key in the lookaside array
223 int index = rnd.Uniform(num_records);
224 Slice k(keys[index]);
225
226 // search in block for this key
227 iter->Seek(k);
228 ASSERT_TRUE(iter->Valid());
229 BlockHandle handle = iter->value();
230 ASSERT_EQ(values[index].offset(), handle.offset());
231 ASSERT_EQ(values[index].size(), handle.size());
232 }
233 delete iter;
234 }
235 // return the block contents
236 BlockContents GetBlockContents(std::unique_ptr<BlockBuilder> *builder,
237 const std::vector<std::string> &keys,
238 const std::vector<std::string> &values,
239 const int /*prefix_group_size*/ = 1) {
240 builder->reset(new BlockBuilder(1 /* restart interval */));
241
242 // Add only half of the keys
243 for (size_t i = 0; i < keys.size(); ++i) {
244 (*builder)->Add(keys[i], values[i]);
245 }
246 Slice rawblock = (*builder)->Finish();
247
248 BlockContents contents;
249 contents.data = rawblock;
250 contents.cachable = false;
251
252 return contents;
253 }
254
255 void CheckBlockContents(BlockContents contents, const int max_key,
256 const std::vector<std::string> &keys,
257 const std::vector<std::string> &values) {
258 const size_t prefix_size = 6;
259 // create block reader
260 BlockContents contents_ref(contents.data, contents.cachable,
261 contents.compression_type);
262 Block reader1(std::move(contents), kDisableGlobalSequenceNumber);
263 Block reader2(std::move(contents_ref), kDisableGlobalSequenceNumber);
264
265 std::unique_ptr<const SliceTransform> prefix_extractor(
266 NewFixedPrefixTransform(prefix_size));
267
268 std::unique_ptr<InternalIterator> regular_iter(
269 reader2.NewIterator<DataBlockIter>(BytewiseComparator(),
270 BytewiseComparator()));
271
272 // Seek existent keys
273 for (size_t i = 0; i < keys.size(); i++) {
274 regular_iter->Seek(keys[i]);
275 ASSERT_OK(regular_iter->status());
276 ASSERT_TRUE(regular_iter->Valid());
277
278 Slice v = regular_iter->value();
279 ASSERT_EQ(v.ToString().compare(values[i]), 0);
280 }
281
282 // Seek non-existent keys.
283 // For hash index, if no key with a given prefix is not found, iterator will
284 // simply be set as invalid; whereas the binary search based iterator will
285 // return the one that is closest.
286 for (int i = 1; i < max_key - 1; i += 2) {
287 auto key = GenerateKey(i, 0, 0, nullptr);
288 regular_iter->Seek(key);
289 ASSERT_TRUE(regular_iter->Valid());
290 }
291 }
292
293 // In this test case, no two key share same prefix.
294 TEST_F(BlockTest, SimpleIndexHash) {
295 const int kMaxKey = 100000;
296 std::vector<std::string> keys;
297 std::vector<std::string> values;
298 GenerateRandomKVs(&keys, &values, 0 /* first key id */,
299 kMaxKey /* last key id */, 2 /* step */,
300 8 /* padding size (8 bytes randomly generated suffix) */);
301
302 std::unique_ptr<BlockBuilder> builder;
303 auto contents = GetBlockContents(&builder, keys, values);
304
305 CheckBlockContents(std::move(contents), kMaxKey, keys, values);
306 }
307
308 TEST_F(BlockTest, IndexHashWithSharedPrefix) {
309 const int kMaxKey = 100000;
310 // for each prefix, there will be 5 keys starts with it.
311 const int kPrefixGroup = 5;
312 std::vector<std::string> keys;
313 std::vector<std::string> values;
314 // Generate keys with same prefix.
315 GenerateRandomKVs(&keys, &values, 0, // first key id
316 kMaxKey, // last key id
317 2, // step
318 10, // padding size,
319 kPrefixGroup);
320
321 std::unique_ptr<BlockBuilder> builder;
322 auto contents = GetBlockContents(&builder, keys, values, kPrefixGroup);
323
324 CheckBlockContents(std::move(contents), kMaxKey, keys, values);
325 }
326
327 // A slow and accurate version of BlockReadAmpBitmap that simply store
328 // all the marked ranges in a set.
329 class BlockReadAmpBitmapSlowAndAccurate {
330 public:
331 void Mark(size_t start_offset, size_t end_offset) {
332 assert(end_offset >= start_offset);
333 marked_ranges_.emplace(end_offset, start_offset);
334 }
335
336 void ResetCheckSequence() { iter_valid_ = false; }
337
338 // Return true if any byte in this range was Marked
339 // This does linear search from the previous position. When calling
340 // multiple times, `offset` needs to be incremental to get correct results.
341 // Call ResetCheckSequence() to reset it.
342 bool IsPinMarked(size_t offset) {
343 if (iter_valid_) {
344 // Has existing iterator, try linear search from
345 // the iterator.
346 for (int i = 0; i < 64; i++) {
347 if (offset < iter_->second) {
348 return false;
349 }
350 if (offset <= iter_->first) {
351 return true;
352 }
353
354 iter_++;
355 if (iter_ == marked_ranges_.end()) {
356 iter_valid_ = false;
357 return false;
358 }
359 }
360 }
361 // Initial call or have linear searched too many times.
362 // Do binary search.
363 iter_ = marked_ranges_.lower_bound(
364 std::make_pair(offset, static_cast<size_t>(0)));
365 if (iter_ == marked_ranges_.end()) {
366 iter_valid_ = false;
367 return false;
368 }
369 iter_valid_ = true;
370 return offset <= iter_->first && offset >= iter_->second;
371 }
372
373 private:
374 std::set<std::pair<size_t, size_t>> marked_ranges_;
375 std::set<std::pair<size_t, size_t>>::iterator iter_;
376 bool iter_valid_ = false;
377 };
378
379 TEST_F(BlockTest, BlockReadAmpBitmap) {
380 uint32_t pin_offset = 0;
381 SyncPoint::GetInstance()->SetCallBack(
382 "BlockReadAmpBitmap:rnd", [&pin_offset](void* arg) {
383 pin_offset = *(static_cast<uint32_t*>(arg));
384 });
385 SyncPoint::GetInstance()->EnableProcessing();
386 std::vector<size_t> block_sizes = {
387 1, // 1 byte
388 32, // 32 bytes
389 61, // 61 bytes
390 64, // 64 bytes
391 512, // 0.5 KB
392 1024, // 1 KB
393 1024 * 4, // 4 KB
394 1024 * 10, // 10 KB
395 1024 * 50, // 50 KB
396 1024 * 1024 * 4, // 5 MB
397 777,
398 124653,
399 };
400 const size_t kBytesPerBit = 64;
401
402 Random rnd(301);
403 for (size_t block_size : block_sizes) {
404 std::shared_ptr<Statistics> stats = rocksdb::CreateDBStatistics();
405 BlockReadAmpBitmap read_amp_bitmap(block_size, kBytesPerBit, stats.get());
406 BlockReadAmpBitmapSlowAndAccurate read_amp_slow_and_accurate;
407
408 size_t needed_bits = (block_size / kBytesPerBit);
409 if (block_size % kBytesPerBit != 0) {
410 needed_bits++;
411 }
412
413 ASSERT_EQ(stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES), block_size);
414
415 // Generate some random entries
416 std::vector<size_t> random_entry_offsets;
417 for (int i = 0; i < 1000; i++) {
418 random_entry_offsets.push_back(rnd.Next() % block_size);
419 }
420 std::sort(random_entry_offsets.begin(), random_entry_offsets.end());
421 auto it =
422 std::unique(random_entry_offsets.begin(), random_entry_offsets.end());
423 random_entry_offsets.resize(
424 std::distance(random_entry_offsets.begin(), it));
425
426 std::vector<std::pair<size_t, size_t>> random_entries;
427 for (size_t i = 0; i < random_entry_offsets.size(); i++) {
428 size_t entry_start = random_entry_offsets[i];
429 size_t entry_end;
430 if (i + 1 < random_entry_offsets.size()) {
431 entry_end = random_entry_offsets[i + 1] - 1;
432 } else {
433 entry_end = block_size - 1;
434 }
435 random_entries.emplace_back(entry_start, entry_end);
436 }
437
438 for (size_t i = 0; i < random_entries.size(); i++) {
439 read_amp_slow_and_accurate.ResetCheckSequence();
440 auto &current_entry = random_entries[rnd.Next() % random_entries.size()];
441
442 read_amp_bitmap.Mark(static_cast<uint32_t>(current_entry.first),
443 static_cast<uint32_t>(current_entry.second));
444 read_amp_slow_and_accurate.Mark(current_entry.first,
445 current_entry.second);
446
447 size_t total_bits = 0;
448 for (size_t bit_idx = 0; bit_idx < needed_bits; bit_idx++) {
449 total_bits += read_amp_slow_and_accurate.IsPinMarked(
450 bit_idx * kBytesPerBit + pin_offset);
451 }
452 size_t expected_estimate_useful = total_bits * kBytesPerBit;
453 size_t got_estimate_useful =
454 stats->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
455 ASSERT_EQ(expected_estimate_useful, got_estimate_useful);
456 }
457 }
458 SyncPoint::GetInstance()->DisableProcessing();
459 SyncPoint::GetInstance()->ClearAllCallBacks();
460 }
461
462 TEST_F(BlockTest, BlockWithReadAmpBitmap) {
463 Random rnd(301);
464 Options options = Options();
465 std::unique_ptr<InternalKeyComparator> ic;
466 ic.reset(new test::PlainInternalKeyComparator(options.comparator));
467
468 std::vector<std::string> keys;
469 std::vector<std::string> values;
470 BlockBuilder builder(16);
471 int num_records = 10000;
472
473 GenerateRandomKVs(&keys, &values, 0, num_records, 1);
474 // add a bunch of records to a block
475 for (int i = 0; i < num_records; i++) {
476 builder.Add(keys[i], values[i]);
477 }
478
479 Slice rawblock = builder.Finish();
480 const size_t kBytesPerBit = 8;
481
482 // Read the block sequentially using Next()
483 {
484 std::shared_ptr<Statistics> stats = rocksdb::CreateDBStatistics();
485
486 // create block reader
487 BlockContents contents;
488 contents.data = rawblock;
489 contents.cachable = true;
490 Block reader(std::move(contents), kDisableGlobalSequenceNumber,
491 kBytesPerBit, stats.get());
492
493 // read contents of block sequentially
494 size_t read_bytes = 0;
495 DataBlockIter *iter =
496 static_cast<DataBlockIter *>(reader.NewIterator<DataBlockIter>(
497 options.comparator, options.comparator, nullptr, stats.get()));
498 for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
499 iter->value();
500 read_bytes += iter->TEST_CurrentEntrySize();
501
502 double semi_acc_read_amp =
503 static_cast<double>(read_bytes) / rawblock.size();
504 double read_amp = static_cast<double>(stats->getTickerCount(
505 READ_AMP_ESTIMATE_USEFUL_BYTES)) /
506 stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
507
508 // Error in read amplification will be less than 1% if we are reading
509 // sequentially
510 double error_pct = fabs(semi_acc_read_amp - read_amp) * 100;
511 EXPECT_LT(error_pct, 1);
512 }
513
514 delete iter;
515 }
516
517 // Read the block sequentially using Seek()
518 {
519 std::shared_ptr<Statistics> stats = rocksdb::CreateDBStatistics();
520
521 // create block reader
522 BlockContents contents;
523 contents.data = rawblock;
524 contents.cachable = true;
525 Block reader(std::move(contents), kDisableGlobalSequenceNumber,
526 kBytesPerBit, stats.get());
527
528 size_t read_bytes = 0;
529 DataBlockIter *iter =
530 static_cast<DataBlockIter *>(reader.NewIterator<DataBlockIter>(
531 options.comparator, options.comparator, nullptr, stats.get()));
532 for (int i = 0; i < num_records; i++) {
533 Slice k(keys[i]);
534
535 // search in block for this key
536 iter->Seek(k);
537 iter->value();
538 read_bytes += iter->TEST_CurrentEntrySize();
539
540 double semi_acc_read_amp =
541 static_cast<double>(read_bytes) / rawblock.size();
542 double read_amp = static_cast<double>(stats->getTickerCount(
543 READ_AMP_ESTIMATE_USEFUL_BYTES)) /
544 stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
545
546 // Error in read amplification will be less than 1% if we are reading
547 // sequentially
548 double error_pct = fabs(semi_acc_read_amp - read_amp) * 100;
549 EXPECT_LT(error_pct, 1);
550 }
551 delete iter;
552 }
553
554 // Read the block randomly
555 {
556 std::shared_ptr<Statistics> stats = rocksdb::CreateDBStatistics();
557
558 // create block reader
559 BlockContents contents;
560 contents.data = rawblock;
561 contents.cachable = true;
562 Block reader(std::move(contents), kDisableGlobalSequenceNumber,
563 kBytesPerBit, stats.get());
564
565 size_t read_bytes = 0;
566 DataBlockIter *iter =
567 static_cast<DataBlockIter *>(reader.NewIterator<DataBlockIter>(
568 options.comparator, options.comparator, nullptr, stats.get()));
569 std::unordered_set<int> read_keys;
570 for (int i = 0; i < num_records; i++) {
571 int index = rnd.Uniform(num_records);
572 Slice k(keys[index]);
573
574 iter->Seek(k);
575 iter->value();
576 if (read_keys.find(index) == read_keys.end()) {
577 read_keys.insert(index);
578 read_bytes += iter->TEST_CurrentEntrySize();
579 }
580
581 double semi_acc_read_amp =
582 static_cast<double>(read_bytes) / rawblock.size();
583 double read_amp = static_cast<double>(stats->getTickerCount(
584 READ_AMP_ESTIMATE_USEFUL_BYTES)) /
585 stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
586
587 double error_pct = fabs(semi_acc_read_amp - read_amp) * 100;
588 // Error in read amplification will be less than 2% if we are reading
589 // randomly
590 EXPECT_LT(error_pct, 2);
591 }
592 delete iter;
593 }
594 }
595
596 TEST_F(BlockTest, ReadAmpBitmapPow2) {
597 std::shared_ptr<Statistics> stats = rocksdb::CreateDBStatistics();
598 ASSERT_EQ(BlockReadAmpBitmap(100, 1, stats.get()).GetBytesPerBit(), 1);
599 ASSERT_EQ(BlockReadAmpBitmap(100, 2, stats.get()).GetBytesPerBit(), 2);
600 ASSERT_EQ(BlockReadAmpBitmap(100, 4, stats.get()).GetBytesPerBit(), 4);
601 ASSERT_EQ(BlockReadAmpBitmap(100, 8, stats.get()).GetBytesPerBit(), 8);
602 ASSERT_EQ(BlockReadAmpBitmap(100, 16, stats.get()).GetBytesPerBit(), 16);
603 ASSERT_EQ(BlockReadAmpBitmap(100, 32, stats.get()).GetBytesPerBit(), 32);
604
605 ASSERT_EQ(BlockReadAmpBitmap(100, 3, stats.get()).GetBytesPerBit(), 2);
606 ASSERT_EQ(BlockReadAmpBitmap(100, 7, stats.get()).GetBytesPerBit(), 4);
607 ASSERT_EQ(BlockReadAmpBitmap(100, 11, stats.get()).GetBytesPerBit(), 8);
608 ASSERT_EQ(BlockReadAmpBitmap(100, 17, stats.get()).GetBytesPerBit(), 16);
609 ASSERT_EQ(BlockReadAmpBitmap(100, 33, stats.get()).GetBytesPerBit(), 32);
610 ASSERT_EQ(BlockReadAmpBitmap(100, 35, stats.get()).GetBytesPerBit(), 32);
611 }
612
613 } // namespace rocksdb
614
615 int main(int argc, char **argv) {
616 ::testing::InitGoogleTest(&argc, argv);
617 return RUN_ALL_TESTS();
618 }