1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
10 #include <unordered_set>
14 #include "db/dbformat.h"
15 #include "db/write_batch_internal.h"
16 #include "db/memtable.h"
17 #include "rocksdb/db.h"
18 #include "rocksdb/env.h"
19 #include "rocksdb/iterator.h"
20 #include "rocksdb/table.h"
21 #include "rocksdb/slice_transform.h"
22 #include "table/block.h"
23 #include "table/block_builder.h"
24 #include "table/format.h"
25 #include "util/random.h"
26 #include "util/testharness.h"
27 #include "util/testutil.h"
31 static std::string
RandomString(Random
* rnd
, int len
) {
33 test::RandomString(rnd
, len
, &r
);
36 std::string
GenerateKey(int primary_key
, int secondary_key
, int padding_size
,
40 snprintf(buf
, sizeof(buf
), "%6d%4d", primary_key
, secondary_key
);
43 k
+= RandomString(rnd
, padding_size
);
49 // Generate random key value pairs.
50 // The generated key will be sorted. You can tune the parameters to generated
51 // different kinds of test key/value pairs for different scenario.
52 void GenerateRandomKVs(std::vector
<std::string
> *keys
,
53 std::vector
<std::string
> *values
, const int from
,
54 const int len
, const int step
= 1,
55 const int padding_size
= 0,
56 const int keys_share_prefix
= 1) {
59 // generate different prefix
60 for (int i
= from
; i
< from
+ len
; i
+= step
) {
61 // generating keys that shares the prefix
62 for (int j
= 0; j
< keys_share_prefix
; ++j
) {
63 keys
->emplace_back(GenerateKey(i
, j
, padding_size
, &rnd
));
66 values
->emplace_back(RandomString(&rnd
, 100));
71 // Same as GenerateRandomKVs but the values are BlockHandle
72 void GenerateRandomKBHs(std::vector
<std::string
> *keys
,
73 std::vector
<BlockHandle
> *values
, const int from
,
74 const int len
, const int step
= 1,
75 const int padding_size
= 0,
76 const int keys_share_prefix
= 1) {
80 // generate different prefix
81 for (int i
= from
; i
< from
+ len
; i
+= step
) {
82 // generate keys that shares the prefix
83 for (int j
= 0; j
< keys_share_prefix
; ++j
) {
84 keys
->emplace_back(GenerateKey(i
, j
, padding_size
, &rnd
));
86 uint64_t size
= rnd
.Uniform(1024 * 16);
87 BlockHandle
handle(offset
, size
);
88 offset
+= size
+ kBlockTrailerSize
;
89 values
->emplace_back(handle
);
94 class BlockTest
: public testing::Test
{};
97 TEST_F(BlockTest
, SimpleTest
) {
99 Options options
= Options();
100 std::unique_ptr
<InternalKeyComparator
> ic
;
101 ic
.reset(new test::PlainInternalKeyComparator(options
.comparator
));
103 std::vector
<std::string
> keys
;
104 std::vector
<std::string
> values
;
105 BlockBuilder
builder(16);
106 int num_records
= 100000;
108 GenerateRandomKVs(&keys
, &values
, 0, num_records
);
109 // add a bunch of records to a block
110 for (int i
= 0; i
< num_records
; i
++) {
111 builder
.Add(keys
[i
], values
[i
]);
114 // read serialized contents of the block
115 Slice rawblock
= builder
.Finish();
117 // create block reader
118 BlockContents contents
;
119 contents
.data
= rawblock
;
120 contents
.cachable
= false;
121 Block
reader(std::move(contents
), kDisableGlobalSequenceNumber
);
123 // read contents of block sequentially
125 InternalIterator
*iter
=
126 reader
.NewIterator
<DataBlockIter
>(options
.comparator
, options
.comparator
);
127 for (iter
->SeekToFirst();iter
->Valid(); count
++, iter
->Next()) {
129 // read kv from block
130 Slice k
= iter
->key();
131 Slice v
= iter
->value();
133 // compare with lookaside array
134 ASSERT_EQ(k
.ToString().compare(keys
[count
]), 0);
135 ASSERT_EQ(v
.ToString().compare(values
[count
]), 0);
139 // read block contents randomly
141 reader
.NewIterator
<DataBlockIter
>(options
.comparator
, options
.comparator
);
142 for (int i
= 0; i
< num_records
; i
++) {
144 // find a random key in the lookaside array
145 int index
= rnd
.Uniform(num_records
);
146 Slice
k(keys
[index
]);
148 // search in block for this key
150 ASSERT_TRUE(iter
->Valid());
151 Slice v
= iter
->value();
152 ASSERT_EQ(v
.ToString().compare(values
[index
]), 0);
157 TEST_F(BlockTest
, ValueDeltaEncodingTest
) {
159 Options options
= Options();
160 std::unique_ptr
<InternalKeyComparator
> ic
;
161 ic
.reset(new test::PlainInternalKeyComparator(options
.comparator
));
163 std::vector
<std::string
> keys
;
164 std::vector
<BlockHandle
> values
;
165 const bool kUseDeltaEncoding
= true;
166 const bool kUseValueDeltaEncoding
= true;
167 BlockBuilder
builder(16, kUseDeltaEncoding
, kUseValueDeltaEncoding
);
168 int num_records
= 100;
170 GenerateRandomKBHs(&keys
, &values
, 0, num_records
);
171 // add a bunch of records to a block
172 BlockHandle last_encoded_handle
;
173 for (int i
= 0; i
< num_records
; i
++) {
174 auto block_handle
= values
[i
];
175 std::string handle_encoding
;
176 block_handle
.EncodeTo(&handle_encoding
);
177 std::string handle_delta_encoding
;
178 PutVarsignedint64(&handle_delta_encoding
,
179 block_handle
.size() - last_encoded_handle
.size());
180 last_encoded_handle
= block_handle
;
181 const Slice
handle_delta_encoding_slice(handle_delta_encoding
);
182 builder
.Add(keys
[i
], handle_encoding
, &handle_delta_encoding_slice
);
185 // read serialized contents of the block
186 Slice rawblock
= builder
.Finish();
188 // create block reader
189 BlockContents contents
;
190 contents
.data
= rawblock
;
191 contents
.cachable
= false;
192 Block
reader(std::move(contents
), kDisableGlobalSequenceNumber
);
194 const bool kTotalOrderSeek
= true;
195 const bool kIncludesSeq
= true;
196 const bool kValueIsFull
= !kUseValueDeltaEncoding
;
197 IndexBlockIter
*kNullIter
= nullptr;
198 Statistics
*kNullStats
= nullptr;
199 // read contents of block sequentially
201 InternalIteratorBase
<BlockHandle
> *iter
= reader
.NewIterator
<IndexBlockIter
>(
202 options
.comparator
, options
.comparator
, kNullIter
, kNullStats
,
203 kTotalOrderSeek
, kIncludesSeq
, kValueIsFull
);
204 for (iter
->SeekToFirst(); iter
->Valid(); count
++, iter
->Next()) {
205 // read kv from block
206 Slice k
= iter
->key();
207 BlockHandle handle
= iter
->value();
209 // compare with lookaside array
210 ASSERT_EQ(k
.ToString().compare(keys
[count
]), 0);
212 ASSERT_EQ(values
[count
].offset(), handle
.offset());
213 ASSERT_EQ(values
[count
].size(), handle
.size());
217 // read block contents randomly
218 iter
= reader
.NewIterator
<IndexBlockIter
>(
219 options
.comparator
, options
.comparator
, kNullIter
, kNullStats
,
220 kTotalOrderSeek
, kIncludesSeq
, kValueIsFull
);
221 for (int i
= 0; i
< num_records
; i
++) {
222 // find a random key in the lookaside array
223 int index
= rnd
.Uniform(num_records
);
224 Slice
k(keys
[index
]);
226 // search in block for this key
228 ASSERT_TRUE(iter
->Valid());
229 BlockHandle handle
= iter
->value();
230 ASSERT_EQ(values
[index
].offset(), handle
.offset());
231 ASSERT_EQ(values
[index
].size(), handle
.size());
235 // return the block contents
236 BlockContents
GetBlockContents(std::unique_ptr
<BlockBuilder
> *builder
,
237 const std::vector
<std::string
> &keys
,
238 const std::vector
<std::string
> &values
,
239 const int /*prefix_group_size*/ = 1) {
240 builder
->reset(new BlockBuilder(1 /* restart interval */));
242 // Add only half of the keys
243 for (size_t i
= 0; i
< keys
.size(); ++i
) {
244 (*builder
)->Add(keys
[i
], values
[i
]);
246 Slice rawblock
= (*builder
)->Finish();
248 BlockContents contents
;
249 contents
.data
= rawblock
;
250 contents
.cachable
= false;
255 void CheckBlockContents(BlockContents contents
, const int max_key
,
256 const std::vector
<std::string
> &keys
,
257 const std::vector
<std::string
> &values
) {
258 const size_t prefix_size
= 6;
259 // create block reader
260 BlockContents
contents_ref(contents
.data
, contents
.cachable
,
261 contents
.compression_type
);
262 Block
reader1(std::move(contents
), kDisableGlobalSequenceNumber
);
263 Block
reader2(std::move(contents_ref
), kDisableGlobalSequenceNumber
);
265 std::unique_ptr
<const SliceTransform
> prefix_extractor(
266 NewFixedPrefixTransform(prefix_size
));
268 std::unique_ptr
<InternalIterator
> regular_iter(
269 reader2
.NewIterator
<DataBlockIter
>(BytewiseComparator(),
270 BytewiseComparator()));
272 // Seek existent keys
273 for (size_t i
= 0; i
< keys
.size(); i
++) {
274 regular_iter
->Seek(keys
[i
]);
275 ASSERT_OK(regular_iter
->status());
276 ASSERT_TRUE(regular_iter
->Valid());
278 Slice v
= regular_iter
->value();
279 ASSERT_EQ(v
.ToString().compare(values
[i
]), 0);
282 // Seek non-existent keys.
283 // For hash index, if no key with a given prefix is not found, iterator will
284 // simply be set as invalid; whereas the binary search based iterator will
285 // return the one that is closest.
286 for (int i
= 1; i
< max_key
- 1; i
+= 2) {
287 auto key
= GenerateKey(i
, 0, 0, nullptr);
288 regular_iter
->Seek(key
);
289 ASSERT_TRUE(regular_iter
->Valid());
293 // In this test case, no two key share same prefix.
294 TEST_F(BlockTest
, SimpleIndexHash
) {
295 const int kMaxKey
= 100000;
296 std::vector
<std::string
> keys
;
297 std::vector
<std::string
> values
;
298 GenerateRandomKVs(&keys
, &values
, 0 /* first key id */,
299 kMaxKey
/* last key id */, 2 /* step */,
300 8 /* padding size (8 bytes randomly generated suffix) */);
302 std::unique_ptr
<BlockBuilder
> builder
;
303 auto contents
= GetBlockContents(&builder
, keys
, values
);
305 CheckBlockContents(std::move(contents
), kMaxKey
, keys
, values
);
308 TEST_F(BlockTest
, IndexHashWithSharedPrefix
) {
309 const int kMaxKey
= 100000;
310 // for each prefix, there will be 5 keys starts with it.
311 const int kPrefixGroup
= 5;
312 std::vector
<std::string
> keys
;
313 std::vector
<std::string
> values
;
314 // Generate keys with same prefix.
315 GenerateRandomKVs(&keys
, &values
, 0, // first key id
316 kMaxKey
, // last key id
321 std::unique_ptr
<BlockBuilder
> builder
;
322 auto contents
= GetBlockContents(&builder
, keys
, values
, kPrefixGroup
);
324 CheckBlockContents(std::move(contents
), kMaxKey
, keys
, values
);
327 // A slow and accurate version of BlockReadAmpBitmap that simply store
328 // all the marked ranges in a set.
329 class BlockReadAmpBitmapSlowAndAccurate
{
331 void Mark(size_t start_offset
, size_t end_offset
) {
332 assert(end_offset
>= start_offset
);
333 marked_ranges_
.emplace(end_offset
, start_offset
);
336 void ResetCheckSequence() { iter_valid_
= false; }
338 // Return true if any byte in this range was Marked
339 // This does linear search from the previous position. When calling
340 // multiple times, `offset` needs to be incremental to get correct results.
341 // Call ResetCheckSequence() to reset it.
342 bool IsPinMarked(size_t offset
) {
344 // Has existing iterator, try linear search from
346 for (int i
= 0; i
< 64; i
++) {
347 if (offset
< iter_
->second
) {
350 if (offset
<= iter_
->first
) {
355 if (iter_
== marked_ranges_
.end()) {
361 // Initial call or have linear searched too many times.
363 iter_
= marked_ranges_
.lower_bound(
364 std::make_pair(offset
, static_cast<size_t>(0)));
365 if (iter_
== marked_ranges_
.end()) {
370 return offset
<= iter_
->first
&& offset
>= iter_
->second
;
374 std::set
<std::pair
<size_t, size_t>> marked_ranges_
;
375 std::set
<std::pair
<size_t, size_t>>::iterator iter_
;
376 bool iter_valid_
= false;
379 TEST_F(BlockTest
, BlockReadAmpBitmap
) {
380 uint32_t pin_offset
= 0;
381 SyncPoint::GetInstance()->SetCallBack(
382 "BlockReadAmpBitmap:rnd", [&pin_offset
](void* arg
) {
383 pin_offset
= *(static_cast<uint32_t*>(arg
));
385 SyncPoint::GetInstance()->EnableProcessing();
386 std::vector
<size_t> block_sizes
= {
396 1024 * 1024 * 4, // 5 MB
400 const size_t kBytesPerBit
= 64;
403 for (size_t block_size
: block_sizes
) {
404 std::shared_ptr
<Statistics
> stats
= rocksdb::CreateDBStatistics();
405 BlockReadAmpBitmap
read_amp_bitmap(block_size
, kBytesPerBit
, stats
.get());
406 BlockReadAmpBitmapSlowAndAccurate read_amp_slow_and_accurate
;
408 size_t needed_bits
= (block_size
/ kBytesPerBit
);
409 if (block_size
% kBytesPerBit
!= 0) {
413 ASSERT_EQ(stats
->getTickerCount(READ_AMP_TOTAL_READ_BYTES
), block_size
);
415 // Generate some random entries
416 std::vector
<size_t> random_entry_offsets
;
417 for (int i
= 0; i
< 1000; i
++) {
418 random_entry_offsets
.push_back(rnd
.Next() % block_size
);
420 std::sort(random_entry_offsets
.begin(), random_entry_offsets
.end());
422 std::unique(random_entry_offsets
.begin(), random_entry_offsets
.end());
423 random_entry_offsets
.resize(
424 std::distance(random_entry_offsets
.begin(), it
));
426 std::vector
<std::pair
<size_t, size_t>> random_entries
;
427 for (size_t i
= 0; i
< random_entry_offsets
.size(); i
++) {
428 size_t entry_start
= random_entry_offsets
[i
];
430 if (i
+ 1 < random_entry_offsets
.size()) {
431 entry_end
= random_entry_offsets
[i
+ 1] - 1;
433 entry_end
= block_size
- 1;
435 random_entries
.emplace_back(entry_start
, entry_end
);
438 for (size_t i
= 0; i
< random_entries
.size(); i
++) {
439 read_amp_slow_and_accurate
.ResetCheckSequence();
440 auto ¤t_entry
= random_entries
[rnd
.Next() % random_entries
.size()];
442 read_amp_bitmap
.Mark(static_cast<uint32_t>(current_entry
.first
),
443 static_cast<uint32_t>(current_entry
.second
));
444 read_amp_slow_and_accurate
.Mark(current_entry
.first
,
445 current_entry
.second
);
447 size_t total_bits
= 0;
448 for (size_t bit_idx
= 0; bit_idx
< needed_bits
; bit_idx
++) {
449 total_bits
+= read_amp_slow_and_accurate
.IsPinMarked(
450 bit_idx
* kBytesPerBit
+ pin_offset
);
452 size_t expected_estimate_useful
= total_bits
* kBytesPerBit
;
453 size_t got_estimate_useful
=
454 stats
->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES
);
455 ASSERT_EQ(expected_estimate_useful
, got_estimate_useful
);
458 SyncPoint::GetInstance()->DisableProcessing();
459 SyncPoint::GetInstance()->ClearAllCallBacks();
462 TEST_F(BlockTest
, BlockWithReadAmpBitmap
) {
464 Options options
= Options();
465 std::unique_ptr
<InternalKeyComparator
> ic
;
466 ic
.reset(new test::PlainInternalKeyComparator(options
.comparator
));
468 std::vector
<std::string
> keys
;
469 std::vector
<std::string
> values
;
470 BlockBuilder
builder(16);
471 int num_records
= 10000;
473 GenerateRandomKVs(&keys
, &values
, 0, num_records
, 1);
474 // add a bunch of records to a block
475 for (int i
= 0; i
< num_records
; i
++) {
476 builder
.Add(keys
[i
], values
[i
]);
479 Slice rawblock
= builder
.Finish();
480 const size_t kBytesPerBit
= 8;
482 // Read the block sequentially using Next()
484 std::shared_ptr
<Statistics
> stats
= rocksdb::CreateDBStatistics();
486 // create block reader
487 BlockContents contents
;
488 contents
.data
= rawblock
;
489 contents
.cachable
= true;
490 Block
reader(std::move(contents
), kDisableGlobalSequenceNumber
,
491 kBytesPerBit
, stats
.get());
493 // read contents of block sequentially
494 size_t read_bytes
= 0;
495 DataBlockIter
*iter
=
496 static_cast<DataBlockIter
*>(reader
.NewIterator
<DataBlockIter
>(
497 options
.comparator
, options
.comparator
, nullptr, stats
.get()));
498 for (iter
->SeekToFirst(); iter
->Valid(); iter
->Next()) {
500 read_bytes
+= iter
->TEST_CurrentEntrySize();
502 double semi_acc_read_amp
=
503 static_cast<double>(read_bytes
) / rawblock
.size();
504 double read_amp
= static_cast<double>(stats
->getTickerCount(
505 READ_AMP_ESTIMATE_USEFUL_BYTES
)) /
506 stats
->getTickerCount(READ_AMP_TOTAL_READ_BYTES
);
508 // Error in read amplification will be less than 1% if we are reading
510 double error_pct
= fabs(semi_acc_read_amp
- read_amp
) * 100;
511 EXPECT_LT(error_pct
, 1);
517 // Read the block sequentially using Seek()
519 std::shared_ptr
<Statistics
> stats
= rocksdb::CreateDBStatistics();
521 // create block reader
522 BlockContents contents
;
523 contents
.data
= rawblock
;
524 contents
.cachable
= true;
525 Block
reader(std::move(contents
), kDisableGlobalSequenceNumber
,
526 kBytesPerBit
, stats
.get());
528 size_t read_bytes
= 0;
529 DataBlockIter
*iter
=
530 static_cast<DataBlockIter
*>(reader
.NewIterator
<DataBlockIter
>(
531 options
.comparator
, options
.comparator
, nullptr, stats
.get()));
532 for (int i
= 0; i
< num_records
; i
++) {
535 // search in block for this key
538 read_bytes
+= iter
->TEST_CurrentEntrySize();
540 double semi_acc_read_amp
=
541 static_cast<double>(read_bytes
) / rawblock
.size();
542 double read_amp
= static_cast<double>(stats
->getTickerCount(
543 READ_AMP_ESTIMATE_USEFUL_BYTES
)) /
544 stats
->getTickerCount(READ_AMP_TOTAL_READ_BYTES
);
546 // Error in read amplification will be less than 1% if we are reading
548 double error_pct
= fabs(semi_acc_read_amp
- read_amp
) * 100;
549 EXPECT_LT(error_pct
, 1);
554 // Read the block randomly
556 std::shared_ptr
<Statistics
> stats
= rocksdb::CreateDBStatistics();
558 // create block reader
559 BlockContents contents
;
560 contents
.data
= rawblock
;
561 contents
.cachable
= true;
562 Block
reader(std::move(contents
), kDisableGlobalSequenceNumber
,
563 kBytesPerBit
, stats
.get());
565 size_t read_bytes
= 0;
566 DataBlockIter
*iter
=
567 static_cast<DataBlockIter
*>(reader
.NewIterator
<DataBlockIter
>(
568 options
.comparator
, options
.comparator
, nullptr, stats
.get()));
569 std::unordered_set
<int> read_keys
;
570 for (int i
= 0; i
< num_records
; i
++) {
571 int index
= rnd
.Uniform(num_records
);
572 Slice
k(keys
[index
]);
576 if (read_keys
.find(index
) == read_keys
.end()) {
577 read_keys
.insert(index
);
578 read_bytes
+= iter
->TEST_CurrentEntrySize();
581 double semi_acc_read_amp
=
582 static_cast<double>(read_bytes
) / rawblock
.size();
583 double read_amp
= static_cast<double>(stats
->getTickerCount(
584 READ_AMP_ESTIMATE_USEFUL_BYTES
)) /
585 stats
->getTickerCount(READ_AMP_TOTAL_READ_BYTES
);
587 double error_pct
= fabs(semi_acc_read_amp
- read_amp
) * 100;
588 // Error in read amplification will be less than 2% if we are reading
590 EXPECT_LT(error_pct
, 2);
596 TEST_F(BlockTest
, ReadAmpBitmapPow2
) {
597 std::shared_ptr
<Statistics
> stats
= rocksdb::CreateDBStatistics();
598 ASSERT_EQ(BlockReadAmpBitmap(100, 1, stats
.get()).GetBytesPerBit(), 1);
599 ASSERT_EQ(BlockReadAmpBitmap(100, 2, stats
.get()).GetBytesPerBit(), 2);
600 ASSERT_EQ(BlockReadAmpBitmap(100, 4, stats
.get()).GetBytesPerBit(), 4);
601 ASSERT_EQ(BlockReadAmpBitmap(100, 8, stats
.get()).GetBytesPerBit(), 8);
602 ASSERT_EQ(BlockReadAmpBitmap(100, 16, stats
.get()).GetBytesPerBit(), 16);
603 ASSERT_EQ(BlockReadAmpBitmap(100, 32, stats
.get()).GetBytesPerBit(), 32);
605 ASSERT_EQ(BlockReadAmpBitmap(100, 3, stats
.get()).GetBytesPerBit(), 2);
606 ASSERT_EQ(BlockReadAmpBitmap(100, 7, stats
.get()).GetBytesPerBit(), 4);
607 ASSERT_EQ(BlockReadAmpBitmap(100, 11, stats
.get()).GetBytesPerBit(), 8);
608 ASSERT_EQ(BlockReadAmpBitmap(100, 17, stats
.get()).GetBytesPerBit(), 16);
609 ASSERT_EQ(BlockReadAmpBitmap(100, 33, stats
.get()).GetBytesPerBit(), 32);
610 ASSERT_EQ(BlockReadAmpBitmap(100, 35, stats
.get()).GetBytesPerBit(), 32);
613 } // namespace rocksdb
615 int main(int argc
, char **argv
) {
616 ::testing::InitGoogleTest(&argc
, argv
);
617 return RUN_ALL_TESTS();