1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under the BSD-style license found in the
3 // LICENSE file in the root directory of this source tree. An additional grant
4 // of patent rights can be found in the PATENTS file in the same directory.
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
10 // Decodes the blocks generated by block_builder.cc.
12 #include "table/block.h"
15 #include <unordered_map>
18 #include "monitoring/perf_context_imp.h"
19 #include "port/port.h"
20 #include "port/stack_trace.h"
21 #include "rocksdb/comparator.h"
22 #include "table/block_prefix_index.h"
23 #include "table/format.h"
24 #include "util/coding.h"
25 #include "util/logging.h"
29 // Helper routine: decode the next block entry starting at "p",
30 // storing the number of shared key bytes, non_shared key bytes,
31 // and the length of the value in "*shared", "*non_shared", and
32 // "*value_length", respectively. Will not derefence past "limit".
34 // If any errors are detected, returns nullptr. Otherwise, returns a
35 // pointer to the key delta (just past the three decoded values).
36 static inline const char* DecodeEntry(const char* p
, const char* limit
,
39 uint32_t* value_length
) {
40 if (limit
- p
< 3) return nullptr;
41 *shared
= reinterpret_cast<const unsigned char*>(p
)[0];
42 *non_shared
= reinterpret_cast<const unsigned char*>(p
)[1];
43 *value_length
= reinterpret_cast<const unsigned char*>(p
)[2];
44 if ((*shared
| *non_shared
| *value_length
) < 128) {
45 // Fast path: all three values are encoded in one byte each
48 if ((p
= GetVarint32Ptr(p
, limit
, shared
)) == nullptr) return nullptr;
49 if ((p
= GetVarint32Ptr(p
, limit
, non_shared
)) == nullptr) return nullptr;
50 if ((p
= GetVarint32Ptr(p
, limit
, value_length
)) == nullptr) return nullptr;
53 if (static_cast<uint32_t>(limit
- p
) < (*non_shared
+ *value_length
)) {
59 void BlockIter::Next() {
64 void BlockIter::Prev() {
67 assert(prev_entries_idx_
== -1 ||
68 static_cast<size_t>(prev_entries_idx_
) < prev_entries_
.size());
69 // Check if we can use cached prev_entries_
70 if (prev_entries_idx_
> 0 &&
71 prev_entries_
[prev_entries_idx_
].offset
== current_
) {
72 // Read cached CachedPrevEntry
74 const CachedPrevEntry
& current_prev_entry
=
75 prev_entries_
[prev_entries_idx_
];
77 const char* key_ptr
= nullptr;
78 if (current_prev_entry
.key_ptr
!= nullptr) {
79 // The key is not delta encoded and stored in the data block
80 key_ptr
= current_prev_entry
.key_ptr
;
83 // The key is delta encoded and stored in prev_entries_keys_buff_
84 key_ptr
= prev_entries_keys_buff_
.data() + current_prev_entry
.key_offset
;
87 const Slice
current_key(key_ptr
, current_prev_entry
.key_size
);
89 current_
= current_prev_entry
.offset
;
90 key_
.SetInternalKey(current_key
, false /* copy */);
91 value_
= current_prev_entry
.value
;
96 // Clear prev entries cache
97 prev_entries_idx_
= -1;
98 prev_entries_
.clear();
99 prev_entries_keys_buff_
.clear();
101 // Scan backwards to a restart point before current_
102 const uint32_t original
= current_
;
103 while (GetRestartPoint(restart_index_
) >= original
) {
104 if (restart_index_
== 0) {
106 current_
= restarts_
;
107 restart_index_
= num_restarts_
;
113 SeekToRestartPoint(restart_index_
);
116 if (!ParseNextKey()) {
119 Slice current_key
= key();
121 if (key_
.IsKeyPinned()) {
122 // The key is not delta encoded
123 prev_entries_
.emplace_back(current_
, current_key
.data(), 0,
124 current_key
.size(), value());
126 // The key is delta encoded, cache decoded key in buffer
127 size_t new_key_offset
= prev_entries_keys_buff_
.size();
128 prev_entries_keys_buff_
.append(current_key
.data(), current_key
.size());
130 prev_entries_
.emplace_back(current_
, nullptr, new_key_offset
,
131 current_key
.size(), value());
133 // Loop until end of current entry hits the start of original entry
134 } while (NextEntryOffset() < original
);
135 prev_entries_idx_
= static_cast<int32_t>(prev_entries_
.size()) - 1;
138 void BlockIter::Seek(const Slice
& target
) {
139 PERF_TIMER_GUARD(block_seek_nanos
);
140 if (data_
== nullptr) { // Not init yet
146 ok
= PrefixSeek(target
, &index
);
148 ok
= BinarySeek(target
, 0, num_restarts_
- 1, &index
);
154 SeekToRestartPoint(index
);
155 // Linear search (within restart block) for first key >= target
158 if (!ParseNextKey() || Compare(key_
.GetInternalKey(), target
) >= 0) {
164 void BlockIter::SeekForPrev(const Slice
& target
) {
165 PERF_TIMER_GUARD(block_seek_nanos
);
166 if (data_
== nullptr) { // Not init yet
171 ok
= BinarySeek(target
, 0, num_restarts_
- 1, &index
);
176 SeekToRestartPoint(index
);
177 // Linear search (within restart block) for first key >= target
179 while (ParseNextKey() && Compare(key_
.GetInternalKey(), target
) < 0) {
184 while (Valid() && Compare(key_
.GetInternalKey(), target
) > 0) {
190 void BlockIter::SeekToFirst() {
191 if (data_
== nullptr) { // Not init yet
194 SeekToRestartPoint(0);
198 void BlockIter::SeekToLast() {
199 if (data_
== nullptr) { // Not init yet
202 SeekToRestartPoint(num_restarts_
- 1);
203 while (ParseNextKey() && NextEntryOffset() < restarts_
) {
208 void BlockIter::CorruptionError() {
209 current_
= restarts_
;
210 restart_index_
= num_restarts_
;
211 status_
= Status::Corruption("bad entry in block");
216 bool BlockIter::ParseNextKey() {
217 current_
= NextEntryOffset();
218 const char* p
= data_
+ current_
;
219 const char* limit
= data_
+ restarts_
; // Restarts come right after data
221 // No more entries to return. Mark as invalid.
222 current_
= restarts_
;
223 restart_index_
= num_restarts_
;
228 uint32_t shared
, non_shared
, value_length
;
229 p
= DecodeEntry(p
, limit
, &shared
, &non_shared
, &value_length
);
230 if (p
== nullptr || key_
.Size() < shared
) {
235 // If this key dont share any bytes with prev key then we dont need
236 // to decode it and can use it's address in the block directly.
237 key_
.SetInternalKey(Slice(p
, non_shared
), false /* copy */);
240 // This key share `shared` bytes with prev key, we need to decode it
241 key_
.TrimAppend(shared
, p
, non_shared
);
245 if (global_seqno_
!= kDisableGlobalSequenceNumber
) {
246 // If we are reading a file with a global sequence number we should
247 // expect that all encoded sequence numbers are zeros and all value
248 // types are kTypeValue
249 assert(GetInternalKeySeqno(key_
.GetInternalKey()) == 0);
250 assert(ExtractValueType(key_
.GetInternalKey()) == ValueType::kTypeValue
);
253 // TODO(tec): Investigate updating the seqno in the loaded block
254 // directly instead of doing a copy and update.
256 // We cannot use the key address in the block directly because
257 // we have a global_seqno_ that will overwrite the encoded one.
262 key_
.UpdateInternalKey(global_seqno_
, ValueType::kTypeValue
);
265 value_
= Slice(p
+ non_shared
, value_length
);
266 while (restart_index_
+ 1 < num_restarts_
&&
267 GetRestartPoint(restart_index_
+ 1) < current_
) {
274 // Binary search in restart array to find the first restart point that
275 // is either the last restart point with a key less than target,
276 // which means the key of next restart point is larger than target, or
277 // the first restart point with a key = target
278 bool BlockIter::BinarySeek(const Slice
& target
, uint32_t left
, uint32_t right
,
280 assert(left
<= right
);
282 while (left
< right
) {
283 uint32_t mid
= (left
+ right
+ 1) / 2;
284 uint32_t region_offset
= GetRestartPoint(mid
);
285 uint32_t shared
, non_shared
, value_length
;
286 const char* key_ptr
= DecodeEntry(data_
+ region_offset
, data_
+ restarts_
,
287 &shared
, &non_shared
, &value_length
);
288 if (key_ptr
== nullptr || (shared
!= 0)) {
292 Slice
mid_key(key_ptr
, non_shared
);
293 int cmp
= Compare(mid_key
, target
);
295 // Key at "mid" is smaller than "target". Therefore all
296 // blocks before "mid" are uninteresting.
298 } else if (cmp
> 0) {
299 // Key at "mid" is >= "target". Therefore all blocks at or
300 // after "mid" are uninteresting.
311 // Compare target key and the block key of the block of `block_index`.
312 // Return -1 if error.
313 int BlockIter::CompareBlockKey(uint32_t block_index
, const Slice
& target
) {
314 uint32_t region_offset
= GetRestartPoint(block_index
);
315 uint32_t shared
, non_shared
, value_length
;
316 const char* key_ptr
= DecodeEntry(data_
+ region_offset
, data_
+ restarts_
,
317 &shared
, &non_shared
, &value_length
);
318 if (key_ptr
== nullptr || (shared
!= 0)) {
320 return 1; // Return target is smaller
322 Slice
block_key(key_ptr
, non_shared
);
323 return Compare(block_key
, target
);
326 // Binary search in block_ids to find the first block
327 // with a key >= target
328 bool BlockIter::BinaryBlockIndexSeek(const Slice
& target
, uint32_t* block_ids
,
329 uint32_t left
, uint32_t right
,
331 assert(left
<= right
);
332 uint32_t left_bound
= left
;
334 while (left
<= right
) {
335 uint32_t mid
= (right
+ left
) / 2;
337 int cmp
= CompareBlockKey(block_ids
[mid
], target
);
342 // Key at "target" is larger than "mid". Therefore all
343 // blocks before or at "mid" are uninteresting.
346 // Key at "target" is <= "mid". Therefore all blocks
347 // after "mid" are uninteresting.
348 // If there is only one block left, we found it.
349 if (left
== right
) break;
355 // In one of the two following cases:
356 // (1) left is the first one of block_ids
357 // (2) there is a gap of blocks between block of `left` and `left-1`.
358 // we can further distinguish the case of key in the block or key not
359 // existing, by comparing the target key and the key of the previous
360 // block to the left of the block found.
361 if (block_ids
[left
] > 0 &&
362 (left
== left_bound
|| block_ids
[left
- 1] != block_ids
[left
] - 1) &&
363 CompareBlockKey(block_ids
[left
] - 1, target
) > 0) {
364 current_
= restarts_
;
368 *index
= block_ids
[left
];
371 assert(left
> right
);
372 // Mark iterator invalid
373 current_
= restarts_
;
378 bool BlockIter::PrefixSeek(const Slice
& target
, uint32_t* index
) {
379 assert(prefix_index_
);
380 uint32_t* block_ids
= nullptr;
381 uint32_t num_blocks
= prefix_index_
->GetBlocks(target
, &block_ids
);
383 if (num_blocks
== 0) {
384 current_
= restarts_
;
387 return BinaryBlockIndexSeek(target
, block_ids
, 0, num_blocks
- 1, index
);
391 uint32_t Block::NumRestarts() const {
392 assert(size_
>= 2*sizeof(uint32_t));
393 return DecodeFixed32(data_
+ size_
- sizeof(uint32_t));
396 Block::Block(BlockContents
&& contents
, SequenceNumber _global_seqno
,
397 size_t read_amp_bytes_per_bit
, Statistics
* statistics
)
398 : contents_(std::move(contents
)),
399 data_(contents_
.data
.data()),
400 size_(contents_
.data
.size()),
401 global_seqno_(_global_seqno
) {
402 if (size_
< sizeof(uint32_t)) {
403 size_
= 0; // Error marker
406 static_cast<uint32_t>(size_
) - (1 + NumRestarts()) * sizeof(uint32_t);
407 if (restart_offset_
> size_
- sizeof(uint32_t)) {
408 // The size is too small for NumRestarts() and therefore
409 // restart_offset_ wrapped around.
413 if (read_amp_bytes_per_bit
!= 0 && statistics
&& size_
!= 0) {
414 read_amp_bitmap_
.reset(new BlockReadAmpBitmap(
415 restart_offset_
, read_amp_bytes_per_bit
, statistics
));
419 InternalIterator
* Block::NewIterator(const Comparator
* cmp
, BlockIter
* iter
,
420 bool total_order_seek
, Statistics
* stats
) {
421 if (size_
< 2*sizeof(uint32_t)) {
422 if (iter
!= nullptr) {
423 iter
->SetStatus(Status::Corruption("bad block contents"));
426 return NewErrorInternalIterator(Status::Corruption("bad block contents"));
429 const uint32_t num_restarts
= NumRestarts();
430 if (num_restarts
== 0) {
431 if (iter
!= nullptr) {
432 iter
->SetStatus(Status::OK());
435 return NewEmptyInternalIterator();
438 BlockPrefixIndex
* prefix_index_ptr
=
439 total_order_seek
? nullptr : prefix_index_
.get();
441 if (iter
!= nullptr) {
442 iter
->Initialize(cmp
, data_
, restart_offset_
, num_restarts
,
443 prefix_index_ptr
, global_seqno_
, read_amp_bitmap_
.get());
445 iter
= new BlockIter(cmp
, data_
, restart_offset_
, num_restarts
,
446 prefix_index_ptr
, global_seqno_
,
447 read_amp_bitmap_
.get());
450 if (read_amp_bitmap_
) {
451 if (read_amp_bitmap_
->GetStatistics() != stats
) {
452 // DB changed the Statistics pointer, we need to notify read_amp_bitmap_
453 read_amp_bitmap_
->SetStatistics(stats
);
461 void Block::SetBlockPrefixIndex(BlockPrefixIndex
* prefix_index
) {
462 prefix_index_
.reset(prefix_index
);
465 size_t Block::ApproximateMemoryUsage() const {
466 size_t usage
= usable_size();
468 usage
+= prefix_index_
->ApproximateMemoryUsage();
473 } // namespace rocksdb