1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
10 // Decodes the blocks generated by block_builder.cc.
12 #include "table/block.h"
15 #include <unordered_map>
18 #include "monitoring/perf_context_imp.h"
19 #include "port/port.h"
20 #include "port/stack_trace.h"
21 #include "rocksdb/comparator.h"
22 #include "table/block_prefix_index.h"
23 #include "table/data_block_footer.h"
24 #include "table/format.h"
25 #include "util/coding.h"
26 #include "util/logging.h"
30 // Helper routine: decode the next block entry starting at "p",
31 // storing the number of shared key bytes, non_shared key bytes,
32 // and the length of the value in "*shared", "*non_shared", and
33 // "*value_length", respectively. Will not derefence past "limit".
35 // If any errors are detected, returns nullptr. Otherwise, returns a
36 // pointer to the key delta (just past the three decoded values).
38 inline const char* operator()(const char* p
, const char* limit
,
39 uint32_t* shared
, uint32_t* non_shared
,
40 uint32_t* value_length
) {
41 // We need 2 bytes for shared and non_shared size. We also need one more
42 // byte either for value size or the actual value in case of value delta
44 assert(limit
- p
>= 3);
45 *shared
= reinterpret_cast<const unsigned char*>(p
)[0];
46 *non_shared
= reinterpret_cast<const unsigned char*>(p
)[1];
47 *value_length
= reinterpret_cast<const unsigned char*>(p
)[2];
48 if ((*shared
| *non_shared
| *value_length
) < 128) {
49 // Fast path: all three values are encoded in one byte each
52 if ((p
= GetVarint32Ptr(p
, limit
, shared
)) == nullptr) return nullptr;
53 if ((p
= GetVarint32Ptr(p
, limit
, non_shared
)) == nullptr) return nullptr;
54 if ((p
= GetVarint32Ptr(p
, limit
, value_length
)) == nullptr) {
59 // Using an assert in place of "return null" since we should not pay the
60 // cost of checking for corruption on every single key decoding
61 assert(!(static_cast<uint32_t>(limit
- p
) < (*non_shared
+ *value_length
)));
66 // Helper routine: similar to DecodeEntry but does not have assertions.
67 // Instead, returns nullptr so that caller can detect and report failure.
68 struct CheckAndDecodeEntry
{
69 inline const char* operator()(const char* p
, const char* limit
,
70 uint32_t* shared
, uint32_t* non_shared
,
71 uint32_t* value_length
) {
72 // We need 2 bytes for shared and non_shared size. We also need one more
73 // byte either for value size or the actual value in case of value delta
78 *shared
= reinterpret_cast<const unsigned char*>(p
)[0];
79 *non_shared
= reinterpret_cast<const unsigned char*>(p
)[1];
80 *value_length
= reinterpret_cast<const unsigned char*>(p
)[2];
81 if ((*shared
| *non_shared
| *value_length
) < 128) {
82 // Fast path: all three values are encoded in one byte each
85 if ((p
= GetVarint32Ptr(p
, limit
, shared
)) == nullptr) return nullptr;
86 if ((p
= GetVarint32Ptr(p
, limit
, non_shared
)) == nullptr) return nullptr;
87 if ((p
= GetVarint32Ptr(p
, limit
, value_length
)) == nullptr) {
92 if (static_cast<uint32_t>(limit
- p
) < (*non_shared
+ *value_length
)) {
100 inline const char* operator()(const char* p
, const char* limit
,
101 uint32_t* shared
, uint32_t* non_shared
) {
102 uint32_t value_length
;
103 return DecodeEntry()(p
, limit
, shared
, non_shared
, &value_length
);
107 // In format_version 4, which is used by index blocks, the value size is not
108 // encoded before the entry, as the value is known to be the handle with the
111 inline const char* operator()(const char* p
, const char* limit
,
112 uint32_t* shared
, uint32_t* non_shared
) {
113 // We need 2 bytes for shared and non_shared size. We also need one more
114 // byte either for value size or the actual value in case of value delta
116 if (limit
- p
< 3) return nullptr;
117 *shared
= reinterpret_cast<const unsigned char*>(p
)[0];
118 *non_shared
= reinterpret_cast<const unsigned char*>(p
)[1];
119 if ((*shared
| *non_shared
) < 128) {
120 // Fast path: all three values are encoded in one byte each
123 if ((p
= GetVarint32Ptr(p
, limit
, shared
)) == nullptr) return nullptr;
124 if ((p
= GetVarint32Ptr(p
, limit
, non_shared
)) == nullptr) return nullptr;
130 void DataBlockIter::Next() {
132 ParseNextDataKey
<DecodeEntry
>();
135 void DataBlockIter::NextOrReport() {
137 ParseNextDataKey
<CheckAndDecodeEntry
>();
140 void IndexBlockIter::Next() {
145 void IndexBlockIter::Prev() {
147 // Scan backwards to a restart point before current_
148 const uint32_t original
= current_
;
149 while (GetRestartPoint(restart_index_
) >= original
) {
150 if (restart_index_
== 0) {
152 current_
= restarts_
;
153 restart_index_
= num_restarts_
;
158 SeekToRestartPoint(restart_index_
);
160 if (!ParseNextIndexKey()) {
163 // Loop until end of current entry hits the start of original entry
164 } while (NextEntryOffset() < original
);
167 // Similar to IndexBlockIter::Prev but also caches the prev entries
168 void DataBlockIter::Prev() {
171 assert(prev_entries_idx_
== -1 ||
172 static_cast<size_t>(prev_entries_idx_
) < prev_entries_
.size());
173 // Check if we can use cached prev_entries_
174 if (prev_entries_idx_
> 0 &&
175 prev_entries_
[prev_entries_idx_
].offset
== current_
) {
176 // Read cached CachedPrevEntry
178 const CachedPrevEntry
& current_prev_entry
=
179 prev_entries_
[prev_entries_idx_
];
181 const char* key_ptr
= nullptr;
182 if (current_prev_entry
.key_ptr
!= nullptr) {
183 // The key is not delta encoded and stored in the data block
184 key_ptr
= current_prev_entry
.key_ptr
;
187 // The key is delta encoded and stored in prev_entries_keys_buff_
188 key_ptr
= prev_entries_keys_buff_
.data() + current_prev_entry
.key_offset
;
191 const Slice
current_key(key_ptr
, current_prev_entry
.key_size
);
193 current_
= current_prev_entry
.offset
;
194 key_
.SetKey(current_key
, false /* copy */);
195 value_
= current_prev_entry
.value
;
200 // Clear prev entries cache
201 prev_entries_idx_
= -1;
202 prev_entries_
.clear();
203 prev_entries_keys_buff_
.clear();
205 // Scan backwards to a restart point before current_
206 const uint32_t original
= current_
;
207 while (GetRestartPoint(restart_index_
) >= original
) {
208 if (restart_index_
== 0) {
210 current_
= restarts_
;
211 restart_index_
= num_restarts_
;
217 SeekToRestartPoint(restart_index_
);
220 if (!ParseNextDataKey
<DecodeEntry
>()) {
223 Slice current_key
= key();
225 if (key_
.IsKeyPinned()) {
226 // The key is not delta encoded
227 prev_entries_
.emplace_back(current_
, current_key
.data(), 0,
228 current_key
.size(), value());
230 // The key is delta encoded, cache decoded key in buffer
231 size_t new_key_offset
= prev_entries_keys_buff_
.size();
232 prev_entries_keys_buff_
.append(current_key
.data(), current_key
.size());
234 prev_entries_
.emplace_back(current_
, nullptr, new_key_offset
,
235 current_key
.size(), value());
237 // Loop until end of current entry hits the start of original entry
238 } while (NextEntryOffset() < original
);
239 prev_entries_idx_
= static_cast<int32_t>(prev_entries_
.size()) - 1;
242 void DataBlockIter::Seek(const Slice
& target
) {
243 Slice seek_key
= target
;
244 PERF_TIMER_GUARD(block_seek_nanos
);
245 if (data_
== nullptr) { // Not init yet
249 bool ok
= BinarySeek
<DecodeKey
>(seek_key
, 0, num_restarts_
- 1, &index
,
255 SeekToRestartPoint(index
);
256 // Linear search (within restart block) for first key >= target
259 if (!ParseNextDataKey
<DecodeEntry
>() || Compare(key_
, seek_key
) >= 0) {
265 // Optimized Seek for point lookup for an internal key `target`
266 // target = "seek_user_key @ type | seqno".
268 // For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion,
269 // or kTypeBlobIndex, this function behaves identically as Seek().
271 // For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion,
272 // or kTypeBlobIndex:
274 // If the return value is FALSE, iter location is undefined, and it means:
275 // 1) there is no key in this block falling into the range:
276 // ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"],
278 // 2) the last key of this block has a greater user_key from seek_user_key
280 // If the return value is TRUE, iter location has two possibilies:
281 // 1) If iter is valid, it is set to a location as if set by BinarySeek. In
282 // this case, it points to the first key_ with a larger user_key or a
283 // matching user_key with a seqno no greater than the seeking seqno.
284 // 2) If the iter is invalid, it means that either all the user_key is less
285 // than the seek_user_key, or the block ends with a matching user_key but
286 // with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno
288 bool DataBlockIter::SeekForGetImpl(const Slice
& target
) {
289 Slice user_key
= ExtractUserKey(target
);
290 uint32_t map_offset
= restarts_
+ num_restarts_
* sizeof(uint32_t);
291 uint8_t entry
= data_block_hash_index_
->Lookup(data_
, map_offset
, user_key
);
293 if (entry
== kCollision
) {
294 // HashSeek not effective, falling back
299 if (entry
== kNoEntry
) {
300 // Even if we cannot find the user_key in this block, the result may
301 // exist in the next block. Consider this exmpale:
303 // Block N: [aab@100, ... , app@120]
304 // bounary key: axy@50 (we make minimal assumption about a boundary key)
305 // Block N+1: [axy@10, ... ]
307 // If seek_key = axy@60, the search will starts from Block N.
308 // Even if the user_key is not found in the hash map, the caller still
309 // have to conntinue searching the next block.
311 // In this case, we pretend the key is the the last restart interval.
312 // The while-loop below will search the last restart interval for the
313 // key. It will stop at the first key that is larger than the seek_key,
314 // or to the end of the block if no one is larger.
315 entry
= static_cast<uint8_t>(num_restarts_
- 1);
318 uint32_t restart_index
= entry
;
320 // check if the key is in the restart_interval
321 assert(restart_index
< num_restarts_
);
322 SeekToRestartPoint(restart_index
);
324 const char* limit
= nullptr;
325 if (restart_index_
+ 1 < num_restarts_
) {
326 limit
= data_
+ GetRestartPoint(restart_index_
+ 1);
328 limit
= data_
+ restarts_
;
332 // Here we only linear seek the target key inside the restart interval.
333 // If a key does not exist inside a restart interval, we avoid
334 // further searching the block content accross restart interval boundary.
336 // TODO(fwu): check the left and write boundary of the restart interval
337 // to avoid linear seek a target key that is out of range.
338 if (!ParseNextDataKey
<DecodeEntry
>(limit
) || Compare(key_
, target
) >= 0) {
339 // we stop at the first potential matching user key.
344 if (current_
== restarts_
) {
345 // Search reaches to the end of the block. There are three possibilites:
346 // 1) there is only one user_key match in the block (otherwise collsion).
347 // the matching user_key resides in the last restart interval, and it
348 // is the last key of the restart interval and of the block as well.
349 // ParseNextDataKey() skiped it as its [ type | seqno ] is smaller.
351 // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
352 // AND all existing user_keys in the restart interval are smaller than
355 // 3) The seek_key is a false positive and happens to be hashed to the
356 // last restart interval, AND all existing user_keys in the restart
357 // interval are smaller than seek_user_key.
359 // The result may exist in the next block each case, so we return true.
363 if (user_comparator_
->Compare(key_
.GetUserKey(), user_key
) != 0) {
364 // the key is not in this block and cannot be at the next block either.
368 // Here we are conservative and only support a limited set of cases
369 ValueType value_type
= ExtractValueType(key_
.GetKey());
370 if (value_type
!= ValueType::kTypeValue
&&
371 value_type
!= ValueType::kTypeDeletion
&&
372 value_type
!= ValueType::kTypeSingleDeletion
&&
373 value_type
!= ValueType::kTypeBlobIndex
) {
378 // Result found, and the iter is correctly set.
382 void IndexBlockIter::Seek(const Slice
& target
) {
383 Slice seek_key
= target
;
384 if (!key_includes_seq_
) {
385 seek_key
= ExtractUserKey(target
);
387 PERF_TIMER_GUARD(block_seek_nanos
);
388 if (data_
== nullptr) { // Not init yet
394 ok
= PrefixSeek(target
, &index
);
395 } else if (value_delta_encoded_
) {
396 ok
= BinarySeek
<DecodeKeyV4
>(seek_key
, 0, num_restarts_
- 1, &index
,
399 ok
= BinarySeek
<DecodeKey
>(seek_key
, 0, num_restarts_
- 1, &index
,
406 SeekToRestartPoint(index
);
407 // Linear search (within restart block) for first key >= target
410 if (!ParseNextIndexKey() || Compare(key_
, seek_key
) >= 0) {
416 void DataBlockIter::SeekForPrev(const Slice
& target
) {
417 PERF_TIMER_GUARD(block_seek_nanos
);
418 Slice seek_key
= target
;
419 if (data_
== nullptr) { // Not init yet
423 bool ok
= BinarySeek
<DecodeKey
>(seek_key
, 0, num_restarts_
- 1, &index
,
429 SeekToRestartPoint(index
);
430 // Linear search (within restart block) for first key >= seek_key
432 while (ParseNextDataKey
<DecodeEntry
>() && Compare(key_
, seek_key
) < 0) {
437 while (Valid() && Compare(key_
, seek_key
) > 0) {
443 void DataBlockIter::SeekToFirst() {
444 if (data_
== nullptr) { // Not init yet
447 SeekToRestartPoint(0);
448 ParseNextDataKey
<DecodeEntry
>();
451 void DataBlockIter::SeekToFirstOrReport() {
452 if (data_
== nullptr) { // Not init yet
455 SeekToRestartPoint(0);
456 ParseNextDataKey
<CheckAndDecodeEntry
>();
459 void IndexBlockIter::SeekToFirst() {
460 if (data_
== nullptr) { // Not init yet
463 SeekToRestartPoint(0);
467 void DataBlockIter::SeekToLast() {
468 if (data_
== nullptr) { // Not init yet
471 SeekToRestartPoint(num_restarts_
- 1);
472 while (ParseNextDataKey
<DecodeEntry
>() && NextEntryOffset() < restarts_
) {
477 void IndexBlockIter::SeekToLast() {
478 if (data_
== nullptr) { // Not init yet
481 SeekToRestartPoint(num_restarts_
- 1);
482 while (ParseNextIndexKey() && NextEntryOffset() < restarts_
) {
487 template <class TValue
>
488 void BlockIter
<TValue
>::CorruptionError() {
489 current_
= restarts_
;
490 restart_index_
= num_restarts_
;
491 status_
= Status::Corruption("bad entry in block");
496 template <typename DecodeEntryFunc
>
497 bool DataBlockIter::ParseNextDataKey(const char* limit
) {
498 current_
= NextEntryOffset();
499 const char* p
= data_
+ current_
;
501 limit
= data_
+ restarts_
; // Restarts come right after data
505 // No more entries to return. Mark as invalid.
506 current_
= restarts_
;
507 restart_index_
= num_restarts_
;
512 uint32_t shared
, non_shared
, value_length
;
513 p
= DecodeEntryFunc()(p
, limit
, &shared
, &non_shared
, &value_length
);
514 if (p
== nullptr || key_
.Size() < shared
) {
519 // If this key dont share any bytes with prev key then we dont need
520 // to decode it and can use it's address in the block directly.
521 key_
.SetKey(Slice(p
, non_shared
), false /* copy */);
524 // This key share `shared` bytes with prev key, we need to decode it
525 key_
.TrimAppend(shared
, p
, non_shared
);
529 if (global_seqno_
!= kDisableGlobalSequenceNumber
) {
530 // If we are reading a file with a global sequence number we should
531 // expect that all encoded sequence numbers are zeros and any value
532 // type is kTypeValue, kTypeMerge, kTypeDeletion, or kTypeRangeDeletion.
533 assert(GetInternalKeySeqno(key_
.GetInternalKey()) == 0);
535 ValueType value_type
= ExtractValueType(key_
.GetKey());
536 assert(value_type
== ValueType::kTypeValue
||
537 value_type
== ValueType::kTypeMerge
||
538 value_type
== ValueType::kTypeDeletion
||
539 value_type
== ValueType::kTypeRangeDeletion
);
542 // TODO(tec): Investigate updating the seqno in the loaded block
543 // directly instead of doing a copy and update.
545 // We cannot use the key address in the block directly because
546 // we have a global_seqno_ that will overwrite the encoded one.
551 key_
.UpdateInternalKey(global_seqno_
, value_type
);
554 value_
= Slice(p
+ non_shared
, value_length
);
556 while (restart_index_
+ 1 < num_restarts_
&&
557 GetRestartPoint(restart_index_
+ 1) < current_
) {
561 // else we are in the middle of a restart interval and the restart_index_
562 // thus has not changed
567 bool IndexBlockIter::ParseNextIndexKey() {
568 current_
= NextEntryOffset();
569 const char* p
= data_
+ current_
;
570 const char* limit
= data_
+ restarts_
; // Restarts come right after data
572 // No more entries to return. Mark as invalid.
573 current_
= restarts_
;
574 restart_index_
= num_restarts_
;
579 uint32_t shared
, non_shared
, value_length
;
580 if (value_delta_encoded_
) {
581 p
= DecodeKeyV4()(p
, limit
, &shared
, &non_shared
);
584 p
= DecodeEntry()(p
, limit
, &shared
, &non_shared
, &value_length
);
586 if (p
== nullptr || key_
.Size() < shared
) {
591 // If this key dont share any bytes with prev key then we dont need
592 // to decode it and can use it's address in the block directly.
593 key_
.SetKey(Slice(p
, non_shared
), false /* copy */);
596 // This key share `shared` bytes with prev key, we need to decode it
597 key_
.TrimAppend(shared
, p
, non_shared
);
600 value_
= Slice(p
+ non_shared
, value_length
);
602 while (restart_index_
+ 1 < num_restarts_
&&
603 GetRestartPoint(restart_index_
+ 1) < current_
) {
607 // else we are in the middle of a restart interval and the restart_index_
608 // thus has not changed
609 if (value_delta_encoded_
) {
610 assert(value_length
== 0);
611 DecodeCurrentValue(shared
);
617 // restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
618 // restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
620 // restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
621 // where, k is key, v is value, and its encoding is in parenthesis.
622 // The format of each key is (shared_size, non_shared_size, shared, non_shared)
623 // The format of each value, i.e., block hanlde, is (offset, size) whenever the
624 // shared_size is 0, which included the first entry in each restart point.
625 // Otherwise the format is delta-size = block handle size - size of last block
627 void IndexBlockIter::DecodeCurrentValue(uint32_t shared
) {
628 assert(value_delta_encoded_
);
629 const char* limit
= data_
+ restarts_
;
632 const char* newp
= GetVarint64Ptr(value_
.data(), limit
, &o
);
634 newp
= GetVarint64Ptr(newp
, limit
, &s
);
636 decoded_value_
= BlockHandle(o
, s
);
637 value_
= Slice(value_
.data(), newp
- value_
.data());
639 uint64_t next_value_base
=
640 decoded_value_
.offset() + decoded_value_
.size() + kBlockTrailerSize
;
642 const char* newp
= GetVarsignedint64Ptr(value_
.data(), limit
, &delta
);
644 BlockHandle(next_value_base
, decoded_value_
.size() + delta
);
645 value_
= Slice(value_
.data(), newp
- value_
.data());
649 // Binary search in restart array to find the first restart point that
650 // is either the last restart point with a key less than target,
651 // which means the key of next restart point is larger than target, or
652 // the first restart point with a key = target
653 template <class TValue
>
654 template <typename DecodeKeyFunc
>
655 bool BlockIter
<TValue
>::BinarySeek(const Slice
& target
, uint32_t left
,
656 uint32_t right
, uint32_t* index
,
657 const Comparator
* comp
) {
658 assert(left
<= right
);
660 while (left
< right
) {
661 uint32_t mid
= (left
+ right
+ 1) / 2;
662 uint32_t region_offset
= GetRestartPoint(mid
);
663 uint32_t shared
, non_shared
;
664 const char* key_ptr
= DecodeKeyFunc()(
665 data_
+ region_offset
, data_
+ restarts_
, &shared
, &non_shared
);
666 if (key_ptr
== nullptr || (shared
!= 0)) {
670 Slice
mid_key(key_ptr
, non_shared
);
671 int cmp
= comp
->Compare(mid_key
, target
);
673 // Key at "mid" is smaller than "target". Therefore all
674 // blocks before "mid" are uninteresting.
676 } else if (cmp
> 0) {
677 // Key at "mid" is >= "target". Therefore all blocks at or
678 // after "mid" are uninteresting.
689 // Compare target key and the block key of the block of `block_index`.
690 // Return -1 if error.
691 int IndexBlockIter::CompareBlockKey(uint32_t block_index
, const Slice
& target
) {
692 uint32_t region_offset
= GetRestartPoint(block_index
);
693 uint32_t shared
, non_shared
;
694 const char* key_ptr
=
696 ? DecodeKeyV4()(data_
+ region_offset
, data_
+ restarts_
, &shared
,
698 : DecodeKey()(data_
+ region_offset
, data_
+ restarts_
, &shared
,
700 if (key_ptr
== nullptr || (shared
!= 0)) {
702 return 1; // Return target is smaller
704 Slice
block_key(key_ptr
, non_shared
);
705 return Compare(block_key
, target
);
708 // Binary search in block_ids to find the first block
709 // with a key >= target
710 bool IndexBlockIter::BinaryBlockIndexSeek(const Slice
& target
,
711 uint32_t* block_ids
, uint32_t left
,
712 uint32_t right
, uint32_t* index
) {
713 assert(left
<= right
);
714 uint32_t left_bound
= left
;
716 while (left
<= right
) {
717 uint32_t mid
= (right
+ left
) / 2;
719 int cmp
= CompareBlockKey(block_ids
[mid
], target
);
724 // Key at "target" is larger than "mid". Therefore all
725 // blocks before or at "mid" are uninteresting.
728 // Key at "target" is <= "mid". Therefore all blocks
729 // after "mid" are uninteresting.
730 // If there is only one block left, we found it.
731 if (left
== right
) break;
737 // In one of the two following cases:
738 // (1) left is the first one of block_ids
739 // (2) there is a gap of blocks between block of `left` and `left-1`.
740 // we can further distinguish the case of key in the block or key not
741 // existing, by comparing the target key and the key of the previous
742 // block to the left of the block found.
743 if (block_ids
[left
] > 0 &&
744 (left
== left_bound
|| block_ids
[left
- 1] != block_ids
[left
] - 1) &&
745 CompareBlockKey(block_ids
[left
] - 1, target
) > 0) {
746 current_
= restarts_
;
750 *index
= block_ids
[left
];
753 assert(left
> right
);
754 // Mark iterator invalid
755 current_
= restarts_
;
760 bool IndexBlockIter::PrefixSeek(const Slice
& target
, uint32_t* index
) {
761 assert(prefix_index_
);
762 Slice seek_key
= target
;
763 if (!key_includes_seq_
) {
764 seek_key
= ExtractUserKey(target
);
766 uint32_t* block_ids
= nullptr;
767 uint32_t num_blocks
= prefix_index_
->GetBlocks(target
, &block_ids
);
769 if (num_blocks
== 0) {
770 current_
= restarts_
;
773 return BinaryBlockIndexSeek(seek_key
, block_ids
, 0, num_blocks
- 1, index
);
777 uint32_t Block::NumRestarts() const {
778 assert(size_
>= 2 * sizeof(uint32_t));
779 uint32_t block_footer
= DecodeFixed32(data_
+ size_
- sizeof(uint32_t));
780 uint32_t num_restarts
= block_footer
;
781 if (size_
> kMaxBlockSizeSupportedByHashIndex
) {
782 // In BlockBuilder, we have ensured a block with HashIndex is less than
783 // kMaxBlockSizeSupportedByHashIndex (64KiB).
785 // Therefore, if we encounter a block with a size > 64KiB, the block
786 // cannot have HashIndex. So the footer will directly interpreted as
789 // Such check is for backward compatibility. We can ensure legacy block
790 // with a vary large num_restarts i.e. >= 0x80000000 can be interpreted
791 // correctly as no HashIndex even if the MSB of num_restarts is set.
794 BlockBasedTableOptions::DataBlockIndexType index_type
;
795 UnPackIndexTypeAndNumRestarts(block_footer
, &index_type
, &num_restarts
);
799 BlockBasedTableOptions::DataBlockIndexType
Block::IndexType() const {
800 assert(size_
>= 2 * sizeof(uint32_t));
801 if (size_
> kMaxBlockSizeSupportedByHashIndex
) {
802 // The check is for the same reason as that in NumRestarts()
803 return BlockBasedTableOptions::kDataBlockBinarySearch
;
805 uint32_t block_footer
= DecodeFixed32(data_
+ size_
- sizeof(uint32_t));
806 uint32_t num_restarts
= block_footer
;
807 BlockBasedTableOptions::DataBlockIndexType index_type
;
808 UnPackIndexTypeAndNumRestarts(block_footer
, &index_type
, &num_restarts
);
813 // This sync point can be re-enabled if RocksDB can control the
814 // initialization order of any/all static options created by the user.
815 // TEST_SYNC_POINT("Block::~Block");
818 Block::Block(BlockContents
&& contents
, SequenceNumber _global_seqno
,
819 size_t read_amp_bytes_per_bit
, Statistics
* statistics
)
820 : contents_(std::move(contents
)),
821 data_(contents_
.data
.data()),
822 size_(contents_
.data
.size()),
825 global_seqno_(_global_seqno
) {
826 TEST_SYNC_POINT("Block::Block:0");
827 if (size_
< sizeof(uint32_t)) {
828 size_
= 0; // Error marker
830 // Should only decode restart points for uncompressed blocks
831 num_restarts_
= NumRestarts();
832 switch (IndexType()) {
833 case BlockBasedTableOptions::kDataBlockBinarySearch
:
834 restart_offset_
= static_cast<uint32_t>(size_
) -
835 (1 + num_restarts_
) * sizeof(uint32_t);
836 if (restart_offset_
> size_
- sizeof(uint32_t)) {
837 // The size is too small for NumRestarts() and therefore
838 // restart_offset_ wrapped around.
842 case BlockBasedTableOptions::kDataBlockBinaryAndHash
:
843 if (size_
< sizeof(uint32_t) /* block footer */ +
844 sizeof(uint16_t) /* NUM_BUCK */) {
850 data_block_hash_index_
.Initialize(
851 contents
.data
.data(),
852 static_cast<uint16_t>(contents
.data
.size() -
853 sizeof(uint32_t)), /*chop off
857 restart_offset_
= map_offset
- num_restarts_
* sizeof(uint32_t);
859 if (restart_offset_
> map_offset
) {
860 // map_offset is too small for NumRestarts() and
861 // therefore restart_offset_ wrapped around.
867 size_
= 0; // Error marker
870 if (read_amp_bytes_per_bit
!= 0 && statistics
&& size_
!= 0) {
871 read_amp_bitmap_
.reset(new BlockReadAmpBitmap(
872 restart_offset_
, read_amp_bytes_per_bit
, statistics
));
877 DataBlockIter
* Block::NewIterator(const Comparator
* cmp
, const Comparator
* ucmp
,
878 DataBlockIter
* iter
, Statistics
* stats
,
879 bool /*total_order_seek*/,
880 bool /*key_includes_seq*/,
881 bool /*value_is_full*/,
882 bool block_contents_pinned
,
883 BlockPrefixIndex
* /*prefix_index*/) {
884 DataBlockIter
* ret_iter
;
885 if (iter
!= nullptr) {
888 ret_iter
= new DataBlockIter
;
890 if (size_
< 2 * sizeof(uint32_t)) {
891 ret_iter
->Invalidate(Status::Corruption("bad block contents"));
894 if (num_restarts_
== 0) {
896 ret_iter
->Invalidate(Status::OK());
899 ret_iter
->Initialize(
900 cmp
, ucmp
, data_
, restart_offset_
, num_restarts_
, global_seqno_
,
901 read_amp_bitmap_
.get(), block_contents_pinned
,
902 data_block_hash_index_
.Valid() ? &data_block_hash_index_
: nullptr);
903 if (read_amp_bitmap_
) {
904 if (read_amp_bitmap_
->GetStatistics() != stats
) {
905 // DB changed the Statistics pointer, we need to notify read_amp_bitmap_
906 read_amp_bitmap_
->SetStatistics(stats
);
915 IndexBlockIter
* Block::NewIterator(const Comparator
* cmp
,
916 const Comparator
* ucmp
, IndexBlockIter
* iter
,
917 Statistics
* /*stats*/, bool total_order_seek
,
918 bool key_includes_seq
, bool value_is_full
,
919 bool block_contents_pinned
,
920 BlockPrefixIndex
* prefix_index
) {
921 IndexBlockIter
* ret_iter
;
922 if (iter
!= nullptr) {
925 ret_iter
= new IndexBlockIter
;
927 if (size_
< 2 * sizeof(uint32_t)) {
928 ret_iter
->Invalidate(Status::Corruption("bad block contents"));
931 if (num_restarts_
== 0) {
933 ret_iter
->Invalidate(Status::OK());
936 BlockPrefixIndex
* prefix_index_ptr
=
937 total_order_seek
? nullptr : prefix_index
;
938 ret_iter
->Initialize(cmp
, ucmp
, data_
, restart_offset_
, num_restarts_
,
939 prefix_index_ptr
, key_includes_seq
, value_is_full
,
940 block_contents_pinned
,
941 nullptr /* data_block_hash_index */);
947 size_t Block::ApproximateMemoryUsage() const {
948 size_t usage
= usable_size();
949 #ifdef ROCKSDB_MALLOC_USABLE_SIZE
950 usage
+= malloc_usable_size((void*)this);
952 usage
+= sizeof(*this);
953 #endif // ROCKSDB_MALLOC_USABLE_SIZE
954 if (read_amp_bitmap_
) {
955 usage
+= read_amp_bitmap_
->ApproximateMemoryUsage();
960 } // namespace rocksdb