// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Decodes the blocks generated by block_builder.cc.

#include "table/block_based/block.h"

#include <unordered_map>

#include "monitoring/perf_context_imp.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/comparator.h"
#include "table/block_based/block_prefix_index.h"
#include "table/block_based/data_block_footer.h"
#include "table/format.h"
#include "util/coding.h"
namespace ROCKSDB_NAMESPACE {
// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
// "*value_length", respectively.  Will not dereference past "limit".
//
// If any errors are detected, returns nullptr.  Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
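//
// A worked example (hypothetical bytes, not from any particular block): an
// entry with shared=2, non_shared=3, value_length=4 starts with the header
// bytes {0x02, 0x03, 0x04} (fast path: each length < 128 fits in one byte),
// followed by 3 key-delta bytes and 4 value bytes. A length that does not
// fit in one byte, e.g. non_shared=300, falls back to varint32 encoding,
// i.e. the two bytes {0xAC, 0x02}.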
struct DecodeEntry {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared,
                                uint32_t* value_length) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    assert(limit - p >= 3);
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
    if ((*shared | *non_shared | *value_length) < 128) {
      // Fast path: all three values are encoded in one byte each
      p += 3;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
        return nullptr;
      }
    }

    // Using an assert in place of "return null" since we should not pay the
    // cost of checking for corruption on every single key decoding
    assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)));
    return p;
  }
};
// Helper routine: similar to DecodeEntry but does not have assertions.
// Instead, returns nullptr so that caller can detect and report failure.
struct CheckAndDecodeEntry {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared,
                                uint32_t* value_length) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    if (limit - p < 3) {
      return nullptr;
    }
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
    if ((*shared | *non_shared | *value_length) < 128) {
      // Fast path: all three values are encoded in one byte each
      p += 3;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
        return nullptr;
      }
    }

    if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
      return nullptr;
    }
    return p;
  }
};
struct DecodeKey {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared) {
    uint32_t value_length;
    return DecodeEntry()(p, limit, shared, non_shared, &value_length);
  }
};
// In format_version 4, which is used by index blocks, the value size is not
// encoded before the entry, as the value is known to be the handle with the
// known size.
struct DecodeKeyV4 {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    if (limit - p < 3) return nullptr;
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    if ((*shared | *non_shared) < 128) {
      // Fast path: both values are encoded in one byte each
      p += 2;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
    }
    return p;
  }
};
void DataBlockIter::NextImpl() { ParseNextDataKey<DecodeEntry>(); }
void DataBlockIter::NextOrReportImpl() {
  ParseNextDataKey<CheckAndDecodeEntry>();
}
void IndexBlockIter::NextImpl() { ParseNextIndexKey(); }
void IndexBlockIter::PrevImpl() {
  assert(Valid());
  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }
  SeekToRestartPoint(restart_index_);
  // Loop until end of current entry hits the start of original entry
  while (ParseNextIndexKey() && NextEntryOffset() < original) {
  }
}
// Similar to IndexBlockIter::PrevImpl but also caches the prev entries
void DataBlockIter::PrevImpl() {
  assert(Valid());

  assert(prev_entries_idx_ == -1 ||
         static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
  // Check if we can use cached prev_entries_
  if (prev_entries_idx_ > 0 &&
      prev_entries_[prev_entries_idx_].offset == current_) {
    // Read cached CachedPrevEntry
    prev_entries_idx_--;
    const CachedPrevEntry& current_prev_entry =
        prev_entries_[prev_entries_idx_];

    const char* key_ptr = nullptr;
    bool raw_key_cached;
    if (current_prev_entry.key_ptr != nullptr) {
      // The key is not delta encoded and stored in the data block
      key_ptr = current_prev_entry.key_ptr;
      raw_key_cached = false;
    } else {
      // The key is delta encoded and stored in prev_entries_keys_buff_
      key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset;
      raw_key_cached = true;
    }
    const Slice current_key(key_ptr, current_prev_entry.key_size);

    current_ = current_prev_entry.offset;
    // TODO(ajkr): the copy when `raw_key_cached` is done here for convenience,
    // not necessity. It is convenient since this class treats keys as pinned
    // when `raw_key_` points to an outside buffer. So we cannot allow
    // `raw_key_` to point into the Prev cache as it is a transient outside
    // buffer (i.e., keys in it are not actually pinned).
    raw_key_.SetKey(current_key, raw_key_cached /* copy */);
    value_ = current_prev_entry.value;

    return;
  }

  // Clear prev entries cache
  prev_entries_idx_ = -1;
  prev_entries_.clear();
  prev_entries_keys_buff_.clear();

  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }

  SeekToRestartPoint(restart_index_);

  do {
    if (!ParseNextDataKey<DecodeEntry>()) {
      break;
    }
    Slice current_key = raw_key_.GetKey();

    if (raw_key_.IsKeyPinned()) {
      // The key is not delta encoded
      prev_entries_.emplace_back(current_, current_key.data(), 0,
                                 current_key.size(), value());
    } else {
      // The key is delta encoded, cache decoded key in buffer
      size_t new_key_offset = prev_entries_keys_buff_.size();
      prev_entries_keys_buff_.append(current_key.data(), current_key.size());

      prev_entries_.emplace_back(current_, nullptr, new_key_offset,
                                 current_key.size(), value());
    }
    // Loop until end of current entry hits the start of original entry
  } while (NextEntryOffset() < original);
  prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
}
void DataBlockIter::SeekImpl(const Slice& target) {
  Slice seek_key = target;
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool skip_linear_scan = false;
  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);

  if (!ok) {
    return;
  }
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
}
// Optimized Seek for point lookup for an internal key `target`
// target = "seek_user_key @ type | seqno".
//
// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion,
// or kTypeBlobIndex, this function behaves identically to Seek().
//
// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion,
// or kTypeBlobIndex:
//
// If the return value is FALSE, iter location is undefined, and it means:
// 1) there is no key in this block falling into the range:
//    ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"],
//    inclusive; AND
// 2) the last key of this block has a greater user_key than seek_user_key
//
// If the return value is TRUE, iter location has two possibilities:
// 1) If iter is valid, it is set to a location as if set by BinarySeek. In
//    this case, it points to the first key with a larger user_key or a
//    matching user_key with a seqno no greater than the seeking seqno.
// 2) If the iter is invalid, it means that either all the user_keys are less
//    than the seek_user_key, or the block ends with a matching user_key but
//    with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno
//    but larger type).
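//
// A hypothetical example of the TRUE/valid case: for
// target = "foo @ kTypeValue | 100" and a block containing
// [bar@120, foo@110, foo@90], the function returns TRUE with a valid iter
// positioned at foo@90, the first "foo" entry whose seqno is no greater
// than 100.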
bool DataBlockIter::SeekForGetImpl(const Slice& target) {
  Slice target_user_key = ExtractUserKey(target);
  uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t);
  uint8_t entry =
      data_block_hash_index_->Lookup(data_, map_offset, target_user_key);

  if (entry == kCollision) {
    // HashSeek not effective, falling back
    SeekImpl(target);
    return true;
  }

  if (entry == kNoEntry) {
    // Even if we cannot find the user_key in this block, the result may
    // exist in the next block. Consider this example:
    //
    // Block N:    [aab@100, ... , app@120]
    // boundary key: axy@50 (we make minimal assumptions about a boundary key)
    // Block N+1:  [axy@10, ... ]
    //
    // If seek_key = axy@60, the search will start from Block N.
    // Even if the user_key is not found in the hash map, the caller still
    // has to continue searching the next block.
    //
    // In this case, we pretend the key is in the last restart interval.
    // The while-loop below will search the last restart interval for the
    // key. It will stop at the first key that is larger than the seek_key,
    // or at the end of the block if no key is larger.
    entry = static_cast<uint8_t>(num_restarts_ - 1);
  }

  uint32_t restart_index = entry;

  // check if the key is in the restart_interval
  assert(restart_index < num_restarts_);
  SeekToRestartPoint(restart_index);

  const char* limit = nullptr;
  if (restart_index_ + 1 < num_restarts_) {
    limit = data_ + GetRestartPoint(restart_index_ + 1);
  } else {
    limit = data_ + restarts_;
  }

  while (true) {
    // Here we only linear seek the target key inside the restart interval.
    // If a key does not exist inside a restart interval, we avoid
    // further searching the block content across restart interval boundaries.
    //
    // TODO(fwu): check the left and right boundary of the restart interval
    // to avoid linear seeking a target key that is out of range.
    if (!ParseNextDataKey<DecodeEntry>(limit) ||
        CompareCurrentKey(target) >= 0) {
      // we stop at the first potential matching user key.
      break;
    }
  }

  if (current_ == restarts_) {
    // Search reached the end of the block. There are three possibilities:
    // 1) there is only one user_key match in the block (otherwise collision).
    //    the matching user_key resides in the last restart interval, and it
    //    is the last key of the restart interval and of the block as well.
    //    ParseNextDataKey() skipped it as its [ type | seqno ] is smaller.
    //
    // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
    //    AND all existing user_keys in the restart interval are smaller than
    //    seek_user_key.
    //
    // 3) The seek_key is a false positive and happens to be hashed to the
    //    last restart interval, AND all existing user_keys in the restart
    //    interval are smaller than seek_user_key.
    //
    // The result may exist in the next block in each case, so we return true.
    return true;
  }

  if (ucmp().Compare(raw_key_.GetUserKey(), target_user_key) != 0) {
    // the key is not in this block and cannot be in the next block either.
    return false;
  }

  // Here we are conservative and only support a limited set of cases
  ValueType value_type = ExtractValueType(raw_key_.GetInternalKey());
  if (value_type != ValueType::kTypeValue &&
      value_type != ValueType::kTypeDeletion &&
      value_type != ValueType::kTypeSingleDeletion &&
      value_type != ValueType::kTypeBlobIndex) {
    SeekImpl(target);
    return true;
  }

  // Result found, and the iter is correctly set.
  return true;
}
void IndexBlockIter::SeekImpl(const Slice& target) {
  TEST_SYNC_POINT("IndexBlockIter::Seek:0");
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  Slice seek_key = target;
  if (raw_key_.IsUserKey()) {
    seek_key = ExtractUserKey(target);
  }
  status_ = Status::OK();
  uint32_t index = 0;
  bool skip_linear_scan = false;
  bool ok = false;
  if (prefix_index_) {
    bool prefix_may_exist = true;
    ok = PrefixSeek(target, &index, &prefix_may_exist);
    if (!prefix_may_exist) {
      // This lets the caller distinguish between a non-existing prefix and a
      // key that is larger than the last key, both of which set Valid() to
      // false.
      current_ = restarts_;
      status_ = Status::NotFound();
    }
    // restart interval must be one when hash search is enabled so the binary
    // search simply lands at the right place.
    skip_linear_scan = true;
  } else if (value_delta_encoded_) {
    ok = BinarySeek<DecodeKeyV4>(seek_key, &index, &skip_linear_scan);
  } else {
    ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
  }

  if (!ok) {
    return;
  }
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
}
void DataBlockIter::SeekForPrevImpl(const Slice& target) {
  PERF_TIMER_GUARD(block_seek_nanos);
  Slice seek_key = target;
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool skip_linear_scan = false;
  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);

  if (!ok) {
    return;
  }
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);

  if (!Valid()) {
    SeekToLastImpl();
  } else {
    while (Valid() && CompareCurrentKey(seek_key) > 0) {
      PrevImpl();
    }
  }
}
void DataBlockIter::SeekToFirstImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(0);
  ParseNextDataKey<DecodeEntry>();
}

void DataBlockIter::SeekToFirstOrReportImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(0);
  ParseNextDataKey<CheckAndDecodeEntry>();
}
void IndexBlockIter::SeekToFirstImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  status_ = Status::OK();
  SeekToRestartPoint(0);
  ParseNextIndexKey();
}
void DataBlockIter::SeekToLastImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(num_restarts_ - 1);
  while (ParseNextDataKey<DecodeEntry>() && NextEntryOffset() < restarts_) {
    // Keep skipping
  }
}

void IndexBlockIter::SeekToLastImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  status_ = Status::OK();
  SeekToRestartPoint(num_restarts_ - 1);
  while (ParseNextIndexKey() && NextEntryOffset() < restarts_) {
    // Keep skipping
  }
}
template <class TValue>
void BlockIter<TValue>::CorruptionError() {
  current_ = restarts_;
  restart_index_ = num_restarts_;
  status_ = Status::Corruption("bad entry in block");
  raw_key_.Clear();
  value_.clear();
}
template <typename DecodeEntryFunc>
bool DataBlockIter::ParseNextDataKey(const char* limit) {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  if (!limit) {
    limit = data_ + restarts_;  // Restarts come right after data
  }

  if (p >= limit) {
    // No more entries to return.  Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }

  // Decode next entry
  uint32_t shared, non_shared, value_length;
  p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length);
  if (p == nullptr || raw_key_.Size() < shared) {
    CorruptionError();
    return false;
  } else {
    if (shared == 0) {
      // If this key doesn't share any bytes with prev key then we don't need
      // to decode it and can use its address in the block directly.
      raw_key_.SetKey(Slice(p, non_shared), false /* copy */);
    } else {
      // This key shares `shared` bytes with prev key, we need to decode it
      raw_key_.TrimAppend(shared, p, non_shared);
    }

    if (global_seqno_ != kDisableGlobalSequenceNumber) {
      // If we are reading a file with a global sequence number we should
      // expect that all encoded sequence numbers are zeros and any value
      // type is kTypeValue, kTypeMerge, kTypeDeletion, or kTypeRangeDeletion.
      uint64_t packed = ExtractInternalKeyFooter(raw_key_.GetKey());
      SequenceNumber seqno;
      ValueType value_type;
      UnPackSequenceAndType(packed, &seqno, &value_type);
      assert(value_type == ValueType::kTypeValue ||
             value_type == ValueType::kTypeMerge ||
             value_type == ValueType::kTypeDeletion ||
             value_type == ValueType::kTypeRangeDeletion);
      assert(seqno == 0);
    }

    value_ = Slice(p + non_shared, value_length);
    if (shared == 0) {
      while (restart_index_ + 1 < num_restarts_ &&
             GetRestartPoint(restart_index_ + 1) < current_) {
        ++restart_index_;
      }
    }
    // else we are in the middle of a restart interval and the restart_index_
    // thus has not changed
    return true;
  }
}
bool IndexBlockIter::ParseNextIndexKey() {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  const char* limit = data_ + restarts_;  // Restarts come right after data
  if (p >= limit) {
    // No more entries to return.  Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }

  // Decode next entry
  uint32_t shared, non_shared, value_length;
  if (value_delta_encoded_) {
    p = DecodeKeyV4()(p, limit, &shared, &non_shared);
    value_length = 0;
  } else {
    p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length);
  }
  if (p == nullptr || raw_key_.Size() < shared) {
    CorruptionError();
    return false;
  }
  if (shared == 0) {
    // If this key doesn't share any bytes with prev key then we don't need
    // to decode it and can use its address in the block directly.
    raw_key_.SetKey(Slice(p, non_shared), false /* copy */);
  } else {
    // This key shares `shared` bytes with prev key, we need to decode it
    raw_key_.TrimAppend(shared, p, non_shared);
  }
  value_ = Slice(p + non_shared, value_length);
  if (shared == 0) {
    while (restart_index_ + 1 < num_restarts_ &&
           GetRestartPoint(restart_index_ + 1) < current_) {
      ++restart_index_;
    }
  }
  // else we are in the middle of a restart interval and the restart_index_
  // thus has not changed
  if (value_delta_encoded_ || global_seqno_state_ != nullptr) {
    DecodeCurrentValue(shared);
  }
  return true;
}
// The format:
// restart_point   0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// restart_point   1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// ...
// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// where k is key, v is value, and its encoding is in parentheses.
// The format of each key is (shared_size, non_shared_size, shared, non_shared)
// The format of each value, i.e., block handle, is (offset, size) whenever the
// shared_size is 0, which includes the first entry in each restart point.
// Otherwise the format is delta-size = block handle size - size of last block
// handle.
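//
// A hypothetical illustration: if the previous entry's handle is
// (offset=0, size=4096), a following delta-encoded entry stores only
// delta-size = its handle's size - 4096. The offset is implied, since
// consecutive data blocks are laid out back to back (separated by the fixed
// block trailer), so it can be recomputed from the previous handle.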
void IndexBlockIter::DecodeCurrentValue(uint32_t shared) {
  Slice v(value_.data(), data_ + restarts_ - value_.data());
  // Delta encoding is used if `shared` != 0.
  Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom(
      &v, have_first_key_,
      (value_delta_encoded_ && shared) ? &decoded_value_.handle : nullptr);
  assert(decode_s.ok());
  value_ = Slice(value_.data(), v.data() - value_.data());

  if (global_seqno_state_ != nullptr) {
    // Overwrite sequence number the same way as in DataBlockIter.

    IterKey& first_internal_key = global_seqno_state_->first_internal_key;
    first_internal_key.SetInternalKey(decoded_value_.first_internal_key,
                                      /* copy */ true);

    assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0);

    ValueType value_type = ExtractValueType(first_internal_key.GetKey());
    assert(value_type == ValueType::kTypeValue ||
           value_type == ValueType::kTypeMerge ||
           value_type == ValueType::kTypeDeletion ||
           value_type == ValueType::kTypeRangeDeletion);

    first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno,
                                         value_type);
    decoded_value_.first_internal_key = first_internal_key.GetKey();
  }
}
template <class TValue>
void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
                                               uint32_t index,
                                               bool skip_linear_scan) {
  // SeekToRestartPoint() only does the lookup in the restart block. We need
  // to follow it up with NextImpl() to position the iterator at the restart
  // key.
  SeekToRestartPoint(index);
  NextImpl();

  if (!skip_linear_scan) {
    // Linear search (within restart block) for first key >= target
    uint32_t max_offset;
    if (index + 1 < num_restarts_) {
      // We are in a non-last restart interval. Since `BinarySeek()` guarantees
      // the next restart key is strictly greater than `target`, we can
      // terminate upon reaching it without any additional key comparison.
      max_offset = GetRestartPoint(index + 1);
    } else {
      // We are in the last restart interval. The while-loop will terminate by
      // `Valid()` returning false upon advancing past the block's last key.
      max_offset = port::kMaxUint32;
    }
    while (true) {
      NextImpl();
      if (!Valid()) {
        break;
      }
      if (current_ == max_offset) {
        assert(CompareCurrentKey(target) > 0);
        break;
      } else if (CompareCurrentKey(target) >= 0) {
        break;
      }
    }
  }
}
// Binary searches in restart array to find the starting restart point for the
// linear scan, and stores it in `*index`. Assumes restart array does not
// contain duplicate keys. It is guaranteed that the restart key at `*index + 1`
// is strictly greater than `target` or does not exist (this can be used to
// elide a comparison when linear scan reaches all the way to the next restart
// key). Furthermore, `*skip_linear_scan` is set to indicate whether the
// `*index`th restart key is the final result so that key does not need to be
// compared again later.
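//
// For example (hypothetical restart keys): with restart keys ["b", "d", "f"]
// and target "e", the search settles on `*index` = 1 ("d"); the restart key
// at `*index + 1` ("f") is strictly greater than "e", and `*skip_linear_scan`
// is left false because "d" != "e", so the caller still linearly scans
// forward from "d".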
template <class TValue>
template <typename DecodeKeyFunc>
bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t* index,
                                   bool* skip_linear_scan) {
  if (restarts_ == 0) {
    // SST files dedicated to range tombstones are written with index blocks
    // that have no keys while also having `num_restarts_ == 1`. This would
    // cause a problem for `BinarySeek()` as it'd try to access the first key
    // which does not exist. We identify such blocks by the offset at which
    // their restarts are stored, and return false to prevent any attempted
    // key accesses.
    return false;
  }

  *skip_linear_scan = false;
  // Loop invariants:
  // - Restart key at index `left` is less than or equal to the target key. The
  //   sentinel index `-1` is considered to have a key that is less than all
  //   keys.
  // - Any restart keys after index `right` are strictly greater than the
  //   target key.
  int64_t left = -1, right = num_restarts_ - 1;
  while (left != right) {
    // The `mid` is computed by rounding up so it lands in (`left`, `right`].
    int64_t mid = left + (right - left + 1) / 2;
    uint32_t region_offset = GetRestartPoint(static_cast<uint32_t>(mid));
    uint32_t shared, non_shared;
    const char* key_ptr = DecodeKeyFunc()(
        data_ + region_offset, data_ + restarts_, &shared, &non_shared);
    if (key_ptr == nullptr || (shared != 0)) {
      CorruptionError();
      return false;
    }
    Slice mid_key(key_ptr, non_shared);
    raw_key_.SetKey(mid_key, false /* copy */);
    int cmp = CompareCurrentKey(target);
    if (cmp < 0) {
      // Key at "mid" is smaller than "target". Therefore all
      // blocks before "mid" are uninteresting.
      left = mid;
    } else if (cmp > 0) {
      // Key at "mid" is larger than "target". Therefore all blocks at or
      // after "mid" are uninteresting.
      right = mid - 1;
    } else {
      *skip_linear_scan = true;
      left = right = mid;
    }
  }

  if (left == -1) {
    // All keys in the block were strictly greater than `target`. So the very
    // first key in the block is the final seek result.
    *skip_linear_scan = true;
    *index = 0;
  } else {
    *index = static_cast<uint32_t>(left);
  }
  return true;
}
// Compare target key and the block key of the block of `block_index`.
// Return 1 on error, so that the target is treated as smaller than the
// block key.
int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
  uint32_t region_offset = GetRestartPoint(block_index);
  uint32_t shared, non_shared;
  const char* key_ptr =
      value_delta_encoded_
          ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared,
                          &non_shared)
          : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared,
                        &non_shared);
  if (key_ptr == nullptr || (shared != 0)) {
    CorruptionError();
    return 1;  // Return target is smaller
  }
  Slice block_key(key_ptr, non_shared);
  raw_key_.SetKey(block_key, false /* copy */);
  return CompareCurrentKey(target);
}
// Binary search in block_ids to find the first block
// with a key >= target
bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
                                          uint32_t* block_ids, uint32_t left,
                                          uint32_t right, uint32_t* index,
                                          bool* prefix_may_exist) {
  assert(left <= right);
  assert(index);
  assert(prefix_may_exist);
  *prefix_may_exist = true;
  uint32_t left_bound = left;

  while (left <= right) {
    uint32_t mid = (right + left) / 2;

    int cmp = CompareBlockKey(block_ids[mid], target);
    if (!status_.ok()) {
      return false;
    }
    if (cmp < 0) {
      // Key at "target" is larger than "mid". Therefore all
      // blocks before or at "mid" are uninteresting.
      left = mid + 1;
    } else {
      // Key at "target" is <= "mid". Therefore all blocks
      // after "mid" are uninteresting.
      // If there is only one block left, we found it.
      if (left == right) break;
      right = mid;
    }
  }

  if (left == right) {
    // In one of the two following cases:
    // (1) left is the first one of block_ids
    // (2) there is a gap of blocks between the blocks of `left` and `left-1`.
    // we can further distinguish the case of the key being in the block
    // versus not existing, by comparing the target key and the key of the
    // previous block to the left of the block found.
    if (block_ids[left] > 0 &&
        (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
        CompareBlockKey(block_ids[left] - 1, target) > 0) {
      current_ = restarts_;
      *prefix_may_exist = false;
      return false;
    }

    *index = block_ids[left];
    return true;
  } else {
    assert(left > right);

    // If the next block key is larger than the seek key, it is possible that
    // no key shares the prefix with `target`, or that all keys with the same
    // prefix as `target` are smaller than `target`. In the latter case,
    // we are mandated to set the position the same as the total order.
    // Then either:
    // (1) `target` falls into the range of the next block. In this case,
    //     we can place the iterator to the next block, or
    // (2) `target` is larger than all block keys. In this case we can
    //     keep the iterator invalidated without setting `prefix_may_exist`
    //     to false.
    // We might sometimes end up with setting the total order position
    // while there is no key sharing the prefix with `target`, but it
    // still follows the contract.
    uint32_t right_index = block_ids[right];
    assert(right_index + 1 <= num_restarts_);
    if (right_index + 1 < num_restarts_) {
      if (CompareBlockKey(right_index + 1, target) >= 0) {
        *index = right_index + 1;
        return true;
      } else {
        // We have to set the flag here because we are not positioning
        // the iterator to the total order position.
        *prefix_may_exist = false;
      }
    }

    // Mark iterator invalid
    current_ = restarts_;
    return false;
  }
}
bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index,
                                bool* prefix_may_exist) {
  assert(index);
  assert(prefix_may_exist);
  assert(prefix_index_);
  *prefix_may_exist = true;
  Slice seek_key = target;
  if (raw_key_.IsUserKey()) {
    seek_key = ExtractUserKey(target);
  }
  uint32_t* block_ids = nullptr;
  uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);

  if (num_blocks == 0) {
    current_ = restarts_;
    *prefix_may_exist = false;
    return false;
  } else {
    assert(block_ids);
    return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index,
                                prefix_may_exist);
  }
}
uint32_t Block::NumRestarts() const {
  assert(size_ >= 2 * sizeof(uint32_t));
  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
  uint32_t num_restarts = block_footer;
  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
    // In BlockBuilder, we have ensured a block with HashIndex is less than
    // kMaxBlockSizeSupportedByHashIndex (64KiB).
    //
    // Therefore, if we encounter a block with a size > 64KiB, the block
    // cannot have HashIndex. So the footer will directly be interpreted as
    // num_restarts.
    //
    // Such a check is for backward compatibility. It ensures a legacy block
    // with a very large num_restarts, i.e. >= 0x80000000, is interpreted
    // correctly as having no HashIndex even if the MSB of num_restarts is set.
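    //
    // For illustration of the footer packing (hypothetical values): on a
    // block small enough to carry a hash index, a footer of 0x80000002 would
    // unpack as kDataBlockBinaryAndHash with 2 restarts, while 0x00000002
    // would unpack as kDataBlockBinarySearch with 2 restarts.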
    return num_restarts;
  }
  BlockBasedTableOptions::DataBlockIndexType index_type;
  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
  return num_restarts;
}
BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const {
  assert(size_ >= 2 * sizeof(uint32_t));
  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
    // The check is for the same reason as that in NumRestarts()
    return BlockBasedTableOptions::kDataBlockBinarySearch;
  }
  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
  uint32_t num_restarts = block_footer;
  BlockBasedTableOptions::DataBlockIndexType index_type;
  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
  return index_type;
}
Block::~Block() {
  // This sync point can be re-enabled if RocksDB can control the
  // initialization order of any/all static options created by the user.
  // TEST_SYNC_POINT("Block::~Block");
}
Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit,
             Statistics* statistics)
    : contents_(std::move(contents)),
      data_(contents_.data.data()),
      size_(contents_.data.size()),
      restart_offset_(0),
      num_restarts_(0) {
  TEST_SYNC_POINT("Block::Block:0");
  if (size_ < sizeof(uint32_t)) {
    size_ = 0;  // Error marker
  } else {
    // Should only decode restart points for uncompressed blocks
    num_restarts_ = NumRestarts();
    switch (IndexType()) {
      case BlockBasedTableOptions::kDataBlockBinarySearch:
        restart_offset_ = static_cast<uint32_t>(size_) -
                          (1 + num_restarts_) * sizeof(uint32_t);
        if (restart_offset_ > size_ - sizeof(uint32_t)) {
          // The size is too small for NumRestarts() and therefore
          // restart_offset_ wrapped around.
          size_ = 0;
        }
        break;
      case BlockBasedTableOptions::kDataBlockBinaryAndHash:
        if (size_ < sizeof(uint32_t) /* block footer */ +
                        sizeof(uint16_t) /* NUM_BUCK */) {
          size_ = 0;
          break;
        }

        uint16_t map_offset;
        data_block_hash_index_.Initialize(
            contents_.data.data(),
            static_cast<uint16_t>(contents_.data.size() -
                                  sizeof(uint32_t)), /*chop off
                                                 NUM_RESTARTS*/
            &map_offset);

        restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t);

        if (restart_offset_ > map_offset) {
          // map_offset is too small for NumRestarts() and
          // therefore restart_offset_ wrapped around.
          size_ = 0;
        }
        break;
      default:
        size_ = 0;  // Error marker
    }
  }
  if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) {
    read_amp_bitmap_.reset(new BlockReadAmpBitmap(
        restart_offset_, read_amp_bytes_per_bit, statistics));
  }
}
DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp,
                                      SequenceNumber global_seqno,
                                      DataBlockIter* iter, Statistics* stats,
                                      bool block_contents_pinned) {
  DataBlockIter* ret_iter;
  if (iter != nullptr) {
    ret_iter = iter;
  } else {
    ret_iter = new DataBlockIter;
  }
  if (size_ < 2 * sizeof(uint32_t)) {
    ret_iter->Invalidate(Status::Corruption("bad block contents"));
    return ret_iter;
  }
  if (num_restarts_ == 0) {
    // Empty block.
    ret_iter->Invalidate(Status::OK());
    return ret_iter;
  } else {
    ret_iter->Initialize(
        raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno,
        read_amp_bitmap_.get(), block_contents_pinned,
        data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr);
    if (read_amp_bitmap_) {
      if (read_amp_bitmap_->GetStatistics() != stats) {
        // DB changed the Statistics pointer, we need to notify read_amp_bitmap_
        read_amp_bitmap_->SetStatistics(stats);
      }
    }
  }

  return ret_iter;
}
IndexBlockIter* Block::NewIndexIterator(
    const Comparator* raw_ucmp, SequenceNumber global_seqno,
    IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek,
    bool have_first_key, bool key_includes_seq, bool value_is_full,
    bool block_contents_pinned, BlockPrefixIndex* prefix_index) {
  IndexBlockIter* ret_iter;
  if (iter != nullptr) {
    ret_iter = iter;
  } else {
    ret_iter = new IndexBlockIter;
  }
  if (size_ < 2 * sizeof(uint32_t)) {
    ret_iter->Invalidate(Status::Corruption("bad block contents"));
    return ret_iter;
  }
  if (num_restarts_ == 0) {
    // Empty block.
    ret_iter->Invalidate(Status::OK());
    return ret_iter;
  } else {
    BlockPrefixIndex* prefix_index_ptr =
        total_order_seek ? nullptr : prefix_index;
    ret_iter->Initialize(raw_ucmp, data_, restart_offset_, num_restarts_,
                         global_seqno, prefix_index_ptr, have_first_key,
                         key_includes_seq, value_is_full,
                         block_contents_pinned);
  }

  return ret_iter;
}
size_t Block::ApproximateMemoryUsage() const {
  size_t usage = usable_size();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
  usage += malloc_usable_size((void*)this);
#else
  usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
  if (read_amp_bitmap_) {
    usage += read_amp_bitmap_->ApproximateMemoryUsage();
  }
  return usage;
}
}  // namespace ROCKSDB_NAMESPACE