// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Decodes the blocks generated by block_builder.cc.

#include "table/block_based/block.h"

#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>

#include "monitoring/perf_context_imp.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/comparator.h"
#include "table/block_based/block_prefix_index.h"
#include "table/block_based/data_block_footer.h"
#include "table/format.h"
#include "util/coding.h"

namespace ROCKSDB_NAMESPACE {

// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
// "*value_length", respectively. Will not dereference past "limit".
//
// If any errors are detected, returns nullptr. Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
struct DecodeEntry {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared,
                                uint32_t* value_length) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    assert(limit - p >= 3);
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
    if ((*shared | *non_shared | *value_length) < 128) {
      // Fast path: all three values are encoded in one byte each
      p += 3;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
        return nullptr;
      }
    }

    // Using an assert in place of "return null" since we should not pay the
    // cost of checking for corruption on every single key decoding
    assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)));
    return p;
  }
};
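
// For example, an entry whose three lengths are all below 128 -- say
// shared = 3, non_shared = 2, value_length = 5 -- takes the fast path above
// and occupies exactly 3 + 2 + 5 bytes: the single-byte varints 0x03 0x02
// 0x05, then the 2 key-delta bytes, then the 5 value bytes. A caller decodes
// it along these lines:
//
//   uint32_t shared, non_shared, value_length;
//   const char* delta =
//       DecodeEntry()(p, limit, &shared, &non_shared, &value_length);
//   // key = first `shared` bytes of the previous key, then the `non_shared`
//   // delta bytes starting at `delta`; the value follows the key delta.
//   Slice value(delta + non_shared, value_length);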

// Helper routine: similar to DecodeEntry but does not have assertions.
// Instead, returns nullptr so that caller can detect and report failure.
struct CheckAndDecodeEntry {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared,
                                uint32_t* value_length) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    if (limit - p < 3) {
      return nullptr;
    }
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
    if ((*shared | *non_shared | *value_length) < 128) {
      // Fast path: all three values are encoded in one byte each
      p += 3;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
        return nullptr;
      }
    }

    if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
      return nullptr;
    }
    return p;
  }
};

struct DecodeKey {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared) {
    uint32_t value_length;
    return DecodeEntry()(p, limit, shared, non_shared, &value_length);
  }
};

// In format_version 4, which is used by index blocks, the value size is not
// encoded before the entry, as the value is known to be the handle with the
// known size.
struct DecodeKeyV4 {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    if (limit - p < 3) return nullptr;
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    if ((*shared | *non_shared) < 128) {
      // Fast path: both values are encoded in one byte each
      p += 2;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
    }
    return p;
  }
};
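
// For example, an index entry with shared = 0 and non_shared = 9 is encoded
// as the two single-byte varints 0x00 0x09 followed by the 9 key bytes; the
// block-handle value follows immediately, with no value-size prefix
// (ParseNextIndexKey() recovers it via DecodeCurrentValue() below).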

void DataBlockIter::NextImpl() { ParseNextDataKey<DecodeEntry>(); }

void DataBlockIter::NextOrReportImpl() {
  ParseNextDataKey<CheckAndDecodeEntry>();
}

void IndexBlockIter::NextImpl() { ParseNextIndexKey(); }

void IndexBlockIter::PrevImpl() {
  assert(Valid());
  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }
  SeekToRestartPoint(restart_index_);
  // Loop until end of current entry hits the start of original entry
  while (ParseNextIndexKey() && NextEntryOffset() < original) {
  }
}

// Similar to IndexBlockIter::PrevImpl but also caches the prev entries
void DataBlockIter::PrevImpl() {
  assert(Valid());

  assert(prev_entries_idx_ == -1 ||
         static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
  // Check if we can use cached prev_entries_
  if (prev_entries_idx_ > 0 &&
      prev_entries_[prev_entries_idx_].offset == current_) {
    // Read cached CachedPrevEntry
    prev_entries_idx_--;
    const CachedPrevEntry& current_prev_entry =
        prev_entries_[prev_entries_idx_];

    const char* key_ptr = nullptr;
    bool raw_key_cached;
    if (current_prev_entry.key_ptr != nullptr) {
      // The key is not delta encoded and stored in the data block
      key_ptr = current_prev_entry.key_ptr;
      raw_key_cached = false;
    } else {
      // The key is delta encoded and stored in prev_entries_keys_buff_
      key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset;
      raw_key_cached = true;
    }
    const Slice current_key(key_ptr, current_prev_entry.key_size);

    current_ = current_prev_entry.offset;
    // TODO(ajkr): the copy done when `raw_key_cached` is true is for
    // convenience, not necessity. It is convenient since this class treats
    // keys as pinned when `raw_key_` points to an outside buffer. So we
    // cannot let `raw_key_` point into the Prev cache as it is a transient
    // outside buffer (i.e., keys in it are not actually pinned).
    raw_key_.SetKey(current_key, raw_key_cached /* copy */);
    value_ = current_prev_entry.value;

    return;
  }

  // Clear prev entries cache
  prev_entries_idx_ = -1;
  prev_entries_.clear();
  prev_entries_keys_buff_.clear();

  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }

  SeekToRestartPoint(restart_index_);

  do {
    if (!ParseNextDataKey<DecodeEntry>()) {
      break;
    }
    Slice current_key = raw_key_.GetKey();

    if (raw_key_.IsKeyPinned()) {
      // The key is not delta encoded
      prev_entries_.emplace_back(current_, current_key.data(), 0,
                                 current_key.size(), value());
    } else {
      // The key is delta encoded, cache decoded key in buffer
      size_t new_key_offset = prev_entries_keys_buff_.size();
      prev_entries_keys_buff_.append(current_key.data(), current_key.size());

      prev_entries_.emplace_back(current_, nullptr, new_key_offset,
                                 current_key.size(), value());
    }
    // Loop until end of current entry hits the start of original entry
  } while (NextEntryOffset() < original);
  prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
}

void DataBlockIter::SeekImpl(const Slice& target) {
  Slice seek_key = target;
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool skip_linear_scan = false;
  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);

  if (!ok) {
    return;
  }
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
}

// Optimized Seek for point lookup for an internal key `target`
// target = "seek_user_key @ type | seqno".
//
// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion,
// or kTypeBlobIndex, this function behaves identically to Seek().
//
// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion,
// or kTypeBlobIndex:
//
// If the return value is FALSE, iter location is undefined, and it means:
// 1) there is no key in this block falling into the range:
//    ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"],
//    inclusive; AND
// 2) the last key of this block has a greater user_key than seek_user_key
//
// If the return value is TRUE, iter location has two possibilities:
// 1) If iter is valid, it is set to a location as if set by BinarySeek. In
//    this case, it points to the first key with a larger user_key or a
//    matching user_key with a seqno no greater than the seeking seqno.
// 2) If the iter is invalid, it means that either all user_keys are less
//    than the seek_user_key, or the block ends with a matching user_key but
//    with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno
//    but larger type).
bool DataBlockIter::SeekForGetImpl(const Slice& target) {
  Slice target_user_key = ExtractUserKey(target);
  uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t);
  uint8_t entry =
      data_block_hash_index_->Lookup(data_, map_offset, target_user_key);

  if (entry == kCollision) {
    // HashSeek not effective, falling back
    SeekImpl(target);
    return true;
  }

  if (entry == kNoEntry) {
    // Even if we cannot find the user_key in this block, the result may
    // exist in the next block. Consider this example:
    //
    // Block N:      [aab@100, ... , app@120]
    // boundary key: axy@50 (we make minimal assumption about a boundary key)
    // Block N+1:    [axy@10, ... ]
    //
    // If seek_key = axy@60, the search will start from Block N.
    // Even if the user_key is not found in the hash map, the caller still
    // has to continue searching the next block.
    //
    // In this case, we pretend the key is in the last restart interval.
    // The while-loop below will search the last restart interval for the
    // key. It will stop at the first key that is larger than the seek_key,
    // or at the end of the block if none is larger.
    entry = static_cast<uint8_t>(num_restarts_ - 1);
  }

  uint32_t restart_index = entry;

  // check if the key is in the restart_interval
  assert(restart_index < num_restarts_);
  SeekToRestartPoint(restart_index);

  const char* limit = nullptr;
  if (restart_index_ + 1 < num_restarts_) {
    limit = data_ + GetRestartPoint(restart_index_ + 1);
  } else {
    limit = data_ + restarts_;
  }

  while (true) {
    // Here we only linear seek the target key inside the restart interval.
    // If a key does not exist inside a restart interval, we avoid
    // further searching the block content across restart interval boundary.
    //
    // TODO(fwu): check the left and right boundary of the restart interval
    // to avoid linear seek of a target key that is out of range.
    if (!ParseNextDataKey<DecodeEntry>(limit) ||
        CompareCurrentKey(target) >= 0) {
      // we stop at the first potential matching user key.
      break;
    }
  }

  if (current_ == restarts_) {
    // Search reaches the end of the block. There are three possibilities:
    // 1) there is only one user_key match in the block (otherwise collision).
    //    the matching user_key resides in the last restart interval, and it
    //    is the last key of the restart interval and of the block as well.
    //    ParseNextDataKey() skipped it as its [ type | seqno ] is smaller.
    //
    // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
    //    AND all existing user_keys in the restart interval are smaller than
    //    seek_user_key.
    //
    // 3) The seek_key is a false positive and happens to be hashed to the
    //    last restart interval, AND all existing user_keys in the restart
    //    interval are smaller than seek_user_key.
    //
    // The result may exist in the next block in each case, so we return true.
    return true;
  }

  if (ucmp().Compare(raw_key_.GetUserKey(), target_user_key) != 0) {
    // the key is not in this block and cannot be at the next block either.
    return false;
  }

  // Here we are conservative and only support a limited set of cases
  ValueType value_type = ExtractValueType(raw_key_.GetInternalKey());
  if (value_type != ValueType::kTypeValue &&
      value_type != ValueType::kTypeDeletion &&
      value_type != ValueType::kTypeSingleDeletion &&
      value_type != ValueType::kTypeBlobIndex) {
    SeekImpl(target);
    return true;
  }

  // Result found, and the iter is correctly set.
  return true;
}

void IndexBlockIter::SeekImpl(const Slice& target) {
  TEST_SYNC_POINT("IndexBlockIter::Seek:0");
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  Slice seek_key = target;
  if (raw_key_.IsUserKey()) {
    seek_key = ExtractUserKey(target);
  }
  status_ = Status::OK();
  uint32_t index = 0;
  bool skip_linear_scan = false;
  bool ok = false;
  if (prefix_index_) {
    bool prefix_may_exist = true;
    ok = PrefixSeek(target, &index, &prefix_may_exist);
    if (!prefix_may_exist) {
      // This lets the caller distinguish between a non-existing prefix and
      // the key being larger than the last key, both of which set Valid()
      // to false.
      current_ = restarts_;
      status_ = Status::NotFound();
    }
    // The restart interval must be one when hash search is enabled so the
    // binary search simply lands at the right place.
    skip_linear_scan = true;
  } else if (value_delta_encoded_) {
    ok = BinarySeek<DecodeKeyV4>(seek_key, &index, &skip_linear_scan);
  } else {
    ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
  }

  if (!ok) {
    return;
  }
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
}

void DataBlockIter::SeekForPrevImpl(const Slice& target) {
  PERF_TIMER_GUARD(block_seek_nanos);
  Slice seek_key = target;
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool skip_linear_scan = false;
  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);

  if (!ok) {
    return;
  }
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);

  if (!Valid()) {
    SeekToLastImpl();
  } else {
    while (Valid() && CompareCurrentKey(seek_key) > 0) {
      PrevImpl();
    }
  }
}

void DataBlockIter::SeekToFirstImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(0);
  ParseNextDataKey<DecodeEntry>();
}

void DataBlockIter::SeekToFirstOrReportImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(0);
  ParseNextDataKey<CheckAndDecodeEntry>();
}

void IndexBlockIter::SeekToFirstImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  status_ = Status::OK();
  SeekToRestartPoint(0);
  ParseNextIndexKey();
}

void DataBlockIter::SeekToLastImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(num_restarts_ - 1);
  while (ParseNextDataKey<DecodeEntry>() && NextEntryOffset() < restarts_) {
    // Keep skipping
  }
}

void IndexBlockIter::SeekToLastImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  status_ = Status::OK();
  SeekToRestartPoint(num_restarts_ - 1);
  while (ParseNextIndexKey() && NextEntryOffset() < restarts_) {
    // Keep skipping
  }
}

template <class TValue>
void BlockIter<TValue>::CorruptionError() {
  current_ = restarts_;
  restart_index_ = num_restarts_;
  status_ = Status::Corruption("bad entry in block");
  raw_key_.Clear();
  value_.clear();
}

template <typename DecodeEntryFunc>
bool DataBlockIter::ParseNextDataKey(const char* limit) {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  if (!limit) {
    limit = data_ + restarts_;  // Restarts come right after data
  }

  if (p >= limit) {
    // No more entries to return. Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }

  // Decode next entry
  uint32_t shared, non_shared, value_length;
  p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length);
  if (p == nullptr || raw_key_.Size() < shared) {
    CorruptionError();
    return false;
  } else {
    if (shared == 0) {
      // If this key doesn't share any bytes with prev key then we don't need
      // to decode it and can use its address in the block directly.
      raw_key_.SetKey(Slice(p, non_shared), false /* copy */);
    } else {
      // This key shares `shared` bytes with prev key, we need to decode it
      raw_key_.TrimAppend(shared, p, non_shared);
    }

#ifndef NDEBUG
    if (global_seqno_ != kDisableGlobalSequenceNumber) {
      // If we are reading a file with a global sequence number we should
      // expect that all encoded sequence numbers are zeros and any value
      // type is kTypeValue, kTypeMerge, kTypeDeletion, or kTypeRangeDeletion.
      uint64_t packed = ExtractInternalKeyFooter(raw_key_.GetKey());
      SequenceNumber seqno;
      ValueType value_type;
      UnPackSequenceAndType(packed, &seqno, &value_type);
      assert(value_type == ValueType::kTypeValue ||
             value_type == ValueType::kTypeMerge ||
             value_type == ValueType::kTypeDeletion ||
             value_type == ValueType::kTypeRangeDeletion);
      assert(seqno == 0);
    }
#endif  // NDEBUG

    value_ = Slice(p + non_shared, value_length);
    if (shared == 0) {
      while (restart_index_ + 1 < num_restarts_ &&
             GetRestartPoint(restart_index_ + 1) < current_) {
        ++restart_index_;
      }
    }
    // else we are in the middle of a restart interval and the restart_index_
    // thus has not changed
    return true;
  }
}

bool IndexBlockIter::ParseNextIndexKey() {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  const char* limit = data_ + restarts_;  // Restarts come right after data
  if (p >= limit) {
    // No more entries to return. Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }

  // Decode next entry
  uint32_t shared, non_shared, value_length;
  if (value_delta_encoded_) {
    p = DecodeKeyV4()(p, limit, &shared, &non_shared);
    value_length = 0;
  } else {
    p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length);
  }
  if (p == nullptr || raw_key_.Size() < shared) {
    CorruptionError();
    return false;
  }
  if (shared == 0) {
    // If this key doesn't share any bytes with prev key then we don't need
    // to decode it and can use its address in the block directly.
    raw_key_.SetKey(Slice(p, non_shared), false /* copy */);
  } else {
    // This key shares `shared` bytes with prev key, we need to decode it
    raw_key_.TrimAppend(shared, p, non_shared);
  }
  value_ = Slice(p + non_shared, value_length);
  if (shared == 0) {
    while (restart_index_ + 1 < num_restarts_ &&
           GetRestartPoint(restart_index_ + 1) < current_) {
      ++restart_index_;
    }
  }
  // else we are in the middle of a restart interval and the restart_index_
  // thus has not changed
  if (value_delta_encoded_ || global_seqno_state_ != nullptr) {
    DecodeCurrentValue(shared);
  }
  return true;
}

// The format:
// restart_point   0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// restart_point   1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// ...
// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// where k is the key, v is the value, and its encoding is in parentheses.
// The format of each key is (shared_size, non_shared_size, shared, non_shared).
// The format of each value, i.e., block handle, is (offset, size) whenever the
// shared_size is 0, which includes the first entry in each restart point.
// Otherwise the format is delta-size = (size of the current block handle) -
// (size of the last block handle).
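//
// For example, with a previous key "block1" and a current key "block2", the
// two keys share the 5-byte prefix "block", so the current key is stored as
// shared_size = 5, non_shared_size = 1, followed by the single delta byte
// '2'; the reader reconstructs the full key via raw_key_.TrimAppend(5, p, 1).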
void IndexBlockIter::DecodeCurrentValue(uint32_t shared) {
  Slice v(value_.data(), data_ + restarts_ - value_.data());
  // Delta encoding is used if `shared` != 0.
  Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom(
      &v, have_first_key_,
      (value_delta_encoded_ && shared) ? &decoded_value_.handle : nullptr);
  assert(decode_s.ok());
  value_ = Slice(value_.data(), v.data() - value_.data());

  if (global_seqno_state_ != nullptr) {
    // Overwrite sequence number the same way as in DataBlockIter.

    IterKey& first_internal_key = global_seqno_state_->first_internal_key;
    first_internal_key.SetInternalKey(decoded_value_.first_internal_key,
                                      /* copy */ true);

    assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0);

    ValueType value_type = ExtractValueType(first_internal_key.GetKey());
    assert(value_type == ValueType::kTypeValue ||
           value_type == ValueType::kTypeMerge ||
           value_type == ValueType::kTypeDeletion ||
           value_type == ValueType::kTypeRangeDeletion);

    first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno,
                                         value_type);
    decoded_value_.first_internal_key = first_internal_key.GetKey();
  }
}

template <class TValue>
void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
                                               uint32_t index,
                                               bool skip_linear_scan) {
  // SeekToRestartPoint() only does the lookup in the restart block. We need
  // to follow it up with NextImpl() to position the iterator at the restart
  // key.
  SeekToRestartPoint(index);
  NextImpl();

  if (!skip_linear_scan) {
    // Linear search (within restart block) for first key >= target
    uint32_t max_offset;
    if (index + 1 < num_restarts_) {
      // We are in a non-last restart interval. Since `BinarySeek()` guarantees
      // the next restart key is strictly greater than `target`, we can
      // terminate upon reaching it without any additional key comparison.
      max_offset = GetRestartPoint(index + 1);
    } else {
      // We are in the last restart interval. The while-loop will terminate by
      // `Valid()` returning false upon advancing past the block's last key.
      max_offset = port::kMaxUint32;
    }
    while (true) {
      NextImpl();
      if (!Valid()) {
        break;
      }
      if (current_ == max_offset) {
        assert(CompareCurrentKey(target) > 0);
        break;
      } else if (CompareCurrentKey(target) >= 0) {
        break;
      }
    }
  }
}

// Binary searches in restart array to find the starting restart point for the
// linear scan, and stores it in `*index`. Assumes restart array does not
// contain duplicate keys. It is guaranteed that the restart key at `*index + 1`
// is strictly greater than `target` or does not exist (this can be used to
// elide a comparison when linear scan reaches all the way to the next restart
// key). Furthermore, `*skip_linear_scan` is set to indicate whether the
// `*index`th restart key is the final result so that key does not need to be
// compared again later.
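//
// For example, with restart keys ["bar", "dog", "fox"] and target "eel", the
// search settles on *index == 1 ("dog" <= "eel" < "fox") with
// *skip_linear_scan == false; FindKeyAfterBinarySeek() then scans the
// interval starting at "dog" for the first key >= "eel".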
template <class TValue>
template <typename DecodeKeyFunc>
bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t* index,
                                   bool* skip_linear_scan) {
  if (restarts_ == 0) {
    // SST files dedicated to range tombstones are written with index blocks
    // that have no keys while also having `num_restarts_ == 1`. This would
    // cause a problem for `BinarySeek()` as it'd try to access the first key
    // which does not exist. We identify such blocks by the offset at which
    // their restarts are stored, and return false to prevent any attempted
    // key accesses.
    return false;
  }

  *skip_linear_scan = false;
  // Loop invariants:
  // - Restart key at index `left` is less than or equal to the target key. The
  //   sentinel index `-1` is considered to have a key that is less than all
  //   keys.
  // - Any restart keys after index `right` are strictly greater than the
  //   target key.
  int64_t left = -1, right = num_restarts_ - 1;
  while (left != right) {
    // The `mid` is computed by rounding up so it lands in (`left`, `right`].
    int64_t mid = left + (right - left + 1) / 2;
    uint32_t region_offset = GetRestartPoint(static_cast<uint32_t>(mid));
    uint32_t shared, non_shared;
    const char* key_ptr = DecodeKeyFunc()(
        data_ + region_offset, data_ + restarts_, &shared, &non_shared);
    if (key_ptr == nullptr || (shared != 0)) {
      CorruptionError();
      return false;
    }
    Slice mid_key(key_ptr, non_shared);
    raw_key_.SetKey(mid_key, false /* copy */);
    int cmp = CompareCurrentKey(target);
    if (cmp < 0) {
      // Key at "mid" is smaller than "target". Therefore all
      // blocks before "mid" are uninteresting.
      left = mid;
    } else if (cmp > 0) {
      // Key at "mid" is larger than "target". Therefore all blocks at or
      // after "mid" are uninteresting.
      right = mid - 1;
    } else {
      *skip_linear_scan = true;
      left = right = mid;
    }
  }

  if (left == -1) {
    // All keys in the block were strictly greater than `target`. So the very
    // first key in the block is the final seek result.
    *skip_linear_scan = true;
    *index = 0;
  } else {
    *index = static_cast<uint32_t>(left);
  }
  return true;
}

// Compare target key and the block key of the block of `block_index`.
// On error, sets the corruption status and returns 1 (treating the target
// as smaller).
int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
  uint32_t region_offset = GetRestartPoint(block_index);
  uint32_t shared, non_shared;
  const char* key_ptr =
      value_delta_encoded_
          ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared,
                          &non_shared)
          : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared,
                        &non_shared);
  if (key_ptr == nullptr || (shared != 0)) {
    CorruptionError();
    return 1;  // Return target is smaller
  }
  Slice block_key(key_ptr, non_shared);
  raw_key_.SetKey(block_key, false /* copy */);
  return CompareCurrentKey(target);
}

// Binary search in block_ids to find the first block
// with a key >= target
bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
                                          uint32_t* block_ids, uint32_t left,
                                          uint32_t right, uint32_t* index,
                                          bool* prefix_may_exist) {
  assert(left <= right);
  assert(index);
  assert(prefix_may_exist);
  *prefix_may_exist = true;
  uint32_t left_bound = left;

  while (left <= right) {
    uint32_t mid = (right + left) / 2;

    int cmp = CompareBlockKey(block_ids[mid], target);
    if (!status_.ok()) {
      return false;
    }
    if (cmp < 0) {
      // Key at "mid" is smaller than "target". Therefore all
      // blocks before or at "mid" are uninteresting.
      left = mid + 1;
    } else {
      // Key at "mid" is >= "target". Therefore all blocks
      // after "mid" are uninteresting.
      // If there is only one block left, we found it.
      if (left == right) break;
      right = mid;
    }
  }

  if (left == right) {
    // In one of the two following cases:
    // (1) left is the first one of block_ids
    // (2) there is a gap of blocks between the block of `left` and that of
    //     `left-1`.
    // we can further distinguish the case of the key being in the block or
    // not existing, by comparing the target key and the key of the previous
    // block to the left of the block found.
    if (block_ids[left] > 0 &&
        (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
        CompareBlockKey(block_ids[left] - 1, target) > 0) {
      current_ = restarts_;
      *prefix_may_exist = false;
      return false;
    }

    *index = block_ids[left];
    return true;
  } else {
    assert(left > right);

    // If the next block key is larger than the seek key, it is possible that
    // no key shares the prefix with `target`, or that all keys with the same
    // prefix as `target` are smaller than the seek key. In the latter case,
    // we are mandated to set the position the same as the total order.
    // Then either:
    // (1) `target` falls into the range of the next block. In this case,
    //     we can place the iterator to the next block, or
    // (2) `target` is larger than all block keys. In this case we can
    //     keep the iterator invalidated without setting `prefix_may_exist`
    //     to false.
    // We might sometimes end up with setting the total order position
    // while there is no key sharing the prefix as `target`, but it
    // still follows the contract.
    uint32_t right_index = block_ids[right];
    assert(right_index + 1 <= num_restarts_);
    if (right_index + 1 < num_restarts_) {
      if (CompareBlockKey(right_index + 1, target) >= 0) {
        *index = right_index + 1;
        return true;
      } else {
        // We have to set the flag here because we are not positioning
        // the iterator to the total order position.
        *prefix_may_exist = false;
      }
    }

    // Mark iterator invalid
    current_ = restarts_;
    return false;
  }
}

bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index,
                                bool* prefix_may_exist) {
  assert(index);
  assert(prefix_may_exist);
  assert(prefix_index_);
  *prefix_may_exist = true;
  Slice seek_key = target;
  if (raw_key_.IsUserKey()) {
    seek_key = ExtractUserKey(target);
  }
  uint32_t* block_ids = nullptr;
  uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);

  if (num_blocks == 0) {
    current_ = restarts_;
    *prefix_may_exist = false;
    return false;
  } else {
    assert(block_ids);
    return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index,
                                prefix_may_exist);
  }
}

uint32_t Block::NumRestarts() const {
  assert(size_ >= 2 * sizeof(uint32_t));
  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
  uint32_t num_restarts = block_footer;
  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
    // In BlockBuilder, we have ensured a block with HashIndex is less than
    // kMaxBlockSizeSupportedByHashIndex (64KiB).
    //
    // Therefore, if we encounter a block with a size > 64KiB, the block
    // cannot have HashIndex. So the footer will be directly interpreted as
    // num_restarts.
    //
    // Such a check is for backward compatibility. It ensures a legacy block
    // with a very large num_restarts, i.e. >= 0x80000000, can be interpreted
    // correctly as having no HashIndex even if the MSB of num_restarts is
    // set.
    return num_restarts;
  }
  BlockBasedTableOptions::DataBlockIndexType index_type;
  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
  return num_restarts;
}
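
// A sketch of the footer packing that UnPackIndexTypeAndNumRestarts() undoes
// (assuming the layout in data_block_footer.cc, where the index type occupies
// the footer's most significant bit, consistent with the MSB remark above):
//
//   uint32_t footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
//   bool binary_and_hash = (footer & 0x80000000u) != 0;
//   uint32_t num_restarts = footer & 0x7FFFFFFFu;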

BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const {
  assert(size_ >= 2 * sizeof(uint32_t));
  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
    // The check is for the same reason as that in NumRestarts()
    return BlockBasedTableOptions::kDataBlockBinarySearch;
  }
  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
  uint32_t num_restarts = block_footer;
  BlockBasedTableOptions::DataBlockIndexType index_type;
  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
  return index_type;
}

Block::~Block() {
  // This sync point can be re-enabled if RocksDB can control the
  // initialization order of any/all static options created by the user.
  // TEST_SYNC_POINT("Block::~Block");
}

Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit,
             Statistics* statistics)
    : contents_(std::move(contents)),
      data_(contents_.data.data()),
      size_(contents_.data.size()),
      restart_offset_(0),
      num_restarts_(0) {
  TEST_SYNC_POINT("Block::Block:0");
  if (size_ < sizeof(uint32_t)) {
    size_ = 0;  // Error marker
  } else {
    // Should only decode restart points for uncompressed blocks
    num_restarts_ = NumRestarts();
    switch (IndexType()) {
      case BlockBasedTableOptions::kDataBlockBinarySearch:
        restart_offset_ = static_cast<uint32_t>(size_) -
                          (1 + num_restarts_) * sizeof(uint32_t);
        if (restart_offset_ > size_ - sizeof(uint32_t)) {
          // The size is too small for NumRestarts() and therefore
          // restart_offset_ wrapped around.
          size_ = 0;
        }
        break;
      case BlockBasedTableOptions::kDataBlockBinaryAndHash:
        if (size_ < sizeof(uint32_t) /* block footer */ +
                        sizeof(uint16_t) /* NUM_BUCK */) {
          size_ = 0;
          break;
        }

        uint16_t map_offset;
        data_block_hash_index_.Initialize(
            contents.data.data(),
            static_cast<uint16_t>(contents.data.size() -
                                  sizeof(uint32_t)) /* chop off NUM_RESTARTS */,
            &map_offset);

        restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t);

        if (restart_offset_ > map_offset) {
          // map_offset is too small for NumRestarts() and
          // therefore restart_offset_ wrapped around.
          size_ = 0;
          break;
        }
        break;
      default:
        size_ = 0;  // Error marker
    }
  }
  if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) {
    read_amp_bitmap_.reset(new BlockReadAmpBitmap(
        restart_offset_, read_amp_bytes_per_bit, statistics));
  }
}
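
// Resulting layout (a sketch) for kDataBlockBinarySearch blocks; the
// kDataBlockBinaryAndHash variant additionally places its hash map between
// the restart array and the footer:
//
//   | entries ... | restart[0] ... restart[num_restarts_ - 1] | footer u32 |
//   ^             ^
//   data_         restart_offset_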

DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp,
                                      SequenceNumber global_seqno,
                                      DataBlockIter* iter, Statistics* stats,
                                      bool block_contents_pinned) {
  DataBlockIter* ret_iter;
  if (iter != nullptr) {
    ret_iter = iter;
  } else {
    ret_iter = new DataBlockIter;
  }
  if (size_ < 2 * sizeof(uint32_t)) {
    ret_iter->Invalidate(Status::Corruption("bad block contents"));
    return ret_iter;
  }
  if (num_restarts_ == 0) {
    // Empty block.
    ret_iter->Invalidate(Status::OK());
    return ret_iter;
  } else {
    ret_iter->Initialize(
        raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno,
        read_amp_bitmap_.get(), block_contents_pinned,
        data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr);
    if (read_amp_bitmap_) {
      if (read_amp_bitmap_->GetStatistics() != stats) {
        // DB changed the Statistics pointer, we need to notify read_amp_bitmap_
        read_amp_bitmap_->SetStatistics(stats);
      }
    }
  }

  return ret_iter;
}

IndexBlockIter* Block::NewIndexIterator(
    const Comparator* raw_ucmp, SequenceNumber global_seqno,
    IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek,
    bool have_first_key, bool key_includes_seq, bool value_is_full,
    bool block_contents_pinned, BlockPrefixIndex* prefix_index) {
  IndexBlockIter* ret_iter;
  if (iter != nullptr) {
    ret_iter = iter;
  } else {
    ret_iter = new IndexBlockIter;
  }
  if (size_ < 2 * sizeof(uint32_t)) {
    ret_iter->Invalidate(Status::Corruption("bad block contents"));
    return ret_iter;
  }
  if (num_restarts_ == 0) {
    // Empty block.
    ret_iter->Invalidate(Status::OK());
    return ret_iter;
  } else {
    BlockPrefixIndex* prefix_index_ptr =
        total_order_seek ? nullptr : prefix_index;
    ret_iter->Initialize(raw_ucmp, data_, restart_offset_, num_restarts_,
                         global_seqno, prefix_index_ptr, have_first_key,
                         key_includes_seq, value_is_full,
                         block_contents_pinned);
  }

  return ret_iter;
}

size_t Block::ApproximateMemoryUsage() const {
  size_t usage = usable_size();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
  usage += malloc_usable_size((void*)this);
#else
  usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
  if (read_amp_bitmap_) {
    usage += read_amp_bitmap_->ApproximateMemoryUsage();
  }
  return usage;
}

}  // namespace ROCKSDB_NAMESPACE