// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Decodes the blocks generated by block_builder.cc.

#include "table/block.h"
#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>

#include "monitoring/perf_context_imp.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/comparator.h"
#include "table/block_prefix_index.h"
#include "table/data_block_footer.h"
#include "table/format.h"
#include "util/coding.h"
#include "util/logging.h"

namespace rocksdb {

// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
// "*value_length", respectively. Will not dereference past "limit".
//
// If any errors are detected, returns nullptr. Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
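//
// For illustration, an entry whose key shares 3 bytes with the previous key,
// adds 2 new key bytes, and carries a 5-byte value would be laid out as
//   varint32(shared=3) varint32(non_shared=2) varint32(value_length=5)
//   [2 bytes of key delta][5 bytes of value]
// With all three lengths below 128, each varint occupies a single byte,
// which is the fast path handled below.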
struct DecodeEntry {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared,
                                uint32_t* value_length) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    assert(limit - p >= 3);
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
    if ((*shared | *non_shared | *value_length) < 128) {
      // Fast path: all three values are encoded in one byte each
      p += 3;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
        return nullptr;
      }
    }

    // Using an assert in place of "return null" since we should not pay the
    // cost of checking for corruption on every single key decoding
    assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)));
    return p;
  }
};

// Helper routine: similar to DecodeEntry but does not have assertions.
// Instead, returns nullptr so that caller can detect and report failure.
struct CheckAndDecodeEntry {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared,
                                uint32_t* value_length) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    if (limit - p < 3) {
      return nullptr;
    }
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
    if ((*shared | *non_shared | *value_length) < 128) {
      // Fast path: all three values are encoded in one byte each
      p += 3;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
        return nullptr;
      }
    }

    if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
      return nullptr;
    }
    return p;
  }
};

struct DecodeKey {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared) {
    uint32_t value_length;
    return DecodeEntry()(p, limit, shared, non_shared, &value_length);
  }
};

// In format_version 4, which is used by index blocks, the value size is not
// encoded before the entry, as the value is known to be the handle with the
// known size.
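//
// Each such index entry is therefore laid out as
//   varint32(shared) varint32(non_shared) [non_shared key bytes][value]
// where the value is the delta-encoded block handle decoded later by
// DecodeCurrentValue().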
struct DecodeKeyV4 {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    if (limit - p < 3) return nullptr;
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    if ((*shared | *non_shared) < 128) {
      // Fast path: both values are encoded in one byte each
      p += 2;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
    }
    return p;
  }
};

void DataBlockIter::Next() {
  assert(Valid());
  ParseNextDataKey<DecodeEntry>();
}

void DataBlockIter::NextOrReport() {
  assert(Valid());
  ParseNextDataKey<CheckAndDecodeEntry>();
}

void IndexBlockIter::Next() {
  assert(Valid());
  ParseNextIndexKey();
}

void IndexBlockIter::Prev() {
  assert(Valid());
  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }
  SeekToRestartPoint(restart_index_);
  do {
    if (!ParseNextIndexKey()) {
      break;
    }
    // Loop until end of current entry hits the start of original entry
  } while (NextEntryOffset() < original);
}

// Similar to IndexBlockIter::Prev but also caches the prev entries
void DataBlockIter::Prev() {
  assert(Valid());

  assert(prev_entries_idx_ == -1 ||
         static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
  // Check if we can use cached prev_entries_
  if (prev_entries_idx_ > 0 &&
      prev_entries_[prev_entries_idx_].offset == current_) {
    // Read cached CachedPrevEntry
    prev_entries_idx_--;
    const CachedPrevEntry& current_prev_entry =
        prev_entries_[prev_entries_idx_];

    const char* key_ptr = nullptr;
    if (current_prev_entry.key_ptr != nullptr) {
      // The key is not delta encoded and stored in the data block
      key_ptr = current_prev_entry.key_ptr;
      key_pinned_ = true;
    } else {
      // The key is delta encoded and stored in prev_entries_keys_buff_
      key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset;
      key_pinned_ = false;
    }
    const Slice current_key(key_ptr, current_prev_entry.key_size);

    current_ = current_prev_entry.offset;
    key_.SetKey(current_key, false /* copy */);
    value_ = current_prev_entry.value;

    return;
  }

  // Clear prev entries cache
  prev_entries_idx_ = -1;
  prev_entries_.clear();
  prev_entries_keys_buff_.clear();

  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }

  SeekToRestartPoint(restart_index_);

  do {
    if (!ParseNextDataKey<DecodeEntry>()) {
      break;
    }
    Slice current_key = key();

    if (key_.IsKeyPinned()) {
      // The key is not delta encoded
      prev_entries_.emplace_back(current_, current_key.data(), 0,
                                 current_key.size(), value());
    } else {
      // The key is delta encoded, cache decoded key in buffer
      size_t new_key_offset = prev_entries_keys_buff_.size();
      prev_entries_keys_buff_.append(current_key.data(), current_key.size());

      prev_entries_.emplace_back(current_, nullptr, new_key_offset,
                                 current_key.size(), value());
    }
    // Loop until end of current entry hits the start of original entry
  } while (NextEntryOffset() < original);
  prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
}

void DataBlockIter::Seek(const Slice& target) {
  Slice seek_key = target;
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
                                  comparator_);

  if (!ok) {
    return;
  }
  SeekToRestartPoint(index);
  // Linear search (within restart block) for first key >= target

  while (true) {
    if (!ParseNextDataKey<DecodeEntry>() || Compare(key_, seek_key) >= 0) {
      return;
    }
  }
}

// Optimized Seek for point lookup for an internal key `target`
// target = "seek_user_key @ type | seqno".
//
// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion,
// or kTypeBlobIndex, this function behaves identically to Seek().
//
// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion,
// or kTypeBlobIndex:
//
// If the return value is FALSE, iter location is undefined, and it means:
// 1) there is no key in this block falling into the range:
//    ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"],
//    inclusive; AND
// 2) the last key of this block has a greater user_key than seek_user_key
//
// If the return value is TRUE, iter location has two possibilities:
// 1) If iter is valid, it is set to a location as if set by BinarySeek. In
//    this case, it points to the first key_ with a larger user_key or a
//    matching user_key with a seqno no greater than the seeking seqno.
// 2) If the iter is invalid, it means that either all the user_keys are less
//    than the seek_user_key, or the block ends with a matching user_key but
//    with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno
//    but larger type).
bool DataBlockIter::SeekForGetImpl(const Slice& target) {
  Slice user_key = ExtractUserKey(target);
  uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t);
  uint8_t entry = data_block_hash_index_->Lookup(data_, map_offset, user_key);

  if (entry == kCollision) {
    // HashSeek not effective, falling back
    Seek(target);
    return true;
  }

  if (entry == kNoEntry) {
    // Even if we cannot find the user_key in this block, the result may
    // exist in the next block. Consider this example:
    //
    // Block N:    [aab@100, ... , app@120]
    // boundary key: axy@50 (we make minimal assumption about a boundary key)
    // Block N+1:  [axy@10, ... ]
    //
    // If seek_key = axy@60, the search will start from Block N.
    // Even if the user_key is not found in the hash map, the caller still
    // has to continue searching the next block.
    //
    // In this case, we pretend the key is in the last restart interval.
    // The while-loop below will search the last restart interval for the
    // key. It will stop at the first key that is larger than the seek_key,
    // or at the end of the block if none is larger.
    entry = static_cast<uint8_t>(num_restarts_ - 1);
  }

  uint32_t restart_index = entry;

  // check if the key is in the restart_interval
  assert(restart_index < num_restarts_);
  SeekToRestartPoint(restart_index);

  const char* limit = nullptr;
  if (restart_index_ + 1 < num_restarts_) {
    limit = data_ + GetRestartPoint(restart_index_ + 1);
  } else {
    limit = data_ + restarts_;
  }

  while (true) {
    // Here we only linear seek the target key inside the restart interval.
    // If a key does not exist inside a restart interval, we avoid
    // further searching the block content across restart interval boundaries.
    //
    // TODO(fwu): check the left and right boundary of the restart interval
    // to avoid linear seeking a target key that is out of range.
    if (!ParseNextDataKey<DecodeEntry>(limit) || Compare(key_, target) >= 0) {
      // we stop at the first potential matching user key.
      break;
    }
  }

  if (current_ == restarts_) {
    // Search reaches the end of the block. There are three possibilities:
    // 1) there is only one user_key match in the block (otherwise collision).
    //    the matching user_key resides in the last restart interval, and it
    //    is the last key of the restart interval and of the block as well.
    //    ParseNextDataKey() skipped it as its [ type | seqno ] is smaller.
    //
    // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
    //    AND all existing user_keys in the restart interval are smaller than
    //    seek_user_key.
    //
    // 3) The seek_key is a false positive and happens to be hashed to the
    //    last restart interval, AND all existing user_keys in the restart
    //    interval are smaller than seek_user_key.
    //
    // The result may exist in the next block in each case, so we return true.
    return true;
  }

  if (user_comparator_->Compare(key_.GetUserKey(), user_key) != 0) {
    // the key is not in this block and cannot be at the next block either.
    return false;
  }

  // Here we are conservative and only support a limited set of cases
  ValueType value_type = ExtractValueType(key_.GetKey());
  if (value_type != ValueType::kTypeValue &&
      value_type != ValueType::kTypeDeletion &&
      value_type != ValueType::kTypeSingleDeletion &&
      value_type != ValueType::kTypeBlobIndex) {
    Seek(target);
    return true;
  }

  // Result found, and the iter is correctly set.
  return true;
}

void IndexBlockIter::Seek(const Slice& target) {
  Slice seek_key = target;
  if (!key_includes_seq_) {
    seek_key = ExtractUserKey(target);
  }
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool ok = false;
  if (prefix_index_) {
    ok = PrefixSeek(target, &index);
  } else if (value_delta_encoded_) {
    ok = BinarySeek<DecodeKeyV4>(seek_key, 0, num_restarts_ - 1, &index,
                                 comparator_);
  } else {
    ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
                               comparator_);
  }

  if (!ok) {
    return;
  }
  SeekToRestartPoint(index);
  // Linear search (within restart block) for first key >= target

  while (true) {
    if (!ParseNextIndexKey() || Compare(key_, seek_key) >= 0) {
      return;
    }
  }
}

void DataBlockIter::SeekForPrev(const Slice& target) {
  PERF_TIMER_GUARD(block_seek_nanos);
  Slice seek_key = target;
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
                                  comparator_);

  if (!ok) {
    return;
  }
  SeekToRestartPoint(index);
  // Linear search (within restart block) for first key >= seek_key

  while (ParseNextDataKey<DecodeEntry>() && Compare(key_, seek_key) < 0) {
  }
  if (!Valid()) {
    SeekToLast();
  } else {
    while (Valid() && Compare(key_, seek_key) > 0) {
      Prev();
    }
  }
}

void DataBlockIter::SeekToFirst() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(0);
  ParseNextDataKey<DecodeEntry>();
}

void DataBlockIter::SeekToFirstOrReport() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(0);
  ParseNextDataKey<CheckAndDecodeEntry>();
}

void IndexBlockIter::SeekToFirst() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(0);
  ParseNextIndexKey();
}

void DataBlockIter::SeekToLast() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(num_restarts_ - 1);
  while (ParseNextDataKey<DecodeEntry>() && NextEntryOffset() < restarts_) {
    // Keep skipping
  }
}

void IndexBlockIter::SeekToLast() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(num_restarts_ - 1);
  while (ParseNextIndexKey() && NextEntryOffset() < restarts_) {
    // Keep skipping
  }
}

template <class TValue>
void BlockIter<TValue>::CorruptionError() {
  current_ = restarts_;
  restart_index_ = num_restarts_;
  status_ = Status::Corruption("bad entry in block");
  key_.Clear();
  value_.clear();
}

template <typename DecodeEntryFunc>
bool DataBlockIter::ParseNextDataKey(const char* limit) {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  if (!limit) {
    limit = data_ + restarts_;  // Restarts come right after data
  }

  if (p >= limit) {
    // No more entries to return. Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }

  // Decode next entry
  uint32_t shared, non_shared, value_length;
  p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length);
  if (p == nullptr || key_.Size() < shared) {
    CorruptionError();
    return false;
  } else {
    if (shared == 0) {
      // If this key doesn't share any bytes with the prev key then we don't
      // need to decode it and can use its address in the block directly.
      key_.SetKey(Slice(p, non_shared), false /* copy */);
      key_pinned_ = true;
    } else {
      // This key shares `shared` bytes with the prev key, we need to decode it
      key_.TrimAppend(shared, p, non_shared);
      key_pinned_ = false;
    }

    if (global_seqno_ != kDisableGlobalSequenceNumber) {
      // If we are reading a file with a global sequence number we should
      // expect that all encoded sequence numbers are zeros and any value
      // type is kTypeValue, kTypeMerge, kTypeDeletion, or kTypeRangeDeletion.
      assert(GetInternalKeySeqno(key_.GetInternalKey()) == 0);

      ValueType value_type = ExtractValueType(key_.GetKey());
      assert(value_type == ValueType::kTypeValue ||
             value_type == ValueType::kTypeMerge ||
             value_type == ValueType::kTypeDeletion ||
             value_type == ValueType::kTypeRangeDeletion);

      if (key_pinned_) {
        // TODO(tec): Investigate updating the seqno in the loaded block
        // directly instead of doing a copy and update.

        // We cannot use the key address in the block directly because
        // we have a global_seqno_ that will overwrite the encoded one.
        key_.OwnKey();
        key_pinned_ = false;
      }

      key_.UpdateInternalKey(global_seqno_, value_type);
    }

    value_ = Slice(p + non_shared, value_length);
    if (shared == 0) {
      while (restart_index_ + 1 < num_restarts_ &&
             GetRestartPoint(restart_index_ + 1) < current_) {
        ++restart_index_;
      }
    }
    // else we are in the middle of a restart interval and the restart_index_
    // thus has not changed
    return true;
  }
}

bool IndexBlockIter::ParseNextIndexKey() {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  const char* limit = data_ + restarts_;  // Restarts come right after data
  if (p >= limit) {
    // No more entries to return. Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }

  // Decode next entry
  uint32_t shared, non_shared, value_length;
  if (value_delta_encoded_) {
    p = DecodeKeyV4()(p, limit, &shared, &non_shared);
    value_length = 0;
  } else {
    p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length);
  }
  if (p == nullptr || key_.Size() < shared) {
    CorruptionError();
    return false;
  }
  if (shared == 0) {
    // If this key doesn't share any bytes with the prev key then we don't
    // need to decode it and can use its address in the block directly.
    key_.SetKey(Slice(p, non_shared), false /* copy */);
    key_pinned_ = true;
  } else {
    // This key shares `shared` bytes with the prev key, we need to decode it
    key_.TrimAppend(shared, p, non_shared);
    key_pinned_ = false;
  }
  value_ = Slice(p + non_shared, value_length);
  if (shared == 0) {
    while (restart_index_ + 1 < num_restarts_ &&
           GetRestartPoint(restart_index_ + 1) < current_) {
      ++restart_index_;
    }
  }
  // else we are in the middle of a restart interval and the restart_index_
  // thus has not changed
  if (value_delta_encoded_) {
    assert(value_length == 0);
    DecodeCurrentValue(shared);
  }
  return true;
}

// The format:
// restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// ...
// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// where k is key, v is value, and its encoding is in parentheses.
// The format of each key is (shared_size, non_shared_size, shared, non_shared)
// The format of each value, i.e., block handle, is (offset, size) whenever the
// shared_size is 0, which includes the first entry in each restart point.
// Otherwise the format is delta-size = block handle size - size of last block
// handle.
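//
// As a concrete illustration, if the previous handle decoded to
// (offset=1000, size=200) and the next entry stores the signed delta -20,
// the next handle is reconstructed below as
//   offset = 1000 + 200 + kBlockTrailerSize
//   size   = 200 + (-20) = 180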
void IndexBlockIter::DecodeCurrentValue(uint32_t shared) {
  assert(value_delta_encoded_);
  const char* limit = data_ + restarts_;
  if (shared == 0) {
    uint64_t o, s;
    const char* newp = GetVarint64Ptr(value_.data(), limit, &o);
    assert(newp);
    newp = GetVarint64Ptr(newp, limit, &s);
    assert(newp);
    decoded_value_ = BlockHandle(o, s);
    value_ = Slice(value_.data(), newp - value_.data());
  } else {
    uint64_t next_value_base =
        decoded_value_.offset() + decoded_value_.size() + kBlockTrailerSize;
    int64_t delta;
    const char* newp = GetVarsignedint64Ptr(value_.data(), limit, &delta);
    decoded_value_ =
        BlockHandle(next_value_base, decoded_value_.size() + delta);
    value_ = Slice(value_.data(), newp - value_.data());
  }
}

// Binary search in the restart array to find the restart point that is either
// the last restart point with a key less than target (which means the key of
// the next restart point is larger than target), or the first restart point
// with a key equal to target.
template <class TValue>
template <typename DecodeKeyFunc>
bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t left,
                                   uint32_t right, uint32_t* index,
                                   const Comparator* comp) {
  assert(left <= right);

  while (left < right) {
    uint32_t mid = (left + right + 1) / 2;
    uint32_t region_offset = GetRestartPoint(mid);
    uint32_t shared, non_shared;
    const char* key_ptr = DecodeKeyFunc()(
        data_ + region_offset, data_ + restarts_, &shared, &non_shared);
    if (key_ptr == nullptr || (shared != 0)) {
      CorruptionError();
      return false;
    }
    Slice mid_key(key_ptr, non_shared);
    int cmp = comp->Compare(mid_key, target);
    if (cmp < 0) {
      // Key at "mid" is smaller than "target". Therefore all
      // blocks before "mid" are uninteresting.
      left = mid;
    } else if (cmp > 0) {
      // Key at "mid" is larger than "target". Therefore all blocks at or
      // after "mid" are uninteresting.
      right = mid - 1;
    } else {
      left = right = mid;
    }
  }

  *index = left;
  return true;
}

// Compare target key and the block key of the block of `block_index`.
// Returns 1 on error, so that the target is treated as smaller than the
// block key.
int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
  uint32_t region_offset = GetRestartPoint(block_index);
  uint32_t shared, non_shared;
  const char* key_ptr =
      value_delta_encoded_
          ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared,
                          &non_shared)
          : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared,
                        &non_shared);
  if (key_ptr == nullptr || (shared != 0)) {
    CorruptionError();
    return 1;  // Return target is smaller
  }
  Slice block_key(key_ptr, non_shared);
  return Compare(block_key, target);
}

// Binary search in block_ids to find the first block
// with a key >= target
bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
                                          uint32_t* block_ids, uint32_t left,
                                          uint32_t right, uint32_t* index) {
  assert(left <= right);
  uint32_t left_bound = left;

  while (left <= right) {
    uint32_t mid = (right + left) / 2;

    int cmp = CompareBlockKey(block_ids[mid], target);
    if (!status_.ok()) {
      return false;
    }
    if (cmp < 0) {
      // Key at "mid" is smaller than "target". Therefore all
      // blocks before or at "mid" are uninteresting.
      left = mid + 1;
    } else {
      // Key at "mid" is >= "target". Therefore all blocks
      // after "mid" are uninteresting.
      // If there is only one block left, we found it.
      if (left == right) break;
      right = mid;
    }
  }

  if (left == right) {
    // In one of the two following cases:
    // (1) left is the first one of block_ids
    // (2) there is a gap of blocks between the block of `left` and `left-1`.
    // we can further distinguish the case of the key being in the block or
    // not existing, by comparing the target key with the key of the previous
    // block to the left of the block found.
    if (block_ids[left] > 0 &&
        (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
        CompareBlockKey(block_ids[left] - 1, target) > 0) {
      current_ = restarts_;
      return false;
    }

    *index = block_ids[left];
    return true;
  } else {
    assert(left > right);
    // Mark iterator invalid
    current_ = restarts_;
    return false;
  }
}

bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index) {
  assert(prefix_index_);
  Slice seek_key = target;
  if (!key_includes_seq_) {
    seek_key = ExtractUserKey(target);
  }
  uint32_t* block_ids = nullptr;
  uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);

  if (num_blocks == 0) {
    current_ = restarts_;
    return false;
  } else {
    return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index);
  }
}

uint32_t Block::NumRestarts() const {
  assert(size_ >= 2 * sizeof(uint32_t));
  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
  uint32_t num_restarts = block_footer;
  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
    // In BlockBuilder, we have ensured a block with HashIndex is less than
    // kMaxBlockSizeSupportedByHashIndex (64KiB).
    //
    // Therefore, if we encounter a block with a size > 64KiB, the block
    // cannot have HashIndex. So the footer will be directly interpreted as
    // num_restarts.
    //
    // This check is for backward compatibility. It ensures that a legacy
    // block with a very large num_restarts, i.e. >= 0x80000000, is
    // interpreted correctly as having no HashIndex even if the MSB of
    // num_restarts is set.
    return num_restarts;
  }
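  // Otherwise the footer may pack the data block index type together with
  // num_restarts (the MSB acts as the index-type flag, as noted above);
  // unpack both and return only the restart count.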
  BlockBasedTableOptions::DataBlockIndexType index_type;
  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
  return num_restarts;
}

BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const {
  assert(size_ >= 2 * sizeof(uint32_t));
  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
    // The check is for the same reason as that in NumRestarts()
    return BlockBasedTableOptions::kDataBlockBinarySearch;
  }
  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
  uint32_t num_restarts = block_footer;
  BlockBasedTableOptions::DataBlockIndexType index_type;
  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
  return index_type;
}

Block::~Block() {
  // This sync point can be re-enabled if RocksDB can control the
  // initialization order of any/all static options created by the user.
  // TEST_SYNC_POINT("Block::~Block");
}

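// A Block is parsed from its raw contents, which are laid out roughly as
//   [restart-prefixed entries][restart array: one uint32 per restart point]
//   [optional hash index (kDataBlockBinaryAndHash only)][uint32 footer]
// where the footer carries num_restarts (and, for hash-indexed data blocks,
// the packed index type). The constructor below derives restart_offset_ from
// that layout and marks the block as corrupt (size_ = 0) if it cannot.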
Block::Block(BlockContents&& contents, SequenceNumber _global_seqno,
             size_t read_amp_bytes_per_bit, Statistics* statistics)
    : contents_(std::move(contents)),
      data_(contents_.data.data()),
      size_(contents_.data.size()),
      restart_offset_(0),
      num_restarts_(0),
      global_seqno_(_global_seqno) {
  TEST_SYNC_POINT("Block::Block:0");
  if (size_ < sizeof(uint32_t)) {
    size_ = 0;  // Error marker
  } else {
    // Should only decode restart points for uncompressed blocks
    num_restarts_ = NumRestarts();
    switch (IndexType()) {
      case BlockBasedTableOptions::kDataBlockBinarySearch:
        restart_offset_ = static_cast<uint32_t>(size_) -
                          (1 + num_restarts_) * sizeof(uint32_t);
        if (restart_offset_ > size_ - sizeof(uint32_t)) {
          // The size is too small for NumRestarts() and therefore
          // restart_offset_ wrapped around.
          size_ = 0;
        }
        break;
      case BlockBasedTableOptions::kDataBlockBinaryAndHash:
        if (size_ < sizeof(uint32_t) /* block footer */ +
                        sizeof(uint16_t) /* NUM_BUCK */) {
          size_ = 0;
          break;
        }

        uint16_t map_offset;
        data_block_hash_index_.Initialize(
            contents.data.data(),
            static_cast<uint16_t>(contents.data.size() -
                                  sizeof(uint32_t)), /* chop off NUM_RESTARTS */
            &map_offset);

        restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t);

        if (restart_offset_ > map_offset) {
          // map_offset is too small for NumRestarts() and
          // therefore restart_offset_ wrapped around.
          size_ = 0;
          break;
        }
        break;
      default:
        size_ = 0;  // Error marker
    }
  }
  if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) {
    read_amp_bitmap_.reset(new BlockReadAmpBitmap(
        restart_offset_, read_amp_bytes_per_bit, statistics));
  }
}

template <>
DataBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp,
                                  DataBlockIter* iter, Statistics* stats,
                                  bool /*total_order_seek*/,
                                  bool /*key_includes_seq*/,
                                  bool /*value_is_full*/,
                                  bool block_contents_pinned,
                                  BlockPrefixIndex* /*prefix_index*/) {
  DataBlockIter* ret_iter;
  if (iter != nullptr) {
    ret_iter = iter;
  } else {
    ret_iter = new DataBlockIter;
  }
  if (size_ < 2 * sizeof(uint32_t)) {
    ret_iter->Invalidate(Status::Corruption("bad block contents"));
    return ret_iter;
  }
  if (num_restarts_ == 0) {
    // Empty block.
    ret_iter->Invalidate(Status::OK());
    return ret_iter;
  } else {
    ret_iter->Initialize(
        cmp, ucmp, data_, restart_offset_, num_restarts_, global_seqno_,
        read_amp_bitmap_.get(), block_contents_pinned,
        data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr);
    if (read_amp_bitmap_) {
      if (read_amp_bitmap_->GetStatistics() != stats) {
        // DB changed the Statistics pointer, we need to notify read_amp_bitmap_
        read_amp_bitmap_->SetStatistics(stats);
      }
    }
  }

  return ret_iter;
}

template <>
IndexBlockIter* Block::NewIterator(const Comparator* cmp,
                                   const Comparator* ucmp, IndexBlockIter* iter,
                                   Statistics* /*stats*/, bool total_order_seek,
                                   bool key_includes_seq, bool value_is_full,
                                   bool block_contents_pinned,
                                   BlockPrefixIndex* prefix_index) {
  IndexBlockIter* ret_iter;
  if (iter != nullptr) {
    ret_iter = iter;
  } else {
    ret_iter = new IndexBlockIter;
  }
  if (size_ < 2 * sizeof(uint32_t)) {
    ret_iter->Invalidate(Status::Corruption("bad block contents"));
    return ret_iter;
  }
  if (num_restarts_ == 0) {
    // Empty block.
    ret_iter->Invalidate(Status::OK());
    return ret_iter;
  } else {
    BlockPrefixIndex* prefix_index_ptr =
        total_order_seek ? nullptr : prefix_index;
    ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_,
                         prefix_index_ptr, key_includes_seq, value_is_full,
                         block_contents_pinned,
                         nullptr /* data_block_hash_index */);
  }

  return ret_iter;
}

size_t Block::ApproximateMemoryUsage() const {
  size_t usage = usable_size();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
  usage += malloc_usable_size((void*)this);
#else
  usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
  if (read_amp_bitmap_) {
    usage += read_amp_bitmap_->ApproximateMemoryUsage();
  }
  return usage;
}

}  // namespace rocksdb