// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once
#include <stddef.h>
#include <stdint.h>

#include <string>
#include <vector>

#include "db/pinned_iterators_manager.h"
#include "port/malloc.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
#include "table/block_based/block_prefix_index.h"
#include "table/block_based/data_block_hash_index.h"
#include "table/format.h"
#include "table/internal_iterator.h"
#include "test_util/sync_point.h"
#include "util/random.h"

namespace ROCKSDB_NAMESPACE {

struct BlockContents;
class Comparator;
template <class TValue>
class BlockIter;
class DataBlockIter;
class IndexBlockIter;
class MetaBlockIter;
class BlockPrefixIndex;

// BlockReadAmpBitmap is a bitmap that maps the ROCKSDB_NAMESPACE::Block data
// bytes to a bitmap with ratio bytes_per_bit. Whenever we access a range of
// bytes in the Block we update the bitmap and increment
// READ_AMP_ESTIMATE_USEFUL_BYTES.
class BlockReadAmpBitmap {
 public:
  explicit BlockReadAmpBitmap(size_t block_size, size_t bytes_per_bit,
                              Statistics* statistics)
      : bitmap_(nullptr),
        bytes_per_bit_pow_(0),
        statistics_(statistics),
        rnd_(Random::GetTLSInstance()->Uniform(
            static_cast<int>(bytes_per_bit))) {
    TEST_SYNC_POINT_CALLBACK("BlockReadAmpBitmap:rnd", &rnd_);
    assert(block_size > 0 && bytes_per_bit > 0);

    // convert bytes_per_bit to be a power of 2
    while (bytes_per_bit >>= 1) {
      bytes_per_bit_pow_++;
    }

    // num_bits_needed = ceil(block_size / bytes_per_bit)
    size_t num_bits_needed = ((block_size - 1) >> bytes_per_bit_pow_) + 1;
    assert(num_bits_needed > 0);

    // bitmap_size = ceil(num_bits_needed / kBitsPerEntry)
    size_t bitmap_size = (num_bits_needed - 1) / kBitsPerEntry + 1;

    // Create bitmap and set all the bits to 0
    bitmap_ = new std::atomic<uint32_t>[bitmap_size]();

    RecordTick(GetStatistics(), READ_AMP_TOTAL_READ_BYTES, block_size);
  }
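
  // Worked example of the sizing math above (illustrative numbers): with
  // block_size = 4096 and bytes_per_bit = 32, the shift loop leaves
  // bytes_per_bit_pow_ = 5 (32 == 1 << 5), num_bits_needed =
  // ((4096 - 1) >> 5) + 1 = 128, and bitmap_size = (128 - 1) / 32 + 1 = 4
  // uint32_t entries. A non-power-of-2 bytes_per_bit is effectively rounded
  // down to the previous power of 2 by the shift loop (e.g. 48 -> 32).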

  ~BlockReadAmpBitmap() { delete[] bitmap_; }

  void Mark(uint32_t start_offset, uint32_t end_offset) {
    assert(end_offset >= start_offset);
    // Index of first bit in mask
    uint32_t start_bit =
        (start_offset + (1 << bytes_per_bit_pow_) - rnd_ - 1) >>
        bytes_per_bit_pow_;
    // Index of last bit in mask + 1
    uint32_t exclusive_end_bit =
        (end_offset + (1 << bytes_per_bit_pow_) - rnd_) >> bytes_per_bit_pow_;
    if (start_bit >= exclusive_end_bit) {
      return;
    }
    assert(exclusive_end_bit > 0);

    if (GetAndSet(start_bit) == 0) {
      uint32_t new_useful_bytes = (exclusive_end_bit - start_bit)
                                  << bytes_per_bit_pow_;
      RecordTick(GetStatistics(), READ_AMP_ESTIMATE_USEFUL_BYTES,
                 new_useful_bytes);
    }
  }
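
  // Worked example of the bit math above (illustrative numbers): with
  // bytes_per_bit = 32 (bytes_per_bit_pow_ = 5) and rnd_ = 7, Mark(0, 100)
  // computes start_bit = (0 + 32 - 7 - 1) >> 5 = 0 and exclusive_end_bit =
  // (100 + 32 - 7) >> 5 = 3. If bit 0 was not already set, (3 - 0) << 5 = 96
  // bytes are recorded as newly useful. Note that only start_bit itself is
  // set in the bitmap; the random rnd_ offset dithers the bit boundaries to
  // avoid systematic bias in the estimate.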

  Statistics* GetStatistics() {
    return statistics_.load(std::memory_order_relaxed);
  }

  void SetStatistics(Statistics* stats) { statistics_.store(stats); }

  uint32_t GetBytesPerBit() { return 1 << bytes_per_bit_pow_; }

  size_t ApproximateMemoryUsage() const {
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    return malloc_usable_size((void*)this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return sizeof(*this);
  }

 private:
  // Get the current value of bit at `bit_idx` and set it to 1
  inline bool GetAndSet(uint32_t bit_idx) {
    const uint32_t byte_idx = bit_idx / kBitsPerEntry;
    const uint32_t bit_mask = 1 << (bit_idx % kBitsPerEntry);

    return bitmap_[byte_idx].fetch_or(bit_mask, std::memory_order_relaxed) &
           bit_mask;
  }
  const uint32_t kBytesPerEntry = sizeof(uint32_t);   // 4 bytes
  const uint32_t kBitsPerEntry = kBytesPerEntry * 8;  // 32 bits

  // Bitmap used to record the bytes that we read; use atomic to protect
  // against multiple threads updating the same bit
  std::atomic<uint32_t>* bitmap_;
  // (1 << bytes_per_bit_pow_) is bytes_per_bit. Use a power of 2 to optimize
  // multiplication and division
  uint8_t bytes_per_bit_pow_;
  // Pointer to DB Statistics object. Since this bitmap may outlive the DB,
  // this pointer may be invalid, but the DB will update it to a valid pointer
  // by using SetStatistics() before calling Mark()
  std::atomic<Statistics*> statistics_;
  uint32_t rnd_;
};
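
// Lifecycle note: a Block constructs a BlockReadAmpBitmap only when
// read_amp_bytes_per_bit > 0 (see Block's constructor below), and
// DataBlockIter::value() calls Mark() for each entry whose value is read, so
// READ_AMP_ESTIMATE_USEFUL_BYTES approximates how many of the block's bytes
// were actually consumed.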

// class Block is the uncompressed and "parsed" form for blocks containing
// key-value pairs. (See BlockContents comments for more on terminology.)
// This includes the in-memory representation of data blocks, index blocks
// (including partitions), range deletion blocks, properties blocks, metaindex
// blocks, as well as the top level of the partitioned filter structure (which
// is actually an index of the filter partitions). It is NOT suitable for
// compressed blocks in general, filter blocks/partitions, or compression
// dictionaries.
//
// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format
// for details of the format and the various block types.
//
// TODO: Rename to ParsedKvBlock?
class Block {
 public:
  // Initialize the block with the specified contents.
  explicit Block(BlockContents&& contents, size_t read_amp_bytes_per_bit = 0,
                 Statistics* statistics = nullptr);
  // No copying allowed
  Block(const Block&) = delete;
  void operator=(const Block&) = delete;

  ~Block();

  size_t size() const { return size_; }
  const char* data() const { return data_; }
  // The additional memory space taken by the block data.
  size_t usable_size() const { return contents_.usable_size(); }
  uint32_t NumRestarts() const;
  bool own_bytes() const { return contents_.own_bytes(); }

  BlockBasedTableOptions::DataBlockIndexType IndexType() const;

  // raw_ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key
  // comparator.
  //
  // If iter is null, returns a new DataBlockIter.
  // If iter is not null, updates it in place and returns it.
  //
  // Updates read_amp_bitmap_ if it is not nullptr.
  //
  // If `block_contents_pinned` is true, the caller guarantees that when
  // the cleanup functions are transferred from the iterator to other
  // classes, e.g. PinnableSlice, the pointer to the bytes will still be
  // valid. Either the iterator holds a cache handle or ownership of some
  // resource and releases it in a release function, or the caller is sure
  // that the data will not go away (for example, it's from an mmapped file
  // which will not be closed).
  //
  // NOTE: for the hash based lookup, if a key prefix doesn't match any key,
  // the iterator will simply be set as "invalid", rather than returning
  // the key that is just past the target key.
  DataBlockIter* NewDataIterator(const Comparator* raw_ucmp,
                                 SequenceNumber global_seqno,
                                 DataBlockIter* iter = nullptr,
                                 Statistics* stats = nullptr,
                                 bool block_contents_pinned = false);

  // Returns a MetaBlockIter for iterating over blocks containing metadata
  // (like Properties blocks). Unlike data blocks, the keys for these blocks
  // do not contain sequence numbers, do not use a user-defined comparator,
  // and do not track read amplification/statistics. Additionally, MetaBlocks
  // will not assert if the block is formatted improperly.
  //
  // If `block_contents_pinned` is true, the caller guarantees that when
  // the cleanup functions are transferred from the iterator to other
  // classes, e.g. PinnableSlice, the pointer to the bytes will still be
  // valid. Either the iterator holds a cache handle or ownership of some
  // resource and releases it in a release function, or the caller is sure
  // that the data will not go away (for example, it's from an mmapped file
  // which will not be closed).
  MetaBlockIter* NewMetaIterator(bool block_contents_pinned = false);

  // raw_ucmp is a raw (i.e., not wrapped by `UserComparatorWrapper`) user key
  // comparator.
  //
  // key_includes_seq, default true, means that the keys are in internal key
  // format.
  // value_is_full, default true, means that no delta encoding is
  // applied to values.
  //
  // If `prefix_index` is not nullptr this block will do hash lookup for the
  // key prefix. If total_order_seek is true, prefix_index_ is ignored.
  //
  // `have_first_key` controls whether IndexValue will contain
  // first_internal_key. It affects the data serialization format, so the same
  // have_first_key value must be used when writing and reading the index.
  // It is determined by the IndexType property of the table.
  IndexBlockIter* NewIndexIterator(const Comparator* raw_ucmp,
                                   SequenceNumber global_seqno,
                                   IndexBlockIter* iter, Statistics* stats,
                                   bool total_order_seek, bool have_first_key,
                                   bool key_includes_seq, bool value_is_full,
                                   bool block_contents_pinned = false,
                                   BlockPrefixIndex* prefix_index = nullptr);
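
  // Illustrative call (a sketch; the right flag values depend on the table's
  // index type and format version, which the caller determines elsewhere;
  // `ucmp` is a placeholder user comparator):
  //
  //   IndexBlockIter iit;
  //   block.NewIndexIterator(ucmp, kDisableGlobalSequenceNumber, &iit,
  //                          /*stats=*/nullptr, /*total_order_seek=*/true,
  //                          /*have_first_key=*/false,
  //                          /*key_includes_seq=*/true,
  //                          /*value_is_full=*/true);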

  // Report an approximation of how much memory has been used.
  size_t ApproximateMemoryUsage() const;

 private:
  BlockContents contents_;
  const char* data_;         // contents_.data.data()
  size_t size_;              // contents_.data.size()
  uint32_t restart_offset_;  // Offset in data_ of restart array
  uint32_t num_restarts_;
  std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_;
  DataBlockHashIndex data_block_hash_index_;
};
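
// A minimal usage sketch (assumes a BlockContents that was already read and
// uncompressed elsewhere; `contents` and `ucmp` are placeholders):
//
//   Block block(std::move(contents), /*read_amp_bytes_per_bit=*/0);
//   std::unique_ptr<DataBlockIter> it(
//       block.NewDataIterator(ucmp, kDisableGlobalSequenceNumber));
//   for (it->SeekToFirst(); it->Valid(); it->Next()) {
//     // use it->key() and it->value()
//   }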

// A `BlockIter` iterates over the entries in a `Block`'s data buffer. The
// format of this data buffer is an uncompressed, sorted sequence of key-value
// pairs (see `Block` API for more details).
//
// Notably, the keys may either be in internal key format or user key format.
// Subclasses are responsible for configuring the key format.
//
// `BlockIter` intends to provide final overrides for all of
// `InternalIteratorBase` functions that can move the iterator. It does
// this to guarantee `UpdateKey()` is called exactly once after each key
// movement potentially visible to users. In this step, the key is prepared
// (e.g., serialized if global seqno is in effect) so it can be returned
// immediately when the user asks for it via calling `key() const`.
//
// For its subclasses, it provides protected variants of the above-mentioned
// final-overridden methods. They are named with the "Impl" suffix, e.g.,
// `Seek()` logic would be implemented by subclasses in `SeekImpl()`. These
// "Impl" functions are responsible for positioning `raw_key_` but not
// invoking `UpdateKey()`.
template <class TValue>
class BlockIter : public InternalIteratorBase<TValue> {
 public:
  // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do
  // nothing. Calls cleanup functions.
  virtual void Invalidate(const Status& s) {
    // Assert that the BlockIter is never deleted while Pinning is Enabled.
    assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled());

    data_ = nullptr;
    current_ = restarts_;
    status_ = s;

    // Call cleanup callbacks.
    Cleanable::Reset();
  }

  bool Valid() const override { return current_ < restarts_; }

  virtual void SeekToFirst() override final {
    SeekToFirstImpl();
    UpdateKey();
  }

  virtual void SeekToLast() override final {
    SeekToLastImpl();
    UpdateKey();
  }

  virtual void Seek(const Slice& target) override final {
    SeekImpl(target);
    UpdateKey();
  }

  virtual void SeekForPrev(const Slice& target) override final {
    SeekForPrevImpl(target);
    UpdateKey();
  }

  virtual void Next() override final {
    NextImpl();
    UpdateKey();
  }

  virtual bool NextAndGetResult(IterateResult* result) override final {
    // This does not need to call `UpdateKey()` as the parent class only has
    // access to the `UpdateKey()`-invoking functions.
    return InternalIteratorBase<TValue>::NextAndGetResult(result);
  }

  virtual void Prev() override final {
    PrevImpl();
    UpdateKey();
  }

  Status status() const override { return status_; }
  Slice key() const override {
    assert(Valid());
    return key_;
  }
#ifndef NDEBUG
  ~BlockIter() override {
    // Assert that the BlockIter is never deleted while Pinning is Enabled.
    assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled());
    status_.PermitUncheckedError();
  }
  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
    pinned_iters_mgr_ = pinned_iters_mgr;
  }
  PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
#endif

  bool IsKeyPinned() const override {
    return block_contents_pinned_ && key_pinned_;
  }

  bool IsValuePinned() const override { return block_contents_pinned_; }

  size_t TEST_CurrentEntrySize() { return NextEntryOffset() - current_; }

  uint32_t ValueOffset() const {
    return static_cast<uint32_t>(value_.data() - data_);
  }

  void SetCacheHandle(Cache::Handle* handle) { cache_handle_ = handle; }

  Cache::Handle* cache_handle() { return cache_handle_; }

 protected:
  std::unique_ptr<InternalKeyComparator> icmp_;
  const char* data_;       // underlying block contents
  uint32_t num_restarts_;  // Number of uint32_t entries in restart array

  // Index of restart block in which current_ or current_-1 falls
  uint32_t restart_index_;
  uint32_t restarts_;  // Offset of restart array (list of fixed32)
  // current_ is offset in data_ of current entry. >= restarts_ if !Valid
  uint32_t current_;
  // Raw key from block.
  IterKey raw_key_;
  // Buffer for key data when global seqno assignment is enabled.
  IterKey key_buf_;
  Slice value_;
  Status status_;
  // Key to be exposed to users.
  Slice key_;
  bool key_pinned_;
  // Whether the block data is guaranteed to outlive this iterator, and
  // as long as the cleanup functions are transferred to another class,
  // e.g. PinnableSlice, the pointer to the bytes will still be valid.
  bool block_contents_pinned_;
  SequenceNumber global_seqno_;

  virtual void SeekToFirstImpl() = 0;
  virtual void SeekToLastImpl() = 0;
  virtual void SeekImpl(const Slice& target) = 0;
  virtual void SeekForPrevImpl(const Slice& target) = 0;
  virtual void NextImpl() = 0;

  virtual void PrevImpl() = 0;

  template <typename DecodeEntryFunc>
  inline bool ParseNextKey(bool* is_shared);

  void InitializeBase(const Comparator* raw_ucmp, const char* data,
                      uint32_t restarts, uint32_t num_restarts,
                      SequenceNumber global_seqno, bool block_contents_pinned) {
    assert(data_ == nullptr);  // Ensure it is called only once
    assert(num_restarts > 0);  // Ensure the param is valid

    icmp_ = std::make_unique<InternalKeyComparator>(raw_ucmp);
    data_ = data;
    restarts_ = restarts;
    num_restarts_ = num_restarts;
    current_ = restarts_;
    restart_index_ = num_restarts_;
    global_seqno_ = global_seqno;
    block_contents_pinned_ = block_contents_pinned;
    cache_handle_ = nullptr;
  }

  // Must be called every time a key is found that needs to be returned to the
  // user, and may be called when no key is found (as a no-op). Updates `key_`,
  // `key_buf_`, and `key_pinned_` with info about the found key.
  void UpdateKey() {
    key_buf_.Clear();
    if (!Valid()) {
      return;
    }
    if (raw_key_.IsUserKey()) {
      assert(global_seqno_ == kDisableGlobalSequenceNumber);
      key_ = raw_key_.GetUserKey();
      key_pinned_ = raw_key_.IsKeyPinned();
    } else if (global_seqno_ == kDisableGlobalSequenceNumber) {
      key_ = raw_key_.GetInternalKey();
      key_pinned_ = raw_key_.IsKeyPinned();
    } else {
      key_buf_.SetInternalKey(raw_key_.GetUserKey(), global_seqno_,
                              ExtractValueType(raw_key_.GetInternalKey()));
      key_ = key_buf_.GetInternalKey();
      key_pinned_ = false;
    }
  }

  // Returns the result of `Comparator::Compare()`, where the appropriate
  // comparator is used for the block contents, the LHS argument is the current
  // key with global seqno applied, and the RHS argument is `other`.
  int CompareCurrentKey(const Slice& other) {
    if (raw_key_.IsUserKey()) {
      assert(global_seqno_ == kDisableGlobalSequenceNumber);
      return icmp_->user_comparator()->Compare(raw_key_.GetUserKey(), other);
    } else if (global_seqno_ == kDisableGlobalSequenceNumber) {
      return icmp_->Compare(raw_key_.GetInternalKey(), other);
    }
    return icmp_->Compare(raw_key_.GetInternalKey(), global_seqno_, other,
                          kDisableGlobalSequenceNumber);
  }

 private:
  // Store the cache handle, if the block is cached. We need this since the
  // only other place the handle is stored is as an argument to the Cleanable
  // function callback, which is hard to retrieve. When multiple value
  // PinnableSlices reference the block, they need the cache handle in order
  // to bump up the ref count.
  Cache::Handle* cache_handle_;

 public:
  // Return the offset in data_ just past the end of the current entry.
  inline uint32_t NextEntryOffset() const {
    // NOTE: We don't support blocks bigger than 2GB
    return static_cast<uint32_t>((value_.data() + value_.size()) - data_);
  }

  uint32_t GetRestartPoint(uint32_t index) {
    assert(index < num_restarts_);
    return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
  }

  void SeekToRestartPoint(uint32_t index) {
    raw_key_.Clear();
    restart_index_ = index;
    // current_ will be fixed by ParseNextKey();

    // ParseNextKey() starts at the end of value_, so set value_ accordingly
    uint32_t offset = GetRestartPoint(index);
    value_ = Slice(data_ + offset, 0);
  }

  void CorruptionError();

 protected:
  template <typename DecodeKeyFunc>
  inline bool BinarySeek(const Slice& target, uint32_t* index,
                         bool* is_index_key_result);

  void FindKeyAfterBinarySeek(const Slice& target, uint32_t index,
                              bool is_index_key_result);
};
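
// A sketch of the "Impl" contract described above (hypothetical subclass,
// for illustration only): subclasses position the iterator but never call
// UpdateKey(); the final Seek()/Next()/... wrappers in BlockIter invoke it
// exactly once per movement.
//
//   class MyBlockIter : public BlockIter<Slice> {
//    protected:
//     void SeekImpl(const Slice& target) override {
//       // binary-search the restart array, then linear-scan entries,
//       // setting raw_key_, value_, and current_ -- but not key_.
//     }
//     // ... other *Impl() overrides ...
//   };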

class DataBlockIter final : public BlockIter<Slice> {
 public:
  DataBlockIter()
      : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {}
  DataBlockIter(const Comparator* raw_ucmp, const char* data, uint32_t restarts,
                uint32_t num_restarts, SequenceNumber global_seqno,
                BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned,
                DataBlockHashIndex* data_block_hash_index)
      : DataBlockIter() {
    Initialize(raw_ucmp, data, restarts, num_restarts, global_seqno,
               read_amp_bitmap, block_contents_pinned, data_block_hash_index);
  }
  void Initialize(const Comparator* raw_ucmp, const char* data,
                  uint32_t restarts, uint32_t num_restarts,
                  SequenceNumber global_seqno,
                  BlockReadAmpBitmap* read_amp_bitmap,
                  bool block_contents_pinned,
                  DataBlockHashIndex* data_block_hash_index) {
    InitializeBase(raw_ucmp, data, restarts, num_restarts, global_seqno,
                   block_contents_pinned);
    raw_key_.SetIsUserKey(false);
    read_amp_bitmap_ = read_amp_bitmap;
    last_bitmap_offset_ = current_ + 1;
    data_block_hash_index_ = data_block_hash_index;
  }

  Slice value() const override {
    assert(Valid());
    if (read_amp_bitmap_ && current_ < restarts_ &&
        current_ != last_bitmap_offset_) {
      read_amp_bitmap_->Mark(current_ /* current entry offset */,
                             NextEntryOffset() - 1);
      last_bitmap_offset_ = current_;
    }
    return value_;
  }

  inline bool SeekForGet(const Slice& target) {
    if (!data_block_hash_index_) {
      SeekImpl(target);
      UpdateKey();
      return true;
    }
    bool res = SeekForGetImpl(target);
    UpdateKey();
    return res;
  }

  void Invalidate(const Status& s) override {
    BlockIter::Invalidate(s);
    // Clear prev entries cache.
    prev_entries_keys_buff_.clear();
    prev_entries_.clear();
    prev_entries_idx_ = -1;
  }

 protected:
  friend Block;
  inline bool ParseNextDataKey(bool* is_shared);
  void SeekToFirstImpl() override;
  void SeekToLastImpl() override;
  void SeekImpl(const Slice& target) override;
  void SeekForPrevImpl(const Slice& target) override;
  void NextImpl() override;
  void PrevImpl() override;
 private:
  // read-amp bitmap
  BlockReadAmpBitmap* read_amp_bitmap_;
  // last `current_` value we report to the read-amp bitmap
  mutable uint32_t last_bitmap_offset_;
  struct CachedPrevEntry {
    explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr,
                             size_t _key_offset, size_t _key_size, Slice _value)
        : offset(_offset),
          key_ptr(_key_ptr),
          key_offset(_key_offset),
          key_size(_key_size),
          value(_value) {}

    // offset of entry in block
    uint32_t offset;
    // Pointer to key data in block (nullptr if key is delta-encoded)
    const char* key_ptr;
    // offset of key in prev_entries_keys_buff_ (0 if key_ptr is not nullptr)
    size_t key_offset;
    // size of key
    size_t key_size;
    // value slice pointing to data in block
    Slice value;
  };
  std::string prev_entries_keys_buff_;
  std::vector<CachedPrevEntry> prev_entries_;
  int32_t prev_entries_idx_ = -1;

  DataBlockHashIndex* data_block_hash_index_;

  bool SeekForGetImpl(const Slice& target);
};

// Iterator over MetaBlocks. MetaBlocks are similar to Data Blocks and
// are used to store Properties associated with the table.
// Meta blocks always store user keys (no sequence number) and always
// use the BytewiseComparator. Additionally, MetaBlock accesses are
// not recorded in the Statistics or for Read-Amplification.
class MetaBlockIter final : public BlockIter<Slice> {
 public:
  MetaBlockIter() : BlockIter() { raw_key_.SetIsUserKey(true); }
  void Initialize(const char* data, uint32_t restarts, uint32_t num_restarts,
                  bool block_contents_pinned) {
    // Initializes the iterator with a BytewiseComparator and
    // the raw key being a user key.
    InitializeBase(BytewiseComparator(), data, restarts, num_restarts,
                   kDisableGlobalSequenceNumber, block_contents_pinned);
    raw_key_.SetIsUserKey(true);
  }

  Slice value() const override {
    assert(Valid());
    return value_;
  }

 protected:
  void SeekToFirstImpl() override;
  void SeekToLastImpl() override;
  void SeekImpl(const Slice& target) override;
  void SeekForPrevImpl(const Slice& target) override;
  void NextImpl() override;
  void PrevImpl() override;
};
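
// A minimal usage sketch (illustrative; `block` holds a parsed metaindex or
// properties block):
//
//   std::unique_ptr<MetaBlockIter> mit(block.NewMetaIterator());
//   for (mit->SeekToFirst(); mit->Valid(); mit->Next()) {
//     // mit->key() is a user key (e.g., a property name) and
//     // mit->value() is its raw value.
//   }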

class IndexBlockIter final : public BlockIter<IndexValue> {
 public:
  IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {}

  // key_includes_seq, default true, means that the keys are in internal key
  // format.
  // value_is_full, default true, means that no delta encoding is
  // applied to values.
  void Initialize(const Comparator* raw_ucmp, const char* data,
                  uint32_t restarts, uint32_t num_restarts,
                  SequenceNumber global_seqno, BlockPrefixIndex* prefix_index,
                  bool have_first_key, bool key_includes_seq,
                  bool value_is_full, bool block_contents_pinned) {
    InitializeBase(raw_ucmp, data, restarts, num_restarts,
                   kDisableGlobalSequenceNumber, block_contents_pinned);
    raw_key_.SetIsUserKey(!key_includes_seq);
    prefix_index_ = prefix_index;
    value_delta_encoded_ = !value_is_full;
    have_first_key_ = have_first_key;
    if (have_first_key_ && global_seqno != kDisableGlobalSequenceNumber) {
      global_seqno_state_.reset(new GlobalSeqnoState(global_seqno));
    } else {
      global_seqno_state_.reset();
    }
  }

  Slice user_key() const override {
    assert(Valid());
    return raw_key_.GetUserKey();
  }

  IndexValue value() const override {
    assert(Valid());
    if (value_delta_encoded_ || global_seqno_state_ != nullptr) {
      return decoded_value_;
    } else {
      IndexValue entry;
      Slice v = value_;
      Status decode_s __attribute__((__unused__)) =
          entry.DecodeFrom(&v, have_first_key_, nullptr);
      assert(decode_s.ok());
      return entry;
    }
  }

  bool IsValuePinned() const override {
    return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned();
  }

 protected:
  // IndexBlockIter follows a different contract for prefix iterator
  // from data iterators.
  // If prefix of the seek key `target` exists in the file, it must
  // return the same result as total order seek.
  // If the prefix of `target` doesn't exist in the file, it can either
  // return the result of total order seek, or set both of Valid() = false
  // and status() = NotFound().
  void SeekImpl(const Slice& target) override;

  void SeekForPrevImpl(const Slice&) override {
    assert(false);
    current_ = restarts_;
    restart_index_ = num_restarts_;
    status_ = Status::InvalidArgument(
        "RocksDB internal error: should never call SeekForPrev() on index "
        "blocks");
    raw_key_.Clear();
    value_.clear();
  }

  void PrevImpl() override;

  void NextImpl() override;

  void SeekToFirstImpl() override;

  void SeekToLastImpl() override;
 private:
  bool value_delta_encoded_;
  bool have_first_key_;  // value includes first_internal_key
  BlockPrefixIndex* prefix_index_;
  // Whether the value is delta encoded. In that case the value is assumed to
  // be a BlockHandle. The first value in each restart interval is the full
  // encoded BlockHandle; the rest encode only the size part of the
  // BlockHandle. The offset of a delta encoded BlockHandle is computed by
  // adding the sizes of the previous delta encoded values in the same restart
  // interval to the offset of the first value in that restart interval.
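  //
  // Illustrative example of the scheme above: if a restart interval holds
  // three entries and the first stores the full handle (offset o0, size s0),
  // the second and third entries store only their sizes s1 and s2; their
  // offsets are reconstructed from o0 and the sizes preceding them in the
  // interval (see DecodeCurrentValue() for the exact arithmetic).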
  IndexValue decoded_value_;

  // When sequence number overwriting is enabled, this struct contains the
  // seqno to overwrite with, and the current first_internal_key with the
  // overwritten seqno. This is rarely used, so we put it behind a pointer
  // and only allocate when needed.
  struct GlobalSeqnoState {
    // First internal key according to current index entry, but with sequence
    // number overwritten to global_seqno.
    IterKey first_internal_key;
    SequenceNumber global_seqno;

    explicit GlobalSeqnoState(SequenceNumber seqno) : global_seqno(seqno) {}
  };

  std::unique_ptr<GlobalSeqnoState> global_seqno_state_;

  // Set *prefix_may_exist to false if no key can possibly share the same
  // prefix as `target`. If not set, the result position should be the same
  // as total order Seek.
  bool PrefixSeek(const Slice& target, uint32_t* index, bool* prefix_may_exist);
  // Set *prefix_may_exist to false if no key can possibly share the same
  // prefix as `target`. If not set, the result position should be the same
  // as total order seek.
  bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
                            uint32_t left, uint32_t right, uint32_t* index,
                            bool* prefix_may_exist);
  inline int CompareBlockKey(uint32_t block_index, const Slice& target);

  inline bool ParseNextIndexKey();

  // When value_delta_encoded_ is enabled it decodes the value, which is
  // assumed to be a BlockHandle, and puts it into decoded_value_
  inline void DecodeCurrentValue(bool is_shared);
};

}  // namespace ROCKSDB_NAMESPACE