// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "table/merging_iterator.h"

#include "db/arena_wrapped_db_iter.h"
#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"
#include "memory/arena.h"
#include "monitoring/perf_context_imp.h"
#include "rocksdb/comparator.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "table/internal_iterator.h"
#include "table/iter_heap.h"
#include "table/iterator_wrapper.h"
#include "test_util/sync_point.h"
#include "util/autovector.h"
#include "util/heap.h"
#include "util/stop_watch.h"

namespace ROCKSDB_NAMESPACE {
// For merging iterator to process range tombstones, we treat the start and end
// keys of a range tombstone as point keys and put them into the minHeap/maxHeap
// used in merging iterator. Take minHeap for example: we are able to keep track
// of currently "active" range tombstones (the ones whose start keys are popped
// but end keys are still in the heap) in `active_`. This `active_` set of range
// tombstones is then used to quickly determine whether the point key at heap
// top is deleted (by heap property, the point key at heap top must be within
// internal key range of active range tombstones).
//
// The HeapItem struct represents 3 types of elements in the minHeap/maxHeap:
// point key and the start and end keys of a range tombstone.
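//
// For illustration only: suppose level 0 has a range tombstone [c, m) and
// level 1 has a point key e@5. minHeap_ initially holds c (DELETE_RANGE_START,
// level 0) and e@5 (point key, level 1). When c reaches the heap top,
// PopDeleteRangeStart() replaces it with m (DELETE_RANGE_END, level 0) and
// adds level 0 to `active_`. When e@5 later reaches the heap top, level 0 is
// in `active_` and 0 < 1, so e@5 must lie within [c, m) and is skipped (see
// SkipNextDeleted()).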
struct HeapItem {
  HeapItem() = default;

  enum Type { ITERATOR, DELETE_RANGE_START, DELETE_RANGE_END };
  IteratorWrapper iter;
  size_t level = 0;
  std::string pinned_key;
  // Will be overwritten before use, initialize here so compiler does not
  // complain.
  Type type = ITERATOR;

  explicit HeapItem(size_t _level, InternalIteratorBase<Slice>* _iter)
      : level(_level), type(Type::ITERATOR) {
    iter.Set(_iter);
  }

  void SetTombstoneKey(ParsedInternalKey&& pik) {
    pinned_key.clear();
    // Range tombstone end key is exclusive. If a point internal key has the
    // same user key and sequence number as the start or end key of a range
    // tombstone, the order will be start < end key < internal key with the
    // following op_type change. This is helpful to ensure keys popped from
    // heap are in expected order since range tombstone start/end keys will
    // be distinct from point internal keys. Strictly speaking, this is only
    // needed for tombstone end points that are truncated in
    // TruncatedRangeDelIterator since untruncated tombstone end points always
    // have kMaxSequenceNumber and kTypeRangeDeletion (see
    // TruncatedRangeDelIterator::start_key()/end_key()).
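    // For example (illustration only): with user key "k" and sequence 5, the
    // pinned end key becomes ("k", 5, kTypeMaxValid). The internal key
    // comparator orders keys by ascending user key, then descending sequence,
    // then descending type; since kTypeMaxValid is the largest type value,
    // this end key sorts before a point key ("k", 5, kTypeValue) and is popped
    // from the min heap first, matching the exclusive end key semantics.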
    ParsedInternalKey p(pik.user_key, pik.sequence, kTypeMaxValid);
    AppendInternalKey(&pinned_key, p);
  }

  Slice key() const {
    if (type == Type::ITERATOR) {
      return iter.key();
    }
    return pinned_key;
  }

  bool IsDeleteRangeSentinelKey() const {
    if (type == Type::ITERATOR) {
      return iter.IsDeleteRangeSentinelKey();
    }
    return false;
  }
};

class MinHeapItemComparator {
 public:
  MinHeapItemComparator(const InternalKeyComparator* comparator)
      : comparator_(comparator) {}
  bool operator()(HeapItem* a, HeapItem* b) const {
    return comparator_->Compare(a->key(), b->key()) > 0;
  }

 private:
  const InternalKeyComparator* comparator_;
};

class MaxHeapItemComparator {
 public:
  MaxHeapItemComparator(const InternalKeyComparator* comparator)
      : comparator_(comparator) {}
  bool operator()(HeapItem* a, HeapItem* b) const {
    return comparator_->Compare(a->key(), b->key()) < 0;
  }

 private:
  const InternalKeyComparator* comparator_;
};
// Without the anonymous namespace here, we would fail the
// -Wmissing-prototypes warning.
namespace {
using MergerMinIterHeap = BinaryHeap<HeapItem*, MinHeapItemComparator>;
using MergerMaxIterHeap = BinaryHeap<HeapItem*, MaxHeapItemComparator>;
}  // namespace

class MergingIterator : public InternalIterator {
 public:
  MergingIterator(const InternalKeyComparator* comparator,
                  InternalIterator** children, int n, bool is_arena_mode,
                  bool prefix_seek_mode,
                  const Slice* iterate_upper_bound = nullptr)
      : is_arena_mode_(is_arena_mode),
        prefix_seek_mode_(prefix_seek_mode),
        direction_(kForward),
        comparator_(comparator),
        current_(nullptr),
        minHeap_(comparator_),
        pinned_iters_mgr_(nullptr),
        iterate_upper_bound_(iterate_upper_bound) {
    children_.resize(n);
    for (int i = 0; i < n; i++) {
      children_[i].level = i;
      children_[i].iter.Set(children[i]);
    }
  }

  void considerStatus(Status s) {
    if (!s.ok() && status_.ok()) {
      status_ = s;
    }
  }

  virtual void AddIterator(InternalIterator* iter) {
    children_.emplace_back(children_.size(), iter);
    if (pinned_iters_mgr_) {
      iter->SetPinnedItersMgr(pinned_iters_mgr_);
    }
    // Invalidate to ensure `Seek*()` is called to construct the heaps before
    // use.
    current_ = nullptr;
  }

  // Merging iterator can optionally process range tombstones: if a key is
  // covered by a range tombstone, the merging iterator will skip it instead
  // of outputting it.
  //
  // Add the next range tombstone iterator to this merging iterator.
  // There must be either no range tombstone iterator, or the same number of
  // range tombstone iterators as point iterators after all range tombstone
  // iters are added. The i-th added range tombstone iterator and the i-th point
  // iterator must point to the same sorted run.
  // Merging iterator takes ownership of the range tombstone iterator and
  // is responsible for freeing it. Note that during Iterator::Refresh()
  // and when a level iterator moves to a different SST file, the range
  // tombstone iterator could be updated. In that case, the merging iterator
  // is only responsible for freeing the new range tombstone iterator
  // that it has pointers to in range_tombstone_iters_.
  void AddRangeTombstoneIterator(TruncatedRangeDelIterator* iter) {
    range_tombstone_iters_.emplace_back(iter);
  }

  // Called by MergeIteratorBuilder when all point iterators and range
  // tombstone iterators are added. Initializes HeapItems for range tombstone
  // iterators so that no further allocation is needed for HeapItem.
  void Finish() {
    if (!range_tombstone_iters_.empty()) {
      pinned_heap_item_.resize(range_tombstone_iters_.size());
      for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
        pinned_heap_item_[i].level = i;
      }
    }
  }

  ~MergingIterator() override {
    for (auto child : range_tombstone_iters_) {
      delete child;
    }

    for (auto& child : children_) {
      child.iter.DeleteIter(is_arena_mode_);
    }
    status_.PermitUncheckedError();
  }

  bool Valid() const override { return current_ != nullptr && status_.ok(); }

  Status status() const override { return status_; }

  // Add range_tombstone_iters_[level] into min heap.
  // Updates active_ if the end key of a range tombstone is inserted.
  // @param start_key specifies which end point of the range tombstone to add.
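  // For illustration only: for a tombstone [s, e) at level L, pushing s
  // (DELETE_RANGE_START) does not change `active_`. When s is popped,
  // PopDeleteRangeStart() re-inserts the item as e (DELETE_RANGE_END) and adds
  // L to `active_`; when e is popped, SkipNextDeleted() removes L from
  // `active_` and advances the tombstone iterator at level L.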
  void InsertRangeTombstoneToMinHeap(size_t level, bool start_key = true,
                                     bool replace_top = false) {
    assert(!range_tombstone_iters_.empty() &&
           range_tombstone_iters_[level]->Valid());
    if (start_key) {
      ParsedInternalKey pik = range_tombstone_iters_[level]->start_key();
      // iterate_upper_bound does not have timestamp
      if (iterate_upper_bound_ &&
          comparator_->user_comparator()->CompareWithoutTimestamp(
              pik.user_key, true /* a_has_ts */, *iterate_upper_bound_,
              false /* b_has_ts */) >= 0) {
        if (replace_top) {
          // replace_top implies this range tombstone iterator is still in
          // minHeap_ and at the top.
          minHeap_.pop();
        }
        return;
      }
      pinned_heap_item_[level].SetTombstoneKey(std::move(pik));
      pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_START;
      assert(active_.count(level) == 0);
    } else {
      // allow end key to go over upper bound (if present) since start key is
      // before upper bound and the range tombstone could still cover a
      // range before upper bound.
      pinned_heap_item_[level].SetTombstoneKey(
          range_tombstone_iters_[level]->end_key());
      pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_END;
      active_.insert(level);
    }
    if (replace_top) {
      minHeap_.replace_top(&pinned_heap_item_[level]);
    } else {
      minHeap_.push(&pinned_heap_item_[level]);
    }
  }

  // Add range_tombstone_iters_[level] into max heap.
  // Updates active_ if the start key of a range tombstone is inserted.
  // @param end_key specifies which end point of the range tombstone to add.
  void InsertRangeTombstoneToMaxHeap(size_t level, bool end_key = true,
                                     bool replace_top = false) {
    assert(!range_tombstone_iters_.empty() &&
           range_tombstone_iters_[level]->Valid());
    if (end_key) {
      pinned_heap_item_[level].SetTombstoneKey(
          range_tombstone_iters_[level]->end_key());
      pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_END;
      assert(active_.count(level) == 0);
    } else {
      pinned_heap_item_[level].SetTombstoneKey(
          range_tombstone_iters_[level]->start_key());
      pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_START;
      active_.insert(level);
    }
    if (replace_top) {
      maxHeap_->replace_top(&pinned_heap_item_[level]);
    } else {
      maxHeap_->push(&pinned_heap_item_[level]);
    }
  }

  // Remove HeapItems from top of minHeap_ that are of type DELETE_RANGE_START
  // until minHeap_ is empty or the top of the minHeap_ is not of type
  // DELETE_RANGE_START. Each such item means a range tombstone becomes active,
  // so `active_` is updated accordingly.
  void PopDeleteRangeStart() {
    while (!minHeap_.empty() &&
           minHeap_.top()->type == HeapItem::DELETE_RANGE_START) {
      TEST_SYNC_POINT_CALLBACK("MergeIterator::PopDeleteRangeStart", nullptr);
      // insert the end key of this range tombstone and update active_
      InsertRangeTombstoneToMinHeap(
          minHeap_.top()->level, false /* start_key */, true /* replace_top */);
    }
  }

  // Remove HeapItems from top of maxHeap_ that are of type DELETE_RANGE_END
  // until maxHeap_ is empty or the top of the maxHeap_ is not of type
  // DELETE_RANGE_END. Each such item means a range tombstone becomes active,
  // so `active_` is updated accordingly.
  void PopDeleteRangeEnd() {
    while (!maxHeap_->empty() &&
           maxHeap_->top()->type == HeapItem::DELETE_RANGE_END) {
      // insert the start key of this range tombstone and update active_
      InsertRangeTombstoneToMaxHeap(maxHeap_->top()->level, false /* end_key */,
                                    true /* replace_top */);
    }
  }

  void SeekToFirst() override {
    ClearHeaps();
    status_ = Status::OK();
    for (auto& child : children_) {
      child.iter.SeekToFirst();
      AddToMinHeapOrCheckStatus(&child);
    }

    for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
      if (range_tombstone_iters_[i]) {
        range_tombstone_iters_[i]->SeekToFirst();
        if (range_tombstone_iters_[i]->Valid()) {
          // It is possible to be invalid due to snapshots.
          InsertRangeTombstoneToMinHeap(i);
        }
      }
    }
    FindNextVisibleKey();
    direction_ = kForward;
    current_ = CurrentForward();
  }

  void SeekToLast() override {
    ClearHeaps();
    InitMaxHeap();
    status_ = Status::OK();
    for (auto& child : children_) {
      child.iter.SeekToLast();
      AddToMaxHeapOrCheckStatus(&child);
    }

    for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
      if (range_tombstone_iters_[i]) {
        range_tombstone_iters_[i]->SeekToLast();
        if (range_tombstone_iters_[i]->Valid()) {
          // It is possible to be invalid due to snapshots.
          InsertRangeTombstoneToMaxHeap(i);
        }
      }
    }
    FindPrevVisibleKey();
    direction_ = kReverse;
    current_ = CurrentReverse();
  }

  // Position this merging iterator at the first key >= target (internal key).
  // If range tombstones are present, keys covered by range tombstones are
  // skipped, and this merging iter points to the first non-range-deleted key >=
  // target after Seek(). If !Valid() and status().ok() then end of the iterator
  // is reached.
  //
  // Internally, this involves positioning all child iterators at the first key
  // >= target. If range tombstones are present, we apply a similar
  // optimization, cascading seek, as in Pebble
  // (https://github.com/cockroachdb/pebble). Specifically, if there is a range
  // tombstone [start, end) that covers the target user key at level L, then
  // this range tombstone must cover the range [target key, end) in all levels >
  // L. So for all levels > L, we can pretend the target key is `end`. This
  // optimization is applied at each level and hence the name "cascading seek".
  // After a round of (cascading) seeks, the top of the heap is checked to see
  // if it is covered by a range tombstone (see FindNextVisibleKey() for more
  // detail), and advanced if so. The process is repeated until a
  // non-range-deleted key is at the top of the heap, or the heap becomes empty.
  //
  // As mentioned in comments above HeapItem, to make the checking of whether
  // top of the heap is covered by some range tombstone efficient, we treat each
  // range deletion [start, end) as two point keys and insert them into the same
  // min/maxHeap_ where point iterators are. The set `active_` tracks the levels
  // that have active range tombstones. If level L is in `active_`, and the
  // point key at top of the heap is from level >= L, then the point key is
  // within the internal key range of the range tombstone that
  // range_tombstone_iters_[L] currently points to. For correctness reasoning,
  // one invariant that Seek() (and every other public API: Seek*(),
  // Next()/Prev()) guarantees is as follows. After Seek(), suppose `k` is the
  // current key of level L's point iterator. Then for each range tombstone
  // iterator at level <= L, it is at or before the first range tombstone with
  // end key > `k`. This ensures that when level L's point iterator reaches top
  // of the heap, `active_` is calculated correctly (it contains the covering
  // range tombstone's level if there is one), since no range tombstone iterator
  // was skipped beyond that point iterator's current key during Seek().
  // Next()/Prev() maintains a stronger version of this invariant where all
  // range tombstone iterators from level <= L are *at* the first range
  // tombstone with end key > `k`.
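  //
  // Cascading seek example (for illustration only): suppose Seek(t) finds
  // that level 1's range tombstone iterator is at a tombstone [s, e) with
  // s <= t < e in user key order. Every key in [t, e) at levels > 1 is
  // deleted by this tombstone (older levels only contain keys with smaller
  // sequence numbers), so levels 2, 3, ... can be seeked directly to e
  // instead of t. If level 3's tombstone iterator in turn covers e with some
  // [s', e'), levels 4, 5, ... are seeked to e', and so on.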
  void Seek(const Slice& target) override {
    assert(range_tombstone_iters_.empty() ||
           range_tombstone_iters_.size() == children_.size());
    SeekImpl(target);
    FindNextVisibleKey();

    direction_ = kForward;
    {
      PERF_TIMER_GUARD(seek_min_heap_time);
      current_ = CurrentForward();
    }
  }

  void SeekForPrev(const Slice& target) override {
    assert(range_tombstone_iters_.empty() ||
           range_tombstone_iters_.size() == children_.size());
    SeekForPrevImpl(target);
    FindPrevVisibleKey();

    direction_ = kReverse;
    {
      PERF_TIMER_GUARD(seek_max_heap_time);
      current_ = CurrentReverse();
    }
  }

  void Next() override {
    assert(Valid());
    // Ensure that all children are positioned after key().
    // If we are moving in the forward direction, it is already
    // true for all of the non-current children since current_ is
    // the smallest child and key() == current_->key().
    if (direction_ != kForward) {
      // The loop advanced all non-current children to be > key() so current_
      // should still be strictly the smallest key.
      SwitchToForward();
    }

    // For the heap modifications below to be correct, current_ must be the
    // current top of the heap.
    assert(current_ == CurrentForward());
    // current_ points to the current record; move the iterator forward.
    current_->Next();
    if (current_->Valid()) {
      // current is still valid after the Next() call above. Call
      // replace_top() to restore the heap property. When the same child
      // iterator yields a sequence of keys, this is cheap.
      assert(current_->status().ok());
      minHeap_.replace_top(minHeap_.top());
    } else {
      // current stopped being valid, remove it from the heap.
      considerStatus(current_->status());
      minHeap_.pop();
    }
    FindNextVisibleKey();
    current_ = CurrentForward();
  }

  bool NextAndGetResult(IterateResult* result) override {
    Next();
    bool is_valid = Valid();
    if (is_valid) {
      result->key = key();
      result->bound_check_result = UpperBoundCheckResult();
      result->value_prepared = current_->IsValuePrepared();
    }
    return is_valid;
  }

  void Prev() override {
    assert(Valid());
    // Ensure that all children are positioned before key().
    // If we are moving in the reverse direction, it is already
    // true for all of the non-current children since current_ is
    // the largest child and key() == current_->key().
    if (direction_ != kReverse) {
      // Otherwise, retreat the non-current children. We retreat current_
      // just after the if-block.
      SwitchToBackward();
    }

    // For the heap modifications below to be correct, current_ must be the
    // current top of the heap.
    assert(current_ == CurrentReverse());
    current_->Prev();
    if (current_->Valid()) {
      // current is still valid after the Prev() call above. Call
      // replace_top() to restore the heap property. When the same child
      // iterator yields a sequence of keys, this is cheap.
      assert(current_->status().ok());
      maxHeap_->replace_top(maxHeap_->top());
    } else {
      // current stopped being valid, remove it from the heap.
      considerStatus(current_->status());
      maxHeap_->pop();
    }
    FindPrevVisibleKey();
    current_ = CurrentReverse();
  }

  Slice key() const override {
    assert(Valid());
    return current_->key();
  }

  Slice value() const override {
    assert(Valid());
    return current_->value();
  }

  bool PrepareValue() override {
    assert(Valid());
    if (current_->PrepareValue()) {
      return true;
    }

    considerStatus(current_->status());
    assert(!status_.ok());
    return false;
  }

  // Here we simply relay the MayBeOutOfLowerBound/MayBeOutOfUpperBound result
  // from the current child iterator. As long as the current child iterator
  // reports that going out of bound is not possible, we know the current key
  // is within bounds.

  bool MayBeOutOfLowerBound() override {
    assert(Valid());
    return current_->MayBeOutOfLowerBound();
  }

  IterBoundCheck UpperBoundCheckResult() override {
    assert(Valid());
    return current_->UpperBoundCheckResult();
  }

  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
    pinned_iters_mgr_ = pinned_iters_mgr;
    for (auto& child : children_) {
      child.iter.SetPinnedItersMgr(pinned_iters_mgr);
    }
  }

  bool IsKeyPinned() const override {
    assert(Valid());
    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
           current_->IsKeyPinned();
  }

  bool IsValuePinned() const override {
    assert(Valid());
    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
           current_->IsValuePinned();
  }

 private:
  friend class MergeIteratorBuilder;
  // Clears heaps for both directions, used when changing direction or seeking
  void ClearHeaps(bool clear_active = true);
  // Ensures that maxHeap_ is initialized when starting to go in the reverse
  // direction
  void InitMaxHeap();

  // Advance this merging iterator until the current key (top of min heap) is
  // not covered by any range tombstone, or there are no more keys (heap is
  // empty). After this call, if Valid(), current_ points to the next key that
  // is not covered by any range tombstone.
  void FindNextVisibleKey();
  void FindPrevVisibleKey();

  void SeekImpl(const Slice& target, size_t starting_level = 0,
                bool range_tombstone_reseek = false);

  // Seek to the first key <= target key (internal key) for
  // children_[starting_level:].
  void SeekForPrevImpl(const Slice& target, size_t starting_level = 0,
                       bool range_tombstone_reseek = false);

  bool is_arena_mode_;
  bool prefix_seek_mode_;
  // Which direction is the iterator moving?
  enum Direction : uint8_t { kForward, kReverse };
  Direction direction_;
  const InternalKeyComparator* comparator_;
  // We could also use an autovector with a larger reserved size.
  // HeapItem for all child point iterators.
  std::vector<HeapItem> children_;
  // HeapItem for range tombstone start and end keys. Each range tombstone
  // iterator will have at most one side (start key or end key) in a heap
  // at the same time, so this vector will be of size children_.size().
  // pinned_heap_item_[i] corresponds to the start key and end key HeapItem
  // for range_tombstone_iters_[i].
  std::vector<HeapItem> pinned_heap_item_;
  // range_tombstone_iters_[i] contains range tombstones in the sorted run that
  // corresponds to children_[i]. range_tombstone_iters_.empty() means not
  // handling range tombstones in merging iterator. range_tombstone_iters_[i] ==
  // nullptr means the sorted run of children_[i] does not have range
  // tombstones.
  std::vector<TruncatedRangeDelIterator*> range_tombstone_iters_;

  // Levels (indices into range_tombstone_iters_/children_) that currently have
  // "active" range tombstones. See comments above Seek() for meaning of
  // "active".
  std::set<size_t> active_;

  bool SkipNextDeleted();
  bool SkipPrevDeleted();

  // Cached pointer to child iterator with the current key, or nullptr if no
  // child iterators are valid. This is the top of minHeap_ or maxHeap_
  // depending on the direction.
  IteratorWrapper* current_;
  // If any of the children have non-ok status, this is one of them.
  Status status_;
  MergerMinIterHeap minHeap_;

  // Max heap is used for reverse iteration, which is way less common than
  // forward. Lazily initialize it to save memory.
  std::unique_ptr<MergerMaxIterHeap> maxHeap_;
  PinnedIteratorsManager* pinned_iters_mgr_;

  // Used to bound range tombstones. For point keys, DBIter and SSTable iterator
  // take care of boundary checking.
  const Slice* iterate_upper_bound_;

  // In forward direction, process a child that is not in the min heap.
  // If valid, add to the min heap. Otherwise, check status.
  void AddToMinHeapOrCheckStatus(HeapItem*);

  // In backward direction, process a child that is not in the max heap.
  // If valid, add to the max heap. Otherwise, check status.
  void AddToMaxHeapOrCheckStatus(HeapItem*);

  void SwitchToForward();

  // Switch the direction from forward to backward without changing the
  // position. Iterator should still be valid.
  void SwitchToBackward();

  IteratorWrapper* CurrentForward() const {
    assert(direction_ == kForward);
    assert(minHeap_.empty() || minHeap_.top()->type == HeapItem::ITERATOR);
    return !minHeap_.empty() ? &minHeap_.top()->iter : nullptr;
  }

  IteratorWrapper* CurrentReverse() const {
    assert(direction_ == kReverse);
    assert(maxHeap_);
    assert(maxHeap_->empty() || maxHeap_->top()->type == HeapItem::ITERATOR);
    return !maxHeap_->empty() ? &maxHeap_->top()->iter : nullptr;
  }
};

// Seek to the first key >= target key (internal key) for
// children_[starting_level:]. Cascading seek optimizations are applied if
// range tombstones are present (see comment above Seek() for more).
//
// @param range_tombstone_reseek Whether target is some range tombstone
// end, i.e., whether this SeekImpl() call is a part of a "cascading seek". This
// is used only for recording relevant perf_context.
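//
// SeekImpl() proceeds in three steps: (1) put the already-positioned children
// and range tombstone boundaries for levels < starting_level back into
// minHeap_; (2) for each level >= starting_level, Seek() the point iterator
// and the range tombstone iterator to current_search_key, advancing
// current_search_key to a tombstone end key whenever that tombstone covers it
// (the cascading seek); (3) re-issue Seek() for children whose first attempt
// returned Status::TryAgain() (asynchronous block retrieval).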
void MergingIterator::SeekImpl(const Slice& target, size_t starting_level,
                               bool range_tombstone_reseek) {
  // active range tombstones before `starting_level` remain active
  ClearHeaps(false /* clear_active */);
  ParsedInternalKey pik;
  if (!range_tombstone_iters_.empty()) {
    // pik is only used in InsertRangeTombstoneToMinHeap().
    ParseInternalKey(target, &pik, false).PermitUncheckedError();
  }

  // TODO: perhaps we could save some upheap cost by adding all child iters
  // first and then doing a single heapify.
  for (size_t level = 0; level < starting_level; ++level) {
    PERF_TIMER_GUARD(seek_min_heap_time);
    AddToMinHeapOrCheckStatus(&children_[level]);
  }
  if (!range_tombstone_iters_.empty()) {
    // Add range tombstones from levels < starting_level. We can insert from
    // pinned_heap_item_ for the following reasons:
    // - pinned_heap_item_[level] is in minHeap_ iff
    //   range_tombstone_iters_[level]->Valid().
    // - If `level` is in active_, then range_tombstone_iters_[level]->Valid()
    //   and pinned_heap_item_[level] is of type DELETE_RANGE_END.
    for (size_t level = 0; level < starting_level; ++level) {
      if (range_tombstone_iters_[level] &&
          range_tombstone_iters_[level]->Valid()) {
        // use an iterator on active_ if performance becomes an issue here
        if (active_.count(level) > 0) {
          assert(pinned_heap_item_[level].type == HeapItem::DELETE_RANGE_END);
          // if it was active, then start key must be within upper_bound,
          // so we can add to minHeap_ directly.
          minHeap_.push(&pinned_heap_item_[level]);
        } else {
          // this takes care of checking iterate_upper_bound, but with an extra
          // key comparison if range_tombstone_iters_[level] was already out of
          // bound. Consider using a new HeapItem type or some flag to remember
          // boundary checking result.
          InsertRangeTombstoneToMinHeap(level);
        }
      } else {
        assert(!active_.count(level));
      }
    }
    // levels >= starting_level will be reseeked below, so clear their active
    // state here.
    active_.erase(active_.lower_bound(starting_level), active_.end());
  }

  status_ = Status::OK();
  IterKey current_search_key;
  current_search_key.SetInternalKey(target, false /* copy */);
  // Seek target might change to some range tombstone end key, so
  // we need to remember them for async requests.
  // (level, target) pairs
  autovector<std::pair<size_t, std::string>> prefetched_target;
  for (auto level = starting_level; level < children_.size(); ++level) {
    {
      PERF_TIMER_GUARD(seek_child_seek_time);
      children_[level].iter.Seek(current_search_key.GetInternalKey());
    }

    PERF_COUNTER_ADD(seek_child_seek_count, 1);

    if (!range_tombstone_iters_.empty()) {
      if (range_tombstone_reseek) {
        // This seek is to some range tombstone end key.
        // Should only happen when there are range tombstones.
        PERF_COUNTER_ADD(internal_range_del_reseek_count, 1);
      }
      if (children_[level].iter.status().IsTryAgain()) {
        prefetched_target.emplace_back(
            level, current_search_key.GetInternalKey().ToString());
      }
      auto range_tombstone_iter = range_tombstone_iters_[level];
      if (range_tombstone_iter) {
        range_tombstone_iter->Seek(current_search_key.GetUserKey());
        if (range_tombstone_iter->Valid()) {
          // insert the range tombstone end that is closer to and >=
          // current_search_key. Strictly speaking, since the Seek() call above
          // is on user key, it is possible that range_tombstone_iter->end_key()
          // < current_search_key. This can happen when range_tombstone_iter is
          // truncated and range_tombstone_iter.largest_ has the same user key
          // as current_search_key.GetUserKey() but with a larger sequence
          // number than current_search_key. Correctness is not affected as this
          // tombstone end key will be popped during FindNextVisibleKey().
          InsertRangeTombstoneToMinHeap(
              level, comparator_->Compare(range_tombstone_iter->start_key(),
                                          pik) > 0 /* start_key */);
          // current_search_key < end_key guaranteed by the Seek() and Valid()
          // calls above. Only interested in user key coverage since older
          // sorted runs must have smaller sequence numbers than this range
          // tombstone.
          //
          // TODO: range_tombstone_iter->Seek() finds the max covering
          // sequence number, can make it cheaper by not looking for max.
          if (comparator_->user_comparator()->Compare(
                  range_tombstone_iter->start_key().user_key,
                  current_search_key.GetUserKey()) <= 0) {
            // Since range_tombstone_iter->Valid(), seqno should be valid, so
            // there is no need to check it.
            range_tombstone_reseek = true;
            // Current target user key is covered by this range tombstone.
            // All older sorted runs will seek to range tombstone end key.
            // Note that for prefix seek case, it is possible that the prefix
            // is not the same as the original target, it should not affect
            // correctness. Besides, in most cases, range tombstone start and
            // end key should have the same prefix?
            // If range_tombstone_iter->end_key() is truncated to its largest_
            // boundary, the timestamp in user_key will not be max timestamp,
            // but the timestamp of `range_tombstone_iter.largest_`. This should
            // be fine here as current_search_key is used to Seek into lower
            // levels.
            current_search_key.SetInternalKey(
                range_tombstone_iter->end_key().user_key, kMaxSequenceNumber);
          }
        }
      }
    }
    // child.iter.status() is set to Status::TryAgain indicating asynchronous
    // request for retrieval of data blocks has been submitted. So it should
    // return at this point and Seek should be called again to retrieve the
    // requested block and add the child to min heap.
    if (children_[level].iter.status().IsTryAgain()) {
      continue;
    }
    {
      // Strictly, we timed slightly more than min heap operation,
      // but these operations are very cheap.
      PERF_TIMER_GUARD(seek_min_heap_time);
      AddToMinHeapOrCheckStatus(&children_[level]);
    }
  }

  if (range_tombstone_iters_.empty()) {
    for (auto& child : children_) {
      if (child.iter.status().IsTryAgain()) {
        child.iter.Seek(target);
        {
          PERF_TIMER_GUARD(seek_min_heap_time);
          AddToMinHeapOrCheckStatus(&child);
        }
        PERF_COUNTER_ADD(number_async_seek, 1);
      }
    }
  } else {
    for (auto& prefetch : prefetched_target) {
      // (level, target) pairs
      children_[prefetch.first].iter.Seek(prefetch.second);
      {
        PERF_TIMER_GUARD(seek_min_heap_time);
        AddToMinHeapOrCheckStatus(&children_[prefetch.first]);
      }
      PERF_COUNTER_ADD(number_async_seek, 1);
    }
  }
}

// Returns true iff the current key (min heap top) should not be returned
// to the user (of the merging iterator). This can be because the current key
// is deleted by some range tombstone, the current key is some fake file
// boundary sentinel key, or the current key is an end point of a range
// tombstone. Advance the iterator at heap top if needed. Heap order is restored
// and `active_` is updated accordingly.
// See FindNextVisibleKey() for more detail on internal implementation
// of advancing child iters.
//
// REQUIRES:
// - min heap is currently not empty, and iter is in kForward direction.
// - minHeap_ top is not DELETE_RANGE_START (so that `active_` is current).
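//
// Summary of the cases handled below: (1) heap top is a DELETE_RANGE_END key:
// the tombstone at that level stops being active, so remove the level from
// `active_` and advance its range tombstone iterator; (2) heap top is a file
// boundary sentinel key: advance the LevelIterator into the next SST file and
// refresh that level's tombstone boundary in the heap; (3) heap top is a point
// key: if a range tombstone from a newer level covers it, cascade-seek levels
// >= current->level to that tombstone's end key; if a tombstone from the same
// level covers it (checked by sequence number), simply advance the child
// iterator.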
bool MergingIterator::SkipNextDeleted() {
  // 3 types of keys:
  // - point key
  // - file boundary sentinel keys
  // - range deletion end key
  auto current = minHeap_.top();
  if (current->type == HeapItem::DELETE_RANGE_END) {
    active_.erase(current->level);
    assert(range_tombstone_iters_[current->level] &&
           range_tombstone_iters_[current->level]->Valid());
    range_tombstone_iters_[current->level]->Next();
    if (range_tombstone_iters_[current->level]->Valid()) {
      InsertRangeTombstoneToMinHeap(current->level, true /* start_key */,
                                    true /* replace_top */);
    } else {
      minHeap_.pop();
    }
    return true /* current key deleted */;
  }
  if (current->iter.IsDeleteRangeSentinelKey()) {
    // If the file boundary is defined by a range deletion, the range
    // tombstone's end key must come before this sentinel key (see op_type in
    // SetTombstoneKey()).
    assert(ExtractValueType(current->iter.key()) != kTypeRangeDeletion ||
           active_.count(current->level) == 0);
    // LevelIterator enters a new SST file
    current->iter.Next();
    if (current->iter.Valid()) {
      assert(current->iter.status().ok());
      minHeap_.replace_top(current);
    } else {
      minHeap_.pop();
    }
    // Remove last SST file's range tombstone end key if there is one.
    // This means file boundary is before range tombstone end key,
    // which could happen when a range tombstone and a user key
    // straddle two SST files. Note that in TruncatedRangeDelIterator
    // constructor, parsed_largest.sequence is decremented 1 in this case.
    if (!minHeap_.empty() && minHeap_.top()->level == current->level &&
        minHeap_.top()->type == HeapItem::DELETE_RANGE_END) {
      minHeap_.pop();
      active_.erase(current->level);
    }
    if (range_tombstone_iters_[current->level] &&
        range_tombstone_iters_[current->level]->Valid()) {
      InsertRangeTombstoneToMinHeap(current->level);
    }
    return true /* current key deleted */;
  }
  assert(current->type == HeapItem::ITERATOR);
  // Point key case: check active_ for range tombstone coverage.
  ParsedInternalKey pik;
  ParseInternalKey(current->iter.key(), &pik, false).PermitUncheckedError();
  if (!active_.empty()) {
    auto i = *active_.begin();
    if (i < current->level) {
      // range tombstone is from a newer level, definitely covers
      assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(),
                                  pik) <= 0);
      assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) <
             0);
      std::string target;
      AppendInternalKey(&target, range_tombstone_iters_[i]->end_key());
      SeekImpl(target, current->level, true);
      return true /* current key deleted */;
    } else if (i == current->level) {
      // range tombstone is from the same level as current, check sequence
      // number. By `active_` we know current key is between start key and end
      // key.
      assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(),
                                  pik) <= 0);
      assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) <
             0);
      if (pik.sequence < range_tombstone_iters_[current->level]->seq()) {
        // covered by range tombstone
        current->iter.Next();
        if (current->iter.Valid()) {
          minHeap_.replace_top(current);
        } else {
          minHeap_.pop();
        }
        return true /* current key deleted */;
      } else {
        return false /* current key not deleted */;
      }
    } else {
      // range tombstone from an older sorted run with current key < end key:
      // current key is not deleted, and the older sorted run will have its
      // range tombstone updated when the range tombstone's end key is popped
      // from minHeap_.
      return false /* current key not deleted */;
    }
  }
  // we can reach here only if active_ is empty
  assert(active_.empty());
  assert(minHeap_.top()->type == HeapItem::ITERATOR);
  return false /* current key not deleted */;
}

void MergingIterator::SeekForPrevImpl(const Slice& target,
                                      size_t starting_level,
                                      bool range_tombstone_reseek) {
  // active range tombstones before `starting_level` remain active
  ClearHeaps(false /* clear_active */);
  InitMaxHeap();
  ParsedInternalKey pik;
  if (!range_tombstone_iters_.empty()) {
    ParseInternalKey(target, &pik, false).PermitUncheckedError();
  }
  for (size_t level = 0; level < starting_level; ++level) {
    PERF_TIMER_GUARD(seek_max_heap_time);
    AddToMaxHeapOrCheckStatus(&children_[level]);
  }
  if (!range_tombstone_iters_.empty()) {
    // Add range tombstones before starting_level.
    for (size_t level = 0; level < starting_level; ++level) {
      if (range_tombstone_iters_[level] &&
          range_tombstone_iters_[level]->Valid()) {
        assert(static_cast<bool>(active_.count(level)) ==
               (pinned_heap_item_[level].type == HeapItem::DELETE_RANGE_START));
        maxHeap_->push(&pinned_heap_item_[level]);
      } else {
        assert(!active_.count(level));
      }
    }
    // levels >= starting_level will be reseeked below, so clear their active
    // state here.
    active_.erase(active_.lower_bound(starting_level), active_.end());
  }

  status_ = Status::OK();
  IterKey current_search_key;
  current_search_key.SetInternalKey(target, false /* copy */);
  // Seek target might change to some range tombstone end key, so
  // we need to remember them for async requests.
  // (level, target) pairs
  autovector<std::pair<size_t, std::string>> prefetched_target;
  for (auto level = starting_level; level < children_.size(); ++level) {
    {
      PERF_TIMER_GUARD(seek_child_seek_time);
      children_[level].iter.SeekForPrev(current_search_key.GetInternalKey());
    }

    PERF_COUNTER_ADD(seek_child_seek_count, 1);

    if (!range_tombstone_iters_.empty()) {
      if (range_tombstone_reseek) {
        // This seek is to some range tombstone end key.
        // Should only happen when there are range tombstones.
        PERF_COUNTER_ADD(internal_range_del_reseek_count, 1);
      }
      if (children_[level].iter.status().IsTryAgain()) {
        prefetched_target.emplace_back(
            level, current_search_key.GetInternalKey().ToString());
      }
      auto range_tombstone_iter = range_tombstone_iters_[level];
      if (range_tombstone_iter) {
        range_tombstone_iter->SeekForPrev(current_search_key.GetUserKey());
        if (range_tombstone_iter->Valid()) {
          InsertRangeTombstoneToMaxHeap(
              level, comparator_->Compare(range_tombstone_iter->end_key(),
                                          pik) <= 0 /* end_key */);
          // start key <= current_search_key guaranteed by the SeekForPrev()
          // call above. Only interested in user key coverage since older
          // sorted runs must have smaller sequence numbers than this
          // tombstone.
          if (comparator_->user_comparator()->Compare(
                  current_search_key.GetUserKey(),
                  range_tombstone_iter->end_key().user_key) < 0) {
            range_tombstone_reseek = true;
            current_search_key.SetInternalKey(
                range_tombstone_iter->start_key().user_key, kMaxSequenceNumber,
                kValueTypeForSeekForPrev);
          }
        }
      }
    }
    // child.iter.status() is set to Status::TryAgain indicating asynchronous
    // request for retrieval of data blocks has been submitted. So it should
    // return at this point and SeekForPrev should be called again to retrieve
    // the requested block and add the child to the max heap.
    if (children_[level].iter.status().IsTryAgain()) {
      continue;
    }
    {
      // Strictly, we timed slightly more than max heap operation,
      // but these operations are very cheap.
      PERF_TIMER_GUARD(seek_max_heap_time);
      AddToMaxHeapOrCheckStatus(&children_[level]);
    }
  }

  if (range_tombstone_iters_.empty()) {
    for (auto& child : children_) {
      if (child.iter.status().IsTryAgain()) {
        child.iter.SeekForPrev(target);
        {
          PERF_TIMER_GUARD(seek_min_heap_time);
          AddToMaxHeapOrCheckStatus(&child);
        }
        PERF_COUNTER_ADD(number_async_seek, 1);
      }
    }
  } else {
    for (auto& prefetch : prefetched_target) {
      // (level, target) pairs
      children_[prefetch.first].iter.SeekForPrev(prefetch.second);
      {
        PERF_TIMER_GUARD(seek_max_heap_time);
        AddToMaxHeapOrCheckStatus(&children_[prefetch.first]);
      }
      PERF_COUNTER_ADD(number_async_seek, 1);
    }
  }
}

// See more in comments above SkipNextDeleted().
// REQUIRES:
// - max heap is currently not empty, and iter is in kReverse direction.
// - maxHeap_ top is not DELETE_RANGE_END (so that `active_` is current).
bool MergingIterator::SkipPrevDeleted() {
  // 3 types of keys:
  // - point key
  // - file boundary sentinel keys
  // - range deletion start key
  auto current = maxHeap_->top();
  if (current->type == HeapItem::DELETE_RANGE_START) {
    active_.erase(current->level);
    assert(range_tombstone_iters_[current->level] &&
           range_tombstone_iters_[current->level]->Valid());
    range_tombstone_iters_[current->level]->Prev();
    if (range_tombstone_iters_[current->level]->Valid()) {
      InsertRangeTombstoneToMaxHeap(current->level, true /* end_key */,
                                    true /* replace_top */);
    } else {
      maxHeap_->pop();
    }
    return true /* current key deleted */;
  }
  if (current->iter.IsDeleteRangeSentinelKey()) {
    // LevelIterator enters a new SST file
    current->iter.Prev();
    if (current->iter.Valid()) {
      assert(current->iter.status().ok());
      maxHeap_->replace_top(current);
    } else {
      maxHeap_->pop();
    }
    if (!maxHeap_->empty() && maxHeap_->top()->level == current->level &&
        maxHeap_->top()->type == HeapItem::DELETE_RANGE_START) {
      maxHeap_->pop();
      active_.erase(current->level);
    }
    if (range_tombstone_iters_[current->level] &&
        range_tombstone_iters_[current->level]->Valid()) {
      InsertRangeTombstoneToMaxHeap(current->level);
    }
    return true /* current key deleted */;
  }
  assert(current->type == HeapItem::ITERATOR);
  // Point key case: check active_ for range tombstone coverage.
  ParsedInternalKey pik;
  ParseInternalKey(current->iter.key(), &pik, false).PermitUncheckedError();
  if (!active_.empty()) {
    auto i = *active_.begin();
    if (i < current->level) {
      // range tombstone is from a newer level, definitely covers
      assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(),
                                  pik) <= 0);
      assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) <
             0);
      std::string target;
      AppendInternalKey(&target, range_tombstone_iters_[i]->start_key());
      // This is different from SkipNextDeleted() which does reseek at sorted
      // runs >= level (instead of i+1 here). With min heap, if level L is at
      // top of the heap, then levels <L all have internal keys > level L's
      // current internal key, which means levels <L are already at a different
      // user key. With max heap, if level L is at top of the heap, then levels
      // <L all have internal keys smaller than level L's current internal key,
      // which might still be the same user key.
      SeekForPrevImpl(target, i + 1, true);
      return true /* current key deleted */;
    } else if (i == current->level) {
      // By `active_` we know current key is between start key and end key.
      assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(),
                                  pik) <= 0);
      assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) <
             0);
      if (pik.sequence < range_tombstone_iters_[current->level]->seq()) {
        current->iter.Prev();
        if (current->iter.Valid()) {
          maxHeap_->replace_top(current);
        } else {
          maxHeap_->pop();
        }
        return true /* current key deleted */;
      } else {
        return false /* current key not deleted */;
      }
    } else {
      return false /* current key not deleted */;
    }
  }

  assert(active_.empty());
  assert(maxHeap_->top()->type == HeapItem::ITERATOR);
  return false /* current key not deleted */;
}

void MergingIterator::AddToMinHeapOrCheckStatus(HeapItem* child) {
  if (child->iter.Valid()) {
    assert(child->iter.status().ok());
    minHeap_.push(child);
  } else {
    considerStatus(child->iter.status());
  }
}

void MergingIterator::AddToMaxHeapOrCheckStatus(HeapItem* child) {
  if (child->iter.Valid()) {
    assert(child->iter.status().ok());
    maxHeap_->push(child);
  } else {
    considerStatus(child->iter.status());
  }
}

// Advance all non-current_ children to > current_.key().
// We advance current_ after this function call as it does not require
// Seek().
// Advance all range tombstone iters, including the one corresponding to
// current_, to the first tombstone with end_key > current_.key().
// TODO: potentially do cascading seek here too
void MergingIterator::SwitchToForward() {
  ClearHeaps();
  Slice target = key();
  for (auto& child : children_) {
    if (&child.iter != current_) {
      child.iter.Seek(target);
      // child.iter.status() is set to Status::TryAgain indicating asynchronous
      // request for retrieval of data blocks has been submitted. So it should
      // return at this point and Seek should be called again to retrieve the
      // requested block and add the child to min heap.
      if (child.iter.status() == Status::TryAgain()) {
        continue;
      }
      if (child.iter.Valid() && comparator_->Equal(target, child.key())) {
        assert(child.iter.status().ok());
        child.iter.Next();
      }
    }
    AddToMinHeapOrCheckStatus(&child);
  }

  for (auto& child : children_) {
    if (child.iter.status() == Status::TryAgain()) {
      child.iter.Seek(target);
      if (child.iter.Valid() && comparator_->Equal(target, child.key())) {
        assert(child.iter.status().ok());
        child.iter.Next();
      }
      AddToMinHeapOrCheckStatus(&child);
    }
  }

  // Current range tombstone iter also needs to seek for the following case:
  // Previous direction is backward, so range tombstone iter may point to a
  // tombstone before current_. If there is no such tombstone, then the range
  // tombstone iter is !Valid(). Need to reseek here to make it valid again.
  if (!range_tombstone_iters_.empty()) {
    ParsedInternalKey pik;
    ParseInternalKey(target, &pik, false /* log_err_key */)
        .PermitUncheckedError();
    for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
      auto iter = range_tombstone_iters_[i];
      if (iter) {
        iter->Seek(pik.user_key);
        // The while loop is needed as the Seek() call above is only for user
        // key. We could have a range tombstone with end_key covering user_key
        // but still smaller than target. This happens when the range
        // tombstone is truncated at iter.largest_.
        while (iter->Valid() &&
               comparator_->Compare(iter->end_key(), pik) <= 0) {
          iter->Next();
        }
        if (range_tombstone_iters_[i]->Valid()) {
          InsertRangeTombstoneToMinHeap(
              i, comparator_->Compare(range_tombstone_iters_[i]->start_key(),
                                      pik) > 0 /* start_key */);
        }
      }
    }
  }

  direction_ = kForward;
  assert(current_ == CurrentForward());
}

// Advance all range tombstone iters, including the one corresponding to
// current_, to the first tombstone with start_key <= current_.key().
void MergingIterator::SwitchToBackward() {
  ClearHeaps();
  InitMaxHeap();
  Slice target = key();
  for (auto& child : children_) {
    if (&child.iter != current_) {
      child.iter.SeekForPrev(target);
      TEST_SYNC_POINT_CALLBACK("MergeIterator::Prev:BeforePrev", &child);
      if (child.iter.Valid() && comparator_->Equal(target, child.key())) {
        assert(child.iter.status().ok());
        child.iter.Prev();
      }
    }
    AddToMaxHeapOrCheckStatus(&child);
  }

  ParsedInternalKey pik;
  ParseInternalKey(target, &pik, false /* log_err_key */)
      .PermitUncheckedError();
  for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
    auto iter = range_tombstone_iters_[i];
    if (iter) {
      iter->SeekForPrev(pik.user_key);
      // Since the SeekForPrev() call above is only for user key,
      // we may end up with some range tombstone with start key having the
      // same user key as current_, but with a smaller sequence number. This
      // makes current_ not at maxHeap_ top for the CurrentReverse() call
      // below. If there is a range tombstone start key with the same user
      // key and the same sequence number as current_.key(), it will be fine as
      // in InsertRangeTombstoneToMaxHeap() we change op_type to be the smallest
      // op_type.
      while (iter->Valid() &&
             comparator_->Compare(iter->start_key(), pik) > 0) {
        iter->Prev();
      }
      if (iter->Valid()) {
        InsertRangeTombstoneToMaxHeap(
            i, comparator_->Compare(range_tombstone_iters_[i]->end_key(),
                                    pik) <= 0 /* end_key */);
      }
    }
  }

  direction_ = kReverse;
  if (!prefix_seek_mode_) {
    // Note that we don't do assert(current_ == CurrentReverse()) here
    // because it is possible to have some keys larger than the seek-key
    // inserted between Seek() and SeekToLast(), which makes current_ not
    // equal to CurrentReverse().
    current_ = CurrentReverse();
  }
  assert(current_ == CurrentReverse());
}

void MergingIterator::ClearHeaps(bool clear_active) {
  minHeap_.clear();
  if (maxHeap_) {
    maxHeap_->clear();
  }
  if (clear_active) {
    active_.clear();
  }
}

void MergingIterator::InitMaxHeap() {
  if (!maxHeap_) {
    maxHeap_ = std::make_unique<MergerMaxIterHeap>(comparator_);
  }
}

// Repeatedly check the heap top and remove it if it is not a visible point
// key, i.e., if it is a range tombstone start/end key, a file boundary
// sentinel key, or a point key covered by some range tombstone. SeekImpl() is
// called to seek to the end of a range tombstone if the heap top is a point
// key covered by some range tombstone from a newer sorted run. If the covering
// tombstone is from the current key's level, then the current child iterator
// is simply advanced to its next key without reseeking.
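//
// The loops below can stop early when `active_` is empty and the heap top is
// not a sentinel key: in that case the top must be a point key that no range
// tombstone covers (SkipNextDeleted()/SkipPrevDeleted() would return false),
// so it can be exposed to the caller as is.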
inline void MergingIterator::FindNextVisibleKey() {
  // When active_ is empty, we know heap top cannot be a range tombstone end
  // key. It cannot be a range tombstone start key per PopDeleteRangeStart().
  PopDeleteRangeStart();
  while (!minHeap_.empty() &&
         (!active_.empty() || minHeap_.top()->IsDeleteRangeSentinelKey()) &&
         SkipNextDeleted()) {
    PopDeleteRangeStart();
  }
}

inline void MergingIterator::FindPrevVisibleKey() {
  PopDeleteRangeEnd();
  while (!maxHeap_->empty() &&
         (!active_.empty() || maxHeap_->top()->IsDeleteRangeSentinelKey()) &&
         SkipPrevDeleted()) {
    PopDeleteRangeEnd();
  }
}

InternalIterator* NewMergingIterator(const InternalKeyComparator* cmp,
                                     InternalIterator** list, int n,
                                     Arena* arena, bool prefix_seek_mode) {
  assert(n >= 0);
  if (n == 0) {
    return NewEmptyInternalIterator<Slice>(arena);
  } else if (n == 1) {
    return list[0];
  } else {
    if (arena == nullptr) {
      return new MergingIterator(cmp, list, n, false, prefix_seek_mode);
    } else {
      auto mem = arena->AllocateAligned(sizeof(MergingIterator));
      return new (mem) MergingIterator(cmp, list, n, true, prefix_seek_mode);
    }
  }
}
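
// A minimal usage sketch (illustration only; `cmp`, `child_iters`, and
// `n > 1` are assumed to be supplied by the caller, and no arena is used, so
// the caller owns the returned iterator):
//
//   InternalIterator* iter =
//       NewMergingIterator(cmp, child_iters, n, /*arena=*/nullptr);
//   for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
//     // consume iter->key() / iter->value()
//   }
//   delete iter;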

MergeIteratorBuilder::MergeIteratorBuilder(
    const InternalKeyComparator* comparator, Arena* a, bool prefix_seek_mode,
    const Slice* iterate_upper_bound)
    : first_iter(nullptr), use_merging_iter(false), arena(a) {
  auto mem = arena->AllocateAligned(sizeof(MergingIterator));
  merge_iter = new (mem) MergingIterator(comparator, nullptr, 0, true,
                                         prefix_seek_mode, iterate_upper_bound);
}

MergeIteratorBuilder::~MergeIteratorBuilder() {
  if (first_iter != nullptr) {
    first_iter->~InternalIterator();
  }
  if (merge_iter != nullptr) {
    merge_iter->~MergingIterator();
  }
}

void MergeIteratorBuilder::AddIterator(InternalIterator* iter) {
  if (!use_merging_iter && first_iter != nullptr) {
    merge_iter->AddIterator(first_iter);
    use_merging_iter = true;
    first_iter = nullptr;
  }
  if (use_merging_iter) {
    merge_iter->AddIterator(iter);
  } else {
    first_iter = iter;
  }
}

void MergeIteratorBuilder::AddPointAndTombstoneIterator(
    InternalIterator* point_iter, TruncatedRangeDelIterator* tombstone_iter,
    TruncatedRangeDelIterator*** tombstone_iter_ptr) {
  // tombstone_iter_ptr != nullptr means point_iter is a LevelIterator.
  bool add_range_tombstone = tombstone_iter ||
                             !merge_iter->range_tombstone_iters_.empty() ||
                             tombstone_iter_ptr;
  if (!use_merging_iter && (add_range_tombstone || first_iter)) {
    use_merging_iter = true;
    if (first_iter) {
      merge_iter->AddIterator(first_iter);
      first_iter = nullptr;
    }
  }
  if (use_merging_iter) {
    merge_iter->AddIterator(point_iter);
    if (add_range_tombstone) {
      // If there was a gap, fill in nullptr as empty range tombstone iterators.
      while (merge_iter->range_tombstone_iters_.size() <
             merge_iter->children_.size() - 1) {
        merge_iter->AddRangeTombstoneIterator(nullptr);
      }
      merge_iter->AddRangeTombstoneIterator(tombstone_iter);
    }

    if (tombstone_iter_ptr) {
      // This is needed instead of setting to &range_tombstone_iters_[i]
      // directly here since the memory address of range_tombstone_iters_[i]
      // might change during vector resizing.
      range_del_iter_ptrs_.emplace_back(
          merge_iter->range_tombstone_iters_.size() - 1, tombstone_iter_ptr);
    }
  } else {
    first_iter = point_iter;
  }
}

InternalIterator* MergeIteratorBuilder::Finish(ArenaWrappedDBIter* db_iter) {
  InternalIterator* ret = nullptr;
  if (!use_merging_iter) {
    ret = first_iter;
    first_iter = nullptr;
  } else {
    for (auto& p : range_del_iter_ptrs_) {
      *(p.second) = &(merge_iter->range_tombstone_iters_[p.first]);
    }
    if (db_iter && !merge_iter->range_tombstone_iters_.empty()) {
      // memtable is always the first level
      db_iter->SetMemtableRangetombstoneIter(
          &merge_iter->range_tombstone_iters_.front());
    }
    merge_iter->Finish();
    ret = merge_iter;
    merge_iter = nullptr;
  }
  return ret;
}
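
// A minimal sketch of how the builder is typically driven (illustration only;
// `icmp`, `arena`, `upper_bound`, the per-level `point_iter`/`tombstone_iter`
// pairs, and `db_iter` are assumed to be supplied by the caller):
//
//   MergeIteratorBuilder builder(icmp, arena, /*prefix_seek_mode=*/false,
//                                upper_bound);
//   for (/* each sorted run, newest to oldest */) {
//     builder.AddPointAndTombstoneIterator(point_iter, tombstone_iter,
//                                          /*tombstone_iter_ptr=*/nullptr);
//   }
//   InternalIterator* iter = builder.Finish(db_iter);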

}  // namespace ROCKSDB_NAMESPACE