// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/merging_iterator.h"

#include "db/arena_wrapped_db_iter.h"
#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"
#include "memory/arena.h"
#include "monitoring/perf_context_imp.h"
#include "rocksdb/comparator.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "table/internal_iterator.h"
#include "table/iter_heap.h"
#include "table/iterator_wrapper.h"
#include "test_util/sync_point.h"
#include "util/autovector.h"
#include "util/heap.h"
#include "util/stop_watch.h"
namespace ROCKSDB_NAMESPACE {
// For merging iterator to process range tombstones, we treat the start and end
// keys of a range tombstone as point keys and put them into the minHeap/maxHeap
// used in merging iterator. Take minHeap for example: we are able to keep track
// of currently "active" range tombstones (the ones whose start keys are popped
// but end keys are still in the heap) in `active_`. This `active_` set of range
// tombstones is then used to quickly determine whether the point key at heap
// top is deleted (by heap property, the point key at heap top must be within
// internal key range of active range tombstones).
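//
// Illustration (hypothetical keys): suppose level 0 has a range tombstone
// [c, m) and level 1 has point keys b and f. The min heap then holds
// {b, c(start), f, m(end)}. Popping b, then c(start), adds 0 to `active_`;
// when f reaches the heap top, 0 is in `active_` and f is known to lie in
// [c, m) without comparing f against the tombstone bounds again.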
// The HeapItem struct represents 3 types of elements in the minHeap/maxHeap:
// point key and the start and end keys of a range tombstone.
struct HeapItem {
  HeapItem() = default;

  enum Type { ITERATOR, DELETE_RANGE_START, DELETE_RANGE_END };
  IteratorWrapper iter;
  size_t level = 0;
  std::string pinned_key;
  // Will be overwritten before use, initialize here so compiler does not
  // complain.
  Type type = ITERATOR;

  explicit HeapItem(size_t _level, InternalIteratorBase<Slice>* _iter)
      : level(_level), type(Type::ITERATOR) {
    iter.Set(_iter);
  }
  void SetTombstoneKey(ParsedInternalKey&& pik) {
    pinned_key.clear();
    // Range tombstone end key is exclusive. If a point internal key has the
    // same user key and sequence number as the start or end key of a range
    // tombstone, the order will be start < end key < internal key with the
    // following op_type change. This is helpful to ensure keys popped from
    // heap are in expected order since range tombstone start/end keys will
    // be distinct from point internal keys. Strictly speaking, this is only
    // needed for tombstone end points that are truncated in
    // TruncatedRangeDelIterator since untruncated tombstone end points always
    // have kMaxSequenceNumber and kTypeRangeDeletion (see
    // TruncatedRangeDelIterator::start_key()/end_key()).
    ParsedInternalKey p(pik.user_key, pik.sequence, kTypeMaxValid);
    AppendInternalKey(&pinned_key, p);
  }
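  // Illustration (hypothetical): for user key "k" at seqno 5, the endpoint
  // encoded here uses kTypeMaxValid, which sorts before any point op_type at
  // the same (user key, seqno) in internal key order. A tombstone endpoint
  // k@5 therefore pops from the min heap before a point key k@5.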
  Slice key() const {
    if (type == Type::ITERATOR) {
      return iter.key();
    }
    return pinned_key;
  }

  bool IsDeleteRangeSentinelKey() const {
    if (type == Type::ITERATOR) {
      return iter.IsDeleteRangeSentinelKey();
    }
    return false;
  }
};
class MinHeapItemComparator {
 public:
  MinHeapItemComparator(const InternalKeyComparator* comparator)
      : comparator_(comparator) {}
  bool operator()(HeapItem* a, HeapItem* b) const {
    return comparator_->Compare(a->key(), b->key()) > 0;
  }

 private:
  const InternalKeyComparator* comparator_;
};
class MaxHeapItemComparator {
 public:
  MaxHeapItemComparator(const InternalKeyComparator* comparator)
      : comparator_(comparator) {}
  bool operator()(HeapItem* a, HeapItem* b) const {
    return comparator_->Compare(a->key(), b->key()) < 0;
  }

 private:
  const InternalKeyComparator* comparator_;
};
// Without anonymous namespace here, we fail the warning -Wmissing-prototypes
namespace {
using MergerMinIterHeap = BinaryHeap<HeapItem*, MinHeapItemComparator>;
using MergerMaxIterHeap = BinaryHeap<HeapItem*, MaxHeapItemComparator>;
}  // namespace
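// Note (assumption about BinaryHeap in util/heap.h): the heap surfaces the
// element its comparator orders greatest, so MinHeapItemComparator returns
// "a > b" to put the smallest internal key at the top. E.g. (hypothetical),
// with keys {c, a, b} in MergerMinIterHeap, top() yields a.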
class MergingIterator : public InternalIterator {
 public:
  MergingIterator(const InternalKeyComparator* comparator,
                  InternalIterator** children, int n, bool is_arena_mode,
                  bool prefix_seek_mode,
                  const Slice* iterate_upper_bound = nullptr)
      : is_arena_mode_(is_arena_mode),
        prefix_seek_mode_(prefix_seek_mode),
        direction_(kForward),
        comparator_(comparator),
        current_(nullptr),
        minHeap_(comparator_),
        pinned_iters_mgr_(nullptr),
        iterate_upper_bound_(iterate_upper_bound) {
    children_.resize(n);
    for (int i = 0; i < n; i++) {
      children_[i].level = i;
      children_[i].iter.Set(children[i]);
    }
  }
  void considerStatus(Status s) {
    if (!s.ok() && status_.ok()) {
      status_ = s;
    }
  }
  virtual void AddIterator(InternalIterator* iter) {
    children_.emplace_back(children_.size(), iter);
    if (pinned_iters_mgr_) {
      iter->SetPinnedItersMgr(pinned_iters_mgr_);
    }
    // Invalidate to ensure `Seek*()` is called to construct the heaps before
    // use.
    current_ = nullptr;
  }
  // Merging iterator can optionally process range tombstones: if a key is
  // covered by a range tombstone, the merging iterator will not output it but
  // skip it.
  //
  // Add the next range tombstone iterator to this merging iterator.
  // There must be either no range tombstone iterator, or the same number of
  // range tombstone iterators as point iterators after all range tombstone
  // iters are added. The i-th added range tombstone iterator and the i-th point
  // iterator must point to the same sorted run.
  // Merging iterator takes ownership of the range tombstone iterator and
  // is responsible for freeing it. Note that during Iterator::Refresh()
  // and when a level iterator moves to a different SST file, the range
  // tombstone iterator could be updated. In that case, the merging iterator
  // is only responsible for freeing the new range tombstone iterator
  // that it has pointers to in range_tombstone_iters_.
  void AddRangeTombstoneIterator(TruncatedRangeDelIterator* iter) {
    range_tombstone_iters_.emplace_back(iter);
  }
  // Called by MergingIteratorBuilder when all point iterators and range
  // tombstone iterators are added. Initializes HeapItems for range tombstone
  // iterators so that no further allocation is needed for HeapItem.
  void Finish() {
    if (!range_tombstone_iters_.empty()) {
      pinned_heap_item_.resize(range_tombstone_iters_.size());
      for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
        pinned_heap_item_[i].level = i;
      }
    }
  }
  ~MergingIterator() override {
    for (auto child : range_tombstone_iters_) {
      delete child;
    }

    for (auto& child : children_) {
      child.iter.DeleteIter(is_arena_mode_);
    }
    status_.PermitUncheckedError();
  }
  bool Valid() const override { return current_ != nullptr && status_.ok(); }

  Status status() const override { return status_; }
  // Add range_tombstone_iters_[level] into min heap.
  // Updates active_ if the end key of a range tombstone is inserted.
  // @param start_key specifies which end point of the range tombstone to add.
  void InsertRangeTombstoneToMinHeap(size_t level, bool start_key = true,
                                     bool replace_top = false) {
    assert(!range_tombstone_iters_.empty() &&
           range_tombstone_iters_[level]->Valid());
    if (start_key) {
      ParsedInternalKey pik = range_tombstone_iters_[level]->start_key();
      // iterate_upper_bound does not have timestamp
      if (iterate_upper_bound_ &&
          comparator_->user_comparator()->CompareWithoutTimestamp(
              pik.user_key, true /* a_has_ts */, *iterate_upper_bound_,
              false /* b_has_ts */) >= 0) {
        if (replace_top) {
          // replace_top implies this range tombstone iterator is still in
          // minHeap_ and at the top.
          minHeap_.pop();
        }
        return;
      }
      pinned_heap_item_[level].SetTombstoneKey(std::move(pik));
      pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_START;
      assert(active_.count(level) == 0);
    } else {
      // allow end key to go over upper bound (if present) since start key is
      // before upper bound and the range tombstone could still cover a
      // range before upper bound.
      pinned_heap_item_[level].SetTombstoneKey(
          range_tombstone_iters_[level]->end_key());
      pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_END;
      active_.insert(level);
    }
    if (replace_top) {
      minHeap_.replace_top(&pinned_heap_item_[level]);
    } else {
      minHeap_.push(&pinned_heap_item_[level]);
    }
  }
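  // Example (hypothetical): with iterate_upper_bound_ = "n", a tombstone
  // [p, q) at this level is dropped entirely (start key "p" >= bound), while
  // [m, q) is inserted: its start key is within bound, and its end key "q"
  // is allowed past the bound since [m, n) may still cover point keys.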
  // Add range_tombstone_iters_[level] into max heap.
  // Updates active_ if the start key of a range tombstone is inserted.
  // @param end_key specifies which end point of the range tombstone to add.
  void InsertRangeTombstoneToMaxHeap(size_t level, bool end_key = true,
                                     bool replace_top = false) {
    assert(!range_tombstone_iters_.empty() &&
           range_tombstone_iters_[level]->Valid());
    if (end_key) {
      pinned_heap_item_[level].SetTombstoneKey(
          range_tombstone_iters_[level]->end_key());
      pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_END;
      assert(active_.count(level) == 0);
    } else {
      pinned_heap_item_[level].SetTombstoneKey(
          range_tombstone_iters_[level]->start_key());
      pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_START;
      active_.insert(level);
    }
    if (replace_top) {
      maxHeap_->replace_top(&pinned_heap_item_[level]);
    } else {
      maxHeap_->push(&pinned_heap_item_[level]);
    }
  }
  // Remove HeapItems from top of minHeap_ that are of type DELETE_RANGE_START
  // until minHeap_ is empty or the top of the minHeap_ is not of type
  // DELETE_RANGE_START. Each such item means a range tombstone becomes active,
  // so `active_` is updated accordingly.
  void PopDeleteRangeStart() {
    while (!minHeap_.empty() &&
           minHeap_.top()->type == HeapItem::DELETE_RANGE_START) {
      TEST_SYNC_POINT_CALLBACK("MergeIterator::PopDeleteRangeStart", nullptr);
      // insert end key of this range tombstone and updates active_
      InsertRangeTombstoneToMinHeap(minHeap_.top()->level,
                                    false /* start_key */,
                                    true /* replace_top */);
    }
  }
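  // Trace (hypothetical): the heap top is the start key of level 2's
  // tombstone [c, m). It is replaced in place by the end key m via
  // replace_top, and 2 joins `active_`; the loop repeats in case another
  // DELETE_RANGE_START surfaces at the top.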
  // Remove HeapItems from top of maxHeap_ that are of type DELETE_RANGE_END
  // until maxHeap_ is empty or the top of the maxHeap_ is not of type
  // DELETE_RANGE_END. Each such item means a range tombstone becomes active,
  // so `active_` is updated accordingly.
  void PopDeleteRangeEnd() {
    while (!maxHeap_->empty() &&
           maxHeap_->top()->type == HeapItem::DELETE_RANGE_END) {
      // insert start key of this range tombstone and updates active_
      InsertRangeTombstoneToMaxHeap(maxHeap_->top()->level, false /* end_key */,
                                    true /* replace_top */);
    }
  }
  void SeekToFirst() override {
    ClearHeaps();
    status_ = Status::OK();
    for (auto& child : children_) {
      child.iter.SeekToFirst();
      AddToMinHeapOrCheckStatus(&child);
    }

    for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
      if (range_tombstone_iters_[i]) {
        range_tombstone_iters_[i]->SeekToFirst();
        if (range_tombstone_iters_[i]->Valid()) {
          // It is possible to be invalid due to snapshots.
          InsertRangeTombstoneToMinHeap(i);
        }
      }
    }
    FindNextVisibleKey();
    direction_ = kForward;
    current_ = CurrentForward();
  }
  void SeekToLast() override {
    ClearHeaps();
    InitMaxHeap();
    status_ = Status::OK();
    for (auto& child : children_) {
      child.iter.SeekToLast();
      AddToMaxHeapOrCheckStatus(&child);
    }

    for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
      if (range_tombstone_iters_[i]) {
        range_tombstone_iters_[i]->SeekToLast();
        if (range_tombstone_iters_[i]->Valid()) {
          // It is possible to be invalid due to snapshots.
          InsertRangeTombstoneToMaxHeap(i);
        }
      }
    }
    FindPrevVisibleKey();
    direction_ = kReverse;
    current_ = CurrentReverse();
  }
  // Position this merging iterator at the first key >= target (internal key).
  // If range tombstones are present, keys covered by range tombstones are
  // skipped, and this merging iter points to the first non-range-deleted key
  // >= target after Seek(). If !Valid() and status().ok() then the end of the
  // iterator is reached.
  //
  // Internally, this involves positioning all child iterators at the first key
  // >= target. If range tombstones are present, we apply a similar
  // optimization, cascading seek, as in Pebble
  // (https://github.com/cockroachdb/pebble). Specifically, if there is a range
  // tombstone [start, end) that covers the target user key at level L, then
  // this range tombstone must cover the range [target key, end) in all levels
  // > L. So for all levels > L, we can pretend the target key is `end`. This
  // optimization is applied at each level and hence the name "cascading seek".
  // After a round of (cascading) seeks, the top of the heap is checked to see
  // if it is covered by a range tombstone (see FindNextVisibleKey() for more
  // detail), and advanced if so. The process is repeated until a
  // non-range-deleted key is at the top of the heap, or the heap becomes
  // empty.
  //
  // As mentioned in comments above HeapItem, to make the checking of whether
  // the top of the heap is covered by some range tombstone efficient, we treat
  // each range deletion [start, end) as two point keys and insert them into
  // the same min/maxHeap_ where point iterators are. The set `active_` tracks
  // the levels that have active range tombstones. If level L is in `active_`,
  // and the point key at top of the heap is from level >= L, then the point
  // key is within the internal key range of the range tombstone that
  // range_tombstone_iters_[L] currently points to. For correctness reasoning,
  // one invariant that Seek() (and every other public API: Seek*(),
  // Next()/Prev()) guarantees is as follows. After Seek(), suppose `k` is the
  // current key of level L's point iterator. Then for each range tombstone
  // iterator at level <= L, it is at or before the first range tombstone with
  // end key > `k`. This ensures that when level L's point iterator reaches
  // the top of the heap, `active_` is calculated correctly (it contains the
  // covering range tombstone's level if there is one), since no range
  // tombstone iterator was skipped beyond that point iterator's current key
  // during Seek(). Next()/Prev() maintains a stronger version of this
  // invariant where all range tombstone iterators from levels <= L are *at*
  // the first range tombstone with end key > `k`.
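  //
  // Cascading seek example (hypothetical keys): target user key "f", and
  // level 1 has tombstone [e, p). Level 1's point iterator seeks to "f";
  // levels 2+ seek to "p" instead, since any key in [f, p) there is deleted
  // by level 1's tombstone. If level 3 then finds tombstone [p, x), levels 4+
  // seek to "x", and so on down the levels.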
  void Seek(const Slice& target) override {
    assert(range_tombstone_iters_.empty() ||
           range_tombstone_iters_.size() == children_.size());
    SeekImpl(target);
    FindNextVisibleKey();

    direction_ = kForward;
    {
      PERF_TIMER_GUARD(seek_min_heap_time);
      current_ = CurrentForward();
    }
  }
  void SeekForPrev(const Slice& target) override {
    assert(range_tombstone_iters_.empty() ||
           range_tombstone_iters_.size() == children_.size());
    SeekForPrevImpl(target);
    FindPrevVisibleKey();

    direction_ = kReverse;
    {
      PERF_TIMER_GUARD(seek_max_heap_time);
      current_ = CurrentReverse();
    }
  }
  void Next() override {
    assert(Valid());
    // Ensure that all children are positioned after key().
    // If we are moving in the forward direction, it is already
    // true for all of the non-current children since current_ is
    // the smallest child and key() == current_->key().
    if (direction_ != kForward) {
      // The loop advanced all non-current children to be > key() so current_
      // should still be strictly the smallest key.
      SwitchToForward();
    }

    // For the heap modifications below to be correct, current_ must be the
    // current top of the heap.
    assert(current_ == CurrentForward());
    // as the current points to the current record. move the iterator forward.
    current_->Next();
    if (current_->Valid()) {
      // current is still valid after the Next() call above. Call
      // replace_top() to restore the heap property. When the same child
      // iterator yields a sequence of keys, this is cheap.
      assert(current_->status().ok());
      minHeap_.replace_top(minHeap_.top());
    } else {
      // current stopped being valid, remove it from the heap.
      considerStatus(current_->status());
      minHeap_.pop();
    }
    FindNextVisibleKey();
    current_ = CurrentForward();
  }
  bool NextAndGetResult(IterateResult* result) override {
    Next();
    bool is_valid = Valid();
    if (is_valid) {
      result->key = key();
      result->bound_check_result = UpperBoundCheckResult();
      result->value_prepared = current_->IsValuePrepared();
    }
    return is_valid;
  }
  void Prev() override {
    assert(Valid());
    // Ensure that all children are positioned before key().
    // If we are moving in the reverse direction, it is already
    // true for all of the non-current children since current_ is
    // the largest child and key() == current_->key().
    if (direction_ != kReverse) {
      // Otherwise, retreat the non-current children. We retreat current_
      // just after the if-block.
      SwitchToBackward();
    }

    // For the heap modifications below to be correct, current_ must be the
    // current top of the heap.
    assert(current_ == CurrentReverse());
    current_->Prev();
    if (current_->Valid()) {
      // current is still valid after the Prev() call above. Call
      // replace_top() to restore the heap property. When the same child
      // iterator yields a sequence of keys, this is cheap.
      assert(current_->status().ok());
      maxHeap_->replace_top(maxHeap_->top());
    } else {
      // current stopped being valid, remove it from the heap.
      considerStatus(current_->status());
      maxHeap_->pop();
    }
    FindPrevVisibleKey();
    current_ = CurrentReverse();
  }
  Slice key() const override {
    assert(Valid());
    return current_->key();
  }

  Slice value() const override {
    assert(Valid());
    return current_->value();
  }
  bool PrepareValue() override {
    assert(Valid());
    if (current_->PrepareValue()) {
      return true;
    }

    considerStatus(current_->status());
    assert(!status_.ok());
    return false;
  }
  // Here we simply relay the MayBeOutOfLowerBound/MayBeOutOfUpperBound result
  // from the current child iterator. As long as that child reports that going
  // out of bound is not possible, we know the current key is within bounds.
  bool MayBeOutOfLowerBound() override {
    assert(Valid());
    return current_->MayBeOutOfLowerBound();
  }

  IterBoundCheck UpperBoundCheckResult() override {
    assert(Valid());
    return current_->UpperBoundCheckResult();
  }
  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
    pinned_iters_mgr_ = pinned_iters_mgr;
    for (auto& child : children_) {
      child.iter.SetPinnedItersMgr(pinned_iters_mgr);
    }
  }
  bool IsKeyPinned() const override {
    assert(Valid());
    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
           current_->IsKeyPinned();
  }

  bool IsValuePinned() const override {
    assert(Valid());
    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
           current_->IsValuePinned();
  }
 private:
  friend class MergeIteratorBuilder;
  // Clears heaps for both directions, used when changing direction or seeking
  void ClearHeaps(bool clear_active = true);
  // Ensures that maxHeap_ is initialized when starting to go in the reverse
  // direction
  void InitMaxHeap();

  // Advance this merging iterator until the current key (top of min heap) is
  // not covered by any range tombstone, or there are no more keys (heap is
  // empty). After this call, if Valid(), current_ points to the next key that
  // is not covered by any range tombstone.
  void FindNextVisibleKey();
  void FindPrevVisibleKey();

  // Seek to first key >= target key (internal key) for
  // children_[starting_level:].
  void SeekImpl(const Slice& target, size_t starting_level = 0,
                bool range_tombstone_reseek = false);

  // Seek to first key <= target key (internal key) for
  // children_[starting_level:].
  void SeekForPrevImpl(const Slice& target, size_t starting_level = 0,
                       bool range_tombstone_reseek = false);
  bool is_arena_mode_;
  bool prefix_seek_mode_;
  // Which direction is the iterator moving?
  enum Direction : uint8_t { kForward, kReverse };
  Direction direction_;
  const InternalKeyComparator* comparator_;
  // We could also use an autovector with a larger reserved size.
  // HeapItem for all child point iterators.
  std::vector<HeapItem> children_;
  // HeapItem for range tombstone start and end keys. Each range tombstone
  // iterator will have at most one side (start key or end key) in a heap
  // at the same time, so this vector will be of size children_.size();
  // pinned_heap_item_[i] corresponds to the start key and end key HeapItem
  // for range_tombstone_iters_[i].
  std::vector<HeapItem> pinned_heap_item_;
  // range_tombstone_iters_[i] contains range tombstones in the sorted run that
  // corresponds to children_[i]. range_tombstone_iters_.empty() means not
  // handling range tombstones in merging iterator. range_tombstone_iters_[i]
  // == nullptr means the sorted run of children_[i] does not have range
  // tombstones.
  std::vector<TruncatedRangeDelIterator*> range_tombstone_iters_;

  // Levels (indices into range_tombstone_iters_/children_) that currently have
  // "active" range tombstones. See comments above Seek() for the meaning of
  // "active".
  std::set<size_t> active_;

  bool SkipNextDeleted();
  bool SkipPrevDeleted();

  // Cached pointer to child iterator with the current key, or nullptr if no
  // child iterators are valid. This is the top of minHeap_ or maxHeap_
  // depending on the direction.
  IteratorWrapper* current_;
  // If any of the children have non-ok status, this is one of them.
  Status status_;
  MergerMinIterHeap minHeap_;

  // Max heap is used for reverse iteration, which is way less common than
  // forward. Lazily initialize it to save memory.
  std::unique_ptr<MergerMaxIterHeap> maxHeap_;
  PinnedIteratorsManager* pinned_iters_mgr_;

  // Used to bound range tombstones. For point keys, DBIter and SSTable
  // iterator take care of boundary checking.
  const Slice* iterate_upper_bound_;
  // In forward direction, process a child that is not in the min heap.
  // If valid, add to the min heap. Otherwise, check status.
  void AddToMinHeapOrCheckStatus(HeapItem*);

  // In backward direction, process a child that is not in the max heap.
  // If valid, add to the max heap. Otherwise, check status.
  void AddToMaxHeapOrCheckStatus(HeapItem*);

  void SwitchToForward();

  // Switch the direction from forward to backward without changing the
  // position. Iterator should still be valid.
  void SwitchToBackward();
  IteratorWrapper* CurrentForward() const {
    assert(direction_ == kForward);
    assert(minHeap_.empty() || minHeap_.top()->type == HeapItem::ITERATOR);
    return !minHeap_.empty() ? &minHeap_.top()->iter : nullptr;
  }

  IteratorWrapper* CurrentReverse() const {
    assert(direction_ == kReverse);
    assert(maxHeap_);
    assert(maxHeap_->empty() || maxHeap_->top()->type == HeapItem::ITERATOR);
    return !maxHeap_->empty() ? &maxHeap_->top()->iter : nullptr;
  }
};
// Seek to the first key >= target key (internal key) for
// children_[starting_level:]. Cascading seek optimizations are applied if
// range tombstones are present (see comment above Seek() for more).
//
// @param range_tombstone_reseek Whether target is some range tombstone end
// key, i.e., whether this SeekImpl() call is a part of a "cascading seek".
// This is used only for recording relevant perf_context.
void MergingIterator::SeekImpl(const Slice& target, size_t starting_level,
                               bool range_tombstone_reseek) {
  // active range tombstones before `starting_level` remain active
  ClearHeaps(false /* clear_active */);
  ParsedInternalKey pik;
  if (!range_tombstone_iters_.empty()) {
    // pik is only used in InsertRangeTombstoneToMinHeap().
    ParseInternalKey(target, &pik, false).PermitUncheckedError();
  }

  // TODO: perhaps we could save some upheap cost by adding all child iters
  // first and then doing a single heapify.
  for (size_t level = 0; level < starting_level; ++level) {
    PERF_TIMER_GUARD(seek_min_heap_time);
    AddToMinHeapOrCheckStatus(&children_[level]);
  }
  if (!range_tombstone_iters_.empty()) {
    // Add range tombstones from levels < starting_level. We can insert from
    // pinned_heap_item_ for the following reasons:
    // - pinned_heap_item_[level] is in minHeap_ iff
    //   range_tombstone_iters_[level]->Valid().
    // - If `level` is in active_, then range_tombstone_iters_[level]->Valid()
    //   and pinned_heap_item_[level] is of type DELETE_RANGE_END.
    for (size_t level = 0; level < starting_level; ++level) {
      if (range_tombstone_iters_[level] &&
          range_tombstone_iters_[level]->Valid()) {
        // use an iterator on active_ if performance becomes an issue here
        if (active_.count(level) > 0) {
          assert(pinned_heap_item_[level].type == HeapItem::DELETE_RANGE_END);
          // if it was active, then start key must be within upper_bound,
          // so we can add to minHeap_ directly.
          minHeap_.push(&pinned_heap_item_[level]);
        } else {
          // this takes care of checking iterate_upper_bound, but with an extra
          // key comparison if range_tombstone_iters_[level] was already out of
          // bound. Consider using a new HeapItem type or some flag to remember
          // boundary checking result.
          InsertRangeTombstoneToMinHeap(level);
        }
      } else {
        assert(!active_.count(level));
      }
    }
    // levels >= starting_level will be reseeked below, so clearing their
    // active state here is safe.
    active_.erase(active_.lower_bound(starting_level), active_.end());
  }
  status_ = Status::OK();
  IterKey current_search_key;
  current_search_key.SetInternalKey(target, false /* copy */);
  // Seek target might change to some range tombstone end key, so
  // we need to remember them for async requests.
  // (level, target) pairs
  autovector<std::pair<size_t, std::string>> prefetched_target;
  for (auto level = starting_level; level < children_.size(); ++level) {
    {
      PERF_TIMER_GUARD(seek_child_seek_time);
      children_[level].iter.Seek(current_search_key.GetInternalKey());
    }

    PERF_COUNTER_ADD(seek_child_seek_count, 1);

    if (!range_tombstone_iters_.empty()) {
      if (range_tombstone_reseek) {
        // This seek is to some range tombstone end key.
        // Should only happen when there are range tombstones.
        PERF_COUNTER_ADD(internal_range_del_reseek_count, 1);
      }
      if (children_[level].iter.status().IsTryAgain()) {
        prefetched_target.emplace_back(
            level, current_search_key.GetInternalKey().ToString());
      }
      auto range_tombstone_iter = range_tombstone_iters_[level];
      if (range_tombstone_iter) {
        range_tombstone_iter->Seek(current_search_key.GetUserKey());
        if (range_tombstone_iter->Valid()) {
          // insert the range tombstone end that is closer to and >=
          // current_search_key. Strictly speaking, since the Seek() call
          // above is on user key, it is possible that
          // range_tombstone_iter->end_key() < current_search_key. This can
          // happen when range_tombstone_iter is truncated and
          // range_tombstone_iter.largest_ has the same user key as
          // current_search_key.GetUserKey() but with a larger sequence number
          // than current_search_key. Correctness is not affected as this
          // tombstone end key will be popped during FindNextVisibleKey().
          InsertRangeTombstoneToMinHeap(
              level, comparator_->Compare(range_tombstone_iter->start_key(),
                                          pik) > 0 /* start_key */);
          // current_search_key < end_key guaranteed by the Seek() and Valid()
          // calls above. Only interested in user key coverage since older
          // sorted runs must have smaller sequence numbers than this range
          // tombstone.
          //
          // TODO: range_tombstone_iter->Seek() finds the max covering
          // sequence number, can make it cheaper by not looking for max.
          if (comparator_->user_comparator()->Compare(
                  range_tombstone_iter->start_key().user_key,
                  current_search_key.GetUserKey()) <= 0) {
            // Since range_tombstone_iter->Valid(), seqno should be valid, so
            // there is no need to check it.
            range_tombstone_reseek = true;
            // Current target user key is covered by this range tombstone.
            // All older sorted runs will seek to range tombstone end key.
            // Note that for prefix seek case, it is possible that the prefix
            // is not the same as the original target, which should not affect
            // correctness. Besides, in most cases, range tombstone start and
            // end key should have the same prefix.
            // If range_tombstone_iter->end_key() is truncated to its largest_
            // boundary, the timestamp in user_key will not be max timestamp,
            // but the timestamp of `range_tombstone_iter.largest_`. This
            // should be fine here as current_search_key is used to Seek into
            // lower levels.
            current_search_key.SetInternalKey(
                range_tombstone_iter->end_key().user_key, kMaxSequenceNumber);
          }
        }
      }
    }
    // child.iter.status() is set to Status::TryAgain indicating asynchronous
    // request for retrieval of data blocks has been submitted. So it should
    // return at this point and Seek should be called again to retrieve the
    // requested block and add the child to min heap.
    if (children_[level].iter.status().IsTryAgain()) {
      continue;
    }
    {
      // Strictly, we timed slightly more than min heap operation,
      // but these operations are very cheap.
      PERF_TIMER_GUARD(seek_min_heap_time);
      AddToMinHeapOrCheckStatus(&children_[level]);
    }
  }
  if (range_tombstone_iters_.empty()) {
    for (auto& child : children_) {
      if (child.iter.status().IsTryAgain()) {
        child.iter.Seek(target);
        {
          PERF_TIMER_GUARD(seek_min_heap_time);
          AddToMinHeapOrCheckStatus(&child);
        }
        PERF_COUNTER_ADD(number_async_seek, 1);
      }
    }
  } else {
    for (auto& prefetch : prefetched_target) {
      // (level, target) pairs
      children_[prefetch.first].iter.Seek(prefetch.second);
      {
        PERF_TIMER_GUARD(seek_min_heap_time);
        AddToMinHeapOrCheckStatus(&children_[prefetch.first]);
      }
      PERF_COUNTER_ADD(number_async_seek, 1);
    }
  }
}
// Returns true iff the current key (min heap top) should not be returned
// to user (of the merging iterator). This can be because the current key
// is deleted by some range tombstone, the current key is some fake file
// boundary sentinel key, or the current key is an end point of a range
// tombstone. Advance the iterator at heap top if needed. Heap order is
// restored and `active_` is updated accordingly.
// See FindNextVisibleKey() for more detail on internal implementation
// of advancing child iters.
//
// REQUIRES:
// - min heap is currently not empty, and iter is in kForward direction.
// - minHeap_ top is not DELETE_RANGE_START (so that `active_` is current).
bool MergingIterator::SkipNextDeleted() {
  // 3 types of keys in the heap:
  // - point key
  // - file boundary sentinel keys
  // - range deletion end key
  auto current = minHeap_.top();
  if (current->type == HeapItem::DELETE_RANGE_END) {
    active_.erase(current->level);
    assert(range_tombstone_iters_[current->level] &&
           range_tombstone_iters_[current->level]->Valid());
    range_tombstone_iters_[current->level]->Next();
    if (range_tombstone_iters_[current->level]->Valid()) {
      InsertRangeTombstoneToMinHeap(current->level, true /* start_key */,
                                    true /* replace_top */);
    } else {
      minHeap_.pop();
    }
    return true /* current key deleted */;
  }
  if (current->iter.IsDeleteRangeSentinelKey()) {
    // If the file boundary is defined by a range deletion, the range
    // tombstone's end key must come before this sentinel key (see op_type in
    // SetTombstoneKey()).
    assert(ExtractValueType(current->iter.key()) != kTypeRangeDeletion ||
           active_.count(current->level) == 0);
    // LevelIterator enters a new SST file
    current->iter.Next();
    if (current->iter.Valid()) {
      assert(current->iter.status().ok());
      minHeap_.replace_top(current);
    } else {
      minHeap_.pop();
    }
    // Remove last SST file's range tombstone end key if there is one.
    // This means file boundary is before range tombstone end key,
    // which could happen when a range tombstone and a user key
    // straddle two SST files. Note that in TruncatedRangeDelIterator
    // constructor, parsed_largest.sequence is decremented 1 in this case.
    if (!minHeap_.empty() && minHeap_.top()->level == current->level &&
        minHeap_.top()->type == HeapItem::DELETE_RANGE_END) {
      minHeap_.pop();
      active_.erase(current->level);
    }
    if (range_tombstone_iters_[current->level] &&
        range_tombstone_iters_[current->level]->Valid()) {
      InsertRangeTombstoneToMinHeap(current->level);
    }
    return true /* current key deleted */;
  }
  assert(current->type == HeapItem::ITERATOR);
  // Point key case: check active_ for range tombstone coverage.
  ParsedInternalKey pik;
  ParseInternalKey(current->iter.key(), &pik, false).PermitUncheckedError();
  if (!active_.empty()) {
    auto i = *active_.begin();
    if (i < current->level) {
      // range tombstone is from a newer level, definitely covers
      assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(),
                                  pik) <= 0);
      assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) <
             0);
      std::string target;
      AppendInternalKey(&target, range_tombstone_iters_[i]->end_key());
      SeekImpl(target, current->level, true);
      return true /* current key deleted */;
    } else if (i == current->level) {
      // range tombstone is from the same level as current, check sequence
      // number. By `active_` we know current key is between start key and end
      // key.
      assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(),
                                  pik) <= 0);
      assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) <
             0);
      if (pik.sequence < range_tombstone_iters_[current->level]->seq()) {
        // covered by range tombstone
        current->iter.Next();
        if (current->iter.Valid()) {
          minHeap_.replace_top(current);
        } else {
          minHeap_.pop();
        }
        return true /* current key deleted */;
      } else {
        return false /* current key not deleted */;
      }
    } else {
      // range tombstone is from an older sorted run with current key < end
      // key. current key is not deleted and the older sorted run will have
      // its range tombstone updated when the range tombstone's end key is
      // popped from minHeap_.
      return false /* current key not deleted */;
    }
  }
  // we can reach here only if active_ is empty
  assert(active_.empty());
  assert(minHeap_.top()->type == HeapItem::ITERATOR);
  return false /* current key not deleted */;
}
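// Walkthrough (hypothetical): min heap top is point key f@3 from level 2.
// If active_ = {0} and level 0's tombstone is [c, m)@7, f@3 is covered from
// a newer level, so SeekImpl("m", 2, true) reseeks all levels >= 2. If
// instead active_ = {2} and level 2's tombstone seqno is 7 > 3, only that
// child iterator advances past f@3. Either way the key is skipped.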
void MergingIterator::SeekForPrevImpl(const Slice& target,
                                      size_t starting_level,
                                      bool range_tombstone_reseek) {
  // active range tombstones before `starting_level` remain active
  ClearHeaps(false /* clear_active */);
  InitMaxHeap();
  ParsedInternalKey pik;
  if (!range_tombstone_iters_.empty()) {
    ParseInternalKey(target, &pik, false).PermitUncheckedError();
  }
  for (size_t level = 0; level < starting_level; ++level) {
    PERF_TIMER_GUARD(seek_max_heap_time);
    AddToMaxHeapOrCheckStatus(&children_[level]);
  }
  if (!range_tombstone_iters_.empty()) {
    // Add range tombstones before starting_level.
    for (size_t level = 0; level < starting_level; ++level) {
      if (range_tombstone_iters_[level] &&
          range_tombstone_iters_[level]->Valid()) {
        assert(static_cast<bool>(active_.count(level)) ==
               (pinned_heap_item_[level].type ==
                HeapItem::DELETE_RANGE_START));
        maxHeap_->push(&pinned_heap_item_[level]);
      } else {
        assert(!active_.count(level));
      }
    }
    // levels >= starting_level will be reseeked below, so clearing their
    // active state here is safe.
    active_.erase(active_.lower_bound(starting_level), active_.end());
  }
  status_ = Status::OK();
  IterKey current_search_key;
  current_search_key.SetInternalKey(target, false /* copy */);
  // Seek target might change to some range tombstone start key, so
  // we need to remember them for async requests.
  // (level, target) pairs
  autovector<std::pair<size_t, std::string>> prefetched_target;
  for (auto level = starting_level; level < children_.size(); ++level) {
    {
      PERF_TIMER_GUARD(seek_child_seek_time);
      children_[level].iter.SeekForPrev(current_search_key.GetInternalKey());
    }

    PERF_COUNTER_ADD(seek_child_seek_count, 1);

    if (!range_tombstone_iters_.empty()) {
      if (range_tombstone_reseek) {
        // This seek is to some range tombstone start key.
        // Should only happen when there are range tombstones.
        PERF_COUNTER_ADD(internal_range_del_reseek_count, 1);
      }
      if (children_[level].iter.status().IsTryAgain()) {
        prefetched_target.emplace_back(
            level, current_search_key.GetInternalKey().ToString());
      }
      auto range_tombstone_iter = range_tombstone_iters_[level];
      if (range_tombstone_iter) {
        range_tombstone_iter->SeekForPrev(current_search_key.GetUserKey());
        if (range_tombstone_iter->Valid()) {
          InsertRangeTombstoneToMaxHeap(
              level, comparator_->Compare(range_tombstone_iter->end_key(),
                                          pik) <= 0 /* end_key */);
          // start key <= current_search_key guaranteed by the SeekForPrev()
          // call above. Only interested in user key coverage since older
          // sorted runs must have smaller sequence numbers than this
          // tombstone.
          if (comparator_->user_comparator()->Compare(
                  current_search_key.GetUserKey(),
                  range_tombstone_iter->end_key().user_key) < 0) {
            range_tombstone_reseek = true;
            current_search_key.SetInternalKey(
                range_tombstone_iter->start_key().user_key, kMaxSequenceNumber,
                kValueTypeForSeekForPrev);
          }
        }
      }
    }
    // child.iter.status() is set to Status::TryAgain indicating asynchronous
    // request for retrieval of data blocks has been submitted. So it should
    // return at this point and SeekForPrev should be called again to retrieve
    // the requested block and add the child to max heap.
    if (children_[level].iter.status().IsTryAgain()) {
      continue;
    }
    {
      // Strictly, we timed slightly more than max heap operation,
      // but these operations are very cheap.
      PERF_TIMER_GUARD(seek_max_heap_time);
      AddToMaxHeapOrCheckStatus(&children_[level]);
    }
  }
  if (range_tombstone_iters_.empty()) {
    for (auto& child : children_) {
      if (child.iter.status().IsTryAgain()) {
        child.iter.SeekForPrev(target);
        {
          PERF_TIMER_GUARD(seek_max_heap_time);
          AddToMaxHeapOrCheckStatus(&child);
        }
        PERF_COUNTER_ADD(number_async_seek, 1);
      }
    }
  } else {
    for (auto& prefetch : prefetched_target) {
      // (level, target) pairs
      children_[prefetch.first].iter.SeekForPrev(prefetch.second);
      {
        PERF_TIMER_GUARD(seek_max_heap_time);
        AddToMaxHeapOrCheckStatus(&children_[prefetch.first]);
      }
      PERF_COUNTER_ADD(number_async_seek, 1);
    }
  }
}
// See more in comments above SkipNextDeleted().
//
// REQUIRES:
// - max heap is currently not empty, and iter is in kReverse direction.
// - maxHeap_ top is not DELETE_RANGE_END (so that `active_` is current).
bool MergingIterator::SkipPrevDeleted() {
  // 3 types of keys in the heap:
  // - point key
  // - file boundary sentinel keys
  // - range deletion start key
  auto current = maxHeap_->top();
  if (current->type == HeapItem::DELETE_RANGE_START) {
    active_.erase(current->level);
    assert(range_tombstone_iters_[current->level] &&
           range_tombstone_iters_[current->level]->Valid());
    range_tombstone_iters_[current->level]->Prev();
    if (range_tombstone_iters_[current->level]->Valid()) {
      InsertRangeTombstoneToMaxHeap(current->level, true /* end_key */,
                                    true /* replace_top */);
    } else {
      maxHeap_->pop();
    }
    return true /* current key deleted */;
  }
  if (current->iter.IsDeleteRangeSentinelKey()) {
    // LevelIterator enters a new SST file
    current->iter.Prev();
    if (current->iter.Valid()) {
      assert(current->iter.status().ok());
      maxHeap_->replace_top(current);
    } else {
      maxHeap_->pop();
    }
    // Remove last SST file's range tombstone start key if there is one.
    if (!maxHeap_->empty() && maxHeap_->top()->level == current->level &&
        maxHeap_->top()->type == HeapItem::DELETE_RANGE_START) {
      maxHeap_->pop();
      active_.erase(current->level);
    }
    if (range_tombstone_iters_[current->level] &&
        range_tombstone_iters_[current->level]->Valid()) {
      InsertRangeTombstoneToMaxHeap(current->level);
    }
    return true /* current key deleted */;
  }
  assert(current->type == HeapItem::ITERATOR);
  // Point key case: check active_ for range tombstone coverage.
  ParsedInternalKey pik;
  ParseInternalKey(current->iter.key(), &pik, false).PermitUncheckedError();
  if (!active_.empty()) {
    auto i = *active_.begin();
    if (i < current->level) {
      // range tombstone is from a newer level, definitely covers
      assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(),
                                  pik) <= 0);
      assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) <
             0);
      std::string target;
      AppendInternalKey(&target, range_tombstone_iters_[i]->start_key());
      // This is different from SkipNextDeleted() which does reseek at sorted
      // runs >= level (instead of i+1 here). With min heap, if level L is at
      // top of the heap, then levels <L all have internal keys > level L's
      // current internal key, which means levels <L are already at a
      // different user key. With max heap, if level L is at top of the heap,
      // then levels <L all have internal keys smaller than level L's current
      // internal key, which might still be the same user key.
      SeekForPrevImpl(target, i + 1, true);
      return true /* current key deleted */;
    } else if (i == current->level) {
      // By `active_` we know current key is between start key and end key.
      assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(),
                                  pik) <= 0);
      assert(comparator_->Compare(pik, range_tombstone_iters_[i]->end_key()) <
             0);
      if (pik.sequence < range_tombstone_iters_[current->level]->seq()) {
        // covered by range tombstone
        current->iter.Prev();
        if (current->iter.Valid()) {
          maxHeap_->replace_top(current);
        } else {
          maxHeap_->pop();
        }
        return true /* current key deleted */;
      } else {
        return false /* current key not deleted */;
      }
    } else {
      // range tombstone is from an older sorted run with current key < end
      // key. current key is not deleted and the older sorted run will have
      // its range tombstone updated when the range tombstone's start key is
      // popped from maxHeap_.
      return false /* current key not deleted */;
    }
  }
  // we can reach here only if active_ is empty
  assert(active_.empty());
  assert(maxHeap_->top()->type == HeapItem::ITERATOR);
  return false /* current key not deleted */;
}
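// Illustration of the asymmetry above (hypothetical): active_ = {0} with
// tombstone [a, c)@9; the max heap top is b@5 from level 3 while level 2
// sits at b@7, a smaller internal key with the same user key. Reseeking only
// levels >= 3 would leave level 2 on the covered key b@7, so
// SeekForPrevImpl() reseeks from level i + 1 = 1 instead.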
void MergingIterator::AddToMinHeapOrCheckStatus(HeapItem* child) {
  if (child->iter.Valid()) {
    assert(child->iter.status().ok());
    minHeap_.push(child);
  } else {
    considerStatus(child->iter.status());
  }
}

void MergingIterator::AddToMaxHeapOrCheckStatus(HeapItem* child) {
  if (child->iter.Valid()) {
    assert(child->iter.status().ok());
    maxHeap_->push(child);
  } else {
    considerStatus(child->iter.status());
  }
}
// Advance all non-current_ children to > current_.key().
// We advance current_ after this function call as it does not require a
// Seek().
// Advance all range tombstone iters, including the one corresponding to
// current_, to the first tombstone with end_key > current_.key().
// TODO: potentially do cascading seek here too
void MergingIterator::SwitchToForward() {
  ClearHeaps();
  Slice target = key();
  for (auto& child : children_) {
    if (&child.iter != current_) {
      child.iter.Seek(target);
      // child.iter.status() is set to Status::TryAgain indicating asynchronous
      // request for retrieval of data blocks has been submitted. So it should
      // return at this point and Seek should be called again to retrieve the
      // requested block and add the child to min heap.
      if (child.iter.status() == Status::TryAgain()) {
        continue;
      }
      if (child.iter.Valid() && comparator_->Equal(target, child.key())) {
        assert(child.iter.status().ok());
        child.iter.Next();
      }
    }
    AddToMinHeapOrCheckStatus(&child);
  }

  for (auto& child : children_) {
    if (child.iter.status() == Status::TryAgain()) {
      child.iter.Seek(target);
      if (child.iter.Valid() && comparator_->Equal(target, child.key())) {
        assert(child.iter.status().ok());
        child.iter.Next();
      }
      AddToMinHeapOrCheckStatus(&child);
    }
  }

  // Current range tombstone iter also needs to seek for the following case:
  // Previous direction is backward, so range tombstone iter may point to a
  // tombstone before current_. If there is no such tombstone, then the range
  // tombstone iter is !Valid(). Need to reseek here to make it valid again.
  if (!range_tombstone_iters_.empty()) {
    ParsedInternalKey pik;
    ParseInternalKey(target, &pik, false /* log_err_key */)
        .PermitUncheckedError();
    for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
      auto iter = range_tombstone_iters_[i];
      if (iter) {
        iter->Seek(pik.user_key);
        // The while loop is needed as the Seek() call above is only for user
        // key. We could have a range tombstone with end_key covering
        // user_key, but still is smaller than target. This happens when the
        // range tombstone is truncated at iter.largest_.
        while (iter->Valid() &&
               comparator_->Compare(iter->end_key(), pik) <= 0) {
          iter->Next();
        }
        if (range_tombstone_iters_[i]->Valid()) {
          InsertRangeTombstoneToMinHeap(
              i, comparator_->Compare(range_tombstone_iters_[i]->start_key(),
                                      pik) > 0 /* start_key */);
        }
      }
    }
  }

  direction_ = kForward;
  assert(current_ == CurrentForward());
}
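// Direction-switch sketch (hypothetical keys): after Prev() reached d, a
// Next() call lands here with target d. A child previously at b seeks to d
// and, if it lands exactly on d, advances to its next key > d; a tombstone
// iterator parked on [a, c) is reseeked forward to the first tombstone with
// end key > d.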
// Advance all range tombstone iters, including the one corresponding to
// current_, to the first tombstone with start_key <= current_.key().
void MergingIterator::SwitchToBackward() {
  ClearHeaps();
  InitMaxHeap();
  Slice target = key();
  for (auto& child : children_) {
    if (&child.iter != current_) {
      child.iter.SeekForPrev(target);
      TEST_SYNC_POINT_CALLBACK("MergeIterator::Prev:BeforePrev", &child);
      if (child.iter.Valid() && comparator_->Equal(target, child.key())) {
        assert(child.iter.status().ok());
        child.iter.Prev();
      }
    }
    AddToMaxHeapOrCheckStatus(&child);
  }

  ParsedInternalKey pik;
  ParseInternalKey(target, &pik, false /* log_err_key */)
      .PermitUncheckedError();
  for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
    auto iter = range_tombstone_iters_[i];
    if (iter) {
      iter->SeekForPrev(pik.user_key);
      // Since the SeekForPrev() call above is only for user key,
      // we may end up with some range tombstone with start key having the
      // same user key as current_, but with a smaller sequence number. This
      // makes current_ not at maxHeap_ top for the CurrentReverse() call
      // below. If there is a range tombstone start key with the same user
      // key and the same sequence number as current_.key(), it will be fine
      // as in InsertRangeTombstoneToMaxHeap() we change op_type to be the
      // smallest op_type.
      while (iter->Valid() &&
             comparator_->Compare(iter->start_key(), pik) > 0) {
        iter->Prev();
      }
      if (iter->Valid()) {
        InsertRangeTombstoneToMaxHeap(
            i, comparator_->Compare(range_tombstone_iters_[i]->end_key(),
                                    pik) <= 0 /* end_key */);
      }
    }
  }

  direction_ = kReverse;
  if (!prefix_seek_mode_) {
    // Note that we don't do assert(current_ == CurrentReverse()) here
    // because it is possible to have some keys larger than the seek-key
    // inserted between Seek() and SeekToLast(), which makes current_ not
    // equal to CurrentReverse().
    current_ = CurrentReverse();
  }
  assert(current_ == CurrentReverse());
}
void MergingIterator::ClearHeaps(bool clear_active) {
  minHeap_.clear();
  if (maxHeap_) {
    maxHeap_->clear();
  }
  if (clear_active) {
    active_.clear();
  }
}
void MergingIterator::InitMaxHeap() {
  if (!maxHeap_) {
    maxHeap_ = std::make_unique<MergerMaxIterHeap>(comparator_);
  }
}
// Repeatedly check the heap top and remove it until it is a point key that is
// not covered by any range tombstone. SeekImpl() is called to seek to the end
// of a range tombstone if the heap top is a point key covered by some range
// tombstone from a newer sorted run. If the covering tombstone is from the
// current key's level, then the current child iterator is simply advanced to
// its next key without reseeking.
inline void MergingIterator::FindNextVisibleKey() {
  // When active_ is empty, we know heap top cannot be a range tombstone end
  // key. It cannot be a range tombstone start key per PopDeleteRangeStart().
  PopDeleteRangeStart();
  while (!minHeap_.empty() &&
         (!active_.empty() || minHeap_.top()->IsDeleteRangeSentinelKey()) &&
         SkipNextDeleted()) {
    PopDeleteRangeStart();
  }
}
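// Loop shape (hypothetical trace): PopDeleteRangeStart() first converts any
// tombstone start keys at the top into end keys (activating their levels);
// SkipNextDeleted() then drops one covered/sentinel key at a time, and the
// pop step reruns since each skip can surface a new start key at the top.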
inline void MergingIterator::FindPrevVisibleKey() {
  PopDeleteRangeEnd();
  while (!maxHeap_->empty() &&
         (!active_.empty() || maxHeap_->top()->IsDeleteRangeSentinelKey()) &&
         SkipPrevDeleted()) {
    PopDeleteRangeEnd();
  }
}
InternalIterator* NewMergingIterator(const InternalKeyComparator* cmp,
                                     InternalIterator** list, int n,
                                     Arena* arena, bool prefix_seek_mode) {
  assert(n >= 0);
  if (n == 0) {
    return NewEmptyInternalIterator<Slice>(arena);
  } else if (n == 1) {
    return list[0];
  } else {
    if (arena == nullptr) {
      return new MergingIterator(cmp, list, n, false, prefix_seek_mode);
    } else {
      auto mem = arena->AllocateAligned(sizeof(MergingIterator));
      return new (mem) MergingIterator(cmp, list, n, true, prefix_seek_mode);
    }
  }
}
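// Usage sketch (hypothetical names, for illustration only): merging three
// sorted child iterators into one view, allocating on an arena:
//
//   InternalIterator* children[3] = {memtable_iter, l0_iter, l1_iter};
//   InternalIterator* merged =
//       NewMergingIterator(&internal_comparator, children, 3, &arena,
//                          false /* prefix_seek_mode */);
//   merged->SeekToFirst();  // heaps are constructed on the first Seek*()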
MergeIteratorBuilder::MergeIteratorBuilder(
    const InternalKeyComparator* comparator, Arena* a, bool prefix_seek_mode,
    const Slice* iterate_upper_bound)
    : first_iter(nullptr), use_merging_iter(false), arena(a) {
  auto mem = arena->AllocateAligned(sizeof(MergingIterator));
  merge_iter =
      new (mem) MergingIterator(comparator, nullptr, 0, true, prefix_seek_mode,
                                iterate_upper_bound);
}
MergeIteratorBuilder::~MergeIteratorBuilder() {
  if (first_iter != nullptr) {
    first_iter->~InternalIterator();
  }
  if (merge_iter != nullptr) {
    merge_iter->~MergingIterator();
  }
}
void MergeIteratorBuilder::AddIterator(InternalIterator* iter) {
  if (!use_merging_iter && first_iter != nullptr) {
    merge_iter->AddIterator(first_iter);
    use_merging_iter = true;
    first_iter = nullptr;
  }
  if (use_merging_iter) {
    merge_iter->AddIterator(iter);
  } else {
    first_iter = iter;
  }
}
void MergeIteratorBuilder::AddPointAndTombstoneIterator(
    InternalIterator* point_iter, TruncatedRangeDelIterator* tombstone_iter,
    TruncatedRangeDelIterator*** tombstone_iter_ptr) {
  // tombstone_iter_ptr != nullptr means point_iter is a LevelIterator.
  bool add_range_tombstone = tombstone_iter ||
                             !merge_iter->range_tombstone_iters_.empty() ||
                             tombstone_iter_ptr;
  if (!use_merging_iter && (add_range_tombstone || first_iter)) {
    use_merging_iter = true;
    if (first_iter) {
      merge_iter->AddIterator(first_iter);
      first_iter = nullptr;
    }
  }
  if (use_merging_iter) {
    merge_iter->AddIterator(point_iter);
    if (add_range_tombstone) {
      // If there was a gap, fill in nullptr as empty range tombstone
      // iterators.
      while (merge_iter->range_tombstone_iters_.size() <
             merge_iter->children_.size() - 1) {
        merge_iter->AddRangeTombstoneIterator(nullptr);
      }
      merge_iter->AddRangeTombstoneIterator(tombstone_iter);
    }

    if (tombstone_iter_ptr) {
      // This is needed instead of setting to &range_tombstone_iters_[i]
      // directly here since the memory address of range_tombstone_iters_[i]
      // might change during vector resizing.
      range_del_iter_ptrs_.emplace_back(
          merge_iter->range_tombstone_iters_.size() - 1, tombstone_iter_ptr);
    }
  } else {
    first_iter = point_iter;
  }
}
InternalIterator* MergeIteratorBuilder::Finish(ArenaWrappedDBIter* db_iter) {
  InternalIterator* ret = nullptr;
  if (!use_merging_iter) {
    ret = first_iter;
    first_iter = nullptr;
  } else {
    for (auto& p : range_del_iter_ptrs_) {
      *(p.second) = &(merge_iter->range_tombstone_iters_[p.first]);
    }
    if (db_iter && !merge_iter->range_tombstone_iters_.empty()) {
      // memtable is always the first level
      db_iter->SetMemtableRangetombstoneIter(
          &merge_iter->range_tombstone_iters_.front());
    }
    merge_iter->Finish();
    ret = merge_iter;
    merge_iter = nullptr;
  }
  return ret;
}

}  // namespace ROCKSDB_NAMESPACE