]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/db/range_del_aggregator.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / rocksdb / db / range_del_aggregator.cc
CommitLineData
494da23a 1// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5
6#include "db/range_del_aggregator.h"
7
f67539c2 8#include "db/compaction/compaction_iteration_stats.h"
494da23a
TL
9#include "db/dbformat.h"
10#include "db/pinned_iterators_manager.h"
11#include "db/range_del_aggregator.h"
12#include "db/range_tombstone_fragmenter.h"
13#include "db/version_edit.h"
f67539c2
TL
14#include "rocksdb/comparator.h"
15#include "rocksdb/types.h"
494da23a
TL
16#include "table/internal_iterator.h"
17#include "table/scoped_arena_iterator.h"
18#include "table/table_builder.h"
19#include "util/heap.h"
20#include "util/kv_map.h"
21#include "util/vector_iterator.h"
7c673cae 22
f67539c2 23namespace ROCKSDB_NAMESPACE {
7c673cae 24
494da23a
TL
25TruncatedRangeDelIterator::TruncatedRangeDelIterator(
26 std::unique_ptr<FragmentedRangeTombstoneIterator> iter,
27 const InternalKeyComparator* icmp, const InternalKey* smallest,
28 const InternalKey* largest)
29 : iter_(std::move(iter)),
30 icmp_(icmp),
31 smallest_ikey_(smallest),
32 largest_ikey_(largest) {
33 if (smallest != nullptr) {
34 pinned_bounds_.emplace_back();
35 auto& parsed_smallest = pinned_bounds_.back();
20effc67
TL
36 Status pik_status = ParseInternalKey(smallest->Encode(), &parsed_smallest,
37 false /* log_err_key */); // TODO
38 pik_status.PermitUncheckedError();
39 assert(pik_status.ok());
40
494da23a 41 smallest_ = &parsed_smallest;
11fdf7f2 42 }
494da23a
TL
43 if (largest != nullptr) {
44 pinned_bounds_.emplace_back();
45 auto& parsed_largest = pinned_bounds_.back();
20effc67
TL
46
47 Status pik_status = ParseInternalKey(largest->Encode(), &parsed_largest,
48 false /* log_err_key */); // TODO
49 pik_status.PermitUncheckedError();
50 assert(pik_status.ok());
51
494da23a
TL
52 if (parsed_largest.type == kTypeRangeDeletion &&
53 parsed_largest.sequence == kMaxSequenceNumber) {
54 // The file boundary has been artificially extended by a range tombstone.
55 // We do not need to adjust largest to properly truncate range
56 // tombstones that extend past the boundary.
57 } else if (parsed_largest.sequence == 0) {
58 // The largest key in the sstable has a sequence number of 0. Since we
59 // guarantee that no internal keys with the same user key and sequence
60 // number can exist in a DB, we know that the largest key in this sstable
61 // cannot exist as the smallest key in the next sstable. This further
62 // implies that no range tombstone in this sstable covers largest;
63 // otherwise, the file boundary would have been artificially extended.
64 //
65 // Therefore, we will never truncate a range tombstone at largest, so we
66 // can leave it unchanged.
67 } else {
68 // The same user key may straddle two sstable boundaries. To ensure that
69 // the truncated end key can cover the largest key in this sstable, reduce
70 // its sequence number by 1.
71 parsed_largest.sequence -= 1;
11fdf7f2 72 }
494da23a 73 largest_ = &parsed_largest;
11fdf7f2 74 }
494da23a 75}
11fdf7f2 76
494da23a
TL
77bool TruncatedRangeDelIterator::Valid() const {
78 return iter_->Valid() &&
79 (smallest_ == nullptr ||
80 icmp_->Compare(*smallest_, iter_->parsed_end_key()) < 0) &&
81 (largest_ == nullptr ||
82 icmp_->Compare(iter_->parsed_start_key(), *largest_) < 0);
83}
11fdf7f2 84
494da23a 85void TruncatedRangeDelIterator::Next() { iter_->TopNext(); }
11fdf7f2 86
494da23a 87void TruncatedRangeDelIterator::Prev() { iter_->TopPrev(); }
11fdf7f2 88
494da23a 89void TruncatedRangeDelIterator::InternalNext() { iter_->Next(); }
11fdf7f2 90
494da23a
TL
91// NOTE: target is a user key
92void TruncatedRangeDelIterator::Seek(const Slice& target) {
93 if (largest_ != nullptr &&
94 icmp_->Compare(*largest_, ParsedInternalKey(target, kMaxSequenceNumber,
95 kTypeRangeDeletion)) <= 0) {
96 iter_->Invalidate();
97 return;
11fdf7f2 98 }
494da23a
TL
99 if (smallest_ != nullptr &&
100 icmp_->user_comparator()->Compare(target, smallest_->user_key) < 0) {
101 iter_->Seek(smallest_->user_key);
102 return;
11fdf7f2 103 }
494da23a
TL
104 iter_->Seek(target);
105}
11fdf7f2 106
494da23a
TL
107// NOTE: target is a user key
108void TruncatedRangeDelIterator::SeekForPrev(const Slice& target) {
109 if (smallest_ != nullptr &&
110 icmp_->Compare(ParsedInternalKey(target, 0, kTypeRangeDeletion),
111 *smallest_) < 0) {
112 iter_->Invalidate();
113 return;
11fdf7f2 114 }
494da23a
TL
115 if (largest_ != nullptr &&
116 icmp_->user_comparator()->Compare(largest_->user_key, target) < 0) {
117 iter_->SeekForPrev(largest_->user_key);
118 return;
11fdf7f2 119 }
494da23a
TL
120 iter_->SeekForPrev(target);
121}
11fdf7f2 122
494da23a
TL
123void TruncatedRangeDelIterator::SeekToFirst() {
124 if (smallest_ != nullptr) {
125 iter_->Seek(smallest_->user_key);
126 return;
127 }
128 iter_->SeekToTopFirst();
129}
11fdf7f2 130
494da23a
TL
131void TruncatedRangeDelIterator::SeekToLast() {
132 if (largest_ != nullptr) {
133 iter_->SeekForPrev(largest_->user_key);
134 return;
11fdf7f2 135 }
494da23a
TL
136 iter_->SeekToTopLast();
137}
11fdf7f2 138
494da23a
TL
139std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
140TruncatedRangeDelIterator::SplitBySnapshot(
141 const std::vector<SequenceNumber>& snapshots) {
142 using FragmentedIterPair =
143 std::pair<const SequenceNumber,
144 std::unique_ptr<FragmentedRangeTombstoneIterator>>;
145
146 auto split_untruncated_iters = iter_->SplitBySnapshot(snapshots);
147 std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
148 split_truncated_iters;
149 std::for_each(
150 split_untruncated_iters.begin(), split_untruncated_iters.end(),
151 [&](FragmentedIterPair& iter_pair) {
152 std::unique_ptr<TruncatedRangeDelIterator> truncated_iter(
153 new TruncatedRangeDelIterator(std::move(iter_pair.second), icmp_,
154 smallest_ikey_, largest_ikey_));
155 split_truncated_iters.emplace(iter_pair.first,
156 std::move(truncated_iter));
157 });
158 return split_truncated_iters;
159}
11fdf7f2 160
494da23a
TL
161ForwardRangeDelIterator::ForwardRangeDelIterator(
162 const InternalKeyComparator* icmp)
163 : icmp_(icmp),
164 unused_idx_(0),
165 active_seqnums_(SeqMaxComparator()),
166 active_iters_(EndKeyMinComparator(icmp)),
167 inactive_iters_(StartKeyMinComparator(icmp)) {}
168
169bool ForwardRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) {
170 // Move active iterators that end before parsed.
171 while (!active_iters_.empty() &&
172 icmp_->Compare((*active_iters_.top())->end_key(), parsed) <= 0) {
173 TruncatedRangeDelIterator* iter = PopActiveIter();
174 do {
175 iter->Next();
176 } while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0);
177 PushIter(iter, parsed);
178 assert(active_iters_.size() == active_seqnums_.size());
179 }
11fdf7f2 180
494da23a
TL
181 // Move inactive iterators that start before parsed.
182 while (!inactive_iters_.empty() &&
183 icmp_->Compare(inactive_iters_.top()->start_key(), parsed) <= 0) {
184 TruncatedRangeDelIterator* iter = PopInactiveIter();
185 while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0) {
186 iter->Next();
187 }
188 PushIter(iter, parsed);
189 assert(active_iters_.size() == active_seqnums_.size());
11fdf7f2 190 }
11fdf7f2 191
494da23a
TL
192 return active_seqnums_.empty()
193 ? false
194 : (*active_seqnums_.begin())->seq() > parsed.sequence;
7c673cae
FG
195}
196
494da23a
TL
197void ForwardRangeDelIterator::Invalidate() {
198 unused_idx_ = 0;
199 active_iters_.clear();
200 active_seqnums_.clear();
201 inactive_iters_.clear();
7c673cae
FG
202}
203
494da23a
TL
204ReverseRangeDelIterator::ReverseRangeDelIterator(
205 const InternalKeyComparator* icmp)
206 : icmp_(icmp),
207 unused_idx_(0),
208 active_seqnums_(SeqMaxComparator()),
209 active_iters_(StartKeyMaxComparator(icmp)),
210 inactive_iters_(EndKeyMaxComparator(icmp)) {}
211
212bool ReverseRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) {
213 // Move active iterators that start after parsed.
214 while (!active_iters_.empty() &&
215 icmp_->Compare(parsed, (*active_iters_.top())->start_key()) < 0) {
216 TruncatedRangeDelIterator* iter = PopActiveIter();
217 do {
218 iter->Prev();
219 } while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0);
220 PushIter(iter, parsed);
221 assert(active_iters_.size() == active_seqnums_.size());
7c673cae 222 }
11fdf7f2 223
494da23a
TL
224 // Move inactive iterators that end after parsed.
225 while (!inactive_iters_.empty() &&
226 icmp_->Compare(parsed, inactive_iters_.top()->end_key()) < 0) {
227 TruncatedRangeDelIterator* iter = PopInactiveIter();
228 while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0) {
229 iter->Prev();
230 }
231 PushIter(iter, parsed);
232 assert(active_iters_.size() == active_seqnums_.size());
7c673cae 233 }
494da23a
TL
234
235 return active_seqnums_.empty()
236 ? false
237 : (*active_seqnums_.begin())->seq() > parsed.sequence;
7c673cae
FG
238}
239
494da23a
TL
240void ReverseRangeDelIterator::Invalidate() {
241 unused_idx_ = 0;
242 active_iters_.clear();
243 active_seqnums_.clear();
244 inactive_iters_.clear();
7c673cae
FG
245}
246
494da23a
TL
247bool RangeDelAggregator::StripeRep::ShouldDelete(
248 const ParsedInternalKey& parsed, RangeDelPositioningMode mode) {
249 if (!InStripe(parsed.sequence) || IsEmpty()) {
7c673cae
FG
250 return false;
251 }
494da23a
TL
252 switch (mode) {
253 case RangeDelPositioningMode::kForwardTraversal:
254 InvalidateReverseIter();
255
256 // Pick up previously unseen iterators.
257 for (auto it = std::next(iters_.begin(), forward_iter_.UnusedIdx());
258 it != iters_.end(); ++it, forward_iter_.IncUnusedIdx()) {
259 auto& iter = *it;
260 forward_iter_.AddNewIter(iter.get(), parsed);
261 }
262
263 return forward_iter_.ShouldDelete(parsed);
264 case RangeDelPositioningMode::kBackwardTraversal:
265 InvalidateForwardIter();
266
267 // Pick up previously unseen iterators.
268 for (auto it = std::next(iters_.begin(), reverse_iter_.UnusedIdx());
269 it != iters_.end(); ++it, reverse_iter_.IncUnusedIdx()) {
270 auto& iter = *it;
271 reverse_iter_.AddNewIter(iter.get(), parsed);
272 }
273
274 return reverse_iter_.ShouldDelete(parsed);
275 default:
276 assert(false);
277 return false;
7c673cae 278 }
7c673cae
FG
279}
280
494da23a
TL
281bool RangeDelAggregator::StripeRep::IsRangeOverlapped(const Slice& start,
282 const Slice& end) {
283 Invalidate();
284
285 // Set the internal start/end keys so that:
286 // - if start_ikey has the same user key and sequence number as the
287 // current end key, start_ikey will be considered greater; and
288 // - if end_ikey has the same user key and sequence number as the current
289 // start key, end_ikey will be considered greater.
290 ParsedInternalKey start_ikey(start, kMaxSequenceNumber,
291 static_cast<ValueType>(0));
292 ParsedInternalKey end_ikey(end, 0, static_cast<ValueType>(0));
293 for (auto& iter : iters_) {
294 bool checked_candidate_tombstones = false;
295 for (iter->SeekForPrev(start);
296 iter->Valid() && icmp_->Compare(iter->start_key(), end_ikey) <= 0;
297 iter->Next()) {
298 checked_candidate_tombstones = true;
299 if (icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
300 icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
301 return true;
11fdf7f2
TL
302 }
303 }
494da23a
TL
304
305 if (!checked_candidate_tombstones) {
306 // Do an additional check for when the end of the range is the begin
307 // key of a tombstone, which we missed earlier since SeekForPrev'ing
308 // to the start was invalid.
309 iter->SeekForPrev(end);
310 if (iter->Valid() && icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
311 icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
312 return true;
11fdf7f2
TL
313 }
314 }
7c673cae 315 }
494da23a 316 return false;
7c673cae
FG
317}
318
494da23a
TL
319void ReadRangeDelAggregator::AddTombstones(
320 std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
321 const InternalKey* smallest, const InternalKey* largest) {
322 if (input_iter == nullptr || input_iter->empty()) {
7c673cae
FG
323 return;
324 }
494da23a
TL
325 rep_.AddTombstones(
326 std::unique_ptr<TruncatedRangeDelIterator>(new TruncatedRangeDelIterator(
327 std::move(input_iter), icmp_, smallest, largest)));
7c673cae
FG
328}
329
f67539c2
TL
330bool ReadRangeDelAggregator::ShouldDeleteImpl(const ParsedInternalKey& parsed,
331 RangeDelPositioningMode mode) {
494da23a 332 return rep_.ShouldDelete(parsed, mode);
7c673cae
FG
333}
334
494da23a
TL
335bool ReadRangeDelAggregator::IsRangeOverlapped(const Slice& start,
336 const Slice& end) {
337 InvalidateRangeDelMapPositions();
338 return rep_.IsRangeOverlapped(start, end);
339}
340
341void CompactionRangeDelAggregator::AddTombstones(
342 std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
343 const InternalKey* smallest, const InternalKey* largest) {
344 if (input_iter == nullptr || input_iter->empty()) {
345 return;
7c673cae 346 }
494da23a
TL
347 assert(input_iter->lower_bound() == 0);
348 assert(input_iter->upper_bound() == kMaxSequenceNumber);
349 parent_iters_.emplace_back(new TruncatedRangeDelIterator(
350 std::move(input_iter), icmp_, smallest, largest));
351
352 auto split_iters = parent_iters_.back()->SplitBySnapshot(*snapshots_);
353 for (auto& split_iter : split_iters) {
354 auto it = reps_.find(split_iter.first);
355 if (it == reps_.end()) {
356 bool inserted;
357 SequenceNumber upper_bound = split_iter.second->upper_bound();
358 SequenceNumber lower_bound = split_iter.second->lower_bound();
359 std::tie(it, inserted) = reps_.emplace(
360 split_iter.first, StripeRep(icmp_, upper_bound, lower_bound));
361 assert(inserted);
7c673cae 362 }
494da23a
TL
363 assert(it != reps_.end());
364 it->second.AddTombstones(std::move(split_iter.second));
7c673cae 365 }
11fdf7f2 366}
7c673cae 367
494da23a
TL
368bool CompactionRangeDelAggregator::ShouldDelete(const ParsedInternalKey& parsed,
369 RangeDelPositioningMode mode) {
370 auto it = reps_.lower_bound(parsed.sequence);
371 if (it == reps_.end()) {
372 return false;
11fdf7f2 373 }
494da23a 374 return it->second.ShouldDelete(parsed, mode);
11fdf7f2 375}
7c673cae 376
494da23a
TL
377namespace {
378
379class TruncatedRangeDelMergingIter : public InternalIterator {
11fdf7f2 380 public:
494da23a
TL
381 TruncatedRangeDelMergingIter(
382 const InternalKeyComparator* icmp, const Slice* lower_bound,
383 const Slice* upper_bound, bool upper_bound_inclusive,
384 const std::vector<std::unique_ptr<TruncatedRangeDelIterator>>& children)
385 : icmp_(icmp),
386 lower_bound_(lower_bound),
387 upper_bound_(upper_bound),
388 upper_bound_inclusive_(upper_bound_inclusive),
389 heap_(StartKeyMinComparator(icmp)) {
390 for (auto& child : children) {
391 if (child != nullptr) {
392 assert(child->lower_bound() == 0);
393 assert(child->upper_bound() == kMaxSequenceNumber);
394 children_.push_back(child.get());
395 }
7c673cae 396 }
7c673cae 397 }
7c673cae 398
494da23a
TL
399 bool Valid() const override {
400 return !heap_.empty() && BeforeEndKey(heap_.top());
401 }
402 Status status() const override { return Status::OK(); }
403
404 void SeekToFirst() override {
405 heap_.clear();
406 for (auto& child : children_) {
407 if (lower_bound_ != nullptr) {
408 child->Seek(*lower_bound_);
409 } else {
410 child->SeekToFirst();
411 }
412 if (child->Valid()) {
413 heap_.push(child);
414 }
415 }
416 }
11fdf7f2
TL
417
418 void Next() override {
494da23a
TL
419 auto* top = heap_.top();
420 top->InternalNext();
421 if (top->Valid()) {
422 heap_.replace_top(top);
11fdf7f2
TL
423 } else {
424 heap_.pop();
425 }
7c673cae 426 }
11fdf7f2 427
494da23a
TL
428 Slice key() const override {
429 auto* top = heap_.top();
430 cur_start_key_.Set(top->start_key().user_key, top->seq(),
431 kTypeRangeDeletion);
432 return cur_start_key_.Encode();
7c673cae 433 }
11fdf7f2 434
494da23a
TL
435 Slice value() const override {
436 auto* top = heap_.top();
437 assert(top->end_key().sequence == kMaxSequenceNumber);
438 return top->end_key().user_key;
439 }
440
441 // Unused InternalIterator methods
442 void Prev() override { assert(false); }
443 void Seek(const Slice& /* target */) override { assert(false); }
444 void SeekForPrev(const Slice& /* target */) override { assert(false); }
445 void SeekToLast() override { assert(false); }
11fdf7f2
TL
446
447 private:
494da23a
TL
448 bool BeforeEndKey(const TruncatedRangeDelIterator* iter) const {
449 if (upper_bound_ == nullptr) {
450 return true;
11fdf7f2 451 }
494da23a
TL
452 int cmp = icmp_->user_comparator()->Compare(iter->start_key().user_key,
453 *upper_bound_);
454 return upper_bound_inclusive_ ? cmp <= 0 : cmp < 0;
455 }
11fdf7f2 456
494da23a
TL
457 const InternalKeyComparator* icmp_;
458 const Slice* lower_bound_;
459 const Slice* upper_bound_;
460 bool upper_bound_inclusive_;
461 BinaryHeap<TruncatedRangeDelIterator*, StartKeyMinComparator> heap_;
462 std::vector<TruncatedRangeDelIterator*> children_;
11fdf7f2 463
494da23a 464 mutable InternalKey cur_start_key_;
11fdf7f2
TL
465};
466
494da23a
TL
467} // namespace
468
469std::unique_ptr<FragmentedRangeTombstoneIterator>
470CompactionRangeDelAggregator::NewIterator(const Slice* lower_bound,
471 const Slice* upper_bound,
472 bool upper_bound_inclusive) {
473 InvalidateRangeDelMapPositions();
474 std::unique_ptr<TruncatedRangeDelMergingIter> merging_iter(
475 new TruncatedRangeDelMergingIter(icmp_, lower_bound, upper_bound,
476 upper_bound_inclusive, parent_iters_));
477
478 auto fragmented_tombstone_list =
479 std::make_shared<FragmentedRangeTombstoneList>(
480 std::move(merging_iter), *icmp_, true /* for_compaction */,
481 *snapshots_);
482
483 return std::unique_ptr<FragmentedRangeTombstoneIterator>(
484 new FragmentedRangeTombstoneIterator(
485 fragmented_tombstone_list, *icmp_,
486 kMaxSequenceNumber /* upper_bound */));
7c673cae
FG
487}
488
f67539c2 489} // namespace ROCKSDB_NAMESPACE