]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/db/range_del_aggregator.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / db / range_del_aggregator.cc
CommitLineData
494da23a 1// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5
6#include "db/range_del_aggregator.h"
7
f67539c2 8#include "db/compaction/compaction_iteration_stats.h"
494da23a
TL
9#include "db/dbformat.h"
10#include "db/pinned_iterators_manager.h"
11#include "db/range_del_aggregator.h"
12#include "db/range_tombstone_fragmenter.h"
13#include "db/version_edit.h"
f67539c2
TL
14#include "rocksdb/comparator.h"
15#include "rocksdb/types.h"
494da23a
TL
16#include "table/internal_iterator.h"
17#include "table/scoped_arena_iterator.h"
18#include "table/table_builder.h"
19#include "util/heap.h"
20#include "util/kv_map.h"
21#include "util/vector_iterator.h"
7c673cae 22
f67539c2 23namespace ROCKSDB_NAMESPACE {
7c673cae 24
494da23a
TL
25TruncatedRangeDelIterator::TruncatedRangeDelIterator(
26 std::unique_ptr<FragmentedRangeTombstoneIterator> iter,
27 const InternalKeyComparator* icmp, const InternalKey* smallest,
28 const InternalKey* largest)
29 : iter_(std::move(iter)),
30 icmp_(icmp),
31 smallest_ikey_(smallest),
32 largest_ikey_(largest) {
33 if (smallest != nullptr) {
34 pinned_bounds_.emplace_back();
35 auto& parsed_smallest = pinned_bounds_.back();
20effc67
TL
36 Status pik_status = ParseInternalKey(smallest->Encode(), &parsed_smallest,
37 false /* log_err_key */); // TODO
38 pik_status.PermitUncheckedError();
39 assert(pik_status.ok());
494da23a 40 smallest_ = &parsed_smallest;
11fdf7f2 41 }
494da23a
TL
42 if (largest != nullptr) {
43 pinned_bounds_.emplace_back();
44 auto& parsed_largest = pinned_bounds_.back();
20effc67
TL
45
46 Status pik_status = ParseInternalKey(largest->Encode(), &parsed_largest,
47 false /* log_err_key */); // TODO
48 pik_status.PermitUncheckedError();
49 assert(pik_status.ok());
50
494da23a
TL
51 if (parsed_largest.type == kTypeRangeDeletion &&
52 parsed_largest.sequence == kMaxSequenceNumber) {
53 // The file boundary has been artificially extended by a range tombstone.
54 // We do not need to adjust largest to properly truncate range
55 // tombstones that extend past the boundary.
56 } else if (parsed_largest.sequence == 0) {
57 // The largest key in the sstable has a sequence number of 0. Since we
58 // guarantee that no internal keys with the same user key and sequence
59 // number can exist in a DB, we know that the largest key in this sstable
60 // cannot exist as the smallest key in the next sstable. This further
61 // implies that no range tombstone in this sstable covers largest;
62 // otherwise, the file boundary would have been artificially extended.
63 //
64 // Therefore, we will never truncate a range tombstone at largest, so we
65 // can leave it unchanged.
66 } else {
67 // The same user key may straddle two sstable boundaries. To ensure that
68 // the truncated end key can cover the largest key in this sstable, reduce
69 // its sequence number by 1.
70 parsed_largest.sequence -= 1;
1e59de90
TL
71 // This line is not needed for correctness, but it ensures that the
72 // truncated end key is not covering keys from the next SST file.
73 parsed_largest.type = kValueTypeForSeek;
11fdf7f2 74 }
494da23a 75 largest_ = &parsed_largest;
11fdf7f2 76 }
494da23a 77}
11fdf7f2 78
494da23a 79bool TruncatedRangeDelIterator::Valid() const {
1e59de90 80 assert(iter_ != nullptr);
494da23a
TL
81 return iter_->Valid() &&
82 (smallest_ == nullptr ||
83 icmp_->Compare(*smallest_, iter_->parsed_end_key()) < 0) &&
84 (largest_ == nullptr ||
85 icmp_->Compare(iter_->parsed_start_key(), *largest_) < 0);
86}
11fdf7f2 87
1e59de90 88// NOTE: target is a user key, with timestamp if enabled.
494da23a
TL
89void TruncatedRangeDelIterator::Seek(const Slice& target) {
90 if (largest_ != nullptr &&
91 icmp_->Compare(*largest_, ParsedInternalKey(target, kMaxSequenceNumber,
92 kTypeRangeDeletion)) <= 0) {
93 iter_->Invalidate();
94 return;
11fdf7f2 95 }
494da23a
TL
96 if (smallest_ != nullptr &&
97 icmp_->user_comparator()->Compare(target, smallest_->user_key) < 0) {
98 iter_->Seek(smallest_->user_key);
99 return;
11fdf7f2 100 }
494da23a
TL
101 iter_->Seek(target);
102}
11fdf7f2 103
1e59de90 104// NOTE: target is a user key, with timestamp if enabled.
494da23a
TL
105void TruncatedRangeDelIterator::SeekForPrev(const Slice& target) {
106 if (smallest_ != nullptr &&
107 icmp_->Compare(ParsedInternalKey(target, 0, kTypeRangeDeletion),
108 *smallest_) < 0) {
109 iter_->Invalidate();
110 return;
11fdf7f2 111 }
494da23a
TL
112 if (largest_ != nullptr &&
113 icmp_->user_comparator()->Compare(largest_->user_key, target) < 0) {
114 iter_->SeekForPrev(largest_->user_key);
115 return;
11fdf7f2 116 }
494da23a
TL
117 iter_->SeekForPrev(target);
118}
11fdf7f2 119
494da23a
TL
120void TruncatedRangeDelIterator::SeekToFirst() {
121 if (smallest_ != nullptr) {
122 iter_->Seek(smallest_->user_key);
123 return;
124 }
125 iter_->SeekToTopFirst();
126}
11fdf7f2 127
494da23a
TL
128void TruncatedRangeDelIterator::SeekToLast() {
129 if (largest_ != nullptr) {
130 iter_->SeekForPrev(largest_->user_key);
131 return;
11fdf7f2 132 }
494da23a
TL
133 iter_->SeekToTopLast();
134}
11fdf7f2 135
494da23a
TL
136std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
137TruncatedRangeDelIterator::SplitBySnapshot(
138 const std::vector<SequenceNumber>& snapshots) {
139 using FragmentedIterPair =
140 std::pair<const SequenceNumber,
141 std::unique_ptr<FragmentedRangeTombstoneIterator>>;
142
143 auto split_untruncated_iters = iter_->SplitBySnapshot(snapshots);
144 std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
145 split_truncated_iters;
146 std::for_each(
147 split_untruncated_iters.begin(), split_untruncated_iters.end(),
148 [&](FragmentedIterPair& iter_pair) {
1e59de90
TL
149 auto truncated_iter = std::make_unique<TruncatedRangeDelIterator>(
150 std::move(iter_pair.second), icmp_, smallest_ikey_, largest_ikey_);
494da23a
TL
151 split_truncated_iters.emplace(iter_pair.first,
152 std::move(truncated_iter));
153 });
154 return split_truncated_iters;
155}
11fdf7f2 156
494da23a
TL
157ForwardRangeDelIterator::ForwardRangeDelIterator(
158 const InternalKeyComparator* icmp)
159 : icmp_(icmp),
160 unused_idx_(0),
161 active_seqnums_(SeqMaxComparator()),
162 active_iters_(EndKeyMinComparator(icmp)),
163 inactive_iters_(StartKeyMinComparator(icmp)) {}
164
165bool ForwardRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) {
166 // Move active iterators that end before parsed.
167 while (!active_iters_.empty() &&
168 icmp_->Compare((*active_iters_.top())->end_key(), parsed) <= 0) {
169 TruncatedRangeDelIterator* iter = PopActiveIter();
170 do {
171 iter->Next();
172 } while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0);
173 PushIter(iter, parsed);
174 assert(active_iters_.size() == active_seqnums_.size());
175 }
11fdf7f2 176
494da23a
TL
177 // Move inactive iterators that start before parsed.
178 while (!inactive_iters_.empty() &&
179 icmp_->Compare(inactive_iters_.top()->start_key(), parsed) <= 0) {
180 TruncatedRangeDelIterator* iter = PopInactiveIter();
181 while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0) {
182 iter->Next();
183 }
184 PushIter(iter, parsed);
185 assert(active_iters_.size() == active_seqnums_.size());
11fdf7f2 186 }
11fdf7f2 187
494da23a
TL
188 return active_seqnums_.empty()
189 ? false
190 : (*active_seqnums_.begin())->seq() > parsed.sequence;
7c673cae
FG
191}
192
494da23a
TL
193void ForwardRangeDelIterator::Invalidate() {
194 unused_idx_ = 0;
195 active_iters_.clear();
196 active_seqnums_.clear();
197 inactive_iters_.clear();
7c673cae
FG
198}
199
494da23a
TL
200ReverseRangeDelIterator::ReverseRangeDelIterator(
201 const InternalKeyComparator* icmp)
202 : icmp_(icmp),
203 unused_idx_(0),
204 active_seqnums_(SeqMaxComparator()),
205 active_iters_(StartKeyMaxComparator(icmp)),
206 inactive_iters_(EndKeyMaxComparator(icmp)) {}
207
208bool ReverseRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) {
209 // Move active iterators that start after parsed.
210 while (!active_iters_.empty() &&
211 icmp_->Compare(parsed, (*active_iters_.top())->start_key()) < 0) {
212 TruncatedRangeDelIterator* iter = PopActiveIter();
213 do {
214 iter->Prev();
215 } while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0);
216 PushIter(iter, parsed);
217 assert(active_iters_.size() == active_seqnums_.size());
7c673cae 218 }
11fdf7f2 219
494da23a
TL
220 // Move inactive iterators that end after parsed.
221 while (!inactive_iters_.empty() &&
222 icmp_->Compare(parsed, inactive_iters_.top()->end_key()) < 0) {
223 TruncatedRangeDelIterator* iter = PopInactiveIter();
224 while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0) {
225 iter->Prev();
226 }
227 PushIter(iter, parsed);
228 assert(active_iters_.size() == active_seqnums_.size());
7c673cae 229 }
494da23a
TL
230
231 return active_seqnums_.empty()
232 ? false
233 : (*active_seqnums_.begin())->seq() > parsed.sequence;
7c673cae
FG
234}
235
494da23a
TL
236void ReverseRangeDelIterator::Invalidate() {
237 unused_idx_ = 0;
238 active_iters_.clear();
239 active_seqnums_.clear();
240 inactive_iters_.clear();
7c673cae
FG
241}
242
494da23a
TL
243bool RangeDelAggregator::StripeRep::ShouldDelete(
244 const ParsedInternalKey& parsed, RangeDelPositioningMode mode) {
245 if (!InStripe(parsed.sequence) || IsEmpty()) {
7c673cae
FG
246 return false;
247 }
494da23a
TL
248 switch (mode) {
249 case RangeDelPositioningMode::kForwardTraversal:
250 InvalidateReverseIter();
251
252 // Pick up previously unseen iterators.
253 for (auto it = std::next(iters_.begin(), forward_iter_.UnusedIdx());
254 it != iters_.end(); ++it, forward_iter_.IncUnusedIdx()) {
255 auto& iter = *it;
256 forward_iter_.AddNewIter(iter.get(), parsed);
257 }
258
259 return forward_iter_.ShouldDelete(parsed);
260 case RangeDelPositioningMode::kBackwardTraversal:
261 InvalidateForwardIter();
262
263 // Pick up previously unseen iterators.
264 for (auto it = std::next(iters_.begin(), reverse_iter_.UnusedIdx());
265 it != iters_.end(); ++it, reverse_iter_.IncUnusedIdx()) {
266 auto& iter = *it;
267 reverse_iter_.AddNewIter(iter.get(), parsed);
268 }
269
270 return reverse_iter_.ShouldDelete(parsed);
271 default:
272 assert(false);
273 return false;
7c673cae 274 }
7c673cae
FG
275}
276
494da23a
TL
277bool RangeDelAggregator::StripeRep::IsRangeOverlapped(const Slice& start,
278 const Slice& end) {
279 Invalidate();
280
281 // Set the internal start/end keys so that:
282 // - if start_ikey has the same user key and sequence number as the
283 // current end key, start_ikey will be considered greater; and
284 // - if end_ikey has the same user key and sequence number as the current
285 // start key, end_ikey will be considered greater.
286 ParsedInternalKey start_ikey(start, kMaxSequenceNumber,
287 static_cast<ValueType>(0));
288 ParsedInternalKey end_ikey(end, 0, static_cast<ValueType>(0));
289 for (auto& iter : iters_) {
290 bool checked_candidate_tombstones = false;
291 for (iter->SeekForPrev(start);
292 iter->Valid() && icmp_->Compare(iter->start_key(), end_ikey) <= 0;
293 iter->Next()) {
294 checked_candidate_tombstones = true;
295 if (icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
296 icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
297 return true;
11fdf7f2
TL
298 }
299 }
494da23a
TL
300
301 if (!checked_candidate_tombstones) {
302 // Do an additional check for when the end of the range is the begin
303 // key of a tombstone, which we missed earlier since SeekForPrev'ing
304 // to the start was invalid.
305 iter->SeekForPrev(end);
306 if (iter->Valid() && icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
307 icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
308 return true;
11fdf7f2
TL
309 }
310 }
7c673cae 311 }
494da23a 312 return false;
7c673cae
FG
313}
314
494da23a
TL
315void ReadRangeDelAggregator::AddTombstones(
316 std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
317 const InternalKey* smallest, const InternalKey* largest) {
318 if (input_iter == nullptr || input_iter->empty()) {
7c673cae
FG
319 return;
320 }
1e59de90
TL
321 rep_.AddTombstones(std::make_unique<TruncatedRangeDelIterator>(
322 std::move(input_iter), icmp_, smallest, largest));
7c673cae
FG
323}
324
f67539c2
TL
325bool ReadRangeDelAggregator::ShouldDeleteImpl(const ParsedInternalKey& parsed,
326 RangeDelPositioningMode mode) {
494da23a 327 return rep_.ShouldDelete(parsed, mode);
7c673cae
FG
328}
329
494da23a
TL
330bool ReadRangeDelAggregator::IsRangeOverlapped(const Slice& start,
331 const Slice& end) {
332 InvalidateRangeDelMapPositions();
333 return rep_.IsRangeOverlapped(start, end);
334}
335
336void CompactionRangeDelAggregator::AddTombstones(
337 std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
338 const InternalKey* smallest, const InternalKey* largest) {
339 if (input_iter == nullptr || input_iter->empty()) {
340 return;
7c673cae 341 }
1e59de90
TL
342 // This bounds output of CompactionRangeDelAggregator::NewIterator.
343 if (!trim_ts_.empty()) {
344 assert(icmp_->user_comparator()->timestamp_size() > 0);
345 input_iter->SetTimestampUpperBound(&trim_ts_);
346 }
347
494da23a
TL
348 assert(input_iter->lower_bound() == 0);
349 assert(input_iter->upper_bound() == kMaxSequenceNumber);
350 parent_iters_.emplace_back(new TruncatedRangeDelIterator(
351 std::move(input_iter), icmp_, smallest, largest));
352
1e59de90
TL
353 Slice* ts_upper_bound = nullptr;
354 if (!ts_upper_bound_.empty()) {
355 assert(icmp_->user_comparator()->timestamp_size() > 0);
356 ts_upper_bound = &ts_upper_bound_;
357 }
494da23a
TL
358 auto split_iters = parent_iters_.back()->SplitBySnapshot(*snapshots_);
359 for (auto& split_iter : split_iters) {
360 auto it = reps_.find(split_iter.first);
361 if (it == reps_.end()) {
362 bool inserted;
363 SequenceNumber upper_bound = split_iter.second->upper_bound();
364 SequenceNumber lower_bound = split_iter.second->lower_bound();
365 std::tie(it, inserted) = reps_.emplace(
366 split_iter.first, StripeRep(icmp_, upper_bound, lower_bound));
367 assert(inserted);
7c673cae 368 }
494da23a 369 assert(it != reps_.end());
1e59de90
TL
370 // ts_upper_bound is used to bound ShouldDelete() to only consider
371 // range tombstones under full_history_ts_low_ and trim_ts_. Keys covered by
372 // range tombstones that are above full_history_ts_low_ should not be
373 // dropped prematurely: user may read with a timestamp between the range
374 // tombstone and the covered key. Note that we cannot set timestamp
375 // upperbound on the original `input_iter` since `input_iter`s are later
376 // used in CompactionRangeDelAggregator::NewIterator to output range
377 // tombstones for persistence. We do not want to only persist range
378 // tombstones with timestamp lower than ts_upper_bound.
379 split_iter.second->SetTimestampUpperBound(ts_upper_bound);
494da23a 380 it->second.AddTombstones(std::move(split_iter.second));
7c673cae 381 }
11fdf7f2 382}
7c673cae 383
494da23a
TL
384bool CompactionRangeDelAggregator::ShouldDelete(const ParsedInternalKey& parsed,
385 RangeDelPositioningMode mode) {
386 auto it = reps_.lower_bound(parsed.sequence);
387 if (it == reps_.end()) {
388 return false;
11fdf7f2 389 }
494da23a 390 return it->second.ShouldDelete(parsed, mode);
11fdf7f2 391}
7c673cae 392
494da23a
TL
393namespace {
394
1e59de90
TL
395// Produce a sorted (by start internal key) stream of range tombstones from
396// `children`. lower_bound and upper_bound on user key can be
397// optionally specified. Range tombstones that ends before lower_bound or starts
398// after upper_bound are excluded.
399// If user-defined timestamp is enabled, lower_bound and upper_bound should
400// contain timestamp, but comparison is done ignoring timestamps.
494da23a 401class TruncatedRangeDelMergingIter : public InternalIterator {
11fdf7f2 402 public:
494da23a
TL
403 TruncatedRangeDelMergingIter(
404 const InternalKeyComparator* icmp, const Slice* lower_bound,
405 const Slice* upper_bound, bool upper_bound_inclusive,
406 const std::vector<std::unique_ptr<TruncatedRangeDelIterator>>& children)
407 : icmp_(icmp),
408 lower_bound_(lower_bound),
409 upper_bound_(upper_bound),
410 upper_bound_inclusive_(upper_bound_inclusive),
1e59de90
TL
411 heap_(StartKeyMinComparator(icmp)),
412 ts_sz_(icmp_->user_comparator()->timestamp_size()) {
494da23a
TL
413 for (auto& child : children) {
414 if (child != nullptr) {
415 assert(child->lower_bound() == 0);
416 assert(child->upper_bound() == kMaxSequenceNumber);
417 children_.push_back(child.get());
418 }
7c673cae 419 }
7c673cae 420 }
7c673cae 421
494da23a
TL
422 bool Valid() const override {
423 return !heap_.empty() && BeforeEndKey(heap_.top());
424 }
425 Status status() const override { return Status::OK(); }
426
427 void SeekToFirst() override {
428 heap_.clear();
429 for (auto& child : children_) {
430 if (lower_bound_ != nullptr) {
431 child->Seek(*lower_bound_);
432 } else {
433 child->SeekToFirst();
434 }
435 if (child->Valid()) {
436 heap_.push(child);
437 }
438 }
439 }
11fdf7f2
TL
440
441 void Next() override {
494da23a
TL
442 auto* top = heap_.top();
443 top->InternalNext();
444 if (top->Valid()) {
445 heap_.replace_top(top);
11fdf7f2
TL
446 } else {
447 heap_.pop();
448 }
7c673cae 449 }
11fdf7f2 450
494da23a
TL
451 Slice key() const override {
452 auto* top = heap_.top();
1e59de90
TL
453 if (ts_sz_) {
454 cur_start_key_.Set(top->start_key().user_key, top->seq(),
455 kTypeRangeDeletion, top->timestamp());
456 } else {
457 cur_start_key_.Set(top->start_key().user_key, top->seq(),
458 kTypeRangeDeletion);
459 }
460 assert(top->start_key().user_key.size() >= ts_sz_);
494da23a 461 return cur_start_key_.Encode();
7c673cae 462 }
11fdf7f2 463
494da23a
TL
464 Slice value() const override {
465 auto* top = heap_.top();
1e59de90
TL
466 if (!ts_sz_) {
467 return top->end_key().user_key;
468 }
469 assert(top->timestamp().size() == ts_sz_);
470 cur_end_key_.clear();
471 cur_end_key_.append(top->end_key().user_key.data(),
472 top->end_key().user_key.size() - ts_sz_);
473 cur_end_key_.append(top->timestamp().data(), ts_sz_);
474 return cur_end_key_;
494da23a
TL
475 }
476
477 // Unused InternalIterator methods
478 void Prev() override { assert(false); }
479 void Seek(const Slice& /* target */) override { assert(false); }
480 void SeekForPrev(const Slice& /* target */) override { assert(false); }
481 void SeekToLast() override { assert(false); }
11fdf7f2
TL
482
483 private:
494da23a
TL
484 bool BeforeEndKey(const TruncatedRangeDelIterator* iter) const {
485 if (upper_bound_ == nullptr) {
486 return true;
11fdf7f2 487 }
1e59de90
TL
488 int cmp = icmp_->user_comparator()->CompareWithoutTimestamp(
489 iter->start_key().user_key, *upper_bound_);
494da23a
TL
490 return upper_bound_inclusive_ ? cmp <= 0 : cmp < 0;
491 }
11fdf7f2 492
494da23a
TL
493 const InternalKeyComparator* icmp_;
494 const Slice* lower_bound_;
495 const Slice* upper_bound_;
496 bool upper_bound_inclusive_;
497 BinaryHeap<TruncatedRangeDelIterator*, StartKeyMinComparator> heap_;
498 std::vector<TruncatedRangeDelIterator*> children_;
11fdf7f2 499
494da23a 500 mutable InternalKey cur_start_key_;
1e59de90
TL
501 mutable std::string cur_end_key_;
502 size_t ts_sz_;
11fdf7f2
TL
503};
504
1e59de90 505} // anonymous namespace
494da23a
TL
506
507std::unique_ptr<FragmentedRangeTombstoneIterator>
508CompactionRangeDelAggregator::NewIterator(const Slice* lower_bound,
509 const Slice* upper_bound,
510 bool upper_bound_inclusive) {
511 InvalidateRangeDelMapPositions();
1e59de90
TL
512 auto merging_iter = std::make_unique<TruncatedRangeDelMergingIter>(
513 icmp_, lower_bound, upper_bound, upper_bound_inclusive, parent_iters_);
494da23a
TL
514
515 auto fragmented_tombstone_list =
516 std::make_shared<FragmentedRangeTombstoneList>(
517 std::move(merging_iter), *icmp_, true /* for_compaction */,
518 *snapshots_);
519
1e59de90
TL
520 return std::make_unique<FragmentedRangeTombstoneIterator>(
521 fragmented_tombstone_list, *icmp_, kMaxSequenceNumber /* upper_bound */);
7c673cae
FG
522}
523
f67539c2 524} // namespace ROCKSDB_NAMESPACE