// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#pragma once

#include <atomic>
#include <list>
#include <map>
#include <memory>
#include <mutex>
#include <set>
#include <string>
#include <vector>

#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"
#include "rocksdb/status.h"
#include "table/internal_iterator.h"

namespace ROCKSDB_NAMESPACE {
struct FragmentedRangeTombstoneList;

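// The intended access pattern is double-checked initialization: readers test
// `initialized` first and only take `reader_mutex` on the slow path. A rough
// sketch (illustrative only, not code from this file; BuildFragmentedList is
// a hypothetical placeholder for however the list is constructed):
//
//   if (!cache->initialized.load(std::memory_order_acquire)) {
//     std::lock_guard<std::mutex> lock(cache->reader_mutex);
//     if (!cache->tombstones) {
//       cache->tombstones = BuildFragmentedList(...);
//       cache->initialized.store(true, std::memory_order_release);
//     }
//   }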
struct FragmentedRangeTombstoneListCache {
  // Ensures that only the first reader needs to initialize `tombstones`.
  std::mutex reader_mutex;
  std::unique_ptr<FragmentedRangeTombstoneList> tombstones = nullptr;
  // Readers check this flag first to avoid locking `reader_mutex` when the
  // list has already been initialized.
  std::atomic<bool> initialized = false;
};

struct FragmentedRangeTombstoneList {
 public:
  // A compact representation of a "stack" of range tombstone fragments, which
  // start and end at the same user keys but have different sequence numbers.
  // The members seq_start_idx and seq_end_idx are intended to be parameters to
  // seq_iter().
  // If user-defined timestamps are enabled, `start` and `end` should be user
  // keys with timestamps, and their timestamps are set to the maximum
  // timestamp, which is what parsed_start_key()/parsed_end_key() return.
  // seq_start_idx and seq_end_idx will also be used as parameters to
  // ts_iter().
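  //
  // For example (illustrative): two overlapping unfragmented tombstones
  // [a, e) at seqno 4 and [c, g) at seqno 8 fragment into three stacks:
  //   [a, c) -> seqnos {4}
  //   [c, e) -> seqnos {8, 4}
  //   [e, g) -> seqnos {8}
  // Each stack's [seq_start_idx, seq_end_idx) range delimits its slice of
  // tombstone_seqs_, with sequence numbers stored in non-increasing order.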
  struct RangeTombstoneStack {
    RangeTombstoneStack(const Slice& start, const Slice& end, size_t start_idx,
                        size_t end_idx)
        : start_key(start),
          end_key(end),
          seq_start_idx(start_idx),
          seq_end_idx(end_idx) {}
    Slice start_key;
    Slice end_key;
    size_t seq_start_idx;
    size_t seq_end_idx;
  };
  // Assumes unfragmented_tombstones->key() and unfragmented_tombstones->value()
  // both contain a timestamp if user-defined timestamps are enabled.
  FragmentedRangeTombstoneList(
      std::unique_ptr<InternalIterator> unfragmented_tombstones,
      const InternalKeyComparator& icmp, bool for_compaction = false,
      const std::vector<SequenceNumber>& snapshots = {});

  std::vector<RangeTombstoneStack>::const_iterator begin() const {
    return tombstones_.begin();
  }

  std::vector<RangeTombstoneStack>::const_iterator end() const {
    return tombstones_.end();
  }

  std::vector<SequenceNumber>::const_iterator seq_iter(size_t idx) const {
    return std::next(tombstone_seqs_.begin(), idx);
  }

  std::vector<Slice>::const_iterator ts_iter(size_t idx) const {
    return std::next(tombstone_timestamps_.begin(), idx);
  }

  std::vector<SequenceNumber>::const_iterator seq_begin() const {
    return tombstone_seqs_.begin();
  }

  std::vector<SequenceNumber>::const_iterator seq_end() const {
    return tombstone_seqs_.end();
  }

  bool empty() const { return tombstones_.empty(); }

  // Returns true if the stored tombstones contain one with a sequence number
  // in [lower, upper].
  // This method is not const as it lazily initializes a set of sequence
  // numbers (`seq_set_`) internally.
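  // For example (illustrative): if the stored fragments carry seqnos
  // {5, 12, 20}, ContainsRange(10, 15) is true (12 is in range) and
  // ContainsRange(13, 18) is false.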
  bool ContainsRange(SequenceNumber lower, SequenceNumber upper);

  uint64_t num_unfragmented_tombstones() const {
    return num_unfragmented_tombstones_;
  }

  uint64_t total_tombstone_payload_bytes() const {
    return total_tombstone_payload_bytes_;
  }

 private:
  // Given an ordered range tombstone iterator unfragmented_tombstones,
  // "fragment" the tombstones into non-overlapping pieces. Each
  // "non-overlapping piece" is a RangeTombstoneStack in tombstones_, which
  // contains start_key, end_key, and indices that point to sequence numbers
  // (in tombstone_seqs_) and timestamps (in tombstone_timestamps_). If
  // for_compaction is true, then `snapshots` should be provided. Range
  // tombstone fragments are dropped if they are not visible in any snapshot
  // and user-defined timestamp is not enabled. That is, for each snapshot
  // stripe [lower, upper], the range tombstone fragment with the largest seqno
  // in [lower, upper] is preserved, and all the other range tombstones are
  // dropped.
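  //
  // For example (illustrative): with snapshots {10, 20}, a fragment carrying
  // seqnos {25, 15, 12, 5} keeps 25, 15 and 5 (the largest seqno in each
  // snapshot stripe) and drops 12.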
  void FragmentTombstones(
      std::unique_ptr<InternalIterator> unfragmented_tombstones,
      const InternalKeyComparator& icmp, bool for_compaction,
      const std::vector<SequenceNumber>& snapshots);

  std::vector<RangeTombstoneStack> tombstones_;
  std::vector<SequenceNumber> tombstone_seqs_;
  std::vector<Slice> tombstone_timestamps_;
  std::once_flag seq_set_init_once_flag_;
  std::set<SequenceNumber> seq_set_;
  std::list<std::string> pinned_slices_;
  PinnedIteratorsManager pinned_iters_mgr_;
  uint64_t num_unfragmented_tombstones_;
  uint64_t total_tombstone_payload_bytes_;
};

// FragmentedRangeTombstoneIterator converts an InternalIterator of a range-del
// meta block into an iterator over non-overlapping tombstone fragments. The
// tombstone fragmentation process should be more efficient than the range
// tombstone collapsing algorithm in RangeDelAggregator because this leverages
// the internal key ordering already provided by the input iterator, if
// applicable (when the iterator is unsorted, a new sorted iterator is created
// before proceeding). If there are few overlaps, creating a
// FragmentedRangeTombstoneIterator should be O(n), while the RangeDelAggregator
// tombstone collapsing is always O(n log n).
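//
// A minimal usage sketch (illustrative; `range_del_iter` and `icmp` are
// assumed to exist already, and error handling is omitted):
//
//   auto fragments = std::make_shared<FragmentedRangeTombstoneList>(
//       std::move(range_del_iter), icmp);
//   FragmentedRangeTombstoneIterator iter(fragments, icmp,
//                                         kMaxSequenceNumber /* upper_bound */);
//   for (iter.SeekToFirst(); iter.Valid(); iter.Next()) {
//     RangeTombstone t = iter.Tombstone();
//   }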
class FragmentedRangeTombstoneIterator : public InternalIterator {
 public:
  FragmentedRangeTombstoneIterator(FragmentedRangeTombstoneList* tombstones,
                                   const InternalKeyComparator& icmp,
                                   SequenceNumber upper_bound,
                                   const Slice* ts_upper_bound = nullptr,
                                   SequenceNumber lower_bound = 0);
  FragmentedRangeTombstoneIterator(
      const std::shared_ptr<FragmentedRangeTombstoneList>& tombstones,
      const InternalKeyComparator& icmp, SequenceNumber upper_bound,
      const Slice* ts_upper_bound = nullptr, SequenceNumber lower_bound = 0);
  FragmentedRangeTombstoneIterator(
      const std::shared_ptr<FragmentedRangeTombstoneListCache>& tombstones,
      const InternalKeyComparator& icmp, SequenceNumber upper_bound,
      const Slice* ts_upper_bound = nullptr, SequenceNumber lower_bound = 0);

  void SeekToFirst() override;
  void SeekToLast() override;

  void SeekToTopFirst();
  void SeekToTopLast();

  // NOTE: Seek and SeekForPrev do not behave in the way InternalIterator
  // seeking should behave. This is OK because they are not currently used, but
  // eventually FragmentedRangeTombstoneIterator should no longer implement
  // InternalIterator.
  //
  // Seeks to the range tombstone that covers target at a seqnum in the
  // snapshot. If no such tombstone exists, seek to the earliest tombstone in
  // the snapshot that ends after target.
  void Seek(const Slice& target) override;
  // Seeks to the range tombstone that covers target at a seqnum in the
  // snapshot. If no such tombstone exists, seek to the latest tombstone in the
  // snapshot that starts before target.
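  // For example (illustrative): with visible fragments [a, c) and [e, g),
  // Seek("d") lands on [e, g) (the earliest fragment ending after "d"), while
  // SeekForPrev("d") lands on [a, c) (the latest fragment starting before
  // "d").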
  void SeekForPrev(const Slice& target) override;

  void Next() override;
  void Prev() override;

  void TopNext();
  void TopPrev();

  bool Valid() const override;
  // Note that key() and value() do not return the correct timestamp.
  // Callers should call timestamp() to get the current timestamp.
  Slice key() const override {
    MaybePinKey();
    return current_start_key_.Encode();
  }
  Slice value() const override { return pos_->end_key; }
  bool IsKeyPinned() const override { return false; }
  bool IsValuePinned() const override { return true; }
  Status status() const override { return Status::OK(); }

  bool empty() const { return tombstones_->empty(); }
  void Invalidate() {
    pos_ = tombstones_->end();
    seq_pos_ = tombstones_->seq_end();
    pinned_pos_ = tombstones_->end();
    pinned_seq_pos_ = tombstones_->seq_end();
  }

  RangeTombstone Tombstone() const {
    assert(Valid());
    if (icmp_->user_comparator()->timestamp_size()) {
      return RangeTombstone(start_key(), end_key(), seq(), timestamp());
    }
    return RangeTombstone(start_key(), end_key(), seq());
  }
  // Note that start_key() and end_key() are not guaranteed to have the
  // correct timestamp. Callers can call timestamp() to get the correct
  // timestamp.
  Slice start_key() const { return pos_->start_key; }
  Slice end_key() const { return pos_->end_key; }
  SequenceNumber seq() const { return *seq_pos_; }
  Slice timestamp() const {
    // seqno and timestamp are stored in the same order.
    return *tombstones_->ts_iter(seq_pos_ - tombstones_->seq_begin());
  }
  // The current use case is CompactionRangeDelAggregator, which uses this to
  // set full_history_ts_low_.
  void SetTimestampUpperBound(const Slice* ts_upper_bound) {
    ts_upper_bound_ = ts_upper_bound;
  }

  ParsedInternalKey parsed_start_key() const {
    return ParsedInternalKey(pos_->start_key, kMaxSequenceNumber,
                             kTypeRangeDeletion);
  }
  ParsedInternalKey parsed_end_key() const {
    return ParsedInternalKey(pos_->end_key, kMaxSequenceNumber,
                             kTypeRangeDeletion);
  }

  // Returns the max sequence number of a range tombstone that covers
  // the given user key.
  // If there is no covering tombstone, then 0 is returned.
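  // For example (illustrative): if the fragment [c, e) carries seqnos {8, 4}
  // and both are visible at this iterator's upper bound, then
  // MaxCoveringTombstoneSeqnum("d") returns 8, while a user key not covered
  // by any fragment returns 0.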
  SequenceNumber MaxCoveringTombstoneSeqnum(const Slice& user_key);

  // Splits the iterator into n+1 iterators (where n is the number of
  // snapshots), each providing a view over a "stripe" of sequence numbers. The
  // iterators are keyed by the upper bound of their ranges (the provided
  // snapshots + kMaxSequenceNumber).
  //
  // NOTE: the iterators in the returned map are no longer valid if their
  // parent iterator is deleted, since they do not modify the refcount of the
  // underlying tombstone list. Therefore, this map should be deleted before
  // the parent iterator.
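  //
  // For example (illustrative): with snapshots {10, 20}, the returned map
  // holds three iterators keyed by 10, 20 and kMaxSequenceNumber, covering
  // the seqno stripes up to 10, (10, 20] and above 20 respectively.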
  std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
  SplitBySnapshot(const std::vector<SequenceNumber>& snapshots);

  SequenceNumber upper_bound() const { return upper_bound_; }
  SequenceNumber lower_bound() const { return lower_bound_; }

  uint64_t num_unfragmented_tombstones() const {
    return tombstones_->num_unfragmented_tombstones();
  }
  uint64_t total_tombstone_payload_bytes() const {
    return tombstones_->total_tombstone_payload_bytes();
  }

 private:
  using RangeTombstoneStack = FragmentedRangeTombstoneList::RangeTombstoneStack;

  struct RangeTombstoneStackStartComparator {
    explicit RangeTombstoneStackStartComparator(const Comparator* c) : cmp(c) {}

    bool operator()(const RangeTombstoneStack& a,
                    const RangeTombstoneStack& b) const {
      return cmp->CompareWithoutTimestamp(a.start_key, b.start_key) < 0;
    }

    bool operator()(const RangeTombstoneStack& a, const Slice& b) const {
      return cmp->CompareWithoutTimestamp(a.start_key, b) < 0;
    }

    bool operator()(const Slice& a, const RangeTombstoneStack& b) const {
      return cmp->CompareWithoutTimestamp(a, b.start_key) < 0;
    }

    const Comparator* cmp;
  };

  struct RangeTombstoneStackEndComparator {
    explicit RangeTombstoneStackEndComparator(const Comparator* c) : cmp(c) {}

    bool operator()(const RangeTombstoneStack& a,
                    const RangeTombstoneStack& b) const {
      return cmp->CompareWithoutTimestamp(a.end_key, b.end_key) < 0;
    }

    bool operator()(const RangeTombstoneStack& a, const Slice& b) const {
      return cmp->CompareWithoutTimestamp(a.end_key, b) < 0;
    }

    bool operator()(const Slice& a, const RangeTombstoneStack& b) const {
      return cmp->CompareWithoutTimestamp(a, b.end_key) < 0;
    }

    const Comparator* cmp;
  };

  void MaybePinKey() const {
    if (pos_ != tombstones_->end() && seq_pos_ != tombstones_->seq_end() &&
        (pinned_pos_ != pos_ || pinned_seq_pos_ != seq_pos_)) {
      current_start_key_.Set(pos_->start_key, *seq_pos_, kTypeRangeDeletion);
      pinned_pos_ = pos_;
      pinned_seq_pos_ = seq_pos_;
    }
  }

  void SeekToCoveringTombstone(const Slice& key);
  void SeekForPrevToCoveringTombstone(const Slice& key);
  void ScanForwardToVisibleTombstone();
  void ScanBackwardToVisibleTombstone();
  bool ValidPos() const {
    return Valid() && seq_pos_ != tombstones_->seq_iter(pos_->seq_end_idx);
  }

  const RangeTombstoneStackStartComparator tombstone_start_cmp_;
  const RangeTombstoneStackEndComparator tombstone_end_cmp_;
  const InternalKeyComparator* icmp_;
  const Comparator* ucmp_;
  std::shared_ptr<FragmentedRangeTombstoneList> tombstones_ref_;
  std::shared_ptr<FragmentedRangeTombstoneListCache> tombstones_cache_ref_;
  FragmentedRangeTombstoneList* tombstones_;
  SequenceNumber upper_bound_;
  SequenceNumber lower_bound_;
  // Only consider timestamps <= ts_upper_bound_.
  const Slice* ts_upper_bound_;
  std::vector<RangeTombstoneStack>::const_iterator pos_;
  std::vector<SequenceNumber>::const_iterator seq_pos_;
  mutable std::vector<RangeTombstoneStack>::const_iterator pinned_pos_;
  mutable std::vector<SequenceNumber>::const_iterator pinned_seq_pos_;
  mutable InternalKey current_start_key_;

  // Check the current RangeTombstoneStack `pos_` against timestamp
  // upper bound `ts_upper_bound_` and sequence number upper bound
  // `upper_bound_`. Update the sequence number (and timestamp) pointer
  // `seq_pos_` to the first valid position satisfying both bounds.
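  //
  // For example (illustrative): if the stack's seqnos are {50, 30, 10}
  // (stored in non-increasing order) and `upper_bound_` is 35, `seq_pos_`
  // ends up at 30, the largest seqno not exceeding the bound; a timestamp
  // bound can only move `seq_pos_` further toward smaller seqnos.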
  void SetMaxVisibleSeqAndTimestamp() {
    seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
                                tombstones_->seq_iter(pos_->seq_end_idx),
                                upper_bound_, std::greater<SequenceNumber>());
    if (ts_upper_bound_ && !ts_upper_bound_->empty()) {
      auto ts_pos = std::lower_bound(
          tombstones_->ts_iter(pos_->seq_start_idx),
          tombstones_->ts_iter(pos_->seq_end_idx), *ts_upper_bound_,
          [this](const Slice& s1, const Slice& s2) {
            return ucmp_->CompareTimestamp(s1, s2) > 0;
          });
      auto ts_idx = ts_pos - tombstones_->ts_iter(pos_->seq_start_idx);
      auto seq_idx = seq_pos_ - tombstones_->seq_iter(pos_->seq_start_idx);
      if (seq_idx < ts_idx) {
        // seq and ts are ordered in non-increasing order. Only updates seq_pos_
        // to a larger index for smaller sequence number and timestamp.
        seq_pos_ = tombstones_->seq_iter(pos_->seq_start_idx + ts_idx);
      }
    }
  }
};

}  // namespace ROCKSDB_NAMESPACE