]>
Commit | Line | Data |
---|---|---|
494da23a TL |
1 | // Copyright (c) 2018-present, Facebook, Inc. All rights reserved. |
2 | // This source code is licensed under both the GPLv2 (found in the | |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
5 | ||
6 | #pragma once | |
7 | ||
#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <list>
#include <map>
#include <memory>
#include <mutex>
#include <set>
#include <string>
#include <vector>

#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"
#include "rocksdb/status.h"
#include "table/internal_iterator.h"
18 | ||
f67539c2 | 19 | namespace ROCKSDB_NAMESPACE { |
1e59de90 TL |
20 | struct FragmentedRangeTombstoneList; |
21 | ||
// Holds a lazily-constructed FragmentedRangeTombstoneList so the (potentially
// expensive) fragmentation work is performed at most once and its result is
// shared by all readers.
struct FragmentedRangeTombstoneListCache {
  // Ensures only the first reader needs to initialize `tombstones`; later
  // readers serialize here only if they raced past the `initialized` check.
  std::mutex reader_mutex;
  // The cached list; null until the first reader builds it.
  std::unique_ptr<FragmentedRangeTombstoneList> tombstones = nullptr;
  // Readers check this flag first to avoid taking `reader_mutex` once
  // initialization has completed (double-checked locking pattern).
  std::atomic<bool> initialized = false;
};
494da23a TL |
29 | |
// An immutable collection of range tombstones that have been "fragmented"
// into non-overlapping pieces. Each piece records the full stack of sequence
// numbers (and timestamps, if user-defined timestamps are enabled) covering
// its key range.
struct FragmentedRangeTombstoneList {
 public:
  // A compact representation of a "stack" of range tombstone fragments, which
  // start and end at the same user keys but have different sequence numbers.
  // The members seq_start_idx and seq_end_idx are intended to be parameters to
  // seq_iter().
  // If user-defined timestamp is enabled, `start` and `end` should be user
  // keys with timestamp, and the timestamps are set to max timestamp to be
  // returned by parsed_start_key()/parsed_end_key(). seq_start_idx and
  // seq_end_idx will also be used as parameters to ts_iter().
  struct RangeTombstoneStack {
    RangeTombstoneStack(const Slice& start, const Slice& end, size_t start_idx,
                        size_t end_idx)
        : start_key(start),
          end_key(end),
          seq_start_idx(start_idx),
          seq_end_idx(end_idx) {}

    Slice start_key;
    Slice end_key;
    // Half-open index range [seq_start_idx, seq_end_idx) into
    // tombstone_seqs_ (and tombstone_timestamps_) for this fragment;
    // seq_iter(seq_end_idx) acts as a past-the-end sentinel.
    size_t seq_start_idx;
    size_t seq_end_idx;
  };

  // Assumes unfragmented_tombstones->key() and unfragmented_tombstones->value()
  // both contain timestamp if enabled.
  FragmentedRangeTombstoneList(
      std::unique_ptr<InternalIterator> unfragmented_tombstones,
      const InternalKeyComparator& icmp, bool for_compaction = false,
      const std::vector<SequenceNumber>& snapshots = {});

  // Iteration over the fragments.
  std::vector<RangeTombstoneStack>::const_iterator begin() const {
    return tombstones_.begin();
  }

  std::vector<RangeTombstoneStack>::const_iterator end() const {
    return tombstones_.end();
  }

  // Iterator into the flat sequence-number array at index `idx`; typically
  // called with a RangeTombstoneStack's seq_start_idx / seq_end_idx.
  std::vector<SequenceNumber>::const_iterator seq_iter(size_t idx) const {
    return std::next(tombstone_seqs_.begin(), idx);
  }

  // Iterator into the flat timestamp array at index `idx`; timestamps are
  // stored in the same order as the sequence numbers.
  std::vector<Slice>::const_iterator ts_iter(size_t idx) const {
    return std::next(tombstone_timestamps_.begin(), idx);
  }

  std::vector<SequenceNumber>::const_iterator seq_begin() const {
    return tombstone_seqs_.begin();
  }

  std::vector<SequenceNumber>::const_iterator seq_end() const {
    return tombstone_seqs_.end();
  }

  bool empty() const { return tombstones_.empty(); }

  // Returns true if the stored tombstones contain one with a sequence
  // number in [lower, upper].
  // This method is not const as it internally lazily initializes a set of
  // sequence numbers (`seq_set_`).
  bool ContainsRange(SequenceNumber lower, SequenceNumber upper);

  // Number of input (pre-fragmentation) tombstones.
  uint64_t num_unfragmented_tombstones() const {
    return num_unfragmented_tombstones_;
  }

  uint64_t total_tombstone_payload_bytes() const {
    return total_tombstone_payload_bytes_;
  }

 private:
  // Given an ordered range tombstone iterator unfragmented_tombstones,
  // "fragment" the tombstones into non-overlapping pieces. Each
  // "non-overlapping piece" is a RangeTombstoneStack in tombstones_, which
  // contains start_key, end_key, and indices that point to sequence numbers
  // (in tombstone_seqs_) and timestamps (in tombstone_timestamps_). If
  // for_compaction is true, then `snapshots` should be provided. Range
  // tombstone fragments are dropped if they are not visible in any snapshot
  // and user-defined timestamp is not enabled. That is, for each snapshot
  // stripe [lower, upper], the range tombstone fragment with largest seqno in
  // [lower, upper] is preserved, and all the other range tombstones are
  // dropped.
  void FragmentTombstones(
      std::unique_ptr<InternalIterator> unfragmented_tombstones,
      const InternalKeyComparator& icmp, bool for_compaction,
      const std::vector<SequenceNumber>& snapshots);

  // The non-overlapping fragments; each indexes into the two flat arrays
  // below via [seq_start_idx, seq_end_idx).
  std::vector<RangeTombstoneStack> tombstones_;
  std::vector<SequenceNumber> tombstone_seqs_;
  std::vector<Slice> tombstone_timestamps_;
  // Guards one-time construction of seq_set_ (used by ContainsRange()).
  std::once_flag seq_set_init_once_flag_;
  std::set<SequenceNumber> seq_set_;
  // NOTE(review): appears to own copied key/timestamp bytes referenced by the
  // Slice members above (std::list nodes give stable addresses) — confirm
  // against the FragmentTombstones() implementation.
  std::list<std::string> pinned_slices_;
  PinnedIteratorsManager pinned_iters_mgr_;
  uint64_t num_unfragmented_tombstones_;
  uint64_t total_tombstone_payload_bytes_;
};
125 | ||
// FragmentedRangeTombstoneIterator converts an InternalIterator of a range-del
// meta block into an iterator over non-overlapping tombstone fragments. The
// tombstone fragmentation process should be more efficient than the range
// tombstone collapsing algorithm in RangeDelAggregator because this leverages
// the internal key ordering already provided by the input iterator, if
// applicable (when the iterator is unsorted, a new sorted iterator is created
// before proceeding). If there are few overlaps, creating a
// FragmentedRangeTombstoneIterator should be O(n), while the
// RangeDelAggregator tombstone collapsing is always O(n log n).
class FragmentedRangeTombstoneIterator : public InternalIterator {
 public:
  // The three constructors differ only in how the underlying tombstone list
  // is owned: a caller-owned raw pointer, a shared list, or a shared cache
  // entry. upper_bound/lower_bound and ts_upper_bound restrict which
  // tombstones are visible through this iterator.
  FragmentedRangeTombstoneIterator(FragmentedRangeTombstoneList* tombstones,
                                   const InternalKeyComparator& icmp,
                                   SequenceNumber upper_bound,
                                   const Slice* ts_upper_bound = nullptr,
                                   SequenceNumber lower_bound = 0);
  FragmentedRangeTombstoneIterator(
      const std::shared_ptr<FragmentedRangeTombstoneList>& tombstones,
      const InternalKeyComparator& icmp, SequenceNumber upper_bound,
      const Slice* ts_upper_bound = nullptr, SequenceNumber lower_bound = 0);
  FragmentedRangeTombstoneIterator(
      const std::shared_ptr<FragmentedRangeTombstoneListCache>& tombstones,
      const InternalKeyComparator& icmp, SequenceNumber upper_bound,
      const Slice* ts_upper_bound = nullptr, SequenceNumber lower_bound = 0);

  void SeekToFirst() override;
  void SeekToLast() override;

  // "Top" variants position at the fragment level, i.e. at the
  // highest-visible-seqno entry of the first/last fragment.
  void SeekToTopFirst();
  void SeekToTopLast();

  // NOTE: Seek and SeekForPrev do not behave in the way InternalIterator
  // seeking should behave. This is OK because they are not currently used, but
  // eventually FragmentedRangeTombstoneIterator should no longer implement
  // InternalIterator.
  //
  // Seeks to the range tombstone that covers target at a seqnum in the
  // snapshot. If no such tombstone exists, seek to the earliest tombstone in
  // the snapshot that ends after target.
  void Seek(const Slice& target) override;
  // Seeks to the range tombstone that covers target at a seqnum in the
  // snapshot. If no such tombstone exists, seek to the latest tombstone in the
  // snapshot that starts before target.
  void SeekForPrev(const Slice& target) override;

  void Next() override;
  void Prev() override;

  // Fragment-level advancement (skip the rest of the current seqno stack).
  void TopNext();
  void TopPrev();

  bool Valid() const override;
  // Note that key() and value() do not return correct timestamp.
  // Caller should call timestamp() to get the current timestamp.
  Slice key() const override {
    MaybePinKey();
    return current_start_key_.Encode();
  }
  Slice value() const override { return pos_->end_key; }
  bool IsKeyPinned() const override { return false; }
  bool IsValuePinned() const override { return true; }
  Status status() const override { return Status::OK(); }

  bool empty() const { return tombstones_->empty(); }
  // Moves all positions past-the-end so that Valid() is false.
  void Invalidate() {
    pos_ = tombstones_->end();
    seq_pos_ = tombstones_->seq_end();
    pinned_pos_ = tombstones_->end();
    pinned_seq_pos_ = tombstones_->seq_end();
  }

  // Materializes the current position as a RangeTombstone, including the
  // timestamp when user-defined timestamps are enabled.
  RangeTombstone Tombstone() const {
    assert(Valid());
    if (icmp_->user_comparator()->timestamp_size()) {
      return RangeTombstone(start_key(), end_key(), seq(), timestamp());
    }
    return RangeTombstone(start_key(), end_key(), seq());
  }
  // Note that start_key() and end_key() are not guaranteed to have the
  // correct timestamp. User can call timestamp() to get the correct
  // timestamp().
  Slice start_key() const { return pos_->start_key; }
  Slice end_key() const { return pos_->end_key; }
  SequenceNumber seq() const { return *seq_pos_; }
  Slice timestamp() const {
    // seqno and timestamp are stored in the same order.
    return *tombstones_->ts_iter(seq_pos_ - tombstones_->seq_begin());
  }
  // Current use case is by CompactionRangeDelAggregator to set
  // full_history_ts_low_.
  void SetTimestampUpperBound(const Slice* ts_upper_bound) {
    ts_upper_bound_ = ts_upper_bound;
  }

  // Internal-key views of the current fragment's bounds. kMaxSequenceNumber
  // makes these sort before any point key with the same user key.
  ParsedInternalKey parsed_start_key() const {
    return ParsedInternalKey(pos_->start_key, kMaxSequenceNumber,
                             kTypeRangeDeletion);
  }
  ParsedInternalKey parsed_end_key() const {
    return ParsedInternalKey(pos_->end_key, kMaxSequenceNumber,
                             kTypeRangeDeletion);
  }

  // Return the max sequence number of a range tombstone that covers
  // the given user key.
  // If there is no covering tombstone, then 0 is returned.
  SequenceNumber MaxCoveringTombstoneSeqnum(const Slice& user_key);

  // Splits the iterator into n+1 iterators (where n is the number of
  // snapshots), each providing a view over a "stripe" of sequence numbers. The
  // iterators are keyed by the upper bound of their ranges (the provided
  // snapshots + kMaxSequenceNumber).
  //
  // NOTE: the iterators in the returned map are no longer valid if their
  // parent iterator is deleted, since they do not modify the refcount of the
  // underlying tombstone list. Therefore, this map should be deleted before
  // the parent iterator.
  std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
  SplitBySnapshot(const std::vector<SequenceNumber>& snapshots);

  SequenceNumber upper_bound() const { return upper_bound_; }
  SequenceNumber lower_bound() const { return lower_bound_; }

  uint64_t num_unfragmented_tombstones() const {
    return tombstones_->num_unfragmented_tombstones();
  }
  uint64_t total_tombstone_payload_bytes() const {
    return tombstones_->total_tombstone_payload_bytes();
  }

 private:
  using RangeTombstoneStack = FragmentedRangeTombstoneList::RangeTombstoneStack;

  // Orders fragments by start key (user key, timestamp ignored); the mixed
  // overloads support binary search with a bare Slice as the probe.
  struct RangeTombstoneStackStartComparator {
    explicit RangeTombstoneStackStartComparator(const Comparator* c) : cmp(c) {}

    bool operator()(const RangeTombstoneStack& a,
                    const RangeTombstoneStack& b) const {
      return cmp->CompareWithoutTimestamp(a.start_key, b.start_key) < 0;
    }

    bool operator()(const RangeTombstoneStack& a, const Slice& b) const {
      return cmp->CompareWithoutTimestamp(a.start_key, b) < 0;
    }

    bool operator()(const Slice& a, const RangeTombstoneStack& b) const {
      return cmp->CompareWithoutTimestamp(a, b.start_key) < 0;
    }

    const Comparator* cmp;
  };

  // Same as above, but ordering by end key.
  struct RangeTombstoneStackEndComparator {
    explicit RangeTombstoneStackEndComparator(const Comparator* c) : cmp(c) {}

    bool operator()(const RangeTombstoneStack& a,
                    const RangeTombstoneStack& b) const {
      return cmp->CompareWithoutTimestamp(a.end_key, b.end_key) < 0;
    }

    bool operator()(const RangeTombstoneStack& a, const Slice& b) const {
      return cmp->CompareWithoutTimestamp(a.end_key, b) < 0;
    }

    bool operator()(const Slice& a, const RangeTombstoneStack& b) const {
      return cmp->CompareWithoutTimestamp(a, b.end_key) < 0;
    }

    const Comparator* cmp;
  };

  // Re-encodes (start_key, *seq_pos_) into current_start_key_, but only when
  // the cached encoding (tracked by pinned_pos_/pinned_seq_pos_) is stale.
  // const + mutable members make this callable from key().
  void MaybePinKey() const {
    if (pos_ != tombstones_->end() && seq_pos_ != tombstones_->seq_end() &&
        (pinned_pos_ != pos_ || pinned_seq_pos_ != seq_pos_)) {
      current_start_key_.Set(pos_->start_key, *seq_pos_, kTypeRangeDeletion);
      pinned_pos_ = pos_;
      pinned_seq_pos_ = seq_pos_;
    }
  }

  void SeekToCoveringTombstone(const Slice& key);
  void SeekForPrevToCoveringTombstone(const Slice& key);
  void ScanForwardToVisibleTombstone();
  void ScanBackwardToVisibleTombstone();
  // True if positioned on a fragment AND seq_pos_ has not run past the
  // current fragment's seqno range [seq_start_idx, seq_end_idx).
  bool ValidPos() const {
    return Valid() && seq_pos_ != tombstones_->seq_iter(pos_->seq_end_idx);
  }

  const RangeTombstoneStackStartComparator tombstone_start_cmp_;
  const RangeTombstoneStackEndComparator tombstone_end_cmp_;
  const InternalKeyComparator* icmp_;
  // User-key comparator; used for timestamp comparisons.
  const Comparator* ucmp_;
  // Shared-ownership handles (set by the corresponding constructor) that keep
  // the underlying list alive; tombstones_ is the list actually iterated.
  std::shared_ptr<FragmentedRangeTombstoneList> tombstones_ref_;
  std::shared_ptr<FragmentedRangeTombstoneListCache> tombstones_cache_ref_;
  FragmentedRangeTombstoneList* tombstones_;
  SequenceNumber upper_bound_;
  SequenceNumber lower_bound_;
  // Only consider timestamps <= ts_upper_bound_.
  const Slice* ts_upper_bound_;
  // Current fragment and current seqno entry within that fragment.
  std::vector<RangeTombstoneStack>::const_iterator pos_;
  std::vector<SequenceNumber>::const_iterator seq_pos_;
  // Position last encoded into current_start_key_ by MaybePinKey(); mutable
  // so the memoization works from const accessors.
  mutable std::vector<RangeTombstoneStack>::const_iterator pinned_pos_;
  mutable std::vector<SequenceNumber>::const_iterator pinned_seq_pos_;
  mutable InternalKey current_start_key_;

  // Check the current RangeTombstoneStack `pos_` against timestamp
  // upper bound `ts_upper_bound_` and sequence number upper bound
  // `upper_bound_`. Update the sequence number (and timestamp) pointer
  // `seq_pos_` to the first valid position satisfying both bounds.
  void SetMaxVisibleSeqAndTimestamp() {
    seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
                                tombstones_->seq_iter(pos_->seq_end_idx),
                                upper_bound_, std::greater<SequenceNumber>());
    if (ts_upper_bound_ && !ts_upper_bound_->empty()) {
      auto ts_pos = std::lower_bound(
          tombstones_->ts_iter(pos_->seq_start_idx),
          tombstones_->ts_iter(pos_->seq_end_idx), *ts_upper_bound_,
          [this](const Slice& s1, const Slice& s2) {
            return ucmp_->CompareTimestamp(s1, s2) > 0;
          });
      auto ts_idx = ts_pos - tombstones_->ts_iter(pos_->seq_start_idx);
      auto seq_idx = seq_pos_ - tombstones_->seq_iter(pos_->seq_start_idx);
      if (seq_idx < ts_idx) {
        // seq and ts are ordered in non-increasing order. Only updates seq_pos_
        // to a larger index for smaller sequence number and timestamp.
        seq_pos_ = tombstones_->seq_iter(pos_->seq_start_idx + ts_idx);
      }
    }
  }
};
356 | ||
f67539c2 | 357 | } // namespace ROCKSDB_NAMESPACE |