#include "table/internal_iterator.h"
namespace ROCKSDB_NAMESPACE {
+struct FragmentedRangeTombstoneList;
+
+struct FragmentedRangeTombstoneListCache {
+ // ensure only the first reader needs to initialize l
+ std::mutex reader_mutex;
+ std::unique_ptr<FragmentedRangeTombstoneList> tombstones = nullptr;
+ // readers will first check this bool to avoid
+ std::atomic<bool> initialized = false;
+};
struct FragmentedRangeTombstoneList {
public:
// start and end at the same user keys but have different sequence numbers.
// The members seq_start_idx and seq_end_idx are intended to be parameters to
// seq_iter().
+ // If user-defined timestamp is enabled, `start` and `end` should be user keys
+ // with timestamp, and the timestamps are set to max timestamp to be returned
+ // by parsed_start_key()/parsed_end_key(). seq_start_idx and seq_end_idx will
+ // also be used as parameters to ts_iter().
struct RangeTombstoneStack {
RangeTombstoneStack(const Slice& start, const Slice& end, size_t start_idx,
size_t end_idx)
end_key(end),
seq_start_idx(start_idx),
seq_end_idx(end_idx) {}
-
Slice start_key;
Slice end_key;
size_t seq_start_idx;
size_t seq_end_idx;
};
+ // Assumes unfragmented_tombstones->key() and unfragmented_tombstones->value()
+ // both contain timestamp if enabled.
FragmentedRangeTombstoneList(
std::unique_ptr<InternalIterator> unfragmented_tombstones,
const InternalKeyComparator& icmp, bool for_compaction = false,
return std::next(tombstone_seqs_.begin(), idx);
}
+ std::vector<Slice>::const_iterator ts_iter(size_t idx) const {
+ return std::next(tombstone_timestamps_.begin(), idx);
+ }
+
std::vector<SequenceNumber>::const_iterator seq_begin() const {
return tombstone_seqs_.begin();
}
// Returns true if the stored tombstones contain with one with a sequence
// number in [lower, upper].
- bool ContainsRange(SequenceNumber lower, SequenceNumber upper) const;
+ // This method is not const as it internally lazy initialize a set of
+ // sequence numbers (`seq_set_`).
+ bool ContainsRange(SequenceNumber lower, SequenceNumber upper);
+
+ uint64_t num_unfragmented_tombstones() const {
+ return num_unfragmented_tombstones_;
+ }
+
+ uint64_t total_tombstone_payload_bytes() const {
+ return total_tombstone_payload_bytes_;
+ }
private:
// Given an ordered range tombstone iterator unfragmented_tombstones,
- // "fragment" the tombstones into non-overlapping pieces, and store them in
- // tombstones_ and tombstone_seqs_.
+ // "fragment" the tombstones into non-overlapping pieces. Each
+ // "non-overlapping piece" is a RangeTombstoneStack in tombstones_, which
+ // contains start_key, end_key, and indices that points to sequence numbers
+ // (in tombstone_seqs_) and timestamps (in tombstone_timestamps_). If
+ // for_compaction is true, then `snapshots` should be provided. Range
+ // tombstone fragments are dropped if they are not visible in any snapshot and
+ // user-defined timestamp is not enabled. That is, for each snapshot stripe
+ // [lower, upper], the range tombstone fragment with largest seqno in [lower,
+ // upper] is preserved, and all the other range tombstones are dropped.
void FragmentTombstones(
std::unique_ptr<InternalIterator> unfragmented_tombstones,
const InternalKeyComparator& icmp, bool for_compaction,
std::vector<RangeTombstoneStack> tombstones_;
std::vector<SequenceNumber> tombstone_seqs_;
+ std::vector<Slice> tombstone_timestamps_;
+ std::once_flag seq_set_init_once_flag_;
std::set<SequenceNumber> seq_set_;
std::list<std::string> pinned_slices_;
PinnedIteratorsManager pinned_iters_mgr_;
+ uint64_t num_unfragmented_tombstones_;
+ uint64_t total_tombstone_payload_bytes_;
};
// FragmentedRangeTombstoneIterator converts an InternalIterator of a range-del
// tombstone collapsing is always O(n log n).
class FragmentedRangeTombstoneIterator : public InternalIterator {
public:
+ FragmentedRangeTombstoneIterator(FragmentedRangeTombstoneList* tombstones,
+ const InternalKeyComparator& icmp,
+ SequenceNumber upper_bound,
+ const Slice* ts_upper_bound = nullptr,
+ SequenceNumber lower_bound = 0);
FragmentedRangeTombstoneIterator(
- const FragmentedRangeTombstoneList* tombstones,
+ const std::shared_ptr<FragmentedRangeTombstoneList>& tombstones,
const InternalKeyComparator& icmp, SequenceNumber upper_bound,
- SequenceNumber lower_bound = 0);
+ const Slice* ts_upper_bound = nullptr, SequenceNumber lower_bound = 0);
FragmentedRangeTombstoneIterator(
- const std::shared_ptr<const FragmentedRangeTombstoneList>& tombstones,
+ const std::shared_ptr<FragmentedRangeTombstoneListCache>& tombstones,
const InternalKeyComparator& icmp, SequenceNumber upper_bound,
- SequenceNumber lower_bound = 0);
+ const Slice* ts_upper_bound = nullptr, SequenceNumber lower_bound = 0);
void SeekToFirst() override;
void SeekToLast() override;
void TopPrev();
bool Valid() const override;
+ // Note that key() and value() do not return correct timestamp.
+ // Caller should call timestamp() to get the current timestamp.
Slice key() const override {
MaybePinKey();
return current_start_key_.Encode();
}
RangeTombstone Tombstone() const {
+ assert(Valid());
+ if (icmp_->user_comparator()->timestamp_size()) {
+ return RangeTombstone(start_key(), end_key(), seq(), timestamp());
+ }
return RangeTombstone(start_key(), end_key(), seq());
}
+ // Note that start_key() and end_key() are not guaranteed to have the
+ // correct timestamp. User can call timestamp() to get the correct
+ // timestamp().
Slice start_key() const { return pos_->start_key; }
Slice end_key() const { return pos_->end_key; }
SequenceNumber seq() const { return *seq_pos_; }
+ Slice timestamp() const {
+ // seqno and timestamp are stored in the same order.
+ return *tombstones_->ts_iter(seq_pos_ - tombstones_->seq_begin());
+ }
+ // Current use case is by CompactionRangeDelAggregator to set
+ // full_history_ts_low_.
+ void SetTimestampUpperBound(const Slice* ts_upper_bound) {
+ ts_upper_bound_ = ts_upper_bound;
+ }
+
ParsedInternalKey parsed_start_key() const {
return ParsedInternalKey(pos_->start_key, kMaxSequenceNumber,
kTypeRangeDeletion);
kTypeRangeDeletion);
}
+ // Return the max sequence number of a range tombstone that covers
+ // the given user key.
+ // If there is no covering tombstone, then 0 is returned.
SequenceNumber MaxCoveringTombstoneSeqnum(const Slice& user_key);
// Splits the iterator into n+1 iterators (where n is the number of
SequenceNumber upper_bound() const { return upper_bound_; }
SequenceNumber lower_bound() const { return lower_bound_; }
+ uint64_t num_unfragmented_tombstones() const {
+ return tombstones_->num_unfragmented_tombstones();
+ }
+ uint64_t total_tombstone_payload_bytes() const {
+ return tombstones_->total_tombstone_payload_bytes();
+ }
+
private:
using RangeTombstoneStack = FragmentedRangeTombstoneList::RangeTombstoneStack;
bool operator()(const RangeTombstoneStack& a,
const RangeTombstoneStack& b) const {
- return cmp->Compare(a.start_key, b.start_key) < 0;
+ return cmp->CompareWithoutTimestamp(a.start_key, b.start_key) < 0;
}
bool operator()(const RangeTombstoneStack& a, const Slice& b) const {
- return cmp->Compare(a.start_key, b) < 0;
+ return cmp->CompareWithoutTimestamp(a.start_key, b) < 0;
}
bool operator()(const Slice& a, const RangeTombstoneStack& b) const {
- return cmp->Compare(a, b.start_key) < 0;
+ return cmp->CompareWithoutTimestamp(a, b.start_key) < 0;
}
const Comparator* cmp;
bool operator()(const RangeTombstoneStack& a,
const RangeTombstoneStack& b) const {
- return cmp->Compare(a.end_key, b.end_key) < 0;
+ return cmp->CompareWithoutTimestamp(a.end_key, b.end_key) < 0;
}
bool operator()(const RangeTombstoneStack& a, const Slice& b) const {
- return cmp->Compare(a.end_key, b) < 0;
+ return cmp->CompareWithoutTimestamp(a.end_key, b) < 0;
}
bool operator()(const Slice& a, const RangeTombstoneStack& b) const {
- return cmp->Compare(a, b.end_key) < 0;
+ return cmp->CompareWithoutTimestamp(a, b.end_key) < 0;
}
const Comparator* cmp;
const RangeTombstoneStackEndComparator tombstone_end_cmp_;
const InternalKeyComparator* icmp_;
const Comparator* ucmp_;
- std::shared_ptr<const FragmentedRangeTombstoneList> tombstones_ref_;
- const FragmentedRangeTombstoneList* tombstones_;
+ std::shared_ptr<FragmentedRangeTombstoneList> tombstones_ref_;
+ std::shared_ptr<FragmentedRangeTombstoneListCache> tombstones_cache_ref_;
+ FragmentedRangeTombstoneList* tombstones_;
SequenceNumber upper_bound_;
SequenceNumber lower_bound_;
+ // Only consider timestamps <= ts_upper_bound_.
+ const Slice* ts_upper_bound_;
std::vector<RangeTombstoneStack>::const_iterator pos_;
std::vector<SequenceNumber>::const_iterator seq_pos_;
mutable std::vector<RangeTombstoneStack>::const_iterator pinned_pos_;
mutable std::vector<SequenceNumber>::const_iterator pinned_seq_pos_;
mutable InternalKey current_start_key_;
+
+ // Check the current RangeTombstoneStack `pos_` against timestamp
+ // upper bound `ts_upper_bound_` and sequence number upper bound
+ // `upper_bound_`. Update the sequence number (and timestamp) pointer
+ // `seq_pos_` to the first valid position satisfying both bounds.
+ void SetMaxVisibleSeqAndTimestamp() {
+ seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+ tombstones_->seq_iter(pos_->seq_end_idx),
+ upper_bound_, std::greater<SequenceNumber>());
+ if (ts_upper_bound_ && !ts_upper_bound_->empty()) {
+ auto ts_pos = std::lower_bound(
+ tombstones_->ts_iter(pos_->seq_start_idx),
+ tombstones_->ts_iter(pos_->seq_end_idx), *ts_upper_bound_,
+ [this](const Slice& s1, const Slice& s2) {
+ return ucmp_->CompareTimestamp(s1, s2) > 0;
+ });
+ auto ts_idx = ts_pos - tombstones_->ts_iter(pos_->seq_start_idx);
+ auto seq_idx = seq_pos_ - tombstones_->seq_iter(pos_->seq_start_idx);
+ if (seq_idx < ts_idx) {
+ // seq and ts are ordered in non-increasing order. Only updates seq_pos_
+ // to a larger index for smaller sequence number and timestamp.
+ seq_pos_ = tombstones_->seq_iter(pos_->seq_start_idx + ts_idx);
+ }
+ }
+ }
};
} // namespace ROCKSDB_NAMESPACE