]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/db/db_iter.h
import quincy beta 17.1.0
[ceph.git] / ceph / src / rocksdb / db / db_iter.h
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10 #pragma once
11 #include <stdint.h>
12 #include <string>
13 #include "db/db_impl/db_impl.h"
14 #include "db/dbformat.h"
15 #include "db/range_del_aggregator.h"
16 #include "memory/arena.h"
17 #include "options/cf_options.h"
18 #include "rocksdb/db.h"
19 #include "rocksdb/iterator.h"
20 #include "table/iterator_wrapper.h"
21 #include "util/autovector.h"
22
23 namespace ROCKSDB_NAMESPACE {
24
25 // This file declares the factory functions of DBIter, in its original form
26 // or a wrapped form with class ArenaWrappedDBIter, which is defined here.
27 // Class DBIter, which is declared and implemented inside db_iter.cc, is
28 // an iterator that converts internal keys (yielded by an InternalIterator)
29 // that were live at the specified sequence number into appropriate user
30 // keys.
31 // Each internal key consists of a user key, a sequence number, and a value
32 // type. DBIter deals with multiple key versions, tombstones, merge operands,
33 // etc, and exposes an Iterator.
34 // For example, DBIter may wrap the following InternalIterator:
35 // user key: AAA value: v3 seqno: 100 type: Put
36 // user key: AAA value: v2 seqno: 97 type: Put
37 // user key: AAA value: v1 seqno: 95 type: Put
38 // user key: BBB value: v1 seqno: 90 type: Put
39 // user key: BBC value: N/A seqno: 98 type: Delete
40 // user key: BBC value: v1 seqno: 95 type: Put
41 // If the snapshot passed in is 102, then the DBIter is expected to
42 // expose the following iterator:
43 // key: AAA value: v3
44 // key: BBB value: v1
45 // If the snapshot passed in is 96, then it should expose:
46 // key: AAA value: v1
47 // key: BBB value: v1
48 // key: BBC value: v1
49 //
50
51 // Memtables and sstables that make up the DB representation contain
52 // (userkey,seq,type) => uservalue entries. DBIter
53 // combines multiple entries for the same userkey found in the DB
54 // representation into a single entry while accounting for sequence
55 // numbers, deletion markers, overwrites, etc.
56 class DBIter final : public Iterator {
57 public:
58 // The following is grossly complicated. TODO: clean it up
59 // Which direction is the iterator currently moving?
60 // (1) When moving forward:
61 // (1a) if current_entry_is_merged_ = false, the internal iterator is
62 // positioned at the exact entry that yields this->key(), this->value()
63 // (1b) if current_entry_is_merged_ = true, the internal iterator is
64 // positioned immediately after the last entry that contributed to the
65 // current this->value(). That entry may or may not have key equal to
66 // this->key().
67 // (2) When moving backwards, the internal iterator is positioned
68 // just before all entries whose user key == this->key().
69 enum Direction { kForward, kReverse };
70
71 // LocalStatistics contain Statistics counters that will be aggregated per
72 // each iterator instance and then will be sent to the global statistics when
73 // the iterator is destroyed.
74 //
75 // The purpose of this approach is to avoid perf regression happening
76 // when multiple threads bump the atomic counters from a DBIter::Next().
77 struct LocalStatistics {
78 explicit LocalStatistics() { ResetCounters(); }
79
80 void ResetCounters() {
81 next_count_ = 0;
82 next_found_count_ = 0;
83 prev_count_ = 0;
84 prev_found_count_ = 0;
85 bytes_read_ = 0;
86 skip_count_ = 0;
87 }
88
89 void BumpGlobalStatistics(Statistics* global_statistics) {
90 RecordTick(global_statistics, NUMBER_DB_NEXT, next_count_);
91 RecordTick(global_statistics, NUMBER_DB_NEXT_FOUND, next_found_count_);
92 RecordTick(global_statistics, NUMBER_DB_PREV, prev_count_);
93 RecordTick(global_statistics, NUMBER_DB_PREV_FOUND, prev_found_count_);
94 RecordTick(global_statistics, ITER_BYTES_READ, bytes_read_);
95 RecordTick(global_statistics, NUMBER_ITER_SKIP, skip_count_);
96 PERF_COUNTER_ADD(iter_read_bytes, bytes_read_);
97 ResetCounters();
98 }
99
100 // Map to Tickers::NUMBER_DB_NEXT
101 uint64_t next_count_;
102 // Map to Tickers::NUMBER_DB_NEXT_FOUND
103 uint64_t next_found_count_;
104 // Map to Tickers::NUMBER_DB_PREV
105 uint64_t prev_count_;
106 // Map to Tickers::NUMBER_DB_PREV_FOUND
107 uint64_t prev_found_count_;
108 // Map to Tickers::ITER_BYTES_READ
109 uint64_t bytes_read_;
110 // Map to Tickers::NUMBER_ITER_SKIP
111 uint64_t skip_count_;
112 };
113
114 DBIter(Env* _env, const ReadOptions& read_options,
115 const ImmutableCFOptions& cf_options,
116 const MutableCFOptions& mutable_cf_options, const Comparator* cmp,
117 InternalIterator* iter, SequenceNumber s, bool arena_mode,
118 uint64_t max_sequential_skip_in_iterations,
119 ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd,
120 bool allow_blob);
121
122 // No copying allowed
123 DBIter(const DBIter&) = delete;
124 void operator=(const DBIter&) = delete;
125
126 ~DBIter() override {
127 // Release pinned data if any
128 if (pinned_iters_mgr_.PinningEnabled()) {
129 pinned_iters_mgr_.ReleasePinnedData();
130 }
131 RecordTick(statistics_, NO_ITERATOR_DELETED);
132 ResetInternalKeysSkippedCounter();
133 local_stats_.BumpGlobalStatistics(statistics_);
134 iter_.DeleteIter(arena_mode_);
135 }
136 void SetIter(InternalIterator* iter) {
137 assert(iter_.iter() == nullptr);
138 iter_.Set(iter);
139 iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_);
140 }
141 ReadRangeDelAggregator* GetRangeDelAggregator() { return &range_del_agg_; }
142
143 bool Valid() const override {
144 #ifdef ROCKSDB_ASSERT_STATUS_CHECKED
145 if (valid_) {
146 status_.PermitUncheckedError();
147 }
148 #endif // ROCKSDB_ASSERT_STATUS_CHECKED
149 return valid_;
150 }
151 Slice key() const override {
152 assert(valid_);
153 if (start_seqnum_ > 0 || timestamp_lb_) {
154 return saved_key_.GetInternalKey();
155 } else {
156 const Slice ukey_and_ts = saved_key_.GetUserKey();
157 return Slice(ukey_and_ts.data(), ukey_and_ts.size() - timestamp_size_);
158 }
159 }
160 Slice value() const override {
161 assert(valid_);
162 if (current_entry_is_merged_) {
163 // If pinned_value_ is set then the result of merge operator is one of
164 // the merge operands and we should return it.
165 return pinned_value_.data() ? pinned_value_ : saved_value_;
166 } else if (direction_ == kReverse) {
167 return pinned_value_;
168 } else {
169 return iter_.value();
170 }
171 }
172 Status status() const override {
173 if (status_.ok()) {
174 return iter_.status();
175 } else {
176 assert(!valid_);
177 return status_;
178 }
179 }
180 Slice timestamp() const override {
181 assert(valid_);
182 assert(timestamp_size_ > 0);
183 const Slice ukey_and_ts = saved_key_.GetUserKey();
184 assert(timestamp_size_ < ukey_and_ts.size());
185 return ExtractTimestampFromUserKey(ukey_and_ts, timestamp_size_);
186 }
187 bool IsBlob() const {
188 assert(valid_ && (allow_blob_ || !is_blob_));
189 return is_blob_;
190 }
191
192 Status GetProperty(std::string prop_name, std::string* prop) override;
193
194 void Next() final override;
195 void Prev() final override;
196 // 'target' does not contain timestamp, even if user timestamp feature is
197 // enabled.
198 void Seek(const Slice& target) final override;
199 void SeekForPrev(const Slice& target) final override;
200 void SeekToFirst() final override;
201 void SeekToLast() final override;
202 Env* env() const { return env_; }
203 void set_sequence(uint64_t s) {
204 sequence_ = s;
205 if (read_callback_) {
206 read_callback_->Refresh(s);
207 }
208 }
209 void set_valid(bool v) { valid_ = v; }
210
211 private:
212 // For all methods in this block:
213 // PRE: iter_->Valid() && status_.ok()
214 // Return false if there was an error, and status() is non-ok, valid_ = false;
215 // in this case callers would usually stop what they were doing and return.
216 bool ReverseToForward();
217 bool ReverseToBackward();
218 // Set saved_key_ to the seek key to target, with proper sequence number set.
219 // It might get adjusted if the seek key is smaller than iterator lower bound.
220 void SetSavedKeyToSeekTarget(const Slice& target);
221 // Set saved_key_ to the seek key to target, with proper sequence number set.
222 // It might get adjusted if the seek key is larger than iterator upper bound.
223 void SetSavedKeyToSeekForPrevTarget(const Slice& target);
224 bool FindValueForCurrentKey();
225 bool FindValueForCurrentKeyUsingSeek();
226 bool FindUserKeyBeforeSavedKey();
227 // If `skipping_saved_key` is true, the function will keep iterating until it
228 // finds a user key that is larger than `saved_key_`.
229 // If `prefix` is not null, the iterator needs to stop when all keys for the
230 // prefix are exhausted and the interator is set to invalid.
231 bool FindNextUserEntry(bool skipping_saved_key, const Slice* prefix);
232 // Internal implementation of FindNextUserEntry().
233 bool FindNextUserEntryInternal(bool skipping_saved_key, const Slice* prefix);
234 bool ParseKey(ParsedInternalKey* key);
235 bool MergeValuesNewToOld();
236
237 // If prefix is not null, we need to set the iterator to invalid if no more
238 // entry can be found within the prefix.
239 void PrevInternal(const Slice* prefix);
240 bool TooManyInternalKeysSkipped(bool increment = true);
241 bool IsVisible(SequenceNumber sequence, const Slice& ts,
242 bool* more_recent = nullptr);
243
244 // Temporarily pin the blocks that we encounter until ReleaseTempPinnedData()
245 // is called
246 void TempPinData() {
247 if (!pin_thru_lifetime_) {
248 pinned_iters_mgr_.StartPinning();
249 }
250 }
251
252 // Release blocks pinned by TempPinData()
253 void ReleaseTempPinnedData() {
254 if (!pin_thru_lifetime_ && pinned_iters_mgr_.PinningEnabled()) {
255 pinned_iters_mgr_.ReleasePinnedData();
256 }
257 }
258
259 inline void ClearSavedValue() {
260 if (saved_value_.capacity() > 1048576) {
261 std::string empty;
262 swap(empty, saved_value_);
263 } else {
264 saved_value_.clear();
265 }
266 }
267
268 inline void ResetInternalKeysSkippedCounter() {
269 local_stats_.skip_count_ += num_internal_keys_skipped_;
270 if (valid_) {
271 local_stats_.skip_count_--;
272 }
273 num_internal_keys_skipped_ = 0;
274 }
275
276 bool expect_total_order_inner_iter() {
277 assert(expect_total_order_inner_iter_ || prefix_extractor_ != nullptr);
278 return expect_total_order_inner_iter_;
279 }
280
281 // If lower bound of timestamp is given by ReadOptions.iter_start_ts, we need
282 // to return versions of the same key. We cannot just skip if the key value
283 // is the same but timestamps are different but fall in timestamp range.
284 inline int CompareKeyForSkip(const Slice& a, const Slice& b) {
285 return timestamp_lb_ != nullptr
286 ? user_comparator_.Compare(a, b)
287 : user_comparator_.CompareWithoutTimestamp(a, b);
288 }
289
290 const SliceTransform* prefix_extractor_;
291 Env* const env_;
292 Logger* logger_;
293 UserComparatorWrapper user_comparator_;
294 const MergeOperator* const merge_operator_;
295 IteratorWrapper iter_;
296 ReadCallback* read_callback_;
297 // Max visible sequence number. It is normally the snapshot seq unless we have
298 // uncommitted data in db as in WriteUnCommitted.
299 SequenceNumber sequence_;
300
301 IterKey saved_key_;
302 // Reusable internal key data structure. This is only used inside one function
303 // and should not be used across functions. Reusing this object can reduce
304 // overhead of calling construction of the function if creating it each time.
305 ParsedInternalKey ikey_;
306 std::string saved_value_;
307 Slice pinned_value_;
308 // for prefix seek mode to support prev()
309 Statistics* statistics_;
310 uint64_t max_skip_;
311 uint64_t max_skippable_internal_keys_;
312 uint64_t num_internal_keys_skipped_;
313 const Slice* iterate_lower_bound_;
314 const Slice* iterate_upper_bound_;
315
316 // The prefix of the seek key. It is only used when prefix_same_as_start_
317 // is true and prefix extractor is not null. In Next() or Prev(), current keys
318 // will be checked against this prefix, so that the iterator can be
319 // invalidated if the keys in this prefix has been exhausted. Set it using
320 // SetUserKey() and use it using GetUserKey().
321 IterKey prefix_;
322
323 Status status_;
324 Direction direction_;
325 bool valid_;
326 bool current_entry_is_merged_;
327 // True if we know that the current entry's seqnum is 0.
328 // This information is used as that the next entry will be for another
329 // user key.
330 bool is_key_seqnum_zero_;
331 const bool prefix_same_as_start_;
332 // Means that we will pin all data blocks we read as long the Iterator
333 // is not deleted, will be true if ReadOptions::pin_data is true
334 const bool pin_thru_lifetime_;
335 // Expect the inner iterator to maintain a total order.
336 // prefix_extractor_ must be non-NULL if the value is false.
337 const bool expect_total_order_inner_iter_;
338 bool allow_blob_;
339 bool is_blob_;
340 bool arena_mode_;
341 // List of operands for merge operator.
342 MergeContext merge_context_;
343 ReadRangeDelAggregator range_del_agg_;
344 LocalStatistics local_stats_;
345 PinnedIteratorsManager pinned_iters_mgr_;
346 #ifdef ROCKSDB_LITE
347 ROCKSDB_FIELD_UNUSED
348 #endif
349 DBImpl* db_impl_;
350 #ifdef ROCKSDB_LITE
351 ROCKSDB_FIELD_UNUSED
352 #endif
353 ColumnFamilyData* cfd_;
354 // for diff snapshots we want the lower bound on the seqnum;
355 // if this value > 0 iterator will return internal keys
356 SequenceNumber start_seqnum_;
357 const Slice* const timestamp_ub_;
358 const Slice* const timestamp_lb_;
359 const size_t timestamp_size_;
360 };
361
362 // Return a new iterator that converts internal keys (yielded by
363 // "*internal_iter") that were live at the specified `sequence` number
364 // into appropriate user keys.
365 extern Iterator* NewDBIterator(
366 Env* env, const ReadOptions& read_options,
367 const ImmutableCFOptions& cf_options,
368 const MutableCFOptions& mutable_cf_options,
369 const Comparator* user_key_comparator, InternalIterator* internal_iter,
370 const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
371 ReadCallback* read_callback, DBImpl* db_impl = nullptr,
372 ColumnFamilyData* cfd = nullptr, bool allow_blob = false);
373
374 } // namespace ROCKSDB_NAMESPACE