1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
13 #include "db/db_impl/db_impl.h"
14 #include "db/dbformat.h"
15 #include "db/range_del_aggregator.h"
16 #include "memory/arena.h"
17 #include "options/cf_options.h"
18 #include "rocksdb/db.h"
19 #include "rocksdb/iterator.h"
20 #include "table/iterator_wrapper.h"
21 #include "util/autovector.h"
23 namespace ROCKSDB_NAMESPACE
{
25 // This file declares the factory functions of DBIter, in its original form
26 // or a wrapped form with class ArenaWrappedDBIter, which is defined here.
27 // Class DBIter, which is declared and implemented inside db_iter.cc, is
28 // an iterator that converts internal keys (yielded by an InternalIterator)
29 // that were live at the specified sequence number into appropriate user keys.
31 // Each internal key consists of a user key, a sequence number, and a value
32 // type. DBIter deals with multiple key versions, tombstones, merge operands,
33 // etc, and exposes an Iterator.
34 // For example, DBIter may wrap following InternalIterator:
35 // user key: AAA value: v3 seqno: 100 type: Put
36 // user key: AAA value: v2 seqno: 97 type: Put
37 // user key: AAA value: v1 seqno: 95 type: Put
38 // user key: BBB value: v1 seqno: 90 type: Put
39 // user key: BBC value: N/A seqno: 98 type: Delete
40 // user key: BBC value: v1 seqno: 95 type: Put
41 // If the snapshot passed in is 102, then the DBIter is expected to
42 // expose the following iterator:
//    user key: AAA  value: v3
//    user key: BBB  value: v1
// (BBC is not exposed: its Delete at seqno 98 shadows the Put at seqno 95)
45 // If the snapshot passed in is 96, then it should expose:
//    user key: AAA  value: v1
//    user key: BBB  value: v1
//    user key: BBC  value: v1
51 // Memtables and sstables that make the DB representation contain
52 // (userkey,seq,type) => uservalue entries. DBIter
53 // combines multiple entries for the same userkey found in the DB
54 // representation into a single entry while accounting for sequence
55 // numbers, deletion markers, overwrites, etc.
56 class DBIter final
: public Iterator
{
// NOTE(review): this header appears to be a partially garbled extraction of
// RocksDB's db_iter.h. The embedded original line numbers skip in several
// places (e.g. the tail of the DBIter constructor around original lines
// 120-121, the ~DBIter() signature around 125-126, the IsBlob() return at
// 189, and various access specifiers / closing braces). Recover the missing
// lines from upstream before attempting to compile this file.
//
// DBIter adapts an InternalIterator over (user_key, seqno, type) entries
// into a user-facing Iterator that exposes at most one live version of each
// user key visible at sequence_, collapsing tombstones and combining merge
// operands (see MergeValuesNewToOld / merge_context_ / range_del_agg_).
58 // The following is grossly complicated. TODO: clean it up
59 // Which direction is the iterator currently moving?
60 // (1) When moving forward:
61 // (1a) if current_entry_is_merged_ = false, the internal iterator is
62 // positioned at the exact entry that yields this->key(), this->value()
63 // (1b) if current_entry_is_merged_ = true, the internal iterator is
64 // positioned immediately after the last entry that contributed to the
65 // current this->value(). That entry may or may not have key equal to
67 // (2) When moving backwards, the internal iterator is positioned
68 // just before all entries whose user key == this->key().
69 enum Direction
{ kForward
, kReverse
};
71 // LocalStatistics contain Statistics counters that will be aggregated per
72 // each iterator instance and then will be sent to the global statistics when
73 // the iterator is destroyed.
75 // The purpose of this approach is to avoid perf regression happening
76 // when multiple threads bump the atomic counters from a DBIter::Next().
77 struct LocalStatistics
{
78 explicit LocalStatistics() { ResetCounters(); }
// NOTE(review): ResetCounters() below only shows assignments for
// next_found_count_ and prev_found_count_; the resets of the other four
// counters (original lines 81, 83, 85-88) are among the missing lines.
80 void ResetCounters() {
82 next_found_count_
= 0;
84 prev_found_count_
= 0;
// Flush every local counter into the process-wide Statistics object and
// the iter_read_bytes perf counter. Called once from the DBIter destructor.
89 void BumpGlobalStatistics(Statistics
* global_statistics
) {
90 RecordTick(global_statistics
, NUMBER_DB_NEXT
, next_count_
);
91 RecordTick(global_statistics
, NUMBER_DB_NEXT_FOUND
, next_found_count_
);
92 RecordTick(global_statistics
, NUMBER_DB_PREV
, prev_count_
);
93 RecordTick(global_statistics
, NUMBER_DB_PREV_FOUND
, prev_found_count_
);
94 RecordTick(global_statistics
, ITER_BYTES_READ
, bytes_read_
);
95 RecordTick(global_statistics
, NUMBER_ITER_SKIP
, skip_count_
);
96 PERF_COUNTER_ADD(iter_read_bytes
, bytes_read_
);
100 // Map to Tickers::NUMBER_DB_NEXT
101 uint64_t next_count_
;
102 // Map to Tickers::NUMBER_DB_NEXT_FOUND
103 uint64_t next_found_count_
;
104 // Map to Tickers::NUMBER_DB_PREV
105 uint64_t prev_count_
;
106 // Map to Tickers::NUMBER_DB_PREV_FOUND
107 uint64_t prev_found_count_
;
108 // Map to Tickers::ITER_BYTES_READ
109 uint64_t bytes_read_
;
110 // Map to Tickers::NUMBER_ITER_SKIP
111 uint64_t skip_count_
;
// NOTE(review): the constructor parameter list below is cut off after `cfd,`
// (original lines 120-121 missing -- presumably the trailing bool/timestamp
// parameters and the closing paren; confirm against upstream).
114 DBIter(Env
* _env
, const ReadOptions
& read_options
,
115 const ImmutableCFOptions
& cf_options
,
116 const MutableCFOptions
& mutable_cf_options
, const Comparator
* cmp
,
117 InternalIterator
* iter
, SequenceNumber s
, bool arena_mode
,
118 uint64_t max_sequential_skip_in_iterations
,
119 ReadCallback
* read_callback
, DBImpl
* db_impl
, ColumnFamilyData
* cfd
,
122 // No copying allowed
123 DBIter(const DBIter
&) = delete;
124 void operator=(const DBIter
&) = delete;
// NOTE(review): the destructor signature (original lines 125-126) is
// missing; the statements below are its body: release pinned blocks, flush
// local stats into statistics_, and delete the inner iterator, honoring
// arena placement via DeleteIter(arena_mode_).
127 // Release pinned data if any
128 if (pinned_iters_mgr_
.PinningEnabled()) {
129 pinned_iters_mgr_
.ReleasePinnedData();
131 RecordTick(statistics_
, NO_ITERATOR_DELETED
);
132 ResetInternalKeysSkippedCounter();
133 local_stats_
.BumpGlobalStatistics(statistics_
);
134 iter_
.DeleteIter(arena_mode_
);
// Install the inner iterator; only legal while no iterator is set yet
// (asserted), and wires the pinned-iterators manager into it.
136 void SetIter(InternalIterator
* iter
) {
137 assert(iter_
.iter() == nullptr);
139 iter_
.iter()->SetPinnedItersMgr(&pinned_iters_mgr_
);
141 ReadRangeDelAggregator
* GetRangeDelAggregator() { return &range_del_agg_
; }
143 bool Valid() const override
{
144 #ifdef ROCKSDB_ASSERT_STATUS_CHECKED
146 status_
.PermitUncheckedError();
148 #endif // ROCKSDB_ASSERT_STATUS_CHECKED
// key(): returns the full internal key when iterating for diff snapshots
// (start_seqnum_ > 0) or with a timestamp lower bound; otherwise returns
// the user key with the trailing timestamp_size_ bytes stripped.
151 Slice
key() const override
{
153 if (start_seqnum_
> 0 || timestamp_lb_
) {
154 return saved_key_
.GetInternalKey();
156 const Slice ukey_and_ts
= saved_key_
.GetUserKey();
157 return Slice(ukey_and_ts
.data(), ukey_and_ts
.size() - timestamp_size_
);
160 Slice
value() const override
{
162 if (current_entry_is_merged_
) {
163 // If pinned_value_ is set then the result of merge operator is one of
164 // the merge operands and we should return it.
165 return pinned_value_
.data() ? pinned_value_
: saved_value_
;
166 } else if (direction_
== kReverse
) {
167 return pinned_value_
;
169 return iter_
.value();
172 Status
status() const override
{
174 return iter_
.status();
// timestamp(): only meaningful when user-defined timestamps are enabled
// (timestamp_size_ > 0, asserted); extracts the trailing timestamp from the
// saved user key.
180 Slice
timestamp() const override
{
182 assert(timestamp_size_
> 0);
183 const Slice ukey_and_ts
= saved_key_
.GetUserKey();
184 assert(timestamp_size_
< ukey_and_ts
.size());
185 return ExtractTimestampFromUserKey(ukey_and_ts
, timestamp_size_
);
// NOTE(review): IsBlob()'s return statement (original line ~189) is missing
// from this paste; only the precondition assert survives.
187 bool IsBlob() const {
188 assert(valid_
&& (allow_blob_
|| !is_blob_
));
192 Status
GetProperty(std::string prop_name
, std::string
* prop
) override
;
194 void Next() final override
;
195 void Prev() final override
;
196 // 'target' does not contain timestamp, even if user timestamp feature is
198 void Seek(const Slice
& target
) final override
;
199 void SeekForPrev(const Slice
& target
) final override
;
200 void SeekToFirst() final override
;
201 void SeekToLast() final override
;
202 Env
* env() const { return env_
; }
// Update the max visible sequence number and propagate it to the read
// callback, if one is installed.
203 void set_sequence(uint64_t s
) {
205 if (read_callback_
) {
206 read_callback_
->Refresh(s
);
209 void set_valid(bool v
) { valid_
= v
; }
212 // For all methods in this block:
213 // PRE: iter_->Valid() && status_.ok()
214 // Return false if there was an error, and status() is non-ok, valid_ = false;
215 // in this case callers would usually stop what they were doing and return.
216 bool ReverseToForward();
217 bool ReverseToBackward();
218 // Set saved_key_ to the seek key to target, with proper sequence number set.
219 // It might get adjusted if the seek key is smaller than iterator lower bound.
220 void SetSavedKeyToSeekTarget(const Slice
& target
);
221 // Set saved_key_ to the seek key to target, with proper sequence number set.
222 // It might get adjusted if the seek key is larger than iterator upper bound.
223 void SetSavedKeyToSeekForPrevTarget(const Slice
& target
);
224 bool FindValueForCurrentKey();
225 bool FindValueForCurrentKeyUsingSeek();
226 bool FindUserKeyBeforeSavedKey();
227 // If `skipping_saved_key` is true, the function will keep iterating until it
228 // finds a user key that is larger than `saved_key_`.
229 // If `prefix` is not null, the iterator needs to stop when all keys for the
230 // prefix are exhausted and the interator is set to invalid.
231 bool FindNextUserEntry(bool skipping_saved_key
, const Slice
* prefix
);
232 // Internal implementation of FindNextUserEntry().
233 bool FindNextUserEntryInternal(bool skipping_saved_key
, const Slice
* prefix
);
234 bool ParseKey(ParsedInternalKey
* key
);
235 bool MergeValuesNewToOld();
237 // If prefix is not null, we need to set the iterator to invalid if no more
238 // entry can be found within the prefix.
239 void PrevInternal(const Slice
* prefix
);
240 bool TooManyInternalKeysSkipped(bool increment
= true);
241 bool IsVisible(SequenceNumber sequence
, const Slice
& ts
,
242 bool* more_recent
= nullptr);
244 // Temporarily pin the blocks that we encounter until ReleaseTempPinnedData()
// (is called). No-op when pin_thru_lifetime_ already pins for the whole
// iterator lifetime. The method signature (original lines 245-246) is
// missing from this paste.
247 if (!pin_thru_lifetime_
) {
248 pinned_iters_mgr_
.StartPinning();
252 // Release blocks pinned by TempPinData()
253 void ReleaseTempPinnedData() {
254 if (!pin_thru_lifetime_
&& pinned_iters_mgr_
.PinningEnabled()) {
255 pinned_iters_mgr_
.ReleasePinnedData();
// Drop saved_value_'s contents; if its capacity grew past 1 MiB, swap with
// an empty string to actually release the memory instead of just clearing.
259 inline void ClearSavedValue() {
260 if (saved_value_
.capacity() > 1048576) {
262 swap(empty
, saved_value_
);
264 saved_value_
.clear();
// Fold the per-seek skip counter into local_stats_ and reset it. The
// condition guarding the decrement (original line 270) is missing here.
268 inline void ResetInternalKeysSkippedCounter() {
269 local_stats_
.skip_count_
+= num_internal_keys_skipped_
;
271 local_stats_
.skip_count_
--;
273 num_internal_keys_skipped_
= 0;
276 bool expect_total_order_inner_iter() {
277 assert(expect_total_order_inner_iter_
|| prefix_extractor_
!= nullptr);
278 return expect_total_order_inner_iter_
;
281 // If lower bound of timestamp is given by ReadOptions.iter_start_ts, we need
282 // to return versions of the same key. We cannot just skip if the key value
283 // is the same but timestamps are different but fall in timestamp range.
284 inline int CompareKeyForSkip(const Slice
& a
, const Slice
& b
) {
285 return timestamp_lb_
!= nullptr
286 ? user_comparator_
.Compare(a
, b
)
287 : user_comparator_
.CompareWithoutTimestamp(a
, b
);
// Prefix extractor; asserted non-null whenever the inner iterator is not
// expected to be total-order (see expect_total_order_inner_iter()).
290 const SliceTransform
* prefix_extractor_
;
// User-key comparator; CompareKeyForSkip() chooses between Compare and
// CompareWithoutTimestamp based on timestamp_lb_.
293 UserComparatorWrapper user_comparator_
;
294 const MergeOperator
* const merge_operator_
;
// Wrapped inner InternalIterator; released via DeleteIter(arena_mode_) in
// the destructor.
295 IteratorWrapper iter_
;
296 ReadCallback
* read_callback_
;
297 // Max visible sequence number. It is normally the snapshot seq unless we have
298 // uncommitted data in db as in WriteUnCommitted.
299 SequenceNumber sequence_
;
302 // Reusable internal key data structure. This is only used inside one function
303 // and should not be used across functions. Reusing this object can reduce
304 // overhead of calling construction of the function if creating it each time.
305 ParsedInternalKey ikey_
;
// Owned storage for the current value when it cannot be served from
// pinned_value_ (see value()); shrunk by ClearSavedValue() once capacity
// exceeds 1 MiB.
306 std::string saved_value_
;
308 // for prefix seek mode to support prev()
// Global statistics sink; local_stats_ is flushed into it on destruction.
309 Statistics
* statistics_
;
// presumably the cap enforced by TooManyInternalKeysSkipped() against
// num_internal_keys_skipped_ -- TODO confirm in db_iter.cc
311 uint64_t max_skippable_internal_keys_
;
312 uint64_t num_internal_keys_skipped_
;
313 const Slice
* iterate_lower_bound_
;
314 const Slice
* iterate_upper_bound_
;
316 // The prefix of the seek key. It is only used when prefix_same_as_start_
317 // is true and prefix extractor is not null. In Next() or Prev(), current keys
318 // will be checked against this prefix, so that the iterator can be
319 // invalidated if the keys in this prefix has been exhausted. Set it using
320 // SetUserKey() and use it using GetUserKey().
// (The member declaration for the prefix, original line ~321, is missing.)
324 Direction direction_
;
326 bool current_entry_is_merged_
;
327 // True if we know that the current entry's seqnum is 0.
328 // This information is used as that the next entry will be for another
330 bool is_key_seqnum_zero_
;
331 const bool prefix_same_as_start_
;
332 // Means that we will pin all data blocks we read as long the Iterator
333 // is not deleted, will be true if ReadOptions::pin_data is true
334 const bool pin_thru_lifetime_
;
335 // Expect the inner iterator to maintain a total order.
336 // prefix_extractor_ must be non-NULL if the value is false.
337 const bool expect_total_order_inner_iter_
;
341 // List of operands for merge operator.
342 MergeContext merge_context_
;
343 ReadRangeDelAggregator range_del_agg_
;
344 LocalStatistics local_stats_
;
345 PinnedIteratorsManager pinned_iters_mgr_
;
353 ColumnFamilyData
* cfd_
;
354 // for diff snapshots we want the lower bound on the seqnum;
355 // if this value > 0 iterator will return internal keys
356 SequenceNumber start_seqnum_
;
// Upper/lower timestamp bounds from ReadOptions (nullable); timestamp_size_
// is the per-key timestamp width stripped/extracted by key()/timestamp().
357 const Slice
* const timestamp_ub_
;
358 const Slice
* const timestamp_lb_
;
359 const size_t timestamp_size_
;
362 // Return a new iterator that converts internal keys (yielded by
363 // "*internal_iter") that were live at the specified `sequence` number
364 // into appropriate user keys.
365 extern Iterator
* NewDBIterator(
366 Env
* env
, const ReadOptions
& read_options
,
367 const ImmutableCFOptions
& cf_options
,
368 const MutableCFOptions
& mutable_cf_options
,
369 const Comparator
* user_key_comparator
, InternalIterator
* internal_iter
,
370 const SequenceNumber
& sequence
, uint64_t max_sequential_skip_in_iterations
,
371 ReadCallback
* read_callback
, DBImpl
* db_impl
= nullptr,
372 ColumnFamilyData
* cfd
= nullptr, bool allow_blob
= false);
374 } // namespace ROCKSDB_NAMESPACE