]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/table/block_based/block_based_table_iterator.h
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / table / block_based / block_based_table_iterator.h
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 #pragma once
10 #include "table/block_based/block_based_table_reader.h"
11 #include "table/block_based/block_based_table_reader_impl.h"
12 #include "table/block_based/block_prefetcher.h"
13 #include "table/block_based/reader_common.h"
14
15 namespace ROCKSDB_NAMESPACE {
16 // Iterates over the contents of BlockBasedTable.
17 class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
18 // compaction_readahead_size: its value will only be used if for_compaction =
19 // true
20 // @param read_options Must outlive this iterator.
21 public:
22 BlockBasedTableIterator(
23 const BlockBasedTable* table, const ReadOptions& read_options,
24 const InternalKeyComparator& icomp,
25 std::unique_ptr<InternalIteratorBase<IndexValue>>&& index_iter,
26 bool check_filter, bool need_upper_bound_check,
27 const SliceTransform* prefix_extractor, TableReaderCaller caller,
28 size_t compaction_readahead_size = 0, bool allow_unprepared_value = false)
29 : index_iter_(std::move(index_iter)),
30 table_(table),
31 read_options_(read_options),
32 icomp_(icomp),
33 user_comparator_(icomp.user_comparator()),
34 pinned_iters_mgr_(nullptr),
35 prefix_extractor_(prefix_extractor),
36 lookup_context_(caller),
37 block_prefetcher_(
38 compaction_readahead_size,
39 table_->get_rep()->table_options.initial_auto_readahead_size),
40 allow_unprepared_value_(allow_unprepared_value),
41 block_iter_points_to_real_block_(false),
42 check_filter_(check_filter),
43 need_upper_bound_check_(need_upper_bound_check),
44 async_read_in_progress_(false) {}
45
46 ~BlockBasedTableIterator() {}
47
48 void Seek(const Slice& target) override;
49 void SeekForPrev(const Slice& target) override;
50 void SeekToFirst() override;
51 void SeekToLast() override;
52 void Next() final override;
53 bool NextAndGetResult(IterateResult* result) override;
54 void Prev() override;
55 bool Valid() const override {
56 return !is_out_of_bound_ &&
57 (is_at_first_key_from_index_ ||
58 (block_iter_points_to_real_block_ && block_iter_.Valid()));
59 }
60 Slice key() const override {
61 assert(Valid());
62 if (is_at_first_key_from_index_) {
63 return index_iter_->value().first_internal_key;
64 } else {
65 return block_iter_.key();
66 }
67 }
68 Slice user_key() const override {
69 assert(Valid());
70 if (is_at_first_key_from_index_) {
71 return ExtractUserKey(index_iter_->value().first_internal_key);
72 } else {
73 return block_iter_.user_key();
74 }
75 }
76 bool PrepareValue() override {
77 assert(Valid());
78
79 if (!is_at_first_key_from_index_) {
80 return true;
81 }
82
83 return const_cast<BlockBasedTableIterator*>(this)
84 ->MaterializeCurrentBlock();
85 }
86 Slice value() const override {
87 // PrepareValue() must have been called.
88 assert(!is_at_first_key_from_index_);
89 assert(Valid());
90
91 return block_iter_.value();
92 }
93 Status status() const override {
94 // Prefix index set status to NotFound when the prefix does not exist
95 if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) {
96 return index_iter_->status();
97 } else if (block_iter_points_to_real_block_) {
98 return block_iter_.status();
99 } else if (async_read_in_progress_) {
100 return Status::TryAgain();
101 } else {
102 return Status::OK();
103 }
104 }
105
106 inline IterBoundCheck UpperBoundCheckResult() override {
107 if (is_out_of_bound_) {
108 return IterBoundCheck::kOutOfBound;
109 } else if (block_upper_bound_check_ ==
110 BlockUpperBound::kUpperBoundBeyondCurBlock) {
111 assert(!is_out_of_bound_);
112 return IterBoundCheck::kInbound;
113 } else {
114 return IterBoundCheck::kUnknown;
115 }
116 }
117
118 void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
119 pinned_iters_mgr_ = pinned_iters_mgr;
120 }
121 bool IsKeyPinned() const override {
122 // Our key comes either from block_iter_'s current key
123 // or index_iter_'s current *value*.
124 return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
125 ((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) ||
126 (block_iter_points_to_real_block_ && block_iter_.IsKeyPinned()));
127 }
128 bool IsValuePinned() const override {
129 assert(!is_at_first_key_from_index_);
130 assert(Valid());
131
132 // BlockIter::IsValuePinned() is always true. No need to check
133 return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
134 block_iter_points_to_real_block_;
135 }
136
137 void ResetDataIter() {
138 if (block_iter_points_to_real_block_) {
139 if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) {
140 block_iter_.DelegateCleanupsTo(pinned_iters_mgr_);
141 }
142 block_iter_.Invalidate(Status::OK());
143 block_iter_points_to_real_block_ = false;
144 }
145 block_upper_bound_check_ = BlockUpperBound::kUnknown;
146 }
147
148 void SavePrevIndexValue() {
149 if (block_iter_points_to_real_block_) {
150 // Reseek. If they end up with the same data block, we shouldn't re-fetch
151 // the same data block.
152 prev_block_offset_ = index_iter_->value().handle.offset();
153 }
154 }
155
156 void GetReadaheadState(ReadaheadFileInfo* readahead_file_info) override {
157 if (block_prefetcher_.prefetch_buffer() != nullptr &&
158 read_options_.adaptive_readahead) {
159 block_prefetcher_.prefetch_buffer()->GetReadaheadState(
160 &(readahead_file_info->data_block_readahead_info));
161 if (index_iter_) {
162 index_iter_->GetReadaheadState(readahead_file_info);
163 }
164 }
165 }
166
167 void SetReadaheadState(ReadaheadFileInfo* readahead_file_info) override {
168 if (read_options_.adaptive_readahead) {
169 block_prefetcher_.SetReadaheadState(
170 &(readahead_file_info->data_block_readahead_info));
171 if (index_iter_) {
172 index_iter_->SetReadaheadState(readahead_file_info);
173 }
174 }
175 }
176
177 std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter_;
178
179 private:
180 enum class IterDirection {
181 kForward,
182 kBackward,
183 };
184 // This enum indicates whether the upper bound falls into current block
185 // or beyond.
186 // +-------------+
187 // | cur block | <-- (1)
188 // +-------------+
189 // <-- (2)
190 // --- <boundary key> ---
191 // <-- (3)
192 // +-------------+
193 // | next block | <-- (4)
194 // ......
195 //
196 // When the block is smaller than <boundary key>, kUpperBoundInCurBlock
197 // is the value to use. The examples are (1) or (2) in the graph. It means
198 // all keys in the next block or beyond will be out of bound. Keys within
199 // the current block may or may not be out of bound.
200 // When the block is larger or equal to <boundary key>,
201 // kUpperBoundBeyondCurBlock is to be used. The examples are (3) and (4)
202 // in the graph. It means that all keys in the current block is within the
203 // upper bound and keys in the next block may or may not be within the uppder
204 // bound.
205 // If the boundary key hasn't been checked against the upper bound,
206 // kUnknown can be used.
207 enum class BlockUpperBound {
208 kUpperBoundInCurBlock,
209 kUpperBoundBeyondCurBlock,
210 kUnknown,
211 };
212
213 const BlockBasedTable* table_;
214 const ReadOptions& read_options_;
215 const InternalKeyComparator& icomp_;
216 UserComparatorWrapper user_comparator_;
217 PinnedIteratorsManager* pinned_iters_mgr_;
218 DataBlockIter block_iter_;
219 const SliceTransform* prefix_extractor_;
220 uint64_t prev_block_offset_ = std::numeric_limits<uint64_t>::max();
221 BlockCacheLookupContext lookup_context_;
222
223 BlockPrefetcher block_prefetcher_;
224
225 const bool allow_unprepared_value_;
226 // True if block_iter_ is initialized and points to the same block
227 // as index iterator.
228 bool block_iter_points_to_real_block_;
229 // See InternalIteratorBase::IsOutOfBound().
230 bool is_out_of_bound_ = false;
231 // How current data block's boundary key with the next block is compared with
232 // iterate upper bound.
233 BlockUpperBound block_upper_bound_check_ = BlockUpperBound::kUnknown;
234 // True if we're standing at the first key of a block, and we haven't loaded
235 // that block yet. A call to PrepareValue() will trigger loading the block.
236 bool is_at_first_key_from_index_ = false;
237 bool check_filter_;
238 // TODO(Zhongyi): pick a better name
239 bool need_upper_bound_check_;
240
241 bool async_read_in_progress_;
242
243 // If `target` is null, seek to first.
244 void SeekImpl(const Slice* target, bool async_prefetch);
245
246 void InitDataBlock();
247 void AsyncInitDataBlock(bool is_first_pass);
248 bool MaterializeCurrentBlock();
249 void FindKeyForward();
250 void FindBlockForward();
251 void FindKeyBackward();
252 void CheckOutOfBound();
253
254 // Check if data block is fully within iterate_upper_bound.
255 //
256 // Note MyRocks may update iterate bounds between seek. To workaround it,
257 // we need to check and update data_block_within_upper_bound_ accordingly.
258 void CheckDataBlockWithinUpperBound();
259
260 bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction) {
261 if (need_upper_bound_check_ && direction == IterDirection::kBackward) {
262 // Upper bound check isn't sufficient for backward direction to
263 // guarantee the same result as total order, so disable prefix
264 // check.
265 return true;
266 }
267 if (check_filter_ && !table_->PrefixRangeMayMatch(
268 ikey, read_options_, prefix_extractor_,
269 need_upper_bound_check_, &lookup_context_)) {
270 // TODO remember the iterator is invalidated because of prefix
271 // match. This can avoid the upper level file iterator to falsely
272 // believe the position is the end of the SST file and move to
273 // the first key of the next file.
274 ResetDataIter();
275 return false;
276 }
277 return true;
278 }
279 };
280 } // namespace ROCKSDB_NAMESPACE