]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/table/block_based/block_based_table_iterator.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / rocksdb / table / block_based / block_based_table_iterator.cc
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 #include "table/block_based/block_based_table_iterator.h"
10
11 namespace ROCKSDB_NAMESPACE {
12 void BlockBasedTableIterator::Seek(const Slice& target) { SeekImpl(&target); }
13
14 void BlockBasedTableIterator::SeekToFirst() { SeekImpl(nullptr); }
15
16 void BlockBasedTableIterator::SeekImpl(const Slice* target) {
17 is_out_of_bound_ = false;
18 is_at_first_key_from_index_ = false;
19 if (target && !CheckPrefixMayMatch(*target, IterDirection::kForward)) {
20 ResetDataIter();
21 return;
22 }
23
24 bool need_seek_index = true;
25 if (block_iter_points_to_real_block_ && block_iter_.Valid()) {
26 // Reseek.
27 prev_block_offset_ = index_iter_->value().handle.offset();
28
29 if (target) {
30 // We can avoid an index seek if:
31 // 1. The new seek key is larger than the current key
32 // 2. The new seek key is within the upper bound of the block
33 // Since we don't necessarily know the internal key for either
34 // the current key or the upper bound, we check user keys and
35 // exclude the equality case. Considering internal keys can
36 // improve for the boundary cases, but it would complicate the
37 // code.
38 if (user_comparator_.Compare(ExtractUserKey(*target),
39 block_iter_.user_key()) > 0 &&
40 user_comparator_.Compare(ExtractUserKey(*target),
41 index_iter_->user_key()) < 0) {
42 need_seek_index = false;
43 }
44 }
45 }
46
47 if (need_seek_index) {
48 if (target) {
49 index_iter_->Seek(*target);
50 } else {
51 index_iter_->SeekToFirst();
52 }
53
54 if (!index_iter_->Valid()) {
55 ResetDataIter();
56 return;
57 }
58 }
59
60 IndexValue v = index_iter_->value();
61 const bool same_block = block_iter_points_to_real_block_ &&
62 v.handle.offset() == prev_block_offset_;
63
64 if (!v.first_internal_key.empty() && !same_block &&
65 (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) &&
66 allow_unprepared_value_) {
67 // Index contains the first key of the block, and it's >= target.
68 // We can defer reading the block.
69 is_at_first_key_from_index_ = true;
70 // ResetDataIter() will invalidate block_iter_. Thus, there is no need to
71 // call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound
72 // as that will be done later when the data block is actually read.
73 ResetDataIter();
74 } else {
75 // Need to use the data block.
76 if (!same_block) {
77 InitDataBlock();
78 } else {
79 // When the user does a reseek, the iterate_upper_bound might have
80 // changed. CheckDataBlockWithinUpperBound() needs to be called
81 // explicitly if the reseek ends up in the same data block.
82 // If the reseek ends up in a different block, InitDataBlock() will do
83 // the iterator upper bound check.
84 CheckDataBlockWithinUpperBound();
85 }
86
87 if (target) {
88 block_iter_.Seek(*target);
89 } else {
90 block_iter_.SeekToFirst();
91 }
92 FindKeyForward();
93 }
94
95 CheckOutOfBound();
96
97 if (target) {
98 assert(!Valid() || icomp_.Compare(*target, key()) <= 0);
99 }
100 }
101
102 void BlockBasedTableIterator::SeekForPrev(const Slice& target) {
103 is_out_of_bound_ = false;
104 is_at_first_key_from_index_ = false;
105 // For now totally disable prefix seek in auto prefix mode because we don't
106 // have logic
107 if (!CheckPrefixMayMatch(target, IterDirection::kBackward)) {
108 ResetDataIter();
109 return;
110 }
111
112 SavePrevIndexValue();
113
114 // Call Seek() rather than SeekForPrev() in the index block, because the
115 // target data block will likely to contain the position for `target`, the
116 // same as Seek(), rather than than before.
117 // For example, if we have three data blocks, each containing two keys:
118 // [2, 4] [6, 8] [10, 12]
119 // (the keys in the index block would be [4, 8, 12])
120 // and the user calls SeekForPrev(7), we need to go to the second block,
121 // just like if they call Seek(7).
122 // The only case where the block is difference is when they seek to a position
123 // in the boundary. For example, if they SeekForPrev(5), we should go to the
124 // first block, rather than the second. However, we don't have the information
125 // to distinguish the two unless we read the second block. In this case, we'll
126 // end up with reading two blocks.
127 index_iter_->Seek(target);
128
129 if (!index_iter_->Valid()) {
130 auto seek_status = index_iter_->status();
131 // Check for IO error
132 if (!seek_status.IsNotFound() && !seek_status.ok()) {
133 ResetDataIter();
134 return;
135 }
136
137 // With prefix index, Seek() returns NotFound if the prefix doesn't exist
138 if (seek_status.IsNotFound()) {
139 // Any key less than the target is fine for prefix seek
140 ResetDataIter();
141 return;
142 } else {
143 index_iter_->SeekToLast();
144 }
145 // Check for IO error
146 if (!index_iter_->Valid()) {
147 ResetDataIter();
148 return;
149 }
150 }
151
152 InitDataBlock();
153
154 block_iter_.SeekForPrev(target);
155
156 FindKeyBackward();
157 CheckDataBlockWithinUpperBound();
158 assert(!block_iter_.Valid() ||
159 icomp_.Compare(target, block_iter_.key()) >= 0);
160 }
161
162 void BlockBasedTableIterator::SeekToLast() {
163 is_out_of_bound_ = false;
164 is_at_first_key_from_index_ = false;
165 SavePrevIndexValue();
166 index_iter_->SeekToLast();
167 if (!index_iter_->Valid()) {
168 ResetDataIter();
169 return;
170 }
171 InitDataBlock();
172 block_iter_.SeekToLast();
173 FindKeyBackward();
174 CheckDataBlockWithinUpperBound();
175 }
176
177 void BlockBasedTableIterator::Next() {
178 if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) {
179 return;
180 }
181 assert(block_iter_points_to_real_block_);
182 block_iter_.Next();
183 FindKeyForward();
184 CheckOutOfBound();
185 }
186
187 bool BlockBasedTableIterator::NextAndGetResult(IterateResult* result) {
188 Next();
189 bool is_valid = Valid();
190 if (is_valid) {
191 result->key = key();
192 result->bound_check_result = UpperBoundCheckResult();
193 result->value_prepared = !is_at_first_key_from_index_;
194 }
195 return is_valid;
196 }
197
198 void BlockBasedTableIterator::Prev() {
199 if (is_at_first_key_from_index_) {
200 is_at_first_key_from_index_ = false;
201
202 index_iter_->Prev();
203 if (!index_iter_->Valid()) {
204 return;
205 }
206
207 InitDataBlock();
208 block_iter_.SeekToLast();
209 } else {
210 assert(block_iter_points_to_real_block_);
211 block_iter_.Prev();
212 }
213
214 FindKeyBackward();
215 }
216
217 void BlockBasedTableIterator::InitDataBlock() {
218 BlockHandle data_block_handle = index_iter_->value().handle;
219 if (!block_iter_points_to_real_block_ ||
220 data_block_handle.offset() != prev_block_offset_ ||
221 // if previous attempt of reading the block missed cache, try again
222 block_iter_.status().IsIncomplete()) {
223 if (block_iter_points_to_real_block_) {
224 ResetDataIter();
225 }
226 auto* rep = table_->get_rep();
227
228 bool is_for_compaction =
229 lookup_context_.caller == TableReaderCaller::kCompaction;
230 // Prefetch additional data for range scans (iterators).
231 // Implicit auto readahead:
232 // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
233 // Explicit user requested readahead:
234 // Enabled from the very first IO when ReadOptions.readahead_size is set.
235 block_prefetcher_.PrefetchIfNeeded(rep, data_block_handle,
236 read_options_.readahead_size,
237 is_for_compaction);
238
239 Status s;
240 table_->NewDataBlockIterator<DataBlockIter>(
241 read_options_, data_block_handle, &block_iter_, BlockType::kData,
242 /*get_context=*/nullptr, &lookup_context_, s,
243 block_prefetcher_.prefetch_buffer(),
244 /*for_compaction=*/is_for_compaction);
245 block_iter_points_to_real_block_ = true;
246 CheckDataBlockWithinUpperBound();
247 }
248 }
249
250 bool BlockBasedTableIterator::MaterializeCurrentBlock() {
251 assert(is_at_first_key_from_index_);
252 assert(!block_iter_points_to_real_block_);
253 assert(index_iter_->Valid());
254
255 is_at_first_key_from_index_ = false;
256 InitDataBlock();
257 assert(block_iter_points_to_real_block_);
258
259 if (!block_iter_.status().ok()) {
260 return false;
261 }
262
263 block_iter_.SeekToFirst();
264
265 if (!block_iter_.Valid() ||
266 icomp_.Compare(block_iter_.key(),
267 index_iter_->value().first_internal_key) != 0) {
268 block_iter_.Invalidate(Status::Corruption(
269 "first key in index doesn't match first key in block"));
270 return false;
271 }
272
273 return true;
274 }
275
276 void BlockBasedTableIterator::FindKeyForward() {
277 // This method's code is kept short to make it likely to be inlined.
278
279 assert(!is_out_of_bound_);
280 assert(block_iter_points_to_real_block_);
281
282 if (!block_iter_.Valid()) {
283 // This is the only call site of FindBlockForward(), but it's extracted into
284 // a separate method to keep FindKeyForward() short and likely to be
285 // inlined. When transitioning to a different block, we call
286 // FindBlockForward(), which is much longer and is probably not inlined.
287 FindBlockForward();
288 } else {
289 // This is the fast path that avoids a function call.
290 }
291 }
292
293 void BlockBasedTableIterator::FindBlockForward() {
294 // TODO the while loop inherits from two-level-iterator. We don't know
295 // whether a block can be empty so it can be replaced by an "if".
296 do {
297 if (!block_iter_.status().ok()) {
298 return;
299 }
300 // Whether next data block is out of upper bound, if there is one.
301 const bool next_block_is_out_of_bound =
302 read_options_.iterate_upper_bound != nullptr &&
303 block_iter_points_to_real_block_ &&
304 block_upper_bound_check_ == BlockUpperBound::kUpperBoundInCurBlock;
305 assert(!next_block_is_out_of_bound ||
306 user_comparator_.CompareWithoutTimestamp(
307 *read_options_.iterate_upper_bound, /*a_has_ts=*/false,
308 index_iter_->user_key(), /*b_has_ts=*/true) <= 0);
309 ResetDataIter();
310 index_iter_->Next();
311 if (next_block_is_out_of_bound) {
312 // The next block is out of bound. No need to read it.
313 TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr);
314 // We need to make sure this is not the last data block before setting
315 // is_out_of_bound_, since the index key for the last data block can be
316 // larger than smallest key of the next file on the same level.
317 if (index_iter_->Valid()) {
318 is_out_of_bound_ = true;
319 }
320 return;
321 }
322
323 if (!index_iter_->Valid()) {
324 return;
325 }
326
327 IndexValue v = index_iter_->value();
328
329 if (!v.first_internal_key.empty() && allow_unprepared_value_) {
330 // Index contains the first key of the block. Defer reading the block.
331 is_at_first_key_from_index_ = true;
332 return;
333 }
334
335 InitDataBlock();
336 block_iter_.SeekToFirst();
337 } while (!block_iter_.Valid());
338 }
339
340 void BlockBasedTableIterator::FindKeyBackward() {
341 while (!block_iter_.Valid()) {
342 if (!block_iter_.status().ok()) {
343 return;
344 }
345
346 ResetDataIter();
347 index_iter_->Prev();
348
349 if (index_iter_->Valid()) {
350 InitDataBlock();
351 block_iter_.SeekToLast();
352 } else {
353 return;
354 }
355 }
356
357 // We could have check lower bound here too, but we opt not to do it for
358 // code simplicity.
359 }
360
361 void BlockBasedTableIterator::CheckOutOfBound() {
362 if (read_options_.iterate_upper_bound != nullptr &&
363 block_upper_bound_check_ != BlockUpperBound::kUpperBoundBeyondCurBlock &&
364 Valid()) {
365 is_out_of_bound_ =
366 user_comparator_.CompareWithoutTimestamp(
367 *read_options_.iterate_upper_bound, /*a_has_ts=*/false, user_key(),
368 /*b_has_ts=*/true) <= 0;
369 }
370 }
371
372 void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() {
373 if (read_options_.iterate_upper_bound != nullptr &&
374 block_iter_points_to_real_block_) {
375 block_upper_bound_check_ = (user_comparator_.CompareWithoutTimestamp(
376 *read_options_.iterate_upper_bound,
377 /*a_has_ts=*/false, index_iter_->user_key(),
378 /*b_has_ts=*/true) > 0)
379 ? BlockUpperBound::kUpperBoundBeyondCurBlock
380 : BlockUpperBound::kUpperBoundInCurBlock;
381 }
382 }
383 } // namespace ROCKSDB_NAMESPACE