]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/table/block_based/block_based_table_iterator.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / table / block_based / block_based_table_iterator.cc
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 #include "table/block_based/block_based_table_iterator.h"
10
11 namespace ROCKSDB_NAMESPACE {
12
13 void BlockBasedTableIterator::SeekToFirst() { SeekImpl(nullptr, false); }
14
15 void BlockBasedTableIterator::Seek(const Slice& target) {
16 SeekImpl(&target, true);
17 }
18
// Shared implementation behind SeekToFirst() (target == nullptr) and
// Seek() (target != nullptr).
// When `async_prefetch` is true and read_options_.async_io is set, the data
// block read may be submitted asynchronously: this function then returns
// early with async_read_in_progress_ set, and the caller is expected to call
// Seek again so the pending block can be polled and the seek completed
// (the "second pass").
void BlockBasedTableIterator::SeekImpl(const Slice* target,
                                       bool async_prefetch) {
  bool is_first_pass = true;
  // A previous SeekImpl left an async block read in flight; complete it now
  // (second pass) before running the normal seek logic.
  if (async_read_in_progress_) {
    AsyncInitDataBlock(false);
    is_first_pass = false;
  }

  is_out_of_bound_ = false;
  is_at_first_key_from_index_ = false;
  // Prefix filtering: if the target's prefix cannot match this table, the
  // iterator is simply invalidated.
  if (target && !CheckPrefixMayMatch(*target, IterDirection::kForward)) {
    ResetDataIter();
    return;
  }

  bool need_seek_index = true;
  if (block_iter_points_to_real_block_ && block_iter_.Valid()) {
    // Reseek.
    prev_block_offset_ = index_iter_->value().handle.offset();

    if (target) {
      // We can avoid an index seek if:
      // 1. The new seek key is larger than the current key
      // 2. The new seek key is within the upper bound of the block
      // Since we don't necessarily know the internal key for either
      // the current key or the upper bound, we check user keys and
      // exclude the equality case. Considering internal keys can
      // improve for the boundary cases, but it would complicate the
      // code.
      if (user_comparator_.Compare(ExtractUserKey(*target),
                                   block_iter_.user_key()) > 0 &&
          user_comparator_.Compare(ExtractUserKey(*target),
                                   index_iter_->user_key()) < 0) {
        need_seek_index = false;
      }
    }
  }

  if (need_seek_index) {
    if (target) {
      index_iter_->Seek(*target);
    } else {
      index_iter_->SeekToFirst();
    }

    // Index exhausted (or errored): no data block to position in.
    if (!index_iter_->Valid()) {
      ResetDataIter();
      return;
    }
  }

  IndexValue v = index_iter_->value();
  const bool same_block = block_iter_points_to_real_block_ &&
                          v.handle.offset() == prev_block_offset_;

  // Lazy block load: if the index entry carries the block's first key and that
  // key already satisfies the seek target, defer reading the data block until
  // the caller actually needs the value (see MaterializeCurrentBlock()).
  if (!v.first_internal_key.empty() && !same_block &&
      (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) &&
      allow_unprepared_value_) {
    // Index contains the first key of the block, and it's >= target.
    // We can defer reading the block.
    is_at_first_key_from_index_ = true;
    // ResetDataIter() will invalidate block_iter_. Thus, there is no need to
    // call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound
    // as that will be done later when the data block is actually read.
    ResetDataIter();
  } else {
    // Need to use the data block.
    if (!same_block) {
      if (read_options_.async_io && async_prefetch) {
        // Only issue a new async request on the first pass; on the second
        // pass AsyncInitDataBlock(false) above has already fetched the block.
        if (is_first_pass) {
          AsyncInitDataBlock(is_first_pass);
        }
        if (async_read_in_progress_) {
          // Status::TryAgain indicates asynchronous request for retrieval of
          // data blocks has been submitted. So it should return at this point
          // and Seek should be called again to retrieve the requested block and
          // execute the remaining code.
          return;
        }
      } else {
        InitDataBlock();
      }
    } else {
      // When the user does a reseek, the iterate_upper_bound might have
      // changed. CheckDataBlockWithinUpperBound() needs to be called
      // explicitly if the reseek ends up in the same data block.
      // If the reseek ends up in a different block, InitDataBlock() will do
      // the iterator upper bound check.
      CheckDataBlockWithinUpperBound();
    }

    if (target) {
      block_iter_.Seek(*target);
    } else {
      block_iter_.SeekToFirst();
    }
    // Skip forward across empty/exhausted blocks to the first usable entry.
    FindKeyForward();
  }

  CheckOutOfBound();

  if (target) {
    // Postcondition: if valid, the positioned key is >= the seek target.
    assert(!Valid() || icomp_.Compare(*target, key()) <= 0);
  }
}
124
// Positions the iterator at the last entry <= target, or invalidates it when
// no such entry exists in this table or the index seek hits an I/O error.
void BlockBasedTableIterator::SeekForPrev(const Slice& target) {
  is_out_of_bound_ = false;
  is_at_first_key_from_index_ = false;
  // For now totally disable prefix seek in auto prefix mode because we don't
  // have logic
  if (!CheckPrefixMayMatch(target, IterDirection::kBackward)) {
    ResetDataIter();
    return;
  }

  // Record the current block handle so InitDataBlock() can detect a reseek
  // that lands back in the same block.
  SavePrevIndexValue();

  // Call Seek() rather than SeekForPrev() in the index block, because the
  // target data block will likely to contain the position for `target`, the
  // same as Seek(), rather than before.
  // For example, if we have three data blocks, each containing two keys:
  //   [2, 4] [6, 8] [10, 12]
  // (the keys in the index block would be [4, 8, 12])
  // and the user calls SeekForPrev(7), we need to go to the second block,
  // just like if they call Seek(7).
  // The only case where the block is different is when they seek to a position
  // in the boundary. For example, if they SeekForPrev(5), we should go to the
  // first block, rather than the second. However, we don't have the information
  // to distinguish the two unless we read the second block. In this case, we'll
  // end up with reading two blocks.
  index_iter_->Seek(target);

  if (!index_iter_->Valid()) {
    auto seek_status = index_iter_->status();
    // Check for IO error
    if (!seek_status.IsNotFound() && !seek_status.ok()) {
      ResetDataIter();
      return;
    }

    // With prefix index, Seek() returns NotFound if the prefix doesn't exist
    if (seek_status.IsNotFound()) {
      // Any key less than the target is fine for prefix seek
      ResetDataIter();
      return;
    } else {
      // Target is past the largest key in the table: use the last block.
      index_iter_->SeekToLast();
    }
    // Check for IO error
    if (!index_iter_->Valid()) {
      ResetDataIter();
      return;
    }
  }

  InitDataBlock();

  block_iter_.SeekForPrev(target);

  // Step back across blocks if needed until a valid entry (or the table
  // start) is reached.
  FindKeyBackward();
  CheckDataBlockWithinUpperBound();
  // Postcondition: if valid, the positioned key is <= the seek target.
  assert(!block_iter_.Valid() ||
         icomp_.Compare(target, block_iter_.key()) >= 0);
}
184
// Positions the iterator at the last entry of the table, or invalidates it if
// the index iterator cannot be placed on a last entry (empty table or error).
void BlockBasedTableIterator::SeekToLast() {
  is_out_of_bound_ = false;
  is_at_first_key_from_index_ = false;
  // Record the current block handle so InitDataBlock() can detect landing
  // back in the same block.
  SavePrevIndexValue();
  index_iter_->SeekToLast();
  if (!index_iter_->Valid()) {
    ResetDataIter();
    return;
  }
  InitDataBlock();
  block_iter_.SeekToLast();
  // Skip backwards over empty/invalid blocks to the last real entry.
  FindKeyBackward();
  CheckDataBlockWithinUpperBound();
}
199
// Advances the iterator by one entry. If the iterator is currently "parked"
// on a first-key-from-index position (data block not yet read), the block is
// materialized first; on materialization failure the iterator is left
// invalid/errored and we return immediately.
void BlockBasedTableIterator::Next() {
  if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) {
    return;
  }
  assert(block_iter_points_to_real_block_);
  block_iter_.Next();
  FindKeyForward();
  CheckOutOfBound();
}
209
210 bool BlockBasedTableIterator::NextAndGetResult(IterateResult* result) {
211 Next();
212 bool is_valid = Valid();
213 if (is_valid) {
214 result->key = key();
215 result->bound_check_result = UpperBoundCheckResult();
216 result->value_prepared = !is_at_first_key_from_index_;
217 }
218 return is_valid;
219 }
220
// Steps the iterator back by one entry.
void BlockBasedTableIterator::Prev() {
  if (is_at_first_key_from_index_) {
    // We are parked on the first key of a block that was never read. The
    // previous entry is the last key of the preceding block, so step the
    // index back and read that block instead of materializing this one.
    is_at_first_key_from_index_ = false;

    index_iter_->Prev();
    if (!index_iter_->Valid()) {
      return;
    }

    InitDataBlock();
    block_iter_.SeekToLast();
  } else {
    assert(block_iter_points_to_real_block_);
    block_iter_.Prev();
  }

  // Skip backwards over exhausted blocks to the previous real entry.
  FindKeyBackward();
}
239
// Reads the data block that index_iter_ currently points to and attaches
// block_iter_ to it. Skipped when block_iter_ already points at that same
// block, unless the previous read ended Incomplete (cache-only miss), in
// which case the read is retried.
void BlockBasedTableIterator::InitDataBlock() {
  BlockHandle data_block_handle = index_iter_->value().handle;
  if (!block_iter_points_to_real_block_ ||
      data_block_handle.offset() != prev_block_offset_ ||
      // if previous attempt of reading the block missed cache, try again
      block_iter_.status().IsIncomplete()) {
    if (block_iter_points_to_real_block_) {
      ResetDataIter();
    }
    auto* rep = table_->get_rep();

    bool is_for_compaction =
        lookup_context_.caller == TableReaderCaller::kCompaction;
    // Prefetch additional data for range scans (iterators).
    // Implicit auto readahead:
    //   Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
    // Explicit user requested readahead:
    //   Enabled from the very first IO when ReadOptions.readahead_size is set.
    block_prefetcher_.PrefetchIfNeeded(
        rep, data_block_handle, read_options_.readahead_size, is_for_compaction,
        /*no_sequential_checking=*/false, read_options_.rate_limiter_priority);
    // Any read error is surfaced via block_iter_'s status, not `s`.
    Status s;
    table_->NewDataBlockIterator<DataBlockIter>(
        read_options_, data_block_handle, &block_iter_, BlockType::kData,
        /*get_context=*/nullptr, &lookup_context_,
        block_prefetcher_.prefetch_buffer(),
        /*for_compaction=*/is_for_compaction, /*async_read=*/false, s);
    block_iter_points_to_real_block_ = true;
    // Re-evaluate whether iterate_upper_bound falls inside the new block.
    CheckDataBlockWithinUpperBound();
  }
}
271
// Async counterpart of InitDataBlock(), driven in two passes from SeekImpl().
// First pass (is_first_pass == true): issues the block read with
// async_read=true. If the request was queued (Status::TryAgain), sets
// async_read_in_progress_ and returns without a usable block_iter_.
// Second pass (is_first_pass == false): re-issues NewDataBlockIterator with
// async_read=false to pick up the previously requested block.
void BlockBasedTableIterator::AsyncInitDataBlock(bool is_first_pass) {
  BlockHandle data_block_handle = index_iter_->value().handle;
  bool is_for_compaction =
      lookup_context_.caller == TableReaderCaller::kCompaction;
  if (is_first_pass) {
    // Same "need a (re)read" condition as InitDataBlock().
    if (!block_iter_points_to_real_block_ ||
        data_block_handle.offset() != prev_block_offset_ ||
        // if previous attempt of reading the block missed cache, try again
        block_iter_.status().IsIncomplete()) {
      if (block_iter_points_to_real_block_) {
        ResetDataIter();
      }
      auto* rep = table_->get_rep();
      // Prefetch additional data for range scans (iterators).
      // Implicit auto readahead:
      //   Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
      // Explicit user requested readahead:
      //   Enabled from the very first IO when ReadOptions.readahead_size is
      //   set.
      // In case of async_io with Implicit readahead, block_prefetcher_ will
      // always create the prefetch buffer by setting no_sequential_checking
      // = true.
      block_prefetcher_.PrefetchIfNeeded(
          rep, data_block_handle, read_options_.readahead_size,
          is_for_compaction, /*no_sequential_checking=*/read_options_.async_io,
          read_options_.rate_limiter_priority);

      Status s;
      table_->NewDataBlockIterator<DataBlockIter>(
          read_options_, data_block_handle, &block_iter_, BlockType::kData,
          /*get_context=*/nullptr, &lookup_context_,
          block_prefetcher_.prefetch_buffer(),
          /*for_compaction=*/is_for_compaction, /*async_read=*/true, s);

      if (s.IsTryAgain()) {
        // Read submitted asynchronously; block_iter_ is not usable yet. Note
        // that block_iter_points_to_real_block_ is deliberately NOT set here.
        async_read_in_progress_ = true;
        return;
      }
    }
  } else {
    // Second pass will call the Poll to get the data block which has been
    // requested asynchronously.
    Status s;
    table_->NewDataBlockIterator<DataBlockIter>(
        read_options_, data_block_handle, &block_iter_, BlockType::kData,
        /*get_context=*/nullptr, &lookup_context_,
        block_prefetcher_.prefetch_buffer(),
        /*for_compaction=*/is_for_compaction, /*async_read=*/false, s);
  }
  block_iter_points_to_real_block_ = true;
  CheckDataBlockWithinUpperBound();
  async_read_in_progress_ = false;
}
325
// Resolves a deferred (first-key-from-index) position by actually reading the
// data block and positioning block_iter_ on its first key. Returns false on a
// read error, or — with block_iter_ invalidated with a Corruption status — if
// the block's first key does not match the key stored in the index entry.
bool BlockBasedTableIterator::MaterializeCurrentBlock() {
  assert(is_at_first_key_from_index_);
  assert(!block_iter_points_to_real_block_);
  assert(index_iter_->Valid());

  is_at_first_key_from_index_ = false;
  InitDataBlock();
  assert(block_iter_points_to_real_block_);

  if (!block_iter_.status().ok()) {
    return false;
  }

  block_iter_.SeekToFirst();

  // Sanity check: the index claimed this first key; a mismatch means the
  // index and the data block disagree, i.e. corruption.
  if (!block_iter_.Valid() ||
      icomp_.Compare(block_iter_.key(),
                     index_iter_->value().first_internal_key) != 0) {
    block_iter_.Invalidate(Status::Corruption(
        "first key in index doesn't match first key in block"));
    return false;
  }

  return true;
}
351
352 void BlockBasedTableIterator::FindKeyForward() {
353 // This method's code is kept short to make it likely to be inlined.
354
355 assert(!is_out_of_bound_);
356 assert(block_iter_points_to_real_block_);
357
358 if (!block_iter_.Valid()) {
359 // This is the only call site of FindBlockForward(), but it's extracted into
360 // a separate method to keep FindKeyForward() short and likely to be
361 // inlined. When transitioning to a different block, we call
362 // FindBlockForward(), which is much longer and is probably not inlined.
363 FindBlockForward();
364 } else {
365 // This is the fast path that avoids a function call.
366 }
367 }
368
// Advances to the first entry of the next non-empty data block once the
// current block is exhausted, honoring read_options_.iterate_upper_bound.
// Only called from FindKeyForward().
void BlockBasedTableIterator::FindBlockForward() {
  // TODO the while loop inherits from two-level-iterator. We don't know
  // whether a block can be empty so it can be replaced by an "if".
  do {
    if (!block_iter_.status().ok()) {
      // Leave the error visible on block_iter_ for the caller to observe.
      return;
    }
    // Whether next data block is out of upper bound, if there is one.
    // (If the upper bound falls inside the current block, every key in any
    // following block must be >= the bound.)
    const bool next_block_is_out_of_bound =
        read_options_.iterate_upper_bound != nullptr &&
        block_iter_points_to_real_block_ &&
        block_upper_bound_check_ == BlockUpperBound::kUpperBoundInCurBlock;
    assert(!next_block_is_out_of_bound ||
           user_comparator_.CompareWithoutTimestamp(
               *read_options_.iterate_upper_bound, /*a_has_ts=*/false,
               index_iter_->user_key(), /*b_has_ts=*/true) <= 0);
    ResetDataIter();
    index_iter_->Next();
    if (next_block_is_out_of_bound) {
      // The next block is out of bound. No need to read it.
      TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr);
      // We need to make sure this is not the last data block before setting
      // is_out_of_bound_, since the index key for the last data block can be
      // larger than smallest key of the next file on the same level.
      if (index_iter_->Valid()) {
        is_out_of_bound_ = true;
      }
      return;
    }

    if (!index_iter_->Valid()) {
      return;
    }

    IndexValue v = index_iter_->value();

    if (!v.first_internal_key.empty() && allow_unprepared_value_) {
      // Index contains the first key of the block. Defer reading the block.
      is_at_first_key_from_index_ = true;
      return;
    }

    InitDataBlock();
    block_iter_.SeekToFirst();
  } while (!block_iter_.Valid());
}
415
416 void BlockBasedTableIterator::FindKeyBackward() {
417 while (!block_iter_.Valid()) {
418 if (!block_iter_.status().ok()) {
419 return;
420 }
421
422 ResetDataIter();
423 index_iter_->Prev();
424
425 if (index_iter_->Valid()) {
426 InitDataBlock();
427 block_iter_.SeekToLast();
428 } else {
429 return;
430 }
431 }
432
433 // We could have check lower bound here too, but we opt not to do it for
434 // code simplicity.
435 }
436
437 void BlockBasedTableIterator::CheckOutOfBound() {
438 if (read_options_.iterate_upper_bound != nullptr &&
439 block_upper_bound_check_ != BlockUpperBound::kUpperBoundBeyondCurBlock &&
440 Valid()) {
441 is_out_of_bound_ =
442 user_comparator_.CompareWithoutTimestamp(
443 *read_options_.iterate_upper_bound, /*a_has_ts=*/false, user_key(),
444 /*b_has_ts=*/true) <= 0;
445 }
446 }
447
448 void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() {
449 if (read_options_.iterate_upper_bound != nullptr &&
450 block_iter_points_to_real_block_) {
451 block_upper_bound_check_ = (user_comparator_.CompareWithoutTimestamp(
452 *read_options_.iterate_upper_bound,
453 /*a_has_ts=*/false, index_iter_->user_key(),
454 /*b_has_ts=*/true) > 0)
455 ? BlockUpperBound::kUpperBoundBeyondCurBlock
456 : BlockUpperBound::kUpperBoundInCurBlock;
457 }
458 }
459 } // namespace ROCKSDB_NAMESPACE