// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
9 #include "table/block_based/block_based_table_iterator.h"
11 namespace ROCKSDB_NAMESPACE
{
13 void BlockBasedTableIterator::SeekToFirst() { SeekImpl(nullptr, false); }
15 void BlockBasedTableIterator::Seek(const Slice
& target
) {
16 SeekImpl(&target
, true);
19 void BlockBasedTableIterator::SeekImpl(const Slice
* target
,
20 bool async_prefetch
) {
21 bool is_first_pass
= true;
22 if (async_read_in_progress_
) {
23 AsyncInitDataBlock(false);
24 is_first_pass
= false;
27 is_out_of_bound_
= false;
28 is_at_first_key_from_index_
= false;
29 if (target
&& !CheckPrefixMayMatch(*target
, IterDirection::kForward
)) {
34 bool need_seek_index
= true;
35 if (block_iter_points_to_real_block_
&& block_iter_
.Valid()) {
37 prev_block_offset_
= index_iter_
->value().handle
.offset();
40 // We can avoid an index seek if:
41 // 1. The new seek key is larger than the current key
42 // 2. The new seek key is within the upper bound of the block
43 // Since we don't necessarily know the internal key for either
44 // the current key or the upper bound, we check user keys and
45 // exclude the equality case. Considering internal keys can
46 // improve for the boundary cases, but it would complicate the
48 if (user_comparator_
.Compare(ExtractUserKey(*target
),
49 block_iter_
.user_key()) > 0 &&
50 user_comparator_
.Compare(ExtractUserKey(*target
),
51 index_iter_
->user_key()) < 0) {
52 need_seek_index
= false;
57 if (need_seek_index
) {
59 index_iter_
->Seek(*target
);
61 index_iter_
->SeekToFirst();
64 if (!index_iter_
->Valid()) {
70 IndexValue v
= index_iter_
->value();
71 const bool same_block
= block_iter_points_to_real_block_
&&
72 v
.handle
.offset() == prev_block_offset_
;
74 if (!v
.first_internal_key
.empty() && !same_block
&&
75 (!target
|| icomp_
.Compare(*target
, v
.first_internal_key
) <= 0) &&
76 allow_unprepared_value_
) {
77 // Index contains the first key of the block, and it's >= target.
78 // We can defer reading the block.
79 is_at_first_key_from_index_
= true;
80 // ResetDataIter() will invalidate block_iter_. Thus, there is no need to
81 // call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound
82 // as that will be done later when the data block is actually read.
85 // Need to use the data block.
87 if (read_options_
.async_io
&& async_prefetch
) {
89 AsyncInitDataBlock(is_first_pass
);
91 if (async_read_in_progress_
) {
92 // Status::TryAgain indicates asynchronous request for retrieval of
93 // data blocks has been submitted. So it should return at this point
94 // and Seek should be called again to retrieve the requested block and
95 // execute the remaining code.
102 // When the user does a reseek, the iterate_upper_bound might have
103 // changed. CheckDataBlockWithinUpperBound() needs to be called
104 // explicitly if the reseek ends up in the same data block.
105 // If the reseek ends up in a different block, InitDataBlock() will do
106 // the iterator upper bound check.
107 CheckDataBlockWithinUpperBound();
111 block_iter_
.Seek(*target
);
113 block_iter_
.SeekToFirst();
121 assert(!Valid() || icomp_
.Compare(*target
, key()) <= 0);
125 void BlockBasedTableIterator::SeekForPrev(const Slice
& target
) {
126 is_out_of_bound_
= false;
127 is_at_first_key_from_index_
= false;
128 // For now totally disable prefix seek in auto prefix mode because we don't
130 if (!CheckPrefixMayMatch(target
, IterDirection::kBackward
)) {
135 SavePrevIndexValue();
137 // Call Seek() rather than SeekForPrev() in the index block, because the
138 // target data block will likely to contain the position for `target`, the
139 // same as Seek(), rather than than before.
140 // For example, if we have three data blocks, each containing two keys:
141 // [2, 4] [6, 8] [10, 12]
142 // (the keys in the index block would be [4, 8, 12])
143 // and the user calls SeekForPrev(7), we need to go to the second block,
144 // just like if they call Seek(7).
145 // The only case where the block is difference is when they seek to a position
146 // in the boundary. For example, if they SeekForPrev(5), we should go to the
147 // first block, rather than the second. However, we don't have the information
148 // to distinguish the two unless we read the second block. In this case, we'll
149 // end up with reading two blocks.
150 index_iter_
->Seek(target
);
152 if (!index_iter_
->Valid()) {
153 auto seek_status
= index_iter_
->status();
154 // Check for IO error
155 if (!seek_status
.IsNotFound() && !seek_status
.ok()) {
160 // With prefix index, Seek() returns NotFound if the prefix doesn't exist
161 if (seek_status
.IsNotFound()) {
162 // Any key less than the target is fine for prefix seek
166 index_iter_
->SeekToLast();
168 // Check for IO error
169 if (!index_iter_
->Valid()) {
177 block_iter_
.SeekForPrev(target
);
180 CheckDataBlockWithinUpperBound();
181 assert(!block_iter_
.Valid() ||
182 icomp_
.Compare(target
, block_iter_
.key()) >= 0);
185 void BlockBasedTableIterator::SeekToLast() {
186 is_out_of_bound_
= false;
187 is_at_first_key_from_index_
= false;
188 SavePrevIndexValue();
189 index_iter_
->SeekToLast();
190 if (!index_iter_
->Valid()) {
195 block_iter_
.SeekToLast();
197 CheckDataBlockWithinUpperBound();
200 void BlockBasedTableIterator::Next() {
201 if (is_at_first_key_from_index_
&& !MaterializeCurrentBlock()) {
204 assert(block_iter_points_to_real_block_
);
210 bool BlockBasedTableIterator::NextAndGetResult(IterateResult
* result
) {
212 bool is_valid
= Valid();
215 result
->bound_check_result
= UpperBoundCheckResult();
216 result
->value_prepared
= !is_at_first_key_from_index_
;
221 void BlockBasedTableIterator::Prev() {
222 if (is_at_first_key_from_index_
) {
223 is_at_first_key_from_index_
= false;
226 if (!index_iter_
->Valid()) {
231 block_iter_
.SeekToLast();
233 assert(block_iter_points_to_real_block_
);
240 void BlockBasedTableIterator::InitDataBlock() {
241 BlockHandle data_block_handle
= index_iter_
->value().handle
;
242 if (!block_iter_points_to_real_block_
||
243 data_block_handle
.offset() != prev_block_offset_
||
244 // if previous attempt of reading the block missed cache, try again
245 block_iter_
.status().IsIncomplete()) {
246 if (block_iter_points_to_real_block_
) {
249 auto* rep
= table_
->get_rep();
251 bool is_for_compaction
=
252 lookup_context_
.caller
== TableReaderCaller::kCompaction
;
253 // Prefetch additional data for range scans (iterators).
254 // Implicit auto readahead:
255 // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
256 // Explicit user requested readahead:
257 // Enabled from the very first IO when ReadOptions.readahead_size is set.
258 block_prefetcher_
.PrefetchIfNeeded(
259 rep
, data_block_handle
, read_options_
.readahead_size
, is_for_compaction
,
260 /*no_sequential_checking=*/false, read_options_
.rate_limiter_priority
);
262 table_
->NewDataBlockIterator
<DataBlockIter
>(
263 read_options_
, data_block_handle
, &block_iter_
, BlockType::kData
,
264 /*get_context=*/nullptr, &lookup_context_
,
265 block_prefetcher_
.prefetch_buffer(),
266 /*for_compaction=*/is_for_compaction
, /*async_read=*/false, s
);
267 block_iter_points_to_real_block_
= true;
268 CheckDataBlockWithinUpperBound();
272 void BlockBasedTableIterator::AsyncInitDataBlock(bool is_first_pass
) {
273 BlockHandle data_block_handle
= index_iter_
->value().handle
;
274 bool is_for_compaction
=
275 lookup_context_
.caller
== TableReaderCaller::kCompaction
;
277 if (!block_iter_points_to_real_block_
||
278 data_block_handle
.offset() != prev_block_offset_
||
279 // if previous attempt of reading the block missed cache, try again
280 block_iter_
.status().IsIncomplete()) {
281 if (block_iter_points_to_real_block_
) {
284 auto* rep
= table_
->get_rep();
285 // Prefetch additional data for range scans (iterators).
286 // Implicit auto readahead:
287 // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
288 // Explicit user requested readahead:
289 // Enabled from the very first IO when ReadOptions.readahead_size is
291 // In case of async_io with Implicit readahead, block_prefetcher_ will
292 // always the create the prefetch buffer by setting no_sequential_checking
294 block_prefetcher_
.PrefetchIfNeeded(
295 rep
, data_block_handle
, read_options_
.readahead_size
,
296 is_for_compaction
, /*no_sequential_checking=*/read_options_
.async_io
,
297 read_options_
.rate_limiter_priority
);
300 table_
->NewDataBlockIterator
<DataBlockIter
>(
301 read_options_
, data_block_handle
, &block_iter_
, BlockType::kData
,
302 /*get_context=*/nullptr, &lookup_context_
,
303 block_prefetcher_
.prefetch_buffer(),
304 /*for_compaction=*/is_for_compaction
, /*async_read=*/true, s
);
306 if (s
.IsTryAgain()) {
307 async_read_in_progress_
= true;
312 // Second pass will call the Poll to get the data block which has been
313 // requested asynchronously.
315 table_
->NewDataBlockIterator
<DataBlockIter
>(
316 read_options_
, data_block_handle
, &block_iter_
, BlockType::kData
,
317 /*get_context=*/nullptr, &lookup_context_
,
318 block_prefetcher_
.prefetch_buffer(),
319 /*for_compaction=*/is_for_compaction
, /*async_read=*/false, s
);
321 block_iter_points_to_real_block_
= true;
322 CheckDataBlockWithinUpperBound();
323 async_read_in_progress_
= false;
326 bool BlockBasedTableIterator::MaterializeCurrentBlock() {
327 assert(is_at_first_key_from_index_
);
328 assert(!block_iter_points_to_real_block_
);
329 assert(index_iter_
->Valid());
331 is_at_first_key_from_index_
= false;
333 assert(block_iter_points_to_real_block_
);
335 if (!block_iter_
.status().ok()) {
339 block_iter_
.SeekToFirst();
341 if (!block_iter_
.Valid() ||
342 icomp_
.Compare(block_iter_
.key(),
343 index_iter_
->value().first_internal_key
) != 0) {
344 block_iter_
.Invalidate(Status::Corruption(
345 "first key in index doesn't match first key in block"));
352 void BlockBasedTableIterator::FindKeyForward() {
353 // This method's code is kept short to make it likely to be inlined.
355 assert(!is_out_of_bound_
);
356 assert(block_iter_points_to_real_block_
);
358 if (!block_iter_
.Valid()) {
359 // This is the only call site of FindBlockForward(), but it's extracted into
360 // a separate method to keep FindKeyForward() short and likely to be
361 // inlined. When transitioning to a different block, we call
362 // FindBlockForward(), which is much longer and is probably not inlined.
365 // This is the fast path that avoids a function call.
369 void BlockBasedTableIterator::FindBlockForward() {
370 // TODO the while loop inherits from two-level-iterator. We don't know
371 // whether a block can be empty so it can be replaced by an "if".
373 if (!block_iter_
.status().ok()) {
376 // Whether next data block is out of upper bound, if there is one.
377 const bool next_block_is_out_of_bound
=
378 read_options_
.iterate_upper_bound
!= nullptr &&
379 block_iter_points_to_real_block_
&&
380 block_upper_bound_check_
== BlockUpperBound::kUpperBoundInCurBlock
;
381 assert(!next_block_is_out_of_bound
||
382 user_comparator_
.CompareWithoutTimestamp(
383 *read_options_
.iterate_upper_bound
, /*a_has_ts=*/false,
384 index_iter_
->user_key(), /*b_has_ts=*/true) <= 0);
387 if (next_block_is_out_of_bound
) {
388 // The next block is out of bound. No need to read it.
389 TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr);
390 // We need to make sure this is not the last data block before setting
391 // is_out_of_bound_, since the index key for the last data block can be
392 // larger than smallest key of the next file on the same level.
393 if (index_iter_
->Valid()) {
394 is_out_of_bound_
= true;
399 if (!index_iter_
->Valid()) {
403 IndexValue v
= index_iter_
->value();
405 if (!v
.first_internal_key
.empty() && allow_unprepared_value_
) {
406 // Index contains the first key of the block. Defer reading the block.
407 is_at_first_key_from_index_
= true;
412 block_iter_
.SeekToFirst();
413 } while (!block_iter_
.Valid());
416 void BlockBasedTableIterator::FindKeyBackward() {
417 while (!block_iter_
.Valid()) {
418 if (!block_iter_
.status().ok()) {
425 if (index_iter_
->Valid()) {
427 block_iter_
.SeekToLast();
433 // We could have check lower bound here too, but we opt not to do it for
437 void BlockBasedTableIterator::CheckOutOfBound() {
438 if (read_options_
.iterate_upper_bound
!= nullptr &&
439 block_upper_bound_check_
!= BlockUpperBound::kUpperBoundBeyondCurBlock
&&
442 user_comparator_
.CompareWithoutTimestamp(
443 *read_options_
.iterate_upper_bound
, /*a_has_ts=*/false, user_key(),
444 /*b_has_ts=*/true) <= 0;
448 void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() {
449 if (read_options_
.iterate_upper_bound
!= nullptr &&
450 block_iter_points_to_real_block_
) {
451 block_upper_bound_check_
= (user_comparator_
.CompareWithoutTimestamp(
452 *read_options_
.iterate_upper_bound
,
453 /*a_has_ts=*/false, index_iter_
->user_key(),
454 /*b_has_ts=*/true) > 0)
455 ? BlockUpperBound::kUpperBoundBeyondCurBlock
456 : BlockUpperBound::kUpperBoundInCurBlock
;
459 } // namespace ROCKSDB_NAMESPACE