]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/table/plain/plain_table_reader.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / table / plain / plain_table_reader.cc
1 // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
3 // Use of this source code is governed by a BSD-style license that can be
4 // found in the LICENSE file. See the AUTHORS file for names of contributors.
5
6 #ifndef ROCKSDB_LITE
7
8 #include "table/plain/plain_table_reader.h"
9
10 #include <string>
11 #include <vector>
12
13 #include "db/dbformat.h"
14 #include "memory/arena.h"
15 #include "monitoring/histogram.h"
16 #include "monitoring/perf_context_imp.h"
17 #include "rocksdb/cache.h"
18 #include "rocksdb/comparator.h"
19 #include "rocksdb/env.h"
20 #include "rocksdb/filter_policy.h"
21 #include "rocksdb/options.h"
22 #include "rocksdb/statistics.h"
23 #include "table/block_based/block.h"
24 #include "table/block_based/filter_block.h"
25 #include "table/format.h"
26 #include "table/get_context.h"
27 #include "table/internal_iterator.h"
28 #include "table/meta_blocks.h"
29 #include "table/plain/plain_table_bloom.h"
30 #include "table/plain/plain_table_factory.h"
31 #include "table/plain/plain_table_key_coding.h"
32 #include "table/two_level_iterator.h"
33 #include "util/coding.h"
34 #include "util/dynamic_bloom.h"
35 #include "util/hash.h"
36 #include "util/stop_watch.h"
37 #include "util/string_util.h"
38
39 namespace ROCKSDB_NAMESPACE {
40
41 namespace {
42
43 // Safely getting a uint32_t element from a char array, where, starting from
44 // `base`, every 4 bytes are considered as an fixed 32 bit integer.
45 inline uint32_t GetFixed32Element(const char* base, size_t offset) {
46 return DecodeFixed32(base + offset * sizeof(uint32_t));
47 }
48 } // namespace
49
50 // Iterator to iterate IndexedTable
51 class PlainTableIterator : public InternalIterator {
52 public:
53 explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek);
54 // No copying allowed
55 PlainTableIterator(const PlainTableIterator&) = delete;
56 void operator=(const Iterator&) = delete;
57
58 ~PlainTableIterator() override;
59
60 bool Valid() const override;
61
62 void SeekToFirst() override;
63
64 void SeekToLast() override;
65
66 void Seek(const Slice& target) override;
67
68 void SeekForPrev(const Slice& target) override;
69
70 void Next() override;
71
72 void Prev() override;
73
74 Slice key() const override;
75
76 Slice value() const override;
77
78 Status status() const override;
79
80 private:
81 PlainTableReader* table_;
82 PlainTableKeyDecoder decoder_;
83 bool use_prefix_seek_;
84 uint32_t offset_;
85 uint32_t next_offset_;
86 Slice key_;
87 Slice value_;
88 Status status_;
89 };
90
91 extern const uint64_t kPlainTableMagicNumber;
92 PlainTableReader::PlainTableReader(
93 const ImmutableOptions& ioptions,
94 std::unique_ptr<RandomAccessFileReader>&& file,
95 const EnvOptions& storage_options, const InternalKeyComparator& icomparator,
96 EncodingType encoding_type, uint64_t file_size,
97 const TableProperties* table_properties,
98 const SliceTransform* prefix_extractor)
99 : internal_comparator_(icomparator),
100 encoding_type_(encoding_type),
101 full_scan_mode_(false),
102 user_key_len_(static_cast<uint32_t>(table_properties->fixed_key_len)),
103 prefix_extractor_(prefix_extractor),
104 enable_bloom_(false),
105 bloom_(6),
106 file_info_(std::move(file), storage_options,
107 static_cast<uint32_t>(table_properties->data_size)),
108 ioptions_(ioptions),
109 file_size_(file_size),
110 table_properties_(nullptr) {}
111
112 PlainTableReader::~PlainTableReader() {
113 // Should fix?
114 status_.PermitUncheckedError();
115 }
116
117 Status PlainTableReader::Open(
118 const ImmutableOptions& ioptions, const EnvOptions& env_options,
119 const InternalKeyComparator& internal_comparator,
120 std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
121 std::unique_ptr<TableReader>* table_reader, const int bloom_bits_per_key,
122 double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size,
123 bool full_scan_mode, const bool immortal_table,
124 const SliceTransform* prefix_extractor) {
125 if (file_size > PlainTableIndex::kMaxFileSize) {
126 return Status::NotSupported("File is too large for PlainTableReader!");
127 }
128
129 std::unique_ptr<TableProperties> props;
130 auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
131 ioptions, &props);
132 if (!s.ok()) {
133 return s;
134 }
135
136 assert(hash_table_ratio >= 0.0);
137 auto& user_props = props->user_collected_properties;
138 auto prefix_extractor_in_file = props->prefix_extractor_name;
139
140 if (!full_scan_mode &&
141 !prefix_extractor_in_file.empty() /* old version sst file*/
142 && prefix_extractor_in_file != "nullptr") {
143 if (!prefix_extractor) {
144 return Status::InvalidArgument(
145 "Prefix extractor is missing when opening a PlainTable built "
146 "using a prefix extractor");
147 } else if (prefix_extractor_in_file != prefix_extractor->AsString()) {
148 return Status::InvalidArgument(
149 "Prefix extractor given doesn't match the one used to build "
150 "PlainTable");
151 }
152 }
153
154 EncodingType encoding_type = kPlain;
155 auto encoding_type_prop =
156 user_props.find(PlainTablePropertyNames::kEncodingType);
157 if (encoding_type_prop != user_props.end()) {
158 encoding_type = static_cast<EncodingType>(
159 DecodeFixed32(encoding_type_prop->second.c_str()));
160 }
161
162 std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
163 ioptions, std::move(file), env_options, internal_comparator,
164 encoding_type, file_size, props.get(), prefix_extractor));
165
166 s = new_reader->MmapDataIfNeeded();
167 if (!s.ok()) {
168 return s;
169 }
170
171 if (!full_scan_mode) {
172 s = new_reader->PopulateIndex(props.get(), bloom_bits_per_key,
173 hash_table_ratio, index_sparseness,
174 huge_page_tlb_size);
175 if (!s.ok()) {
176 return s;
177 }
178 } else {
179 // Flag to indicate it is a full scan mode so that none of the indexes
180 // can be used.
181 new_reader->full_scan_mode_ = true;
182 }
183 // PopulateIndex can add to the props, so don't store them until now
184 new_reader->table_properties_ = std::move(props);
185
186 if (immortal_table && new_reader->file_info_.is_mmap_mode) {
187 new_reader->dummy_cleanable_.reset(new Cleanable());
188 }
189
190 *table_reader = std::move(new_reader);
191 return s;
192 }
193
194 void PlainTableReader::SetupForCompaction() {}
195
196 InternalIterator* PlainTableReader::NewIterator(
197 const ReadOptions& options, const SliceTransform* /* prefix_extractor */,
198 Arena* arena, bool /*skip_filters*/, TableReaderCaller /*caller*/,
199 size_t /*compaction_readahead_size*/, bool /* allow_unprepared_value */) {
200 // Not necessarily used here, but make sure this has been initialized
201 assert(table_properties_);
202
203 // Auto prefix mode is not implemented in PlainTable.
204 bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek &&
205 !options.auto_prefix_mode;
206 if (arena == nullptr) {
207 return new PlainTableIterator(this, use_prefix_seek);
208 } else {
209 auto mem = arena->AllocateAligned(sizeof(PlainTableIterator));
210 return new (mem) PlainTableIterator(this, use_prefix_seek);
211 }
212 }
213
214 Status PlainTableReader::PopulateIndexRecordList(
215 PlainTableIndexBuilder* index_builder,
216 std::vector<uint32_t>* prefix_hashes) {
217 Slice prev_key_prefix_slice;
218 std::string prev_key_prefix_buf;
219 uint32_t pos = data_start_offset_;
220
221 bool is_first_record = true;
222 Slice key_prefix_slice;
223 PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
224 prefix_extractor_);
225 while (pos < file_info_.data_end_offset) {
226 uint32_t key_offset = pos;
227 ParsedInternalKey key;
228 Slice value_slice;
229 bool seekable = false;
230 Status s = Next(&decoder, &pos, &key, nullptr, &value_slice, &seekable);
231 if (!s.ok()) {
232 return s;
233 }
234
235 key_prefix_slice = GetPrefix(key);
236 if (enable_bloom_) {
237 bloom_.AddHash(GetSliceHash(key.user_key));
238 } else {
239 if (is_first_record || prev_key_prefix_slice != key_prefix_slice) {
240 if (!is_first_record) {
241 prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice));
242 }
243 if (file_info_.is_mmap_mode) {
244 prev_key_prefix_slice = key_prefix_slice;
245 } else {
246 prev_key_prefix_buf = key_prefix_slice.ToString();
247 prev_key_prefix_slice = prev_key_prefix_buf;
248 }
249 }
250 }
251
252 index_builder->AddKeyPrefix(GetPrefix(key), key_offset);
253
254 if (!seekable && is_first_record) {
255 return Status::Corruption("Key for a prefix is not seekable");
256 }
257
258 is_first_record = false;
259 }
260
261 prefix_hashes->push_back(GetSliceHash(key_prefix_slice));
262 auto s = index_.InitFromRawData(index_builder->Finish());
263 return s;
264 }
265
266 void PlainTableReader::AllocateBloom(int bloom_bits_per_key, int num_keys,
267 size_t huge_page_tlb_size) {
268 uint32_t bloom_total_bits = num_keys * bloom_bits_per_key;
269 if (bloom_total_bits > 0) {
270 enable_bloom_ = true;
271 bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality,
272 huge_page_tlb_size, ioptions_.logger);
273 }
274 }
275
276 void PlainTableReader::FillBloom(const std::vector<uint32_t>& prefix_hashes) {
277 assert(bloom_.IsInitialized());
278 for (const auto prefix_hash : prefix_hashes) {
279 bloom_.AddHash(prefix_hash);
280 }
281 }
282
283 Status PlainTableReader::MmapDataIfNeeded() {
284 if (file_info_.is_mmap_mode) {
285 // Get mmapped memory.
286 return file_info_.file->Read(
287 IOOptions(), 0, static_cast<size_t>(file_size_), &file_info_.file_data,
288 nullptr, nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
289 }
290 return Status::OK();
291 }
292
293 Status PlainTableReader::PopulateIndex(TableProperties* props,
294 int bloom_bits_per_key,
295 double hash_table_ratio,
296 size_t index_sparseness,
297 size_t huge_page_tlb_size) {
298 assert(props != nullptr);
299
300 BlockContents index_block_contents;
301 Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */,
302 file_size_, kPlainTableMagicNumber, ioptions_,
303 PlainTableIndexBuilder::kPlainTableIndexBlock,
304 BlockType::kIndex, &index_block_contents);
305
306 bool index_in_file = s.ok();
307
308 BlockContents bloom_block_contents;
309 bool bloom_in_file = false;
310 // We only need to read the bloom block if index block is in file.
311 if (index_in_file) {
312 s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */,
313 file_size_, kPlainTableMagicNumber, ioptions_,
314 BloomBlockBuilder::kBloomBlock, BlockType::kFilter,
315 &bloom_block_contents);
316 bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0;
317 }
318
319 Slice* bloom_block;
320 if (bloom_in_file) {
321 // If bloom_block_contents.allocation is not empty (which will be the case
322 // for non-mmap mode), it holds the alloated memory for the bloom block.
323 // It needs to be kept alive to keep `bloom_block` valid.
324 bloom_block_alloc_ = std::move(bloom_block_contents.allocation);
325 bloom_block = &bloom_block_contents.data;
326 } else {
327 bloom_block = nullptr;
328 }
329
330 Slice* index_block;
331 if (index_in_file) {
332 // If index_block_contents.allocation is not empty (which will be the case
333 // for non-mmap mode), it holds the alloated memory for the index block.
334 // It needs to be kept alive to keep `index_block` valid.
335 index_block_alloc_ = std::move(index_block_contents.allocation);
336 index_block = &index_block_contents.data;
337 } else {
338 index_block = nullptr;
339 }
340
341 if ((prefix_extractor_ == nullptr) && (hash_table_ratio != 0)) {
342 // moptions.prefix_extractor is requried for a hash-based look-up.
343 return Status::NotSupported(
344 "PlainTable requires a prefix extractor enable prefix hash mode.");
345 }
346
347 // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows
348 // for a prefix (starting from the first one), generate a record of (hash,
349 // offset) and append it to IndexRecordList, which is a data structure created
350 // to store them.
351
352 if (!index_in_file) {
353 // Allocate bloom filter here for total order mode.
354 if (IsTotalOrderMode()) {
355 AllocateBloom(bloom_bits_per_key,
356 static_cast<uint32_t>(props->num_entries),
357 huge_page_tlb_size);
358 }
359 } else if (bloom_in_file) {
360 enable_bloom_ = true;
361 auto num_blocks_property = props->user_collected_properties.find(
362 PlainTablePropertyNames::kNumBloomBlocks);
363
364 uint32_t num_blocks = 0;
365 if (num_blocks_property != props->user_collected_properties.end()) {
366 Slice temp_slice(num_blocks_property->second);
367 if (!GetVarint32(&temp_slice, &num_blocks)) {
368 num_blocks = 0;
369 }
370 }
371 // cast away const qualifier, because bloom_ won't be changed
372 bloom_.SetRawData(const_cast<char*>(bloom_block->data()),
373 static_cast<uint32_t>(bloom_block->size()) * 8,
374 num_blocks);
375 } else {
376 // Index in file but no bloom in file. Disable bloom filter in this case.
377 enable_bloom_ = false;
378 bloom_bits_per_key = 0;
379 }
380
381 PlainTableIndexBuilder index_builder(&arena_, ioptions_, prefix_extractor_,
382 index_sparseness, hash_table_ratio,
383 huge_page_tlb_size);
384
385 std::vector<uint32_t> prefix_hashes;
386 if (!index_in_file) {
387 // Populates _bloom if enabled (total order mode)
388 s = PopulateIndexRecordList(&index_builder, &prefix_hashes);
389 if (!s.ok()) {
390 return s;
391 }
392 } else {
393 s = index_.InitFromRawData(*index_block);
394 if (!s.ok()) {
395 return s;
396 }
397 }
398
399 if (!index_in_file) {
400 if (!IsTotalOrderMode()) {
401 // Calculated bloom filter size and allocate memory for
402 // bloom filter based on the number of prefixes, then fill it.
403 AllocateBloom(bloom_bits_per_key, index_.GetNumPrefixes(),
404 huge_page_tlb_size);
405 if (enable_bloom_) {
406 FillBloom(prefix_hashes);
407 }
408 }
409 }
410
411 // Fill two table properties.
412 if (!index_in_file) {
413 props->user_collected_properties["plain_table_hash_table_size"] =
414 std::to_string(index_.GetIndexSize() * PlainTableIndex::kOffsetLen);
415 props->user_collected_properties["plain_table_sub_index_size"] =
416 std::to_string(index_.GetSubIndexSize());
417 } else {
418 props->user_collected_properties["plain_table_hash_table_size"] =
419 std::to_string(0);
420 props->user_collected_properties["plain_table_sub_index_size"] =
421 std::to_string(0);
422 }
423
424 return Status::OK();
425 }
426
427 Status PlainTableReader::GetOffset(PlainTableKeyDecoder* decoder,
428 const Slice& target, const Slice& prefix,
429 uint32_t prefix_hash, bool& prefix_matched,
430 uint32_t* offset) const {
431 prefix_matched = false;
432 uint32_t prefix_index_offset;
433 auto res = index_.GetOffset(prefix_hash, &prefix_index_offset);
434 if (res == PlainTableIndex::kNoPrefixForBucket) {
435 *offset = file_info_.data_end_offset;
436 return Status::OK();
437 } else if (res == PlainTableIndex::kDirectToFile) {
438 *offset = prefix_index_offset;
439 return Status::OK();
440 }
441
442 // point to sub-index, need to do a binary search
443 uint32_t upper_bound = 0;
444 const char* base_ptr =
445 index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound);
446 uint32_t low = 0;
447 uint32_t high = upper_bound;
448 ParsedInternalKey mid_key;
449 ParsedInternalKey parsed_target;
450 Status s = ParseInternalKey(target, &parsed_target,
451 false /* log_err_key */); // TODO
452 if (!s.ok()) return s;
453
454 // The key is between [low, high). Do a binary search between it.
455 while (high - low > 1) {
456 uint32_t mid = (high + low) / 2;
457 uint32_t file_offset = GetFixed32Element(base_ptr, mid);
458 uint32_t tmp;
459 s = decoder->NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp);
460 if (!s.ok()) {
461 return s;
462 }
463 int cmp_result = internal_comparator_.Compare(mid_key, parsed_target);
464 if (cmp_result < 0) {
465 low = mid;
466 } else {
467 if (cmp_result == 0) {
468 // Happen to have found the exact key or target is smaller than the
469 // first key after base_offset.
470 prefix_matched = true;
471 *offset = file_offset;
472 return Status::OK();
473 } else {
474 high = mid;
475 }
476 }
477 }
478 // Both of the key at the position low or low+1 could share the same
479 // prefix as target. We need to rule out one of them to avoid to go
480 // to the wrong prefix.
481 ParsedInternalKey low_key;
482 uint32_t tmp;
483 uint32_t low_key_offset = GetFixed32Element(base_ptr, low);
484 s = decoder->NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp);
485 if (!s.ok()) {
486 return s;
487 }
488
489 if (GetPrefix(low_key) == prefix) {
490 prefix_matched = true;
491 *offset = low_key_offset;
492 } else if (low + 1 < upper_bound) {
493 // There is possible a next prefix, return it
494 prefix_matched = false;
495 *offset = GetFixed32Element(base_ptr, low + 1);
496 } else {
497 // target is larger than a key of the last prefix in this bucket
498 // but with a different prefix. Key does not exist.
499 *offset = file_info_.data_end_offset;
500 }
501 return Status::OK();
502 }
503
504 bool PlainTableReader::MatchBloom(uint32_t hash) const {
505 if (!enable_bloom_) {
506 return true;
507 }
508
509 if (bloom_.MayContainHash(hash)) {
510 PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
511 return true;
512 } else {
513 PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
514 return false;
515 }
516 }
517
518 Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
519 ParsedInternalKey* parsed_key,
520 Slice* internal_key, Slice* value,
521 bool* seekable) const {
522 if (*offset == file_info_.data_end_offset) {
523 *offset = file_info_.data_end_offset;
524 return Status::OK();
525 }
526
527 if (*offset > file_info_.data_end_offset) {
528 return Status::Corruption("Offset is out of file size");
529 }
530
531 uint32_t bytes_read;
532 Status s = decoder->NextKey(*offset, parsed_key, internal_key, value,
533 &bytes_read, seekable);
534 if (!s.ok()) {
535 return s;
536 }
537 *offset = *offset + bytes_read;
538 return Status::OK();
539 }
540
541 void PlainTableReader::Prepare(const Slice& target) {
542 if (enable_bloom_) {
543 uint32_t prefix_hash = GetSliceHash(GetPrefix(target));
544 bloom_.Prefetch(prefix_hash);
545 }
546 }
547
548 Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target,
549 GetContext* get_context,
550 const SliceTransform* /* prefix_extractor */,
551 bool /*skip_filters*/) {
552 // Check bloom filter first.
553 Slice prefix_slice;
554 uint32_t prefix_hash;
555 if (IsTotalOrderMode()) {
556 if (full_scan_mode_) {
557 status_ =
558 Status::InvalidArgument("Get() is not allowed in full scan mode.");
559 }
560 // Match whole user key for bloom filter check.
561 if (!MatchBloom(GetSliceHash(ExtractUserKey(target)))) {
562 return Status::OK();
563 }
564 // in total order mode, there is only one bucket 0, and we always use empty
565 // prefix.
566 prefix_slice = Slice();
567 prefix_hash = 0;
568 } else {
569 prefix_slice = GetPrefix(target);
570 prefix_hash = GetSliceHash(prefix_slice);
571 if (!MatchBloom(prefix_hash)) {
572 return Status::OK();
573 }
574 }
575 uint32_t offset;
576 bool prefix_match;
577 PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
578 prefix_extractor_);
579 Status s = GetOffset(&decoder, target, prefix_slice, prefix_hash,
580 prefix_match, &offset);
581
582 if (!s.ok()) {
583 return s;
584 }
585 ParsedInternalKey found_key;
586 ParsedInternalKey parsed_target;
587 s = ParseInternalKey(target, &parsed_target,
588 false /* log_err_key */); // TODO
589 if (!s.ok()) return s;
590
591 Slice found_value;
592 while (offset < file_info_.data_end_offset) {
593 s = Next(&decoder, &offset, &found_key, nullptr, &found_value);
594 if (!s.ok()) {
595 return s;
596 }
597 if (!prefix_match) {
598 // Need to verify prefix for the first key found if it is not yet
599 // checked.
600 if (GetPrefix(found_key) != prefix_slice) {
601 return Status::OK();
602 }
603 prefix_match = true;
604 }
605 // TODO(ljin): since we know the key comparison result here,
606 // can we enable the fast path?
607 if (internal_comparator_.Compare(found_key, parsed_target) >= 0) {
608 bool dont_care __attribute__((__unused__));
609 if (!get_context->SaveValue(found_key, found_value, &dont_care,
610 dummy_cleanable_.get())) {
611 break;
612 }
613 }
614 }
615 return Status::OK();
616 }
617
618 uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/,
619 TableReaderCaller /*caller*/) {
620 return 0;
621 }
622
623 uint64_t PlainTableReader::ApproximateSize(const Slice& /*start*/,
624 const Slice& /*end*/,
625 TableReaderCaller /*caller*/) {
626 return 0;
627 }
628
629 PlainTableIterator::PlainTableIterator(PlainTableReader* table,
630 bool use_prefix_seek)
631 : table_(table),
632 decoder_(&table_->file_info_, table_->encoding_type_,
633 table_->user_key_len_, table_->prefix_extractor_),
634 use_prefix_seek_(use_prefix_seek) {
635 next_offset_ = offset_ = table_->file_info_.data_end_offset;
636 }
637
638 PlainTableIterator::~PlainTableIterator() {}
639
640 bool PlainTableIterator::Valid() const {
641 return offset_ < table_->file_info_.data_end_offset &&
642 offset_ >= table_->data_start_offset_;
643 }
644
645 void PlainTableIterator::SeekToFirst() {
646 status_ = Status::OK();
647 next_offset_ = table_->data_start_offset_;
648 if (next_offset_ >= table_->file_info_.data_end_offset) {
649 next_offset_ = offset_ = table_->file_info_.data_end_offset;
650 } else {
651 Next();
652 }
653 }
654
655 void PlainTableIterator::SeekToLast() {
656 assert(false);
657 status_ = Status::NotSupported("SeekToLast() is not supported in PlainTable");
658 next_offset_ = offset_ = table_->file_info_.data_end_offset;
659 }
660
661 void PlainTableIterator::Seek(const Slice& target) {
662 if (use_prefix_seek_ != !table_->IsTotalOrderMode()) {
663 // This check is done here instead of NewIterator() to permit creating an
664 // iterator with total_order_seek = true even if we won't be able to Seek()
665 // it. This is needed for compaction: it creates iterator with
666 // total_order_seek = true but usually never does Seek() on it,
667 // only SeekToFirst().
668 status_ = Status::InvalidArgument(
669 "total_order_seek not implemented for PlainTable.");
670 offset_ = next_offset_ = table_->file_info_.data_end_offset;
671 return;
672 }
673
674 // If the user doesn't set prefix seek option and we are not able to do a
675 // total Seek(). assert failure.
676 if (table_->IsTotalOrderMode()) {
677 if (table_->full_scan_mode_) {
678 status_ =
679 Status::InvalidArgument("Seek() is not allowed in full scan mode.");
680 offset_ = next_offset_ = table_->file_info_.data_end_offset;
681 return;
682 } else if (table_->GetIndexSize() > 1) {
683 assert(false);
684 status_ = Status::NotSupported(
685 "PlainTable cannot issue non-prefix seek unless in total order "
686 "mode.");
687 offset_ = next_offset_ = table_->file_info_.data_end_offset;
688 return;
689 }
690 }
691
692 Slice prefix_slice = table_->GetPrefix(target);
693 uint32_t prefix_hash = 0;
694 // Bloom filter is ignored in total-order mode.
695 if (!table_->IsTotalOrderMode()) {
696 prefix_hash = GetSliceHash(prefix_slice);
697 if (!table_->MatchBloom(prefix_hash)) {
698 status_ = Status::OK();
699 offset_ = next_offset_ = table_->file_info_.data_end_offset;
700 return;
701 }
702 }
703 bool prefix_match;
704 status_ = table_->GetOffset(&decoder_, target, prefix_slice, prefix_hash,
705 prefix_match, &next_offset_);
706 if (!status_.ok()) {
707 offset_ = next_offset_ = table_->file_info_.data_end_offset;
708 return;
709 }
710
711 if (next_offset_ < table_->file_info_.data_end_offset) {
712 for (Next(); status_.ok() && Valid(); Next()) {
713 if (!prefix_match) {
714 // Need to verify the first key's prefix
715 if (table_->GetPrefix(key()) != prefix_slice) {
716 offset_ = next_offset_ = table_->file_info_.data_end_offset;
717 break;
718 }
719 prefix_match = true;
720 }
721 if (table_->internal_comparator_.Compare(key(), target) >= 0) {
722 break;
723 }
724 }
725 } else {
726 offset_ = table_->file_info_.data_end_offset;
727 }
728 }
729
730 void PlainTableIterator::SeekForPrev(const Slice& /*target*/) {
731 assert(false);
732 status_ =
733 Status::NotSupported("SeekForPrev() is not supported in PlainTable");
734 offset_ = next_offset_ = table_->file_info_.data_end_offset;
735 }
736
737 void PlainTableIterator::Next() {
738 offset_ = next_offset_;
739 if (offset_ < table_->file_info_.data_end_offset) {
740 Slice tmp_slice;
741 ParsedInternalKey parsed_key;
742 status_ =
743 table_->Next(&decoder_, &next_offset_, &parsed_key, &key_, &value_);
744 if (!status_.ok()) {
745 offset_ = next_offset_ = table_->file_info_.data_end_offset;
746 }
747 }
748 }
749
750 void PlainTableIterator::Prev() { assert(false); }
751
752 Slice PlainTableIterator::key() const {
753 assert(Valid());
754 return key_;
755 }
756
757 Slice PlainTableIterator::value() const {
758 assert(Valid());
759 return value_;
760 }
761
762 Status PlainTableIterator::status() const { return status_; }
763
764 } // namespace ROCKSDB_NAMESPACE
765 #endif // ROCKSDB_LITE