]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/table/plain_table_reader.cc
build: use dgit for download target
[ceph.git] / ceph / src / rocksdb / table / plain_table_reader.cc
CommitLineData
7c673cae
FG
1// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file. See the AUTHORS file for names of contributors.
4
5#ifndef ROCKSDB_LITE
6
7#include "table/plain_table_reader.h"
8
9#include <string>
10#include <vector>
11
12#include "db/dbformat.h"
13
14#include "rocksdb/cache.h"
15#include "rocksdb/comparator.h"
16#include "rocksdb/env.h"
17#include "rocksdb/filter_policy.h"
18#include "rocksdb/options.h"
19#include "rocksdb/statistics.h"
20
21#include "table/block.h"
22#include "table/bloom_block.h"
23#include "table/filter_block.h"
24#include "table/format.h"
25#include "table/internal_iterator.h"
26#include "table/meta_blocks.h"
27#include "table/two_level_iterator.h"
28#include "table/plain_table_factory.h"
29#include "table/plain_table_key_coding.h"
30#include "table/get_context.h"
31
32#include "monitoring/histogram.h"
33#include "monitoring/perf_context_imp.h"
34#include "util/arena.h"
35#include "util/coding.h"
36#include "util/dynamic_bloom.h"
37#include "util/hash.h"
38#include "util/murmurhash.h"
39#include "util/stop_watch.h"
40#include "util/string_util.h"
41
42namespace rocksdb {
43
44namespace {
45
46// Safely getting a uint32_t element from a char array, where, starting from
47// `base`, every 4 bytes are considered as an fixed 32 bit integer.
48inline uint32_t GetFixed32Element(const char* base, size_t offset) {
49 return DecodeFixed32(base + offset * sizeof(uint32_t));
50}
51} // namespace
52
53// Iterator to iterate IndexedTable
54class PlainTableIterator : public InternalIterator {
55 public:
56 explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek);
57 ~PlainTableIterator();
58
59 bool Valid() const override;
60
61 void SeekToFirst() override;
62
63 void SeekToLast() override;
64
65 void Seek(const Slice& target) override;
66
67 void SeekForPrev(const Slice& target) override;
68
69 void Next() override;
70
71 void Prev() override;
72
73 Slice key() const override;
74
75 Slice value() const override;
76
77 Status status() const override;
78
79 private:
80 PlainTableReader* table_;
81 PlainTableKeyDecoder decoder_;
82 bool use_prefix_seek_;
83 uint32_t offset_;
84 uint32_t next_offset_;
85 Slice key_;
86 Slice value_;
87 Status status_;
88 // No copying allowed
89 PlainTableIterator(const PlainTableIterator&) = delete;
90 void operator=(const Iterator&) = delete;
91};
92
93extern const uint64_t kPlainTableMagicNumber;
94PlainTableReader::PlainTableReader(const ImmutableCFOptions& ioptions,
95 unique_ptr<RandomAccessFileReader>&& file,
96 const EnvOptions& storage_options,
97 const InternalKeyComparator& icomparator,
98 EncodingType encoding_type,
99 uint64_t file_size,
11fdf7f2
TL
100 const TableProperties* table_properties,
101 const SliceTransform* prefix_extractor)
7c673cae
FG
102 : internal_comparator_(icomparator),
103 encoding_type_(encoding_type),
104 full_scan_mode_(false),
105 user_key_len_(static_cast<uint32_t>(table_properties->fixed_key_len)),
11fdf7f2 106 prefix_extractor_(prefix_extractor),
7c673cae
FG
107 enable_bloom_(false),
108 bloom_(6, nullptr),
109 file_info_(std::move(file), storage_options,
110 static_cast<uint32_t>(table_properties->data_size)),
111 ioptions_(ioptions),
112 file_size_(file_size),
113 table_properties_(nullptr) {}
114
115PlainTableReader::~PlainTableReader() {
116}
117
11fdf7f2
TL
118Status PlainTableReader::Open(
119 const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
120 const InternalKeyComparator& internal_comparator,
121 unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
122 unique_ptr<TableReader>* table_reader, const int bloom_bits_per_key,
123 double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size,
124 bool full_scan_mode, const SliceTransform* prefix_extractor) {
7c673cae
FG
125 if (file_size > PlainTableIndex::kMaxFileSize) {
126 return Status::NotSupported("File is too large for PlainTableReader!");
127 }
128
129 TableProperties* props = nullptr;
130 auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
11fdf7f2
TL
131 ioptions, &props,
132 true /* compression_type_missing */);
7c673cae
FG
133 if (!s.ok()) {
134 return s;
135 }
136
137 assert(hash_table_ratio >= 0.0);
138 auto& user_props = props->user_collected_properties;
139 auto prefix_extractor_in_file = props->prefix_extractor_name;
140
141 if (!full_scan_mode &&
142 !prefix_extractor_in_file.empty() /* old version sst file*/
143 && prefix_extractor_in_file != "nullptr") {
11fdf7f2 144 if (!prefix_extractor) {
7c673cae
FG
145 return Status::InvalidArgument(
146 "Prefix extractor is missing when opening a PlainTable built "
147 "using a prefix extractor");
11fdf7f2
TL
148 } else if (prefix_extractor_in_file.compare(prefix_extractor->Name()) !=
149 0) {
7c673cae
FG
150 return Status::InvalidArgument(
151 "Prefix extractor given doesn't match the one used to build "
152 "PlainTable");
153 }
154 }
155
156 EncodingType encoding_type = kPlain;
157 auto encoding_type_prop =
158 user_props.find(PlainTablePropertyNames::kEncodingType);
159 if (encoding_type_prop != user_props.end()) {
160 encoding_type = static_cast<EncodingType>(
161 DecodeFixed32(encoding_type_prop->second.c_str()));
162 }
163
164 std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
165 ioptions, std::move(file), env_options, internal_comparator,
11fdf7f2 166 encoding_type, file_size, props, prefix_extractor));
7c673cae
FG
167
168 s = new_reader->MmapDataIfNeeded();
169 if (!s.ok()) {
170 return s;
171 }
172
173 if (!full_scan_mode) {
174 s = new_reader->PopulateIndex(props, bloom_bits_per_key, hash_table_ratio,
175 index_sparseness, huge_page_tlb_size);
176 if (!s.ok()) {
177 return s;
178 }
179 } else {
180 // Flag to indicate it is a full scan mode so that none of the indexes
181 // can be used.
182 new_reader->full_scan_mode_ = true;
183 }
184
185 *table_reader = std::move(new_reader);
186 return s;
187}
188
189void PlainTableReader::SetupForCompaction() {
190}
191
11fdf7f2
TL
192InternalIterator* PlainTableReader::NewIterator(
193 const ReadOptions& options, const SliceTransform* /* prefix_extractor */,
194 Arena* arena, bool /*skip_filters*/, bool /*for_compaction*/) {
195 bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek;
7c673cae 196 if (arena == nullptr) {
11fdf7f2 197 return new PlainTableIterator(this, use_prefix_seek);
7c673cae
FG
198 } else {
199 auto mem = arena->AllocateAligned(sizeof(PlainTableIterator));
11fdf7f2 200 return new (mem) PlainTableIterator(this, use_prefix_seek);
7c673cae
FG
201 }
202}
203
204Status PlainTableReader::PopulateIndexRecordList(
205 PlainTableIndexBuilder* index_builder, vector<uint32_t>* prefix_hashes) {
206 Slice prev_key_prefix_slice;
207 std::string prev_key_prefix_buf;
208 uint32_t pos = data_start_offset_;
209
210 bool is_first_record = true;
211 Slice key_prefix_slice;
212 PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
11fdf7f2 213 prefix_extractor_);
7c673cae
FG
214 while (pos < file_info_.data_end_offset) {
215 uint32_t key_offset = pos;
216 ParsedInternalKey key;
217 Slice value_slice;
218 bool seekable = false;
219 Status s = Next(&decoder, &pos, &key, nullptr, &value_slice, &seekable);
220 if (!s.ok()) {
221 return s;
222 }
223
224 key_prefix_slice = GetPrefix(key);
225 if (enable_bloom_) {
226 bloom_.AddHash(GetSliceHash(key.user_key));
227 } else {
228 if (is_first_record || prev_key_prefix_slice != key_prefix_slice) {
229 if (!is_first_record) {
230 prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice));
231 }
232 if (file_info_.is_mmap_mode) {
233 prev_key_prefix_slice = key_prefix_slice;
234 } else {
235 prev_key_prefix_buf = key_prefix_slice.ToString();
236 prev_key_prefix_slice = prev_key_prefix_buf;
237 }
238 }
239 }
240
241 index_builder->AddKeyPrefix(GetPrefix(key), key_offset);
242
243 if (!seekable && is_first_record) {
244 return Status::Corruption("Key for a prefix is not seekable");
245 }
246
247 is_first_record = false;
248 }
249
250 prefix_hashes->push_back(GetSliceHash(key_prefix_slice));
251 auto s = index_.InitFromRawData(index_builder->Finish());
252 return s;
253}
254
255void PlainTableReader::AllocateAndFillBloom(int bloom_bits_per_key,
256 int num_prefixes,
257 size_t huge_page_tlb_size,
258 vector<uint32_t>* prefix_hashes) {
259 if (!IsTotalOrderMode()) {
260 uint32_t bloom_total_bits = num_prefixes * bloom_bits_per_key;
261 if (bloom_total_bits > 0) {
262 enable_bloom_ = true;
263 bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality,
264 huge_page_tlb_size, ioptions_.info_log);
265 FillBloom(prefix_hashes);
266 }
267 }
268}
269
270void PlainTableReader::FillBloom(vector<uint32_t>* prefix_hashes) {
271 assert(bloom_.IsInitialized());
272 for (auto prefix_hash : *prefix_hashes) {
273 bloom_.AddHash(prefix_hash);
274 }
275}
276
277Status PlainTableReader::MmapDataIfNeeded() {
278 if (file_info_.is_mmap_mode) {
279 // Get mmapped memory.
11fdf7f2 280 return file_info_.file->Read(0, static_cast<size_t>(file_size_), &file_info_.file_data, nullptr);
7c673cae
FG
281 }
282 return Status::OK();
283}
284
285Status PlainTableReader::PopulateIndex(TableProperties* props,
286 int bloom_bits_per_key,
287 double hash_table_ratio,
288 size_t index_sparseness,
289 size_t huge_page_tlb_size) {
290 assert(props != nullptr);
291 table_properties_.reset(props);
292
293 BlockContents index_block_contents;
11fdf7f2
TL
294 Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */,
295 file_size_, kPlainTableMagicNumber, ioptions_,
296 PlainTableIndexBuilder::kPlainTableIndexBlock,
297 &index_block_contents,
298 true /* compression_type_missing */);
7c673cae
FG
299
300 bool index_in_file = s.ok();
301
302 BlockContents bloom_block_contents;
303 bool bloom_in_file = false;
304 // We only need to read the bloom block if index block is in file.
305 if (index_in_file) {
11fdf7f2
TL
306 s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */,
307 file_size_, kPlainTableMagicNumber, ioptions_,
308 BloomBlockBuilder::kBloomBlock, &bloom_block_contents,
309 true /* compression_type_missing */);
7c673cae
FG
310 bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0;
311 }
312
313 Slice* bloom_block;
314 if (bloom_in_file) {
315 // If bloom_block_contents.allocation is not empty (which will be the case
316 // for non-mmap mode), it holds the alloated memory for the bloom block.
317 // It needs to be kept alive to keep `bloom_block` valid.
318 bloom_block_alloc_ = std::move(bloom_block_contents.allocation);
319 bloom_block = &bloom_block_contents.data;
320 } else {
321 bloom_block = nullptr;
322 }
323
324 Slice* index_block;
325 if (index_in_file) {
326 // If index_block_contents.allocation is not empty (which will be the case
327 // for non-mmap mode), it holds the alloated memory for the index block.
328 // It needs to be kept alive to keep `index_block` valid.
329 index_block_alloc_ = std::move(index_block_contents.allocation);
330 index_block = &index_block_contents.data;
331 } else {
332 index_block = nullptr;
333 }
334
11fdf7f2
TL
335 if ((prefix_extractor_ == nullptr) && (hash_table_ratio != 0)) {
336 // moptions.prefix_extractor is requried for a hash-based look-up.
7c673cae
FG
337 return Status::NotSupported(
338 "PlainTable requires a prefix extractor enable prefix hash mode.");
339 }
340
341 // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows
342 // for a prefix (starting from the first one), generate a record of (hash,
343 // offset) and append it to IndexRecordList, which is a data structure created
344 // to store them.
345
346 if (!index_in_file) {
347 // Allocate bloom filter here for total order mode.
348 if (IsTotalOrderMode()) {
349 uint32_t num_bloom_bits =
350 static_cast<uint32_t>(table_properties_->num_entries) *
351 bloom_bits_per_key;
352 if (num_bloom_bits > 0) {
353 enable_bloom_ = true;
354 bloom_.SetTotalBits(&arena_, num_bloom_bits, ioptions_.bloom_locality,
355 huge_page_tlb_size, ioptions_.info_log);
356 }
357 }
358 } else if (bloom_in_file) {
359 enable_bloom_ = true;
360 auto num_blocks_property = props->user_collected_properties.find(
361 PlainTablePropertyNames::kNumBloomBlocks);
362
363 uint32_t num_blocks = 0;
364 if (num_blocks_property != props->user_collected_properties.end()) {
365 Slice temp_slice(num_blocks_property->second);
366 if (!GetVarint32(&temp_slice, &num_blocks)) {
367 num_blocks = 0;
368 }
369 }
370 // cast away const qualifier, because bloom_ won't be changed
371 bloom_.SetRawData(
372 const_cast<unsigned char*>(
373 reinterpret_cast<const unsigned char*>(bloom_block->data())),
374 static_cast<uint32_t>(bloom_block->size()) * 8, num_blocks);
375 } else {
376 // Index in file but no bloom in file. Disable bloom filter in this case.
377 enable_bloom_ = false;
378 bloom_bits_per_key = 0;
379 }
380
11fdf7f2
TL
381 PlainTableIndexBuilder index_builder(&arena_, ioptions_, prefix_extractor_,
382 index_sparseness, hash_table_ratio,
383 huge_page_tlb_size);
7c673cae
FG
384
385 std::vector<uint32_t> prefix_hashes;
386 if (!index_in_file) {
387 s = PopulateIndexRecordList(&index_builder, &prefix_hashes);
388 if (!s.ok()) {
389 return s;
390 }
391 } else {
392 s = index_.InitFromRawData(*index_block);
393 if (!s.ok()) {
394 return s;
395 }
396 }
397
398 if (!index_in_file) {
399 // Calculated bloom filter size and allocate memory for
400 // bloom filter based on the number of prefixes, then fill it.
401 AllocateAndFillBloom(bloom_bits_per_key, index_.GetNumPrefixes(),
402 huge_page_tlb_size, &prefix_hashes);
403 }
404
405 // Fill two table properties.
406 if (!index_in_file) {
407 props->user_collected_properties["plain_table_hash_table_size"] =
408 ToString(index_.GetIndexSize() * PlainTableIndex::kOffsetLen);
409 props->user_collected_properties["plain_table_sub_index_size"] =
410 ToString(index_.GetSubIndexSize());
411 } else {
412 props->user_collected_properties["plain_table_hash_table_size"] =
413 ToString(0);
414 props->user_collected_properties["plain_table_sub_index_size"] =
415 ToString(0);
416 }
417
418 return Status::OK();
419}
420
421Status PlainTableReader::GetOffset(PlainTableKeyDecoder* decoder,
422 const Slice& target, const Slice& prefix,
423 uint32_t prefix_hash, bool& prefix_matched,
424 uint32_t* offset) const {
425 prefix_matched = false;
426 uint32_t prefix_index_offset;
427 auto res = index_.GetOffset(prefix_hash, &prefix_index_offset);
428 if (res == PlainTableIndex::kNoPrefixForBucket) {
429 *offset = file_info_.data_end_offset;
430 return Status::OK();
431 } else if (res == PlainTableIndex::kDirectToFile) {
432 *offset = prefix_index_offset;
433 return Status::OK();
434 }
435
436 // point to sub-index, need to do a binary search
437 uint32_t upper_bound;
438 const char* base_ptr =
439 index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound);
440 uint32_t low = 0;
441 uint32_t high = upper_bound;
442 ParsedInternalKey mid_key;
443 ParsedInternalKey parsed_target;
444 if (!ParseInternalKey(target, &parsed_target)) {
445 return Status::Corruption(Slice());
446 }
447
448 // The key is between [low, high). Do a binary search between it.
449 while (high - low > 1) {
450 uint32_t mid = (high + low) / 2;
451 uint32_t file_offset = GetFixed32Element(base_ptr, mid);
452 uint32_t tmp;
453 Status s = decoder->NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp);
454 if (!s.ok()) {
455 return s;
456 }
457 int cmp_result = internal_comparator_.Compare(mid_key, parsed_target);
458 if (cmp_result < 0) {
459 low = mid;
460 } else {
461 if (cmp_result == 0) {
462 // Happen to have found the exact key or target is smaller than the
463 // first key after base_offset.
464 prefix_matched = true;
465 *offset = file_offset;
466 return Status::OK();
467 } else {
468 high = mid;
469 }
470 }
471 }
472 // Both of the key at the position low or low+1 could share the same
473 // prefix as target. We need to rule out one of them to avoid to go
474 // to the wrong prefix.
475 ParsedInternalKey low_key;
476 uint32_t tmp;
477 uint32_t low_key_offset = GetFixed32Element(base_ptr, low);
478 Status s = decoder->NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp);
479 if (!s.ok()) {
480 return s;
481 }
482
483 if (GetPrefix(low_key) == prefix) {
484 prefix_matched = true;
485 *offset = low_key_offset;
486 } else if (low + 1 < upper_bound) {
487 // There is possible a next prefix, return it
488 prefix_matched = false;
489 *offset = GetFixed32Element(base_ptr, low + 1);
490 } else {
491 // target is larger than a key of the last prefix in this bucket
492 // but with a different prefix. Key does not exist.
493 *offset = file_info_.data_end_offset;
494 }
495 return Status::OK();
496}
497
498bool PlainTableReader::MatchBloom(uint32_t hash) const {
499 if (!enable_bloom_) {
500 return true;
501 }
502
503 if (bloom_.MayContainHash(hash)) {
504 PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
505 return true;
506 } else {
507 PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
508 return false;
509 }
510}
511
512Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
513 ParsedInternalKey* parsed_key,
514 Slice* internal_key, Slice* value,
515 bool* seekable) const {
516 if (*offset == file_info_.data_end_offset) {
517 *offset = file_info_.data_end_offset;
518 return Status::OK();
519 }
520
521 if (*offset > file_info_.data_end_offset) {
522 return Status::Corruption("Offset is out of file size");
523 }
524
525 uint32_t bytes_read;
526 Status s = decoder->NextKey(*offset, parsed_key, internal_key, value,
527 &bytes_read, seekable);
528 if (!s.ok()) {
529 return s;
530 }
531 *offset = *offset + bytes_read;
532 return Status::OK();
533}
534
535void PlainTableReader::Prepare(const Slice& target) {
536 if (enable_bloom_) {
537 uint32_t prefix_hash = GetSliceHash(GetPrefix(target));
538 bloom_.Prefetch(prefix_hash);
539 }
540}
541
11fdf7f2
TL
542Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target,
543 GetContext* get_context,
544 const SliceTransform* /* prefix_extractor */,
545 bool /*skip_filters*/) {
7c673cae
FG
546 // Check bloom filter first.
547 Slice prefix_slice;
548 uint32_t prefix_hash;
549 if (IsTotalOrderMode()) {
550 if (full_scan_mode_) {
551 status_ =
552 Status::InvalidArgument("Get() is not allowed in full scan mode.");
553 }
554 // Match whole user key for bloom filter check.
555 if (!MatchBloom(GetSliceHash(GetUserKey(target)))) {
556 return Status::OK();
557 }
558 // in total order mode, there is only one bucket 0, and we always use empty
559 // prefix.
560 prefix_slice = Slice();
561 prefix_hash = 0;
562 } else {
563 prefix_slice = GetPrefix(target);
564 prefix_hash = GetSliceHash(prefix_slice);
565 if (!MatchBloom(prefix_hash)) {
566 return Status::OK();
567 }
568 }
569 uint32_t offset;
570 bool prefix_match;
571 PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
11fdf7f2 572 prefix_extractor_);
7c673cae
FG
573 Status s = GetOffset(&decoder, target, prefix_slice, prefix_hash,
574 prefix_match, &offset);
575
576 if (!s.ok()) {
577 return s;
578 }
579 ParsedInternalKey found_key;
580 ParsedInternalKey parsed_target;
581 if (!ParseInternalKey(target, &parsed_target)) {
582 return Status::Corruption(Slice());
583 }
584 Slice found_value;
585 while (offset < file_info_.data_end_offset) {
586 s = Next(&decoder, &offset, &found_key, nullptr, &found_value);
587 if (!s.ok()) {
588 return s;
589 }
590 if (!prefix_match) {
591 // Need to verify prefix for the first key found if it is not yet
592 // checked.
593 if (GetPrefix(found_key) != prefix_slice) {
594 return Status::OK();
595 }
596 prefix_match = true;
597 }
598 // TODO(ljin): since we know the key comparison result here,
599 // can we enable the fast path?
600 if (internal_comparator_.Compare(found_key, parsed_target) >= 0) {
11fdf7f2
TL
601 bool dont_care __attribute__((__unused__));
602 if (!get_context->SaveValue(found_key, found_value, &dont_care)) {
7c673cae
FG
603 break;
604 }
605 }
606 }
607 return Status::OK();
608}
609
11fdf7f2 610uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/) {
7c673cae
FG
611 return 0;
612}
613
614PlainTableIterator::PlainTableIterator(PlainTableReader* table,
615 bool use_prefix_seek)
616 : table_(table),
617 decoder_(&table_->file_info_, table_->encoding_type_,
618 table_->user_key_len_, table_->prefix_extractor_),
619 use_prefix_seek_(use_prefix_seek) {
620 next_offset_ = offset_ = table_->file_info_.data_end_offset;
621}
622
623PlainTableIterator::~PlainTableIterator() {
624}
625
626bool PlainTableIterator::Valid() const {
627 return offset_ < table_->file_info_.data_end_offset &&
628 offset_ >= table_->data_start_offset_;
629}
630
631void PlainTableIterator::SeekToFirst() {
11fdf7f2 632 status_ = Status::OK();
7c673cae
FG
633 next_offset_ = table_->data_start_offset_;
634 if (next_offset_ >= table_->file_info_.data_end_offset) {
635 next_offset_ = offset_ = table_->file_info_.data_end_offset;
636 } else {
637 Next();
638 }
639}
640
641void PlainTableIterator::SeekToLast() {
642 assert(false);
643 status_ = Status::NotSupported("SeekToLast() is not supported in PlainTable");
11fdf7f2 644 next_offset_ = offset_ = table_->file_info_.data_end_offset;
7c673cae
FG
645}
646
647void PlainTableIterator::Seek(const Slice& target) {
11fdf7f2
TL
648 if (use_prefix_seek_ != !table_->IsTotalOrderMode()) {
649 // This check is done here instead of NewIterator() to permit creating an
650 // iterator with total_order_seek = true even if we won't be able to Seek()
651 // it. This is needed for compaction: it creates iterator with
652 // total_order_seek = true but usually never does Seek() on it,
653 // only SeekToFirst().
654 status_ =
655 Status::InvalidArgument(
656 "total_order_seek not implemented for PlainTable.");
657 offset_ = next_offset_ = table_->file_info_.data_end_offset;
658 return;
659 }
660
7c673cae
FG
661 // If the user doesn't set prefix seek option and we are not able to do a
662 // total Seek(). assert failure.
11fdf7f2 663 if (table_->IsTotalOrderMode()) {
7c673cae
FG
664 if (table_->full_scan_mode_) {
665 status_ =
666 Status::InvalidArgument("Seek() is not allowed in full scan mode.");
667 offset_ = next_offset_ = table_->file_info_.data_end_offset;
668 return;
669 } else if (table_->GetIndexSize() > 1) {
670 assert(false);
671 status_ = Status::NotSupported(
672 "PlainTable cannot issue non-prefix seek unless in total order "
673 "mode.");
674 offset_ = next_offset_ = table_->file_info_.data_end_offset;
675 return;
676 }
677 }
678
679 Slice prefix_slice = table_->GetPrefix(target);
680 uint32_t prefix_hash = 0;
681 // Bloom filter is ignored in total-order mode.
682 if (!table_->IsTotalOrderMode()) {
683 prefix_hash = GetSliceHash(prefix_slice);
684 if (!table_->MatchBloom(prefix_hash)) {
11fdf7f2 685 status_ = Status::OK();
7c673cae
FG
686 offset_ = next_offset_ = table_->file_info_.data_end_offset;
687 return;
688 }
689 }
690 bool prefix_match;
691 status_ = table_->GetOffset(&decoder_, target, prefix_slice, prefix_hash,
692 prefix_match, &next_offset_);
693 if (!status_.ok()) {
694 offset_ = next_offset_ = table_->file_info_.data_end_offset;
695 return;
696 }
697
698 if (next_offset_ < table_->file_info_.data_end_offset) {
699 for (Next(); status_.ok() && Valid(); Next()) {
700 if (!prefix_match) {
701 // Need to verify the first key's prefix
702 if (table_->GetPrefix(key()) != prefix_slice) {
703 offset_ = next_offset_ = table_->file_info_.data_end_offset;
704 break;
705 }
706 prefix_match = true;
707 }
708 if (table_->internal_comparator_.Compare(key(), target) >= 0) {
709 break;
710 }
711 }
712 } else {
713 offset_ = table_->file_info_.data_end_offset;
714 }
715}
716
11fdf7f2 717void PlainTableIterator::SeekForPrev(const Slice& /*target*/) {
7c673cae
FG
718 assert(false);
719 status_ =
720 Status::NotSupported("SeekForPrev() is not supported in PlainTable");
11fdf7f2 721 offset_ = next_offset_ = table_->file_info_.data_end_offset;
7c673cae
FG
722}
723
724void PlainTableIterator::Next() {
725 offset_ = next_offset_;
726 if (offset_ < table_->file_info_.data_end_offset) {
727 Slice tmp_slice;
728 ParsedInternalKey parsed_key;
729 status_ =
730 table_->Next(&decoder_, &next_offset_, &parsed_key, &key_, &value_);
731 if (!status_.ok()) {
732 offset_ = next_offset_ = table_->file_info_.data_end_offset;
733 }
734 }
735}
736
737void PlainTableIterator::Prev() {
738 assert(false);
739}
740
741Slice PlainTableIterator::key() const {
742 assert(Valid());
743 return key_;
744}
745
746Slice PlainTableIterator::value() const {
747 assert(Valid());
748 return value_;
749}
750
751Status PlainTableIterator::status() const {
752 return status_;
753}
754
755} // namespace rocksdb
756#endif // ROCKSDB_LITE