]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/table/plain/plain_table_reader.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / rocksdb / table / plain / plain_table_reader.cc
CommitLineData
f67539c2 1// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
7c673cae
FG
2// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
3// Use of this source code is governed by a BSD-style license that can be
4// found in the LICENSE file. See the AUTHORS file for names of contributors.
5
6#ifndef ROCKSDB_LITE
7
f67539c2 8#include "table/plain/plain_table_reader.h"
7c673cae
FG
9
10#include <string>
11#include <vector>
12
13#include "db/dbformat.h"
14
15#include "rocksdb/cache.h"
16#include "rocksdb/comparator.h"
17#include "rocksdb/env.h"
18#include "rocksdb/filter_policy.h"
19#include "rocksdb/options.h"
20#include "rocksdb/statistics.h"
21
f67539c2
TL
22#include "table/block_based/block.h"
23#include "table/block_based/filter_block.h"
7c673cae 24#include "table/format.h"
f67539c2 25#include "table/get_context.h"
7c673cae
FG
26#include "table/internal_iterator.h"
27#include "table/meta_blocks.h"
f67539c2
TL
28#include "table/plain/plain_table_bloom.h"
29#include "table/plain/plain_table_factory.h"
30#include "table/plain/plain_table_key_coding.h"
7c673cae 31#include "table/two_level_iterator.h"
7c673cae 32
f67539c2 33#include "memory/arena.h"
7c673cae
FG
34#include "monitoring/histogram.h"
35#include "monitoring/perf_context_imp.h"
7c673cae
FG
36#include "util/coding.h"
37#include "util/dynamic_bloom.h"
38#include "util/hash.h"
7c673cae
FG
39#include "util/stop_watch.h"
40#include "util/string_util.h"
41
f67539c2 42namespace ROCKSDB_NAMESPACE {
7c673cae
FG
43
44namespace {
45
46// Safely getting a uint32_t element from a char array, where, starting from
47// `base`, every 4 bytes are considered as an fixed 32 bit integer.
48inline uint32_t GetFixed32Element(const char* base, size_t offset) {
49 return DecodeFixed32(base + offset * sizeof(uint32_t));
50}
51} // namespace
52
53// Iterator to iterate IndexedTable
54class PlainTableIterator : public InternalIterator {
55 public:
56 explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek);
f67539c2
TL
57 // No copying allowed
58 PlainTableIterator(const PlainTableIterator&) = delete;
59 void operator=(const Iterator&) = delete;
60
494da23a 61 ~PlainTableIterator() override;
7c673cae
FG
62
63 bool Valid() const override;
64
65 void SeekToFirst() override;
66
67 void SeekToLast() override;
68
69 void Seek(const Slice& target) override;
70
71 void SeekForPrev(const Slice& target) override;
72
73 void Next() override;
74
75 void Prev() override;
76
77 Slice key() const override;
78
79 Slice value() const override;
80
81 Status status() const override;
82
83 private:
84 PlainTableReader* table_;
85 PlainTableKeyDecoder decoder_;
86 bool use_prefix_seek_;
87 uint32_t offset_;
88 uint32_t next_offset_;
89 Slice key_;
90 Slice value_;
91 Status status_;
7c673cae
FG
92};
93
94extern const uint64_t kPlainTableMagicNumber;
494da23a
TL
95PlainTableReader::PlainTableReader(
96 const ImmutableCFOptions& ioptions,
97 std::unique_ptr<RandomAccessFileReader>&& file,
98 const EnvOptions& storage_options, const InternalKeyComparator& icomparator,
99 EncodingType encoding_type, uint64_t file_size,
100 const TableProperties* table_properties,
101 const SliceTransform* prefix_extractor)
7c673cae
FG
102 : internal_comparator_(icomparator),
103 encoding_type_(encoding_type),
104 full_scan_mode_(false),
105 user_key_len_(static_cast<uint32_t>(table_properties->fixed_key_len)),
11fdf7f2 106 prefix_extractor_(prefix_extractor),
7c673cae 107 enable_bloom_(false),
494da23a 108 bloom_(6),
7c673cae
FG
109 file_info_(std::move(file), storage_options,
110 static_cast<uint32_t>(table_properties->data_size)),
111 ioptions_(ioptions),
112 file_size_(file_size),
113 table_properties_(nullptr) {}
114
115PlainTableReader::~PlainTableReader() {
20effc67
TL
116 // Should fix?
117 status_.PermitUncheckedError();
7c673cae
FG
118}
119
11fdf7f2
TL
120Status PlainTableReader::Open(
121 const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
122 const InternalKeyComparator& internal_comparator,
494da23a
TL
123 std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
124 std::unique_ptr<TableReader>* table_reader, const int bloom_bits_per_key,
11fdf7f2 125 double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size,
494da23a
TL
126 bool full_scan_mode, const bool immortal_table,
127 const SliceTransform* prefix_extractor) {
7c673cae
FG
128 if (file_size > PlainTableIndex::kMaxFileSize) {
129 return Status::NotSupported("File is too large for PlainTableReader!");
130 }
131
f67539c2 132 TableProperties* props_ptr = nullptr;
7c673cae 133 auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
f67539c2 134 ioptions, &props_ptr,
11fdf7f2 135 true /* compression_type_missing */);
f67539c2 136 std::shared_ptr<TableProperties> props(props_ptr);
7c673cae
FG
137 if (!s.ok()) {
138 return s;
139 }
140
141 assert(hash_table_ratio >= 0.0);
142 auto& user_props = props->user_collected_properties;
143 auto prefix_extractor_in_file = props->prefix_extractor_name;
144
145 if (!full_scan_mode &&
146 !prefix_extractor_in_file.empty() /* old version sst file*/
147 && prefix_extractor_in_file != "nullptr") {
11fdf7f2 148 if (!prefix_extractor) {
7c673cae
FG
149 return Status::InvalidArgument(
150 "Prefix extractor is missing when opening a PlainTable built "
151 "using a prefix extractor");
11fdf7f2
TL
152 } else if (prefix_extractor_in_file.compare(prefix_extractor->Name()) !=
153 0) {
7c673cae
FG
154 return Status::InvalidArgument(
155 "Prefix extractor given doesn't match the one used to build "
156 "PlainTable");
157 }
158 }
159
160 EncodingType encoding_type = kPlain;
161 auto encoding_type_prop =
162 user_props.find(PlainTablePropertyNames::kEncodingType);
163 if (encoding_type_prop != user_props.end()) {
164 encoding_type = static_cast<EncodingType>(
165 DecodeFixed32(encoding_type_prop->second.c_str()));
166 }
167
168 std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
169 ioptions, std::move(file), env_options, internal_comparator,
f67539c2 170 encoding_type, file_size, props.get(), prefix_extractor));
7c673cae
FG
171
172 s = new_reader->MmapDataIfNeeded();
173 if (!s.ok()) {
174 return s;
175 }
176
177 if (!full_scan_mode) {
f67539c2
TL
178 s = new_reader->PopulateIndex(props.get(), bloom_bits_per_key,
179 hash_table_ratio, index_sparseness,
180 huge_page_tlb_size);
7c673cae
FG
181 if (!s.ok()) {
182 return s;
183 }
184 } else {
185 // Flag to indicate it is a full scan mode so that none of the indexes
186 // can be used.
187 new_reader->full_scan_mode_ = true;
188 }
f67539c2
TL
189 // PopulateIndex can add to the props, so don't store them until now
190 new_reader->table_properties_ = props;
7c673cae 191
494da23a
TL
192 if (immortal_table && new_reader->file_info_.is_mmap_mode) {
193 new_reader->dummy_cleanable_.reset(new Cleanable());
194 }
195
7c673cae
FG
196 *table_reader = std::move(new_reader);
197 return s;
198}
199
200void PlainTableReader::SetupForCompaction() {
201}
202
11fdf7f2
TL
203InternalIterator* PlainTableReader::NewIterator(
204 const ReadOptions& options, const SliceTransform* /* prefix_extractor */,
f67539c2 205 Arena* arena, bool /*skip_filters*/, TableReaderCaller /*caller*/,
20effc67
TL
206 size_t /*compaction_readahead_size*/,
207 bool /* allow_unprepared_value */) {
f67539c2
TL
208 // Not necessarily used here, but make sure this has been initialized
209 assert(table_properties_);
210
211 // Auto prefix mode is not implemented in PlainTable.
212 bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek &&
213 !options.auto_prefix_mode;
7c673cae 214 if (arena == nullptr) {
11fdf7f2 215 return new PlainTableIterator(this, use_prefix_seek);
7c673cae
FG
216 } else {
217 auto mem = arena->AllocateAligned(sizeof(PlainTableIterator));
11fdf7f2 218 return new (mem) PlainTableIterator(this, use_prefix_seek);
7c673cae
FG
219 }
220}
221
222Status PlainTableReader::PopulateIndexRecordList(
494da23a
TL
223 PlainTableIndexBuilder* index_builder,
224 std::vector<uint32_t>* prefix_hashes) {
7c673cae
FG
225 Slice prev_key_prefix_slice;
226 std::string prev_key_prefix_buf;
227 uint32_t pos = data_start_offset_;
228
229 bool is_first_record = true;
230 Slice key_prefix_slice;
231 PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
11fdf7f2 232 prefix_extractor_);
7c673cae
FG
233 while (pos < file_info_.data_end_offset) {
234 uint32_t key_offset = pos;
235 ParsedInternalKey key;
236 Slice value_slice;
237 bool seekable = false;
238 Status s = Next(&decoder, &pos, &key, nullptr, &value_slice, &seekable);
239 if (!s.ok()) {
240 return s;
241 }
242
243 key_prefix_slice = GetPrefix(key);
244 if (enable_bloom_) {
245 bloom_.AddHash(GetSliceHash(key.user_key));
246 } else {
247 if (is_first_record || prev_key_prefix_slice != key_prefix_slice) {
248 if (!is_first_record) {
249 prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice));
250 }
251 if (file_info_.is_mmap_mode) {
252 prev_key_prefix_slice = key_prefix_slice;
253 } else {
254 prev_key_prefix_buf = key_prefix_slice.ToString();
255 prev_key_prefix_slice = prev_key_prefix_buf;
256 }
257 }
258 }
259
260 index_builder->AddKeyPrefix(GetPrefix(key), key_offset);
261
262 if (!seekable && is_first_record) {
263 return Status::Corruption("Key for a prefix is not seekable");
264 }
265
266 is_first_record = false;
267 }
268
269 prefix_hashes->push_back(GetSliceHash(key_prefix_slice));
270 auto s = index_.InitFromRawData(index_builder->Finish());
271 return s;
272}
273
f67539c2
TL
274void PlainTableReader::AllocateBloom(int bloom_bits_per_key, int num_keys,
275 size_t huge_page_tlb_size) {
276 uint32_t bloom_total_bits = num_keys * bloom_bits_per_key;
277 if (bloom_total_bits > 0) {
278 enable_bloom_ = true;
279 bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality,
280 huge_page_tlb_size, ioptions_.info_log);
7c673cae
FG
281 }
282}
283
f67539c2 284void PlainTableReader::FillBloom(const std::vector<uint32_t>& prefix_hashes) {
7c673cae 285 assert(bloom_.IsInitialized());
f67539c2 286 for (const auto prefix_hash : prefix_hashes) {
7c673cae
FG
287 bloom_.AddHash(prefix_hash);
288 }
289}
290
291Status PlainTableReader::MmapDataIfNeeded() {
292 if (file_info_.is_mmap_mode) {
293 // Get mmapped memory.
20effc67
TL
294 return file_info_.file->Read(IOOptions(), 0,
295 static_cast<size_t>(file_size_),
296 &file_info_.file_data, nullptr, nullptr);
7c673cae
FG
297 }
298 return Status::OK();
299}
300
301Status PlainTableReader::PopulateIndex(TableProperties* props,
302 int bloom_bits_per_key,
303 double hash_table_ratio,
304 size_t index_sparseness,
305 size_t huge_page_tlb_size) {
306 assert(props != nullptr);
7c673cae
FG
307
308 BlockContents index_block_contents;
11fdf7f2
TL
309 Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */,
310 file_size_, kPlainTableMagicNumber, ioptions_,
311 PlainTableIndexBuilder::kPlainTableIndexBlock,
f67539c2 312 BlockType::kIndex, &index_block_contents,
11fdf7f2 313 true /* compression_type_missing */);
7c673cae
FG
314
315 bool index_in_file = s.ok();
316
317 BlockContents bloom_block_contents;
318 bool bloom_in_file = false;
319 // We only need to read the bloom block if index block is in file.
320 if (index_in_file) {
11fdf7f2
TL
321 s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */,
322 file_size_, kPlainTableMagicNumber, ioptions_,
f67539c2
TL
323 BloomBlockBuilder::kBloomBlock, BlockType::kFilter,
324 &bloom_block_contents,
11fdf7f2 325 true /* compression_type_missing */);
7c673cae
FG
326 bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0;
327 }
328
329 Slice* bloom_block;
330 if (bloom_in_file) {
331 // If bloom_block_contents.allocation is not empty (which will be the case
332 // for non-mmap mode), it holds the alloated memory for the bloom block.
333 // It needs to be kept alive to keep `bloom_block` valid.
334 bloom_block_alloc_ = std::move(bloom_block_contents.allocation);
335 bloom_block = &bloom_block_contents.data;
336 } else {
337 bloom_block = nullptr;
338 }
339
340 Slice* index_block;
341 if (index_in_file) {
342 // If index_block_contents.allocation is not empty (which will be the case
343 // for non-mmap mode), it holds the alloated memory for the index block.
344 // It needs to be kept alive to keep `index_block` valid.
345 index_block_alloc_ = std::move(index_block_contents.allocation);
346 index_block = &index_block_contents.data;
347 } else {
348 index_block = nullptr;
349 }
350
11fdf7f2
TL
351 if ((prefix_extractor_ == nullptr) && (hash_table_ratio != 0)) {
352 // moptions.prefix_extractor is requried for a hash-based look-up.
7c673cae
FG
353 return Status::NotSupported(
354 "PlainTable requires a prefix extractor enable prefix hash mode.");
355 }
356
357 // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows
358 // for a prefix (starting from the first one), generate a record of (hash,
359 // offset) and append it to IndexRecordList, which is a data structure created
360 // to store them.
361
362 if (!index_in_file) {
363 // Allocate bloom filter here for total order mode.
364 if (IsTotalOrderMode()) {
f67539c2
TL
365 AllocateBloom(bloom_bits_per_key,
366 static_cast<uint32_t>(props->num_entries),
367 huge_page_tlb_size);
7c673cae
FG
368 }
369 } else if (bloom_in_file) {
370 enable_bloom_ = true;
371 auto num_blocks_property = props->user_collected_properties.find(
372 PlainTablePropertyNames::kNumBloomBlocks);
373
374 uint32_t num_blocks = 0;
375 if (num_blocks_property != props->user_collected_properties.end()) {
376 Slice temp_slice(num_blocks_property->second);
377 if (!GetVarint32(&temp_slice, &num_blocks)) {
378 num_blocks = 0;
379 }
380 }
381 // cast away const qualifier, because bloom_ won't be changed
f67539c2
TL
382 bloom_.SetRawData(const_cast<char*>(bloom_block->data()),
383 static_cast<uint32_t>(bloom_block->size()) * 8,
384 num_blocks);
7c673cae
FG
385 } else {
386 // Index in file but no bloom in file. Disable bloom filter in this case.
387 enable_bloom_ = false;
388 bloom_bits_per_key = 0;
389 }
390
11fdf7f2
TL
391 PlainTableIndexBuilder index_builder(&arena_, ioptions_, prefix_extractor_,
392 index_sparseness, hash_table_ratio,
393 huge_page_tlb_size);
7c673cae
FG
394
395 std::vector<uint32_t> prefix_hashes;
396 if (!index_in_file) {
f67539c2 397 // Populates _bloom if enabled (total order mode)
7c673cae
FG
398 s = PopulateIndexRecordList(&index_builder, &prefix_hashes);
399 if (!s.ok()) {
400 return s;
401 }
402 } else {
403 s = index_.InitFromRawData(*index_block);
404 if (!s.ok()) {
405 return s;
406 }
407 }
408
409 if (!index_in_file) {
f67539c2
TL
410 if (!IsTotalOrderMode()) {
411 // Calculated bloom filter size and allocate memory for
412 // bloom filter based on the number of prefixes, then fill it.
413 AllocateBloom(bloom_bits_per_key, index_.GetNumPrefixes(),
414 huge_page_tlb_size);
415 if (enable_bloom_) {
416 FillBloom(prefix_hashes);
417 }
418 }
7c673cae
FG
419 }
420
421 // Fill two table properties.
422 if (!index_in_file) {
423 props->user_collected_properties["plain_table_hash_table_size"] =
424 ToString(index_.GetIndexSize() * PlainTableIndex::kOffsetLen);
425 props->user_collected_properties["plain_table_sub_index_size"] =
426 ToString(index_.GetSubIndexSize());
427 } else {
428 props->user_collected_properties["plain_table_hash_table_size"] =
429 ToString(0);
430 props->user_collected_properties["plain_table_sub_index_size"] =
431 ToString(0);
432 }
433
434 return Status::OK();
435}
436
437Status PlainTableReader::GetOffset(PlainTableKeyDecoder* decoder,
438 const Slice& target, const Slice& prefix,
439 uint32_t prefix_hash, bool& prefix_matched,
440 uint32_t* offset) const {
441 prefix_matched = false;
442 uint32_t prefix_index_offset;
443 auto res = index_.GetOffset(prefix_hash, &prefix_index_offset);
444 if (res == PlainTableIndex::kNoPrefixForBucket) {
445 *offset = file_info_.data_end_offset;
446 return Status::OK();
447 } else if (res == PlainTableIndex::kDirectToFile) {
448 *offset = prefix_index_offset;
449 return Status::OK();
450 }
451
452 // point to sub-index, need to do a binary search
20effc67 453 uint32_t upper_bound = 0;
7c673cae
FG
454 const char* base_ptr =
455 index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound);
456 uint32_t low = 0;
457 uint32_t high = upper_bound;
458 ParsedInternalKey mid_key;
459 ParsedInternalKey parsed_target;
20effc67
TL
460 Status s = ParseInternalKey(target, &parsed_target,
461 false /* log_err_key */); // TODO
462 if (!s.ok()) return s;
7c673cae
FG
463
464 // The key is between [low, high). Do a binary search between it.
465 while (high - low > 1) {
466 uint32_t mid = (high + low) / 2;
467 uint32_t file_offset = GetFixed32Element(base_ptr, mid);
468 uint32_t tmp;
20effc67 469 s = decoder->NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp);
7c673cae
FG
470 if (!s.ok()) {
471 return s;
472 }
473 int cmp_result = internal_comparator_.Compare(mid_key, parsed_target);
474 if (cmp_result < 0) {
475 low = mid;
476 } else {
477 if (cmp_result == 0) {
478 // Happen to have found the exact key or target is smaller than the
479 // first key after base_offset.
480 prefix_matched = true;
481 *offset = file_offset;
482 return Status::OK();
483 } else {
484 high = mid;
485 }
486 }
487 }
488 // Both of the key at the position low or low+1 could share the same
489 // prefix as target. We need to rule out one of them to avoid to go
490 // to the wrong prefix.
491 ParsedInternalKey low_key;
492 uint32_t tmp;
493 uint32_t low_key_offset = GetFixed32Element(base_ptr, low);
20effc67 494 s = decoder->NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp);
7c673cae
FG
495 if (!s.ok()) {
496 return s;
497 }
498
499 if (GetPrefix(low_key) == prefix) {
500 prefix_matched = true;
501 *offset = low_key_offset;
502 } else if (low + 1 < upper_bound) {
503 // There is possible a next prefix, return it
504 prefix_matched = false;
505 *offset = GetFixed32Element(base_ptr, low + 1);
506 } else {
507 // target is larger than a key of the last prefix in this bucket
508 // but with a different prefix. Key does not exist.
509 *offset = file_info_.data_end_offset;
510 }
511 return Status::OK();
512}
513
514bool PlainTableReader::MatchBloom(uint32_t hash) const {
515 if (!enable_bloom_) {
516 return true;
517 }
518
519 if (bloom_.MayContainHash(hash)) {
520 PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
521 return true;
522 } else {
523 PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
524 return false;
525 }
526}
527
528Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
529 ParsedInternalKey* parsed_key,
530 Slice* internal_key, Slice* value,
531 bool* seekable) const {
532 if (*offset == file_info_.data_end_offset) {
533 *offset = file_info_.data_end_offset;
534 return Status::OK();
535 }
536
537 if (*offset > file_info_.data_end_offset) {
538 return Status::Corruption("Offset is out of file size");
539 }
540
541 uint32_t bytes_read;
542 Status s = decoder->NextKey(*offset, parsed_key, internal_key, value,
543 &bytes_read, seekable);
544 if (!s.ok()) {
545 return s;
546 }
547 *offset = *offset + bytes_read;
548 return Status::OK();
549}
550
551void PlainTableReader::Prepare(const Slice& target) {
552 if (enable_bloom_) {
553 uint32_t prefix_hash = GetSliceHash(GetPrefix(target));
554 bloom_.Prefetch(prefix_hash);
555 }
556}
557
11fdf7f2
TL
558Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target,
559 GetContext* get_context,
560 const SliceTransform* /* prefix_extractor */,
561 bool /*skip_filters*/) {
7c673cae
FG
562 // Check bloom filter first.
563 Slice prefix_slice;
564 uint32_t prefix_hash;
565 if (IsTotalOrderMode()) {
566 if (full_scan_mode_) {
567 status_ =
568 Status::InvalidArgument("Get() is not allowed in full scan mode.");
569 }
570 // Match whole user key for bloom filter check.
571 if (!MatchBloom(GetSliceHash(GetUserKey(target)))) {
572 return Status::OK();
573 }
574 // in total order mode, there is only one bucket 0, and we always use empty
575 // prefix.
576 prefix_slice = Slice();
577 prefix_hash = 0;
578 } else {
579 prefix_slice = GetPrefix(target);
580 prefix_hash = GetSliceHash(prefix_slice);
581 if (!MatchBloom(prefix_hash)) {
582 return Status::OK();
583 }
584 }
585 uint32_t offset;
586 bool prefix_match;
587 PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
11fdf7f2 588 prefix_extractor_);
7c673cae
FG
589 Status s = GetOffset(&decoder, target, prefix_slice, prefix_hash,
590 prefix_match, &offset);
591
592 if (!s.ok()) {
593 return s;
594 }
595 ParsedInternalKey found_key;
596 ParsedInternalKey parsed_target;
20effc67
TL
597 s = ParseInternalKey(target, &parsed_target,
598 false /* log_err_key */); // TODO
599 if (!s.ok()) return s;
600
7c673cae
FG
601 Slice found_value;
602 while (offset < file_info_.data_end_offset) {
603 s = Next(&decoder, &offset, &found_key, nullptr, &found_value);
604 if (!s.ok()) {
605 return s;
606 }
607 if (!prefix_match) {
608 // Need to verify prefix for the first key found if it is not yet
609 // checked.
610 if (GetPrefix(found_key) != prefix_slice) {
611 return Status::OK();
612 }
613 prefix_match = true;
614 }
615 // TODO(ljin): since we know the key comparison result here,
616 // can we enable the fast path?
617 if (internal_comparator_.Compare(found_key, parsed_target) >= 0) {
11fdf7f2 618 bool dont_care __attribute__((__unused__));
494da23a
TL
619 if (!get_context->SaveValue(found_key, found_value, &dont_care,
620 dummy_cleanable_.get())) {
7c673cae
FG
621 break;
622 }
623 }
624 }
625 return Status::OK();
626}
627
f67539c2
TL
628uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/,
629 TableReaderCaller /*caller*/) {
630 return 0;
631}
632
633uint64_t PlainTableReader::ApproximateSize(const Slice& /*start*/,
634 const Slice& /*end*/,
635 TableReaderCaller /*caller*/) {
7c673cae
FG
636 return 0;
637}
638
639PlainTableIterator::PlainTableIterator(PlainTableReader* table,
640 bool use_prefix_seek)
641 : table_(table),
642 decoder_(&table_->file_info_, table_->encoding_type_,
643 table_->user_key_len_, table_->prefix_extractor_),
644 use_prefix_seek_(use_prefix_seek) {
645 next_offset_ = offset_ = table_->file_info_.data_end_offset;
646}
647
648PlainTableIterator::~PlainTableIterator() {
649}
650
651bool PlainTableIterator::Valid() const {
652 return offset_ < table_->file_info_.data_end_offset &&
653 offset_ >= table_->data_start_offset_;
654}
655
656void PlainTableIterator::SeekToFirst() {
11fdf7f2 657 status_ = Status::OK();
7c673cae
FG
658 next_offset_ = table_->data_start_offset_;
659 if (next_offset_ >= table_->file_info_.data_end_offset) {
660 next_offset_ = offset_ = table_->file_info_.data_end_offset;
661 } else {
662 Next();
663 }
664}
665
666void PlainTableIterator::SeekToLast() {
667 assert(false);
668 status_ = Status::NotSupported("SeekToLast() is not supported in PlainTable");
11fdf7f2 669 next_offset_ = offset_ = table_->file_info_.data_end_offset;
7c673cae
FG
670}
671
672void PlainTableIterator::Seek(const Slice& target) {
11fdf7f2
TL
673 if (use_prefix_seek_ != !table_->IsTotalOrderMode()) {
674 // This check is done here instead of NewIterator() to permit creating an
675 // iterator with total_order_seek = true even if we won't be able to Seek()
676 // it. This is needed for compaction: it creates iterator with
677 // total_order_seek = true but usually never does Seek() on it,
678 // only SeekToFirst().
679 status_ =
680 Status::InvalidArgument(
681 "total_order_seek not implemented for PlainTable.");
682 offset_ = next_offset_ = table_->file_info_.data_end_offset;
683 return;
684 }
685
7c673cae
FG
686 // If the user doesn't set prefix seek option and we are not able to do a
687 // total Seek(). assert failure.
11fdf7f2 688 if (table_->IsTotalOrderMode()) {
7c673cae
FG
689 if (table_->full_scan_mode_) {
690 status_ =
691 Status::InvalidArgument("Seek() is not allowed in full scan mode.");
692 offset_ = next_offset_ = table_->file_info_.data_end_offset;
693 return;
694 } else if (table_->GetIndexSize() > 1) {
695 assert(false);
696 status_ = Status::NotSupported(
697 "PlainTable cannot issue non-prefix seek unless in total order "
698 "mode.");
699 offset_ = next_offset_ = table_->file_info_.data_end_offset;
700 return;
701 }
702 }
703
704 Slice prefix_slice = table_->GetPrefix(target);
705 uint32_t prefix_hash = 0;
706 // Bloom filter is ignored in total-order mode.
707 if (!table_->IsTotalOrderMode()) {
708 prefix_hash = GetSliceHash(prefix_slice);
709 if (!table_->MatchBloom(prefix_hash)) {
11fdf7f2 710 status_ = Status::OK();
7c673cae
FG
711 offset_ = next_offset_ = table_->file_info_.data_end_offset;
712 return;
713 }
714 }
715 bool prefix_match;
716 status_ = table_->GetOffset(&decoder_, target, prefix_slice, prefix_hash,
717 prefix_match, &next_offset_);
718 if (!status_.ok()) {
719 offset_ = next_offset_ = table_->file_info_.data_end_offset;
720 return;
721 }
722
723 if (next_offset_ < table_->file_info_.data_end_offset) {
724 for (Next(); status_.ok() && Valid(); Next()) {
725 if (!prefix_match) {
726 // Need to verify the first key's prefix
727 if (table_->GetPrefix(key()) != prefix_slice) {
728 offset_ = next_offset_ = table_->file_info_.data_end_offset;
729 break;
730 }
731 prefix_match = true;
732 }
733 if (table_->internal_comparator_.Compare(key(), target) >= 0) {
734 break;
735 }
736 }
737 } else {
738 offset_ = table_->file_info_.data_end_offset;
739 }
740}
741
11fdf7f2 742void PlainTableIterator::SeekForPrev(const Slice& /*target*/) {
7c673cae
FG
743 assert(false);
744 status_ =
745 Status::NotSupported("SeekForPrev() is not supported in PlainTable");
11fdf7f2 746 offset_ = next_offset_ = table_->file_info_.data_end_offset;
7c673cae
FG
747}
748
749void PlainTableIterator::Next() {
750 offset_ = next_offset_;
751 if (offset_ < table_->file_info_.data_end_offset) {
752 Slice tmp_slice;
753 ParsedInternalKey parsed_key;
754 status_ =
755 table_->Next(&decoder_, &next_offset_, &parsed_key, &key_, &value_);
756 if (!status_.ok()) {
757 offset_ = next_offset_ = table_->file_info_.data_end_offset;
758 }
759 }
760}
761
762void PlainTableIterator::Prev() {
763 assert(false);
764}
765
766Slice PlainTableIterator::key() const {
767 assert(Valid());
768 return key_;
769}
770
771Slice PlainTableIterator::value() const {
772 assert(Valid());
773 return value_;
774}
775
776Status PlainTableIterator::status() const {
777 return status_;
778}
779
f67539c2 780} // namespace ROCKSDB_NAMESPACE
7c673cae 781#endif // ROCKSDB_LITE