1 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. See the AUTHORS file for names of contributors.
7 #include "table/plain_table_reader.h"
12 #include "db/dbformat.h"
14 #include "rocksdb/cache.h"
15 #include "rocksdb/comparator.h"
16 #include "rocksdb/env.h"
17 #include "rocksdb/filter_policy.h"
18 #include "rocksdb/options.h"
19 #include "rocksdb/statistics.h"
21 #include "table/block.h"
22 #include "table/bloom_block.h"
23 #include "table/filter_block.h"
24 #include "table/format.h"
25 #include "table/internal_iterator.h"
26 #include "table/meta_blocks.h"
27 #include "table/two_level_iterator.h"
28 #include "table/plain_table_factory.h"
29 #include "table/plain_table_key_coding.h"
30 #include "table/get_context.h"
32 #include "monitoring/histogram.h"
33 #include "monitoring/perf_context_imp.h"
34 #include "util/arena.h"
35 #include "util/coding.h"
36 #include "util/dynamic_bloom.h"
37 #include "util/hash.h"
38 #include "util/murmurhash.h"
39 #include "util/stop_watch.h"
40 #include "util/string_util.h"
46 // Safely getting a uint32_t element from a char array, where, starting from
47 // `base`, every 4 bytes are considered as an fixed 32 bit integer.
48 inline uint32_t GetFixed32Element(const char* base
, size_t offset
) {
49 return DecodeFixed32(base
+ offset
* sizeof(uint32_t));
53 // Iterator to iterate IndexedTable
54 class PlainTableIterator
: public InternalIterator
{
56 explicit PlainTableIterator(PlainTableReader
* table
, bool use_prefix_seek
);
57 ~PlainTableIterator();
59 bool Valid() const override
;
61 void SeekToFirst() override
;
63 void SeekToLast() override
;
65 void Seek(const Slice
& target
) override
;
67 void SeekForPrev(const Slice
& target
) override
;
73 Slice
key() const override
;
75 Slice
value() const override
;
77 Status
status() const override
;
80 PlainTableReader
* table_
;
81 PlainTableKeyDecoder decoder_
;
82 bool use_prefix_seek_
;
84 uint32_t next_offset_
;
89 PlainTableIterator(const PlainTableIterator
&) = delete;
90 void operator=(const Iterator
&) = delete;
93 extern const uint64_t kPlainTableMagicNumber
;
94 PlainTableReader::PlainTableReader(const ImmutableCFOptions
& ioptions
,
95 unique_ptr
<RandomAccessFileReader
>&& file
,
96 const EnvOptions
& storage_options
,
97 const InternalKeyComparator
& icomparator
,
98 EncodingType encoding_type
,
100 const TableProperties
* table_properties
,
101 const SliceTransform
* prefix_extractor
)
102 : internal_comparator_(icomparator
),
103 encoding_type_(encoding_type
),
104 full_scan_mode_(false),
105 user_key_len_(static_cast<uint32_t>(table_properties
->fixed_key_len
)),
106 prefix_extractor_(prefix_extractor
),
107 enable_bloom_(false),
109 file_info_(std::move(file
), storage_options
,
110 static_cast<uint32_t>(table_properties
->data_size
)),
112 file_size_(file_size
),
113 table_properties_(nullptr) {}
115 PlainTableReader::~PlainTableReader() {
118 Status
PlainTableReader::Open(
119 const ImmutableCFOptions
& ioptions
, const EnvOptions
& env_options
,
120 const InternalKeyComparator
& internal_comparator
,
121 unique_ptr
<RandomAccessFileReader
>&& file
, uint64_t file_size
,
122 unique_ptr
<TableReader
>* table_reader
, const int bloom_bits_per_key
,
123 double hash_table_ratio
, size_t index_sparseness
, size_t huge_page_tlb_size
,
124 bool full_scan_mode
, const SliceTransform
* prefix_extractor
) {
125 if (file_size
> PlainTableIndex::kMaxFileSize
) {
126 return Status::NotSupported("File is too large for PlainTableReader!");
129 TableProperties
* props
= nullptr;
130 auto s
= ReadTableProperties(file
.get(), file_size
, kPlainTableMagicNumber
,
132 true /* compression_type_missing */);
137 assert(hash_table_ratio
>= 0.0);
138 auto& user_props
= props
->user_collected_properties
;
139 auto prefix_extractor_in_file
= props
->prefix_extractor_name
;
141 if (!full_scan_mode
&&
142 !prefix_extractor_in_file
.empty() /* old version sst file*/
143 && prefix_extractor_in_file
!= "nullptr") {
144 if (!prefix_extractor
) {
145 return Status::InvalidArgument(
146 "Prefix extractor is missing when opening a PlainTable built "
147 "using a prefix extractor");
148 } else if (prefix_extractor_in_file
.compare(prefix_extractor
->Name()) !=
150 return Status::InvalidArgument(
151 "Prefix extractor given doesn't match the one used to build "
156 EncodingType encoding_type
= kPlain
;
157 auto encoding_type_prop
=
158 user_props
.find(PlainTablePropertyNames::kEncodingType
);
159 if (encoding_type_prop
!= user_props
.end()) {
160 encoding_type
= static_cast<EncodingType
>(
161 DecodeFixed32(encoding_type_prop
->second
.c_str()));
164 std::unique_ptr
<PlainTableReader
> new_reader(new PlainTableReader(
165 ioptions
, std::move(file
), env_options
, internal_comparator
,
166 encoding_type
, file_size
, props
, prefix_extractor
));
168 s
= new_reader
->MmapDataIfNeeded();
173 if (!full_scan_mode
) {
174 s
= new_reader
->PopulateIndex(props
, bloom_bits_per_key
, hash_table_ratio
,
175 index_sparseness
, huge_page_tlb_size
);
180 // Flag to indicate it is a full scan mode so that none of the indexes
182 new_reader
->full_scan_mode_
= true;
185 *table_reader
= std::move(new_reader
);
189 void PlainTableReader::SetupForCompaction() {
192 InternalIterator
* PlainTableReader::NewIterator(
193 const ReadOptions
& options
, const SliceTransform
* /* prefix_extractor */,
194 Arena
* arena
, bool /*skip_filters*/, bool /*for_compaction*/) {
195 bool use_prefix_seek
= !IsTotalOrderMode() && !options
.total_order_seek
;
196 if (arena
== nullptr) {
197 return new PlainTableIterator(this, use_prefix_seek
);
199 auto mem
= arena
->AllocateAligned(sizeof(PlainTableIterator
));
200 return new (mem
) PlainTableIterator(this, use_prefix_seek
);
204 Status
PlainTableReader::PopulateIndexRecordList(
205 PlainTableIndexBuilder
* index_builder
, vector
<uint32_t>* prefix_hashes
) {
206 Slice prev_key_prefix_slice
;
207 std::string prev_key_prefix_buf
;
208 uint32_t pos
= data_start_offset_
;
210 bool is_first_record
= true;
211 Slice key_prefix_slice
;
212 PlainTableKeyDecoder
decoder(&file_info_
, encoding_type_
, user_key_len_
,
214 while (pos
< file_info_
.data_end_offset
) {
215 uint32_t key_offset
= pos
;
216 ParsedInternalKey key
;
218 bool seekable
= false;
219 Status s
= Next(&decoder
, &pos
, &key
, nullptr, &value_slice
, &seekable
);
224 key_prefix_slice
= GetPrefix(key
);
226 bloom_
.AddHash(GetSliceHash(key
.user_key
));
228 if (is_first_record
|| prev_key_prefix_slice
!= key_prefix_slice
) {
229 if (!is_first_record
) {
230 prefix_hashes
->push_back(GetSliceHash(prev_key_prefix_slice
));
232 if (file_info_
.is_mmap_mode
) {
233 prev_key_prefix_slice
= key_prefix_slice
;
235 prev_key_prefix_buf
= key_prefix_slice
.ToString();
236 prev_key_prefix_slice
= prev_key_prefix_buf
;
241 index_builder
->AddKeyPrefix(GetPrefix(key
), key_offset
);
243 if (!seekable
&& is_first_record
) {
244 return Status::Corruption("Key for a prefix is not seekable");
247 is_first_record
= false;
250 prefix_hashes
->push_back(GetSliceHash(key_prefix_slice
));
251 auto s
= index_
.InitFromRawData(index_builder
->Finish());
255 void PlainTableReader::AllocateAndFillBloom(int bloom_bits_per_key
,
257 size_t huge_page_tlb_size
,
258 vector
<uint32_t>* prefix_hashes
) {
259 if (!IsTotalOrderMode()) {
260 uint32_t bloom_total_bits
= num_prefixes
* bloom_bits_per_key
;
261 if (bloom_total_bits
> 0) {
262 enable_bloom_
= true;
263 bloom_
.SetTotalBits(&arena_
, bloom_total_bits
, ioptions_
.bloom_locality
,
264 huge_page_tlb_size
, ioptions_
.info_log
);
265 FillBloom(prefix_hashes
);
270 void PlainTableReader::FillBloom(vector
<uint32_t>* prefix_hashes
) {
271 assert(bloom_
.IsInitialized());
272 for (auto prefix_hash
: *prefix_hashes
) {
273 bloom_
.AddHash(prefix_hash
);
277 Status
PlainTableReader::MmapDataIfNeeded() {
278 if (file_info_
.is_mmap_mode
) {
279 // Get mmapped memory.
280 return file_info_
.file
->Read(0, static_cast<size_t>(file_size_
), &file_info_
.file_data
, nullptr);
285 Status
PlainTableReader::PopulateIndex(TableProperties
* props
,
286 int bloom_bits_per_key
,
287 double hash_table_ratio
,
288 size_t index_sparseness
,
289 size_t huge_page_tlb_size
) {
290 assert(props
!= nullptr);
291 table_properties_
.reset(props
);
293 BlockContents index_block_contents
;
294 Status s
= ReadMetaBlock(file_info_
.file
.get(), nullptr /* prefetch_buffer */,
295 file_size_
, kPlainTableMagicNumber
, ioptions_
,
296 PlainTableIndexBuilder::kPlainTableIndexBlock
,
297 &index_block_contents
,
298 true /* compression_type_missing */);
300 bool index_in_file
= s
.ok();
302 BlockContents bloom_block_contents
;
303 bool bloom_in_file
= false;
304 // We only need to read the bloom block if index block is in file.
306 s
= ReadMetaBlock(file_info_
.file
.get(), nullptr /* prefetch_buffer */,
307 file_size_
, kPlainTableMagicNumber
, ioptions_
,
308 BloomBlockBuilder::kBloomBlock
, &bloom_block_contents
,
309 true /* compression_type_missing */);
310 bloom_in_file
= s
.ok() && bloom_block_contents
.data
.size() > 0;
315 // If bloom_block_contents.allocation is not empty (which will be the case
316 // for non-mmap mode), it holds the alloated memory for the bloom block.
317 // It needs to be kept alive to keep `bloom_block` valid.
318 bloom_block_alloc_
= std::move(bloom_block_contents
.allocation
);
319 bloom_block
= &bloom_block_contents
.data
;
321 bloom_block
= nullptr;
326 // If index_block_contents.allocation is not empty (which will be the case
327 // for non-mmap mode), it holds the alloated memory for the index block.
328 // It needs to be kept alive to keep `index_block` valid.
329 index_block_alloc_
= std::move(index_block_contents
.allocation
);
330 index_block
= &index_block_contents
.data
;
332 index_block
= nullptr;
335 if ((prefix_extractor_
== nullptr) && (hash_table_ratio
!= 0)) {
336 // moptions.prefix_extractor is requried for a hash-based look-up.
337 return Status::NotSupported(
338 "PlainTable requires a prefix extractor enable prefix hash mode.");
341 // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows
342 // for a prefix (starting from the first one), generate a record of (hash,
343 // offset) and append it to IndexRecordList, which is a data structure created
346 if (!index_in_file
) {
347 // Allocate bloom filter here for total order mode.
348 if (IsTotalOrderMode()) {
349 uint32_t num_bloom_bits
=
350 static_cast<uint32_t>(table_properties_
->num_entries
) *
352 if (num_bloom_bits
> 0) {
353 enable_bloom_
= true;
354 bloom_
.SetTotalBits(&arena_
, num_bloom_bits
, ioptions_
.bloom_locality
,
355 huge_page_tlb_size
, ioptions_
.info_log
);
358 } else if (bloom_in_file
) {
359 enable_bloom_
= true;
360 auto num_blocks_property
= props
->user_collected_properties
.find(
361 PlainTablePropertyNames::kNumBloomBlocks
);
363 uint32_t num_blocks
= 0;
364 if (num_blocks_property
!= props
->user_collected_properties
.end()) {
365 Slice
temp_slice(num_blocks_property
->second
);
366 if (!GetVarint32(&temp_slice
, &num_blocks
)) {
370 // cast away const qualifier, because bloom_ won't be changed
372 const_cast<unsigned char*>(
373 reinterpret_cast<const unsigned char*>(bloom_block
->data())),
374 static_cast<uint32_t>(bloom_block
->size()) * 8, num_blocks
);
376 // Index in file but no bloom in file. Disable bloom filter in this case.
377 enable_bloom_
= false;
378 bloom_bits_per_key
= 0;
381 PlainTableIndexBuilder
index_builder(&arena_
, ioptions_
, prefix_extractor_
,
382 index_sparseness
, hash_table_ratio
,
385 std::vector
<uint32_t> prefix_hashes
;
386 if (!index_in_file
) {
387 s
= PopulateIndexRecordList(&index_builder
, &prefix_hashes
);
392 s
= index_
.InitFromRawData(*index_block
);
398 if (!index_in_file
) {
399 // Calculated bloom filter size and allocate memory for
400 // bloom filter based on the number of prefixes, then fill it.
401 AllocateAndFillBloom(bloom_bits_per_key
, index_
.GetNumPrefixes(),
402 huge_page_tlb_size
, &prefix_hashes
);
405 // Fill two table properties.
406 if (!index_in_file
) {
407 props
->user_collected_properties
["plain_table_hash_table_size"] =
408 ToString(index_
.GetIndexSize() * PlainTableIndex::kOffsetLen
);
409 props
->user_collected_properties
["plain_table_sub_index_size"] =
410 ToString(index_
.GetSubIndexSize());
412 props
->user_collected_properties
["plain_table_hash_table_size"] =
414 props
->user_collected_properties
["plain_table_sub_index_size"] =
421 Status
PlainTableReader::GetOffset(PlainTableKeyDecoder
* decoder
,
422 const Slice
& target
, const Slice
& prefix
,
423 uint32_t prefix_hash
, bool& prefix_matched
,
424 uint32_t* offset
) const {
425 prefix_matched
= false;
426 uint32_t prefix_index_offset
;
427 auto res
= index_
.GetOffset(prefix_hash
, &prefix_index_offset
);
428 if (res
== PlainTableIndex::kNoPrefixForBucket
) {
429 *offset
= file_info_
.data_end_offset
;
431 } else if (res
== PlainTableIndex::kDirectToFile
) {
432 *offset
= prefix_index_offset
;
436 // point to sub-index, need to do a binary search
437 uint32_t upper_bound
;
438 const char* base_ptr
=
439 index_
.GetSubIndexBasePtrAndUpperBound(prefix_index_offset
, &upper_bound
);
441 uint32_t high
= upper_bound
;
442 ParsedInternalKey mid_key
;
443 ParsedInternalKey parsed_target
;
444 if (!ParseInternalKey(target
, &parsed_target
)) {
445 return Status::Corruption(Slice());
448 // The key is between [low, high). Do a binary search between it.
449 while (high
- low
> 1) {
450 uint32_t mid
= (high
+ low
) / 2;
451 uint32_t file_offset
= GetFixed32Element(base_ptr
, mid
);
453 Status s
= decoder
->NextKeyNoValue(file_offset
, &mid_key
, nullptr, &tmp
);
457 int cmp_result
= internal_comparator_
.Compare(mid_key
, parsed_target
);
458 if (cmp_result
< 0) {
461 if (cmp_result
== 0) {
462 // Happen to have found the exact key or target is smaller than the
463 // first key after base_offset.
464 prefix_matched
= true;
465 *offset
= file_offset
;
472 // Both of the key at the position low or low+1 could share the same
473 // prefix as target. We need to rule out one of them to avoid to go
474 // to the wrong prefix.
475 ParsedInternalKey low_key
;
477 uint32_t low_key_offset
= GetFixed32Element(base_ptr
, low
);
478 Status s
= decoder
->NextKeyNoValue(low_key_offset
, &low_key
, nullptr, &tmp
);
483 if (GetPrefix(low_key
) == prefix
) {
484 prefix_matched
= true;
485 *offset
= low_key_offset
;
486 } else if (low
+ 1 < upper_bound
) {
487 // There is possible a next prefix, return it
488 prefix_matched
= false;
489 *offset
= GetFixed32Element(base_ptr
, low
+ 1);
491 // target is larger than a key of the last prefix in this bucket
492 // but with a different prefix. Key does not exist.
493 *offset
= file_info_
.data_end_offset
;
498 bool PlainTableReader::MatchBloom(uint32_t hash
) const {
499 if (!enable_bloom_
) {
503 if (bloom_
.MayContainHash(hash
)) {
504 PERF_COUNTER_ADD(bloom_sst_hit_count
, 1);
507 PERF_COUNTER_ADD(bloom_sst_miss_count
, 1);
512 Status
PlainTableReader::Next(PlainTableKeyDecoder
* decoder
, uint32_t* offset
,
513 ParsedInternalKey
* parsed_key
,
514 Slice
* internal_key
, Slice
* value
,
515 bool* seekable
) const {
516 if (*offset
== file_info_
.data_end_offset
) {
517 *offset
= file_info_
.data_end_offset
;
521 if (*offset
> file_info_
.data_end_offset
) {
522 return Status::Corruption("Offset is out of file size");
526 Status s
= decoder
->NextKey(*offset
, parsed_key
, internal_key
, value
,
527 &bytes_read
, seekable
);
531 *offset
= *offset
+ bytes_read
;
535 void PlainTableReader::Prepare(const Slice
& target
) {
537 uint32_t prefix_hash
= GetSliceHash(GetPrefix(target
));
538 bloom_
.Prefetch(prefix_hash
);
542 Status
PlainTableReader::Get(const ReadOptions
& /*ro*/, const Slice
& target
,
543 GetContext
* get_context
,
544 const SliceTransform
* /* prefix_extractor */,
545 bool /*skip_filters*/) {
546 // Check bloom filter first.
548 uint32_t prefix_hash
;
549 if (IsTotalOrderMode()) {
550 if (full_scan_mode_
) {
552 Status::InvalidArgument("Get() is not allowed in full scan mode.");
554 // Match whole user key for bloom filter check.
555 if (!MatchBloom(GetSliceHash(GetUserKey(target
)))) {
558 // in total order mode, there is only one bucket 0, and we always use empty
560 prefix_slice
= Slice();
563 prefix_slice
= GetPrefix(target
);
564 prefix_hash
= GetSliceHash(prefix_slice
);
565 if (!MatchBloom(prefix_hash
)) {
571 PlainTableKeyDecoder
decoder(&file_info_
, encoding_type_
, user_key_len_
,
573 Status s
= GetOffset(&decoder
, target
, prefix_slice
, prefix_hash
,
574 prefix_match
, &offset
);
579 ParsedInternalKey found_key
;
580 ParsedInternalKey parsed_target
;
581 if (!ParseInternalKey(target
, &parsed_target
)) {
582 return Status::Corruption(Slice());
585 while (offset
< file_info_
.data_end_offset
) {
586 s
= Next(&decoder
, &offset
, &found_key
, nullptr, &found_value
);
591 // Need to verify prefix for the first key found if it is not yet
593 if (GetPrefix(found_key
) != prefix_slice
) {
598 // TODO(ljin): since we know the key comparison result here,
599 // can we enable the fast path?
600 if (internal_comparator_
.Compare(found_key
, parsed_target
) >= 0) {
601 bool dont_care
__attribute__((__unused__
));
602 if (!get_context
->SaveValue(found_key
, found_value
, &dont_care
)) {
610 uint64_t PlainTableReader::ApproximateOffsetOf(const Slice
& /*key*/) {
614 PlainTableIterator::PlainTableIterator(PlainTableReader
* table
,
615 bool use_prefix_seek
)
617 decoder_(&table_
->file_info_
, table_
->encoding_type_
,
618 table_
->user_key_len_
, table_
->prefix_extractor_
),
619 use_prefix_seek_(use_prefix_seek
) {
620 next_offset_
= offset_
= table_
->file_info_
.data_end_offset
;
623 PlainTableIterator::~PlainTableIterator() {
626 bool PlainTableIterator::Valid() const {
627 return offset_
< table_
->file_info_
.data_end_offset
&&
628 offset_
>= table_
->data_start_offset_
;
631 void PlainTableIterator::SeekToFirst() {
632 status_
= Status::OK();
633 next_offset_
= table_
->data_start_offset_
;
634 if (next_offset_
>= table_
->file_info_
.data_end_offset
) {
635 next_offset_
= offset_
= table_
->file_info_
.data_end_offset
;
641 void PlainTableIterator::SeekToLast() {
643 status_
= Status::NotSupported("SeekToLast() is not supported in PlainTable");
644 next_offset_
= offset_
= table_
->file_info_
.data_end_offset
;
647 void PlainTableIterator::Seek(const Slice
& target
) {
648 if (use_prefix_seek_
!= !table_
->IsTotalOrderMode()) {
649 // This check is done here instead of NewIterator() to permit creating an
650 // iterator with total_order_seek = true even if we won't be able to Seek()
651 // it. This is needed for compaction: it creates iterator with
652 // total_order_seek = true but usually never does Seek() on it,
653 // only SeekToFirst().
655 Status::InvalidArgument(
656 "total_order_seek not implemented for PlainTable.");
657 offset_
= next_offset_
= table_
->file_info_
.data_end_offset
;
661 // If the user doesn't set prefix seek option and we are not able to do a
662 // total Seek(). assert failure.
663 if (table_
->IsTotalOrderMode()) {
664 if (table_
->full_scan_mode_
) {
666 Status::InvalidArgument("Seek() is not allowed in full scan mode.");
667 offset_
= next_offset_
= table_
->file_info_
.data_end_offset
;
669 } else if (table_
->GetIndexSize() > 1) {
671 status_
= Status::NotSupported(
672 "PlainTable cannot issue non-prefix seek unless in total order "
674 offset_
= next_offset_
= table_
->file_info_
.data_end_offset
;
679 Slice prefix_slice
= table_
->GetPrefix(target
);
680 uint32_t prefix_hash
= 0;
681 // Bloom filter is ignored in total-order mode.
682 if (!table_
->IsTotalOrderMode()) {
683 prefix_hash
= GetSliceHash(prefix_slice
);
684 if (!table_
->MatchBloom(prefix_hash
)) {
685 status_
= Status::OK();
686 offset_
= next_offset_
= table_
->file_info_
.data_end_offset
;
691 status_
= table_
->GetOffset(&decoder_
, target
, prefix_slice
, prefix_hash
,
692 prefix_match
, &next_offset_
);
694 offset_
= next_offset_
= table_
->file_info_
.data_end_offset
;
698 if (next_offset_
< table_
->file_info_
.data_end_offset
) {
699 for (Next(); status_
.ok() && Valid(); Next()) {
701 // Need to verify the first key's prefix
702 if (table_
->GetPrefix(key()) != prefix_slice
) {
703 offset_
= next_offset_
= table_
->file_info_
.data_end_offset
;
708 if (table_
->internal_comparator_
.Compare(key(), target
) >= 0) {
713 offset_
= table_
->file_info_
.data_end_offset
;
717 void PlainTableIterator::SeekForPrev(const Slice
& /*target*/) {
720 Status::NotSupported("SeekForPrev() is not supported in PlainTable");
721 offset_
= next_offset_
= table_
->file_info_
.data_end_offset
;
724 void PlainTableIterator::Next() {
725 offset_
= next_offset_
;
726 if (offset_
< table_
->file_info_
.data_end_offset
) {
728 ParsedInternalKey parsed_key
;
730 table_
->Next(&decoder_
, &next_offset_
, &parsed_key
, &key_
, &value_
);
732 offset_
= next_offset_
= table_
->file_info_
.data_end_offset
;
737 void PlainTableIterator::Prev() {
741 Slice
PlainTableIterator::key() const {
746 Slice
PlainTableIterator::value() const {
751 Status
PlainTableIterator::status() const {
755 } // namespace rocksdb
756 #endif // ROCKSDB_LITE