1 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. See the AUTHORS file for names of contributors.
7 #include "table/plain_table_reader.h"
12 #include "db/dbformat.h"
14 #include "rocksdb/cache.h"
15 #include "rocksdb/comparator.h"
16 #include "rocksdb/env.h"
17 #include "rocksdb/filter_policy.h"
18 #include "rocksdb/options.h"
19 #include "rocksdb/statistics.h"
21 #include "table/block.h"
22 #include "table/bloom_block.h"
23 #include "table/filter_block.h"
24 #include "table/format.h"
25 #include "table/internal_iterator.h"
26 #include "table/meta_blocks.h"
27 #include "table/two_level_iterator.h"
28 #include "table/plain_table_factory.h"
29 #include "table/plain_table_key_coding.h"
30 #include "table/get_context.h"
32 #include "monitoring/histogram.h"
33 #include "monitoring/perf_context_imp.h"
34 #include "util/arena.h"
35 #include "util/coding.h"
36 #include "util/dynamic_bloom.h"
37 #include "util/hash.h"
38 #include "util/murmurhash.h"
39 #include "util/stop_watch.h"
40 #include "util/string_util.h"
46 // Safely getting a uint32_t element from a char array, where, starting from
47 // `base`, every 4 bytes are considered as an fixed 32 bit integer.
48 inline uint32_t GetFixed32Element(const char* base
, size_t offset
) {
49 return DecodeFixed32(base
+ offset
* sizeof(uint32_t));
53 // Iterator to iterate IndexedTable
54 class PlainTableIterator
: public InternalIterator
{
56 explicit PlainTableIterator(PlainTableReader
* table
, bool use_prefix_seek
);
57 ~PlainTableIterator();
59 bool Valid() const override
;
61 void SeekToFirst() override
;
63 void SeekToLast() override
;
65 void Seek(const Slice
& target
) override
;
67 void SeekForPrev(const Slice
& target
) override
;
73 Slice
key() const override
;
75 Slice
value() const override
;
77 Status
status() const override
;
80 PlainTableReader
* table_
;
81 PlainTableKeyDecoder decoder_
;
82 bool use_prefix_seek_
;
84 uint32_t next_offset_
;
89 PlainTableIterator(const PlainTableIterator
&) = delete;
90 void operator=(const Iterator
&) = delete;
93 extern const uint64_t kPlainTableMagicNumber
;
94 PlainTableReader::PlainTableReader(const ImmutableCFOptions
& ioptions
,
95 unique_ptr
<RandomAccessFileReader
>&& file
,
96 const EnvOptions
& storage_options
,
97 const InternalKeyComparator
& icomparator
,
98 EncodingType encoding_type
,
100 const TableProperties
* table_properties
)
101 : internal_comparator_(icomparator
),
102 encoding_type_(encoding_type
),
103 full_scan_mode_(false),
104 user_key_len_(static_cast<uint32_t>(table_properties
->fixed_key_len
)),
105 prefix_extractor_(ioptions
.prefix_extractor
),
106 enable_bloom_(false),
108 file_info_(std::move(file
), storage_options
,
109 static_cast<uint32_t>(table_properties
->data_size
)),
111 file_size_(file_size
),
112 table_properties_(nullptr) {}
114 PlainTableReader::~PlainTableReader() {
117 Status
PlainTableReader::Open(const ImmutableCFOptions
& ioptions
,
118 const EnvOptions
& env_options
,
119 const InternalKeyComparator
& internal_comparator
,
120 unique_ptr
<RandomAccessFileReader
>&& file
,
122 unique_ptr
<TableReader
>* table_reader
,
123 const int bloom_bits_per_key
,
124 double hash_table_ratio
, size_t index_sparseness
,
125 size_t huge_page_tlb_size
, bool full_scan_mode
) {
126 if (file_size
> PlainTableIndex::kMaxFileSize
) {
127 return Status::NotSupported("File is too large for PlainTableReader!");
130 TableProperties
* props
= nullptr;
131 auto s
= ReadTableProperties(file
.get(), file_size
, kPlainTableMagicNumber
,
137 assert(hash_table_ratio
>= 0.0);
138 auto& user_props
= props
->user_collected_properties
;
139 auto prefix_extractor_in_file
= props
->prefix_extractor_name
;
141 if (!full_scan_mode
&&
142 !prefix_extractor_in_file
.empty() /* old version sst file*/
143 && prefix_extractor_in_file
!= "nullptr") {
144 if (!ioptions
.prefix_extractor
) {
145 return Status::InvalidArgument(
146 "Prefix extractor is missing when opening a PlainTable built "
147 "using a prefix extractor");
148 } else if (prefix_extractor_in_file
.compare(
149 ioptions
.prefix_extractor
->Name()) != 0) {
150 return Status::InvalidArgument(
151 "Prefix extractor given doesn't match the one used to build "
156 EncodingType encoding_type
= kPlain
;
157 auto encoding_type_prop
=
158 user_props
.find(PlainTablePropertyNames::kEncodingType
);
159 if (encoding_type_prop
!= user_props
.end()) {
160 encoding_type
= static_cast<EncodingType
>(
161 DecodeFixed32(encoding_type_prop
->second
.c_str()));
164 std::unique_ptr
<PlainTableReader
> new_reader(new PlainTableReader(
165 ioptions
, std::move(file
), env_options
, internal_comparator
,
166 encoding_type
, file_size
, props
));
168 s
= new_reader
->MmapDataIfNeeded();
173 if (!full_scan_mode
) {
174 s
= new_reader
->PopulateIndex(props
, bloom_bits_per_key
, hash_table_ratio
,
175 index_sparseness
, huge_page_tlb_size
);
180 // Flag to indicate it is a full scan mode so that none of the indexes
182 new_reader
->full_scan_mode_
= true;
185 *table_reader
= std::move(new_reader
);
189 void PlainTableReader::SetupForCompaction() {
192 InternalIterator
* PlainTableReader::NewIterator(const ReadOptions
& options
,
195 if (options
.total_order_seek
&& !IsTotalOrderMode()) {
196 return NewErrorInternalIterator(
197 Status::InvalidArgument("total_order_seek not supported"), arena
);
199 if (arena
== nullptr) {
200 return new PlainTableIterator(this, prefix_extractor_
!= nullptr);
202 auto mem
= arena
->AllocateAligned(sizeof(PlainTableIterator
));
203 return new (mem
) PlainTableIterator(this, prefix_extractor_
!= nullptr);
207 Status
PlainTableReader::PopulateIndexRecordList(
208 PlainTableIndexBuilder
* index_builder
, vector
<uint32_t>* prefix_hashes
) {
209 Slice prev_key_prefix_slice
;
210 std::string prev_key_prefix_buf
;
211 uint32_t pos
= data_start_offset_
;
213 bool is_first_record
= true;
214 Slice key_prefix_slice
;
215 PlainTableKeyDecoder
decoder(&file_info_
, encoding_type_
, user_key_len_
,
216 ioptions_
.prefix_extractor
);
217 while (pos
< file_info_
.data_end_offset
) {
218 uint32_t key_offset
= pos
;
219 ParsedInternalKey key
;
221 bool seekable
= false;
222 Status s
= Next(&decoder
, &pos
, &key
, nullptr, &value_slice
, &seekable
);
227 key_prefix_slice
= GetPrefix(key
);
229 bloom_
.AddHash(GetSliceHash(key
.user_key
));
231 if (is_first_record
|| prev_key_prefix_slice
!= key_prefix_slice
) {
232 if (!is_first_record
) {
233 prefix_hashes
->push_back(GetSliceHash(prev_key_prefix_slice
));
235 if (file_info_
.is_mmap_mode
) {
236 prev_key_prefix_slice
= key_prefix_slice
;
238 prev_key_prefix_buf
= key_prefix_slice
.ToString();
239 prev_key_prefix_slice
= prev_key_prefix_buf
;
244 index_builder
->AddKeyPrefix(GetPrefix(key
), key_offset
);
246 if (!seekable
&& is_first_record
) {
247 return Status::Corruption("Key for a prefix is not seekable");
250 is_first_record
= false;
253 prefix_hashes
->push_back(GetSliceHash(key_prefix_slice
));
254 auto s
= index_
.InitFromRawData(index_builder
->Finish());
258 void PlainTableReader::AllocateAndFillBloom(int bloom_bits_per_key
,
260 size_t huge_page_tlb_size
,
261 vector
<uint32_t>* prefix_hashes
) {
262 if (!IsTotalOrderMode()) {
263 uint32_t bloom_total_bits
= num_prefixes
* bloom_bits_per_key
;
264 if (bloom_total_bits
> 0) {
265 enable_bloom_
= true;
266 bloom_
.SetTotalBits(&arena_
, bloom_total_bits
, ioptions_
.bloom_locality
,
267 huge_page_tlb_size
, ioptions_
.info_log
);
268 FillBloom(prefix_hashes
);
273 void PlainTableReader::FillBloom(vector
<uint32_t>* prefix_hashes
) {
274 assert(bloom_
.IsInitialized());
275 for (auto prefix_hash
: *prefix_hashes
) {
276 bloom_
.AddHash(prefix_hash
);
280 Status
PlainTableReader::MmapDataIfNeeded() {
281 if (file_info_
.is_mmap_mode
) {
282 // Get mmapped memory.
283 return file_info_
.file
->Read(0, file_size_
, &file_info_
.file_data
, nullptr);
288 Status
PlainTableReader::PopulateIndex(TableProperties
* props
,
289 int bloom_bits_per_key
,
290 double hash_table_ratio
,
291 size_t index_sparseness
,
292 size_t huge_page_tlb_size
) {
293 assert(props
!= nullptr);
294 table_properties_
.reset(props
);
296 BlockContents index_block_contents
;
297 Status s
= ReadMetaBlock(
298 file_info_
.file
.get(), file_size_
, kPlainTableMagicNumber
, ioptions_
,
299 PlainTableIndexBuilder::kPlainTableIndexBlock
, &index_block_contents
);
301 bool index_in_file
= s
.ok();
303 BlockContents bloom_block_contents
;
304 bool bloom_in_file
= false;
305 // We only need to read the bloom block if index block is in file.
307 s
= ReadMetaBlock(file_info_
.file
.get(), file_size_
, kPlainTableMagicNumber
,
308 ioptions_
, BloomBlockBuilder::kBloomBlock
,
309 &bloom_block_contents
);
310 bloom_in_file
= s
.ok() && bloom_block_contents
.data
.size() > 0;
315 // If bloom_block_contents.allocation is not empty (which will be the case
316 // for non-mmap mode), it holds the alloated memory for the bloom block.
317 // It needs to be kept alive to keep `bloom_block` valid.
318 bloom_block_alloc_
= std::move(bloom_block_contents
.allocation
);
319 bloom_block
= &bloom_block_contents
.data
;
321 bloom_block
= nullptr;
326 // If index_block_contents.allocation is not empty (which will be the case
327 // for non-mmap mode), it holds the alloated memory for the index block.
328 // It needs to be kept alive to keep `index_block` valid.
329 index_block_alloc_
= std::move(index_block_contents
.allocation
);
330 index_block
= &index_block_contents
.data
;
332 index_block
= nullptr;
335 if ((ioptions_
.prefix_extractor
== nullptr) &&
336 (hash_table_ratio
!= 0)) {
337 // ioptions.prefix_extractor is requried for a hash-based look-up.
338 return Status::NotSupported(
339 "PlainTable requires a prefix extractor enable prefix hash mode.");
342 // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows
343 // for a prefix (starting from the first one), generate a record of (hash,
344 // offset) and append it to IndexRecordList, which is a data structure created
347 if (!index_in_file
) {
348 // Allocate bloom filter here for total order mode.
349 if (IsTotalOrderMode()) {
350 uint32_t num_bloom_bits
=
351 static_cast<uint32_t>(table_properties_
->num_entries
) *
353 if (num_bloom_bits
> 0) {
354 enable_bloom_
= true;
355 bloom_
.SetTotalBits(&arena_
, num_bloom_bits
, ioptions_
.bloom_locality
,
356 huge_page_tlb_size
, ioptions_
.info_log
);
359 } else if (bloom_in_file
) {
360 enable_bloom_
= true;
361 auto num_blocks_property
= props
->user_collected_properties
.find(
362 PlainTablePropertyNames::kNumBloomBlocks
);
364 uint32_t num_blocks
= 0;
365 if (num_blocks_property
!= props
->user_collected_properties
.end()) {
366 Slice
temp_slice(num_blocks_property
->second
);
367 if (!GetVarint32(&temp_slice
, &num_blocks
)) {
371 // cast away const qualifier, because bloom_ won't be changed
373 const_cast<unsigned char*>(
374 reinterpret_cast<const unsigned char*>(bloom_block
->data())),
375 static_cast<uint32_t>(bloom_block
->size()) * 8, num_blocks
);
377 // Index in file but no bloom in file. Disable bloom filter in this case.
378 enable_bloom_
= false;
379 bloom_bits_per_key
= 0;
382 PlainTableIndexBuilder
index_builder(&arena_
, ioptions_
, index_sparseness
,
383 hash_table_ratio
, huge_page_tlb_size
);
385 std::vector
<uint32_t> prefix_hashes
;
386 if (!index_in_file
) {
387 s
= PopulateIndexRecordList(&index_builder
, &prefix_hashes
);
392 s
= index_
.InitFromRawData(*index_block
);
398 if (!index_in_file
) {
399 // Calculated bloom filter size and allocate memory for
400 // bloom filter based on the number of prefixes, then fill it.
401 AllocateAndFillBloom(bloom_bits_per_key
, index_
.GetNumPrefixes(),
402 huge_page_tlb_size
, &prefix_hashes
);
405 // Fill two table properties.
406 if (!index_in_file
) {
407 props
->user_collected_properties
["plain_table_hash_table_size"] =
408 ToString(index_
.GetIndexSize() * PlainTableIndex::kOffsetLen
);
409 props
->user_collected_properties
["plain_table_sub_index_size"] =
410 ToString(index_
.GetSubIndexSize());
412 props
->user_collected_properties
["plain_table_hash_table_size"] =
414 props
->user_collected_properties
["plain_table_sub_index_size"] =
421 Status
PlainTableReader::GetOffset(PlainTableKeyDecoder
* decoder
,
422 const Slice
& target
, const Slice
& prefix
,
423 uint32_t prefix_hash
, bool& prefix_matched
,
424 uint32_t* offset
) const {
425 prefix_matched
= false;
426 uint32_t prefix_index_offset
;
427 auto res
= index_
.GetOffset(prefix_hash
, &prefix_index_offset
);
428 if (res
== PlainTableIndex::kNoPrefixForBucket
) {
429 *offset
= file_info_
.data_end_offset
;
431 } else if (res
== PlainTableIndex::kDirectToFile
) {
432 *offset
= prefix_index_offset
;
436 // point to sub-index, need to do a binary search
437 uint32_t upper_bound
;
438 const char* base_ptr
=
439 index_
.GetSubIndexBasePtrAndUpperBound(prefix_index_offset
, &upper_bound
);
441 uint32_t high
= upper_bound
;
442 ParsedInternalKey mid_key
;
443 ParsedInternalKey parsed_target
;
444 if (!ParseInternalKey(target
, &parsed_target
)) {
445 return Status::Corruption(Slice());
448 // The key is between [low, high). Do a binary search between it.
449 while (high
- low
> 1) {
450 uint32_t mid
= (high
+ low
) / 2;
451 uint32_t file_offset
= GetFixed32Element(base_ptr
, mid
);
453 Status s
= decoder
->NextKeyNoValue(file_offset
, &mid_key
, nullptr, &tmp
);
457 int cmp_result
= internal_comparator_
.Compare(mid_key
, parsed_target
);
458 if (cmp_result
< 0) {
461 if (cmp_result
== 0) {
462 // Happen to have found the exact key or target is smaller than the
463 // first key after base_offset.
464 prefix_matched
= true;
465 *offset
= file_offset
;
472 // Both of the key at the position low or low+1 could share the same
473 // prefix as target. We need to rule out one of them to avoid to go
474 // to the wrong prefix.
475 ParsedInternalKey low_key
;
477 uint32_t low_key_offset
= GetFixed32Element(base_ptr
, low
);
478 Status s
= decoder
->NextKeyNoValue(low_key_offset
, &low_key
, nullptr, &tmp
);
483 if (GetPrefix(low_key
) == prefix
) {
484 prefix_matched
= true;
485 *offset
= low_key_offset
;
486 } else if (low
+ 1 < upper_bound
) {
487 // There is possible a next prefix, return it
488 prefix_matched
= false;
489 *offset
= GetFixed32Element(base_ptr
, low
+ 1);
491 // target is larger than a key of the last prefix in this bucket
492 // but with a different prefix. Key does not exist.
493 *offset
= file_info_
.data_end_offset
;
498 bool PlainTableReader::MatchBloom(uint32_t hash
) const {
499 if (!enable_bloom_
) {
503 if (bloom_
.MayContainHash(hash
)) {
504 PERF_COUNTER_ADD(bloom_sst_hit_count
, 1);
507 PERF_COUNTER_ADD(bloom_sst_miss_count
, 1);
512 Status
PlainTableReader::Next(PlainTableKeyDecoder
* decoder
, uint32_t* offset
,
513 ParsedInternalKey
* parsed_key
,
514 Slice
* internal_key
, Slice
* value
,
515 bool* seekable
) const {
516 if (*offset
== file_info_
.data_end_offset
) {
517 *offset
= file_info_
.data_end_offset
;
521 if (*offset
> file_info_
.data_end_offset
) {
522 return Status::Corruption("Offset is out of file size");
526 Status s
= decoder
->NextKey(*offset
, parsed_key
, internal_key
, value
,
527 &bytes_read
, seekable
);
531 *offset
= *offset
+ bytes_read
;
535 void PlainTableReader::Prepare(const Slice
& target
) {
537 uint32_t prefix_hash
= GetSliceHash(GetPrefix(target
));
538 bloom_
.Prefetch(prefix_hash
);
542 Status
PlainTableReader::Get(const ReadOptions
& ro
, const Slice
& target
,
543 GetContext
* get_context
, bool skip_filters
) {
544 // Check bloom filter first.
546 uint32_t prefix_hash
;
547 if (IsTotalOrderMode()) {
548 if (full_scan_mode_
) {
550 Status::InvalidArgument("Get() is not allowed in full scan mode.");
552 // Match whole user key for bloom filter check.
553 if (!MatchBloom(GetSliceHash(GetUserKey(target
)))) {
556 // in total order mode, there is only one bucket 0, and we always use empty
558 prefix_slice
= Slice();
561 prefix_slice
= GetPrefix(target
);
562 prefix_hash
= GetSliceHash(prefix_slice
);
563 if (!MatchBloom(prefix_hash
)) {
569 PlainTableKeyDecoder
decoder(&file_info_
, encoding_type_
, user_key_len_
,
570 ioptions_
.prefix_extractor
);
571 Status s
= GetOffset(&decoder
, target
, prefix_slice
, prefix_hash
,
572 prefix_match
, &offset
);
577 ParsedInternalKey found_key
;
578 ParsedInternalKey parsed_target
;
579 if (!ParseInternalKey(target
, &parsed_target
)) {
580 return Status::Corruption(Slice());
583 while (offset
< file_info_
.data_end_offset
) {
584 s
= Next(&decoder
, &offset
, &found_key
, nullptr, &found_value
);
589 // Need to verify prefix for the first key found if it is not yet
591 if (GetPrefix(found_key
) != prefix_slice
) {
596 // TODO(ljin): since we know the key comparison result here,
597 // can we enable the fast path?
598 if (internal_comparator_
.Compare(found_key
, parsed_target
) >= 0) {
599 if (!get_context
->SaveValue(found_key
, found_value
)) {
607 uint64_t PlainTableReader::ApproximateOffsetOf(const Slice
& key
) {
611 PlainTableIterator::PlainTableIterator(PlainTableReader
* table
,
612 bool use_prefix_seek
)
614 decoder_(&table_
->file_info_
, table_
->encoding_type_
,
615 table_
->user_key_len_
, table_
->prefix_extractor_
),
616 use_prefix_seek_(use_prefix_seek
) {
617 next_offset_
= offset_
= table_
->file_info_
.data_end_offset
;
620 PlainTableIterator::~PlainTableIterator() {
623 bool PlainTableIterator::Valid() const {
624 return offset_
< table_
->file_info_
.data_end_offset
&&
625 offset_
>= table_
->data_start_offset_
;
628 void PlainTableIterator::SeekToFirst() {
629 next_offset_
= table_
->data_start_offset_
;
630 if (next_offset_
>= table_
->file_info_
.data_end_offset
) {
631 next_offset_
= offset_
= table_
->file_info_
.data_end_offset
;
637 void PlainTableIterator::SeekToLast() {
639 status_
= Status::NotSupported("SeekToLast() is not supported in PlainTable");
642 void PlainTableIterator::Seek(const Slice
& target
) {
643 // If the user doesn't set prefix seek option and we are not able to do a
644 // total Seek(). assert failure.
645 if (!use_prefix_seek_
) {
646 if (table_
->full_scan_mode_
) {
648 Status::InvalidArgument("Seek() is not allowed in full scan mode.");
649 offset_
= next_offset_
= table_
->file_info_
.data_end_offset
;
651 } else if (table_
->GetIndexSize() > 1) {
653 status_
= Status::NotSupported(
654 "PlainTable cannot issue non-prefix seek unless in total order "
656 offset_
= next_offset_
= table_
->file_info_
.data_end_offset
;
661 Slice prefix_slice
= table_
->GetPrefix(target
);
662 uint32_t prefix_hash
= 0;
663 // Bloom filter is ignored in total-order mode.
664 if (!table_
->IsTotalOrderMode()) {
665 prefix_hash
= GetSliceHash(prefix_slice
);
666 if (!table_
->MatchBloom(prefix_hash
)) {
667 offset_
= next_offset_
= table_
->file_info_
.data_end_offset
;
672 status_
= table_
->GetOffset(&decoder_
, target
, prefix_slice
, prefix_hash
,
673 prefix_match
, &next_offset_
);
675 offset_
= next_offset_
= table_
->file_info_
.data_end_offset
;
679 if (next_offset_
< table_
->file_info_
.data_end_offset
) {
680 for (Next(); status_
.ok() && Valid(); Next()) {
682 // Need to verify the first key's prefix
683 if (table_
->GetPrefix(key()) != prefix_slice
) {
684 offset_
= next_offset_
= table_
->file_info_
.data_end_offset
;
689 if (table_
->internal_comparator_
.Compare(key(), target
) >= 0) {
694 offset_
= table_
->file_info_
.data_end_offset
;
698 void PlainTableIterator::SeekForPrev(const Slice
& target
) {
701 Status::NotSupported("SeekForPrev() is not supported in PlainTable");
704 void PlainTableIterator::Next() {
705 offset_
= next_offset_
;
706 if (offset_
< table_
->file_info_
.data_end_offset
) {
708 ParsedInternalKey parsed_key
;
710 table_
->Next(&decoder_
, &next_offset_
, &parsed_key
, &key_
, &value_
);
712 offset_
= next_offset_
= table_
->file_info_
.data_end_offset
;
717 void PlainTableIterator::Prev() {
721 Slice
PlainTableIterator::key() const {
726 Slice
PlainTableIterator::value() const {
731 Status
PlainTableIterator::status() const {
735 } // namespace rocksdb
736 #endif // ROCKSDB_LITE