1 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. See the AUTHORS file for names of contributors.
8 #include <unordered_map>
14 #include "db/dbformat.h"
15 #include "rocksdb/env.h"
16 #include "rocksdb/iterator.h"
17 #include "rocksdb/slice_transform.h"
18 #include "rocksdb/table.h"
19 #include "rocksdb/table_properties.h"
20 #include "table/table_reader.h"
21 #include "table/plain_table_factory.h"
22 #include "table/plain_table_index.h"
23 #include "util/arena.h"
24 #include "util/dynamic_bloom.h"
25 #include "util/file_reader_writer.h"
34 class RandomAccessFile
;
38 class InternalKeyComparator
;
39 class PlainTableKeyDecoder
;
42 extern const uint32_t kPlainTableVariableLength
;
44 struct PlainTableReaderFileInfo
{
47 uint32_t data_end_offset
;
48 std::unique_ptr
<RandomAccessFileReader
> file
;
50 PlainTableReaderFileInfo(std::unique_ptr
<RandomAccessFileReader
>&& _file
,
51 const EnvOptions
& storage_options
,
52 uint32_t _data_size_offset
)
53 : is_mmap_mode(storage_options
.use_mmap_reads
),
54 data_end_offset(_data_size_offset
),
55 file(std::move(_file
)) {}
58 // Based on following output file format shown in plain_table_factory.h
59 // When opening the output file, IndexedTableReader creates a hash table
60 // from key prefixes to offset of the output file. IndexedTable will decide
61 // whether it points to the data offset of the first key with the key prefix
62 // or the offset of it. If there are too many keys share this prefix, it will
63 // create a binary search-able index from the suffix to offset on disk.
65 // The implementation of IndexedTableReader requires output file is mmaped
66 class PlainTableReader
: public TableReader
{
68 static Status
Open(const ImmutableCFOptions
& ioptions
,
69 const EnvOptions
& env_options
,
70 const InternalKeyComparator
& internal_comparator
,
71 std::unique_ptr
<RandomAccessFileReader
>&& file
,
72 uint64_t file_size
, std::unique_ptr
<TableReader
>* table
,
73 const int bloom_bits_per_key
, double hash_table_ratio
,
74 size_t index_sparseness
, size_t huge_page_tlb_size
,
75 bool full_scan_mode
, const bool immortal_table
= false,
76 const SliceTransform
* prefix_extractor
= nullptr);
78 InternalIterator
* NewIterator(const ReadOptions
&,
79 const SliceTransform
* prefix_extractor
,
80 Arena
* arena
= nullptr,
81 bool skip_filters
= false,
82 bool for_compaction
= false) override
;
84 void Prepare(const Slice
& target
) override
;
86 Status
Get(const ReadOptions
& readOptions
, const Slice
& key
,
87 GetContext
* get_context
, const SliceTransform
* prefix_extractor
,
88 bool skip_filters
= false) override
;
90 uint64_t ApproximateOffsetOf(const Slice
& key
) override
;
92 uint32_t GetIndexSize() const { return index_
.GetIndexSize(); }
93 void SetupForCompaction() override
;
95 std::shared_ptr
<const TableProperties
> GetTableProperties() const override
{
96 return table_properties_
;
99 virtual size_t ApproximateMemoryUsage() const override
{
100 return arena_
.MemoryAllocatedBytes();
103 PlainTableReader(const ImmutableCFOptions
& ioptions
,
104 std::unique_ptr
<RandomAccessFileReader
>&& file
,
105 const EnvOptions
& env_options
,
106 const InternalKeyComparator
& internal_comparator
,
107 EncodingType encoding_type
, uint64_t file_size
,
108 const TableProperties
* table_properties
,
109 const SliceTransform
* prefix_extractor
);
110 virtual ~PlainTableReader();
113 // Check bloom filter to see whether it might contain this prefix.
114 // The hash of the prefix is given, since it can be reused for index lookup
116 virtual bool MatchBloom(uint32_t hash
) const;
118 // PopulateIndex() builds index of keys. It must be called before any query
121 // props: the table properties object that need to be stored. Ownership of
122 // the object will be passed.
125 Status
PopulateIndex(TableProperties
* props
, int bloom_bits_per_key
,
126 double hash_table_ratio
, size_t index_sparseness
,
127 size_t huge_page_tlb_size
);
129 Status
MmapDataIfNeeded();
132 const InternalKeyComparator internal_comparator_
;
133 EncodingType encoding_type_
;
134 // represents plain table's current status.
137 PlainTableIndex index_
;
138 bool full_scan_mode_
;
140 // data_start_offset_ and data_end_offset_ defines the range of the
141 // sst file that stores data.
142 const uint32_t data_start_offset_
= 0;
143 const uint32_t user_key_len_
;
144 const SliceTransform
* prefix_extractor_
;
146 static const size_t kNumInternalBytes
= 8;
148 // Bloom filter is used to rule out non-existent key
151 PlainTableReaderFileInfo file_info_
;
153 CacheAllocationPtr index_block_alloc_
;
154 CacheAllocationPtr bloom_block_alloc_
;
156 const ImmutableCFOptions
& ioptions_
;
157 std::unique_ptr
<Cleanable
> dummy_cleanable_
;
159 std::shared_ptr
<const TableProperties
> table_properties_
;
161 bool IsFixedLength() const {
162 return user_key_len_
!= kPlainTableVariableLength
;
165 size_t GetFixedInternalKeyLength() const {
166 return user_key_len_
+ kNumInternalBytes
;
169 Slice
GetPrefix(const Slice
& target
) const {
170 assert(target
.size() >= 8); // target is internal key
171 return GetPrefixFromUserKey(GetUserKey(target
));
174 Slice
GetPrefix(const ParsedInternalKey
& target
) const {
175 return GetPrefixFromUserKey(target
.user_key
);
178 Slice
GetUserKey(const Slice
& key
) const {
179 return Slice(key
.data(), key
.size() - 8);
182 Slice
GetPrefixFromUserKey(const Slice
& user_key
) const {
183 if (!IsTotalOrderMode()) {
184 return prefix_extractor_
->Transform(user_key
);
186 // Use empty slice as prefix if prefix_extractor is not set.
188 // it falls back to pure binary search and
189 // total iterator seek is supported.
194 friend class TableCache
;
195 friend class PlainTableIterator
;
197 // Internal helper function to generate an IndexRecordList object from all
198 // the rows, which contains index records as a list.
199 // If bloom_ is not null, all the keys' full-key hash will be added to the
201 Status
PopulateIndexRecordList(PlainTableIndexBuilder
* index_builder
,
202 std::vector
<uint32_t>* prefix_hashes
);
204 // Internal helper function to allocate memory for bloom filter and fill it
205 void AllocateAndFillBloom(int bloom_bits_per_key
, int num_prefixes
,
206 size_t huge_page_tlb_size
,
207 std::vector
<uint32_t>* prefix_hashes
);
209 void FillBloom(std::vector
<uint32_t>* prefix_hashes
);
211 // Read the key and value at `offset` to parameters for keys, the and
213 // On success, `offset` will be updated as the offset for the next key.
214 // `parsed_key` will be key in parsed format.
215 // if `internal_key` is not empty, it will be filled with key with slice
217 // if `seekable` is not null, it will return whether we can directly read
218 // data using this offset.
219 Status
Next(PlainTableKeyDecoder
* decoder
, uint32_t* offset
,
220 ParsedInternalKey
* parsed_key
, Slice
* internal_key
, Slice
* value
,
221 bool* seekable
= nullptr) const;
222 // Get file offset for key target.
223 // return value prefix_matched is set to true if the offset is confirmed
224 // for a key with the same prefix as target.
225 Status
GetOffset(PlainTableKeyDecoder
* decoder
, const Slice
& target
,
226 const Slice
& prefix
, uint32_t prefix_hash
,
227 bool& prefix_matched
, uint32_t* offset
) const;
229 bool IsTotalOrderMode() const { return (prefix_extractor_
== nullptr); }
231 // No copying allowed
232 explicit PlainTableReader(const TableReader
&) = delete;
233 void operator=(const TableReader
&) = delete;
235 } // namespace rocksdb
236 #endif // ROCKSDB_LITE