1 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. See the AUTHORS file for names of contributors.
8 #include <unordered_map>
14 #include "db/dbformat.h"
15 #include "rocksdb/env.h"
16 #include "rocksdb/iterator.h"
17 #include "rocksdb/slice_transform.h"
18 #include "rocksdb/table.h"
19 #include "rocksdb/table_properties.h"
20 #include "table/table_reader.h"
21 #include "table/plain_table_factory.h"
22 #include "table/plain_table_index.h"
23 #include "util/arena.h"
24 #include "util/dynamic_bloom.h"
25 #include "util/file_reader_writer.h"
34 class RandomAccessFile
;
38 class InternalKeyComparator
;
39 class PlainTableKeyDecoder
;
42 using std::unique_ptr
;
43 using std::unordered_map
;
45 extern const uint32_t kPlainTableVariableLength
;
47 struct PlainTableReaderFileInfo
{
50 uint32_t data_end_offset
;
51 unique_ptr
<RandomAccessFileReader
> file
;
53 PlainTableReaderFileInfo(unique_ptr
<RandomAccessFileReader
>&& _file
,
54 const EnvOptions
& storage_options
,
55 uint32_t _data_size_offset
)
56 : is_mmap_mode(storage_options
.use_mmap_reads
),
57 data_end_offset(_data_size_offset
),
58 file(std::move(_file
)) {}
61 // Based on following output file format shown in plain_table_factory.h
62 // When opening the output file, IndexedTableReader creates a hash table
63 // from key prefixes to offset of the output file. IndexedTable will decide
64 // whether it points to the data offset of the first key with the key prefix
65 // or the offset of it. If there are too many keys share this prefix, it will
66 // create a binary search-able index from the suffix to offset on disk.
68 // The implementation of IndexedTableReader requires output file is mmaped
69 class PlainTableReader
: public TableReader
{
71 static Status
Open(const ImmutableCFOptions
& ioptions
,
72 const EnvOptions
& env_options
,
73 const InternalKeyComparator
& internal_comparator
,
74 unique_ptr
<RandomAccessFileReader
>&& file
,
75 uint64_t file_size
, unique_ptr
<TableReader
>* table
,
76 const int bloom_bits_per_key
, double hash_table_ratio
,
77 size_t index_sparseness
, size_t huge_page_tlb_size
,
79 const SliceTransform
* prefix_extractor
= nullptr);
81 InternalIterator
* NewIterator(const ReadOptions
&,
82 const SliceTransform
* prefix_extractor
,
83 Arena
* arena
= nullptr,
84 bool skip_filters
= false,
85 bool for_compaction
= false) override
;
87 void Prepare(const Slice
& target
) override
;
89 Status
Get(const ReadOptions
& readOptions
, const Slice
& key
,
90 GetContext
* get_context
, const SliceTransform
* prefix_extractor
,
91 bool skip_filters
= false) override
;
93 uint64_t ApproximateOffsetOf(const Slice
& key
) override
;
95 uint32_t GetIndexSize() const { return index_
.GetIndexSize(); }
96 void SetupForCompaction() override
;
98 std::shared_ptr
<const TableProperties
> GetTableProperties() const override
{
99 return table_properties_
;
102 virtual size_t ApproximateMemoryUsage() const override
{
103 return arena_
.MemoryAllocatedBytes();
106 PlainTableReader(const ImmutableCFOptions
& ioptions
,
107 unique_ptr
<RandomAccessFileReader
>&& file
,
108 const EnvOptions
& env_options
,
109 const InternalKeyComparator
& internal_comparator
,
110 EncodingType encoding_type
, uint64_t file_size
,
111 const TableProperties
* table_properties
,
112 const SliceTransform
* prefix_extractor
);
113 virtual ~PlainTableReader();
116 // Check bloom filter to see whether it might contain this prefix.
117 // The hash of the prefix is given, since it can be reused for index lookup
119 virtual bool MatchBloom(uint32_t hash
) const;
121 // PopulateIndex() builds index of keys. It must be called before any query
124 // props: the table properties object that need to be stored. Ownership of
125 // the object will be passed.
128 Status
PopulateIndex(TableProperties
* props
, int bloom_bits_per_key
,
129 double hash_table_ratio
, size_t index_sparseness
,
130 size_t huge_page_tlb_size
);
132 Status
MmapDataIfNeeded();
135 const InternalKeyComparator internal_comparator_
;
136 EncodingType encoding_type_
;
137 // represents plain table's current status.
140 PlainTableIndex index_
;
141 bool full_scan_mode_
;
143 // data_start_offset_ and data_end_offset_ defines the range of the
144 // sst file that stores data.
145 const uint32_t data_start_offset_
= 0;
146 const uint32_t user_key_len_
;
147 const SliceTransform
* prefix_extractor_
;
149 static const size_t kNumInternalBytes
= 8;
151 // Bloom filter is used to rule out non-existent key
154 PlainTableReaderFileInfo file_info_
;
156 std::unique_ptr
<char[]> index_block_alloc_
;
157 std::unique_ptr
<char[]> bloom_block_alloc_
;
159 const ImmutableCFOptions
& ioptions_
;
161 std::shared_ptr
<const TableProperties
> table_properties_
;
163 bool IsFixedLength() const {
164 return user_key_len_
!= kPlainTableVariableLength
;
167 size_t GetFixedInternalKeyLength() const {
168 return user_key_len_
+ kNumInternalBytes
;
171 Slice
GetPrefix(const Slice
& target
) const {
172 assert(target
.size() >= 8); // target is internal key
173 return GetPrefixFromUserKey(GetUserKey(target
));
176 Slice
GetPrefix(const ParsedInternalKey
& target
) const {
177 return GetPrefixFromUserKey(target
.user_key
);
180 Slice
GetUserKey(const Slice
& key
) const {
181 return Slice(key
.data(), key
.size() - 8);
184 Slice
GetPrefixFromUserKey(const Slice
& user_key
) const {
185 if (!IsTotalOrderMode()) {
186 return prefix_extractor_
->Transform(user_key
);
188 // Use empty slice as prefix if prefix_extractor is not set.
190 // it falls back to pure binary search and
191 // total iterator seek is supported.
196 friend class TableCache
;
197 friend class PlainTableIterator
;
199 // Internal helper function to generate an IndexRecordList object from all
200 // the rows, which contains index records as a list.
201 // If bloom_ is not null, all the keys' full-key hash will be added to the
203 Status
PopulateIndexRecordList(PlainTableIndexBuilder
* index_builder
,
204 vector
<uint32_t>* prefix_hashes
);
206 // Internal helper function to allocate memory for bloom filter and fill it
207 void AllocateAndFillBloom(int bloom_bits_per_key
, int num_prefixes
,
208 size_t huge_page_tlb_size
,
209 vector
<uint32_t>* prefix_hashes
);
211 void FillBloom(vector
<uint32_t>* prefix_hashes
);
213 // Read the key and value at `offset` to parameters for keys, the and
215 // On success, `offset` will be updated as the offset for the next key.
216 // `parsed_key` will be key in parsed format.
217 // if `internal_key` is not empty, it will be filled with key with slice
219 // if `seekable` is not null, it will return whether we can directly read
220 // data using this offset.
221 Status
Next(PlainTableKeyDecoder
* decoder
, uint32_t* offset
,
222 ParsedInternalKey
* parsed_key
, Slice
* internal_key
, Slice
* value
,
223 bool* seekable
= nullptr) const;
224 // Get file offset for key target.
225 // return value prefix_matched is set to true if the offset is confirmed
226 // for a key with the same prefix as target.
227 Status
GetOffset(PlainTableKeyDecoder
* decoder
, const Slice
& target
,
228 const Slice
& prefix
, uint32_t prefix_hash
,
229 bool& prefix_matched
, uint32_t* offset
) const;
231 bool IsTotalOrderMode() const { return (prefix_extractor_
== nullptr); }
233 // No copying allowed
234 explicit PlainTableReader(const TableReader
&) = delete;
235 void operator=(const TableReader
&) = delete;
237 } // namespace rocksdb
238 #endif // ROCKSDB_LITE