]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/table/plain_table_reader.h
update sources to ceph Nautilus 14.2.1
[ceph.git] / ceph / src / rocksdb / table / plain_table_reader.h
1 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. See the AUTHORS file for names of contributors.
4
5 #pragma once
6
7 #ifndef ROCKSDB_LITE
8 #include <unordered_map>
9 #include <memory>
10 #include <vector>
11 #include <string>
12 #include <stdint.h>
13
14 #include "db/dbformat.h"
15 #include "rocksdb/env.h"
16 #include "rocksdb/iterator.h"
17 #include "rocksdb/slice_transform.h"
18 #include "rocksdb/table.h"
19 #include "rocksdb/table_properties.h"
20 #include "table/table_reader.h"
21 #include "table/plain_table_factory.h"
22 #include "table/plain_table_index.h"
23 #include "util/arena.h"
24 #include "util/dynamic_bloom.h"
25 #include "util/file_reader_writer.h"
26
27 namespace rocksdb {
28
29 class Block;
30 struct BlockContents;
31 class BlockHandle;
32 class Footer;
33 struct Options;
34 class RandomAccessFile;
35 struct ReadOptions;
36 class TableCache;
37 class TableReader;
38 class InternalKeyComparator;
39 class PlainTableKeyDecoder;
40 class GetContext;
41
42 using std::unique_ptr;
43 using std::unordered_map;
44 using std::vector;
45 extern const uint32_t kPlainTableVariableLength;
46
47 struct PlainTableReaderFileInfo {
48 bool is_mmap_mode;
49 Slice file_data;
50 uint32_t data_end_offset;
51 unique_ptr<RandomAccessFileReader> file;
52
53 PlainTableReaderFileInfo(unique_ptr<RandomAccessFileReader>&& _file,
54 const EnvOptions& storage_options,
55 uint32_t _data_size_offset)
56 : is_mmap_mode(storage_options.use_mmap_reads),
57 data_end_offset(_data_size_offset),
58 file(std::move(_file)) {}
59 };
60
61 // Based on following output file format shown in plain_table_factory.h
62 // When opening the output file, IndexedTableReader creates a hash table
63 // from key prefixes to offset of the output file. IndexedTable will decide
64 // whether it points to the data offset of the first key with the key prefix
65 // or the offset of it. If there are too many keys share this prefix, it will
66 // create a binary search-able index from the suffix to offset on disk.
67 //
68 // The implementation of IndexedTableReader requires output file is mmaped
69 class PlainTableReader: public TableReader {
70 public:
71 static Status Open(const ImmutableCFOptions& ioptions,
72 const EnvOptions& env_options,
73 const InternalKeyComparator& internal_comparator,
74 unique_ptr<RandomAccessFileReader>&& file,
75 uint64_t file_size, unique_ptr<TableReader>* table,
76 const int bloom_bits_per_key, double hash_table_ratio,
77 size_t index_sparseness, size_t huge_page_tlb_size,
78 bool full_scan_mode,
79 const SliceTransform* prefix_extractor = nullptr);
80
81 InternalIterator* NewIterator(const ReadOptions&,
82 const SliceTransform* prefix_extractor,
83 Arena* arena = nullptr,
84 bool skip_filters = false,
85 bool for_compaction = false) override;
86
87 void Prepare(const Slice& target) override;
88
89 Status Get(const ReadOptions& readOptions, const Slice& key,
90 GetContext* get_context, const SliceTransform* prefix_extractor,
91 bool skip_filters = false) override;
92
93 uint64_t ApproximateOffsetOf(const Slice& key) override;
94
95 uint32_t GetIndexSize() const { return index_.GetIndexSize(); }
96 void SetupForCompaction() override;
97
98 std::shared_ptr<const TableProperties> GetTableProperties() const override {
99 return table_properties_;
100 }
101
102 virtual size_t ApproximateMemoryUsage() const override {
103 return arena_.MemoryAllocatedBytes();
104 }
105
106 PlainTableReader(const ImmutableCFOptions& ioptions,
107 unique_ptr<RandomAccessFileReader>&& file,
108 const EnvOptions& env_options,
109 const InternalKeyComparator& internal_comparator,
110 EncodingType encoding_type, uint64_t file_size,
111 const TableProperties* table_properties,
112 const SliceTransform* prefix_extractor);
113 virtual ~PlainTableReader();
114
115 protected:
116 // Check bloom filter to see whether it might contain this prefix.
117 // The hash of the prefix is given, since it can be reused for index lookup
118 // too.
119 virtual bool MatchBloom(uint32_t hash) const;
120
121 // PopulateIndex() builds index of keys. It must be called before any query
122 // to the table.
123 //
124 // props: the table properties object that need to be stored. Ownership of
125 // the object will be passed.
126 //
127
128 Status PopulateIndex(TableProperties* props, int bloom_bits_per_key,
129 double hash_table_ratio, size_t index_sparseness,
130 size_t huge_page_tlb_size);
131
132 Status MmapDataIfNeeded();
133
134 private:
135 const InternalKeyComparator internal_comparator_;
136 EncodingType encoding_type_;
137 // represents plain table's current status.
138 Status status_;
139
140 PlainTableIndex index_;
141 bool full_scan_mode_;
142
143 // data_start_offset_ and data_end_offset_ defines the range of the
144 // sst file that stores data.
145 const uint32_t data_start_offset_ = 0;
146 const uint32_t user_key_len_;
147 const SliceTransform* prefix_extractor_;
148
149 static const size_t kNumInternalBytes = 8;
150
151 // Bloom filter is used to rule out non-existent key
152 bool enable_bloom_;
153 DynamicBloom bloom_;
154 PlainTableReaderFileInfo file_info_;
155 Arena arena_;
156 std::unique_ptr<char[]> index_block_alloc_;
157 std::unique_ptr<char[]> bloom_block_alloc_;
158
159 const ImmutableCFOptions& ioptions_;
160 uint64_t file_size_;
161 std::shared_ptr<const TableProperties> table_properties_;
162
163 bool IsFixedLength() const {
164 return user_key_len_ != kPlainTableVariableLength;
165 }
166
167 size_t GetFixedInternalKeyLength() const {
168 return user_key_len_ + kNumInternalBytes;
169 }
170
171 Slice GetPrefix(const Slice& target) const {
172 assert(target.size() >= 8); // target is internal key
173 return GetPrefixFromUserKey(GetUserKey(target));
174 }
175
176 Slice GetPrefix(const ParsedInternalKey& target) const {
177 return GetPrefixFromUserKey(target.user_key);
178 }
179
180 Slice GetUserKey(const Slice& key) const {
181 return Slice(key.data(), key.size() - 8);
182 }
183
184 Slice GetPrefixFromUserKey(const Slice& user_key) const {
185 if (!IsTotalOrderMode()) {
186 return prefix_extractor_->Transform(user_key);
187 } else {
188 // Use empty slice as prefix if prefix_extractor is not set.
189 // In that case,
190 // it falls back to pure binary search and
191 // total iterator seek is supported.
192 return Slice();
193 }
194 }
195
196 friend class TableCache;
197 friend class PlainTableIterator;
198
199 // Internal helper function to generate an IndexRecordList object from all
200 // the rows, which contains index records as a list.
201 // If bloom_ is not null, all the keys' full-key hash will be added to the
202 // bloom filter.
203 Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder,
204 vector<uint32_t>* prefix_hashes);
205
206 // Internal helper function to allocate memory for bloom filter and fill it
207 void AllocateAndFillBloom(int bloom_bits_per_key, int num_prefixes,
208 size_t huge_page_tlb_size,
209 vector<uint32_t>* prefix_hashes);
210
211 void FillBloom(vector<uint32_t>* prefix_hashes);
212
213 // Read the key and value at `offset` to parameters for keys, the and
214 // `seekable`.
215 // On success, `offset` will be updated as the offset for the next key.
216 // `parsed_key` will be key in parsed format.
217 // if `internal_key` is not empty, it will be filled with key with slice
218 // format.
219 // if `seekable` is not null, it will return whether we can directly read
220 // data using this offset.
221 Status Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
222 ParsedInternalKey* parsed_key, Slice* internal_key, Slice* value,
223 bool* seekable = nullptr) const;
224 // Get file offset for key target.
225 // return value prefix_matched is set to true if the offset is confirmed
226 // for a key with the same prefix as target.
227 Status GetOffset(PlainTableKeyDecoder* decoder, const Slice& target,
228 const Slice& prefix, uint32_t prefix_hash,
229 bool& prefix_matched, uint32_t* offset) const;
230
231 bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
232
233 // No copying allowed
234 explicit PlainTableReader(const TableReader&) = delete;
235 void operator=(const TableReader&) = delete;
236 };
237 } // namespace rocksdb
238 #endif // ROCKSDB_LITE