]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/table/plain_table_reader.h
bump version to 15.2.11-pve1
[ceph.git] / ceph / src / rocksdb / table / plain_table_reader.h
1 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. See the AUTHORS file for names of contributors.
4
5 #pragma once
6
7 #ifndef ROCKSDB_LITE
8 #include <unordered_map>
9 #include <memory>
10 #include <vector>
11 #include <string>
12 #include <stdint.h>
13
14 #include "db/dbformat.h"
15 #include "rocksdb/env.h"
16 #include "rocksdb/iterator.h"
17 #include "rocksdb/slice_transform.h"
18 #include "rocksdb/table.h"
19 #include "rocksdb/table_properties.h"
20 #include "table/table_reader.h"
21 #include "table/plain_table_factory.h"
22 #include "table/plain_table_index.h"
23 #include "util/arena.h"
24 #include "util/dynamic_bloom.h"
25 #include "util/file_reader_writer.h"
26
27 namespace rocksdb {
28
29 class Block;
30 struct BlockContents;
31 class BlockHandle;
32 class Footer;
33 struct Options;
34 class RandomAccessFile;
35 struct ReadOptions;
36 class TableCache;
37 class TableReader;
38 class InternalKeyComparator;
39 class PlainTableKeyDecoder;
40 class GetContext;
41
42 extern const uint32_t kPlainTableVariableLength;
43
44 struct PlainTableReaderFileInfo {
45 bool is_mmap_mode;
46 Slice file_data;
47 uint32_t data_end_offset;
48 std::unique_ptr<RandomAccessFileReader> file;
49
50 PlainTableReaderFileInfo(std::unique_ptr<RandomAccessFileReader>&& _file,
51 const EnvOptions& storage_options,
52 uint32_t _data_size_offset)
53 : is_mmap_mode(storage_options.use_mmap_reads),
54 data_end_offset(_data_size_offset),
55 file(std::move(_file)) {}
56 };
57
58 // Based on following output file format shown in plain_table_factory.h
59 // When opening the output file, IndexedTableReader creates a hash table
60 // from key prefixes to offset of the output file. IndexedTable will decide
61 // whether it points to the data offset of the first key with the key prefix
62 // or the offset of it. If there are too many keys share this prefix, it will
63 // create a binary search-able index from the suffix to offset on disk.
64 //
65 // The implementation of IndexedTableReader requires output file is mmaped
66 class PlainTableReader: public TableReader {
67 public:
68 static Status Open(const ImmutableCFOptions& ioptions,
69 const EnvOptions& env_options,
70 const InternalKeyComparator& internal_comparator,
71 std::unique_ptr<RandomAccessFileReader>&& file,
72 uint64_t file_size, std::unique_ptr<TableReader>* table,
73 const int bloom_bits_per_key, double hash_table_ratio,
74 size_t index_sparseness, size_t huge_page_tlb_size,
75 bool full_scan_mode, const bool immortal_table = false,
76 const SliceTransform* prefix_extractor = nullptr);
77
78 InternalIterator* NewIterator(const ReadOptions&,
79 const SliceTransform* prefix_extractor,
80 Arena* arena = nullptr,
81 bool skip_filters = false,
82 bool for_compaction = false) override;
83
84 void Prepare(const Slice& target) override;
85
86 Status Get(const ReadOptions& readOptions, const Slice& key,
87 GetContext* get_context, const SliceTransform* prefix_extractor,
88 bool skip_filters = false) override;
89
90 uint64_t ApproximateOffsetOf(const Slice& key) override;
91
92 uint32_t GetIndexSize() const { return index_.GetIndexSize(); }
93 void SetupForCompaction() override;
94
95 std::shared_ptr<const TableProperties> GetTableProperties() const override {
96 return table_properties_;
97 }
98
99 virtual size_t ApproximateMemoryUsage() const override {
100 return arena_.MemoryAllocatedBytes();
101 }
102
103 PlainTableReader(const ImmutableCFOptions& ioptions,
104 std::unique_ptr<RandomAccessFileReader>&& file,
105 const EnvOptions& env_options,
106 const InternalKeyComparator& internal_comparator,
107 EncodingType encoding_type, uint64_t file_size,
108 const TableProperties* table_properties,
109 const SliceTransform* prefix_extractor);
110 virtual ~PlainTableReader();
111
112 protected:
113 // Check bloom filter to see whether it might contain this prefix.
114 // The hash of the prefix is given, since it can be reused for index lookup
115 // too.
116 virtual bool MatchBloom(uint32_t hash) const;
117
118 // PopulateIndex() builds index of keys. It must be called before any query
119 // to the table.
120 //
121 // props: the table properties object that need to be stored. Ownership of
122 // the object will be passed.
123 //
124
125 Status PopulateIndex(TableProperties* props, int bloom_bits_per_key,
126 double hash_table_ratio, size_t index_sparseness,
127 size_t huge_page_tlb_size);
128
129 Status MmapDataIfNeeded();
130
131 private:
132 const InternalKeyComparator internal_comparator_;
133 EncodingType encoding_type_;
134 // represents plain table's current status.
135 Status status_;
136
137 PlainTableIndex index_;
138 bool full_scan_mode_;
139
140 // data_start_offset_ and data_end_offset_ defines the range of the
141 // sst file that stores data.
142 const uint32_t data_start_offset_ = 0;
143 const uint32_t user_key_len_;
144 const SliceTransform* prefix_extractor_;
145
146 static const size_t kNumInternalBytes = 8;
147
148 // Bloom filter is used to rule out non-existent key
149 bool enable_bloom_;
150 DynamicBloom bloom_;
151 PlainTableReaderFileInfo file_info_;
152 Arena arena_;
153 CacheAllocationPtr index_block_alloc_;
154 CacheAllocationPtr bloom_block_alloc_;
155
156 const ImmutableCFOptions& ioptions_;
157 std::unique_ptr<Cleanable> dummy_cleanable_;
158 uint64_t file_size_;
159 std::shared_ptr<const TableProperties> table_properties_;
160
161 bool IsFixedLength() const {
162 return user_key_len_ != kPlainTableVariableLength;
163 }
164
165 size_t GetFixedInternalKeyLength() const {
166 return user_key_len_ + kNumInternalBytes;
167 }
168
169 Slice GetPrefix(const Slice& target) const {
170 assert(target.size() >= 8); // target is internal key
171 return GetPrefixFromUserKey(GetUserKey(target));
172 }
173
174 Slice GetPrefix(const ParsedInternalKey& target) const {
175 return GetPrefixFromUserKey(target.user_key);
176 }
177
178 Slice GetUserKey(const Slice& key) const {
179 return Slice(key.data(), key.size() - 8);
180 }
181
182 Slice GetPrefixFromUserKey(const Slice& user_key) const {
183 if (!IsTotalOrderMode()) {
184 return prefix_extractor_->Transform(user_key);
185 } else {
186 // Use empty slice as prefix if prefix_extractor is not set.
187 // In that case,
188 // it falls back to pure binary search and
189 // total iterator seek is supported.
190 return Slice();
191 }
192 }
193
194 friend class TableCache;
195 friend class PlainTableIterator;
196
197 // Internal helper function to generate an IndexRecordList object from all
198 // the rows, which contains index records as a list.
199 // If bloom_ is not null, all the keys' full-key hash will be added to the
200 // bloom filter.
201 Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder,
202 std::vector<uint32_t>* prefix_hashes);
203
204 // Internal helper function to allocate memory for bloom filter and fill it
205 void AllocateAndFillBloom(int bloom_bits_per_key, int num_prefixes,
206 size_t huge_page_tlb_size,
207 std::vector<uint32_t>* prefix_hashes);
208
209 void FillBloom(std::vector<uint32_t>* prefix_hashes);
210
211 // Read the key and value at `offset` to parameters for keys, the and
212 // `seekable`.
213 // On success, `offset` will be updated as the offset for the next key.
214 // `parsed_key` will be key in parsed format.
215 // if `internal_key` is not empty, it will be filled with key with slice
216 // format.
217 // if `seekable` is not null, it will return whether we can directly read
218 // data using this offset.
219 Status Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
220 ParsedInternalKey* parsed_key, Slice* internal_key, Slice* value,
221 bool* seekable = nullptr) const;
222 // Get file offset for key target.
223 // return value prefix_matched is set to true if the offset is confirmed
224 // for a key with the same prefix as target.
225 Status GetOffset(PlainTableKeyDecoder* decoder, const Slice& target,
226 const Slice& prefix, uint32_t prefix_hash,
227 bool& prefix_matched, uint32_t* offset) const;
228
229 bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
230
231 // No copying allowed
232 explicit PlainTableReader(const TableReader&) = delete;
233 void operator=(const TableReader&) = delete;
234 };
235 } // namespace rocksdb
236 #endif // ROCKSDB_LITE