]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/table/plain_table_factory.h
bump version to 15.2.11-pve1
[ceph.git] / ceph / src / rocksdb / table / plain_table_factory.h
1 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. See the AUTHORS file for names of contributors.
4
5 #pragma once
6
7 #ifndef ROCKSDB_LITE
8 #include <memory>
9 #include <string>
10 #include <stdint.h>
11
12 #include "options/options_helper.h"
13 #include "rocksdb/options.h"
14 #include "rocksdb/table.h"
15
16 namespace rocksdb {
17
18 struct EnvOptions;
19
20 class Status;
21 class RandomAccessFile;
22 class WritableFile;
23 class Table;
24 class TableBuilder;
25
26 // IndexedTable requires fixed length key, configured as a constructor
27 // parameter of the factory class. Output file format:
28 // +-------------+-----------------+
29 // | version | user_key_length |
30 // +------------++------------+-----------------+ <= key1 offset
31 // | encoded key1 | value_size | |
32 // +------------+-------------+-------------+ |
33 // | value1 |
34 // | |
35 // +--------------------------+-------------+---+ <= key2 offset
36 // | encoded key2 | value_size | |
37 // +------------+-------------+-------------+ |
38 // | value2 |
39 // | |
40 // | ...... |
41 // +-----------------+--------------------------+
42 //
43 // When the key encoding type is kPlain. Key part is encoded as:
44 // +------------+--------------------+
45 // | [key_size] | internal key |
46 // +------------+--------------------+
47 // for the case of user_key_len = kPlainTableVariableLength case,
48 // and simply:
49 // +----------------------+
50 // | internal key |
51 // +----------------------+
52 // for user_key_len != kPlainTableVariableLength case.
53 //
54 // If key encoding type is kPrefix, keys are encoded in this format.
55 // There are three ways to encode a key:
56 // (1) Full Key
57 // +---------------+---------------+-------------------+
58 // | Full Key Flag | Full Key Size | Full Internal Key |
59 // +---------------+---------------+-------------------+
60 // which simply encodes a full key
61 //
62 // (2) A key sharing the same prefix as the previous key, where the previous key
63 // is encoded in the format of (1).
64 // +-------------+-------------+-------------+-------------+------------+
65 // | Prefix Flag | Prefix Size | Suffix Flag | Suffix Size | Key Suffix |
66 // +-------------+-------------+-------------+-------------+------------+
67 // where key is the suffix part of the key, including the internal bytes.
68 // the actual key will be constructed by concatenating prefix part of the
69 // previous key, with the suffix part of the key here, with sizes given here.
70 //
71 // (3) A key sharing the same prefix as the previous key, where the previous key
72 // is encoded in the format of (2).
73 // +-----------------+-----------------+------------------------+
74 // | Key Suffix Flag | Key Suffix Size | Suffix of Internal Key |
75 // +-----------------+-----------------+------------------------+
76 // The key will be constructed by concatenating the previous key's prefix (which
77 // is also the prefix of the last key encoded in the format of (1)) and the
78 // key suffix given here.
79 //
80 // For example, the following keys (prefix and suffix are separated by
81 // spaces):
82 // 0000 0001
83 // 0000 00021
84 // 0000 0002
85 // 00011 00
86 // 0002 0001
87 // Will be encoded like this:
88 // FK 8 00000001
89 // PF 4 SF 5 00021
90 // SF 4 0002
91 // FK 7 0001100
92 // FK 8 00020001
93 // (where FK means full key flag, PF means prefix flag and SF means suffix flag)
94 //
95 // All those "key flag + key size" shown above are in this format:
96 // The 8 bits of the first byte:
97 // +----+----+----+----+----+----+----+----+
98 // | Type | Size |
99 // +----+----+----+----+----+----+----+----+
100 // Type indicates: full key, prefix, or suffix.
101 // The last 6 bits are for size. If the size bits are not all 1, it means the
102 // size of the key. Otherwise, varint32 is read after this byte. This varint
103 // value + 0x3F (the value of all 1) will be the key size.
104 //
105 // For example, full key with length 16 will be encoded as (binary):
106 // 00 010000
107 // (00 means full key)
108 // and a prefix with 100 bytes will be encoded as:
109 // 01 111111 00100101
110 // (63) (37)
111 // (01 means prefix)
112 //
113 // All the internal keys above (including kPlain and kPrefix) are encoded in
114 // this format:
115 // There are two types:
116 // (1) normal internal key format
117 // +----------- ...... -------------+----+---+---+---+---+---+---+---+
118 // | user key |type| sequence ID |
119 // +----------- ..... --------------+----+---+---+---+---+---+---+---+
120 // (2) Special case for keys whose sequence ID is 0 and whose type is value
121 // +----------- ...... -------------+----+
122 // | user key |0xFF|
123 // +----------- ..... --------------+----+
124 // The marker byte is kValueTypeSeqId0 (char(~0)); this saves 7 bytes when sequence ID = 0.
125 //
126 //
127 class PlainTableFactory : public TableFactory {
128 public:
129 ~PlainTableFactory() {}
130 // user_key_len is the length of the user key. If it is set to be
131 // kPlainTableVariableLength, then it means variable length. Otherwise, all
132 // the keys need to have the fix length of this value. bloom_bits_per_key is
133 // number of bits used for bloom filer per key. hash_table_ratio is
134 // the desired utilization of the hash table used for prefix hashing.
135 // hash_table_ratio = number of prefixes / #buckets in the hash table
136 // hash_table_ratio = 0 means skip hash table but only replying on binary
137 // search.
138 // index_sparseness determines index interval for keys
139 // inside the same prefix. It will be the maximum number of linear search
140 // required after hash and binary search.
141 // index_sparseness = 0 means index for every key.
142 // huge_page_tlb_size determines whether to allocate hash indexes from huge
143 // page TLB and the page size if allocating from there. See comments of
144 // Arena::AllocateAligned() for details.
145 explicit PlainTableFactory(
146 const PlainTableOptions& _table_options = PlainTableOptions())
147 : table_options_(_table_options) {}
148
149 const char* Name() const override { return "PlainTable"; }
150 Status NewTableReader(const TableReaderOptions& table_reader_options,
151 std::unique_ptr<RandomAccessFileReader>&& file,
152 uint64_t file_size, std::unique_ptr<TableReader>* table,
153 bool prefetch_index_and_filter_in_cache) const override;
154
155 TableBuilder* NewTableBuilder(
156 const TableBuilderOptions& table_builder_options,
157 uint32_t column_family_id, WritableFileWriter* file) const override;
158
159 std::string GetPrintableTableOptions() const override;
160
161 const PlainTableOptions& table_options() const;
162
163 static const char kValueTypeSeqId0 = char(~0);
164
165 // Sanitizes the specified DB Options.
166 Status SanitizeOptions(
167 const DBOptions& /*db_opts*/,
168 const ColumnFamilyOptions& /*cf_opts*/) const override {
169 return Status::OK();
170 }
171
172 void* GetOptions() override { return &table_options_; }
173
174 Status GetOptionString(std::string* /*opt_string*/,
175 const std::string& /*delimiter*/) const override {
176 return Status::OK();
177 }
178
179 private:
180 PlainTableOptions table_options_;
181 };
182
183 static std::unordered_map<std::string, OptionTypeInfo> plain_table_type_info = {
184 {"user_key_len",
185 {offsetof(struct PlainTableOptions, user_key_len), OptionType::kUInt32T,
186 OptionVerificationType::kNormal, false, 0}},
187 {"bloom_bits_per_key",
188 {offsetof(struct PlainTableOptions, bloom_bits_per_key), OptionType::kInt,
189 OptionVerificationType::kNormal, false, 0}},
190 {"hash_table_ratio",
191 {offsetof(struct PlainTableOptions, hash_table_ratio), OptionType::kDouble,
192 OptionVerificationType::kNormal, false, 0}},
193 {"index_sparseness",
194 {offsetof(struct PlainTableOptions, index_sparseness), OptionType::kSizeT,
195 OptionVerificationType::kNormal, false, 0}},
196 {"huge_page_tlb_size",
197 {offsetof(struct PlainTableOptions, huge_page_tlb_size),
198 OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}},
199 {"encoding_type",
200 {offsetof(struct PlainTableOptions, encoding_type),
201 OptionType::kEncodingType, OptionVerificationType::kByName, false, 0}},
202 {"full_scan_mode",
203 {offsetof(struct PlainTableOptions, full_scan_mode), OptionType::kBoolean,
204 OptionVerificationType::kNormal, false, 0}},
205 {"store_index_in_file",
206 {offsetof(struct PlainTableOptions, store_index_in_file),
207 OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}};
208
209 } // namespace rocksdb
210 #endif // ROCKSDB_LITE