]>
Commit | Line | Data |
---|---|---|
f67539c2 | 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. |
7c673cae FG |
2 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
3 | // Use of this source code is governed by a BSD-style license that can be | |
4 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
5 | ||
6 | #pragma once | |
7 | ||
8 | #ifndef ROCKSDB_LITE | |
9 | #include <memory> | |
10 | #include <string> | |
11 | #include <stdint.h> | |
12 | ||
11fdf7f2 | 13 | #include "options/options_helper.h" |
7c673cae FG |
14 | #include "rocksdb/options.h" |
15 | #include "rocksdb/table.h" | |
16 | ||
f67539c2 | 17 | namespace ROCKSDB_NAMESPACE { |
7c673cae FG |
18 | |
19 | struct EnvOptions; | |
20 | ||
7c673cae FG |
21 | class Status; |
22 | class RandomAccessFile; | |
23 | class WritableFile; | |
24 | class Table; | |
25 | class TableBuilder; | |
26 | ||
f67539c2 TL |
27 | // PlainTableFactory is the entrance function to the PlainTable format of |
28 | // SST files. It returns instances PlainTableBuilder as the builder | |
29 | // class and PlainTableReader as the reader class, where the format is | |
30 | // actually implemented. | |
31 | // | |
32 | // The PlainTable is designed for memory-mapped file systems, e.g. tmpfs. | |
33 | // Data is not organized in blocks, which allows fast access. Because of | |
34 | // following downsides | |
35 | // 1. Data compression is not supported. | |
36 | // 2. Data is not checksumed. | |
37 | // it is not recommended to use this format on other type of file systems. | |
38 | // | |
39 | // PlainTable requires fixed length key, configured as a constructor | |
7c673cae FG |
40 | // parameter of the factory class. Output file format: |
41 | // +-------------+-----------------+ | |
42 | // | version | user_key_length | | |
43 | // +------------++------------+-----------------+ <= key1 offset | |
44 | // | encoded key1 | value_size | | | |
45 | // +------------+-------------+-------------+ | | |
46 | // | value1 | | |
47 | // | | | |
48 | // +--------------------------+-------------+---+ <= key2 offset | |
49 | // | encoded key2 | value_size | | | |
50 | // +------------+-------------+-------------+ | | |
51 | // | value2 | | |
52 | // | | | |
53 | // | ...... | | |
54 | // +-----------------+--------------------------+ | |
55 | // | |
56 | // When the key encoding type is kPlain. Key part is encoded as: | |
57 | // +------------+--------------------+ | |
58 | // | [key_size] | internal key | | |
59 | // +------------+--------------------+ | |
60 | // for the case of user_key_len = kPlainTableVariableLength case, | |
61 | // and simply: | |
62 | // +----------------------+ | |
63 | // | internal key | | |
64 | // +----------------------+ | |
65 | // for user_key_len != kPlainTableVariableLength case. | |
66 | // | |
67 | // If key encoding type is kPrefix. Keys are encoding in this format. | |
68 | // There are three ways to encode a key: | |
69 | // (1) Full Key | |
70 | // +---------------+---------------+-------------------+ | |
71 | // | Full Key Flag | Full Key Size | Full Internal Key | | |
72 | // +---------------+---------------+-------------------+ | |
73 | // which simply encodes a full key | |
74 | // | |
75 | // (2) A key shared the same prefix as the previous key, which is encoded as | |
76 | // format of (1). | |
77 | // +-------------+-------------+-------------+-------------+------------+ | |
78 | // | Prefix Flag | Prefix Size | Suffix Flag | Suffix Size | Key Suffix | | |
79 | // +-------------+-------------+-------------+-------------+------------+ | |
80 | // where key is the suffix part of the key, including the internal bytes. | |
81 | // the actual key will be constructed by concatenating prefix part of the | |
82 | // previous key, with the suffix part of the key here, with sizes given here. | |
83 | // | |
84 | // (3) A key shared the same prefix as the previous key, which is encoded as | |
85 | // the format of (2). | |
86 | // +-----------------+-----------------+------------------------+ | |
87 | // | Key Suffix Flag | Key Suffix Size | Suffix of Internal Key | | |
88 | // +-----------------+-----------------+------------------------+ | |
89 | // The key will be constructed by concatenating previous key's prefix (which is | |
90 | // also a prefix which the last key encoded in the format of (1)) and the | |
91 | // key given here. | |
92 | // | |
93 | // For example, we for following keys (prefix and suffix are separated by | |
94 | // spaces): | |
95 | // 0000 0001 | |
96 | // 0000 00021 | |
97 | // 0000 0002 | |
98 | // 00011 00 | |
99 | // 0002 0001 | |
100 | // Will be encoded like this: | |
101 | // FK 8 00000001 | |
102 | // PF 4 SF 5 00021 | |
103 | // SF 4 0002 | |
104 | // FK 7 0001100 | |
105 | // FK 8 00020001 | |
106 | // (where FK means full key flag, PF means prefix flag and SF means suffix flag) | |
107 | // | |
108 | // All those "key flag + key size" shown above are in this format: | |
109 | // The 8 bits of the first byte: | |
110 | // +----+----+----+----+----+----+----+----+ | |
111 | // | Type | Size | | |
112 | // +----+----+----+----+----+----+----+----+ | |
113 | // Type indicates: full key, prefix, or suffix. | |
114 | // The last 6 bits are for size. If the size bits are not all 1, it means the | |
115 | // size of the key. Otherwise, varint32 is read after this byte. This varint | |
116 | // value + 0x3F (the value of all 1) will be the key size. | |
117 | // | |
118 | // For example, full key with length 16 will be encoded as (binary): | |
119 | // 00 010000 | |
120 | // (00 means full key) | |
121 | // and a prefix with 100 bytes will be encoded as: | |
122 | // 01 111111 00100101 | |
123 | // (63) (37) | |
124 | // (01 means key suffix) | |
125 | // | |
126 | // All the internal keys above (including kPlain and kPrefix) are encoded in | |
127 | // this format: | |
128 | // There are two types: | |
129 | // (1) normal internal key format | |
130 | // +----------- ...... -------------+----+---+---+---+---+---+---+---+ | |
131 | // | user key |type| sequence ID | | |
132 | // +----------- ..... --------------+----+---+---+---+---+---+---+---+ | |
133 | // (2) Special case for keys whose sequence ID is 0 and is value type | |
134 | // +----------- ...... -------------+----+ | |
135 | // | user key |0x80| | |
136 | // +----------- ..... --------------+----+ | |
137 | // To save 7 bytes for the special case where sequence ID = 0. | |
138 | // | |
139 | // | |
140 | class PlainTableFactory : public TableFactory { | |
141 | public: | |
142 | ~PlainTableFactory() {} | |
143 | // user_key_len is the length of the user key. If it is set to be | |
144 | // kPlainTableVariableLength, then it means variable length. Otherwise, all | |
145 | // the keys need to have the fix length of this value. bloom_bits_per_key is | |
146 | // number of bits used for bloom filer per key. hash_table_ratio is | |
147 | // the desired utilization of the hash table used for prefix hashing. | |
148 | // hash_table_ratio = number of prefixes / #buckets in the hash table | |
149 | // hash_table_ratio = 0 means skip hash table but only replying on binary | |
150 | // search. | |
151 | // index_sparseness determines index interval for keys | |
152 | // inside the same prefix. It will be the maximum number of linear search | |
153 | // required after hash and binary search. | |
154 | // index_sparseness = 0 means index for every key. | |
155 | // huge_page_tlb_size determines whether to allocate hash indexes from huge | |
156 | // page TLB and the page size if allocating from there. See comments of | |
157 | // Arena::AllocateAligned() for details. | |
158 | explicit PlainTableFactory( | |
159 | const PlainTableOptions& _table_options = PlainTableOptions()) | |
160 | : table_options_(_table_options) {} | |
161 | ||
162 | const char* Name() const override { return "PlainTable"; } | |
163 | Status NewTableReader(const TableReaderOptions& table_reader_options, | |
494da23a TL |
164 | std::unique_ptr<RandomAccessFileReader>&& file, |
165 | uint64_t file_size, std::unique_ptr<TableReader>* table, | |
7c673cae FG |
166 | bool prefetch_index_and_filter_in_cache) const override; |
167 | ||
168 | TableBuilder* NewTableBuilder( | |
169 | const TableBuilderOptions& table_builder_options, | |
170 | uint32_t column_family_id, WritableFileWriter* file) const override; | |
171 | ||
172 | std::string GetPrintableTableOptions() const override; | |
173 | ||
174 | const PlainTableOptions& table_options() const; | |
175 | ||
11fdf7f2 | 176 | static const char kValueTypeSeqId0 = char(~0); |
7c673cae FG |
177 | |
178 | // Sanitizes the specified DB Options. | |
11fdf7f2 TL |
179 | Status SanitizeOptions( |
180 | const DBOptions& /*db_opts*/, | |
181 | const ColumnFamilyOptions& /*cf_opts*/) const override { | |
7c673cae FG |
182 | return Status::OK(); |
183 | } | |
184 | ||
185 | void* GetOptions() override { return &table_options_; } | |
186 | ||
11fdf7f2 TL |
187 | Status GetOptionString(std::string* /*opt_string*/, |
188 | const std::string& /*delimiter*/) const override { | |
189 | return Status::OK(); | |
190 | } | |
191 | ||
7c673cae FG |
192 | private: |
193 | PlainTableOptions table_options_; | |
194 | }; | |
195 | ||
11fdf7f2 TL |
196 | static std::unordered_map<std::string, OptionTypeInfo> plain_table_type_info = { |
197 | {"user_key_len", | |
198 | {offsetof(struct PlainTableOptions, user_key_len), OptionType::kUInt32T, | |
199 | OptionVerificationType::kNormal, false, 0}}, | |
200 | {"bloom_bits_per_key", | |
201 | {offsetof(struct PlainTableOptions, bloom_bits_per_key), OptionType::kInt, | |
202 | OptionVerificationType::kNormal, false, 0}}, | |
203 | {"hash_table_ratio", | |
204 | {offsetof(struct PlainTableOptions, hash_table_ratio), OptionType::kDouble, | |
205 | OptionVerificationType::kNormal, false, 0}}, | |
206 | {"index_sparseness", | |
207 | {offsetof(struct PlainTableOptions, index_sparseness), OptionType::kSizeT, | |
208 | OptionVerificationType::kNormal, false, 0}}, | |
209 | {"huge_page_tlb_size", | |
210 | {offsetof(struct PlainTableOptions, huge_page_tlb_size), | |
211 | OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, | |
212 | {"encoding_type", | |
213 | {offsetof(struct PlainTableOptions, encoding_type), | |
214 | OptionType::kEncodingType, OptionVerificationType::kByName, false, 0}}, | |
215 | {"full_scan_mode", | |
216 | {offsetof(struct PlainTableOptions, full_scan_mode), OptionType::kBoolean, | |
217 | OptionVerificationType::kNormal, false, 0}}, | |
218 | {"store_index_in_file", | |
219 | {offsetof(struct PlainTableOptions, store_index_in_file), | |
220 | OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}}; | |
221 | ||
f67539c2 | 222 | } // namespace ROCKSDB_NAMESPACE |
7c673cae | 223 | #endif // ROCKSDB_LITE |