]>
Commit | Line | Data |
---|---|---|
f67539c2 | 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. |
7c673cae FG |
2 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
3 | // Use of this source code is governed by a BSD-style license that can be | |
4 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
5 | // | |
6 | // Currently we support two types of tables: plain table and block-based table. | |
7 | // 1. Block-based table: this is the default table type that we inherited from | |
8 | // LevelDB, which was designed for storing data in hard disk or flash | |
9 | // device. | |
10 | // 2. Plain table: it is one of RocksDB's SST file format optimized | |
11 | // for low query latency on pure-memory or really low-latency media. | |
12 | // | |
13 | // A tutorial of rocksdb table formats is available here: | |
14 | // https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats | |
15 | // | |
16 | // Example code is also available | |
17 | // https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples | |
18 | ||
19 | #pragma once | |
11fdf7f2 | 20 | |
7c673cae FG |
21 | #include <memory> |
22 | #include <string> | |
23 | #include <unordered_map> | |
24 | ||
20effc67 | 25 | #include "rocksdb/customizable.h" |
7c673cae | 26 | #include "rocksdb/env.h" |
7c673cae FG |
27 | #include "rocksdb/options.h" |
28 | #include "rocksdb/status.h" | |
29 | ||
f67539c2 | 30 | namespace ROCKSDB_NAMESPACE { |
7c673cae FG |
31 | |
32 | // -- Block-based Table | |
20effc67 TL |
33 | class Cache; |
34 | class FilterPolicy; | |
7c673cae FG |
35 | class FlushBlockPolicyFactory; |
36 | class PersistentCache; | |
37 | class RandomAccessFile; | |
38 | struct TableReaderOptions; | |
39 | struct TableBuilderOptions; | |
40 | class TableBuilder; | |
20effc67 | 41 | class TableFactory; |
7c673cae FG |
42 | class TableReader; |
43 | class WritableFileWriter; | |
20effc67 | 44 | struct ConfigOptions; |
7c673cae | 45 | struct EnvOptions; |
7c673cae | 46 | |
// Checksum algorithms that can protect block-based table files. The chosen
// value is persisted in each file's footer, so files written with one type
// remain readable after the option is changed.
enum ChecksumType : char {
  kNoChecksum = 0x0,
  kCRC32c = 0x1,
  kxxHash = 0x2,
  kxxHash64 = 0x3,
};
53 | ||
// `PinningTier` is used to specify which tier of block-based tables should
// be affected by a block cache pinning setting (see
// `MetadataCacheOptions` below).
enum class PinningTier {
  // For compatibility, this value specifies to fallback to the behavior
  // indicated by the deprecated options,
  // `pin_l0_filter_and_index_blocks_in_cache` and
  // `pin_top_level_index_and_filter`.
  kFallback,

  // This tier contains no block-based tables.
  kNone,

  // This tier contains block-based tables that may have originated from a
  // memtable flush. In particular, it includes tables from L0 that are smaller
  // than 1.5 times the current `write_buffer_size`. Note these criteria imply
  // it can include intra-L0 compaction outputs and ingested files, as long as
  // they are not abnormally large compared to flushed files in L0.
  kFlushedAndSimilar,

  // This tier contains all block-based tables.
  kAll,
};
77 | ||
78 | // `MetadataCacheOptions` contains members indicating the desired caching | |
79 | // behavior for the different categories of metadata blocks. | |
80 | struct MetadataCacheOptions { | |
81 | // The tier of block-based tables whose top-level index into metadata | |
82 | // partitions will be pinned. Currently indexes and filters may be | |
83 | // partitioned. | |
84 | // | |
85 | // Note `cache_index_and_filter_blocks` must be true for this option to have | |
86 | // any effect. Otherwise any top-level index into metadata partitions would be | |
87 | // held in table reader memory, outside the block cache. | |
88 | PinningTier top_level_index_pinning = PinningTier::kFallback; | |
89 | ||
90 | // The tier of block-based tables whose metadata partitions will be pinned. | |
91 | // Currently indexes and filters may be partitioned. | |
92 | PinningTier partition_pinning = PinningTier::kFallback; | |
93 | ||
94 | // The tier of block-based tables whose unpartitioned metadata blocks will be | |
95 | // pinned. | |
96 | // | |
97 | // Note `cache_index_and_filter_blocks` must be true for this option to have | |
98 | // any effect. Otherwise the unpartitioned meta-blocks would be held in table | |
99 | // reader memory, outside the block cache. | |
100 | PinningTier unpartitioned_pinning = PinningTier::kFallback; | |
101 | }; | |
102 | ||
7c673cae FG |
103 | // For advanced user only |
104 | struct BlockBasedTableOptions { | |
20effc67 | 105 | static const char* kName() { return "BlockTableOptions"; }; |
7c673cae FG |
106 | // @flush_block_policy_factory creates the instances of flush block policy. |
107 | // which provides a configurable way to determine when to flush a block in | |
108 | // the block based tables. If not set, table builder will use the default | |
109 | // block flush policy, which cut blocks by block size (please refer to | |
110 | // `FlushBlockBySizePolicy`). | |
111 | std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory; | |
112 | ||
113 | // TODO(kailiu) Temporarily disable this feature by making the default value | |
114 | // to be false. | |
115 | // | |
494da23a TL |
116 | // TODO(ajkr) we need to update names of variables controlling meta-block |
117 | // caching as they should now apply to range tombstone and compression | |
118 | // dictionary meta-blocks, in addition to index and filter meta-blocks. | |
119 | // | |
7c673cae FG |
120 | // Indicating if we'd put index/filter blocks to the block cache. |
121 | // If not specified, each "table reader" object will pre-load index/filter | |
122 | // block during table initialization. | |
123 | bool cache_index_and_filter_blocks = false; | |
124 | ||
125 | // If cache_index_and_filter_blocks is enabled, cache index and filter | |
126 | // blocks with high priority. If set to true, depending on implementation of | |
11fdf7f2 | 127 | // block cache, index and filter blocks may be less likely to be evicted |
7c673cae | 128 | // than data blocks. |
f67539c2 | 129 | bool cache_index_and_filter_blocks_with_high_priority = true; |
7c673cae | 130 | |
20effc67 TL |
131 | // DEPRECATED: This option will be removed in a future version. For now, this |
132 | // option still takes effect by updating each of the following variables that | |
133 | // has the default value, `PinningTier::kFallback`: | |
134 | // | |
135 | // - `MetadataCacheOptions::partition_pinning` | |
136 | // - `MetadataCacheOptions::unpartitioned_pinning` | |
137 | // | |
138 | // The updated value is chosen as follows: | |
139 | // | |
140 | // - `pin_l0_filter_and_index_blocks_in_cache == false` -> | |
141 | // `PinningTier::kNone` | |
142 | // - `pin_l0_filter_and_index_blocks_in_cache == true` -> | |
143 | // `PinningTier::kFlushedAndSimilar` | |
144 | // | |
145 | // To migrate away from this flag, explicitly configure | |
146 | // `MetadataCacheOptions` as described above. | |
147 | // | |
7c673cae FG |
148 | // if cache_index_and_filter_blocks is true and the below is true, then |
149 | // filter and index blocks are stored in the cache, but a reference is | |
150 | // held in the "table reader" object so the blocks are pinned and only | |
151 | // evicted from cache when the table reader is freed. | |
152 | bool pin_l0_filter_and_index_blocks_in_cache = false; | |
153 | ||
20effc67 TL |
154 | // DEPRECATED: This option will be removed in a future version. For now, this |
155 | // option still takes effect by updating | |
156 | // `MetadataCacheOptions::top_level_index_pinning` when it has the | |
157 | // default value, `PinningTier::kFallback`. | |
158 | // | |
159 | // The updated value is chosen as follows: | |
160 | // | |
161 | // - `pin_top_level_index_and_filter == false` -> | |
162 | // `PinningTier::kNone` | |
163 | // - `pin_top_level_index_and_filter == true` -> | |
164 | // `PinningTier::kAll` | |
165 | // | |
166 | // To migrate away from this flag, explicitly configure | |
167 | // `MetadataCacheOptions` as described above. | |
168 | // | |
11fdf7f2 TL |
169 | // If cache_index_and_filter_blocks is true and the below is true, then |
170 | // the top-level index of partitioned filter and index blocks are stored in | |
171 | // the cache, but a reference is held in the "table reader" object so the | |
172 | // blocks are pinned and only evicted from cache when the table reader is | |
173 | // freed. This is not limited to l0 in LSM tree. | |
174 | bool pin_top_level_index_and_filter = true; | |
175 | ||
20effc67 TL |
176 | // The desired block cache pinning behavior for the different categories of |
177 | // metadata blocks. While pinning can reduce block cache contention, users | |
178 | // must take care not to pin excessive amounts of data, which risks | |
179 | // overflowing block cache. | |
180 | MetadataCacheOptions metadata_cache_options; | |
181 | ||
7c673cae FG |
182 | // The index type that will be used for this table. |
183 | enum IndexType : char { | |
184 | // A space efficient index block that is optimized for | |
185 | // binary-search-based index. | |
f67539c2 | 186 | kBinarySearch = 0x00, |
7c673cae FG |
187 | |
188 | // The hash index, if enabled, will do the hash lookup when | |
189 | // `Options.prefix_extractor` is provided. | |
f67539c2 | 190 | kHashSearch = 0x01, |
7c673cae | 191 | |
7c673cae | 192 | // A two-level index implementation. Both levels are binary search indexes. |
f67539c2 TL |
193 | kTwoLevelIndexSearch = 0x02, |
194 | ||
195 | // Like kBinarySearch, but index also contains first key of each block. | |
196 | // This allows iterators to defer reading the block until it's actually | |
197 | // needed. May significantly reduce read amplification of short range scans. | |
198 | // Without it, iterator seek usually reads one block from each level-0 file | |
199 | // and from each level, which may be expensive. | |
200 | // Works best in combination with: | |
201 | // - IndexShorteningMode::kNoShortening, | |
202 | // - custom FlushBlockPolicy to cut blocks at some meaningful boundaries, | |
203 | // e.g. when prefix changes. | |
204 | // Makes the index significantly bigger (2x or more), especially when keys | |
205 | // are long. | |
f67539c2 | 206 | kBinarySearchWithFirstKey = 0x03, |
7c673cae FG |
207 | }; |
208 | ||
209 | IndexType index_type = kBinarySearch; | |
210 | ||
11fdf7f2 TL |
211 | // The index type that will be used for the data block. |
212 | enum DataBlockIndexType : char { | |
213 | kDataBlockBinarySearch = 0, // traditional block type | |
214 | kDataBlockBinaryAndHash = 1, // additional hash index | |
215 | }; | |
216 | ||
217 | DataBlockIndexType data_block_index_type = kDataBlockBinarySearch; | |
218 | ||
219 | // #entries/#buckets. It is valid only when data_block_hash_index_type is | |
220 | // kDataBlockBinaryAndHash. | |
221 | double data_block_hash_table_util_ratio = 0.75; | |
222 | ||
7c673cae FG |
223 | // This option is now deprecated. No matter what value it is set to, |
224 | // it will behave as if hash_index_allow_collision=true. | |
225 | bool hash_index_allow_collision = true; | |
226 | ||
227 | // Use the specified checksum type. Newly created table files will be | |
228 | // protected with this checksum type. Old table files will still be readable, | |
229 | // even though they have different checksum type. | |
230 | ChecksumType checksum = kCRC32c; | |
231 | ||
232 | // Disable block cache. If this is set to true, | |
233 | // then no block cache should be used, and the block_cache should | |
234 | // point to a nullptr object. | |
235 | bool no_block_cache = false; | |
236 | ||
237 | // If non-NULL use the specified cache for blocks. | |
238 | // If NULL, rocksdb will automatically create and use an 8MB internal cache. | |
239 | std::shared_ptr<Cache> block_cache = nullptr; | |
240 | ||
241 | // If non-NULL use the specified cache for pages read from device | |
242 | // IF NULL, no page cache is used | |
243 | std::shared_ptr<PersistentCache> persistent_cache = nullptr; | |
244 | ||
245 | // If non-NULL use the specified cache for compressed blocks. | |
246 | // If NULL, rocksdb will not use a compressed block cache. | |
494da23a TL |
247 | // Note: though it looks similar to `block_cache`, RocksDB doesn't put the |
248 | // same type of object there. | |
7c673cae FG |
249 | std::shared_ptr<Cache> block_cache_compressed = nullptr; |
250 | ||
251 | // Approximate size of user data packed per block. Note that the | |
252 | // block size specified here corresponds to uncompressed data. The | |
253 | // actual size of the unit read from disk may be smaller if | |
254 | // compression is enabled. This parameter can be changed dynamically. | |
255 | size_t block_size = 4 * 1024; | |
256 | ||
257 | // This is used to close a block before it reaches the configured | |
258 | // 'block_size'. If the percentage of free space in the current block is less | |
259 | // than this specified number and adding a new record to the block will | |
260 | // exceed the configured block size, then this block will be closed and the | |
261 | // new record will be written to the next block. | |
262 | int block_size_deviation = 10; | |
263 | ||
264 | // Number of keys between restart points for delta encoding of keys. | |
265 | // This parameter can be changed dynamically. Most clients should | |
266 | // leave this parameter alone. The minimum value allowed is 1. Any smaller | |
267 | // value will be silently overwritten with 1. | |
268 | int block_restart_interval = 16; | |
269 | ||
270 | // Same as block_restart_interval but used for the index block. | |
271 | int index_block_restart_interval = 1; | |
272 | ||
273 | // Block size for partitioned metadata. Currently applied to indexes when | |
274 | // kTwoLevelIndexSearch is used and to filters when partition_filters is used. | |
275 | // Note: Since in the current implementation the filters and index partitions | |
11fdf7f2 | 276 | // are aligned, an index/filter block is created when either index or filter |
7c673cae FG |
277 | // block size reaches the specified limit. |
278 | // Note: this limit is currently applied to only index blocks; a filter | |
279 | // partition is cut right after an index block is cut | |
280 | // TODO(myabandeh): remove the note above when filter partitions are cut | |
281 | // separately | |
282 | uint64_t metadata_block_size = 4096; | |
283 | ||
284 | // Note: currently this option requires kTwoLevelIndexSearch to be set as | |
285 | // well. | |
286 | // TODO(myabandeh): remove the note above once the limitation is lifted | |
11fdf7f2 TL |
287 | // Use partitioned full filters for each SST file. This option is |
288 | // incompatible with block-based filters. | |
7c673cae FG |
289 | bool partition_filters = false; |
290 | ||
20effc67 TL |
291 | // EXPERIMENTAL Option to generate Bloom filters that minimize memory |
292 | // internal fragmentation. | |
293 | // | |
294 | // When false, malloc_usable_size is not available, or format_version < 5, | |
295 | // filters are generated without regard to internal fragmentation when | |
296 | // loaded into memory (historical behavior). When true (and | |
297 | // malloc_usable_size is available and format_version >= 5), then Bloom | |
298 | // filters are generated to "round up" and "round down" their sizes to | |
299 | // minimize internal fragmentation when loaded into memory, assuming the | |
300 | // reading DB has the same memory allocation characteristics as the | |
301 | // generating DB. This option does not break forward or backward | |
302 | // compatibility. | |
303 | // | |
304 | // While individual filters will vary in bits/key and false positive rate | |
305 | // when setting is true, the implementation attempts to maintain a weighted | |
306 | // average FP rate for filters consistent with this option set to false. | |
307 | // | |
308 | // With Jemalloc for example, this setting is expected to save about 10% of | |
309 | // the memory footprint and block cache charge of filters, while increasing | |
310 | // disk usage of filters by about 1-2% due to encoding efficiency losses | |
311 | // with variance in bits/key. | |
312 | // | |
313 | // NOTE: Because some memory counted by block cache might be unmapped pages | |
314 | // within internal fragmentation, this option can increase observed RSS | |
315 | // memory usage. With cache_index_and_filter_blocks=true, this option makes | |
316 | // the block cache better at using space it is allowed. | |
317 | // | |
318 | // NOTE: Do not set to true if you do not trust malloc_usable_size. With | |
319 | // this option, RocksDB might access an allocated memory object beyond its | |
320 | // original size if malloc_usable_size says it is safe to do so. While this | |
321 | // can be considered bad practice, it should not produce undefined behavior | |
322 | // unless malloc_usable_size is buggy or broken. | |
323 | bool optimize_filters_for_memory = false; | |
324 | ||
7c673cae FG |
325 | // Use delta encoding to compress keys in blocks. |
326 | // ReadOptions::pin_data requires this option to be disabled. | |
327 | // | |
328 | // Default: true | |
329 | bool use_delta_encoding = true; | |
330 | ||
331 | // If non-nullptr, use the specified filter policy to reduce disk reads. | |
332 | // Many applications will benefit from passing the result of | |
333 | // NewBloomFilterPolicy() here. | |
334 | std::shared_ptr<const FilterPolicy> filter_policy = nullptr; | |
335 | ||
336 | // If true, place whole keys in the filter (not just prefixes). | |
337 | // This must generally be true for gets to be efficient. | |
338 | bool whole_key_filtering = true; | |
339 | ||
340 | // Verify that decompressing the compressed block gives back the input. This | |
341 | // is a verification mode that we use to detect bugs in compression | |
342 | // algorithms. | |
343 | bool verify_compression = false; | |
344 | ||
345 | // If used, For every data block we load into memory, we will create a bitmap | |
346 | // of size ((block_size / `read_amp_bytes_per_bit`) / 8) bytes. This bitmap | |
347 | // will be used to figure out the percentage we actually read of the blocks. | |
348 | // | |
349 | // When this feature is used Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES and | |
350 | // Tickers::READ_AMP_TOTAL_READ_BYTES can be used to calculate the | |
351 | // read amplification using this formula | |
352 | // (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES) | |
353 | // | |
354 | // value => memory usage (percentage of loaded blocks memory) | |
355 | // 1 => 12.50 % | |
356 | // 2 => 06.25 % | |
357 | // 4 => 03.12 % | |
358 | // 8 => 01.56 % | |
359 | // 16 => 00.78 % | |
360 | // | |
361 | // Note: This number must be a power of 2, if not it will be sanitized | |
362 | // to be the next lowest power of 2, for example a value of 7 will be | |
363 | // treated as 4, a value of 19 will be treated as 16. | |
364 | // | |
365 | // Default: 0 (disabled) | |
366 | uint32_t read_amp_bytes_per_bit = 0; | |
367 | ||
494da23a | 368 | // We currently have five versions: |
7c673cae FG |
369 | // 0 -- This version is currently written out by all RocksDB's versions by |
370 | // default. Can be read by really old RocksDB's. Doesn't support changing | |
371 | // checksum (default is CRC32). | |
372 | // 1 -- Can be read by RocksDB's versions since 3.0. Supports non-default | |
373 | // checksum, like xxHash. It is written by RocksDB when | |
374 | // BlockBasedTableOptions::checksum is something other than kCRC32c. (version | |
375 | // 0 is silently upconverted) | |
376 | // 2 -- Can be read by RocksDB's versions since 3.10. Changes the way we | |
377 | // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you | |
378 | // don't plan to run RocksDB before version 3.10, you should probably use | |
379 | // this. | |
11fdf7f2 TL |
380 | // 3 -- Can be read by RocksDB's versions since 5.15. Changes the way we |
381 | // encode the keys in index blocks. If you don't plan to run RocksDB before | |
382 | // version 5.15, you should probably use this. | |
383 | // This option only affects newly written tables. When reading existing | |
384 | // tables, the information about version is read from the footer. | |
385 | // 4 -- Can be read by RocksDB's versions since 5.16. Changes the way we | |
386 | // encode the values in index blocks. If you don't plan to run RocksDB before | |
387 | // version 5.16 and you are using index_block_restart_interval > 1, you should | |
388 | // probably use this as it would reduce the index size. | |
389 | // This option only affects newly written tables. When reading existing | |
390 | // tables, the information about version is read from the footer. | |
f67539c2 TL |
391 | // 5 -- Can be read by RocksDB's versions since 6.6.0. Full and partitioned |
392 | // filters use a generally faster and more accurate Bloom filter | |
393 | // implementation, with a different schema. | |
20effc67 | 394 | uint32_t format_version = 4; |
11fdf7f2 TL |
395 | |
396 | // Store index blocks on disk in compressed format. Changing this option to | |
397 | // false will avoid the overhead of decompression if index blocks are evicted | |
398 | // and read back | |
399 | bool enable_index_compression = true; | |
400 | ||
401 | // Align data blocks on lesser of page size and block size | |
402 | bool block_align = false; | |
f67539c2 TL |
403 | |
404 | // This enum allows trading off increased index size for improved iterator | |
405 | // seek performance in some situations, particularly when block cache is | |
406 | // disabled (ReadOptions::fill_cache = false) and direct IO is | |
407 | // enabled (DBOptions::use_direct_reads = true). | |
408 | // The default mode is the best tradeoff for most use cases. | |
409 | // This option only affects newly written tables. | |
410 | // | |
411 | // The index contains a key separating each pair of consecutive blocks. | |
412 | // Let A be the highest key in one block, B the lowest key in the next block, | |
413 | // and I the index entry separating these two blocks: | |
414 | // [ ... A] I [B ...] | |
415 | // I is allowed to be anywhere in [A, B). | |
416 | // If an iterator is seeked to a key in (A, I], we'll unnecessarily read the | |
417 | // first block, then immediately fall through to the second block. | |
418 | // However, if I=A, this can't happen, and we'll read only the second block. | |
419 | // In kNoShortening mode, we use I=A. In other modes, we use the shortest | |
420 | // key in [A, B), which usually significantly reduces index size. | |
421 | // | |
422 | // There's a similar story for the last index entry, which is an upper bound | |
423 | // of the highest key in the file. If it's shortened and therefore | |
424 | // overestimated, iterator is likely to unnecessarily read the last data block | |
425 | // from each file on each seek. | |
426 | enum class IndexShorteningMode : char { | |
427 | // Use full keys. | |
428 | kNoShortening, | |
429 | // Shorten index keys between blocks, but use full key for the last index | |
430 | // key, which is the upper bound of the whole file. | |
431 | kShortenSeparators, | |
432 | // Shorten both keys between blocks and key after last block. | |
433 | kShortenSeparatorsAndSuccessor, | |
434 | }; | |
435 | ||
436 | IndexShorteningMode index_shortening = | |
437 | IndexShorteningMode::kShortenSeparators; | |
7c673cae FG |
438 | }; |
439 | ||
// Table Properties that are specific to block-based table properties.
struct BlockBasedTablePropertyNames {
  // The value of this property is a fixed int32 number.
  static const std::string kIndexType;
  // value is "1" for true and "0" for false.
  static const std::string kWholeKeyFiltering;
  // value is "1" for true and "0" for false.
  static const std::string kPrefixFiltering;
};
449 | ||
450 | // Create default block based table factory. | |
451 | extern TableFactory* NewBlockBasedTableFactory( | |
452 | const BlockBasedTableOptions& table_options = BlockBasedTableOptions()); | |
453 | ||
454 | #ifndef ROCKSDB_LITE | |
455 | ||
// Key encodings used by plain table files.
enum EncodingType : char {
  // Always write full keys without any special encoding.
  kPlain,
  // Find opportunity to write the same prefix once for multiple rows.
  // In some cases, when a key follows a previous key with the same prefix,
  // instead of writing out the full key, it just writes out the size of the
  // shared prefix, as well as other bytes, to save some bytes.
  //
  // When using this option, the user is required to use the same prefix
  // extractor to make sure the same prefix will be extracted from the same key.
  // The Name() value of the prefix extractor will be stored in the file. When
  // reopening the file, the name of the options.prefix_extractor given will be
  // bitwise compared to the prefix extractors stored in the file. An error
  // will be returned if the two don't match.
  kPrefix,
};
472 | ||
// Table Properties that are specific to plain table properties.
struct PlainTablePropertyNames {
  static const std::string kEncodingType;
  static const std::string kBloomVersion;
  static const std::string kNumBloomBlocks;
};
479 | ||
// Sentinel for `PlainTableOptions::user_key_len` meaning user keys have
// variable lengths (no fixed-key-size optimization).
const uint32_t kPlainTableVariableLength = 0;
481 | ||
482 | struct PlainTableOptions { | |
20effc67 | 483 | static const char* kName() { return "PlainTableOptions"; }; |
7c673cae FG |
484 | // @user_key_len: plain table has optimization for fix-sized keys, which can |
485 | // be specified via user_key_len. Alternatively, you can pass | |
486 | // `kPlainTableVariableLength` if your keys have variable | |
487 | // lengths. | |
488 | uint32_t user_key_len = kPlainTableVariableLength; | |
489 | ||
490 | // @bloom_bits_per_key: the number of bits used for bloom filer per prefix. | |
491 | // You may disable it by passing a zero. | |
492 | int bloom_bits_per_key = 10; | |
493 | ||
494 | // @hash_table_ratio: the desired utilization of the hash table used for | |
495 | // prefix hashing. | |
496 | // hash_table_ratio = number of prefixes / #buckets in the | |
497 | // hash table | |
498 | double hash_table_ratio = 0.75; | |
499 | ||
500 | // @index_sparseness: inside each prefix, need to build one index record for | |
501 | // how many keys for binary search inside each hash bucket. | |
502 | // For encoding type kPrefix, the value will be used when | |
503 | // writing to determine an interval to rewrite the full | |
504 | // key. It will also be used as a suggestion and satisfied | |
505 | // when possible. | |
506 | size_t index_sparseness = 16; | |
507 | ||
508 | // @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc. | |
509 | // Otherwise from huge page TLB. The user needs to | |
510 | // reserve huge pages for it to be allocated, like: | |
511 | // sysctl -w vm.nr_hugepages=20 | |
512 | // See linux doc Documentation/vm/hugetlbpage.txt | |
513 | size_t huge_page_tlb_size = 0; | |
514 | ||
515 | // @encoding_type: how to encode the keys. See enum EncodingType above for | |
516 | // the choices. The value will determine how to encode keys | |
517 | // when writing to a new SST file. This value will be stored | |
518 | // inside the SST file which will be used when reading from | |
519 | // the file, which makes it possible for users to choose | |
520 | // different encoding type when reopening a DB. Files with | |
521 | // different encoding types can co-exist in the same DB and | |
522 | // can be read. | |
523 | EncodingType encoding_type = kPlain; | |
524 | ||
525 | // @full_scan_mode: mode for reading the whole file one record by one without | |
526 | // using the index. | |
527 | bool full_scan_mode = false; | |
528 | ||
529 | // @store_index_in_file: compute plain table index and bloom filter during | |
530 | // file building and store it in file. When reading | |
531 | // file, index will be mmaped instead of recomputation. | |
532 | bool store_index_in_file = false; | |
533 | }; | |
534 | ||
535 | // -- Plain Table with prefix-only seek | |
494da23a TL |
536 | // For this factory, you need to set Options.prefix_extractor properly to make |
537 | // it work. Look-up will starts with prefix hash lookup for key prefix. Inside | |
538 | // the hash bucket found, a binary search is executed for hash conflicts. | |
539 | // Finally, a linear search is used. | |
7c673cae | 540 | |
494da23a TL |
541 | extern TableFactory* NewPlainTableFactory( |
542 | const PlainTableOptions& options = PlainTableOptions()); | |
7c673cae FG |
543 | |
struct CuckooTablePropertyNames {
  // The key that is used to fill empty buckets.
  static const std::string kEmptyKey;
  // Fixed length of value.
  static const std::string kValueLength;
  // Number of hash functions used in Cuckoo Hash.
  static const std::string kNumHashFunc;
  // It denotes the number of buckets in a Cuckoo Block. Given a key and a
  // particular hash function, a Cuckoo Block is a set of consecutive buckets,
  // where starting bucket id is given by the hash function on the key. In case
  // of a collision during inserting the key, the builder tries to insert the
  // key in other locations of the cuckoo block before using the next hash
  // function. This reduces cache miss during read operation in case of
  // collision.
  static const std::string kCuckooBlockSize;
  // Size of the hash table. Use this number to compute the modulo of hash
  // function. The actual number of buckets will be kMaxHashTableSize +
  // kCuckooBlockSize - 1. The last kCuckooBlockSize-1 buckets are used to
  // accommodate the Cuckoo Block from end of hash table, due to cache friendly
  // implementation.
  static const std::string kHashTableSize;
  // Denotes if the key sorted in the file is Internal Key (if false)
  // or User Key only (if true).
  static const std::string kIsLastLevel;
  // Indicate if using identity function for the first hash function.
  static const std::string kIdentityAsFirstHash;
  // Indicate if using module or bit and to calculate hash value
  static const std::string kUseModuleHash;
  // Fixed user key length
  static const std::string kUserKeyLength;
};
575 | ||
struct CuckooTableOptions {
  // Name under which these options are registered with the Configurable /
  // Customizable framework.
  static const char* kName() { return "CuckooTableOptions"; }

  // Determines the utilization of hash tables. Smaller values
  // result in larger hash tables with fewer collisions.
  double hash_table_ratio = 0.9;
  // A property used by builder to determine the depth to go to
  // to search for a path to displace elements in case of
  // collision. See Builder.MakeSpaceForKey method. Higher
  // values result in more efficient hash tables with fewer
  // lookups but take more time to build.
  uint32_t max_search_depth = 100;
  // In case of collision while inserting, the builder
  // attempts to insert in the next cuckoo_block_size
  // locations before skipping over to the next Cuckoo hash
  // function. This makes lookups more cache friendly in case
  // of collisions.
  uint32_t cuckoo_block_size = 5;
  // If this option is enabled, user key is treated as uint64_t and its value
  // is used as hash value directly. This option changes builder's behavior.
  // Reader ignore this option and behave according to what specified in table
  // property.
  bool identity_as_first_hash = false;
  // If this option is set to true, module is used during hash calculation.
  // This often yields better space efficiency at the cost of performance.
  // If this option is set to false, # of entries in table is constrained to be
  // power of two, and bit and is used to calculate hash, which is faster in
  // general.
  bool use_module_hash = true;
};
606 | ||
// Cuckoo Table Factory for SST table format using Cache Friendly Cuckoo
// Hashing. See CuckooTableOptions for the tunable parameters; the default
// options are used when none are supplied. Ownership of the returned
// factory passes to the caller.
extern TableFactory* NewCuckooTableFactory(
    const CuckooTableOptions& table_options = CuckooTableOptions());

#endif  // ROCKSDB_LITE
612 | ||
613 | class RandomAccessFileReader; | |
614 | ||
615 | // A base class for table factories. | |
20effc67 | 616 | class TableFactory : public Customizable { |
7c673cae | 617 | public: |
20effc67 | 618 | virtual ~TableFactory() override {} |
7c673cae | 619 | |
20effc67 TL |
620 | static const char* kBlockCacheOpts() { return "BlockCache"; }; |
621 | static const char* kBlockBasedTableName() { return "BlockBasedTable"; }; | |
622 | static const char* kPlainTableName() { return "PlainTable"; } | |
623 | static const char* kCuckooTableName() { return "CuckooTable"; }; | |
624 | ||
625 | // Creates and configures a new TableFactory from the input options and id. | |
626 | static Status CreateFromString(const ConfigOptions& config_options, | |
627 | const std::string& id, | |
628 | std::shared_ptr<TableFactory>* factory); | |
629 | ||
630 | static const char* Type() { return "TableFactory"; } | |
7c673cae FG |
631 | |
632 | // Returns a Table object table that can fetch data from file specified | |
633 | // in parameter file. It's the caller's responsibility to make sure | |
634 | // file is in the correct format. | |
635 | // | |
636 | // NewTableReader() is called in three places: | |
637 | // (1) TableCache::FindTable() calls the function when table cache miss | |
638 | // and cache the table object returned. | |
494da23a | 639 | // (2) SstFileDumper (for SST Dump) opens the table and dump the table |
11fdf7f2 | 640 | // contents using the iterator of the table. |
494da23a TL |
641 | // (3) DBImpl::IngestExternalFile() calls this function to read the contents |
642 | // of the sst file it's attempting to add | |
7c673cae FG |
643 | // |
644 | // table_reader_options is a TableReaderOptions which contain all the | |
645 | // needed parameters and configuration to open the table. | |
646 | // file is a file handler to handle the file for the table. | |
647 | // file_size is the physical file size of the file. | |
648 | // table_reader is the output table reader. | |
649 | virtual Status NewTableReader( | |
650 | const TableReaderOptions& table_reader_options, | |
494da23a TL |
651 | std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, |
652 | std::unique_ptr<TableReader>* table_reader, | |
20effc67 TL |
653 | bool prefetch_index_and_filter_in_cache = true) const { |
654 | ReadOptions ro; | |
655 | return NewTableReader(ro, table_reader_options, std::move(file), file_size, | |
656 | table_reader, prefetch_index_and_filter_in_cache); | |
657 | } | |
658 | ||
659 | // Overload of the above function that allows the caller to pass in a | |
660 | // ReadOptions | |
661 | virtual Status NewTableReader( | |
662 | const ReadOptions& ro, const TableReaderOptions& table_reader_options, | |
663 | std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, | |
664 | std::unique_ptr<TableReader>* table_reader, | |
665 | bool prefetch_index_and_filter_in_cache) const = 0; | |
7c673cae FG |
666 | |
667 | // Return a table builder to write to a file for this table type. | |
668 | // | |
669 | // It is called in several places: | |
670 | // (1) When flushing memtable to a level-0 output file, it creates a table | |
671 | // builder (In DBImpl::WriteLevel0Table(), by calling BuildTable()) | |
672 | // (2) During compaction, it gets the builder for writing compaction output | |
673 | // files in DBImpl::OpenCompactionOutputFile(). | |
674 | // (3) When recovering from transaction logs, it creates a table builder to | |
675 | // write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery, | |
676 | // by calling BuildTable()) | |
677 | // (4) When running Repairer, it creates a table builder to convert logs to | |
678 | // SST files (In Repairer::ConvertLogToTable() by calling BuildTable()) | |
679 | // | |
11fdf7f2 | 680 | // Multiple configured can be accessed from there, including and not limited |
7c673cae FG |
681 | // to compression options. file is a handle of a writable file. |
682 | // It is the caller's responsibility to keep the file open and close the file | |
683 | // after closing the table builder. compression_type is the compression type | |
684 | // to use in this table. | |
685 | virtual TableBuilder* NewTableBuilder( | |
686 | const TableBuilderOptions& table_builder_options, | |
687 | uint32_t column_family_id, WritableFileWriter* file) const = 0; | |
688 | ||
11fdf7f2 TL |
689 | // Return is delete range supported |
690 | virtual bool IsDeleteRangeSupported() const { return false; } | |
7c673cae FG |
691 | }; |
692 | ||
#ifndef ROCKSDB_LITE
// Create a special table factory that can open either of the supported
// table formats, based on the setting inside the SST files. It should be used
// to convert a DB from one table format to another. Ownership of the returned
// factory passes to the caller.
// @table_factory_to_write: the table factory used when writing to new files.
// @block_based_table_factory: block based table factory to use. If NULL, use
//                             a default one.
// @plain_table_factory: plain table factory to use. If NULL, use a default
//                       one.
// @cuckoo_table_factory: cuckoo table factory to use. If NULL, use a default
//                        one.
extern TableFactory* NewAdaptiveTableFactory(
    std::shared_ptr<TableFactory> table_factory_to_write = nullptr,
    std::shared_ptr<TableFactory> block_based_table_factory = nullptr,
    std::shared_ptr<TableFactory> plain_table_factory = nullptr,
    std::shared_ptr<TableFactory> cuckoo_table_factory = nullptr);

#endif  // ROCKSDB_LITE
710 | ||
f67539c2 | 711 | } // namespace ROCKSDB_NAMESPACE |