]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
2 | // Use of this source code is governed by a BSD-style license that can be | |
3 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
4 | // | |
5 | // Currently we support two types of tables: plain table and block-based table. | |
6 | // 1. Block-based table: this is the default table type that we inherited from | |
7 | // LevelDB, which was designed for storing data in hard disk or flash | |
8 | // device. | |
9 | // 2. Plain table: it is one of RocksDB's SST file format optimized | |
10 | // for low query latency on pure-memory or really low-latency media. | |
11 | // | |
12 | // A tutorial of rocksdb table formats is available here: | |
13 | // https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats | |
14 | // | |
15 | // Example code is also available | |
16 | // https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples | |
17 | ||
18 | #pragma once | |
11fdf7f2 | 19 | |
7c673cae FG |
20 | #include <memory> |
21 | #include <string> | |
22 | #include <unordered_map> | |
23 | ||
24 | #include "rocksdb/cache.h" | |
25 | #include "rocksdb/env.h" | |
26 | #include "rocksdb/iterator.h" | |
27 | #include "rocksdb/options.h" | |
28 | #include "rocksdb/status.h" | |
29 | ||
30 | namespace rocksdb { | |
31 | ||
32 | // -- Block-based Table | |
33 | class FlushBlockPolicyFactory; | |
34 | class PersistentCache; | |
35 | class RandomAccessFile; | |
36 | struct TableReaderOptions; | |
37 | struct TableBuilderOptions; | |
38 | class TableBuilder; | |
39 | class TableReader; | |
40 | class WritableFileWriter; | |
41 | struct EnvOptions; | |
42 | struct Options; | |
43 | ||
7c673cae | 44 | enum ChecksumType : char { |
11fdf7f2 | 45 | kNoChecksum = 0x0, |
7c673cae FG |
46 | kCRC32c = 0x1, |
47 | kxxHash = 0x2, | |
494da23a | 48 | kxxHash64 = 0x3, |
7c673cae FG |
49 | }; |
50 | ||
51 | // For advanced user only | |
52 | struct BlockBasedTableOptions { | |
53 | // @flush_block_policy_factory creates the instances of flush block policy. | |
54 | // which provides a configurable way to determine when to flush a block in | |
55 | // the block based tables. If not set, table builder will use the default | |
56 | // block flush policy, which cut blocks by block size (please refer to | |
57 | // `FlushBlockBySizePolicy`). | |
58 | std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory; | |
59 | ||
60 | // TODO(kailiu) Temporarily disable this feature by making the default value | |
61 | // to be false. | |
62 | // | |
494da23a TL |
63 | // TODO(ajkr) we need to update names of variables controlling meta-block |
64 | // caching as they should now apply to range tombstone and compression | |
65 | // dictionary meta-blocks, in addition to index and filter meta-blocks. | |
66 | // | |
7c673cae FG |
67 | // Indicating if we'd put index/filter blocks to the block cache. |
68 | // If not specified, each "table reader" object will pre-load index/filter | |
69 | // block during table initialization. | |
70 | bool cache_index_and_filter_blocks = false; | |
71 | ||
72 | // If cache_index_and_filter_blocks is enabled, cache index and filter | |
73 | // blocks with high priority. If set to true, depending on implementation of | |
11fdf7f2 | 74 | // block cache, index and filter blocks may be less likely to be evicted |
7c673cae FG |
75 | // than data blocks. |
76 | bool cache_index_and_filter_blocks_with_high_priority = false; | |
77 | ||
78 | // if cache_index_and_filter_blocks is true and the below is true, then | |
79 | // filter and index blocks are stored in the cache, but a reference is | |
80 | // held in the "table reader" object so the blocks are pinned and only | |
81 | // evicted from cache when the table reader is freed. | |
82 | bool pin_l0_filter_and_index_blocks_in_cache = false; | |
83 | ||
11fdf7f2 TL |
84 | // If cache_index_and_filter_blocks is true and the below is true, then |
85 | // the top-level index of partitioned filter and index blocks are stored in | |
86 | // the cache, but a reference is held in the "table reader" object so the | |
87 | // blocks are pinned and only evicted from cache when the table reader is | |
88 | // freed. This is not limited to l0 in LSM tree. | |
89 | bool pin_top_level_index_and_filter = true; | |
90 | ||
7c673cae FG |
91 | // The index type that will be used for this table. |
92 | enum IndexType : char { | |
93 | // A space efficient index block that is optimized for | |
94 | // binary-search-based index. | |
95 | kBinarySearch, | |
96 | ||
97 | // The hash index, if enabled, will do the hash lookup when | |
98 | // `Options.prefix_extractor` is provided. | |
99 | kHashSearch, | |
100 | ||
7c673cae FG |
101 | // A two-level index implementation. Both levels are binary search indexes. |
102 | kTwoLevelIndexSearch, | |
103 | }; | |
104 | ||
105 | IndexType index_type = kBinarySearch; | |
106 | ||
11fdf7f2 TL |
107 | // The index type that will be used for the data block. |
108 | enum DataBlockIndexType : char { | |
109 | kDataBlockBinarySearch = 0, // traditional block type | |
110 | kDataBlockBinaryAndHash = 1, // additional hash index | |
111 | }; | |
112 | ||
113 | DataBlockIndexType data_block_index_type = kDataBlockBinarySearch; | |
114 | ||
115 | // #entries/#buckets. It is valid only when data_block_hash_index_type is | |
116 | // kDataBlockBinaryAndHash. | |
117 | double data_block_hash_table_util_ratio = 0.75; | |
118 | ||
7c673cae FG |
119 | // This option is now deprecated. No matter what value it is set to, |
120 | // it will behave as if hash_index_allow_collision=true. | |
121 | bool hash_index_allow_collision = true; | |
122 | ||
123 | // Use the specified checksum type. Newly created table files will be | |
124 | // protected with this checksum type. Old table files will still be readable, | |
125 | // even though they have different checksum type. | |
126 | ChecksumType checksum = kCRC32c; | |
127 | ||
128 | // Disable block cache. If this is set to true, | |
129 | // then no block cache should be used, and the block_cache should | |
130 | // point to a nullptr object. | |
131 | bool no_block_cache = false; | |
132 | ||
133 | // If non-NULL use the specified cache for blocks. | |
134 | // If NULL, rocksdb will automatically create and use an 8MB internal cache. | |
135 | std::shared_ptr<Cache> block_cache = nullptr; | |
136 | ||
137 | // If non-NULL use the specified cache for pages read from device | |
138 | // IF NULL, no page cache is used | |
139 | std::shared_ptr<PersistentCache> persistent_cache = nullptr; | |
140 | ||
141 | // If non-NULL use the specified cache for compressed blocks. | |
142 | // If NULL, rocksdb will not use a compressed block cache. | |
494da23a TL |
143 | // Note: though it looks similar to `block_cache`, RocksDB doesn't put the |
144 | // same type of object there. | |
7c673cae FG |
145 | std::shared_ptr<Cache> block_cache_compressed = nullptr; |
146 | ||
147 | // Approximate size of user data packed per block. Note that the | |
148 | // block size specified here corresponds to uncompressed data. The | |
149 | // actual size of the unit read from disk may be smaller if | |
150 | // compression is enabled. This parameter can be changed dynamically. | |
151 | size_t block_size = 4 * 1024; | |
152 | ||
153 | // This is used to close a block before it reaches the configured | |
154 | // 'block_size'. If the percentage of free space in the current block is less | |
155 | // than this specified number and adding a new record to the block will | |
156 | // exceed the configured block size, then this block will be closed and the | |
157 | // new record will be written to the next block. | |
158 | int block_size_deviation = 10; | |
159 | ||
160 | // Number of keys between restart points for delta encoding of keys. | |
161 | // This parameter can be changed dynamically. Most clients should | |
162 | // leave this parameter alone. The minimum value allowed is 1. Any smaller | |
163 | // value will be silently overwritten with 1. | |
164 | int block_restart_interval = 16; | |
165 | ||
166 | // Same as block_restart_interval but used for the index block. | |
167 | int index_block_restart_interval = 1; | |
168 | ||
169 | // Block size for partitioned metadata. Currently applied to indexes when | |
170 | // kTwoLevelIndexSearch is used and to filters when partition_filters is used. | |
171 | // Note: Since in the current implementation the filters and index partitions | |
11fdf7f2 | 172 | // are aligned, an index/filter block is created when either index or filter |
7c673cae FG |
173 | // block size reaches the specified limit. |
174 | // Note: this limit is currently applied to only index blocks; a filter | |
175 | // partition is cut right after an index block is cut | |
176 | // TODO(myabandeh): remove the note above when filter partitions are cut | |
177 | // separately | |
178 | uint64_t metadata_block_size = 4096; | |
179 | ||
180 | // Note: currently this option requires kTwoLevelIndexSearch to be set as | |
181 | // well. | |
182 | // TODO(myabandeh): remove the note above once the limitation is lifted | |
11fdf7f2 TL |
183 | // Use partitioned full filters for each SST file. This option is |
184 | // incompatible with block-based filters. | |
7c673cae FG |
185 | bool partition_filters = false; |
186 | ||
187 | // Use delta encoding to compress keys in blocks. | |
188 | // ReadOptions::pin_data requires this option to be disabled. | |
189 | // | |
190 | // Default: true | |
191 | bool use_delta_encoding = true; | |
192 | ||
193 | // If non-nullptr, use the specified filter policy to reduce disk reads. | |
194 | // Many applications will benefit from passing the result of | |
195 | // NewBloomFilterPolicy() here. | |
196 | std::shared_ptr<const FilterPolicy> filter_policy = nullptr; | |
197 | ||
198 | // If true, place whole keys in the filter (not just prefixes). | |
199 | // This must generally be true for gets to be efficient. | |
200 | bool whole_key_filtering = true; | |
201 | ||
202 | // Verify that decompressing the compressed block gives back the input. This | |
203 | // is a verification mode that we use to detect bugs in compression | |
204 | // algorithms. | |
205 | bool verify_compression = false; | |
206 | ||
207 | // If used, For every data block we load into memory, we will create a bitmap | |
208 | // of size ((block_size / `read_amp_bytes_per_bit`) / 8) bytes. This bitmap | |
209 | // will be used to figure out the percentage we actually read of the blocks. | |
210 | // | |
211 | // When this feature is used Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES and | |
212 | // Tickers::READ_AMP_TOTAL_READ_BYTES can be used to calculate the | |
213 | // read amplification using this formula | |
214 | // (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES) | |
215 | // | |
216 | // value => memory usage (percentage of loaded blocks memory) | |
217 | // 1 => 12.50 % | |
218 | // 2 => 06.25 % | |
219 | // 4 => 03.12 % | |
220 | // 8 => 01.56 % | |
221 | // 16 => 00.78 % | |
222 | // | |
223 | // Note: This number must be a power of 2, if not it will be sanitized | |
224 | // to be the next lowest power of 2, for example a value of 7 will be | |
225 | // treated as 4, a value of 19 will be treated as 16. | |
226 | // | |
227 | // Default: 0 (disabled) | |
228 | uint32_t read_amp_bytes_per_bit = 0; | |
229 | ||
494da23a | 230 | // We currently have five versions: |
7c673cae FG |
231 | // 0 -- This version is currently written out by all RocksDB's versions by |
232 | // default. Can be read by really old RocksDB's. Doesn't support changing | |
233 | // checksum (default is CRC32). | |
234 | // 1 -- Can be read by RocksDB's versions since 3.0. Supports non-default | |
235 | // checksum, like xxHash. It is written by RocksDB when | |
236 | // BlockBasedTableOptions::checksum is something other than kCRC32c. (version | |
237 | // 0 is silently upconverted) | |
238 | // 2 -- Can be read by RocksDB's versions since 3.10. Changes the way we | |
239 | // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you | |
240 | // don't plan to run RocksDB before version 3.10, you should probably use | |
241 | // this. | |
11fdf7f2 TL |
242 | // 3 -- Can be read by RocksDB's versions since 5.15. Changes the way we |
243 | // encode the keys in index blocks. If you don't plan to run RocksDB before | |
244 | // version 5.15, you should probably use this. | |
245 | // This option only affects newly written tables. When reading existing | |
246 | // tables, the information about version is read from the footer. | |
247 | // 4 -- Can be read by RocksDB's versions since 5.16. Changes the way we | |
248 | // encode the values in index blocks. If you don't plan to run RocksDB before | |
249 | // version 5.16 and you are using index_block_restart_interval > 1, you should | |
250 | // probably use this as it would reduce the index size. | |
251 | // This option only affects newly written tables. When reading existing | |
252 | // tables, the information about version is read from the footer. | |
7c673cae | 253 | uint32_t format_version = 2; |
11fdf7f2 TL |
254 | |
255 | // Store index blocks on disk in compressed format. Changing this option to | |
256 | // false will avoid the overhead of decompression if index blocks are evicted | |
257 | // and read back | |
258 | bool enable_index_compression = true; | |
259 | ||
260 | // Align data blocks on lesser of page size and block size | |
261 | bool block_align = false; | |
7c673cae FG |
262 | }; |
263 | ||
264 | // Table Properties that are specific to block-based table properties. | |
265 | struct BlockBasedTablePropertyNames { | |
11fdf7f2 | 266 | // value of this properties is a fixed int32 number. |
7c673cae FG |
267 | static const std::string kIndexType; |
268 | // value is "1" for true and "0" for false. | |
269 | static const std::string kWholeKeyFiltering; | |
270 | // value is "1" for true and "0" for false. | |
271 | static const std::string kPrefixFiltering; | |
272 | }; | |
273 | ||
274 | // Create default block based table factory. | |
275 | extern TableFactory* NewBlockBasedTableFactory( | |
276 | const BlockBasedTableOptions& table_options = BlockBasedTableOptions()); | |
277 | ||
278 | #ifndef ROCKSDB_LITE | |
279 | ||
280 | enum EncodingType : char { | |
281 | // Always write full keys without any special encoding. | |
282 | kPlain, | |
283 | // Find opportunity to write the same prefix once for multiple rows. | |
284 | // In some cases, when a key follows a previous key with the same prefix, | |
285 | // instead of writing out the full key, it just writes out the size of the | |
286 | // shared prefix, as well as other bytes, to save some bytes. | |
287 | // | |
288 | // When using this option, the user is required to use the same prefix | |
289 | // extractor to make sure the same prefix will be extracted from the same key. | |
290 | // The Name() value of the prefix extractor will be stored in the file. When | |
291 | // reopening the file, the name of the options.prefix_extractor given will be | |
292 | // bitwise compared to the prefix extractors stored in the file. An error | |
293 | // will be returned if the two don't match. | |
294 | kPrefix, | |
295 | }; | |
296 | ||
297 | // Table Properties that are specific to plain table properties. | |
298 | struct PlainTablePropertyNames { | |
299 | static const std::string kEncodingType; | |
300 | static const std::string kBloomVersion; | |
301 | static const std::string kNumBloomBlocks; | |
302 | }; | |
303 | ||
304 | const uint32_t kPlainTableVariableLength = 0; | |
305 | ||
306 | struct PlainTableOptions { | |
307 | // @user_key_len: plain table has optimization for fix-sized keys, which can | |
308 | // be specified via user_key_len. Alternatively, you can pass | |
309 | // `kPlainTableVariableLength` if your keys have variable | |
310 | // lengths. | |
311 | uint32_t user_key_len = kPlainTableVariableLength; | |
312 | ||
313 | // @bloom_bits_per_key: the number of bits used for bloom filer per prefix. | |
314 | // You may disable it by passing a zero. | |
315 | int bloom_bits_per_key = 10; | |
316 | ||
317 | // @hash_table_ratio: the desired utilization of the hash table used for | |
318 | // prefix hashing. | |
319 | // hash_table_ratio = number of prefixes / #buckets in the | |
320 | // hash table | |
321 | double hash_table_ratio = 0.75; | |
322 | ||
323 | // @index_sparseness: inside each prefix, need to build one index record for | |
324 | // how many keys for binary search inside each hash bucket. | |
325 | // For encoding type kPrefix, the value will be used when | |
326 | // writing to determine an interval to rewrite the full | |
327 | // key. It will also be used as a suggestion and satisfied | |
328 | // when possible. | |
329 | size_t index_sparseness = 16; | |
330 | ||
331 | // @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc. | |
332 | // Otherwise from huge page TLB. The user needs to | |
333 | // reserve huge pages for it to be allocated, like: | |
334 | // sysctl -w vm.nr_hugepages=20 | |
335 | // See linux doc Documentation/vm/hugetlbpage.txt | |
336 | size_t huge_page_tlb_size = 0; | |
337 | ||
338 | // @encoding_type: how to encode the keys. See enum EncodingType above for | |
339 | // the choices. The value will determine how to encode keys | |
340 | // when writing to a new SST file. This value will be stored | |
341 | // inside the SST file which will be used when reading from | |
342 | // the file, which makes it possible for users to choose | |
343 | // different encoding type when reopening a DB. Files with | |
344 | // different encoding types can co-exist in the same DB and | |
345 | // can be read. | |
346 | EncodingType encoding_type = kPlain; | |
347 | ||
348 | // @full_scan_mode: mode for reading the whole file one record by one without | |
349 | // using the index. | |
350 | bool full_scan_mode = false; | |
351 | ||
352 | // @store_index_in_file: compute plain table index and bloom filter during | |
353 | // file building and store it in file. When reading | |
354 | // file, index will be mmaped instead of recomputation. | |
355 | bool store_index_in_file = false; | |
356 | }; | |
357 | ||
358 | // -- Plain Table with prefix-only seek | |
494da23a TL |
359 | // For this factory, you need to set Options.prefix_extractor properly to make |
360 | // it work. Look-up will starts with prefix hash lookup for key prefix. Inside | |
361 | // the hash bucket found, a binary search is executed for hash conflicts. | |
362 | // Finally, a linear search is used. | |
7c673cae | 363 | |
494da23a TL |
364 | extern TableFactory* NewPlainTableFactory( |
365 | const PlainTableOptions& options = PlainTableOptions()); | |
7c673cae FG |
366 | |
367 | struct CuckooTablePropertyNames { | |
368 | // The key that is used to fill empty buckets. | |
369 | static const std::string kEmptyKey; | |
370 | // Fixed length of value. | |
371 | static const std::string kValueLength; | |
372 | // Number of hash functions used in Cuckoo Hash. | |
373 | static const std::string kNumHashFunc; | |
374 | // It denotes the number of buckets in a Cuckoo Block. Given a key and a | |
375 | // particular hash function, a Cuckoo Block is a set of consecutive buckets, | |
376 | // where starting bucket id is given by the hash function on the key. In case | |
377 | // of a collision during inserting the key, the builder tries to insert the | |
378 | // key in other locations of the cuckoo block before using the next hash | |
379 | // function. This reduces cache miss during read operation in case of | |
380 | // collision. | |
381 | static const std::string kCuckooBlockSize; | |
382 | // Size of the hash table. Use this number to compute the modulo of hash | |
383 | // function. The actual number of buckets will be kMaxHashTableSize + | |
384 | // kCuckooBlockSize - 1. The last kCuckooBlockSize-1 buckets are used to | |
385 | // accommodate the Cuckoo Block from end of hash table, due to cache friendly | |
386 | // implementation. | |
387 | static const std::string kHashTableSize; | |
388 | // Denotes if the key sorted in the file is Internal Key (if false) | |
389 | // or User Key only (if true). | |
390 | static const std::string kIsLastLevel; | |
391 | // Indicate if using identity function for the first hash function. | |
392 | static const std::string kIdentityAsFirstHash; | |
393 | // Indicate if using module or bit and to calculate hash value | |
394 | static const std::string kUseModuleHash; | |
395 | // Fixed user key length | |
396 | static const std::string kUserKeyLength; | |
397 | }; | |
398 | ||
399 | struct CuckooTableOptions { | |
400 | // Determines the utilization of hash tables. Smaller values | |
401 | // result in larger hash tables with fewer collisions. | |
402 | double hash_table_ratio = 0.9; | |
403 | // A property used by builder to determine the depth to go to | |
404 | // to search for a path to displace elements in case of | |
405 | // collision. See Builder.MakeSpaceForKey method. Higher | |
406 | // values result in more efficient hash tables with fewer | |
407 | // lookups but take more time to build. | |
408 | uint32_t max_search_depth = 100; | |
409 | // In case of collision while inserting, the builder | |
410 | // attempts to insert in the next cuckoo_block_size | |
411 | // locations before skipping over to the next Cuckoo hash | |
412 | // function. This makes lookups more cache friendly in case | |
413 | // of collisions. | |
414 | uint32_t cuckoo_block_size = 5; | |
415 | // If this option is enabled, user key is treated as uint64_t and its value | |
416 | // is used as hash value directly. This option changes builder's behavior. | |
417 | // Reader ignore this option and behave according to what specified in table | |
418 | // property. | |
419 | bool identity_as_first_hash = false; | |
420 | // If this option is set to true, module is used during hash calculation. | |
421 | // This often yields better space efficiency at the cost of performance. | |
11fdf7f2 | 422 | // If this option is set to false, # of entries in table is constrained to be |
7c673cae FG |
423 | // power of two, and bit and is used to calculate hash, which is faster in |
424 | // general. | |
425 | bool use_module_hash = true; | |
426 | }; | |
427 | ||
428 | // Cuckoo Table Factory for SST table format using Cache Friendly Cuckoo Hashing | |
429 | extern TableFactory* NewCuckooTableFactory( | |
430 | const CuckooTableOptions& table_options = CuckooTableOptions()); | |
431 | ||
432 | #endif // ROCKSDB_LITE | |
433 | ||
434 | class RandomAccessFileReader; | |
435 | ||
436 | // A base class for table factories. | |
437 | class TableFactory { | |
438 | public: | |
439 | virtual ~TableFactory() {} | |
440 | ||
441 | // The type of the table. | |
442 | // | |
443 | // The client of this package should switch to a new name whenever | |
444 | // the table format implementation changes. | |
445 | // | |
446 | // Names starting with "rocksdb." are reserved and should not be used | |
447 | // by any clients of this package. | |
448 | virtual const char* Name() const = 0; | |
449 | ||
450 | // Returns a Table object table that can fetch data from file specified | |
451 | // in parameter file. It's the caller's responsibility to make sure | |
452 | // file is in the correct format. | |
453 | // | |
454 | // NewTableReader() is called in three places: | |
455 | // (1) TableCache::FindTable() calls the function when table cache miss | |
456 | // and cache the table object returned. | |
494da23a | 457 | // (2) SstFileDumper (for SST Dump) opens the table and dump the table |
11fdf7f2 | 458 | // contents using the iterator of the table. |
494da23a TL |
459 | // (3) DBImpl::IngestExternalFile() calls this function to read the contents |
460 | // of the sst file it's attempting to add | |
7c673cae FG |
461 | // |
462 | // table_reader_options is a TableReaderOptions which contain all the | |
463 | // needed parameters and configuration to open the table. | |
464 | // file is a file handler to handle the file for the table. | |
465 | // file_size is the physical file size of the file. | |
466 | // table_reader is the output table reader. | |
467 | virtual Status NewTableReader( | |
468 | const TableReaderOptions& table_reader_options, | |
494da23a TL |
469 | std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, |
470 | std::unique_ptr<TableReader>* table_reader, | |
7c673cae FG |
471 | bool prefetch_index_and_filter_in_cache = true) const = 0; |
472 | ||
473 | // Return a table builder to write to a file for this table type. | |
474 | // | |
475 | // It is called in several places: | |
476 | // (1) When flushing memtable to a level-0 output file, it creates a table | |
477 | // builder (In DBImpl::WriteLevel0Table(), by calling BuildTable()) | |
478 | // (2) During compaction, it gets the builder for writing compaction output | |
479 | // files in DBImpl::OpenCompactionOutputFile(). | |
480 | // (3) When recovering from transaction logs, it creates a table builder to | |
481 | // write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery, | |
482 | // by calling BuildTable()) | |
483 | // (4) When running Repairer, it creates a table builder to convert logs to | |
484 | // SST files (In Repairer::ConvertLogToTable() by calling BuildTable()) | |
485 | // | |
11fdf7f2 | 486 | // Multiple configured can be accessed from there, including and not limited |
7c673cae FG |
487 | // to compression options. file is a handle of a writable file. |
488 | // It is the caller's responsibility to keep the file open and close the file | |
489 | // after closing the table builder. compression_type is the compression type | |
490 | // to use in this table. | |
491 | virtual TableBuilder* NewTableBuilder( | |
492 | const TableBuilderOptions& table_builder_options, | |
493 | uint32_t column_family_id, WritableFileWriter* file) const = 0; | |
494 | ||
495 | // Sanitizes the specified DB Options and ColumnFamilyOptions. | |
496 | // | |
497 | // If the function cannot find a way to sanitize the input DB Options, | |
498 | // a non-ok Status will be returned. | |
494da23a TL |
499 | virtual Status SanitizeOptions(const DBOptions& db_opts, |
500 | const ColumnFamilyOptions& cf_opts) const = 0; | |
7c673cae FG |
501 | |
502 | // Return a string that contains printable format of table configurations. | |
503 | // RocksDB prints configurations at DB Open(). | |
504 | virtual std::string GetPrintableTableOptions() const = 0; | |
505 | ||
11fdf7f2 TL |
506 | virtual Status GetOptionString(std::string* /*opt_string*/, |
507 | const std::string& /*delimiter*/) const { | |
508 | return Status::NotSupported( | |
509 | "The table factory doesn't implement GetOptionString()."); | |
510 | } | |
511 | ||
7c673cae FG |
512 | // Returns the raw pointer of the table options that is used by this |
513 | // TableFactory, or nullptr if this function is not supported. | |
514 | // Since the return value is a raw pointer, the TableFactory owns the | |
515 | // pointer and the caller should not delete the pointer. | |
516 | // | |
11fdf7f2 | 517 | // In certain case, it is desirable to alter the underlying options when the |
7c673cae FG |
518 | // TableFactory is not used by any open DB by casting the returned pointer |
519 | // to the right class. For instance, if BlockBasedTableFactory is used, | |
520 | // then the pointer can be casted to BlockBasedTableOptions. | |
521 | // | |
522 | // Note that changing the underlying TableFactory options while the | |
523 | // TableFactory is currently used by any open DB is undefined behavior. | |
524 | // Developers should use DB::SetOption() instead to dynamically change | |
525 | // options while the DB is open. | |
526 | virtual void* GetOptions() { return nullptr; } | |
11fdf7f2 TL |
527 | |
528 | // Return is delete range supported | |
529 | virtual bool IsDeleteRangeSupported() const { return false; } | |
7c673cae FG |
530 | }; |
531 | ||
532 | #ifndef ROCKSDB_LITE | |
533 | // Create a special table factory that can open either of the supported | |
534 | // table formats, based on setting inside the SST files. It should be used to | |
535 | // convert a DB from one table format to another. | |
536 | // @table_factory_to_write: the table factory used when writing to new files. | |
537 | // @block_based_table_factory: block based table factory to use. If NULL, use | |
538 | // a default one. | |
539 | // @plain_table_factory: plain table factory to use. If NULL, use a default one. | |
494da23a TL |
540 | // @cuckoo_table_factory: cuckoo table factory to use. If NULL, use a default |
541 | // one. | |
7c673cae FG |
542 | extern TableFactory* NewAdaptiveTableFactory( |
543 | std::shared_ptr<TableFactory> table_factory_to_write = nullptr, | |
544 | std::shared_ptr<TableFactory> block_based_table_factory = nullptr, | |
545 | std::shared_ptr<TableFactory> plain_table_factory = nullptr, | |
546 | std::shared_ptr<TableFactory> cuckoo_table_factory = nullptr); | |
547 | ||
548 | #endif // ROCKSDB_LITE | |
549 | ||
550 | } // namespace rocksdb |