// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Currently we support two types of tables: plain table and block-based table.
// 1. Block-based table: this is the default table type that we inherited from
//    LevelDB, which was designed for storing data on hard disk or flash
//    devices.
// 2. Plain table: it is one of RocksDB's SST file formats, optimized
//    for low query latency on pure-memory or really low-latency media.
//
// A tutorial of RocksDB table formats is available here:
//   https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats
//
// Example code is also available
//   https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples

#pragma once

#include <memory>
#include <string>
#include <unordered_map>

#include "rocksdb/cache.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/status.h"

namespace rocksdb {

// -- Block-based Table
class FlushBlockPolicyFactory;
class PersistentCache;
class RandomAccessFile;
struct TableReaderOptions;
struct TableBuilderOptions;
class TableBuilder;
class TableReader;
class WritableFileWriter;
struct EnvOptions;
struct Options;

using std::unique_ptr;

enum ChecksumType : char {
  kNoChecksum = 0x0,
  kCRC32c = 0x1,
  kxxHash = 0x2,
};

// For advanced users only
struct BlockBasedTableOptions {
  // @flush_block_policy_factory creates the instances of flush block policy,
  // which provides a configurable way to determine when to flush a block in
  // the block based tables. If not set, the table builder will use the
  // default block flush policy, which cuts blocks by block size (please
  // refer to `FlushBlockBySizePolicy`).
  std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;

  // TODO(kailiu) Temporarily disable this feature by making the default value
  // to be false.
  //
  // Indicates whether to put index/filter blocks into the block cache.
  // If not specified, each "table reader" object will pre-load the
  // index/filter blocks during table initialization.
  bool cache_index_and_filter_blocks = false;

  // If cache_index_and_filter_blocks is enabled, cache index and filter
  // blocks with high priority. If set to true, depending on the
  // implementation of the block cache, index and filter blocks may be less
  // likely to be evicted than data blocks.
  bool cache_index_and_filter_blocks_with_high_priority = false;

  // If cache_index_and_filter_blocks is true and the below is true, then
  // filter and index blocks are stored in the cache, but a reference is
  // held in the "table reader" object so the blocks are pinned and only
  // evicted from cache when the table reader is freed.
  bool pin_l0_filter_and_index_blocks_in_cache = false;

  // If cache_index_and_filter_blocks is true and the below is true, then
  // the top-level index of partitioned filter and index blocks is stored in
  // the cache, but a reference is held in the "table reader" object so the
  // blocks are pinned and only evicted from cache when the table reader is
  // freed. This is not limited to l0 in the LSM tree.
  bool pin_top_level_index_and_filter = true;

  // The index type that will be used for this table.
  enum IndexType : char {
    // A space-efficient index block that is optimized for
    // binary-search-based index.
    kBinarySearch,

    // The hash index, if enabled, will perform a hash lookup when
    // `Options.prefix_extractor` is provided.
    kHashSearch,

    // A two-level index implementation. Both levels are binary search indexes.
    kTwoLevelIndexSearch,
  };

  IndexType index_type = kBinarySearch;

  // The index type that will be used for the data block.
  enum DataBlockIndexType : char {
    kDataBlockBinarySearch = 0,   // traditional block type
    kDataBlockBinaryAndHash = 1,  // additional hash index
  };

  DataBlockIndexType data_block_index_type = kDataBlockBinarySearch;

  // #entries/#buckets. It is valid only when data_block_index_type is
  // kDataBlockBinaryAndHash.
  double data_block_hash_table_util_ratio = 0.75;
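
  // Example (a sketch): enabling the additional hash index inside each data
  // block, together with its utilization ratio:
  //
  //   BlockBasedTableOptions table_options;
  //   table_options.data_block_index_type =
  //       BlockBasedTableOptions::kDataBlockBinaryAndHash;
  //   table_options.data_block_hash_table_util_ratio = 0.75;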

  // This option is now deprecated. No matter what value it is set to,
  // it will behave as if hash_index_allow_collision=true.
  bool hash_index_allow_collision = true;

  // Use the specified checksum type. Newly created table files will be
  // protected with this checksum type. Old table files will still be
  // readable, even if they have a different checksum type.
  ChecksumType checksum = kCRC32c;

  // Disable block cache. If this is set to true,
  // then no block cache will be used, and block_cache must be left as
  // nullptr.
  bool no_block_cache = false;

  // If non-NULL, use the specified cache for blocks.
  // If NULL, rocksdb will automatically create and use an 8MB internal cache.
  std::shared_ptr<Cache> block_cache = nullptr;
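
  // Example (a sketch; NewLRUCache() is declared in rocksdb/cache.h, which
  // is already included above):
  //
  //   table_options.block_cache = NewLRUCache(64 << 20);  // 64 MB cache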

  // If non-NULL, use the specified cache for pages read from the device.
  // If NULL, no page cache is used.
  std::shared_ptr<PersistentCache> persistent_cache = nullptr;

  // If non-NULL, use the specified cache for compressed blocks.
  // If NULL, rocksdb will not use a compressed block cache.
  std::shared_ptr<Cache> block_cache_compressed = nullptr;

  // Approximate size of user data packed per block. Note that the
  // block size specified here corresponds to uncompressed data. The
  // actual size of the unit read from disk may be smaller if
  // compression is enabled. This parameter can be changed dynamically.
  size_t block_size = 4 * 1024;

  // This is used to close a block before it reaches the configured
  // 'block_size'. If the percentage of free space in the current block is
  // less than this specified number and adding a new record to the block
  // will exceed the configured block size, then this block will be closed
  // and the new record will be written to the next block.
  int block_size_deviation = 10;

  // Number of keys between restart points for delta encoding of keys.
  // This parameter can be changed dynamically. Most clients should
  // leave this parameter alone. The minimum value allowed is 1. Any smaller
  // value will be silently overwritten with 1.
  int block_restart_interval = 16;

  // Same as block_restart_interval but used for the index block.
  int index_block_restart_interval = 1;

  // Block size for partitioned metadata. Currently applied to indexes when
  // kTwoLevelIndexSearch is used and to filters when partition_filters is
  // used.
  // Note: Since in the current implementation the filter and index
  // partitions are aligned, an index/filter block is created when either
  // the index or the filter block size reaches the specified limit.
  // Note: this limit is currently applied only to index blocks; a filter
  // partition is cut right after an index block is cut.
  // TODO(myabandeh): remove the note above when filter partitions are cut
  // separately
  uint64_t metadata_block_size = 4096;

  // Note: currently this option requires kTwoLevelIndexSearch to be set as
  // well.
  // TODO(myabandeh): remove the note above once the limitation is lifted
  // Use partitioned full filters for each SST file. This option is
  // incompatible with block-based filters.
  bool partition_filters = false;
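
  // Example (a sketch): partitioned filters require the two-level index:
  //
  //   table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
  //   table_options.partition_filters = true;
  //   table_options.metadata_block_size = 4096;  // partition size target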

  // Use delta encoding to compress keys in blocks.
  // ReadOptions::pin_data requires this option to be disabled.
  //
  // Default: true
  bool use_delta_encoding = true;

  // If non-nullptr, use the specified filter policy to reduce disk reads.
  // Many applications will benefit from passing the result of
  // NewBloomFilterPolicy() here.
  std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
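
  // Example (a sketch; NewBloomFilterPolicy() is declared in
  // rocksdb/filter_policy.h; 10 bits per key is a common starting point):
  //
  //   table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));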

  // If true, place whole keys in the filter (not just prefixes).
  // This must generally be true for gets to be efficient.
  bool whole_key_filtering = true;

  // Verify that decompressing the compressed block gives back the input. This
  // is a verification mode that we use to detect bugs in compression
  // algorithms.
  bool verify_compression = false;

  // If used, for every data block we load into memory, we will create a
  // bitmap of size ((block_size / `read_amp_bytes_per_bit`) / 8) bytes. This
  // bitmap will be used to figure out the percentage of the blocks we
  // actually read.
  //
  // When this feature is used, Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES and
  // Tickers::READ_AMP_TOTAL_READ_BYTES can be used to calculate the
  // read amplification using this formula
  // (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES)
  //
  // value  =>  memory usage (percentage of loaded blocks memory)
  // 1      =>  12.50 %
  // 2      =>  06.25 %
  // 4      =>  03.12 %
  // 8      =>  01.56 %
  // 16     =>  00.78 %
  //
  // Note: This number must be a power of 2; if not, it will be sanitized
  // to the next lowest power of 2. For example, a value of 7 will be
  // treated as 4 and a value of 19 will be treated as 16.
  //
  // Default: 0 (disabled)
  uint32_t read_amp_bytes_per_bit = 0;

  // We currently have five versions:
  // 0 -- This version is currently written out by all RocksDB versions by
  // default. Can be read by really old RocksDB versions. Doesn't support
  // changing the checksum type (default is CRC32).
  // 1 -- Can be read by RocksDB versions since 3.0. Supports non-default
  // checksum types, like xxHash. It is written by RocksDB when
  // BlockBasedTableOptions::checksum is something other than kCRC32c.
  // (Version 0 is silently upconverted.)
  // 2 -- Can be read by RocksDB versions since 3.10. Changes the way we
  // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you
  // don't plan to run RocksDB before version 3.10, you should probably use
  // this.
  // 3 -- Can be read by RocksDB versions since 5.15. Changes the way we
  // encode the keys in index blocks. If you don't plan to run RocksDB before
  // version 5.15, you should probably use this.
  // This option only affects newly written tables. When reading existing
  // tables, the version information is read from the footer.
  // 4 -- Can be read by RocksDB versions since 5.16. Changes the way we
  // encode the values in index blocks. If you don't plan to run RocksDB
  // before version 5.16 and you are using index_block_restart_interval > 1,
  // you should probably use this, as it would reduce the index size.
  // This option only affects newly written tables. When reading existing
  // tables, the version information is read from the footer.
  uint32_t format_version = 2;

  // Store index blocks on disk in compressed format. Changing this option to
  // false will avoid the overhead of decompression if index blocks are
  // evicted and read back.
  bool enable_index_compression = true;

  // Align data blocks on the lesser of page size and block size.
  bool block_align = false;
};

// Table properties that are specific to block-based tables.
struct BlockBasedTablePropertyNames {
  // The value of this property is a fixed int32 number.
  static const std::string kIndexType;
  // The value is "1" for true and "0" for false.
  static const std::string kWholeKeyFiltering;
  // The value is "1" for true and "0" for false.
  static const std::string kPrefixFiltering;
};

// Create default block based table factory.
extern TableFactory* NewBlockBasedTableFactory(
    const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
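
// Example usage (a minimal sketch; Options and DB come from
// rocksdb/options.h and rocksdb/db.h, and the path is hypothetical):
//
//   BlockBasedTableOptions table_options;
//   table_options.block_size = 16 * 1024;
//   table_options.cache_index_and_filter_blocks = true;
//   Options options;
//   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
//   DB* db;
//   Status s = DB::Open(options, "/path/to/db", &db);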

#ifndef ROCKSDB_LITE

enum EncodingType : char {
  // Always write full keys without any special encoding.
  kPlain,
  // Find opportunities to write the same prefix only once for multiple rows.
  // In some cases, when a key follows a previous key with the same prefix,
  // instead of writing out the full key, it just writes out the size of the
  // shared prefix, as well as other bytes, to save some bytes.
  //
  // When using this option, the user is required to use the same prefix
  // extractor to make sure the same prefix will be extracted from the same
  // key. The Name() value of the prefix extractor will be stored in the
  // file. When reopening the file, the name of the options.prefix_extractor
  // given will be bitwise compared to the prefix extractors stored in the
  // file. An error will be returned if the two don't match.
  kPrefix,
};

// Table properties that are specific to plain tables.
struct PlainTablePropertyNames {
  static const std::string kEncodingType;
  static const std::string kBloomVersion;
  static const std::string kNumBloomBlocks;
};

const uint32_t kPlainTableVariableLength = 0;

struct PlainTableOptions {
  // @user_key_len: plain table has an optimization for fixed-size keys,
  //                which can be specified via user_key_len. Alternatively,
  //                you can pass `kPlainTableVariableLength` if your keys
  //                have variable lengths.
  uint32_t user_key_len = kPlainTableVariableLength;

  // @bloom_bits_per_key: the number of bits used for the bloom filter per
  //                      prefix. You may disable it by passing zero.
  int bloom_bits_per_key = 10;

  // @hash_table_ratio: the desired utilization of the hash table used for
  //                    prefix hashing.
  //                    hash_table_ratio = number of prefixes / #buckets in
  //                    the hash table
  double hash_table_ratio = 0.75;

  // @index_sparseness: inside each prefix, build one index record for every
  //                    index_sparseness keys, for binary search inside each
  //                    hash bucket.
  //                    For encoding type kPrefix, the value will be used when
  //                    writing to determine an interval to rewrite the full
  //                    key. It will also be used as a suggestion and
  //                    satisfied when possible.
  size_t index_sparseness = 16;

  // @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from
  //                      malloc. Otherwise, allocate from huge page TLB.
  //                      The user needs to reserve huge pages for it to be
  //                      allocated, like:
  //                          sysctl -w vm.nr_hugepages=20
  //                      See the Linux doc Documentation/vm/hugetlbpage.txt
  size_t huge_page_tlb_size = 0;

  // @encoding_type: how to encode the keys. See enum EncodingType above for
  //                 the choices. The value will determine how to encode keys
  //                 when writing to a new SST file. This value will be stored
  //                 inside the SST file and will be used when reading from
  //                 the file, which makes it possible for users to choose a
  //                 different encoding type when reopening a DB. Files with
  //                 different encoding types can co-exist in the same DB and
  //                 can be read.
  EncodingType encoding_type = kPlain;

  // @full_scan_mode: mode for reading the whole file one record at a time
  //                  without using the index.
  bool full_scan_mode = false;

  // @store_index_in_file: compute the plain table index and bloom filter
  //                       during file building and store them in the file.
  //                       When reading the file, the index will be mmapped
  //                       instead of recomputed.
  bool store_index_in_file = false;
};

// -- Plain Table with prefix-only seek
// For this factory, you need to set Options.prefix_extractor properly to
// make it work. Look-up starts with a prefix hash lookup for the key prefix.
// Inside the hash bucket found, a binary search is executed for hash
// conflicts. Finally, a linear search is used.

extern TableFactory* NewPlainTableFactory(
    const PlainTableOptions& options = PlainTableOptions());
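
// Example usage (a sketch; NewFixedPrefixTransform() is declared in
// rocksdb/slice_transform.h, and the 8-byte key / 4-byte prefix are
// illustrative):
//
//   PlainTableOptions plain_options;
//   plain_options.user_key_len = 8;  // keys are fixed at 8 bytes here
//   plain_options.bloom_bits_per_key = 10;
//   Options options;
//   options.prefix_extractor.reset(NewFixedPrefixTransform(4));
//   options.allow_mmap_reads = true;  // plain table requires mmap reads
//   options.table_factory.reset(NewPlainTableFactory(plain_options));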

struct CuckooTablePropertyNames {
  // The key that is used to fill empty buckets.
  static const std::string kEmptyKey;
  // Fixed length of value.
  static const std::string kValueLength;
  // Number of hash functions used in Cuckoo Hash.
  static const std::string kNumHashFunc;
  // It denotes the number of buckets in a Cuckoo Block. Given a key and a
  // particular hash function, a Cuckoo Block is a set of consecutive buckets,
  // where the starting bucket id is given by the hash function on the key.
  // In case of a collision while inserting the key, the builder tries to
  // insert the key in other locations of the cuckoo block before using the
  // next hash function. This reduces cache misses during read operations in
  // case of collision.
  static const std::string kCuckooBlockSize;
  // Size of the hash table. Use this number to compute the modulo of the
  // hash function. The actual number of buckets will be kMaxHashTableSize +
  // kCuckooBlockSize - 1. The last kCuckooBlockSize-1 buckets are used to
  // accommodate the Cuckoo Block from the end of the hash table, due to the
  // cache-friendly implementation.
  static const std::string kHashTableSize;
  // Denotes whether the keys sorted in the file are Internal Keys (if false)
  // or User Keys only (if true).
  static const std::string kIsLastLevel;
  // Indicates whether the identity function is used for the first hash
  // function.
  static const std::string kIdentityAsFirstHash;
  // Indicates whether modulo or bitwise AND is used to calculate the hash
  // value.
  static const std::string kUseModuleHash;
  // Fixed user key length.
  static const std::string kUserKeyLength;
};

struct CuckooTableOptions {
  // Determines the utilization of hash tables. Smaller values
  // result in larger hash tables with fewer collisions.
  double hash_table_ratio = 0.9;
  // A property used by the builder to determine the depth to go to
  // when searching for a path to displace elements in case of
  // collision. See Builder.MakeSpaceForKey method. Higher
  // values result in more efficient hash tables with fewer
  // lookups but take more time to build.
  uint32_t max_search_depth = 100;
  // In case of collision while inserting, the builder
  // attempts to insert in the next cuckoo_block_size
  // locations before skipping over to the next Cuckoo hash
  // function. This makes lookups more cache friendly in case
  // of collisions.
  uint32_t cuckoo_block_size = 5;
  // If this option is enabled, the user key is treated as uint64_t and its
  // value is used as the hash value directly. This option changes the
  // builder's behavior. Readers ignore this option and behave according to
  // what is specified in the table property.
  bool identity_as_first_hash = false;
  // If this option is set to true, modulo is used during hash calculation.
  // This often yields better space efficiency at the cost of performance.
  // If this option is set to false, the number of entries in the table is
  // constrained to be a power of two, and a bitwise AND is used to calculate
  // the hash, which is faster in general.
  bool use_module_hash = true;
};

// Cuckoo Table Factory for SST table format using Cache Friendly Cuckoo
// Hashing
extern TableFactory* NewCuckooTableFactory(
    const CuckooTableOptions& table_options = CuckooTableOptions());
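
// Example usage (a sketch; cuckoo tables trade build time for fast point
// lookups):
//
//   CuckooTableOptions cuckoo_options;
//   cuckoo_options.hash_table_ratio = 0.9;  // 90% hash table utilization
//   Options options;
//   options.table_factory.reset(NewCuckooTableFactory(cuckoo_options));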

#endif  // ROCKSDB_LITE

class RandomAccessFileReader;

// A base class for table factories.
class TableFactory {
 public:
  virtual ~TableFactory() {}

  // The type of the table.
  //
  // The client of this package should switch to a new name whenever
  // the table format implementation changes.
  //
  // Names starting with "rocksdb." are reserved and should not be used
  // by any clients of this package.
  virtual const char* Name() const = 0;

  // Returns a table object that can fetch data from the file specified
  // in the parameter file. It's the caller's responsibility to make sure
  // the file is in the correct format.
  //
  // NewTableReader() is called in three places:
  // (1) TableCache::FindTable() calls the function on a table cache miss
  // and caches the returned table object.
  // (2) SstFileReader (for SST Dump) opens the table and dumps the table
  // contents using the iterator of the table.
  // (3) DBImpl::IngestExternalFile() calls this function to read the
  // contents of the sst file it's attempting to add.
  //
  // table_reader_options is a TableReaderOptions which contains all the
  // needed parameters and configuration to open the table.
  // file is a file handler to handle the file for the table.
  // file_size is the physical file size of the file.
  // table_reader is the output table reader.
  virtual Status NewTableReader(
      const TableReaderOptions& table_reader_options,
      unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
      unique_ptr<TableReader>* table_reader,
      bool prefetch_index_and_filter_in_cache = true) const = 0;

  // Return a table builder to write to a file for this table type.
  //
  // It is called in several places:
  // (1) When flushing a memtable to a level-0 output file, it creates a
  // table builder (in DBImpl::WriteLevel0Table(), by calling BuildTable())
  // (2) During compaction, it gets the builder for writing compaction output
  // files in DBImpl::OpenCompactionOutputFile().
  // (3) When recovering from transaction logs, it creates a table builder to
  // write to a level-0 output file (in DBImpl::WriteLevel0TableForRecovery,
  // by calling BuildTable())
  // (4) When running Repairer, it creates a table builder to convert logs to
  // SST files (in Repairer::ConvertLogToTable() by calling BuildTable())
  //
  // Multiple configured options can be accessed from there, including but
  // not limited to compression options. file is a handle of a writable file.
  // It is the caller's responsibility to keep the file open and close the
  // file after closing the table builder. compression_type is the
  // compression type to use in this table.
  virtual TableBuilder* NewTableBuilder(
      const TableBuilderOptions& table_builder_options,
      uint32_t column_family_id, WritableFileWriter* file) const = 0;

  // Sanitizes the specified DB Options and ColumnFamilyOptions.
  //
  // If the function cannot find a way to sanitize the input DB Options,
  // a non-ok Status will be returned.
  virtual Status SanitizeOptions(
      const DBOptions& db_opts,
      const ColumnFamilyOptions& cf_opts) const = 0;

  // Return a string that contains a printable format of the table
  // configuration. RocksDB prints the configuration at DB Open().
  virtual std::string GetPrintableTableOptions() const = 0;

  virtual Status GetOptionString(std::string* /*opt_string*/,
                                 const std::string& /*delimiter*/) const {
    return Status::NotSupported(
        "The table factory doesn't implement GetOptionString().");
  }

  // Returns the raw pointer of the table options that is used by this
  // TableFactory, or nullptr if this function is not supported.
  // Since the return value is a raw pointer, the TableFactory owns the
  // pointer and the caller should not delete the pointer.
  //
  // In certain cases, it is desirable to alter the underlying options when
  // the TableFactory is not used by any open DB by casting the returned
  // pointer to the right class. For instance, if BlockBasedTableFactory is
  // used, then the pointer can be cast to BlockBasedTableOptions.
  //
  // Note that changing the underlying TableFactory options while the
  // TableFactory is currently used by any open DB is undefined behavior.
  // Developers should use DB::SetOption() instead to dynamically change
  // options while the DB is open.
  virtual void* GetOptions() { return nullptr; }

  // Returns whether delete range is supported.
  virtual bool IsDeleteRangeSupported() const { return false; }
};
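
// Example of tweaking the underlying options via TableFactory::GetOptions()
// (a sketch; only safe while no open DB is using the factory, as noted
// above):
//
//   auto* bbt_options = static_cast<BlockBasedTableOptions*>(
//       options.table_factory->GetOptions());
//   if (bbt_options != nullptr) {
//     bbt_options->block_size = 16 * 1024;
//   }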

#ifndef ROCKSDB_LITE
// Create a special table factory that can open any of the supported
// table formats, based on a setting inside the SST files. It should be used
// to convert a DB from one table format to another.
// @table_factory_to_write: the table factory used when writing to new files.
// @block_based_table_factory: block based table factory to use. If NULL, use
//                             a default one.
// @plain_table_factory: plain table factory to use. If NULL, use a default
//                       one.
// @cuckoo_table_factory: cuckoo table factory to use. If NULL, use a default
//                        one.
extern TableFactory* NewAdaptiveTableFactory(
    std::shared_ptr<TableFactory> table_factory_to_write = nullptr,
    std::shared_ptr<TableFactory> block_based_table_factory = nullptr,
    std::shared_ptr<TableFactory> plain_table_factory = nullptr,
    std::shared_ptr<TableFactory> cuckoo_table_factory = nullptr);
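
// Example usage (a sketch): write new files with the plain table format
// while remaining able to read existing files of the other formats:
//
//   Options options;
//   options.table_factory.reset(NewAdaptiveTableFactory(
//       std::shared_ptr<TableFactory>(NewPlainTableFactory())));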

#endif  // ROCKSDB_LITE

}  // namespace rocksdb