// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Currently we support two types of tables: plain table and block-based table.
// 1. Block-based table: this is the default table type that we inherited from
//    LevelDB, which was designed for storing data in hard disk or flash
//    device.
// 2. Plain table: it is one of RocksDB's SST file format optimized
//    for low query latency on pure-memory or really low-latency media.
//
// A tutorial of rocksdb table formats is available here:
//   https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats
//
// Example code is also available
//   https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples

#pragma once
11fdf7f2 19
7c673cae
FG
20#include <memory>
21#include <string>
22#include <unordered_map>
23
24#include "rocksdb/cache.h"
25#include "rocksdb/env.h"
26#include "rocksdb/iterator.h"
27#include "rocksdb/options.h"
28#include "rocksdb/status.h"
29
30namespace rocksdb {
31
// -- Block-based Table
// Forward declarations so this header does not need to pull in the full
// definitions; only pointers/references to these types appear below.
class FlushBlockPolicyFactory;
class PersistentCache;
class RandomAccessFile;
struct TableReaderOptions;
struct TableBuilderOptions;
class TableBuilder;
class TableReader;
class WritableFileWriter;
struct EnvOptions;
struct Options;

// Checksum algorithm used to protect SST blocks. Stored in the file footer,
// so readers can verify blocks written with any of these types.
enum ChecksumType : char {
  kNoChecksum = 0x0,
  kCRC32c = 0x1,
  kxxHash = 0x2,
  kxxHash64 = 0x3,
};

51// For advanced user only
52struct BlockBasedTableOptions {
53 // @flush_block_policy_factory creates the instances of flush block policy.
54 // which provides a configurable way to determine when to flush a block in
55 // the block based tables. If not set, table builder will use the default
56 // block flush policy, which cut blocks by block size (please refer to
57 // `FlushBlockBySizePolicy`).
58 std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;
59
60 // TODO(kailiu) Temporarily disable this feature by making the default value
61 // to be false.
62 //
494da23a
TL
63 // TODO(ajkr) we need to update names of variables controlling meta-block
64 // caching as they should now apply to range tombstone and compression
65 // dictionary meta-blocks, in addition to index and filter meta-blocks.
66 //
7c673cae
FG
67 // Indicating if we'd put index/filter blocks to the block cache.
68 // If not specified, each "table reader" object will pre-load index/filter
69 // block during table initialization.
70 bool cache_index_and_filter_blocks = false;
71
72 // If cache_index_and_filter_blocks is enabled, cache index and filter
73 // blocks with high priority. If set to true, depending on implementation of
11fdf7f2 74 // block cache, index and filter blocks may be less likely to be evicted
7c673cae
FG
75 // than data blocks.
76 bool cache_index_and_filter_blocks_with_high_priority = false;
77
78 // if cache_index_and_filter_blocks is true and the below is true, then
79 // filter and index blocks are stored in the cache, but a reference is
80 // held in the "table reader" object so the blocks are pinned and only
81 // evicted from cache when the table reader is freed.
82 bool pin_l0_filter_and_index_blocks_in_cache = false;
83
11fdf7f2
TL
84 // If cache_index_and_filter_blocks is true and the below is true, then
85 // the top-level index of partitioned filter and index blocks are stored in
86 // the cache, but a reference is held in the "table reader" object so the
87 // blocks are pinned and only evicted from cache when the table reader is
88 // freed. This is not limited to l0 in LSM tree.
89 bool pin_top_level_index_and_filter = true;
90
7c673cae
FG
91 // The index type that will be used for this table.
92 enum IndexType : char {
93 // A space efficient index block that is optimized for
94 // binary-search-based index.
95 kBinarySearch,
96
97 // The hash index, if enabled, will do the hash lookup when
98 // `Options.prefix_extractor` is provided.
99 kHashSearch,
100
7c673cae
FG
101 // A two-level index implementation. Both levels are binary search indexes.
102 kTwoLevelIndexSearch,
103 };
104
105 IndexType index_type = kBinarySearch;
106
11fdf7f2
TL
107 // The index type that will be used for the data block.
108 enum DataBlockIndexType : char {
109 kDataBlockBinarySearch = 0, // traditional block type
110 kDataBlockBinaryAndHash = 1, // additional hash index
111 };
112
113 DataBlockIndexType data_block_index_type = kDataBlockBinarySearch;
114
115 // #entries/#buckets. It is valid only when data_block_hash_index_type is
116 // kDataBlockBinaryAndHash.
117 double data_block_hash_table_util_ratio = 0.75;
118
7c673cae
FG
119 // This option is now deprecated. No matter what value it is set to,
120 // it will behave as if hash_index_allow_collision=true.
121 bool hash_index_allow_collision = true;
122
123 // Use the specified checksum type. Newly created table files will be
124 // protected with this checksum type. Old table files will still be readable,
125 // even though they have different checksum type.
126 ChecksumType checksum = kCRC32c;
127
128 // Disable block cache. If this is set to true,
129 // then no block cache should be used, and the block_cache should
130 // point to a nullptr object.
131 bool no_block_cache = false;
132
133 // If non-NULL use the specified cache for blocks.
134 // If NULL, rocksdb will automatically create and use an 8MB internal cache.
135 std::shared_ptr<Cache> block_cache = nullptr;
136
137 // If non-NULL use the specified cache for pages read from device
138 // IF NULL, no page cache is used
139 std::shared_ptr<PersistentCache> persistent_cache = nullptr;
140
141 // If non-NULL use the specified cache for compressed blocks.
142 // If NULL, rocksdb will not use a compressed block cache.
494da23a
TL
143 // Note: though it looks similar to `block_cache`, RocksDB doesn't put the
144 // same type of object there.
7c673cae
FG
145 std::shared_ptr<Cache> block_cache_compressed = nullptr;
146
147 // Approximate size of user data packed per block. Note that the
148 // block size specified here corresponds to uncompressed data. The
149 // actual size of the unit read from disk may be smaller if
150 // compression is enabled. This parameter can be changed dynamically.
151 size_t block_size = 4 * 1024;
152
153 // This is used to close a block before it reaches the configured
154 // 'block_size'. If the percentage of free space in the current block is less
155 // than this specified number and adding a new record to the block will
156 // exceed the configured block size, then this block will be closed and the
157 // new record will be written to the next block.
158 int block_size_deviation = 10;
159
160 // Number of keys between restart points for delta encoding of keys.
161 // This parameter can be changed dynamically. Most clients should
162 // leave this parameter alone. The minimum value allowed is 1. Any smaller
163 // value will be silently overwritten with 1.
164 int block_restart_interval = 16;
165
166 // Same as block_restart_interval but used for the index block.
167 int index_block_restart_interval = 1;
168
169 // Block size for partitioned metadata. Currently applied to indexes when
170 // kTwoLevelIndexSearch is used and to filters when partition_filters is used.
171 // Note: Since in the current implementation the filters and index partitions
11fdf7f2 172 // are aligned, an index/filter block is created when either index or filter
7c673cae
FG
173 // block size reaches the specified limit.
174 // Note: this limit is currently applied to only index blocks; a filter
175 // partition is cut right after an index block is cut
176 // TODO(myabandeh): remove the note above when filter partitions are cut
177 // separately
178 uint64_t metadata_block_size = 4096;
179
180 // Note: currently this option requires kTwoLevelIndexSearch to be set as
181 // well.
182 // TODO(myabandeh): remove the note above once the limitation is lifted
11fdf7f2
TL
183 // Use partitioned full filters for each SST file. This option is
184 // incompatible with block-based filters.
7c673cae
FG
185 bool partition_filters = false;
186
187 // Use delta encoding to compress keys in blocks.
188 // ReadOptions::pin_data requires this option to be disabled.
189 //
190 // Default: true
191 bool use_delta_encoding = true;
192
193 // If non-nullptr, use the specified filter policy to reduce disk reads.
194 // Many applications will benefit from passing the result of
195 // NewBloomFilterPolicy() here.
196 std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
197
198 // If true, place whole keys in the filter (not just prefixes).
199 // This must generally be true for gets to be efficient.
200 bool whole_key_filtering = true;
201
202 // Verify that decompressing the compressed block gives back the input. This
203 // is a verification mode that we use to detect bugs in compression
204 // algorithms.
205 bool verify_compression = false;
206
207 // If used, For every data block we load into memory, we will create a bitmap
208 // of size ((block_size / `read_amp_bytes_per_bit`) / 8) bytes. This bitmap
209 // will be used to figure out the percentage we actually read of the blocks.
210 //
211 // When this feature is used Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES and
212 // Tickers::READ_AMP_TOTAL_READ_BYTES can be used to calculate the
213 // read amplification using this formula
214 // (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES)
215 //
216 // value => memory usage (percentage of loaded blocks memory)
217 // 1 => 12.50 %
218 // 2 => 06.25 %
219 // 4 => 03.12 %
220 // 8 => 01.56 %
221 // 16 => 00.78 %
222 //
223 // Note: This number must be a power of 2, if not it will be sanitized
224 // to be the next lowest power of 2, for example a value of 7 will be
225 // treated as 4, a value of 19 will be treated as 16.
226 //
227 // Default: 0 (disabled)
228 uint32_t read_amp_bytes_per_bit = 0;
229
494da23a 230 // We currently have five versions:
7c673cae
FG
231 // 0 -- This version is currently written out by all RocksDB's versions by
232 // default. Can be read by really old RocksDB's. Doesn't support changing
233 // checksum (default is CRC32).
234 // 1 -- Can be read by RocksDB's versions since 3.0. Supports non-default
235 // checksum, like xxHash. It is written by RocksDB when
236 // BlockBasedTableOptions::checksum is something other than kCRC32c. (version
237 // 0 is silently upconverted)
238 // 2 -- Can be read by RocksDB's versions since 3.10. Changes the way we
239 // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you
240 // don't plan to run RocksDB before version 3.10, you should probably use
241 // this.
11fdf7f2
TL
242 // 3 -- Can be read by RocksDB's versions since 5.15. Changes the way we
243 // encode the keys in index blocks. If you don't plan to run RocksDB before
244 // version 5.15, you should probably use this.
245 // This option only affects newly written tables. When reading existing
246 // tables, the information about version is read from the footer.
247 // 4 -- Can be read by RocksDB's versions since 5.16. Changes the way we
248 // encode the values in index blocks. If you don't plan to run RocksDB before
249 // version 5.16 and you are using index_block_restart_interval > 1, you should
250 // probably use this as it would reduce the index size.
251 // This option only affects newly written tables. When reading existing
252 // tables, the information about version is read from the footer.
7c673cae 253 uint32_t format_version = 2;
11fdf7f2
TL
254
255 // Store index blocks on disk in compressed format. Changing this option to
256 // false will avoid the overhead of decompression if index blocks are evicted
257 // and read back
258 bool enable_index_compression = true;
259
260 // Align data blocks on lesser of page size and block size
261 bool block_align = false;
7c673cae
FG
262};
263
// Table Properties that are specific to block-based table properties.
// These are the keys under which block-based-table metadata is stored in the
// table-properties block of each SST file.
struct BlockBasedTablePropertyNames {
  // value of this properties is a fixed int32 number.
  static const std::string kIndexType;
  // value is "1" for true and "0" for false.
  static const std::string kWholeKeyFiltering;
  // value is "1" for true and "0" for false.
  static const std::string kPrefixFiltering;
};

274// Create default block based table factory.
275extern TableFactory* NewBlockBasedTableFactory(
276 const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
277
#ifndef ROCKSDB_LITE

// Key-encoding scheme used by plain tables.
enum EncodingType : char {
  // Always write full keys without any special encoding.
  kPlain,
  // Find opportunity to write the same prefix once for multiple rows.
  // In some cases, when a key follows a previous key with the same prefix,
  // instead of writing out the full key, it just writes out the size of the
  // shared prefix, as well as other bytes, to save some bytes.
  //
  // When using this option, the user is required to use the same prefix
  // extractor to make sure the same prefix will be extracted from the same key.
  // The Name() value of the prefix extractor will be stored in the file. When
  // reopening the file, the name of the options.prefix_extractor given will be
  // bitwise compared to the prefix extractors stored in the file. An error
  // will be returned if the two don't match.
  kPrefix,
};

// Table Properties that are specific to plain table properties.
// These are the keys under which plain-table metadata is stored in the
// table-properties block of each SST file.
struct PlainTablePropertyNames {
  static const std::string kEncodingType;
  static const std::string kBloomVersion;
  static const std::string kNumBloomBlocks;
};

// Sentinel for PlainTableOptions::user_key_len meaning keys have variable
// length (no fixed-size-key optimization).
const uint32_t kPlainTableVariableLength = 0;

306struct PlainTableOptions {
307 // @user_key_len: plain table has optimization for fix-sized keys, which can
308 // be specified via user_key_len. Alternatively, you can pass
309 // `kPlainTableVariableLength` if your keys have variable
310 // lengths.
311 uint32_t user_key_len = kPlainTableVariableLength;
312
313 // @bloom_bits_per_key: the number of bits used for bloom filer per prefix.
314 // You may disable it by passing a zero.
315 int bloom_bits_per_key = 10;
316
317 // @hash_table_ratio: the desired utilization of the hash table used for
318 // prefix hashing.
319 // hash_table_ratio = number of prefixes / #buckets in the
320 // hash table
321 double hash_table_ratio = 0.75;
322
323 // @index_sparseness: inside each prefix, need to build one index record for
324 // how many keys for binary search inside each hash bucket.
325 // For encoding type kPrefix, the value will be used when
326 // writing to determine an interval to rewrite the full
327 // key. It will also be used as a suggestion and satisfied
328 // when possible.
329 size_t index_sparseness = 16;
330
331 // @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
332 // Otherwise from huge page TLB. The user needs to
333 // reserve huge pages for it to be allocated, like:
334 // sysctl -w vm.nr_hugepages=20
335 // See linux doc Documentation/vm/hugetlbpage.txt
336 size_t huge_page_tlb_size = 0;
337
338 // @encoding_type: how to encode the keys. See enum EncodingType above for
339 // the choices. The value will determine how to encode keys
340 // when writing to a new SST file. This value will be stored
341 // inside the SST file which will be used when reading from
342 // the file, which makes it possible for users to choose
343 // different encoding type when reopening a DB. Files with
344 // different encoding types can co-exist in the same DB and
345 // can be read.
346 EncodingType encoding_type = kPlain;
347
348 // @full_scan_mode: mode for reading the whole file one record by one without
349 // using the index.
350 bool full_scan_mode = false;
351
352 // @store_index_in_file: compute plain table index and bloom filter during
353 // file building and store it in file. When reading
354 // file, index will be mmaped instead of recomputation.
355 bool store_index_in_file = false;
356};
357
358// -- Plain Table with prefix-only seek
494da23a
TL
359// For this factory, you need to set Options.prefix_extractor properly to make
360// it work. Look-up will starts with prefix hash lookup for key prefix. Inside
361// the hash bucket found, a binary search is executed for hash conflicts.
362// Finally, a linear search is used.
7c673cae 363
494da23a
TL
364extern TableFactory* NewPlainTableFactory(
365 const PlainTableOptions& options = PlainTableOptions());
7c673cae
FG
366
// Table Properties that are specific to cuckoo table properties.
struct CuckooTablePropertyNames {
  // The key that is used to fill empty buckets.
  static const std::string kEmptyKey;
  // Fixed length of value.
  static const std::string kValueLength;
  // Number of hash functions used in Cuckoo Hash.
  static const std::string kNumHashFunc;
  // It denotes the number of buckets in a Cuckoo Block. Given a key and a
  // particular hash function, a Cuckoo Block is a set of consecutive buckets,
  // where starting bucket id is given by the hash function on the key. In case
  // of a collision during inserting the key, the builder tries to insert the
  // key in other locations of the cuckoo block before using the next hash
  // function. This reduces cache miss during read operation in case of
  // collision.
  static const std::string kCuckooBlockSize;
  // Size of the hash table. Use this number to compute the modulo of hash
  // function. The actual number of buckets will be kMaxHashTableSize +
  // kCuckooBlockSize - 1. The last kCuckooBlockSize-1 buckets are used to
  // accommodate the Cuckoo Block from end of hash table, due to cache friendly
  // implementation.
  static const std::string kHashTableSize;
  // Denotes if the key sorted in the file is Internal Key (if false)
  // or User Key only (if true).
  static const std::string kIsLastLevel;
  // Indicate if using identity function for the first hash function.
  static const std::string kIdentityAsFirstHash;
  // Indicate if using modulo or bit-and to calculate hash value
  static const std::string kUseModuleHash;
  // Fixed user key length
  static const std::string kUserKeyLength;
};

struct CuckooTableOptions {
  // Determines the utilization of hash tables. Smaller values
  // result in larger hash tables with fewer collisions.
  double hash_table_ratio = 0.9;
  // A property used by builder to determine the depth to go to
  // to search for a path to displace elements in case of
  // collision. See Builder.MakeSpaceForKey method. Higher
  // values result in more efficient hash tables with fewer
  // lookups but take more time to build.
  uint32_t max_search_depth = 100;
  // In case of collision while inserting, the builder
  // attempts to insert in the next cuckoo_block_size
  // locations before skipping over to the next Cuckoo hash
  // function. This makes lookups more cache friendly in case
  // of collisions.
  uint32_t cuckoo_block_size = 5;
  // If this option is enabled, user key is treated as uint64_t and its value
  // is used as hash value directly. This option changes builder's behavior.
  // Reader ignore this option and behave according to what specified in table
  // property.
  bool identity_as_first_hash = false;
  // If this option is set to true, modulo is used during hash calculation.
  // This often yields better space efficiency at the cost of performance.
  // If this option is set to false, # of entries in table is constrained to be
  // power of two, and bit-and is used to calculate hash, which is faster in
  // general.
  bool use_module_hash = true;
};

428// Cuckoo Table Factory for SST table format using Cache Friendly Cuckoo Hashing
429extern TableFactory* NewCuckooTableFactory(
430 const CuckooTableOptions& table_options = CuckooTableOptions());
431
#endif  // ROCKSDB_LITE

434class RandomAccessFileReader;
435
436// A base class for table factories.
437class TableFactory {
438 public:
439 virtual ~TableFactory() {}
440
441 // The type of the table.
442 //
443 // The client of this package should switch to a new name whenever
444 // the table format implementation changes.
445 //
446 // Names starting with "rocksdb." are reserved and should not be used
447 // by any clients of this package.
448 virtual const char* Name() const = 0;
449
450 // Returns a Table object table that can fetch data from file specified
451 // in parameter file. It's the caller's responsibility to make sure
452 // file is in the correct format.
453 //
454 // NewTableReader() is called in three places:
455 // (1) TableCache::FindTable() calls the function when table cache miss
456 // and cache the table object returned.
494da23a 457 // (2) SstFileDumper (for SST Dump) opens the table and dump the table
11fdf7f2 458 // contents using the iterator of the table.
494da23a
TL
459 // (3) DBImpl::IngestExternalFile() calls this function to read the contents
460 // of the sst file it's attempting to add
7c673cae
FG
461 //
462 // table_reader_options is a TableReaderOptions which contain all the
463 // needed parameters and configuration to open the table.
464 // file is a file handler to handle the file for the table.
465 // file_size is the physical file size of the file.
466 // table_reader is the output table reader.
467 virtual Status NewTableReader(
468 const TableReaderOptions& table_reader_options,
494da23a
TL
469 std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
470 std::unique_ptr<TableReader>* table_reader,
7c673cae
FG
471 bool prefetch_index_and_filter_in_cache = true) const = 0;
472
473 // Return a table builder to write to a file for this table type.
474 //
475 // It is called in several places:
476 // (1) When flushing memtable to a level-0 output file, it creates a table
477 // builder (In DBImpl::WriteLevel0Table(), by calling BuildTable())
478 // (2) During compaction, it gets the builder for writing compaction output
479 // files in DBImpl::OpenCompactionOutputFile().
480 // (3) When recovering from transaction logs, it creates a table builder to
481 // write to a level-0 output file (In DBImpl::WriteLevel0TableForRecovery,
482 // by calling BuildTable())
483 // (4) When running Repairer, it creates a table builder to convert logs to
484 // SST files (In Repairer::ConvertLogToTable() by calling BuildTable())
485 //
11fdf7f2 486 // Multiple configured can be accessed from there, including and not limited
7c673cae
FG
487 // to compression options. file is a handle of a writable file.
488 // It is the caller's responsibility to keep the file open and close the file
489 // after closing the table builder. compression_type is the compression type
490 // to use in this table.
491 virtual TableBuilder* NewTableBuilder(
492 const TableBuilderOptions& table_builder_options,
493 uint32_t column_family_id, WritableFileWriter* file) const = 0;
494
495 // Sanitizes the specified DB Options and ColumnFamilyOptions.
496 //
497 // If the function cannot find a way to sanitize the input DB Options,
498 // a non-ok Status will be returned.
494da23a
TL
499 virtual Status SanitizeOptions(const DBOptions& db_opts,
500 const ColumnFamilyOptions& cf_opts) const = 0;
7c673cae
FG
501
502 // Return a string that contains printable format of table configurations.
503 // RocksDB prints configurations at DB Open().
504 virtual std::string GetPrintableTableOptions() const = 0;
505
11fdf7f2
TL
506 virtual Status GetOptionString(std::string* /*opt_string*/,
507 const std::string& /*delimiter*/) const {
508 return Status::NotSupported(
509 "The table factory doesn't implement GetOptionString().");
510 }
511
7c673cae
FG
512 // Returns the raw pointer of the table options that is used by this
513 // TableFactory, or nullptr if this function is not supported.
514 // Since the return value is a raw pointer, the TableFactory owns the
515 // pointer and the caller should not delete the pointer.
516 //
11fdf7f2 517 // In certain case, it is desirable to alter the underlying options when the
7c673cae
FG
518 // TableFactory is not used by any open DB by casting the returned pointer
519 // to the right class. For instance, if BlockBasedTableFactory is used,
520 // then the pointer can be casted to BlockBasedTableOptions.
521 //
522 // Note that changing the underlying TableFactory options while the
523 // TableFactory is currently used by any open DB is undefined behavior.
524 // Developers should use DB::SetOption() instead to dynamically change
525 // options while the DB is open.
526 virtual void* GetOptions() { return nullptr; }
11fdf7f2
TL
527
528 // Return is delete range supported
529 virtual bool IsDeleteRangeSupported() const { return false; }
7c673cae
FG
530};
531
#ifndef ROCKSDB_LITE
533// Create a special table factory that can open either of the supported
534// table formats, based on setting inside the SST files. It should be used to
535// convert a DB from one table format to another.
536// @table_factory_to_write: the table factory used when writing to new files.
537// @block_based_table_factory: block based table factory to use. If NULL, use
538// a default one.
539// @plain_table_factory: plain table factory to use. If NULL, use a default one.
494da23a
TL
540// @cuckoo_table_factory: cuckoo table factory to use. If NULL, use a default
541// one.
7c673cae
FG
542extern TableFactory* NewAdaptiveTableFactory(
543 std::shared_ptr<TableFactory> table_factory_to_write = nullptr,
544 std::shared_ptr<TableFactory> block_based_table_factory = nullptr,
545 std::shared_ptr<TableFactory> plain_table_factory = nullptr,
546 std::shared_ptr<TableFactory> cuckoo_table_factory = nullptr);
547
#endif  // ROCKSDB_LITE

}  // namespace rocksdb