// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Currently we support two types of tables: plain table and block-based table.
// 1. Block-based table: this is the default table type that we inherited from
//    LevelDB. It was designed for storing data on hard disks or flash
//    devices.
// 2. Plain table: one of RocksDB's SST file formats, optimized
//    for low query latency on pure-memory or really low-latency media.
//
// A tutorial of RocksDB table formats is available here:
//   https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats
//
// Example code is also available:
//   https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples

#pragma once

#include <memory>
#include <string>
#include <unordered_map>

#include "rocksdb/customizable.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/status.h"

namespace ROCKSDB_NAMESPACE {

// -- Block-based Table
class Cache;
class FilterPolicy;
class FlushBlockPolicyFactory;
class PersistentCache;
class RandomAccessFile;
struct TableReaderOptions;
struct TableBuilderOptions;
class TableBuilder;
class TableFactory;
class TableReader;
class WritableFileWriter;
struct ConfigOptions;
struct EnvOptions;

enum ChecksumType : char {
  kNoChecksum = 0x0,
  kCRC32c = 0x1,
  kxxHash = 0x2,
  kxxHash64 = 0x3,
};

// `PinningTier` is used to specify which tier of block-based tables should
// be affected by a block cache pinning setting (see
// `MetadataCacheOptions` below).
enum class PinningTier {
  // For compatibility, this value specifies falling back to the behavior
  // indicated by the deprecated options,
  // `pin_l0_filter_and_index_blocks_in_cache` and
  // `pin_top_level_index_and_filter`.
  kFallback,

  // This tier contains no block-based tables.
  kNone,

  // This tier contains block-based tables that may have originated from a
  // memtable flush. In particular, it includes tables from L0 that are smaller
  // than 1.5 times the current `write_buffer_size`. Note these criteria imply
  // it can include intra-L0 compaction outputs and ingested files, as long as
  // they are not abnormally large compared to flushed files in L0.
  kFlushedAndSimilar,

  // This tier contains all block-based tables.
  kAll,
};

// `MetadataCacheOptions` contains members indicating the desired caching
// behavior for the different categories of metadata blocks.
struct MetadataCacheOptions {
  // The tier of block-based tables whose top-level index into metadata
  // partitions will be pinned. Currently indexes and filters may be
  // partitioned.
  //
  // Note `cache_index_and_filter_blocks` must be true for this option to have
  // any effect. Otherwise any top-level index into metadata partitions would
  // be held in table reader memory, outside the block cache.
  PinningTier top_level_index_pinning = PinningTier::kFallback;

  // The tier of block-based tables whose metadata partitions will be pinned.
  // Currently indexes and filters may be partitioned.
  PinningTier partition_pinning = PinningTier::kFallback;

  // The tier of block-based tables whose unpartitioned metadata blocks will be
  // pinned.
  //
  // Note `cache_index_and_filter_blocks` must be true for this option to have
  // any effect. Otherwise the unpartitioned meta-blocks would be held in table
  // reader memory, outside the block cache.
  PinningTier unpartitioned_pinning = PinningTier::kFallback;
};
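
// Example: a minimal sketch of one possible pinning configuration (remember
// that `cache_index_and_filter_blocks` must be true for these options to
// take effect):
//
//   BlockBasedTableOptions table_options;
//   table_options.cache_index_and_filter_blocks = true;
//   table_options.metadata_cache_options.top_level_index_pinning =
//       PinningTier::kAll;
//   table_options.metadata_cache_options.partition_pinning =
//       PinningTier::kFlushedAndSimilar;
//   table_options.metadata_cache_options.unpartitioned_pinning =
//       PinningTier::kFlushedAndSimilar;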

// For advanced users only
struct BlockBasedTableOptions {
  static const char* kName() { return "BlockTableOptions"; }
  // @flush_block_policy_factory creates the instances of flush block policy,
  // which provides a configurable way to determine when to flush a block in
  // the block-based tables. If not set, the table builder will use the
  // default block flush policy, which cuts blocks by block size (please refer
  // to `FlushBlockBySizePolicy`).
  std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;

  // TODO(kailiu) Temporarily disable this feature by making the default value
  // to be false.
  //
  // TODO(ajkr) we need to update names of variables controlling meta-block
  // caching as they should now apply to range tombstone and compression
  // dictionary meta-blocks, in addition to index and filter meta-blocks.
  //
  // Indicates whether to put index/filter blocks in the block cache.
  // If not specified, each "table reader" object will pre-load the
  // index/filter blocks during table initialization.
  bool cache_index_and_filter_blocks = false;

  // If cache_index_and_filter_blocks is enabled, cache index and filter
  // blocks with high priority. If set to true, depending on the
  // implementation of the block cache, index and filter blocks may be less
  // likely to be evicted than data blocks.
  bool cache_index_and_filter_blocks_with_high_priority = true;

  // DEPRECATED: This option will be removed in a future version. For now, this
  // option still takes effect by updating each of the following variables that
  // has the default value, `PinningTier::kFallback`:
  //
  // - `MetadataCacheOptions::partition_pinning`
  // - `MetadataCacheOptions::unpartitioned_pinning`
  //
  // The updated value is chosen as follows:
  //
  // - `pin_l0_filter_and_index_blocks_in_cache == false` ->
  //   `PinningTier::kNone`
  // - `pin_l0_filter_and_index_blocks_in_cache == true` ->
  //   `PinningTier::kFlushedAndSimilar`
  //
  // To migrate away from this flag, explicitly configure
  // `MetadataCacheOptions` as described above.
  //
  // If cache_index_and_filter_blocks is true and the below is true, then
  // filter and index blocks are stored in the cache, but a reference is
  // held in the "table reader" object so the blocks are pinned and only
  // evicted from cache when the table reader is freed.
  bool pin_l0_filter_and_index_blocks_in_cache = false;

  // DEPRECATED: This option will be removed in a future version. For now, this
  // option still takes effect by updating
  // `MetadataCacheOptions::top_level_index_pinning` when it has the
  // default value, `PinningTier::kFallback`.
  //
  // The updated value is chosen as follows:
  //
  // - `pin_top_level_index_and_filter == false` ->
  //   `PinningTier::kNone`
  // - `pin_top_level_index_and_filter == true` ->
  //   `PinningTier::kAll`
  //
  // To migrate away from this flag, explicitly configure
  // `MetadataCacheOptions` as described above.
  //
  // If cache_index_and_filter_blocks is true and the below is true, then
  // the top-level index of partitioned filter and index blocks is stored in
  // the cache, but a reference is held in the "table reader" object so the
  // blocks are pinned and only evicted from cache when the table reader is
  // freed. This is not limited to L0 in the LSM tree.
  bool pin_top_level_index_and_filter = true;

  // The desired block cache pinning behavior for the different categories of
  // metadata blocks. While pinning can reduce block cache contention, users
  // must take care not to pin excessive amounts of data, which risks
  // overflowing the block cache.
  MetadataCacheOptions metadata_cache_options;

  // The index type that will be used for this table.
  enum IndexType : char {
    // A space-efficient index block that is optimized for
    // binary-search-based index.
    kBinarySearch = 0x00,

    // The hash index, if enabled, will do the hash lookup when
    // `Options.prefix_extractor` is provided.
    kHashSearch = 0x01,

    // A two-level index implementation. Both levels are binary search indexes.
    kTwoLevelIndexSearch = 0x02,

    // Like kBinarySearch, but the index also contains the first key of each
    // block. This allows iterators to defer reading the block until it's
    // actually needed, which may significantly reduce the read amplification
    // of short range scans. Without it, an iterator seek usually reads one
    // block from each level-0 file and from each other level, which may be
    // expensive.
    // Works best in combination with:
    //  - IndexShorteningMode::kNoShortening,
    //  - custom FlushBlockPolicy to cut blocks at some meaningful boundaries,
    //    e.g. when the prefix changes.
    // Makes the index significantly bigger (2x or more), especially when keys
    // are long.
    kBinarySearchWithFirstKey = 0x03,
  };

  IndexType index_type = kBinarySearch;

  // The index type that will be used for the data block.
  enum DataBlockIndexType : char {
    kDataBlockBinarySearch = 0,   // traditional block type
    kDataBlockBinaryAndHash = 1,  // additional hash index
  };

  DataBlockIndexType data_block_index_type = kDataBlockBinarySearch;

  // #entries/#buckets. It is valid only when data_block_index_type is
  // kDataBlockBinaryAndHash.
  double data_block_hash_table_util_ratio = 0.75;
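
  // Example: a minimal sketch of enabling the data block hash index (using
  // only names declared in this struct):
  //
  //   BlockBasedTableOptions table_options;
  //   table_options.data_block_index_type =
  //       BlockBasedTableOptions::kDataBlockBinaryAndHash;
  //   table_options.data_block_hash_table_util_ratio = 0.75;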

  // This option is now deprecated. No matter what value it is set to,
  // it will behave as if hash_index_allow_collision=true.
  bool hash_index_allow_collision = true;

  // Use the specified checksum type. Newly created table files will be
  // protected with this checksum type. Old table files will still be readable,
  // even though they have a different checksum type.
  ChecksumType checksum = kCRC32c;

  // Disable block cache. If this is set to true,
  // then no block cache should be used, and the block_cache should
  // point to a nullptr object.
  bool no_block_cache = false;

  // If non-NULL, use the specified cache for blocks.
  // If NULL, rocksdb will automatically create and use an 8MB internal cache.
  std::shared_ptr<Cache> block_cache = nullptr;

  // If non-NULL, use the specified cache for pages read from device.
  // If NULL, no page cache is used.
  std::shared_ptr<PersistentCache> persistent_cache = nullptr;

  // If non-NULL, use the specified cache for compressed blocks.
  // If NULL, rocksdb will not use a compressed block cache.
  // Note: though it looks similar to `block_cache`, RocksDB doesn't put the
  // same type of object there.
  std::shared_ptr<Cache> block_cache_compressed = nullptr;

  // Approximate size of user data packed per block. Note that the
  // block size specified here corresponds to uncompressed data. The
  // actual size of the unit read from disk may be smaller if
  // compression is enabled. This parameter can be changed dynamically.
  size_t block_size = 4 * 1024;

  // This is used to close a block before it reaches the configured
  // 'block_size'. If the percentage of free space in the current block is less
  // than this specified number and adding a new record to the block will
  // exceed the configured block size, then this block will be closed and the
  // new record will be written to the next block. For example, with the
  // defaults, a block is closed once it is more than 90% full and the next
  // record would overflow it.
  int block_size_deviation = 10;

  // Number of keys between restart points for delta encoding of keys.
  // This parameter can be changed dynamically. Most clients should
  // leave this parameter alone. The minimum value allowed is 1. Any smaller
  // value will be silently overwritten with 1.
  int block_restart_interval = 16;

  // Same as block_restart_interval but used for the index block.
  int index_block_restart_interval = 1;

  // Block size for partitioned metadata. Currently applied to indexes when
  // kTwoLevelIndexSearch is used and to filters when partition_filters is
  // used.
  // Note: Since in the current implementation the filters and index partitions
  // are aligned, an index/filter block is created when either the index or the
  // filter block size reaches the specified limit.
  // Note: this limit is currently applied only to index blocks; a filter
  // partition is cut right after an index block is cut.
  // TODO(myabandeh): remove the note above when filter partitions are cut
  // separately
  uint64_t metadata_block_size = 4096;

  // Note: currently this option requires kTwoLevelIndexSearch to be set as
  // well.
  // TODO(myabandeh): remove the note above once the limitation is lifted.
  // Use partitioned full filters for each SST file. This option is
  // incompatible with block-based filters.
  bool partition_filters = false;
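
  // Example: a sketch of a typical partitioned index/filter configuration
  // (NewBloomFilterPolicy is declared in rocksdb/filter_policy.h):
  //
  //   BlockBasedTableOptions table_options;
  //   table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
  //   table_options.partition_filters = true;
  //   table_options.metadata_block_size = 4096;
  //   table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));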

  // EXPERIMENTAL Option to generate Bloom filters that minimize memory
  // internal fragmentation.
  //
  // When false, malloc_usable_size is not available, or format_version < 5,
  // filters are generated without regard to internal fragmentation when
  // loaded into memory (historical behavior). When true (and
  // malloc_usable_size is available and format_version >= 5), then Bloom
  // filters are generated to "round up" and "round down" their sizes to
  // minimize internal fragmentation when loaded into memory, assuming the
  // reading DB has the same memory allocation characteristics as the
  // generating DB. This option does not break forward or backward
  // compatibility.
  //
  // While individual filters will vary in bits/key and false positive rate
  // when the setting is true, the implementation attempts to maintain a
  // weighted average FP rate for filters consistent with this option set to
  // false.
  //
  // With Jemalloc for example, this setting is expected to save about 10% of
  // the memory footprint and block cache charge of filters, while increasing
  // disk usage of filters by about 1-2% due to encoding efficiency losses
  // with variance in bits/key.
  //
  // NOTE: Because some memory counted by block cache might be unmapped pages
  // within internal fragmentation, this option can increase observed RSS
  // memory usage. With cache_index_and_filter_blocks=true, this option makes
  // the block cache better at using the space it is allowed.
  //
  // NOTE: Do not set to true if you do not trust malloc_usable_size. With
  // this option, RocksDB might access an allocated memory object beyond its
  // original size if malloc_usable_size says it is safe to do so. While this
  // can be considered bad practice, it should not produce undefined behavior
  // unless malloc_usable_size is buggy or broken.
  bool optimize_filters_for_memory = false;

  // Use delta encoding to compress keys in blocks.
  // ReadOptions::pin_data requires this option to be disabled.
  //
  // Default: true
  bool use_delta_encoding = true;

  // If non-nullptr, use the specified filter policy to reduce disk reads.
  // Many applications will benefit from passing the result of
  // NewBloomFilterPolicy() here.
  std::shared_ptr<const FilterPolicy> filter_policy = nullptr;

  // If true, place whole keys in the filter (not just prefixes).
  // This must generally be true for gets to be efficient.
  bool whole_key_filtering = true;

  // Verify that decompressing the compressed block gives back the input. This
  // is a verification mode that we use to detect bugs in compression
  // algorithms.
  bool verify_compression = false;

  // If used, for every data block we load into memory, we will create a bitmap
  // of size ((block_size / `read_amp_bytes_per_bit`) / 8) bytes. This bitmap
  // will be used to figure out the percentage we actually read of the blocks.
  //
  // When this feature is used, Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES and
  // Tickers::READ_AMP_TOTAL_READ_BYTES can be used to calculate the
  // read amplification using this formula:
  // (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES)
  //
  // value  =>  memory usage (percentage of loaded blocks memory)
  // 1      =>  12.50 %
  // 2      =>  06.25 %
  // 4      =>  03.12 %
  // 8      =>  01.56 %
  // 16     =>  00.78 %
  //
  // Note: This number must be a power of 2; if not, it will be sanitized
  // to the next lowest power of 2. For example, a value of 7 will be
  // treated as 4 and a value of 19 will be treated as 16.
  //
  // Default: 0 (disabled)
  uint32_t read_amp_bytes_per_bit = 0;
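
  // Example: a sketch of estimating read amplification from the tickers
  // above (assuming DBOptions::statistics is set to `stats`):
  //
  //   uint64_t useful =
  //       stats->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
  //   uint64_t total = stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
  //   double read_amp = static_cast<double>(total) / useful;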

  // We currently have six versions:
  // 0 -- This version is currently written out by all RocksDB's versions by
  // default. Can be read by really old RocksDB's. Doesn't support changing
  // checksum type (default is CRC32).
  // 1 -- Can be read by RocksDB's versions since 3.0. Supports non-default
  // checksum, like xxHash. It is written by RocksDB when
  // BlockBasedTableOptions::checksum is something other than kCRC32c. (version
  // 0 is silently upconverted)
  // 2 -- Can be read by RocksDB's versions since 3.10. Changes the way we
  // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you
  // don't plan to run RocksDB before version 3.10, you should probably use
  // this.
  // 3 -- Can be read by RocksDB's versions since 5.15. Changes the way we
  // encode the keys in index blocks. If you don't plan to run RocksDB before
  // version 5.15, you should probably use this.
  // This option only affects newly written tables. When reading existing
  // tables, the information about version is read from the footer.
  // 4 -- Can be read by RocksDB's versions since 5.16. Changes the way we
  // encode the values in index blocks. If you don't plan to run RocksDB before
  // version 5.16 and you are using index_block_restart_interval > 1, you
  // should probably use this as it would reduce the index size.
  // This option only affects newly written tables. When reading existing
  // tables, the information about version is read from the footer.
  // 5 -- Can be read by RocksDB's versions since 6.6.0. Full and partitioned
  // filters use a generally faster and more accurate Bloom filter
  // implementation, with a different schema.
  uint32_t format_version = 4;
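
  // Example: opting in to the newer filter implementation, assuming all
  // readers run RocksDB >= 6.6.0:
  //
  //   table_options.format_version = 5;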

  // Store index blocks on disk in compressed format. Changing this option to
  // false will avoid the overhead of decompression if index blocks are evicted
  // and read back.
  bool enable_index_compression = true;

  // Align data blocks on the lesser of page size and block size.
  bool block_align = false;

  // This enum allows trading off increased index size for improved iterator
  // seek performance in some situations, particularly when block cache is
  // disabled (ReadOptions::fill_cache = false) and direct IO is
  // enabled (DBOptions::use_direct_reads = true).
  // The default mode is the best tradeoff for most use cases.
  // This option only affects newly written tables.
  //
  // The index contains a key separating each pair of consecutive blocks.
  // Let A be the highest key in one block, B the lowest key in the next block,
  // and I the index entry separating these two blocks:
  // [ ... A] I [B ...]
  // I is allowed to be anywhere in [A, B).
  // If an iterator is seeked to a key in (A, I], we'll unnecessarily read the
  // first block, then immediately fall through to the second block.
  // However, if I=A, this can't happen, and we'll read only the second block.
  // In kNoShortening mode, we use I=A. In other modes, we use the shortest
  // key in [A, B), which usually significantly reduces index size.
  //
  // There's a similar story for the last index entry, which is an upper bound
  // of the highest key in the file. If it's shortened and therefore
  // overestimated, the iterator is likely to unnecessarily read the last data
  // block from each file on each seek.
  enum class IndexShorteningMode : char {
    // Use full keys.
    kNoShortening,
    // Shorten index keys between blocks, but use the full key for the last
    // index key, which is the upper bound of the whole file.
    kShortenSeparators,
    // Shorten both keys between blocks and the key after the last block.
    kShortenSeparatorsAndSuccessor,
  };

  IndexShorteningMode index_shortening =
      IndexShorteningMode::kShortenSeparators;
};

// Table property names that are specific to block-based tables.
struct BlockBasedTablePropertyNames {
  // The value of this property is a fixed int32 number.
  static const std::string kIndexType;
  // The value is "1" for true and "0" for false.
  static const std::string kWholeKeyFiltering;
  // The value is "1" for true and "0" for false.
  static const std::string kPrefixFiltering;
};

// Create a default block-based table factory.
extern TableFactory* NewBlockBasedTableFactory(
    const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
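
// Example: a sketch of a common block-based table configuration
// (NewLRUCache is declared in rocksdb/cache.h and NewBloomFilterPolicy in
// rocksdb/filter_policy.h):
//
//   Options options;
//   BlockBasedTableOptions table_options;
//   table_options.block_cache = NewLRUCache(64 << 20);  // 64 MB block cache
//   table_options.filter_policy.reset(NewBloomFilterPolicy(10));
//   options.table_factory.reset(NewBlockBasedTableFactory(table_options));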

#ifndef ROCKSDB_LITE

enum EncodingType : char {
  // Always write full keys without any special encoding.
  kPlain,
  // Find opportunity to write the same prefix once for multiple rows.
  // In some cases, when a key follows a previous key with the same prefix,
  // instead of writing out the full key, it just writes out the size of the
  // shared prefix, as well as other bytes, to save some bytes.
  //
  // When using this option, the user is required to use the same prefix
  // extractor to make sure the same prefix will be extracted from the same
  // key. The Name() value of the prefix extractor will be stored in the file.
  // When reopening the file, the name of the options.prefix_extractor given
  // will be bitwise compared to the prefix extractors stored in the file. An
  // error will be returned if the two don't match.
  kPrefix,
};

// Table property names that are specific to plain tables.
struct PlainTablePropertyNames {
  static const std::string kEncodingType;
  static const std::string kBloomVersion;
  static const std::string kNumBloomBlocks;
};

const uint32_t kPlainTableVariableLength = 0;

struct PlainTableOptions {
  static const char* kName() { return "PlainTableOptions"; }
  // @user_key_len: plain table has an optimization for fixed-size keys, which
  //                can be specified via user_key_len. Alternatively, you can
  //                pass `kPlainTableVariableLength` if your keys have variable
  //                lengths.
  uint32_t user_key_len = kPlainTableVariableLength;

  // @bloom_bits_per_key: the number of bits used for the bloom filter per
  //                      prefix. You may disable it by passing zero.
  int bloom_bits_per_key = 10;

  // @hash_table_ratio: the desired utilization of the hash table used for
  //                    prefix hashing.
  //                    hash_table_ratio = number of prefixes / #buckets in the
  //                    hash table
  double hash_table_ratio = 0.75;

  // @index_sparseness: within each prefix, an index record is built for every
  //                    index_sparseness keys, for binary search inside each
  //                    hash bucket. For encoding type kPrefix, the value will
  //                    be used when writing to determine an interval to
  //                    rewrite the full key. It will also be used as a
  //                    suggestion and satisfied when possible.
  size_t index_sparseness = 16;

  // @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
  //                      Otherwise from huge page TLB. The user needs to
  //                      reserve huge pages for it to be allocated, like:
  //                      sysctl -w vm.nr_hugepages=20
  //                      See the Linux doc Documentation/vm/hugetlbpage.txt
  size_t huge_page_tlb_size = 0;

  // @encoding_type: how to encode the keys. See enum EncodingType above for
  //                 the choices. The value will determine how to encode keys
  //                 when writing to a new SST file. This value will be stored
  //                 inside the SST file and will be used when reading from
  //                 the file, which makes it possible for users to choose
  //                 different encoding types when reopening a DB. Files with
  //                 different encoding types can co-exist in the same DB and
  //                 can be read.
  EncodingType encoding_type = kPlain;

  // @full_scan_mode: mode for reading the whole file record by record without
  //                  using the index.
  bool full_scan_mode = false;

  // @store_index_in_file: compute the plain table index and bloom filter
  //                       during file building and store them in the file.
  //                       When reading the file, the index will be mmapped
  //                       instead of recomputed.
  bool store_index_in_file = false;
};

// -- Plain Table with prefix-only seek
// For this factory, you need to set Options.prefix_extractor properly to make
// it work. Look-up will start with a prefix hash lookup for the key prefix.
// Inside the hash bucket found, a binary search is executed among hash
// conflicts. Finally, a linear search is used.

extern TableFactory* NewPlainTableFactory(
    const PlainTableOptions& options = PlainTableOptions());
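
// Example: a sketch of a typical plain table setup (NewFixedPrefixTransform
// is declared in rocksdb/slice_transform.h; plain table also requires
// allow_mmap_reads for reading):
//
//   Options options;
//   options.prefix_extractor.reset(NewFixedPrefixTransform(8));
//   options.allow_mmap_reads = true;
//   PlainTableOptions plain_table_options;
//   plain_table_options.user_key_len = kPlainTableVariableLength;
//   options.table_factory.reset(NewPlainTableFactory(plain_table_options));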

struct CuckooTablePropertyNames {
  // The key that is used to fill empty buckets.
  static const std::string kEmptyKey;
  // Fixed length of value.
  static const std::string kValueLength;
  // Number of hash functions used in Cuckoo Hash.
  static const std::string kNumHashFunc;
  // It denotes the number of buckets in a Cuckoo Block. Given a key and a
  // particular hash function, a Cuckoo Block is a set of consecutive buckets,
  // where the starting bucket id is given by the hash function on the key. In
  // case of a collision during inserting the key, the builder tries to insert
  // the key in other locations of the cuckoo block before using the next hash
  // function. This reduces cache misses during read operations in case of
  // collision.
  static const std::string kCuckooBlockSize;
  // Size of the hash table. Use this number to compute the modulo of the hash
  // function. The actual number of buckets will be kMaxHashTableSize +
  // kCuckooBlockSize - 1. The last kCuckooBlockSize-1 buckets are used to
  // accommodate the Cuckoo Block from the end of the hash table, due to the
  // cache-friendly implementation.
  static const std::string kHashTableSize;
  // Denotes whether the keys sorted in the file are Internal Keys (if false)
  // or User Keys only (if true).
  static const std::string kIsLastLevel;
  // Indicates if the identity function is used as the first hash function.
  static const std::string kIdentityAsFirstHash;
  // Indicates whether modulo or bitwise AND is used to calculate hash values.
  static const std::string kUseModuleHash;
  // Fixed user key length.
  static const std::string kUserKeyLength;
};

struct CuckooTableOptions {
  static const char* kName() { return "CuckooTableOptions"; }

  // Determines the utilization of hash tables. Smaller values
  // result in larger hash tables with fewer collisions.
  double hash_table_ratio = 0.9;
  // A property used by the builder to determine the depth to go to
  // to search for a path to displace elements in case of
  // collision. See Builder.MakeSpaceForKey method. Higher
  // values result in more efficient hash tables with fewer
  // lookups but take more time to build.
  uint32_t max_search_depth = 100;
  // In case of collision while inserting, the builder
  // attempts to insert in the next cuckoo_block_size
  // locations before skipping over to the next Cuckoo hash
  // function. This makes lookups more cache friendly in case
  // of collisions.
  uint32_t cuckoo_block_size = 5;
  // If this option is enabled, the user key is treated as uint64_t and its
  // value is used as the hash value directly. This option changes the
  // builder's behavior. Readers ignore this option and behave according to
  // what is specified in the table property.
  bool identity_as_first_hash = false;
  // If this option is set to true, modulo is used during hash calculation.
  // This often yields better space efficiency at the cost of performance.
  // If this option is set to false, the number of entries in the table is
  // constrained to be a power of two, and bitwise AND is used to calculate
  // the hash, which is faster in general.
  bool use_module_hash = true;
};

// Cuckoo Table Factory for the SST table format using cache-friendly cuckoo
// hashing.
extern TableFactory* NewCuckooTableFactory(
    const CuckooTableOptions& table_options = CuckooTableOptions());
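
// Example: a minimal sketch of using the cuckoo table factory:
//
//   Options options;
//   CuckooTableOptions cuckoo_table_options;
//   cuckoo_table_options.hash_table_ratio = 0.9;
//   options.table_factory.reset(NewCuckooTableFactory(cuckoo_table_options));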

#endif  // ROCKSDB_LITE

class RandomAccessFileReader;

// A base class for table factories.
class TableFactory : public Customizable {
 public:
  virtual ~TableFactory() override {}

  static const char* kBlockCacheOpts() { return "BlockCache"; }
  static const char* kBlockBasedTableName() { return "BlockBasedTable"; }
  static const char* kPlainTableName() { return "PlainTable"; }
  static const char* kCuckooTableName() { return "CuckooTable"; }

  // Creates and configures a new TableFactory from the input options and id.
  static Status CreateFromString(const ConfigOptions& config_options,
                                 const std::string& id,
                                 std::shared_ptr<TableFactory>* factory);
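
  // Example: a sketch of creating a block-based table factory by id, using
  // the names defined above:
  //
  //   ConfigOptions config_options;
  //   std::shared_ptr<TableFactory> factory;
  //   Status s = TableFactory::CreateFromString(
  //       config_options, TableFactory::kBlockBasedTableName(), &factory);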

  static const char* Type() { return "TableFactory"; }

  // Returns a table reader object that can fetch data from the file specified
  // in the `file` parameter. It is the caller's responsibility to make sure
  // the file is in the correct format.
  //
  // NewTableReader() is called in three places:
  // (1) TableCache::FindTable() calls the function on a table cache miss and
  // caches the table object returned.
  // (2) SstFileDumper (for SST Dump) opens the table and dumps the table
  // contents using the iterator of the table.
  // (3) DBImpl::IngestExternalFile() calls this function to read the contents
  // of the sst file it's attempting to add.
  //
  // table_reader_options is a TableReaderOptions which contains all the
  // needed parameters and configuration to open the table.
  // file is a file handler to handle the file for the table.
  // file_size is the physical file size of the file.
  // table_reader is the output table reader.
  virtual Status NewTableReader(
      const TableReaderOptions& table_reader_options,
      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
      std::unique_ptr<TableReader>* table_reader,
      bool prefetch_index_and_filter_in_cache = true) const {
    ReadOptions ro;
    return NewTableReader(ro, table_reader_options, std::move(file), file_size,
                          table_reader, prefetch_index_and_filter_in_cache);
  }

  // Overload of the above function that allows the caller to pass in a
  // ReadOptions.
  virtual Status NewTableReader(
      const ReadOptions& ro, const TableReaderOptions& table_reader_options,
      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
      std::unique_ptr<TableReader>* table_reader,
      bool prefetch_index_and_filter_in_cache) const = 0;

  // Return a table builder to write to a file for this table type.
  //
  // It is called in several places:
  // (1) When flushing a memtable to a level-0 output file, it creates a table
  // builder (in DBImpl::WriteLevel0Table(), by calling BuildTable())
  // (2) During compaction, it gets the builder for writing compaction output
  // files in DBImpl::OpenCompactionOutputFile().
  // (3) When recovering from transaction logs, it creates a table builder to
  // write to a level-0 output file (in DBImpl::WriteLevel0TableForRecovery,
  // by calling BuildTable())
  // (4) When running Repairer, it creates a table builder to convert logs to
  // SST files (in Repairer::ConvertLogToTable() by calling BuildTable())
  //
  // Multiple configuration options can be accessed from there, including but
  // not limited to compression options. file is a handle of a writable file.
  // It is the caller's responsibility to keep the file open and close the file
  // after closing the table builder. compression_type is the compression type
  // to use in this table.
  virtual TableBuilder* NewTableBuilder(
      const TableBuilderOptions& table_builder_options,
      uint32_t column_family_id, WritableFileWriter* file) const = 0;

  // Returns whether delete range is supported.
  virtual bool IsDeleteRangeSupported() const { return false; }
};

#ifndef ROCKSDB_LITE
// Create a special table factory that can open either of the supported
// table formats, based on settings inside the SST files. It should be used to
// convert a DB from one table format to another.
// @table_factory_to_write: the table factory used when writing to new files.
// @block_based_table_factory: block-based table factory to use. If NULL, use
//                             a default one.
// @plain_table_factory: plain table factory to use. If NULL, use a default
//                       one.
// @cuckoo_table_factory: cuckoo table factory to use. If NULL, use a default
//                        one.
extern TableFactory* NewAdaptiveTableFactory(
    std::shared_ptr<TableFactory> table_factory_to_write = nullptr,
    std::shared_ptr<TableFactory> block_based_table_factory = nullptr,
    std::shared_ptr<TableFactory> plain_table_factory = nullptr,
    std::shared_ptr<TableFactory> cuckoo_table_factory = nullptr);
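
// Example: a sketch of migrating an existing DB to block-based tables while
// still being able to read its current files:
//
//   options.table_factory.reset(NewAdaptiveTableFactory(
//       std::shared_ptr<TableFactory>(NewBlockBasedTableFactory())));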

#endif  // ROCKSDB_LITE

}  // namespace ROCKSDB_NAMESPACE