// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Currently we support two types of tables: plain table and block-based table.
// 1. Block-based table: this is the default table type that we inherited from
//    LevelDB, which was designed for storing data on hard disk or flash
//    devices.
// 2. Plain table: it is one of RocksDB's SST file formats, optimized
//    for low query latency on pure-memory or really low-latency media.
//
// A tutorial of RocksDB table formats is available here:
//   https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats
//
// Example code is also available
//   https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples

#pragma once

#include <memory>
#include <string>
#include <unordered_map>

#include "rocksdb/cache.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/status.h"

namespace rocksdb {

// -- Block-based Table
class FlushBlockPolicyFactory;
class PersistentCache;
class RandomAccessFile;
struct TableReaderOptions;
struct TableBuilderOptions;
class TableBuilder;
class TableReader;
class WritableFileWriter;
struct EnvOptions;
struct Options;

using std::unique_ptr;

enum ChecksumType : char {
  kNoChecksum = 0x0,
  kCRC32c = 0x1,
  kxxHash = 0x2,
};

// For advanced users only
struct BlockBasedTableOptions {
  // @flush_block_policy_factory creates the instances of flush block policy,
  // which provides a configurable way to determine when to flush a block in
  // the block based tables. If not set, the table builder will use the
  // default block flush policy, which cuts blocks by block size (please
  // refer to `FlushBlockBySizePolicy`).
  std::shared_ptr<FlushBlockPolicyFactory> flush_block_policy_factory;

  // TODO(kailiu) Temporarily disable this feature by making the default value
  // to be false.
  //
  // Indicates whether to put index/filter blocks into the block cache.
  // If not specified, each "table reader" object will pre-load the
  // index/filter blocks during table initialization.
  bool cache_index_and_filter_blocks = false;

  // If cache_index_and_filter_blocks is enabled, cache index and filter
  // blocks with high priority. If set to true, depending on the
  // implementation of the block cache, index and filter blocks may be less
  // likely to be evicted than data blocks.
  bool cache_index_and_filter_blocks_with_high_priority = false;

  // If cache_index_and_filter_blocks is true and the below is true, then
  // filter and index blocks are stored in the cache, but a reference is
  // held in the "table reader" object so the blocks are pinned and only
  // evicted from cache when the table reader is freed.
  bool pin_l0_filter_and_index_blocks_in_cache = false;

  // If cache_index_and_filter_blocks is true and the below is true, then
  // the top-level index of partitioned filter and index blocks is stored in
  // the cache, but a reference is held in the "table reader" object so the
  // blocks are pinned and only evicted from cache when the table reader is
  // freed. This is not limited to l0 in the LSM tree.
  bool pin_top_level_index_and_filter = true;

  // The index type that will be used for this table.
  enum IndexType : char {
    // A space-efficient index block that is optimized for
    // binary-search-based index.
    kBinarySearch,

    // The hash index, if enabled, will perform a hash lookup when
    // `Options.prefix_extractor` is provided.
    kHashSearch,

    // A two-level index implementation. Both levels are binary search indexes.
    kTwoLevelIndexSearch,
  };

  IndexType index_type = kBinarySearch;

  // The index type that will be used for the data block.
  enum DataBlockIndexType : char {
    kDataBlockBinarySearch = 0,   // traditional block type
    kDataBlockBinaryAndHash = 1,  // additional hash index
  };

  DataBlockIndexType data_block_index_type = kDataBlockBinarySearch;

  // #entries/#buckets. It is valid only when data_block_index_type is
  // kDataBlockBinaryAndHash.
  double data_block_hash_table_util_ratio = 0.75;
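
  // Example (a sketch): enabling the additional hash index inside each data
  // block, together with its utilization ratio:
  //
  //   BlockBasedTableOptions table_options;
  //   table_options.data_block_index_type =
  //       BlockBasedTableOptions::kDataBlockBinaryAndHash;
  //   table_options.data_block_hash_table_util_ratio = 0.75;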

  // This option is now deprecated. No matter what value it is set to,
  // it will behave as if hash_index_allow_collision=true.
  bool hash_index_allow_collision = true;

  // Use the specified checksum type. Newly created table files will be
  // protected with this checksum type. Old table files will still be
  // readable, even if they have a different checksum type.
  ChecksumType checksum = kCRC32c;

  // Disable block cache. If this is set to true,
  // then no block cache will be used, and block_cache must be left as
  // nullptr.
  bool no_block_cache = false;

  // If non-NULL, use the specified cache for blocks.
  // If NULL, rocksdb will automatically create and use an 8MB internal cache.
  std::shared_ptr<Cache> block_cache = nullptr;
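
  // Example (a sketch; NewLRUCache() is declared in rocksdb/cache.h, which
  // is already included above):
  //
  //   table_options.block_cache = NewLRUCache(64 << 20);  // 64 MB cache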

  // If non-NULL, use the specified cache for pages read from the device.
  // If NULL, no page cache is used.
  std::shared_ptr<PersistentCache> persistent_cache = nullptr;

  // If non-NULL, use the specified cache for compressed blocks.
  // If NULL, rocksdb will not use a compressed block cache.
  std::shared_ptr<Cache> block_cache_compressed = nullptr;

  // Approximate size of user data packed per block. Note that the
  // block size specified here corresponds to uncompressed data. The
  // actual size of the unit read from disk may be smaller if
  // compression is enabled. This parameter can be changed dynamically.
  size_t block_size = 4 * 1024;

  // This is used to close a block before it reaches the configured
  // 'block_size'. If the percentage of free space in the current block is
  // less than this specified number and adding a new record to the block
  // will exceed the configured block size, then this block will be closed
  // and the new record will be written to the next block.
  int block_size_deviation = 10;

  // Number of keys between restart points for delta encoding of keys.
  // This parameter can be changed dynamically. Most clients should
  // leave this parameter alone. The minimum value allowed is 1. Any smaller
  // value will be silently overwritten with 1.
  int block_restart_interval = 16;

  // Same as block_restart_interval but used for the index block.
  int index_block_restart_interval = 1;

  // Block size for partitioned metadata. Currently applied to indexes when
  // kTwoLevelIndexSearch is used and to filters when partition_filters is
  // used.
  // Note: Since in the current implementation the filter and index
  // partitions are aligned, an index/filter block is created when either
  // the index or the filter block size reaches the specified limit.
  // Note: this limit is currently applied only to index blocks; a filter
  // partition is cut right after an index block is cut.
  // TODO(myabandeh): remove the note above when filter partitions are cut
  // separately
  uint64_t metadata_block_size = 4096;

  // Note: currently this option requires kTwoLevelIndexSearch to be set as
  // well.
  // TODO(myabandeh): remove the note above once the limitation is lifted
  // Use partitioned full filters for each SST file. This option is
  // incompatible with block-based filters.
  bool partition_filters = false;
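
  // Example (a sketch): partitioned filters require the two-level index:
  //
  //   table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
  //   table_options.partition_filters = true;
  //   table_options.metadata_block_size = 4096;  // partition size target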

  // Use delta encoding to compress keys in blocks.
  // ReadOptions::pin_data requires this option to be disabled.
  //
  // Default: true
  bool use_delta_encoding = true;

  // If non-nullptr, use the specified filter policy to reduce disk reads.
  // Many applications will benefit from passing the result of
  // NewBloomFilterPolicy() here.
  std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
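
  // Example (a sketch; NewBloomFilterPolicy() is declared in
  // rocksdb/filter_policy.h; 10 bits per key is a common starting point):
  //
  //   table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));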

  // If true, place whole keys in the filter (not just prefixes).
  // This must generally be true for gets to be efficient.
  bool whole_key_filtering = true;

  // Verify that decompressing the compressed block gives back the input. This
  // is a verification mode that we use to detect bugs in compression
  // algorithms.
  bool verify_compression = false;

  // If used, for every data block we load into memory, we will create a
  // bitmap of size ((block_size / `read_amp_bytes_per_bit`) / 8) bytes. This
  // bitmap will be used to figure out the percentage of the blocks we
  // actually read.
  //
  // When this feature is used, Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES and
  // Tickers::READ_AMP_TOTAL_READ_BYTES can be used to calculate the
  // read amplification using this formula
  // (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES)
  //
  // value  =>  memory usage (percentage of loaded blocks memory)
  // 1      =>  12.50 %
  // 2      =>  06.25 %
  // 4      =>  03.12 %
  // 8      =>  01.56 %
  // 16     =>  00.78 %
  //
  // Note: This number must be a power of 2; if not, it will be sanitized
  // to the next lowest power of 2. For example, a value of 7 will be
  // treated as 4 and a value of 19 will be treated as 16.
  //
  // Default: 0 (disabled)
  uint32_t read_amp_bytes_per_bit = 0;

  // We currently have five versions:
  // 0 -- This version is currently written out by all RocksDB versions by
  // default. Can be read by really old RocksDB versions. Doesn't support
  // changing the checksum type (default is CRC32).
  // 1 -- Can be read by RocksDB versions since 3.0. Supports non-default
  // checksum types, like xxHash. It is written by RocksDB when
  // BlockBasedTableOptions::checksum is something other than kCRC32c.
  // (Version 0 is silently upconverted.)
  // 2 -- Can be read by RocksDB versions since 3.10. Changes the way we
  // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you
  // don't plan to run RocksDB before version 3.10, you should probably use
  // this.
  // 3 -- Can be read by RocksDB versions since 5.15. Changes the way we
  // encode the keys in index blocks. If you don't plan to run RocksDB before
  // version 5.15, you should probably use this.
  // This option only affects newly written tables. When reading existing
  // tables, the version information is read from the footer.
  // 4 -- Can be read by RocksDB versions since 5.16. Changes the way we
  // encode the values in index blocks. If you don't plan to run RocksDB
  // before version 5.16 and you are using index_block_restart_interval > 1,
  // you should probably use this, as it would reduce the index size.
  // This option only affects newly written tables. When reading existing
  // tables, the version information is read from the footer.
  uint32_t format_version = 2;

  // Store index blocks on disk in compressed format. Changing this option to
  // false will avoid the overhead of decompression if index blocks are
  // evicted and read back.
  bool enable_index_compression = true;

  // Align data blocks on the lesser of page size and block size.
  bool block_align = false;
};

// Table properties that are specific to block-based tables.
struct BlockBasedTablePropertyNames {
  // The value of this property is a fixed int32 number.
  static const std::string kIndexType;
  // The value is "1" for true and "0" for false.
  static const std::string kWholeKeyFiltering;
  // The value is "1" for true and "0" for false.
  static const std::string kPrefixFiltering;
};

// Create default block based table factory.
extern TableFactory* NewBlockBasedTableFactory(
    const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
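
// Example usage (a minimal sketch; Options and DB come from
// rocksdb/options.h and rocksdb/db.h, and the path is hypothetical):
//
//   BlockBasedTableOptions table_options;
//   table_options.block_size = 16 * 1024;
//   table_options.cache_index_and_filter_blocks = true;
//   Options options;
//   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
//   DB* db;
//   Status s = DB::Open(options, "/path/to/db", &db);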

#ifndef ROCKSDB_LITE

enum EncodingType : char {
  // Always write full keys without any special encoding.
  kPlain,
  // Find opportunities to write the same prefix only once for multiple rows.
  // In some cases, when a key follows a previous key with the same prefix,
  // instead of writing out the full key, it just writes out the size of the
  // shared prefix, as well as other bytes, to save some bytes.
  //
  // When using this option, the user is required to use the same prefix
  // extractor to make sure the same prefix will be extracted from the same
  // key. The Name() value of the prefix extractor will be stored in the
  // file. When reopening the file, the name of the options.prefix_extractor
  // given will be bitwise compared to the prefix extractors stored in the
  // file. An error will be returned if the two don't match.
  kPrefix,
};

// Table properties that are specific to plain tables.
struct PlainTablePropertyNames {
  static const std::string kEncodingType;
  static const std::string kBloomVersion;
  static const std::string kNumBloomBlocks;
};

const uint32_t kPlainTableVariableLength = 0;

struct PlainTableOptions {
  // @user_key_len: plain table has an optimization for fixed-size keys,
  //                which can be specified via user_key_len. Alternatively,
  //                you can pass `kPlainTableVariableLength` if your keys
  //                have variable lengths.
  uint32_t user_key_len = kPlainTableVariableLength;

  // @bloom_bits_per_key: the number of bits used for the bloom filter per
  //                      prefix. You may disable it by passing zero.
  int bloom_bits_per_key = 10;

  // @hash_table_ratio: the desired utilization of the hash table used for
  //                    prefix hashing.
  //                    hash_table_ratio = number of prefixes / #buckets in
  //                    the hash table
  double hash_table_ratio = 0.75;

  // @index_sparseness: inside each prefix, build one index record for every
  //                    index_sparseness keys, for binary search inside each
  //                    hash bucket.
  //                    For encoding type kPrefix, the value will be used when
  //                    writing to determine an interval to rewrite the full
  //                    key. It will also be used as a suggestion and
  //                    satisfied when possible.
  size_t index_sparseness = 16;

  // @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from
  //                      malloc. Otherwise, allocate from huge page TLB.
  //                      The user needs to reserve huge pages for it to be
  //                      allocated, like:
  //                          sysctl -w vm.nr_hugepages=20
  //                      See the Linux doc Documentation/vm/hugetlbpage.txt
  size_t huge_page_tlb_size = 0;

  // @encoding_type: how to encode the keys. See enum EncodingType above for
  //                 the choices. The value will determine how to encode keys
  //                 when writing to a new SST file. This value will be stored
  //                 inside the SST file and will be used when reading from
  //                 the file, which makes it possible for users to choose a
  //                 different encoding type when reopening a DB. Files with
  //                 different encoding types can co-exist in the same DB and
  //                 can be read.
  EncodingType encoding_type = kPlain;

  // @full_scan_mode: mode for reading the whole file one record at a time
  //                  without using the index.
  bool full_scan_mode = false;

  // @store_index_in_file: compute the plain table index and bloom filter
  //                       during file building and store them in the file.
  //                       When reading the file, the index will be mmapped
  //                       instead of recomputed.
  bool store_index_in_file = false;
};

// -- Plain Table with prefix-only seek
// For this factory, you need to set Options.prefix_extractor properly to
// make it work. Look-up starts with a prefix hash lookup for the key prefix.
// Inside the hash bucket found, a binary search is executed for hash
// conflicts. Finally, a linear search is used.

extern TableFactory* NewPlainTableFactory(
    const PlainTableOptions& options = PlainTableOptions());
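
// Example usage (a sketch; NewFixedPrefixTransform() is declared in
// rocksdb/slice_transform.h, and the 8-byte key / 4-byte prefix are
// illustrative):
//
//   PlainTableOptions plain_options;
//   plain_options.user_key_len = 8;  // keys are fixed at 8 bytes here
//   plain_options.bloom_bits_per_key = 10;
//   Options options;
//   options.prefix_extractor.reset(NewFixedPrefixTransform(4));
//   options.allow_mmap_reads = true;  // plain table requires mmap reads
//   options.table_factory.reset(NewPlainTableFactory(plain_options));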

struct CuckooTablePropertyNames {
  // The key that is used to fill empty buckets.
  static const std::string kEmptyKey;
  // Fixed length of value.
  static const std::string kValueLength;
  // Number of hash functions used in Cuckoo Hash.
  static const std::string kNumHashFunc;
  // It denotes the number of buckets in a Cuckoo Block. Given a key and a
  // particular hash function, a Cuckoo Block is a set of consecutive buckets,
  // where the starting bucket id is given by the hash function on the key.
  // In case of a collision while inserting the key, the builder tries to
  // insert the key in other locations of the cuckoo block before using the
  // next hash function. This reduces cache misses during read operations in
  // case of collision.
  static const std::string kCuckooBlockSize;
  // Size of the hash table. Use this number to compute the modulo of the
  // hash function. The actual number of buckets will be kMaxHashTableSize +
  // kCuckooBlockSize - 1. The last kCuckooBlockSize-1 buckets are used to
  // accommodate the Cuckoo Block from the end of the hash table, due to the
  // cache-friendly implementation.
  static const std::string kHashTableSize;
  // Denotes whether the keys sorted in the file are Internal Keys (if false)
  // or User Keys only (if true).
  static const std::string kIsLastLevel;
  // Indicates whether the identity function is used for the first hash
  // function.
  static const std::string kIdentityAsFirstHash;
  // Indicates whether modulo or bitwise AND is used to calculate the hash
  // value.
  static const std::string kUseModuleHash;
  // Fixed user key length.
  static const std::string kUserKeyLength;
};

struct CuckooTableOptions {
  // Determines the utilization of hash tables. Smaller values
  // result in larger hash tables with fewer collisions.
  double hash_table_ratio = 0.9;
  // A property used by the builder to determine the depth to go to
  // when searching for a path to displace elements in case of
  // collision. See Builder.MakeSpaceForKey method. Higher
  // values result in more efficient hash tables with fewer
  // lookups but take more time to build.
  uint32_t max_search_depth = 100;
  // In case of collision while inserting, the builder
  // attempts to insert in the next cuckoo_block_size
  // locations before skipping over to the next Cuckoo hash
  // function. This makes lookups more cache friendly in case
  // of collisions.
  uint32_t cuckoo_block_size = 5;
  // If this option is enabled, the user key is treated as uint64_t and its
  // value is used as the hash value directly. This option changes the
  // builder's behavior. Readers ignore this option and behave according to
  // what is specified in the table property.
  bool identity_as_first_hash = false;
  // If this option is set to true, modulo is used during hash calculation.
  // This often yields better space efficiency at the cost of performance.
  // If this option is set to false, the number of entries in the table is
  // constrained to be a power of two, and a bitwise AND is used to calculate
  // the hash, which is faster in general.
  bool use_module_hash = true;
};

// Cuckoo Table Factory for SST table format using Cache Friendly Cuckoo
// Hashing
extern TableFactory* NewCuckooTableFactory(
    const CuckooTableOptions& table_options = CuckooTableOptions());
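
// Example usage (a sketch; cuckoo tables trade build time for fast point
// lookups):
//
//   CuckooTableOptions cuckoo_options;
//   cuckoo_options.hash_table_ratio = 0.9;  // 90% hash table utilization
//   Options options;
//   options.table_factory.reset(NewCuckooTableFactory(cuckoo_options));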

#endif  // ROCKSDB_LITE

class RandomAccessFileReader;

// A base class for table factories.
class TableFactory {
 public:
  virtual ~TableFactory() {}

  // The type of the table.
  //
  // The client of this package should switch to a new name whenever
  // the table format implementation changes.
  //
  // Names starting with "rocksdb." are reserved and should not be used
  // by any clients of this package.
  virtual const char* Name() const = 0;

  // Returns a table object that can fetch data from the file specified
  // in the parameter file. It's the caller's responsibility to make sure
  // the file is in the correct format.
  //
  // NewTableReader() is called in three places:
  // (1) TableCache::FindTable() calls the function on a table cache miss
  // and caches the returned table object.
  // (2) SstFileReader (for SST Dump) opens the table and dumps the table
  // contents using the iterator of the table.
  // (3) DBImpl::IngestExternalFile() calls this function to read the
  // contents of the sst file it's attempting to add.
  //
  // table_reader_options is a TableReaderOptions which contains all the
  // needed parameters and configuration to open the table.
  // file is a file handler to handle the file for the table.
  // file_size is the physical file size of the file.
  // table_reader is the output table reader.
  virtual Status NewTableReader(
      const TableReaderOptions& table_reader_options,
      unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
      unique_ptr<TableReader>* table_reader,
      bool prefetch_index_and_filter_in_cache = true) const = 0;

  // Return a table builder to write to a file for this table type.
  //
  // It is called in several places:
  // (1) When flushing a memtable to a level-0 output file, it creates a
  // table builder (in DBImpl::WriteLevel0Table(), by calling BuildTable())
  // (2) During compaction, it gets the builder for writing compaction output
  // files in DBImpl::OpenCompactionOutputFile().
  // (3) When recovering from transaction logs, it creates a table builder to
  // write to a level-0 output file (in DBImpl::WriteLevel0TableForRecovery,
  // by calling BuildTable())
  // (4) When running Repairer, it creates a table builder to convert logs to
  // SST files (in Repairer::ConvertLogToTable() by calling BuildTable())
  //
  // Multiple configured options can be accessed from there, including but
  // not limited to compression options. file is a handle of a writable file.
  // It is the caller's responsibility to keep the file open and close the
  // file after closing the table builder. compression_type is the
  // compression type to use in this table.
  virtual TableBuilder* NewTableBuilder(
      const TableBuilderOptions& table_builder_options,
      uint32_t column_family_id, WritableFileWriter* file) const = 0;

  // Sanitizes the specified DB Options and ColumnFamilyOptions.
  //
  // If the function cannot find a way to sanitize the input DB Options,
  // a non-ok Status will be returned.
  virtual Status SanitizeOptions(
      const DBOptions& db_opts,
      const ColumnFamilyOptions& cf_opts) const = 0;

  // Return a string that contains a printable format of the table
  // configuration. RocksDB prints the configuration at DB Open().
  virtual std::string GetPrintableTableOptions() const = 0;

  virtual Status GetOptionString(std::string* /*opt_string*/,
                                 const std::string& /*delimiter*/) const {
    return Status::NotSupported(
        "The table factory doesn't implement GetOptionString().");
  }

  // Returns the raw pointer of the table options that is used by this
  // TableFactory, or nullptr if this function is not supported.
  // Since the return value is a raw pointer, the TableFactory owns the
  // pointer and the caller should not delete the pointer.
  //
  // In certain cases, it is desirable to alter the underlying options when
  // the TableFactory is not used by any open DB by casting the returned
  // pointer to the right class. For instance, if BlockBasedTableFactory is
  // used, then the pointer can be cast to BlockBasedTableOptions.
  //
  // Note that changing the underlying TableFactory options while the
  // TableFactory is currently used by any open DB is undefined behavior.
  // Developers should use DB::SetOption() instead to dynamically change
  // options while the DB is open.
  virtual void* GetOptions() { return nullptr; }

  // Returns whether delete range is supported.
  virtual bool IsDeleteRangeSupported() const { return false; }
};
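
// Example of tweaking the underlying options via TableFactory::GetOptions()
// (a sketch; only safe while no open DB is using the factory, as noted
// above):
//
//   auto* bbt_options = static_cast<BlockBasedTableOptions*>(
//       options.table_factory->GetOptions());
//   if (bbt_options != nullptr) {
//     bbt_options->block_size = 16 * 1024;
//   }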

#ifndef ROCKSDB_LITE
// Create a special table factory that can open any of the supported
// table formats, based on a setting inside the SST files. It should be used
// to convert a DB from one table format to another.
// @table_factory_to_write: the table factory used when writing to new files.
// @block_based_table_factory: block based table factory to use. If NULL, use
//                             a default one.
// @plain_table_factory: plain table factory to use. If NULL, use a default
//                       one.
// @cuckoo_table_factory: cuckoo table factory to use. If NULL, use a default
//                        one.
extern TableFactory* NewAdaptiveTableFactory(
    std::shared_ptr<TableFactory> table_factory_to_write = nullptr,
    std::shared_ptr<TableFactory> block_based_table_factory = nullptr,
    std::shared_ptr<TableFactory> plain_table_factory = nullptr,
    std::shared_ptr<TableFactory> cuckoo_table_factory = nullptr);
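
// Example usage (a sketch): write new files with the plain table format
// while remaining able to read existing files of the other formats:
//
//   Options options;
//   options.table_factory.reset(NewAdaptiveTableFactory(
//       std::shared_ptr<TableFactory>(NewPlainTableFactory())));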

#endif  // ROCKSDB_LITE

}  // namespace rocksdb