ceph/src/rocksdb/include/rocksdb/advanced_options.h

   1 // Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
   2 // This source code is licensed under the BSD-style license found in the
   3 // LICENSE file in the root directory of this source tree. An additional grant
   4 // of patent rights can be found in the PATENTS file in the same directory.
   5 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
   6 // Use of this source code is governed by a BSD-style license that can be
   7 // found in the LICENSE file. See the AUTHORS file for names of contributors.
   8
   9 #pragma once
  10
  11 #include <memory>
  12
  13 #include "rocksdb/memtablerep.h"
  14 #include "rocksdb/universal_compaction.h"
  15
  16 namespace rocksdb {
  17
  18 class Slice;
  19 class SliceTransform;
  20 enum CompressionType : unsigned char;
  21 class TablePropertiesCollectorFactory;
  22 class TableFactory;
  23 struct Options;
  24
  25 enum CompactionStyle : char {
  26   // level based compaction style
  27   kCompactionStyleLevel = 0x0,
  28   // Universal compaction style
  29   // Not supported in ROCKSDB_LITE.
  30   kCompactionStyleUniversal = 0x1,
  31   // FIFO compaction style
  32   // Not supported in ROCKSDB_LITE
  33   kCompactionStyleFIFO = 0x2,
  34   // Disable background compaction. Compaction jobs are submitted
  35   // via CompactFiles().
  36   // Not supported in ROCKSDB_LITE
  37   kCompactionStyleNone = 0x3,
  38 };
  39
  40 // In Level-based comapction, it Determines which file from a level to be
  41 // picked to merge to the next level. We suggest people try
  42 // kMinOverlappingRatio first when you tune your database.
  43 enum CompactionPri : char {
  44   // Slightly Priotize larger files by size compensated by #deletes
  45   kByCompensatedSize = 0x0,
  46   // First compact files whose data's latest update time is oldest.
  47   // Try this if you only update some hot keys in small ranges.
  48   kOldestLargestSeqFirst = 0x1,
  49   // First compact files whose range hasn't been compacted to the next level
  50   // for the longest. If your updates are random across the key space,
  51   // write amplification is slightly better with this option.
  52   kOldestSmallestSeqFirst = 0x2,
  53   // First compact files whose ratio between overlapping size in next level
  54   // and its size is the smallest. It in many cases can optimize write
  55   // amplification.
  56   kMinOverlappingRatio = 0x3,
  57 };
  58
  59 struct CompactionOptionsFIFO {
  60   // once the total sum of table files reaches this, we will delete the oldest
  61   // table file
  62   // Default: 1GB
  63   uint64_t max_table_files_size;
  64
  65   CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {}
  66   CompactionOptionsFIFO(uint64_t _max_table_files_size) :
  67           max_table_files_size(_max_table_files_size) {}
  68 };
  69
  70 // Compression options for different compression algorithms like Zlib
  71 struct CompressionOptions {
  72   int window_bits;
  73   int level;
  74   int strategy;
  75   // Maximum size of dictionary used to prime the compression library. Currently
  76   // this dictionary will be constructed by sampling the first output file in a
  77   // subcompaction when the target level is bottommost. This dictionary will be
  78   // loaded into the compression library before compressing/uncompressing each
  79   // data block of subsequent files in the subcompaction. Effectively, this
  80   // improves compression ratios when there are repetitions across data blocks.
  81   // A value of 0 indicates the feature is disabled.
  82   // Default: 0.
  83   uint32_t max_dict_bytes;
  84
  85   CompressionOptions()
  86       : window_bits(-14), level(-1), strategy(0), max_dict_bytes(0) {}
  87   CompressionOptions(int wbits, int _lev, int _strategy, int _max_dict_bytes)
  88       : window_bits(wbits),
  89         level(_lev),
  90         strategy(_strategy),
  91         max_dict_bytes(_max_dict_bytes) {}
  92 };
  93
  94 enum UpdateStatus {    // Return status For inplace update callback
  95   UPDATE_FAILED   = 0, // Nothing to update
  96   UPDATED_INPLACE = 1, // Value updated inplace
  97   UPDATED         = 2, // No inplace update. Merged value set
  98 };
  99
 100
 101 struct AdvancedColumnFamilyOptions {
 102   // The maximum number of write buffers that are built up in memory.
 103   // The default and the minimum number is 2, so that when 1 write buffer
 104   // is being flushed to storage, new writes can continue to the other
 105   // write buffer.
 106   // If max_write_buffer_number > 3, writing will be slowed down to
 107   // options.delayed_write_rate if we are writing to the last write buffer
 108   // allowed.
 109   //
 110   // Default: 2
 111   //
 112   // Dynamically changeable through SetOptions() API
 113   int max_write_buffer_number = 2;
 114
 115   // The minimum number of write buffers that will be merged together
 116   // before writing to storage.  If set to 1, then
 117   // all write buffers are flushed to L0 as individual files and this increases
 118   // read amplification because a get request has to check in all of these
 119   // files. Also, an in-memory merge may result in writing lesser
 120   // data to storage if there are duplicate records in each of these
 121   // individual write buffers.  Default: 1
 122   int min_write_buffer_number_to_merge = 1;
 123
 124   // The total maximum number of write buffers to maintain in memory including
 125   // copies of buffers that have already been flushed.  Unlike
 126   // max_write_buffer_number, this parameter does not affect flushing.
 127   // This controls the minimum amount of write history that will be available
 128   // in memory for conflict checking when Transactions are used.
 129   //
 130   // When using an OptimisticTransactionDB:
 131   // If this value is too low, some transactions may fail at commit time due
 132   // to not being able to determine whether there were any write conflicts.
 133   //
 134   // When using a TransactionDB:
 135   // If Transaction::SetSnapshot is used, TransactionDB will read either
 136   // in-memory write buffers or SST files to do write-conflict checking.
 137   // Increasing this value can reduce the number of reads to SST files
 138   // done for conflict detection.
 139   //
 140   // Setting this value to 0 will cause write buffers to be freed immediately
 141   // after they are flushed.
 142   // If this value is set to -1, 'max_write_buffer_number' will be used.
 143   //
 144   // Default:
 145   // If using a TransactionDB/OptimisticTransactionDB, the default value will
 146   // be set to the value of 'max_write_buffer_number' if it is not explicitly
 147   // set by the user.  Otherwise, the default is 0.
 148   int max_write_buffer_number_to_maintain = 0;
 149
 150   // Allows thread-safe inplace updates. If this is true, there is no way to
 151   // achieve point-in-time consistency using snapshot or iterator (assuming
 152   // concurrent updates). Hence iterator and multi-get will return results
 153   // which are not consistent as of any point-in-time.
 154   // If inplace_callback function is not set,
 155   //   Put(key, new_value) will update inplace the existing_value iff
 156   //   * key exists in current memtable
 157   //   * new sizeof(new_value) <= sizeof(existing_value)
 158   //   * existing_value for that key is a put i.e. kTypeValue
 159   // If inplace_callback function is set, check doc for inplace_callback.
 160   // Default: false.
 161   bool inplace_update_support = false;
 162
 163   // Number of locks used for inplace update
 164   // Default: 10000, if inplace_update_support = true, else 0.
 165   //
 166   // Dynamically changeable through SetOptions() API
 167   size_t inplace_update_num_locks = 10000;
 168
 169   // existing_value - pointer to previous value (from both memtable and sst).
 170   //                  nullptr if key doesn't exist
 171   // existing_value_size - pointer to size of existing_value).
 172   //                       nullptr if key doesn't exist
 173   // delta_value - Delta value to be merged with the existing_value.
 174   //               Stored in transaction logs.
 175   // merged_value - Set when delta is applied on the previous value.
 176
 177   // Applicable only when inplace_update_support is true,
 178   // this callback function is called at the time of updating the memtable
 179   // as part of a Put operation, lets say Put(key, delta_value). It allows the
 180   // 'delta_value' specified as part of the Put operation to be merged with
 181   // an 'existing_value' of the key in the database.
 182
 183   // If the merged value is smaller in size that the 'existing_value',
 184   // then this function can update the 'existing_value' buffer inplace and
 185   // the corresponding 'existing_value'_size pointer, if it wishes to.
 186   // The callback should return UpdateStatus::UPDATED_INPLACE.
 187   // In this case. (In this case, the snapshot-semantics of the rocksdb
 188   // Iterator is not atomic anymore).
 189
 190   // If the merged value is larger in size than the 'existing_value' or the
 191   // application does not wish to modify the 'existing_value' buffer inplace,
 192   // then the merged value should be returned via *merge_value. It is set by
 193   // merging the 'existing_value' and the Put 'delta_value'. The callback should
 194   // return UpdateStatus::UPDATED in this case. This merged value will be added
 195   // to the memtable.
 196
 197   // If merging fails or the application does not wish to take any action,
 198   // then the callback should return UpdateStatus::UPDATE_FAILED.
 199
 200   // Please remember that the original call from the application is Put(key,
 201   // delta_value). So the transaction log (if enabled) will still contain (key,
 202   // delta_value). The 'merged_value' is not stored in the transaction log.
 203   // Hence the inplace_callback function should be consistent across db reopens.
 204
 205   // Default: nullptr
 206   UpdateStatus (*inplace_callback)(char* existing_value,
 207                                    uint32_t* existing_value_size,
 208                                    Slice delta_value,
 209                                    std::string* merged_value) = nullptr;
 210
 211   // if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0,
 212   // create prefix bloom for memtable with the size of
 213   // write_buffer_size * memtable_prefix_bloom_size_ratio.
 214   // If it is larger than 0.25, it is santinized to 0.25.
 215   //
 216   // Default: 0 (disable)
 217   //
 218   // Dynamically changeable through SetOptions() API
 219   double memtable_prefix_bloom_size_ratio = 0.0;
 220
 221   // Page size for huge page for the arena used by the memtable. If <=0, it
 222   // won't allocate from huge page but from malloc.
 223   // Users are responsible to reserve huge pages for it to be allocated. For
 224   // example:
 225   //      sysctl -w vm.nr_hugepages=20
 226   // See linux doc Documentation/vm/hugetlbpage.txt
 227   // If there isn't enough free huge page available, it will fall back to
 228   // malloc.
 229   //
 230   // Dynamically changeable through SetOptions() API
 231   size_t memtable_huge_page_size = 0;
 232
 233   // If non-nullptr, memtable will use the specified function to extract
 234   // prefixes for keys, and for each prefix maintain a hint of insert location
 235   // to reduce CPU usage for inserting keys with the prefix. Keys out of
 236   // domain of the prefix extractor will be insert without using hints.
 237   //
 238   // Currently only the default skiplist based memtable implements the feature.
 239   // All other memtable implementation will ignore the option. It incurs ~250
 240   // additional bytes of memory overhead to store a hint for each prefix.
 241   // Also concurrent writes (when allow_concurrent_memtable_write is true) will
 242   // ignore the option.
 243   //
 244   // The option is best suited for workloads where keys will likely to insert
 245   // to a location close the the last inserted key with the same prefix.
 246   // One example could be inserting keys of the form (prefix + timestamp),
 247   // and keys of the same prefix always comes in with time order. Another
 248   // example would be updating the same key over and over again, in which case
 249   // the prefix can be the key itself.
 250   //
 251   // Default: nullptr (disable)
 252   std::shared_ptr<const SliceTransform>
 253       memtable_insert_with_hint_prefix_extractor = nullptr;
 254
 255   // Control locality of bloom filter probes to improve cache miss rate.
 256   // This option only applies to memtable prefix bloom and plaintable
 257   // prefix bloom. It essentially limits every bloom checking to one cache line.
 258   // This optimization is turned off when set to 0, and positive number to turn
 259   // it on.
 260   // Default: 0
 261   uint32_t bloom_locality = 0;
 262
 263   // size of one block in arena memory allocation.
 264   // If <= 0, a proper value is automatically calculated (usually 1/8 of
 265   // writer_buffer_size, rounded up to a multiple of 4KB).
 266   //
 267   // There are two additional restriction of the The specified size:
 268   // (1) size should be in the range of [4096, 2 << 30] and
 269   // (2) be the multiple of the CPU word (which helps with the memory
 270   // alignment).
 271   //
 272   // We'll automatically check and adjust the size number to make sure it
 273   // conforms to the restrictions.
 274   //
 275   // Default: 0
 276   //
 277   // Dynamically changeable through SetOptions() API
 278   size_t arena_block_size = 0;
 279
 280   // Different levels can have different compression policies. There
 281   // are cases where most lower levels would like to use quick compression
 282   // algorithms while the higher levels (which have more data) use
 283   // compression algorithms that have better compression but could
 284   // be slower. This array, if non-empty, should have an entry for
 285   // each level of the database; these override the value specified in
 286   // the previous field 'compression'.
 287   //
 288   // NOTICE if level_compaction_dynamic_level_bytes=true,
 289   // compression_per_level[0] still determines L0, but other elements
 290   // of the array are based on base level (the level L0 files are merged
 291   // to), and may not match the level users see from info log for metadata.
 292   // If L0 files are merged to level-n, then, for i>0, compression_per_level[i]
 293   // determines compaction type for level n+i-1.
 294   // For example, if we have three 5 levels, and we determine to merge L0
 295   // data to L4 (which means L1..L3 will be empty), then the new files go to
 296   // L4 uses compression type compression_per_level[1].
 297   // If now L0 is merged to L2. Data goes to L2 will be compressed
 298   // according to compression_per_level[1], L3 using compression_per_level[2]
 299   // and L4 using compression_per_level[3]. Compaction for each level can
 300   // change when data grows.
 301   std::vector<CompressionType> compression_per_level;
 302
 303   // Number of levels for this database
 304   int num_levels = 7;
 305
 306   // Soft limit on number of level-0 files. We start slowing down writes at this
 307   // point. A value <0 means that no writing slow down will be triggered by
 308   // number of files in level-0.
 309   //
 310   // Default: 20
 311   //
 312   // Dynamically changeable through SetOptions() API
 313   int level0_slowdown_writes_trigger = 20;
 314
 315   // Maximum number of level-0 files.  We stop writes at this point.
 316   //
 317   // Default: 36
 318   //
 319   // Dynamically changeable through SetOptions() API
 320   int level0_stop_writes_trigger = 36;
 321
 322   // Target file size for compaction.
 323   // target_file_size_base is per-file size for level-1.
 324   // Target file size for level L can be calculated by
 325   // target_file_size_base * (target_file_size_multiplier ^ (L-1))
 326   // For example, if target_file_size_base is 2MB and
 327   // target_file_size_multiplier is 10, then each file on level-1 will
 328   // be 2MB, and each file on level 2 will be 20MB,
 329   // and each file on level-3 will be 200MB.
 330   //
 331   // Default: 64MB.
 332   //
 333   // Dynamically changeable through SetOptions() API
 334   uint64_t target_file_size_base = 64 * 1048576;
 335
 336   // By default target_file_size_multiplier is 1, which means
 337   // by default files in different levels will have similar size.
 338   //
 339   // Dynamically changeable through SetOptions() API
 340   int target_file_size_multiplier = 1;
 341
 342   // If true, RocksDB will pick target size of each level dynamically.
 343   // We will pick a base level b >= 1. L0 will be directly merged into level b,
 344   // instead of always into level 1. Level 1 to b-1 need to be empty.
 345   // We try to pick b and its target size so that
 346   // 1. target size is in the range of
 347   //   (max_bytes_for_level_base / max_bytes_for_level_multiplier,
 348   //    max_bytes_for_level_base]
 349   // 2. target size of the last level (level num_levels-1) equals to extra size
 350   //    of the level.
 351   // At the same time max_bytes_for_level_multiplier and
 352   // max_bytes_for_level_multiplier_additional are still satisfied.
 353   //
 354   // With this option on, from an empty DB, we make last level the base level,
 355   // which means merging L0 data into the last level, until it exceeds
 356   // max_bytes_for_level_base. And then we make the second last level to be
 357   // base level, to start to merge L0 data to second last level, with its
 358   // target size to be 1/max_bytes_for_level_multiplier of the last level's
 359   // extra size. After the data accumulates more so that we need to move the
 360   // base level to the third last one, and so on.
 361   //
 362   // For example, assume max_bytes_for_level_multiplier=10, num_levels=6,
 363   // and max_bytes_for_level_base=10MB.
 364   // Target sizes of level 1 to 5 starts with:
 365   // [- - - - 10MB]
 366   // with base level is level. Target sizes of level 1 to 4 are not applicable
 367   // because they will not be used.
 368   // Until the size of Level 5 grows to more than 10MB, say 11MB, we make
 369   // base target to level 4 and now the targets looks like:
 370   // [- - - 1.1MB 11MB]
 371   // While data are accumulated, size targets are tuned based on actual data
 372   // of level 5. When level 5 has 50MB of data, the target is like:
 373   // [- - - 5MB 50MB]
 374   // Until level 5's actual size is more than 100MB, say 101MB. Now if we keep
 375   // level 4 to be the base level, its target size needs to be 10.1MB, which
 376   // doesn't satisfy the target size range. So now we make level 3 the target
 377   // size and the target sizes of the levels look like:
 378   // [- - 1.01MB 10.1MB 101MB]
 379   // In the same way, while level 5 further grows, all levels' targets grow,
 380   // like
 381   // [- - 5MB 50MB 500MB]
 382   // Until level 5 exceeds 1000MB and becomes 1001MB, we make level 2 the
 383   // base level and make levels' target sizes like this:
 384   // [- 1.001MB 10.01MB 100.1MB 1001MB]
 385   // and go on...
 386   //
 387   // By doing it, we give max_bytes_for_level_multiplier a priority against
 388   // max_bytes_for_level_base, for a more predictable LSM tree shape. It is
 389   // useful to limit worse case space amplification.
 390   //
 391   // max_bytes_for_level_multiplier_additional is ignored with this flag on.
 392   //
 393   // Turning this feature on or off for an existing DB can cause unexpected
 394   // LSM tree structure so it's not recommended.
 395   //
 396   // NOTE: this option is experimental
 397   //
 398   // Default: false
 399   bool level_compaction_dynamic_level_bytes = false;
 400
 401   // Default: 10.
 402   //
 403   // Dynamically changeable through SetOptions() API
 404   double max_bytes_for_level_multiplier = 10;
 405
 406   // Different max-size multipliers for different levels.
 407   // These are multiplied by max_bytes_for_level_multiplier to arrive
 408   // at the max-size of each level.
 409   //
 410   // Default: 1
 411   //
 412   // Dynamically changeable through SetOptions() API
 413   std::vector<int> max_bytes_for_level_multiplier_additional =
 414       std::vector<int>(num_levels, 1);
 415
 416   // We try to limit number of bytes in one compaction to be lower than this
 417   // threshold. But it's not guaranteed.
 418   // Value 0 will be sanitized.
 419   //
 420   // Default: result.target_file_size_base * 25
 421   uint64_t max_compaction_bytes = 0;
 422
 423   // All writes will be slowed down to at least delayed_write_rate if estimated
 424   // bytes needed to be compaction exceed this threshold.
 425   //
 426   // Default: 64GB
 427   uint64_t soft_pending_compaction_bytes_limit = 64 * 1073741824ull;
 428
 429   // All writes are stopped if estimated bytes needed to be compaction exceed
 430   // this threshold.
 431   //
 432   // Default: 256GB
 433   uint64_t hard_pending_compaction_bytes_limit = 256 * 1073741824ull;
 434
 435   // The compaction style. Default: kCompactionStyleLevel
 436   CompactionStyle compaction_style = kCompactionStyleLevel;
 437
 438   // If level compaction_style = kCompactionStyleLevel, for each level,
 439   // which files are prioritized to be picked to compact.
 440   // Default: kByCompensatedSize
 441   CompactionPri compaction_pri = kByCompensatedSize;
 442
 443   // The options needed to support Universal Style compactions
 444   CompactionOptionsUniversal compaction_options_universal;
 445
 446   // The options for FIFO compaction style
 447   CompactionOptionsFIFO compaction_options_fifo;
 448
 449   // An iteration->Next() sequentially skips over keys with the same
 450   // user-key unless this option is set. This number specifies the number
 451   // of keys (with the same userkey) that will be sequentially
 452   // skipped before a reseek is issued.
 453   //
 454   // Default: 8
 455   //
 456   // Dynamically changeable through SetOptions() API
 457   uint64_t max_sequential_skip_in_iterations = 8;
 458
 459   // This is a factory that provides MemTableRep objects.
 460   // Default: a factory that provides a skip-list-based implementation of
 461   // MemTableRep.
 462   std::shared_ptr<MemTableRepFactory> memtable_factory =
 463       std::shared_ptr<SkipListFactory>(new SkipListFactory);
 464
 465   // Block-based table related options are moved to BlockBasedTableOptions.
 466   // Related options that were originally here but now moved include:
 467   //   no_block_cache
 468   //   block_cache
 469   //   block_cache_compressed
 470   //   block_size
 471   //   block_size_deviation
 472   //   block_restart_interval
 473   //   filter_policy
 474   //   whole_key_filtering
 475   // If you'd like to customize some of these options, you will need to
 476   // use NewBlockBasedTableFactory() to construct a new table factory.
 477
 478   // This option allows user to collect their own interested statistics of
 479   // the tables.
 480   // Default: empty vector -- no user-defined statistics collection will be
 481   // performed.
 482   typedef std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
 483       TablePropertiesCollectorFactories;
 484   TablePropertiesCollectorFactories table_properties_collector_factories;
 485
 486   // Maximum number of successive merge operations on a key in the memtable.
 487   //
 488   // When a merge operation is added to the memtable and the maximum number of
 489   // successive merges is reached, the value of the key will be calculated and
 490   // inserted into the memtable instead of the merge operation. This will
 491   // ensure that there are never more than max_successive_merges merge
 492   // operations in the memtable.
 493   //
 494   // Default: 0 (disabled)
 495   //
 496   // Dynamically changeable through SetOptions() API
 497   size_t max_successive_merges = 0;
 498
 499   // This flag specifies that the implementation should optimize the filters
 500   // mainly for cases where keys are found rather than also optimize for keys
 501   // missed. This would be used in cases where the application knows that
 502   // there are very few misses or the performance in the case of misses is not
 503   // important.
 504   //
 505   // For now, this flag allows us to not store filters for the last level i.e
 506   // the largest level which contains data of the LSM store. For keys which
 507   // are hits, the filters in this level are not useful because we will search
 508   // for the data anyway. NOTE: the filters in other levels are still useful
 509   // even for key hit because they tell us whether to look in that level or go
 510   // to the higher level.
 511   //
 512   // Default: false
 513   bool optimize_filters_for_hits = false;
 514
 515   // After writing every SST file, reopen it and read all the keys.
 516   // Default: false
 517   bool paranoid_file_checks = false;
 518
 519   // In debug mode, RocksDB run consistency checks on the LSM everytime the LSM
 520   // change (Flush, Compaction, AddFile). These checks are disabled in release
 521   // mode, use this option to enable them in release mode as well.
 522   // Default: false
 523   bool force_consistency_checks = false;
 524
 525   // Measure IO stats in compactions and flushes, if true.
 526   // Default: false
 527   bool report_bg_io_stats = false;
 528
 529   // Create ColumnFamilyOptions with default values for all fields
 530   AdvancedColumnFamilyOptions();
 531   // Create ColumnFamilyOptions from Options
 532   explicit AdvancedColumnFamilyOptions(const Options& options);
 533
 534   // ---------------- OPTIONS NOT SUPPORTED ANYMORE ----------------
 535
 536   // NOT SUPPORTED ANYMORE
 537   // This does not do anything anymore.
 538   int max_mem_compaction_level;
 539
 540   // NOT SUPPORTED ANYMORE -- this options is no longer used
 541   // Puts are delayed to options.delayed_write_rate when any level has a
 542   // compaction score that exceeds soft_rate_limit. This is ignored when == 0.0.
 543   //
 544   // Default: 0 (disabled)
 545   //
 546   // Dynamically changeable through SetOptions() API
 547   double soft_rate_limit = 0.0;
 548
 549   // NOT SUPPORTED ANYMORE -- this options is no longer used
 550   double hard_rate_limit = 0.0;
 551
 552   // NOT SUPPORTED ANYMORE -- this options is no longer used
 553   unsigned int rate_limit_delay_max_milliseconds = 100;
 554
 555   // NOT SUPPORTED ANYMORE
 556   // Does not have any effect.
 557   bool purge_redundant_kvs_while_flush = true;
 558 };
 559
 560 }  // namespace rocksdb