1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE file. See the AUTHORS file for names of contributors.
8
9 #pragma once
10
11 #include <stddef.h>
12 #include <stdint.h>
13
14 #include <limits>
15 #include <memory>
16 #include <string>
17 #include <unordered_map>
18 #include <vector>
19
20 #include "rocksdb/advanced_options.h"
21 #include "rocksdb/comparator.h"
22 #include "rocksdb/compression_type.h"
23 #include "rocksdb/env.h"
24 #include "rocksdb/file_checksum.h"
25 #include "rocksdb/listener.h"
26 #include "rocksdb/sst_partitioner.h"
27 #include "rocksdb/types.h"
28 #include "rocksdb/universal_compaction.h"
29 #include "rocksdb/version.h"
30 #include "rocksdb/write_buffer_manager.h"
31
32 #ifdef max
33 #undef max
34 #endif
35
36 namespace ROCKSDB_NAMESPACE {
37
38 class Cache;
39 class CompactionFilter;
40 class CompactionFilterFactory;
41 class Comparator;
42 class ConcurrentTaskLimiter;
43 class Env;
44 enum InfoLogLevel : unsigned char;
45 class SstFileManager;
46 class FilterPolicy;
47 class Logger;
48 class MergeOperator;
49 class Snapshot;
50 class MemTableRepFactory;
51 class RateLimiter;
52 class Slice;
53 class Statistics;
54 class InternalKeyComparator;
55 class WalFilter;
56 class FileSystem;
57
58 struct Options;
59 struct DbPath;
60
61 struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
62 // The function recovers options to a previous version. Only 4.6 or later
63 // versions are supported.
64 ColumnFamilyOptions* OldDefaults(int rocksdb_major_version = 4,
65 int rocksdb_minor_version = 6);
66
67 // Some functions that make it easier to optimize RocksDB
68 // Use this if your DB is very small (like under 1GB) and you don't want to
69 // spend lots of memory for memtables.
70 // An optional cache object is passed in to be used as the block cache
71 ColumnFamilyOptions* OptimizeForSmallDb(
72 std::shared_ptr<Cache>* cache = nullptr);
73
74 // Use this if you don't need to keep the data sorted, i.e. you'll never use
75 // an iterator, only Put() and Get() API calls
76 //
77 // Not supported in ROCKSDB_LITE
78 ColumnFamilyOptions* OptimizeForPointLookup(uint64_t block_cache_size_mb);
79
80 // Default values for some parameters in ColumnFamilyOptions are not
81 // optimized for heavy workloads and big datasets, which means you might
82 // observe write stalls under some conditions. As a starting point for tuning
83 // RocksDB options, use the following two functions:
84 // * OptimizeLevelStyleCompaction -- optimizes level style compaction
85 // * OptimizeUniversalStyleCompaction -- optimizes universal style compaction
86 // Universal style compaction is focused on reducing Write Amplification
87 // Factor for big data sets, but increases Space Amplification. You can learn
88 // more about the different styles here:
89 // https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide
90 // Make sure to also call IncreaseParallelism(), which will provide the
91 // biggest performance gains.
92 // Note: we might use more memory than memtable_memory_budget during high
93 // write rate period
94 //
95 // OptimizeUniversalStyleCompaction is not supported in ROCKSDB_LITE
96 ColumnFamilyOptions* OptimizeLevelStyleCompaction(
97 uint64_t memtable_memory_budget = 512 * 1024 * 1024);
98 ColumnFamilyOptions* OptimizeUniversalStyleCompaction(
99 uint64_t memtable_memory_budget = 512 * 1024 * 1024);
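  // Illustrative sketch (not part of the original header, placeholder path):
  // a typical tuning sequence combining the helpers above with
  // IncreaseParallelism() from DBOptions.
  //
  //   Options options;
  //   options.create_if_missing = true;
  //   options.IncreaseParallelism();           // size thread pools first
  //   options.OptimizeLevelStyleCompaction();  // default 512MB memtable budget
  //   DB* db = nullptr;
  //   Status s = DB::Open(options, "/tmp/example_db", &db);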
100
101 // -------------------
102 // Parameters that affect behavior
103
104 // Comparator used to define the order of keys in the table.
105 // Default: a comparator that uses lexicographic byte-wise ordering
106 //
107 // REQUIRES: The client must ensure that the comparator supplied
108 // here has the same name and orders keys *exactly* the same as the
109 // comparator provided to previous open calls on the same DB.
110 const Comparator* comparator = BytewiseComparator();
111
112 // REQUIRES: The client must provide a merge operator if Merge operation
113 // needs to be accessed. Calling Merge on a DB without a merge operator
114 // would result in Status::NotSupported. The client must ensure that the
115 // merge operator supplied here has the same name and *exactly* the same
116 // semantics as the merge operator provided to previous open calls on
117 // the same DB. The only exception is reserved for upgrade, where a DB
118 // previously without a merge operator is introduced to Merge operation
119 // for the first time. It's necessary to specify a merge operator when
120 // opening the DB in this case.
121 // Default: nullptr
122 std::shared_ptr<MergeOperator> merge_operator = nullptr;
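  // Illustrative sketch (not part of the original header): a minimal
  // associative merge operator that appends values with a comma; the class
  // name and separator are hypothetical.
  //
  //   class AppendOperator : public AssociativeMergeOperator {
  //    public:
  //     bool Merge(const Slice& /*key*/, const Slice* existing_value,
  //                const Slice& value, std::string* new_value,
  //                Logger* /*logger*/) const override {
  //       new_value->clear();
  //       if (existing_value != nullptr) {
  //         new_value->assign(existing_value->data(), existing_value->size());
  //         new_value->append(",");
  //       }
  //       new_value->append(value.data(), value.size());
  //       return true;
  //     }
  //     const char* Name() const override { return "AppendOperator"; }
  //   };
  //
  //   ColumnFamilyOptions cf_options;
  //   cf_options.merge_operator = std::make_shared<AppendOperator>();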
123
124 // A single CompactionFilter instance to call into during compaction.
125 // Allows an application to modify/delete a key-value during background
126 // compaction.
127 //
128 // If the client requires a new compaction filter to be used for different
129 // compaction runs, it can specify compaction_filter_factory instead of this
130 // option. The client should specify only one of the two.
131 // compaction_filter takes precedence over compaction_filter_factory if
132 // client specifies both.
133 //
134 // If multithreaded compaction is being used, the supplied CompactionFilter
135 // instance may be used from different threads concurrently and so should be
136 // thread-safe.
137 //
138 // Default: nullptr
139 const CompactionFilter* compaction_filter = nullptr;
140
141 // This is a factory that provides compaction filter objects which allow
142 // an application to modify/delete a key-value during background compaction.
143 //
144 // A new filter will be created on each compaction run. If multithreaded
145 // compaction is being used, each created CompactionFilter will only be used
146 // from a single thread and so does not need to be thread-safe.
147 //
148 // Default: nullptr
149 std::shared_ptr<CompactionFilterFactory> compaction_filter_factory = nullptr;
150
151 // -------------------
152 // Parameters that affect performance
153
154 // Amount of data to build up in memory (backed by an unsorted log
155 // on disk) before converting to a sorted on-disk file.
156 //
157 // Larger values increase performance, especially during bulk loads.
158 // Up to max_write_buffer_number write buffers may be held in memory
159 // at the same time,
160 // so you may wish to adjust this parameter to control memory usage.
161 // Also, a larger write buffer will result in a longer recovery time
162 // the next time the database is opened.
163 //
164 // Note that write_buffer_size is enforced per column family.
165 // See db_write_buffer_size for sharing memory across column families.
166 //
167 // Default: 64MB
168 //
169 // Dynamically changeable through SetOptions() API
170 size_t write_buffer_size = 64 << 20;
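  // Illustrative sketch (not part of the original header): because this option
  // is dynamically changeable, it can be adjusted on a live DB; the 128MB
  // value is only an example.
  //
  //   db->SetOptions({{"write_buffer_size", "134217728"}});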
171
172 // Compress blocks using the specified compression algorithm.
173 //
174 // Default: kSnappyCompression, if it's supported. If snappy is not linked
175 // with the library, the default is kNoCompression.
176 //
177 // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
178 // ~200-500MB/s compression
179 // ~400-800MB/s decompression
180 //
181 // Note that these speeds are significantly faster than most
182 // persistent storage speeds, and therefore it is typically never
183 // worth switching to kNoCompression. Even if the input data is
184 // incompressible, the kSnappyCompression implementation will
185 // efficiently detect that and will switch to uncompressed mode.
186 //
187 // If you do not set `compression_opts.level`, or set it to
188 // `CompressionOptions::kDefaultCompressionLevel`, we will attempt to pick the
189 // default corresponding to `compression` as follows:
190 //
191 // - kZSTD: 3
192 // - kZlibCompression: Z_DEFAULT_COMPRESSION (currently -1)
193 // - kLZ4HCCompression: 0
194 // - For all others, we do not specify a compression level
195 //
196 // Dynamically changeable through SetOptions() API
197 CompressionType compression;
198
199 // Compression algorithm that will be used for the bottommost level that
200 // contains files.
201 //
202 // Default: kDisableCompressionOption (Disabled)
203 CompressionType bottommost_compression = kDisableCompressionOption;
204
205 // different options for compression algorithms used by bottommost_compression
206 // if it is enabled. To enable it, please see the definition of
207 // CompressionOptions.
208 CompressionOptions bottommost_compression_opts;
209
210 // different options for compression algorithms
211 CompressionOptions compression_opts;
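  // Illustrative sketch (not part of the original header): a per-level
  // compression setup; the chosen algorithms are examples and must be
  // compiled into the library, and the `enabled` flag follows the
  // CompressionOptions definition.
  //
  //   ColumnFamilyOptions cf_options;
  //   cf_options.compression = kLZ4Compression;   // non-bottommost levels
  //   cf_options.bottommost_compression = kZSTD;  // bottommost level
  //   cf_options.bottommost_compression_opts.enabled = true;
  //   cf_options.bottommost_compression_opts.level = 3;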
212
213 // Number of files to trigger level-0 compaction. A value <0 means that
214 // level-0 compaction will not be triggered by number of files at all.
215 //
216 // Default: 4
217 //
218 // Dynamically changeable through SetOptions() API
219 int level0_file_num_compaction_trigger = 4;
220
221 // If non-nullptr, use the specified function to determine the
222 // prefixes for keys. These prefixes will be placed in the filter.
223 // Depending on the workload, this can reduce the read-IOP cost of
224 // scans when a prefix is passed via ReadOptions to
225 // db.NewIterator(). For prefix filtering to work properly,
226 // "prefix_extractor" and "comparator" must be such that the following
227 // properties hold:
228 //
229 // 1) key.starts_with(prefix(key))
230 // 2) Compare(prefix(key), key) <= 0.
231 // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
232 // 4) prefix(prefix(key)) == prefix(key)
233 //
234 // Default: nullptr
235 std::shared_ptr<const SliceTransform> prefix_extractor = nullptr;
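  // Illustrative sketch (not part of the original header): using the built-in
  // fixed-prefix transform from rocksdb/slice_transform.h; the prefix length
  // of 4 is arbitrary.
  //
  //   ColumnFamilyOptions cf_options;
  //   cf_options.prefix_extractor.reset(NewFixedPrefixTransform(4));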
236
237 // Control maximum total data size for a level.
238 // max_bytes_for_level_base is the max total for level-1.
239 // Maximum number of bytes for level L can be calculated as
240 // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1))
241 // For example, if max_bytes_for_level_base is 200MB, and if
242 // max_bytes_for_level_multiplier is 10, total data size for level-1
243 // will be 200MB, total file size for level-2 will be 2GB,
244 // and total file size for level-3 will be 20GB.
245 //
246 // Default: 256MB.
247 //
248 // Dynamically changeable through SetOptions() API
249 uint64_t max_bytes_for_level_base = 256 * 1048576;
250
251 // Deprecated.
252 uint64_t snap_refresh_nanos = 0;
253
254 // Disable automatic compactions. Manual compactions can still
255 // be issued on this column family
256 //
257 // Dynamically changeable through SetOptions() API
258 bool disable_auto_compactions = false;
259
260 // This is a factory that provides TableFactory objects.
261 // Default: a block-based table factory that provides a default
262 // implementation of TableBuilder and TableReader with default
263 // BlockBasedTableOptions.
264 std::shared_ptr<TableFactory> table_factory;
265
266 // A list of paths where SST files for this column family
267 // can be put into, with its target size. Similar to db_paths,
268 // newer data is placed into paths specified earlier in the
269 // vector while older data gradually moves to paths specified
270 // later in the vector.
271 // Note that, if a path is supplied to multiple column
272 // families, it would have files and total size from all
273 // the column families combined. User should provision for the
274 // total size (from all the column families) in such cases.
275 //
276 // If left empty, db_paths will be used.
277 // Default: empty
278 std::vector<DbPath> cf_paths;
279
280 // Compaction concurrent thread limiter for the column family.
281 // If non-nullptr, use given concurrent thread limiter to control
282 // the max outstanding compaction tasks. Limiter can be shared with
283 // multiple column families across db instances.
284 //
285 // Default: nullptr
286 std::shared_ptr<ConcurrentTaskLimiter> compaction_thread_limiter = nullptr;
287
288 // If non-nullptr, use the specified factory for a function to determine the
289 // partitioning of sst files. This helps compaction to split the files
290 // on interesting boundaries (key prefixes) to make propagation of sst
291 // files less write amplifying (covering the whole key space).
292 // THE FEATURE IS STILL EXPERIMENTAL
293 //
294 // Default: nullptr
295 std::shared_ptr<SstPartitionerFactory> sst_partitioner_factory = nullptr;
296
297 // Create ColumnFamilyOptions with default values for all fields
298 ColumnFamilyOptions();
299 // Create ColumnFamilyOptions from Options
300 explicit ColumnFamilyOptions(const Options& options);
301
302 void Dump(Logger* log) const;
303 };
304
305 enum class WALRecoveryMode : char {
306 // Original levelDB recovery
307 //
308 // We tolerate the last record in any log to be incomplete due to a crash
309 // while writing it. Zeroed bytes from preallocation are also tolerated in the
310 // trailing data of any log.
311 //
312 // Use case: Applications for which updates, once applied, must not be rolled
313 // back even after a crash-recovery. In this recovery mode, RocksDB guarantees
314 // this as long as `WritableFile::Append()` writes are durable. In case the
315 // user needs the guarantee in more situations (e.g., when
316 // `WritableFile::Append()` writes to page cache, but the user desires this
317 // guarantee in face of power-loss crash-recovery), RocksDB offers various
318 // mechanisms to additionally invoke `WritableFile::Sync()` in order to
319 // strengthen the guarantee.
320 //
321 // This differs from `kPointInTimeRecovery` in that, in case a corruption is
322 // detected during recovery, this mode will refuse to open the DB. Whereas,
323 // `kPointInTimeRecovery` will stop recovery just before the corruption since
324 // that is a valid point-in-time to which to recover.
325 kTolerateCorruptedTailRecords = 0x00,
326 // Recover from clean shutdown
327 // We don't expect to find any corruption in the WAL
328 // Use case : This is ideal for unit tests and rare applications that
329 // can require high consistency guarantee
330 kAbsoluteConsistency = 0x01,
331 // Recover to point-in-time consistency (default)
332 // We stop the WAL playback on discovering WAL inconsistency
333 // Use case : Ideal for systems that have disk controller cache like
334 // hard disk, SSD without super capacitor that store related data
335 kPointInTimeRecovery = 0x02,
336 // Recovery after a disaster
337 // We ignore any corruption in the WAL and try to salvage as much data as
338 // possible
339 // Use case : Ideal for last ditch effort to recover data or systems that
340 // operate with low grade unrelated data
341 kSkipAnyCorruptedRecords = 0x03,
342 };
343
344 struct DbPath {
345 std::string path;
346   uint64_t target_size;  // Target size of total files under the path, in bytes.
347
348 DbPath() : target_size(0) {}
349 DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {}
350 };
351
352 extern const char* kHostnameForDbHostId;
353
354 struct DBOptions {
355 // The function recovers options to their default values as of the given version (4.6 by default).
356 DBOptions* OldDefaults(int rocksdb_major_version = 4,
357 int rocksdb_minor_version = 6);
358
359 // Some functions that make it easier to optimize RocksDB
360
361 // Use this if your DB is very small (like under 1GB) and you don't want to
362 // spend lots of memory for memtables.
363 // An optional cache object is passed in, to which the memory used by the
364 // memtable will be charged.
365 DBOptions* OptimizeForSmallDb(std::shared_ptr<Cache>* cache = nullptr);
366
367 #ifndef ROCKSDB_LITE
368 // By default, RocksDB uses only one background thread for flush and
369 // compaction. Calling this function will set it up such that total of
370 // `total_threads` is used. Good value for `total_threads` is the number of
371 // cores. You almost definitely want to call this function if your system is
372 // bottlenecked by RocksDB.
373 DBOptions* IncreaseParallelism(int total_threads = 16);
374 #endif // ROCKSDB_LITE
375
376 // If true, the database will be created if it is missing.
377 // Default: false
378 bool create_if_missing = false;
379
380 // If true, missing column families will be automatically created.
381 // Default: false
382 bool create_missing_column_families = false;
383
384 // If true, an error is raised if the database already exists.
385 // Default: false
386 bool error_if_exists = false;
387
388 // If true, RocksDB will aggressively check consistency of the data.
389 // Also, if any of the writes to the database fails (Put, Delete, Merge,
390 // Write), the database will switch to read-only mode and fail all other
391 // Write operations.
392 // In most cases you want this to be set to true.
393 // Default: true
394 bool paranoid_checks = true;
395
396 // If true, track WALs in MANIFEST and verify them on recovery.
397 //
398 // If a WAL is tracked in MANIFEST but is missing from disk on recovery,
399 // or the size of the tracked WAL is larger than the WAL's on-disk size,
400 // an error is reported and recovery is aborted.
401 //
402 // If a WAL is not tracked in MANIFEST, then no verification will happen
403 // during recovery.
404 //
405 // Default: false
406 // FIXME(cheng): This option is part of a work in progress and does not yet
407 // work
408 bool track_and_verify_wals_in_manifest = false;
409
410 // Use the specified object to interact with the environment,
411 // e.g. to read/write files, schedule background work, etc. In the near
412 // future, support for doing storage operations such as read/write files
413 // through env will be deprecated in favor of file_system (see below)
414 // Default: Env::Default()
415 Env* env = Env::Default();
416
417 // Use to control write rate of flush and compaction. Flush has higher
418 // priority than compaction. Rate limiting is disabled if nullptr.
419 // If rate limiter is enabled, bytes_per_sync is set to 1MB by default.
420 // Default: nullptr
421 std::shared_ptr<RateLimiter> rate_limiter = nullptr;
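  // Illustrative sketch (not part of the original header): limiting background
  // write I/O with the generic rate limiter from rocksdb/rate_limiter.h; the
  // 16MB/s rate is arbitrary.
  //
  //   DBOptions db_options;
  //   db_options.rate_limiter.reset(NewGenericRateLimiter(16 * 1024 * 1024));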
422
423 // Use to track SST files and control their file deletion rate.
424 //
425 // Features:
426 // - Throttle the deletion rate of the SST files.
427 //  - Keep track of the total size of all SST files.
428 //  - Set a maximum allowed space limit for SST files that, when reached,
429 //    the DB won't do any further flushes or compactions and will set the
430 // background error.
431 // - Can be shared between multiple dbs.
432 // Limitations:
433 // - Only track and throttle deletes of SST files in
434 // first db_path (db_name if db_paths is empty).
435 //
436 // Default: nullptr
437 std::shared_ptr<SstFileManager> sst_file_manager = nullptr;
438
439 // Any internal progress/error information generated by the db will
440 // be written to info_log if it is non-nullptr, or to a file stored
441 // in the same directory as the DB contents if info_log is nullptr.
442 // Default: nullptr
443 std::shared_ptr<Logger> info_log = nullptr;
444
445 #ifdef NDEBUG
446 InfoLogLevel info_log_level = INFO_LEVEL;
447 #else
448 InfoLogLevel info_log_level = DEBUG_LEVEL;
449 #endif // NDEBUG
450
451 // Number of open files that can be used by the DB. You may need to
452 // increase this if your database has a large working set. Value -1 means
453 // files opened are always kept open. You can estimate number of files based
454 // on target_file_size_base and target_file_size_multiplier for level-based
455 // compaction. For universal-style compaction, you can usually set it to -1.
456 //
457 // Default: -1
458 //
459 // Dynamically changeable through SetDBOptions() API.
460 int max_open_files = -1;
461
462 // If max_open_files is -1, DB will open all files on DB::Open(). You can
463 // use this option to increase the number of threads used to open the files.
464 // Default: 16
465 int max_file_opening_threads = 16;
466
467 // Once write-ahead logs exceed this size, we will start forcing the flush of
468 // column families whose memtables are backed by the oldest live WAL file
469 // (i.e. the ones that are causing all the space amplification). If set to 0
470 // (default), we will dynamically choose the WAL size limit to be
471 // [sum of all write_buffer_size * max_write_buffer_number] * 4.
472 // This option takes effect only when there is more than one column family,
473 // as otherwise the WAL size is dictated by the write_buffer_size.
474 //
475 // Default: 0
476 //
477 // Dynamically changeable through SetDBOptions() API.
478 uint64_t max_total_wal_size = 0;
479
480 // If non-null, then we should collect metrics about database operations
481 std::shared_ptr<Statistics> statistics = nullptr;
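  // Illustrative sketch (not part of the original header): enabling statistics
  // via the factory in rocksdb/statistics.h and dumping them as a string.
  //
  //   DBOptions db_options;
  //   db_options.statistics = CreateDBStatistics();
  //   // ... run the workload ...
  //   std::string report = db_options.statistics->ToString();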
482
483 // By default, writes to stable storage use fdatasync (on platforms
484 // where this function is available). If this option is true,
485 // fsync is used instead.
486 //
487 // fsync and fdatasync are equally safe for our purposes and fdatasync is
488 // faster, so it is rarely necessary to set this option. It is provided
489 // as a workaround for kernel/filesystem bugs, such as one that affected
490 // fdatasync with ext4 in kernel versions prior to 3.7.
491 bool use_fsync = false;
492
493 // A list of paths where SST files can be put into, with its target size.
494 // Newer data is placed into paths specified earlier in the vector while
495 // older data gradually moves to paths specified later in the vector.
496 //
497 // For example, if you have a flash device with 10GB allocated for the DB,
498 // as well as a hard drive of 2TB, you should configure it as:
499 // [{"/flash_path", 10GB}, {"/hard_drive", 2TB}]
500 //
501 // The system will try to guarantee data under each path is close to but
502 // not larger than the target size. But current and future file sizes used
503 // when determining where to place a file are based on best-effort estimation,
504 // which means there is a chance that the actual size under the directory
505 // is slightly more than target size under some workloads. User should give
506 // some buffer room for those cases.
507 //
508 // If none of the paths has sufficient room to place a file, the file will
509 // be placed in the last path anyway, regardless of the target size.
510 //
511 // Placing newer data in earlier paths is also best-effort. Users should
512 // expect user files to be placed in higher levels in some extreme cases.
513 //
514 // If left empty, only one path will be used, which is db_name passed when
515 // opening the DB.
516 // Default: empty
517 std::vector<DbPath> db_paths;
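  // Illustrative sketch (not part of the original header): the flash + hard
  // drive layout described above, with placeholder mount points.
  //
  //   DBOptions db_options;
  //   db_options.db_paths.emplace_back("/flash_path", 10ULL << 30);  // 10GB
  //   db_options.db_paths.emplace_back("/hard_drive", 2ULL << 40);   // 2TB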
518
519 // This specifies the info LOG dir.
520 // If it is empty, the log files will be in the same dir as data.
521 // If it is non empty, the log files will be in the specified dir,
522 // and the db data dir's absolute path will be used as the log file
523 // name's prefix.
524 std::string db_log_dir = "";
525
526 // This specifies the absolute dir path for write-ahead logs (WAL).
527 // If it is empty, the log files will be in the same dir as data
528 // (dbname is used as the data dir by default).
529 // If it is non empty, the log files will be kept in the specified dir.
530 // When destroying the db,
531 // all log files in wal_dir and the dir itself are deleted.
532 std::string wal_dir = "";
533
534 // The periodicity when obsolete files get deleted. The default
535 // value is 6 hours. The files that go out of scope during the compaction
536 // process will still get automatically deleted on every compaction,
537 // regardless of this setting.
538 //
539 // Default: 6 hours
540 //
541 // Dynamically changeable through SetDBOptions() API.
542 uint64_t delete_obsolete_files_period_micros = 6ULL * 60 * 60 * 1000000;
543
544 // Maximum number of concurrent background jobs (compactions and flushes).
545 //
546 // Default: 2
547 //
548 // Dynamically changeable through SetDBOptions() API.
549 int max_background_jobs = 2;
550
551 // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
552 // value of max_background_jobs. This option is ignored.
553 //
554 // Dynamically changeable through SetDBOptions() API.
555 int base_background_compactions = -1;
556
557 // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
558 // value of max_background_jobs. For backwards compatibility we will set
559 // `max_background_jobs = max_background_compactions + max_background_flushes`
560 // in the case where user sets at least one of `max_background_compactions` or
561 // `max_background_flushes` (we replace -1 by 1 in case one option is unset).
562 //
563 // Maximum number of concurrent background compaction jobs, submitted to
564 // the default LOW priority thread pool.
565 //
566 // If you're increasing this, also consider increasing number of threads in
567 // LOW priority thread pool. For more information, see
568 // Env::SetBackgroundThreads
569 //
570 // Default: -1
571 //
572 // Dynamically changeable through SetDBOptions() API.
573 int max_background_compactions = -1;
574
575 // This value represents the maximum number of threads that will
576 // concurrently perform a compaction job by breaking it into multiple,
577 // smaller ones that are run simultaneously.
578 // Default: 1 (i.e. no subcompactions)
579 //
580 // Dynamically changeable through SetDBOptions() API.
581 uint32_t max_subcompactions = 1;
582
583 // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
584 // value of max_background_jobs. For backwards compatibility we will set
585 // `max_background_jobs = max_background_compactions + max_background_flushes`
586 // in the case where user sets at least one of `max_background_compactions` or
587 // `max_background_flushes`.
588 //
589 // Maximum number of concurrent background memtable flush jobs, submitted by
590 // default to the HIGH priority thread pool. If the HIGH priority thread pool
591 // is configured to have zero threads, flush jobs will share the LOW priority
592 // thread pool with compaction jobs.
593 //
594 // It is important to use both thread pools when the same Env is shared by
595 // multiple db instances. Without a separate pool, long running compaction
596 // jobs could potentially block memtable flush jobs of other db instances,
597 // leading to unnecessary Put stalls.
598 //
599 // If you're increasing this, also consider increasing number of threads in
600 // HIGH priority thread pool. For more information, see
601 // Env::SetBackgroundThreads
602 // Default: -1
603 int max_background_flushes = -1;
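  // Illustrative sketch (not part of the original header): sizing the shared
  // Env thread pools to match the background job limits; the counts are
  // arbitrary examples.
  //
  //   DBOptions db_options;
  //   db_options.max_background_compactions = 4;
  //   db_options.max_background_flushes = 2;
  //   db_options.env->SetBackgroundThreads(4, Env::Priority::LOW);   // compactions
  //   db_options.env->SetBackgroundThreads(2, Env::Priority::HIGH);  // flushes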
604
605 // Specify the maximal size of the info log file. If the log file
606 // is larger than `max_log_file_size`, a new info log file will
607 // be created.
608 // If max_log_file_size == 0, all logs will be written to one
609 // log file.
610 size_t max_log_file_size = 0;
611
612 // Time for the info log file to roll (in seconds).
613 // If specified with non-zero value, log file will be rolled
614 // if it has been active longer than `log_file_time_to_roll`.
615 // Default: 0 (disabled)
616 // Not supported in ROCKSDB_LITE mode!
617 size_t log_file_time_to_roll = 0;
618
619 // Maximal info log files to be kept.
620 // Default: 1000
621 size_t keep_log_file_num = 1000;
622
623 // Recycle log files.
624 // If non-zero, we will reuse previously written log files for new
625 // logs, overwriting the old data. The value indicates how many
626 // such files we will keep around at any point in time for later
627 // use. This is more efficient because the blocks are already
628 // allocated and fdatasync does not need to update the inode after
629 // each write.
630 // Default: 0
631 size_t recycle_log_file_num = 0;
632
633 // The manifest file is rolled over upon reaching this limit.
634 // The older manifest file will be deleted.
635 // The default value is 1GB so that the manifest file can grow, but not
636 // reach the limit of storage capacity.
637 uint64_t max_manifest_file_size = 1024 * 1024 * 1024;
638
639 // Number of shards used for table cache.
640 int table_cache_numshardbits = 6;
641
642 // NOT SUPPORTED ANYMORE
643 // int table_cache_remove_scan_count_limit;
644
645 // The following two fields affect how archived logs will be deleted.
646 // 1. If both set to 0, logs will be deleted asap and will not get into
647 // the archive.
648 // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
649 // WAL files will be checked every 10 min and if total size is greater
650 //    than WAL_size_limit_MB, they will be deleted starting with the
651 // earliest until size_limit is met. All empty files will be deleted.
652 // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
653 // WAL files will be checked every WAL_ttl_seconds / 2 and those that
654 // are older than WAL_ttl_seconds will be deleted.
655 // 4. If both are not 0, WAL files will be checked every 10 min and both
656 // checks will be performed with ttl being first.
657 uint64_t WAL_ttl_seconds = 0;
658 uint64_t WAL_size_limit_MB = 0;
659
660 // Number of bytes to preallocate (via fallocate) the manifest
661 // files. Default is 4mb, which is reasonable to reduce random IO
662 // as well as prevent overallocation for mounts that preallocate
663 // large amounts of data (such as xfs's allocsize option).
664 size_t manifest_preallocation_size = 4 * 1024 * 1024;
665
666 // Allow the OS to mmap file for reading sst tables. Default: false
667 bool allow_mmap_reads = false;
668
669 // Allow the OS to mmap file for writing.
670 // DB::SyncWAL() only works if this is set to false.
671 // Default: false
672 bool allow_mmap_writes = false;
673
674 // Enable direct I/O mode for reads/writes.
675 // This may or may not improve performance depending on the use case.
676 //
677 // Files will be opened in "direct I/O" mode
678 // which means that data r/w from the disk will not be cached or
679 // buffered. The hardware buffer of the devices may however still
680 // be used. Memory mapped files are not impacted by these parameters.
681
682 // Use O_DIRECT for user and compaction reads.
683 // When true, we also force new_table_reader_for_compaction_inputs to true.
684 // Default: false
685 // Not supported in ROCKSDB_LITE mode!
686 bool use_direct_reads = false;
687
688 // Use O_DIRECT for writes in background flush and compactions.
689 // Default: false
690 // Not supported in ROCKSDB_LITE mode!
691 bool use_direct_io_for_flush_and_compaction = false;
692
693 // If false, fallocate() calls are bypassed
694 bool allow_fallocate = true;
695
696 // Disable child process inherit open files. Default: true
697 bool is_fd_close_on_exec = true;
698
699 // NOT SUPPORTED ANYMORE -- this options is no longer used
700 bool skip_log_error_on_recovery = false;
701
702 // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
703 //
704 // Default: 600 (10 min)
705 //
706 // Dynamically changeable through SetDBOptions() API.
707 unsigned int stats_dump_period_sec = 600;
708
709 // if not zero, dump rocksdb.stats to RocksDB every stats_persist_period_sec
710 // Default: 600
711 unsigned int stats_persist_period_sec = 600;
712
713 // If true, automatically persist stats to a hidden column family (column
714 // family name: ___rocksdb_stats_history___) every
715 // stats_persist_period_sec seconds; otherwise, write to an in-memory
716 // struct. User can query through `GetStatsHistory` API.
717 // If user attempts to create a column family with the same name on a DB
718 // which has previously set persist_stats_to_disk to true, the column family
719 // creation will fail, but the hidden column family will survive, as well as
720 // the previously persisted statistics.
721 // When persisting stats to disk, the stat name will be limited to 100 bytes.
722 // Default: false
723 bool persist_stats_to_disk = false;
724
725 // if not zero, periodically take stats snapshots and store in memory, the
726 // memory size for stats snapshots is capped at stats_history_buffer_size
727 // Default: 1MB
728 size_t stats_history_buffer_size = 1024 * 1024;
729
730 // If set to true, RocksDB will hint the underlying file system that the file
731 // access pattern is random when an SST file is opened.
732 // Default: true
733 bool advise_random_on_open = true;
734
735 // Amount of data to build up in memtables across all column
736 // families before writing to disk.
737 //
738 // This is distinct from write_buffer_size, which enforces a limit
739 // for a single memtable.
740 //
741 // This feature is disabled by default. Specify a non-zero value
742 // to enable it.
743 //
744 // Default: 0 (disabled)
745 size_t db_write_buffer_size = 0;
746
747 // The memory usage of memtables will be reported to this object. The same
748 // object can be passed into multiple DBs and it will track the sum of sizes of all
749 // the DBs. If the total size of all live memtables of all the DBs exceeds
750 // a limit, a flush will be triggered in the next DB to which the next write
751 // is issued.
752 //
753 // If the object is only passed to one DB, the behavior is the same as
754 // db_write_buffer_size. When write_buffer_manager is set, the value set will
755 // override db_write_buffer_size.
756 //
757 // This feature is disabled by default. Specify a non-zero value
758 // to enable it.
759 //
760 // Default: null
761 std::shared_ptr<WriteBufferManager> write_buffer_manager = nullptr;
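  // Illustrative sketch (not part of the original header): sharing a single
  // 512MB memtable budget across two DB instances; the size is arbitrary.
  //
  //   auto wbm = std::make_shared<WriteBufferManager>(512 << 20);
  //   DBOptions db_options_1, db_options_2;
  //   db_options_1.write_buffer_manager = wbm;
  //   db_options_2.write_buffer_manager = wbm;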
762
763 // Specify the file access pattern once a compaction is started.
764 // It will be applied to all input files of a compaction.
765 // Default: NORMAL
766 enum AccessHint { NONE, NORMAL, SEQUENTIAL, WILLNEED };
767 AccessHint access_hint_on_compaction_start = NORMAL;
768
769 // If true, always create a new file descriptor and new table reader
770 // for compaction inputs. Turning this parameter on may introduce extra
771 // memory usage in the table reader, if it allocates extra memory
772 // for indexes. This will allow file descriptor prefetch options
773 // to be set for compaction input files and not to impact file
774 // descriptors for the same file used by user queries.
775 // We suggest enabling BlockBasedTableOptions.cache_index_and_filter_blocks
776 // for this mode if using block-based table.
777 //
778 // Default: false
779 // This flag has no effect on the behavior of compaction and is planned for
780 // removal in the future.
781 bool new_table_reader_for_compaction_inputs = false;
782
783 // If non-zero, we perform bigger reads when doing compaction. If you're
784 // running RocksDB on spinning disks, you should set this to at least 2MB.
785 // That way RocksDB's compaction is doing sequential instead of random reads.
786 //
787 // When non-zero, we also force new_table_reader_for_compaction_inputs to
788 // true.
789 //
790 // Default: 0
791 //
792 // Dynamically changeable through SetDBOptions() API.
793 size_t compaction_readahead_size = 0;
794
795 // This is a maximum buffer size that is used by WinMmapReadableFile in
796 // unbuffered disk I/O mode. We need to maintain an aligned buffer for
797 // reads. We allow the buffer to grow until the specified value and then
798 // for bigger requests allocate one shot buffers. In unbuffered mode we
799 // always bypass read-ahead buffer at ReadaheadRandomAccessFile
800 // When read-ahead is required we then make use of compaction_readahead_size
801 // value and always try to read ahead. With read-ahead we always
802 // pre-allocate buffer to the size instead of growing it up to a limit.
803 //
804 // This option is currently honored only on Windows
805 //
806 // Default: 1 Mb
807 //
808 // Special value: 0 - means do not maintain per instance buffer. Allocate
809 // per request buffer and avoid locking.
810 size_t random_access_max_buffer_size = 1024 * 1024;
811
812 // This is the maximum buffer size that is used by WritableFileWriter.
813 // On Windows, we need to maintain an aligned buffer for writes.
814 // We allow the buffer to grow until its size hits the limit in buffered
815 // IO and fix the buffer size when using direct IO to ensure alignment of
816 // write requests if the logical sector size is unusual.
817 //
818 // Default: 1024 * 1024 (1 MB)
819 //
820 // Dynamically changeable through SetDBOptions() API.
821 size_t writable_file_max_buffer_size = 1024 * 1024;
822
823 // Use adaptive mutex, which spins in the user space before resorting
824 // to kernel. This could reduce context switch when the mutex is not
825 // heavily contended. However, if the mutex is hot, we could end up
826 // wasting spin time.
827 // Default: false
828 bool use_adaptive_mutex = false;
829
830 // Create DBOptions with default values for all fields
831 DBOptions();
832 // Create DBOptions from Options
833 explicit DBOptions(const Options& options);
834
835 void Dump(Logger* log) const;
836
837 // Allows OS to incrementally sync files to disk while they are being
838 // written, asynchronously, in the background. This operation can be used
839 // to smooth out write I/Os over time. Users shouldn't rely on it for
840 // persistence guarantees.
841 // Issue one request for every bytes_per_sync written. 0 turns it off.
842 //
843 // You may consider using rate_limiter to regulate write rate to device.
844 // When the rate limiter is enabled, it automatically sets bytes_per_sync
845 // to 1MB.
846 //
847 // This option applies to table files
848 //
849 // Default: 0, turned off
850 //
851 // Note: DOES NOT apply to WAL files. See wal_bytes_per_sync instead
852 // Dynamically changeable through SetDBOptions() API.
853 uint64_t bytes_per_sync = 0;
854
855 // Same as bytes_per_sync, but applies to WAL files
856 //
857 // Default: 0, turned off
858 //
859 // Dynamically changeable through SetDBOptions() API.
860 uint64_t wal_bytes_per_sync = 0;
861
862 // When true, guarantees WAL files have at most `wal_bytes_per_sync`
863 // bytes submitted for writeback at any given time, and SST files have at most
864 // `bytes_per_sync` bytes pending writeback at any given time. This can be
865 // used to handle cases where processing speed exceeds I/O speed during file
866 // generation, which can lead to a huge sync when the file is finished, even
867 // with `bytes_per_sync` / `wal_bytes_per_sync` properly configured.
868 //
869 // - If `sync_file_range` is supported it achieves this by waiting for any
870 // prior `sync_file_range`s to finish before proceeding. In this way,
871 // processing (compression, etc.) can proceed uninhibited in the gap
872 // between `sync_file_range`s, and we block only when I/O falls behind.
873 // - Otherwise the `WritableFile::Sync` method is used. Note this mechanism
874 // always blocks, thus preventing the interleaving of I/O and processing.
875 //
876 // Note: Enabling this option does not provide any additional persistence
877 // guarantees, as it may use `sync_file_range`, which does not write out
878 // metadata.
879 //
880 // Default: false
881 bool strict_bytes_per_sync = false;
882
883 // A vector of EventListeners whose callback functions will be called
884 // when specific RocksDB event happens.
885 std::vector<std::shared_ptr<EventListener>> listeners;
886
887 // If true, then the status of the threads involved in this DB will
888 // be tracked and available via GetThreadList() API.
889 //
890 // Default: false
891 bool enable_thread_tracking = false;
892
893 // The limited write rate to DB if soft_pending_compaction_bytes_limit or
894 // level0_slowdown_writes_trigger is triggered, or we are writing to the
895 // last mem table allowed and we allow more than 3 mem tables. It is
896 // calculated using size of user write requests before compression.
897 // RocksDB may decide to slow down more if the compaction still
898 // gets behind further.
899 // If the value is 0, we will infer a value from the `rate_limiter` value
900 // if it is not empty, or 16MB if `rate_limiter` is empty. Note that
901 // if users change the rate in `rate_limiter` after DB is opened,
902 // `delayed_write_rate` won't be adjusted.
903 //
904 // Unit: byte per second.
905 //
906 // Default: 0
907 //
908 // Dynamically changeable through SetDBOptions() API.
909 uint64_t delayed_write_rate = 0;
910
911 // By default, a single write thread queue is maintained. The thread that gets
912 // to the head of the queue becomes the write batch group leader, responsible
913 // for writing to the WAL and memtable for the batch group.
914 //
915 // If enable_pipelined_write is true, separate write thread queues are
916 // maintained for WAL writes and memtable writes. A write thread first enters
917 // the WAL writer queue and then the memtable writer queue. A pending thread on
918 // the WAL writer queue thus only has to wait for previous writers to finish
919 // their WAL writing but not the memtable writing. Enabling the feature may improve
920 // write throughput and reduce latency of the prepare phase of two-phase
921 // commit.
922 //
923 // Default: false
924 bool enable_pipelined_write = false;
925
926 // Setting unordered_write to true trades higher write throughput with
927 // relaxing the immutability guarantee of snapshots. This violates the
928 // repeatability one expects from ::Get from a snapshot, as well as
929 // ::MultiGet and Iterator's consistent-point-in-time view property.
930 // If the application cannot tolerate the relaxed guarantees, it can implement
931 // its own mechanisms to work around that and yet benefit from the higher
932 // throughput. Using TransactionDB with WRITE_PREPARED write policy and
933 // two_write_queues=true is one way to achieve immutable snapshots despite
934 // unordered_write.
935 //
936 // By default, i.e., when it is false, rocksdb does not advance the sequence
937 // number for new snapshots unless all the writes with lower sequence numbers
938 // are already finished. This provides the immutability that we expect from
939 // snapshots. Moreover, since Iterator and MultiGet internally depend on
940 // snapshots, the snapshot immutability results in Iterator and MultiGet
941 // offering a consistent-point-in-time view. If set to true, although the
942 // Read-Your-Own-Write property is still provided, the snapshot immutability
943 // property is relaxed: the writes issued after the snapshot is obtained (with
944 // larger sequence numbers) will still not be visible to reads from that
945 // snapshot; however, there still might be pending writes (with lower sequence
946 // numbers) that will change the state visible to the snapshot after they
947 // land in the memtable.
948 //
949 // Default: false
950 bool unordered_write = false;
951
952 // If true, allow multi-writers to update mem tables in parallel.
953 // Only some memtable_factory-s support concurrent writes; currently it
954 // is implemented only for SkipListFactory. Concurrent memtable writes
955 // are not compatible with inplace_update_support or filter_deletes.
956 // It is strongly recommended to set enable_write_thread_adaptive_yield
957 // if you are going to use this feature.
958 //
959 // Default: true
960 bool allow_concurrent_memtable_write = true;
961
962 // If true, threads synchronizing with the write batch group leader will
963 // wait for up to write_thread_max_yield_usec before blocking on a mutex.
964 // This can substantially improve throughput for concurrent workloads,
965 // regardless of whether allow_concurrent_memtable_write is enabled.
966 //
967 // Default: true
968 bool enable_write_thread_adaptive_yield = true;
969
970 // The maximum limit of the number of bytes that are written in a single batch
971 // of WAL or memtable writes. It is applied when the leader's write size
972 // is larger than 1/8 of this limit.
973 //
974 // Default: 1 MB
975 uint64_t max_write_batch_group_size_bytes = 1 << 20;
976
977 // The maximum number of microseconds that a write operation will use
978 // a yielding spin loop to coordinate with other write threads before
979 // blocking on a mutex. (Assuming write_thread_slow_yield_usec is
980 // set properly) increasing this value is likely to increase RocksDB
981 // throughput at the expense of increased CPU usage.
982 //
983 // Default: 100
984 uint64_t write_thread_max_yield_usec = 100;
985
986 // The latency in microseconds after which a std::this_thread::yield
987 // call (sched_yield on Linux) is considered to be a signal that
988 // other processes or threads would like to use the current core.
989 // Increasing this makes writer threads more likely to take CPU
990 // by spinning, which will show up as an increase in the number of
991 // involuntary context switches.
992 //
993 // Default: 3
994 uint64_t write_thread_slow_yield_usec = 3;
995
996 // If true, then DB::Open() will not update the statistics used to optimize
997 // compaction decisions by loading table properties from many files.
998 // Skipping this update (i.e. setting this option to true) will improve
999 // DB::Open time, especially in a disk environment.
1000 //
1001 // Default: false
1002 bool skip_stats_update_on_db_open = false;
1003
1004 // If true, then DB::Open() will not fetch and check sizes of all sst files.
1005 // This may significantly speed up startup if there are many sst files,
1006 // especially when using non-default Env with expensive GetFileSize().
1007 // We'll still check that all required sst files exist.
1008 // If paranoid_checks is false, this option is ignored, and sst files are
1009 // not checked at all.
1010 //
1011 // Default: false
1012 bool skip_checking_sst_file_sizes_on_db_open = false;
1013
1014 // Recovery mode to control the consistency while replaying WAL
1015 // Default: kPointInTimeRecovery
1016 WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
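  // Illustrative sketch (not part of the original header): selecting one of
  // the modes documented in WALRecoveryMode above.
  //
  //   DBOptions db_options;
  //   db_options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;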
1017
1018 // if set to false then recovery will fail when a prepared
1019 // transaction is encountered in the WAL
1020 bool allow_2pc = false;
1021
1022 // A global cache for table-level rows.
1023 // Default: nullptr (disabled)
1024 // Not supported in ROCKSDB_LITE mode!
1025 std::shared_ptr<Cache> row_cache = nullptr;
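  // Illustrative sketch (not part of the original header): backing the row
  // cache with the LRU cache from rocksdb/cache.h; the 64MB capacity is
  // arbitrary.
  //
  //   DBOptions db_options;
  //   db_options.row_cache = NewLRUCache(64 << 20);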
1026
1027 #ifndef ROCKSDB_LITE
1028 // A filter object supplied to be invoked while processing write-ahead-logs
1029 // (WALs) during recovery. The filter provides a way to inspect log
1030 // records, ignoring a particular record or skipping replay.
1031 // The filter is invoked at startup and is currently invoked from a
1032 // single thread.
1033 WalFilter* wal_filter = nullptr;
1034 #endif // ROCKSDB_LITE
1035
1036 // If true, then DB::Open / CreateColumnFamily / DropColumnFamily
1037 // / SetOptions will fail if the options file is not detected or not
1038 // properly persisted.
1039 //
1040 // DEFAULT: false
1041 bool fail_if_options_file_error = false;
1042
1043 // If true, then print malloc stats together with rocksdb.stats
1044 // when printing to LOG.
1045 // DEFAULT: false
1046 bool dump_malloc_stats = false;
1047
1048 // By default RocksDB replays WAL logs and flushes them on DB open, which may
1049 // create very small SST files. If this option is enabled, RocksDB will try
1050 // to avoid (but not guarantee not to) flushing during recovery. Also, existing
1051 // WAL logs will be kept, so that if a crash happened before the flush, we still
1052 // have logs to recover from.
1053 //
1054 // DEFAULT: false
1055 bool avoid_flush_during_recovery = false;
1056
1057 // By default RocksDB will flush all memtables on DB close if there is
1058 // unpersisted data (i.e. with WAL disabled). The flush can be skipped to
1059 // speed up DB close. Unpersisted data WILL BE LOST.
1060 //
1061 // DEFAULT: false
1062 //
1063 // Dynamically changeable through SetDBOptions() API.
1064 bool avoid_flush_during_shutdown = false;
1065
1066 // Set this option to true during creation of database if you want
1067 // to be able to ingest behind (call IngestExternalFile() skipping keys
1068 // that already exist, rather than overwriting matching keys).
1069 // Setting this option to true will affect the following:
1070 // 1) Disable some internal optimizations around SST file compression
1071 // 2) Reserve bottom-most level for ingested files only.
1072 // 3) Note that num_levels should be >= 3 if this option is turned on.
1073 //
1074 // DEFAULT: false
1075 // Immutable.
1076 bool allow_ingest_behind = false;
1077
1078 // Needed to support differential snapshots.
1079 // If set to true then DB will only process deletes with sequence number
1080 // less than what was set by SetPreserveDeletesSequenceNumber(uint64_t ts).
1081 // Clients are responsible for periodically calling this method to advance
1082 // the cutoff time. If this method is never called and preserve_deletes
1083 // is set to true, NO deletes will ever be processed.
1084 // At the moment this only keeps normal deletes, SingleDeletes will
1085 // not be preserved.
1086 // DEFAULT: false
1087 // Immutable (TODO: make it dynamically changeable)
1088 bool preserve_deletes = false;
1089
1090 // If enabled it uses two queues for writes, one for the ones with
1091 // disable_memtable and one for the ones that also write to memtable. This
1092 // allows the memtable writes not to lag behind other writes. It can be used
1093 // to optimize MySQL 2PC in which only the commits, which are serial, write to
1094 // memtable.
1095 bool two_write_queues = false;
1096
1097 // If true WAL is not flushed automatically after each write. Instead it
1098 // relies on manual invocation of FlushWAL to write the WAL buffer to its
1099 // file.
1100 bool manual_wal_flush = false;
1101
1102 // If true, RocksDB supports flushing multiple column families and committing
1103 // their results atomically to MANIFEST. Note that it is not
1104 // necessary to set atomic_flush to true if WAL is always enabled since WAL
1105 // allows the database to be restored to the last persistent state in WAL.
1106 // This option is useful when there are column families with writes NOT
1107 // protected by WAL.
1108 // For manual flush, application has to specify which column families to
1109 // flush atomically in DB::Flush.
1110 // For auto-triggered flush, RocksDB atomically flushes ALL column families.
1111 //
1112 // Currently, any WAL-enabled writes after atomic flush may be replayed
1113 // independently if the process crashes later and tries to recover.
1114 bool atomic_flush = false;
1115
1116 // If true, working threads may avoid doing unnecessary and long-latency
1117 // operations (such as deleting obsolete files directly or deleting memtables)
1118 // and will instead schedule a background job to do it.
1119 // Use it if you're latency-sensitive.
1120 // If set to true, takes precedence over
1121 // ReadOptions::background_purge_on_iterator_cleanup.
1122 bool avoid_unnecessary_blocking_io = false;
1123
1124 // Historically DB ID has always been stored in Identity File in DB folder.
1125 // If this flag is true, the DB ID is written to Manifest file in addition
1126 // to the Identity file. By doing this, two problems are solved:
1127 // 1. We don't checksum the Identity file, whereas the Manifest file is.
1128 // 2. Since the source of truth for the DB is the Manifest file, the DB ID
1129 //    will sit with the source of truth. Previously the Identity file could be
1130 //    copied independently of the Manifest, and that can result in a wrong DB ID.
1131 // We recommend setting this flag to true.
1132 // Default: false
1133 bool write_dbid_to_manifest = false;
1134
1135 // The number of bytes to prefetch when reading the log. This is mostly useful
1136 // for reading a remotely located log, as it can reduce the number of
1137 // round-trips. If 0, then the prefetching is disabled.
1138 //
1139 // Default: 0
1140 size_t log_readahead_size = 0;
1141
1142 // If user does NOT provide the checksum generator factory, the file checksum
1143 // will NOT be used. A new file checksum generator object will be created
1144 // when an SST file is created. Therefore, each created FileChecksumGenerator
1145 // will only be used from a single thread and so does not need to be
1146 // thread-safe.
1147 //
1148 // Default: nullptr
1149 std::shared_ptr<FileChecksumGenFactory> file_checksum_gen_factory = nullptr;
1150
1151 // By default, RocksDB recovery fails if any table file referenced in
1152 // the MANIFEST is missing after scanning the MANIFEST.
1153 // Best-efforts recovery is another recovery mode that
1154 // tries to restore the database to the most recent point in time without
1155 // missing files.
1156 // Currently not compatible with atomic flush. Furthermore, WAL files will
1157 // not be used for recovery if best_efforts_recovery is true.
1158 // Default: false
1159 bool best_efforts_recovery = false;
1160
1161 // It defines how many times db resume is called by a separate thread when
1162 // background retryable IO Error happens. When background retryable IO
1163 // Error happens, SetBGError is called to deal with the error. If the error
1164 // can be auto-recovered (e.g., retryable IO Error during Flush or WAL write),
1165 // then db resume is called in background to recover from the error. If this
1166 // value is 0 or negative, db resume will not be called.
1167 //
1168 // Default: INT_MAX
1169 int max_bgerror_resume_count = INT_MAX;
1170
1171 // If max_bgerror_resume_count is >= 2, db resume is called multiple times.
1172 // This option decides how long to wait to retry the next resume if the
1173 // previous resume fails and the redo-resume conditions are satisfied.
1174 //
1175 // Default: 1000000 (microseconds).
1176 uint64_t bgerror_resume_retry_interval = 1000000;
1177
1178 // It allows users to opt in to error messages containing corrupted
1179 // keys/values. Corrupt keys and values will be logged in the
1180 // messages/logs/status, providing users with useful information about
1181 // the affected data. By default the value is set to false to prevent
1182 // user data from being exposed in the logs/messages etc.
1183 //
1184 // Default: false
1185 bool allow_data_in_errors = false;
1186
1187 // A string identifying the machine hosting the DB. This
1188 // will be written as a property in every SST file written by the DB (or
1189 // by offline writers such as SstFileWriter and RepairDB). It can be useful
1190 // for troubleshooting memory corruption caused by a failing host when
1191 // writing a file, by tracing back to the writing host. These corruptions
1192 // may not be caught by the checksum since they happen before checksumming.
1193 // If left as default, the table writer will substitute it with the actual
1194 // hostname when writing the SST file. If set to an empty string, the
1195 // property will not be written to the SST file.
1196 //
1197 // Default: hostname
1198 std::string db_host_id = kHostnameForDbHostId;
1199 };
1200
1201 // Options to control the behavior of a database (passed to DB::Open)
1202 struct Options : public DBOptions, public ColumnFamilyOptions {
1203 // Create an Options object with default values for all fields.
1204 Options() : DBOptions(), ColumnFamilyOptions() {}
1205
1206 Options(const DBOptions& db_options,
1207 const ColumnFamilyOptions& column_family_options)
1208 : DBOptions(db_options), ColumnFamilyOptions(column_family_options) {}
1209
1210   // The function recovers options to their default values as of the given version (4.6 by default).
1211 Options* OldDefaults(int rocksdb_major_version = 4,
1212 int rocksdb_minor_version = 6);
1213
1214 void Dump(Logger* log) const;
1215
1216 void DumpCFOptions(Logger* log) const;
1217
1218 // Some functions that make it easier to optimize RocksDB
1219
1220 // Set appropriate parameters for bulk loading.
1221 // The reason that this is a function that returns "this" instead of a
1222 // constructor is to enable chaining of multiple similar calls in the future.
1223 //
1224
1225 // All data will be in level 0 without any automatic compaction.
1226 // It's recommended to manually call CompactRange(NULL, NULL) before reading
1227 // from the database, because otherwise the read can be very slow.
1228 Options* PrepareForBulkLoad();
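  // Illustrative sketch (not part of the original header): a bulk-load
  // sequence followed by the recommended full-range compaction, assuming a
  // placeholder path and data source.
  //
  //   Options options;
  //   options.create_if_missing = true;
  //   options.PrepareForBulkLoad();
  //   DB* db = nullptr;
  //   Status s = DB::Open(options, "/tmp/bulk_db", &db);
  //   // ... load data with Put() or IngestExternalFile() ...
  //   db->CompactRange(CompactRangeOptions(), nullptr, nullptr);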
1229
1230 // Use this if your DB is very small (like under 1GB) and you don't want to
1231 // spend lots of memory for memtables.
1232 Options* OptimizeForSmallDb();
1233 };
1234
1235 //
1236 // An application can issue a read request (via Get/Iterators) and specify
1237 // if that read should process data that ALREADY resides on a specified cache
1238 // level. For example, if an application specifies kBlockCacheTier then the
1239 // Get call will only process data that already resides in the memtable or
1240 // the block cache. It will not page in data from the OS cache or data that
1241 // resides in storage.
1242 enum ReadTier {
1243 kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage
1244 kBlockCacheTier = 0x1, // data in memtable or block cache
1245 kPersistedTier = 0x2, // persisted data. When WAL is disabled, this option
1246 // will skip data in memtable.
1247 // Note that this ReadTier currently only supports
1248 // Get and MultiGet and does not support iterators.
1249 kMemtableTier = 0x3 // data in memtable. used for memtable-only iterators.
1250 };
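//
// Illustrative sketch (not part of the API) of a cache-only read using the
// ReadOptions struct defined below; the key "foo" and `db` are placeholders.
//
//   ReadOptions ro;
//   ro.read_tier = kBlockCacheTier;   // only look in memtable/block cache
//   std::string value;
//   Status s = db->Get(ro, "foo", &value);
//   if (s.IsIncomplete()) {
//     // Data was not already in memory; a kReadAllTier read would page it in.
//   }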
1251
1252 // Options that control read operations
1253 struct ReadOptions {
1254 // If "snapshot" is non-nullptr, read as of the supplied snapshot
1255 // (which must belong to the DB that is being read and which must
1256 // not have been released). If "snapshot" is nullptr, use an implicit
1257 // snapshot of the state at the beginning of this read operation.
1258 // Default: nullptr
1259 const Snapshot* snapshot;
1260
1261 // `iterate_lower_bound` defines the smallest key at which the backward
1262 // iterator can return an entry. Once the bound is passed, Valid() will be
1263 // false. `iterate_lower_bound` is inclusive, i.e. the bound value is a valid
1264 // entry.
1265 //
1266 // If prefix_extractor is not null, the Seek target and `iterate_lower_bound`
1267 // need to have the same prefix. This is because ordering is not guaranteed
1268 // outside of prefix domain.
1269 //
1270 // Default: nullptr
1271 const Slice* iterate_lower_bound;
1272
1273 // "iterate_upper_bound" defines the extent upto which the forward iterator
1274 // can returns entries. Once the bound is reached, Valid() will be false.
1275 // "iterate_upper_bound" is exclusive ie the bound value is
1276 // not a valid entry. If prefix_extractor is not null, the Seek target
1277 // and iterate_upper_bound need to have the same prefix.
1278 // This is because ordering is not guaranteed outside of prefix domain.
1279 //
1280 // Default: nullptr
1281 const Slice* iterate_upper_bound;
1282
1283 // RocksDB does auto-readahead for iterators on noticing more than two reads
1284 // for a table file. The readahead starts at 8KB and doubles on every
1285 // additional read up to 256KB.
1286 // This option can help if most of the range scans are large, and if it is
1287 // determined that a larger readahead than that enabled by auto-readahead is
1288 // needed.
1289 // Using a large readahead size (> 2MB) can typically improve the performance
1290 // of forward iteration on spinning disks.
1291 // Default: 0
1292 size_t readahead_size;
1293
1294 // A threshold for the number of keys that can be skipped before failing an
1295 // iterator seek as incomplete. The default value of 0 means a request is
1296 // never failed as incomplete, no matter how many keys are skipped.
1297 // Default: 0
1298 uint64_t max_skippable_internal_keys;
1299
1300 // Specify if this read request should process data that ALREADY
1301 // resides on a particular cache. If the required data is not
1302 // found at the specified cache, then Status::Incomplete is returned.
1303 // Default: kReadAllTier
1304 ReadTier read_tier;
1305
1306 // If true, all data read from underlying storage will be
1307 // verified against corresponding checksums.
1308 // Default: true
1309 bool verify_checksums;
1310
1311 // Should the "data block"/"index block" read for this iteration be placed in
1312 // block cache?
1313 // Callers may wish to set this field to false for bulk scans.
1314 // This helps avoid changing the eviction order of existing items in the
1315 // block cache.
1316 // Default: true
1317 bool fill_cache;
1318
1319 // Specify to create a tailing iterator -- a special iterator that has a
1320 // view of the complete database (i.e. it can also be used to read newly
1321 // added data) and is optimized for sequential reads. It will return records
1322 // that were inserted into the database after the creation of the iterator.
1323 // Default: false
1324 // Not supported in ROCKSDB_LITE mode!
1325 bool tailing;
1326
1327 // This option is no longer used. It was used to turn on functionality that
1328 // has been removed.
1329 bool managed;
1330
1331 // Enable a total order seek regardless of index format (e.g. hash index)
1332 // used in the table. Some table formats (e.g. plain table) may not support
1333 // this option.
1334 // If true when calling Get(), we also skip prefix bloom when reading from
1335 // block based table. It provides a way to read existing data after
1336 // changing implementation of prefix extractor.
1337 // Default: false
1338 bool total_order_seek;
1339
1340 // When true, total_order_seek = true is used by default, and RocksDB can
1341 // selectively enable prefix seek mode if it won't generate a different result
1342 // from total_order_seek, based on the seek key and iterator upper bound.
1343 // Not supported in ROCKSDB_LITE mode: even when set to true,
1344 // prefix mode is not used.
1345 // Default: false
1346 bool auto_prefix_mode;
1347
1348 // Enforce that the iterator only iterates over the same prefix as the seek.
1349 // This option is effective only for prefix seeks, i.e. prefix_extractor is
1350 // non-null for the column family and total_order_seek is false. Unlike
1351 // iterate_upper_bound, prefix_same_as_start only works within a prefix
1352 // but in both directions.
1353 // Default: false
1354 bool prefix_same_as_start;
1355
1356 // Keep the blocks loaded by the iterator pinned in memory as long as the
1357 // iterator is not deleted. If used when reading from tables created with
1358 // BlockBasedTableOptions::use_delta_encoding = false,
1359 // Iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed to
1360 // return 1.
1361 // Default: false
1362 bool pin_data;
1363
1364 // If true, when PurgeObsoleteFile is called in CleanupIteratorState, we
1365 // schedule a background job in the flush job queue and delete obsolete files
1366 // in background.
1367 // Default: false
1368 bool background_purge_on_iterator_cleanup;
1369
1370 // If true, keys deleted using the DeleteRange() API will be visible to
1371 // readers until they are naturally deleted during compaction. This improves
1372 // read performance in DBs with many range deletions.
1373 // Default: false
1374 bool ignore_range_deletions;
1375
1376 // A callback to determine whether relevant keys for this scan exist in a
1377 // given table based on the table's properties. The callback is passed the
1378 // properties of each table during iteration. If the callback returns false,
1379 // the table will not be scanned. This option only affects Iterators and has
1380 // no impact on point lookups.
1381 // Default: empty (every table will be scanned)
1382 std::function<bool(const TableProperties&)> table_filter;
1383
1384 // Needed to support differential snapshots. Has 2 effects:
1385 // 1) Iterator will skip all internal keys with seqnum < iter_start_seqnum
1386 // 2) if this param > 0 iterator will return INTERNAL keys instead of
1387 // user keys; e.g. return tombstones as well.
1388 // Default: 0 (don't filter by seqnum, return user keys)
1389 SequenceNumber iter_start_seqnum;
1390
1391 // Timestamp of operation. Read should return the latest data visible to the
1392 // specified timestamp. All timestamps of the same database must be of the
1393 // same length and format. The user is responsible for providing a customized
1394 // compare function via Comparator to order <key, timestamp> tuples.
1395 // For iterator, iter_start_ts is the lower bound (older) and timestamp
1396 // serves as the upper bound. Versions of the same record that fall in
1397 // the timestamp range will be returned. If iter_start_ts is nullptr,
1398 // only the most recent version visible to timestamp is returned.
1399 // The user-specified timestamp feature is still under active development,
1400 // and the API is subject to change.
1401 // Default: nullptr
1402 const Slice* timestamp;
1403 const Slice* iter_start_ts;
1404
1405 // Deadline for completing an API call (Get/MultiGet/Seek/Next for now)
1406 // in microseconds.
1407 // It should be set to microseconds since epoch, i.e., gettimeofday or
1408 // equivalent plus allowed duration in microseconds. The best way is to use
1409 // env->NowMicros() + some timeout.
1410 // This is best effort. The call may exceed the deadline if there is IO
1411 // involved and the file system doesn't support deadlines, or because the
1412 // deadline is checked periodically rather than for every key when
1413 // processing a batch.
1414 std::chrono::microseconds deadline;
1415
1416 // A timeout in microseconds to be passed to the underlying FileSystem for
1417 // reads. As opposed to deadline, this determines the timeout for each
1418 // individual file read request. If a MultiGet/Get/Seek/Next etc call
1419 // results in multiple reads, each read can last up to io_timeout us.
1420 std::chrono::microseconds io_timeout;
1421
1422 // It limits the maximum cumulative value size of the keys in a batch while
1423 // reading through MultiGet. Once the cumulative value size exceeds this
1424 // soft limit, all the remaining keys are returned with status Aborted.
1425 //
1426 // Default: std::numeric_limits<uint64_t>::max()
1427 uint64_t value_size_soft_limit;
1428
1429 ReadOptions();
1430 ReadOptions(bool cksum, bool cache);
1431 };
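//
// A hedged sketch (illustration only) of a bounded range scan using some of
// the fields above. The key names, the 100ms deadline and the assumption that
// `db` is an open DB are all placeholders for the example.
//
//   Slice upper("user_9999");              // must outlive the iterator
//   ReadOptions ro;
//   ro.iterate_upper_bound = &upper;       // stop before "user_9999"
//   ro.fill_cache = false;                 // don't disturb the block cache
//   ro.deadline = std::chrono::microseconds(db->GetEnv()->NowMicros() + 100000);
//   std::unique_ptr<Iterator> it(db->NewIterator(ro));
//   for (it->Seek("user_0000"); it->Valid(); it->Next()) {
//     // process it->key() / it->value()
//   }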
1432
1433 // Options that control write operations
1434 struct WriteOptions {
1435 // If true, the write will be flushed from the operating system
1436 // buffer cache (by calling WritableFile::Sync()) before the write
1437 // is considered complete. If this flag is true, writes will be
1438 // slower.
1439 //
1440 // If this flag is false, and the machine crashes, some recent
1441 // writes may be lost. Note that if it is just the process that
1442 // crashes (i.e., the machine does not reboot), no writes will be
1443 // lost even if sync==false.
1444 //
1445 // In other words, a DB write with sync==false has similar
1446 // crash semantics as the "write()" system call. A DB write
1447 // with sync==true has similar crash semantics to a "write()"
1448 // system call followed by "fdatasync()".
1449 //
1450 // Default: false
1451 bool sync;
1452
1453 // If true, writes will not first go to the write ahead log,
1454 // and the write may get lost after a crash. The backup engine
1455 // relies on write-ahead logs to back up the memtable, so if
1456 // you disable write-ahead logs, you must create backups with
1457 // flush_before_backup=true to avoid losing unflushed memtable data.
1458 // Default: false
1459 bool disableWAL;
1460
1461 // If true and the user is trying to write to column families that don't exist
1462 // (they were dropped), ignore the write (don't return an error). If there
1463 // are multiple writes in a WriteBatch, other writes will succeed.
1464 // Default: false
1465 bool ignore_missing_column_families;
1466
1467 // If true and the write request would need to wait or sleep, it fails
1468 // immediately with Status::Incomplete().
1469 // Default: false
1470 bool no_slowdown;
1471
1472 // If true, this write request is of lower priority if compaction is
1473 // behind. In this case, if no_slowdown = true, the request will be cancelled
1474 // immediately with Status::Incomplete() returned. Otherwise, it will be
1475 // slowed down. The slowdown value is determined by RocksDB to guarantee
1476 // it introduces minimum impacts to high priority writes.
1477 //
1478 // Default: false
1479 bool low_pri;
1480
1481 // If true, this writebatch will maintain the last insert positions of each
1482 // memtable as hints for concurrent writes. It can improve write performance
1483 // in concurrent writes if keys in one writebatch are sequential. In
1484 // non-concurrent writes (when concurrent_memtable_writes is false) this
1485 // option will be ignored.
1486 //
1487 // Default: false
1488 bool memtable_insert_hint_per_batch;
1489
1490 // Timestamp of write operation, e.g. Put. All timestamps of the same
1491 // database must share the same length and format. The user is also
1492 // responsible for providing a customized compare function via Comparator to
1493 // order <key, timestamp> tuples. If the user wants to enable timestamp, then
1494 // all write operations must be associated with timestamp because RocksDB, as
1495 // a single-node storage engine, currently has no knowledge of global time,
1496 // thus has to rely on the application.
1497 // The user-specified timestamp feature is still under active development,
1498 // and the API is subject to change.
1499 const Slice* timestamp;
1500
1501 WriteOptions()
1502 : sync(false),
1503 disableWAL(false),
1504 ignore_missing_column_families(false),
1505 no_slowdown(false),
1506 low_pri(false),
1507 memtable_insert_hint_per_batch(false),
1508 timestamp(nullptr) {}
1509 };
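//
// Illustrative sketch (not part of the API) of the two most common
// WriteOptions trade-offs described above. Key/value literals and `db` are
// placeholders.
//
//   WriteOptions durable;
//   durable.sync = true;                 // fdatasync-like durability, slower
//   Status s = db->Put(durable, "key1", "value1");
//
//   WriteOptions fast;
//   fast.disableWAL = true;              // may lose recent writes on crash
//   s = db->Put(fast, "key2", "value2");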
1510
1511 // Options that control flush operations
1512 struct FlushOptions {
1513 // If true, the call will wait until the flush is done.
1514 // Default: true
1515 bool wait;
1516 // If true, the flush will proceed immediately even if it means writes will
1517 // stall for its duration; if false, the operation will wait until it is
1518 // possible to flush without causing a stall, or until the required flush is
1519 // performed by someone else (foreground call or background thread).
1520 // Default: false
1521 bool allow_write_stall;
1522 FlushOptions() : wait(true), allow_write_stall(false) {}
1523 };
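//
// A minimal sketch (illustration only) of triggering a manual flush with the
// options above, assuming `db` is an open DB.
//
//   FlushOptions fo;
//   fo.wait = true;                 // block until the memtable is flushed
//   fo.allow_write_stall = false;   // defer if flushing would stall writes
//   Status s = db->Flush(fo);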
1524
1525 // Create a Logger from provided DBOptions
1526 extern Status CreateLoggerFromOptions(const std::string& dbname,
1527 const DBOptions& options,
1528 std::shared_ptr<Logger>* logger);
1529
1530 // CompactionOptions are used in CompactFiles() call.
1531 struct CompactionOptions {
1532 // Compaction output compression type
1533 // Default: snappy
1534 // If set to `kDisableCompressionOption`, RocksDB will choose compression type
1535 // according to the `ColumnFamilyOptions`, taking into account the output
1536 // level if `compression_per_level` is specified.
1537 CompressionType compression;
1538 // Compaction will create files of size `output_file_size_limit`.
1539 // Default: MAX, which means that compaction will create a single file
1540 uint64_t output_file_size_limit;
1541 // If > 0, it will replace the option in the DBOptions for this compaction.
1542 uint32_t max_subcompactions;
1543
1544 CompactionOptions()
1545 : compression(kSnappyCompression),
1546 output_file_size_limit(std::numeric_limits<uint64_t>::max()),
1547 max_subcompactions(0) {}
1548 };
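//
// A hedged sketch (illustration only) of a CompactFiles() call using these
// options. It assumes an open DB `db`; picking all L0 files and an output
// level of 1 are assumptions for the example.
//
//   ColumnFamilyMetaData meta;
//   db->GetColumnFamilyMetaData(&meta);
//   std::vector<std::string> input_files;
//   for (const auto& file : meta.levels[0].files) {
//     input_files.push_back(file.name);
//   }
//   CompactionOptions copts;
//   copts.compression = kDisableCompressionOption;  // defer to CF options
//   Status s = db->CompactFiles(copts, input_files, /*output_level=*/1);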
1549
1550 // For level based compaction, we can configure if we want to skip/force
1551 // bottommost level compaction.
1552 enum class BottommostLevelCompaction {
1553 // Skip bottommost level compaction
1554 kSkip,
1555 // Only compact bottommost level if there is a compaction filter
1556 // This is the default option
1557 kIfHaveCompactionFilter,
1558 // Always compact bottommost level
1559 kForce,
1560 // Always compact bottommost level but in bottommost level avoid
1561 // double-compacting files created in the same compaction
1562 kForceOptimized,
1563 };
1564
1565 // CompactRangeOptions is used by CompactRange() call.
1566 struct CompactRangeOptions {
1567 // If true, no other compaction will run at the same time as this
1568 // manual compaction
1569 bool exclusive_manual_compaction = true;
1570 // If true, compacted files will be moved to the minimum level capable
1571 // of holding the data, or to the given level if target_level is non-negative.
1572 bool change_level = false;
1573 // If change_level is true and target_level has a non-negative value, compacted
1574 // files will be moved to target_level.
1575 int target_level = -1;
1576 // Compaction outputs will be placed in options.db_paths[target_path_id].
1577 // Behavior is undefined if target_path_id is out of range.
1578 uint32_t target_path_id = 0;
1579 // By default level based compaction will only compact the bottommost level
1580 // if there is a compaction filter
1581 BottommostLevelCompaction bottommost_level_compaction =
1582 BottommostLevelCompaction::kIfHaveCompactionFilter;
1583 // If true, will execute immediately even if doing so would cause the DB to
1584 // enter write stall mode. Otherwise, it'll sleep until load is low enough.
1585 bool allow_write_stall = false;
1586 // If > 0, it will replace the option in the DBOptions for this compaction.
1587 uint32_t max_subcompactions = 0;
1588 };
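//
// Illustrative sketch (not part of the API): a full manual compaction that
// also pushes data down to the lowest level it fits in, assuming `db` is open.
//
//   CompactRangeOptions cro;
//   cro.change_level = true;    // move output to the minimum level that fits
//   cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
//   Status s = db->CompactRange(cro, nullptr, nullptr);  // whole key space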
1589
1590 // IngestExternalFileOptions is used by IngestExternalFile()
1591 struct IngestExternalFileOptions {
1592 // Can be set to true to move the files instead of copying them.
1593 bool move_files = false;
1594 // If set to true, ingestion falls back to copy when move fails.
1595 bool failed_move_fall_back_to_copy = true;
1596 // If set to false, an ingested file's keys could appear in existing snapshots
1597 // that were created before the file was ingested.
1598 bool snapshot_consistency = true;
1599 // If set to false, IngestExternalFile() will fail if the file key range
1600 // overlaps with existing keys or tombstones in the DB.
1601 bool allow_global_seqno = true;
1602 // If set to false and the file key range overlaps with the memtable key range
1603 // (memtable flush required), IngestExternalFile will fail.
1604 bool allow_blocking_flush = true;
1605 // Set to true if you would like duplicate keys in the file being ingested
1606 // to be skipped rather than overwriting existing data under that key.
1607 // Usecase: back-fill of some historical data in the database without
1608 // over-writing existing newer version of data.
1609 // This option could only be used if the DB has been running
1610 // with allow_ingest_behind=true since the dawn of time.
1611 // All files will be ingested at the bottommost level with seqno=0.
1612 bool ingest_behind = false;
1613 // Set to true if you would like to write global_seqno to a given offset in
1614 // the external SST file for backward compatibility. Older versions of
1615 // RocksDB write a global_seqno to a given offset within ingested SST files,
1616 // and new versions of RocksDB do not. If you ingest an external SST using a
1617 // new version of RocksDB and would like to be able to downgrade to an
1618 // older version of RocksDB, you should set 'write_global_seqno' to true. If
1619 // your service is just starting to use the new RocksDB, we recommend that
1620 // you set this option to false, which brings two benefits:
1621 // 1. No extra random write for global_seqno during ingestion.
1622 // 2. Without writing to the external SST file, it is still possible to verify its checksum.
1623 // We have a plan to set this option to false by default in the future.
1624 bool write_global_seqno = true;
1625 // Set to true if you would like to verify the checksums of each block of the
1626 // external SST file before ingestion.
1627 // Warning: setting this to true causes slowdown in file ingestion because
1628 // the external SST file has to be read.
1629 bool verify_checksums_before_ingest = false;
1630 // When verify_checksums_before_ingest = true, RocksDB uses default
1631 // readahead setting to scan the file while verifying checksums before
1632 // ingestion.
1633 // Users can override the default value using this option.
1634 // Using a large readahead size (> 2MB) can typically improve the performance
1635 // of forward iteration on spinning disks.
1636 size_t verify_checksums_readahead_size = 0;
1637 // Set to true if the user wants to verify the sst file checksum of ingested
1638 // files. The DB checksum function will generate the checksum of each
1639 // ingested file (if file_checksum_gen_factory is set) and compare the
1640 // checksum function name and checksum with the ingested checksum information.
1641 //
1642 // If this option is set to true: 1) if the DB does not enable checksums
1643 // (file_checksum_gen_factory == nullptr), the ingested checksum information
1644 // will be ignored; 2) if the DB enables the checksum function, we calculate
1645 // the sst file checksum after the file is moved or copied and compare the
1646 // checksum and checksum function name. If either does not match, ingestion
1647 // will fail. If the verification is successful, the checksum and checksum
1648 // function name will be stored in the Manifest.
1649 // If this option is set to false: 1) if the DB does not enable checksums,
1650 // the ingested checksum information will be ignored; 2) if the DB enables
1651 // checksums, we only verify the ingested checksum function name and we
1652 // trust the ingested checksum. If the checksum function name matches, we
1653 // store the checksum in the Manifest. The DB does not calculate the checksum
1654 // during ingestion. However, if no checksum information is provided with the
1655 // ingested files, the DB will generate the checksum and store it in the Manifest.
1656 bool verify_file_checksum = true;
1657 };
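//
// A hedged sketch (illustration only) of creating an SST file with
// SstFileWriter and ingesting it with the options above. The file path
// "/tmp/file1.sst", the keys, and `db` are placeholders.
//
//   Options options;
//   SstFileWriter writer(EnvOptions(), options);
//   Status s = writer.Open("/tmp/file1.sst");
//   s = writer.Put("k1", "v1");
//   s = writer.Finish();
//
//   IngestExternalFileOptions ifo;
//   ifo.move_files = true;                    // move instead of copy
//   ifo.verify_checksums_before_ingest = true;
//   s = db->IngestExternalFile({"/tmp/file1.sst"}, ifo);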
1658
1659 enum TraceFilterType : uint64_t {
1660 // Trace all the operations
1661 kTraceFilterNone = 0x0,
1662 // Do not trace the get operations
1663 kTraceFilterGet = 0x1 << 0,
1664 // Do not trace the write operations
1665 kTraceFilterWrite = 0x1 << 1
1666 };
1667
1668 // TraceOptions is used for StartTrace
1669 struct TraceOptions {
1670 // To prevent the trace file from growing larger than the available storage
1671 // space, the user can set the max trace file size in bytes. Default is 64GB.
1672 uint64_t max_trace_file_size = uint64_t{64} * 1024 * 1024 * 1024;
1673 // Specify the trace sampling option, i.e. capture one out of every N requests.
1674 // Defaults to 1 (capture every request).
1675 uint64_t sampling_frequency = 1;
1676 // Note: The filtering happens before sampling.
1677 uint64_t filter = kTraceFilterNone;
1678 };
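//
// A hedged sketch (illustration only) of tracing with these options. It
// assumes NewFileTraceWriter() from rocksdb/trace_reader_writer.h, an open
// DB `db`, and a placeholder trace path.
//
//   TraceOptions trace_opts;
//   trace_opts.sampling_frequency = 10;   // capture 1 out of every 10 requests
//   std::unique_ptr<TraceWriter> trace_writer;
//   Status s = NewFileTraceWriter(db->GetEnv(), EnvOptions(), "/tmp/db.trace",
//                                 &trace_writer);
//   s = db->StartTrace(trace_opts, std::move(trace_writer));
//   // ... run the workload to be traced ...
//   s = db->EndTrace();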
1679
1680 // ImportColumnFamilyOptions is used by ImportColumnFamily()
1681 struct ImportColumnFamilyOptions {
1682 // Can be set to true to move the files instead of copying them.
1683 bool move_files = false;
1684 };
1685
1686 // Options used with DB::GetApproximateSizes()
1687 struct SizeApproximationOptions {
1688 // Defines whether the returned size should include the recently written
1689 // data in the mem-tables. If set to false, include_files must be true.
1690 bool include_memtabtles = false;
1691 // Defines whether the returned size should include data serialized to disk.
1692 // If set to false, include_memtabtles must be true.
1693 bool include_files = true;
1694 // When approximating the total size of files used to store a range of keys
1695 // via DB::GetApproximateSizes, allow approximation with an error margin of
1696 // up to total_files_size * files_size_error_margin. This allows taking
1697 // shortcuts in file size approximation, resulting in better performance,
1698 // while guaranteeing the resulting error is within a reasonable margin.
1699 // E.g., if the value is 0.1, then the error margin of the returned files size
1700 // approximation will be within 10%.
1701 // If the value is non-positive, a more precise yet more CPU-intensive
1702 // estimation is performed.
1703 double files_size_error_margin = -1.0;
1704 };
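//
// Illustrative sketch (not part of the API) of using these options with
// DB::GetApproximateSizes(); the key range and `db` are placeholders.
//
//   SizeApproximationOptions sao;
//   sao.include_memtabtles = true;       // also count unflushed data
//   sao.files_size_error_margin = 0.1;   // accept up to ~10% error
//   Range r("a", "z");
//   uint64_t size = 0;
//   Status s = db->GetApproximateSizes(sao, db->DefaultColumnFamily(), &r, 1,
//                                      &size);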
1705
1706 } // namespace ROCKSDB_NAMESPACE