#include "rocksdb/advanced_options.h"
#include "rocksdb/comparator.h"
#include "rocksdb/compression_type.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/data_structure.h"
#include "rocksdb/env.h"
#include "rocksdb/file_checksum.h"
#include "rocksdb/listener.h"
struct Options;
struct DbPath;
+using FileTypeSet = SmallEnumSet<FileType, FileType::kBlobFile>;
+
struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
// The function recovers options to a previous version. Only 4.6 or later
// versions are supported.
+ // NOT MAINTAINED: This function has not been and is not maintained.
+ // DEPRECATED: This function might be removed in a future release.
+ // In general, defaults are changed to suit broad interests. Opting
+ // out of a change on upgrade should be deliberate and considered.
ColumnFamilyOptions* OldDefaults(int rocksdb_major_version = 4,
int rocksdb_minor_version = 6);
// Allows an application to modify/delete a key-value during background
// compaction.
//
- // If the client requires a new compaction filter to be used for different
- // compaction runs, it can specify compaction_filter_factory instead of this
- // option. The client should specify only one of the two.
+ // If the client requires a new `CompactionFilter` to be used for different
+ // compaction runs and/or requires a `CompactionFilter` for table file
+ // creations outside of compaction, it can specify compaction_filter_factory
+ // instead of this option. The client should specify only one of the two.
// compaction_filter takes precedence over compaction_filter_factory if
// client specifies both.
//
// Default: nullptr
const CompactionFilter* compaction_filter = nullptr;
- // This is a factory that provides compaction filter objects which allow
- // an application to modify/delete a key-value during background compaction.
+ // This is a factory that provides `CompactionFilter` objects which allow
+ // an application to modify/delete a key-value during table file creation.
+ //
+ // Unlike the `compaction_filter` option, which is used when compaction
+ // creates a table file, this factory allows using a `CompactionFilter` when a
+ // table file is created for various reasons. The factory can decide what
+ // `TableFileCreationReason`s use a `CompactionFilter`. For compatibility, by
+ // default the decision is to use a `CompactionFilter` for
+ // `TableFileCreationReason::kCompaction` only.
//
- // A new filter will be created on each compaction run. If multithreaded
- // compaction is being used, each created CompactionFilter will only be used
- // from a single thread and so does not need to be thread-safe.
+ // Each thread of work involving creating table files will create a new
+ // `CompactionFilter` when it will be used according to the above
+ // `TableFileCreationReason`-based decision. This allows the application to
+ // know about the different ongoing threads of work and makes it unnecessary
+ // for `CompactionFilter` to provide thread-safety.
//
// Default: nullptr
std::shared_ptr<CompactionFilterFactory> compaction_filter_factory = nullptr;
CompressionType compression;
// Compression algorithm that will be used for the bottommost level that
- // contain files.
+ // contains files. The behavior for num_levels = 1 is not well defined.
+ // Right now, with num_levels = 1, all compaction outputs will use
+ // bottommost_compression and all flush outputs still use options.compression,
+ // but the behavior is subject to change.
//
// Default: kDisableCompressionOption (Disabled)
CompressionType bottommost_compression = kDisableCompressionOption;
// different options for compression algorithms used by bottommost_compression
// if it is enabled. To enable it, please see the definition of
- // CompressionOptions.
+ // CompressionOptions. Behavior for num_levels = 1 is the same as
+ // options.bottommost_compression.
CompressionOptions bottommost_compression_opts;
// different options for compression algorithms
// Dynamically changeable through SetOptions() API
int level0_file_num_compaction_trigger = 4;
- // If non-nullptr, use the specified function to determine the
- // prefixes for keys. These prefixes will be placed in the filter.
- // Depending on the workload, this can reduce the number of read-IOP
- // cost for scans when a prefix is passed via ReadOptions to
- // db.NewIterator(). For prefix filtering to work properly,
- // "prefix_extractor" and "comparator" must be such that the following
- // properties hold:
- //
- // 1) key.starts_with(prefix(key))
- // 2) Compare(prefix(key), key) <= 0.
- // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
- // 4) prefix(prefix(key)) == prefix(key)
+ // If non-nullptr, use the specified function to put keys in contiguous
+ // groups called "prefixes". These prefixes are used to place one
+ // representative entry for the group into the Bloom filter
+ // rather than an entry for each key (see whole_key_filtering).
+ // Under certain conditions, this enables optimizing some range queries
+ // (Iterators) in addition to some point lookups (Get/MultiGet).
+ //
+ // Together `prefix_extractor` and `comparator` must satisfy one essential
+ // property for valid prefix filtering of range queries:
+ // If Compare(k1, k2) <= 0 and Compare(k2, k3) <= 0 and
+ // InDomain(k1) and InDomain(k3) and prefix(k1) == prefix(k3),
+ // Then InDomain(k2) and prefix(k2) == prefix(k1)
+ //
+ // In other words, all keys with the same prefix must be in a contiguous
+ // group by comparator order, and cannot be interrupted by keys with no
+ // prefix ("out of domain"). (This makes it valid to conclude that no
+ // entries within some bounds are present if the upper and lower bounds
+ // have a common prefix and no entries with that same prefix are present.)
+ //
+ // Some other properties are recommended but not strictly required. Under
+ // most sensible comparators, the following will need to hold true to
+ // satisfy the essential property above:
+ // * "Prefix is a prefix": key.starts_with(prefix(key))
+ // * "Prefixes preserve ordering": If Compare(k1, k2) <= 0, then
+ // Compare(prefix(k1), prefix(k2)) <= 0
+ //
+ // The next two properties ensure that seeking to a prefix allows
+ // enumerating all entries with that prefix:
+ // * "Prefix starts the group": Compare(prefix(key), key) <= 0
+ // * "Prefix idempotent": prefix(prefix(key)) == prefix(key)
//
// Default: nullptr
std::shared_ptr<const SliceTransform> prefix_extractor = nullptr;
extern const char* kHostnameForDbHostId;
+// Status a remote CompactionService reports back for a compaction job.
+enum class CompactionServiceJobStatus : char {
+  kSuccess,   // the remote compaction finished successfully
+  kFailure,   // the remote compaction failed
+  kUseLocal,  // decline the job; run the compaction locally on the primary
+};
+
+// Information about a compaction job passed to a CompactionService
+// implementation (see CompactionService::StartV2 / WaitForCompleteV2).
+struct CompactionServiceJobInfo {
+  // Name (path) of the DB that scheduled the compaction.
+  std::string db_name;
+  // Id of the DB.
+  std::string db_id;
+  // Id of the current DB session.
+  std::string db_session_id;
+  uint64_t job_id;  // job_id is only unique within the current DB and session;
+                    // restarting the DB will reset the job_id. `db_id` and
+                    // `db_session_id` could help you build a unique id across
+                    // different DBs and sessions.
+
+  // Priority at which the compaction job was scheduled on the primary host.
+  Env::Priority priority;
+
+  // String arguments are taken by value and moved into the members.
+  CompactionServiceJobInfo(std::string db_name_, std::string db_id_,
+                           std::string db_session_id_, uint64_t job_id_,
+                           Env::Priority priority_)
+      : db_name(std::move(db_name_)),
+        db_id(std::move(db_id_)),
+        db_session_id(std::move(db_session_id_)),
+        job_id(job_id_),
+        priority(priority_) {}
+};
+
+// Interface for offloading a compaction to a different host or process
+// ("remote compaction"). See DBOptions::compaction_service.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class CompactionService : public Customizable {
+ public:
+  // Type id used by the Customizable framework; always "CompactionService".
+  static const char* Type() { return "CompactionService"; }
+
+  // Returns the name of this compaction service.
+  const char* Name() const override = 0;
+
+  // Start the remote compaction with `compaction_service_input`, which can be
+  // passed to `DB::OpenAndCompact()` on the remote side. `info` provides the
+  // information the user might want to know, which includes `job_id`.
+  // The default implementation returns `kUseLocal`, i.e. the compaction is
+  // run locally on the primary host.
+  virtual CompactionServiceJobStatus StartV2(
+      const CompactionServiceJobInfo& /*info*/,
+      const std::string& /*compaction_service_input*/) {
+    return CompactionServiceJobStatus::kUseLocal;
+  }
+
+  // Wait for the remote compaction to finish; on success the result is
+  // returned via `compaction_service_result`. The default implementation
+  // returns `kUseLocal`, i.e. fall back to local compaction.
+  virtual CompactionServiceJobStatus WaitForCompleteV2(
+      const CompactionServiceJobInfo& /*info*/,
+      std::string* /*compaction_service_result*/) {
+    return CompactionServiceJobStatus::kUseLocal;
+  }
+
+  ~CompactionService() override = default;
+};
+
struct DBOptions {
// The function recovers options to the option as in version 4.6.
+ // NOT MAINTAINED: This function has not been and is not maintained.
+ // DEPRECATED: This function might be removed in a future release.
+ // In general, defaults are changed to suit broad interests. Opting
+ // out of a change on upgrade should be deliberate and considered.
DBOptions* OldDefaults(int rocksdb_major_version = 4,
int rocksdb_minor_version = 6);
// Default: true
bool paranoid_checks = true;
- // If true, track WALs in MANIFEST and verify them on recovery.
+ // If true, during memtable flush, RocksDB will validate the total number of
+ // entries read in flush against the counter of entries inserted into it.
+ // The option is here to turn the feature off in case this new validation
+ // feature has a bug.
+ // Default: true
+ bool flush_verify_memtable_count = true;
+
+ // If true, the log numbers and sizes of the synced WALs are tracked
+ // in MANIFEST. During DB recovery, if a synced WAL is missing
+ // from disk, or the WAL's size does not match the recorded size in
+ // MANIFEST, an error will be reported and the recovery will be aborted.
//
- // If a WAL is tracked in MANIFEST but is missing from disk on recovery,
- // or the size of the tracked WAL is larger than the WAL's on-disk size,
- // an error is reported and recovery is aborted.
+ // This is one additional protection against WAL corruption besides the
+ // per-WAL-entry checksum.
//
- // If a WAL is not tracked in MANIFEST, then no verification will happen
- // during recovery.
+ // Note that this option does not work with secondary instances.
+ // Currently, only the syncing of closed WALs is tracked. Calling
+ // `DB::SyncWAL()`, etc. or writing with `WriteOptions::sync=true` to sync
+ // the live WAL is not tracked for performance/efficiency reasons.
//
// Default: false
- // FIXME(cheng): This option is part of a work in progress and does not yet
- // work
bool track_and_verify_wals_in_manifest = false;
+ // If true, verifies the SST unique id between MANIFEST and actual file
+ // each time an SST file is opened. This check ensures an SST file is not
+ // overwritten or misplaced. A corruption error will be reported if mismatch
+ // detected, but only when MANIFEST tracks the unique id, which starts from
+ // RocksDB version 7.3. Although the tracked internal unique id is related
+ // to the one returned by GetUniqueIdFromTableProperties, that is subject to
+ // change.
+ // NOTE: verification is currently only done on SST files using block-based
+ // table format.
+ //
+ // Setting to false should only be needed in case of unexpected problems.
+ //
+ // Although an early version of this option opened all SST files for
+ // verification on DB::Open, that is no longer guaranteed. However, as
+ // documented in an above option, if max_open_files is -1, DB will open all
+ // files on DB::Open().
+ //
+ // Default: true
+ bool verify_sst_unique_id_in_manifest = true;
+
// Use the specified object to interact with the environment,
// e.g. to read/write files, schedule background work, etc. In the near
// future, support for doing storage operations such as read/write files
// Default: Env::Default()
Env* env = Env::Default();
- // Use to control write rate of flush and compaction. Flush has higher
- // priority than compaction. Rate limiting is disabled if nullptr.
- // If rate limiter is enabled, bytes_per_sync is set to 1MB by default.
+ // Limits internal file read/write bandwidth:
+ //
+ // - Flush requests write bandwidth at `Env::IOPriority::IO_HIGH`
+ // - Compaction requests read and write bandwidth at
+ // `Env::IOPriority::IO_LOW`
+ // - Reads associated with a `ReadOptions` can be charged at
+ // `ReadOptions::rate_limiter_priority` (see that option's API doc for usage
+ // and limitations).
+ // - Writes associated with a `WriteOptions` can be charged at
+ // `WriteOptions::rate_limiter_priority` (see that option's API doc for
+ // usage and limitations).
+ //
+ // Rate limiting is disabled if nullptr. If rate limiter is enabled,
+ // bytes_per_sync is set to 1MB by default.
+ //
// Default: nullptr
std::shared_ptr<RateLimiter> rate_limiter = nullptr;
// on target_file_size_base and target_file_size_multiplier for level-based
// compaction. For universal-style compaction, you can usually set it to -1.
//
+ // A high value or -1 for this option can cause high memory usage.
+ // See BlockBasedTableOptions::cache_usage_options to constrain
+ // memory usage in case of block based table format.
+ //
// Default: -1
//
// Dynamically changeable through SetDBOptions() API.
// (i.e. the ones that are causing all the space amplification). If set to 0
// (default), we will dynamically choose the WAL size limit to be
// [sum of all write_buffer_size * max_write_buffer_number] * 4
- // This option takes effect only when there are more than one column family as
- // otherwise the wal size is dictated by the write_buffer_size.
+ //
+ // For example, with 15 column families, each with
+ // write_buffer_size = 128 MB
+ // max_write_buffer_number = 6
+ // max_total_wal_size will be calculated to be [15 * 128MB * 6] * 4 = 45GB
+ //
+ // The RocksDB wiki has some discussion about how the WAL interacts
+ // with memtables and flushing of column families.
+ // https://github.com/facebook/rocksdb/wiki/Column-Families
+ //
+ // This option takes effect only when there are more than one column
+ // family as otherwise the wal size is dictated by the write_buffer_size.
//
// Default: 0
//
// Dynamically changeable through SetDBOptions() API.
int max_background_jobs = 2;
- // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
- // value of max_background_jobs. This option is ignored.
- //
- // Dynamically changeable through SetDBOptions() API.
- int base_background_compactions = -1;
-
- // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
+ // DEPRECATED: RocksDB automatically decides this based on the
// value of max_background_jobs. For backwards compatibility we will set
// `max_background_jobs = max_background_compactions + max_background_flushes`
// in the case where user sets at least one of `max_background_compactions` or
// Dynamically changeable through SetDBOptions() API.
uint32_t max_subcompactions = 1;
- // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
+ // DEPRECATED: RocksDB automatically decides this based on the
// value of max_background_jobs. For backwards compatibility we will set
// `max_background_jobs = max_background_compactions + max_background_flushes`
// in the case where user sets at least one of `max_background_compactions` or
// Number of shards used for table cache.
int table_cache_numshardbits = 6;
- // NOT SUPPORTED ANYMORE
- // int table_cache_remove_scan_count_limit;
-
// The following two fields affect how archived logs will be deleted.
// 1. If both set to 0, logs will be deleted asap and will not get into
// the archive.
// large amounts of data (such as xfs's allocsize option).
size_t manifest_preallocation_size = 4 * 1024 * 1024;
- // Allow the OS to mmap file for reading sst tables. Default: false
+ // Allow the OS to mmap file for reading sst tables.
+ // Not recommended for 32-bit OS.
+ // When the option is set to true and compression is disabled, the blocks
+ // will not be copied and will be read directly from the mmap-ed memory
+ // area, and the block will not be inserted into the block cache. However,
+ // checksums will still be checked if ReadOptions.verify_checksums is set
+ // to be true. It means a checksum check every time a block is read, more
+ // than the setup where the option is set to false and the block cache is
+ // used. The common use of the options is to run RocksDB on ramfs, where
+ // checksum verification is usually not needed.
+ // Default: false
bool allow_mmap_reads = false;
// Allow the OS to mmap file for writing.
// be used. Memory mapped files are not impacted by these parameters.
// Use O_DIRECT for user and compaction reads.
- // When true, we also force new_table_reader_for_compaction_inputs to true.
// Default: false
// Not supported in ROCKSDB_LITE mode!
bool use_direct_reads = false;
// Not supported in ROCKSDB_LITE mode!
bool use_direct_io_for_flush_and_compaction = false;
- // If false, fallocate() calls are bypassed
+ // If false, fallocate() calls are bypassed, which disables file
+ // preallocation. File space preallocation is used to increase file
+ // write/append performance. By default, RocksDB preallocates space for WAL,
+ // SST, and Manifest files; the extra space is truncated when the file is
+ // written.
+ // Warning: if you're using btrfs, we would recommend setting
+ // `allow_fallocate=false` to disable preallocation. As on btrfs, the extra
+ // allocated space cannot be freed, which could be significant if you have
+ // lots of files. More details about this limitation:
+ // https://github.com/btrfs/btrfs-dev-docs/blob/471c5699336e043114d4bca02adcd57d9dab9c44/data-extent-reference-counts.md
bool allow_fallocate = true;
// Disable child process inherit open files. Default: true
bool is_fd_close_on_exec = true;
- // NOT SUPPORTED ANYMORE -- this options is no longer used
- bool skip_log_error_on_recovery = false;
-
// if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
//
// Default: 600 (10 min)
// can be passed into multiple DBs and it will track the sum of size of all
// the DBs. If the total size of all live memtables of all the DBs exceeds
// a limit, a flush will be triggered in the next DB to which the next write
- // is issued.
+ // is issued, as long as there is one or more column family not already
+ // flushing.
//
// If the object is only passed to one DB, the behavior is the same as
// db_write_buffer_size. When write_buffer_manager is set, the value set will
enum AccessHint { NONE, NORMAL, SEQUENTIAL, WILLNEED };
AccessHint access_hint_on_compaction_start = NORMAL;
- // If true, always create a new file descriptor and new table reader
- // for compaction inputs. Turn this parameter on may introduce extra
- // memory usage in the table reader, if it allocates extra memory
- // for indexes. This will allow file descriptor prefetch options
- // to be set for compaction input files and not to impact file
- // descriptors for the same file used by user queries.
- // Suggest to enable BlockBasedTableOptions.cache_index_and_filter_blocks
- // for this mode if using block-based table.
- //
- // Default: false
- // This flag has no affect on the behavior of compaction and plan to delete
- // in the future.
- bool new_table_reader_for_compaction_inputs = false;
-
// If non-zero, we perform bigger reads when doing compaction. If you're
// running RocksDB on spinning disks, you should set this to at least 2MB.
// That way RocksDB's compaction is doing sequential instead of random reads.
//
- // When non-zero, we also force new_table_reader_for_compaction_inputs to
- // true.
- //
// Default: 0
//
// Dynamically changeable through SetDBOptions() API.
size_t random_access_max_buffer_size = 1024 * 1024;
// This is the maximum buffer size that is used by WritableFileWriter.
- // On Windows, we need to maintain an aligned buffer for writes.
+ // With direct IO, we need to maintain an aligned buffer for writes.
// We allow the buffer to grow until it's size hits the limit in buffered
// IO and fix the buffer size when using direct IO to ensure alignment of
// write requests if the logical sector size is unusual
// Allows OS to incrementally sync files to disk while they are being
// written, asynchronously, in the background. This operation can be used
// to smooth out write I/Os over time. Users shouldn't rely on it for
- // persistency guarantee.
+ // persistence guarantee.
// Issue one request for every bytes_per_sync written. 0 turns it off.
//
// You may consider using rate_limiter to regulate write rate to device.
#endif // ROCKSDB_LITE
// If true, then DB::Open / CreateColumnFamily / DropColumnFamily
- // / SetOptions will fail if options file is not detected or properly
- // persisted.
+ // SetOptions will fail if options file is not properly persisted.
//
// DEFAULT: false
bool fail_if_options_file_error = false;
// Immutable.
bool allow_ingest_behind = false;
- // Needed to support differential snapshots.
- // If set to true then DB will only process deletes with sequence number
- // less than what was set by SetPreserveDeletesSequenceNumber(uint64_t ts).
- // Clients are responsible to periodically call this method to advance
- // the cutoff time. If this method is never called and preserve_deletes
- // is set to true NO deletes will ever be processed.
- // At the moment this only keeps normal deletes, SingleDeletes will
- // not be preserved.
- // DEFAULT: false
- // Immutable (TODO: make it dynamically changeable)
- bool preserve_deletes = false;
-
// If enabled it uses two queues for writes, one for the ones with
// disable_memtable and one for the ones that also write to memtable. This
// allows the memtable writes not to lag behind other writes. It can be used
// file.
bool manual_wal_flush = false;
+ // This feature is WORK IN PROGRESS.
+ // If enabled, WAL records will be compressed before they are written.
+ // Only zstd is supported. Compressed WAL records will be read in supported
+ // versions regardless of the wal_compression settings.
+ CompressionType wal_compression = kNoCompression;
+
// If true, RocksDB supports flushing multiple column families and committing
// their results atomically to MANIFEST. Note that it is not
// necessary to set atomic_flush to true if WAL is always enabled since WAL
// Default: nullptr
std::shared_ptr<FileChecksumGenFactory> file_checksum_gen_factory = nullptr;
- // By default, RocksDB recovery fails if any table file referenced in
- // MANIFEST are missing after scanning the MANIFEST.
- // Best-efforts recovery is another recovery mode that
- // tries to restore the database to the most recent point in time without
- // missing file.
- // Currently not compatible with atomic flush. Furthermore, WAL files will
- // not be used for recovery if best_efforts_recovery is true.
+ // By default, RocksDB recovery fails if any table/blob file referenced in the
+ // final version reconstructed from the
+ // MANIFEST are missing after scanning the MANIFEST pointed to by the
+ // CURRENT file. It can also fail if verification of unique SST id fails.
+ // Best-efforts recovery is another recovery mode that does not necessarily
+ // fail when certain table/blob files are missing/corrupted or have mismatched
+ // unique id table property. Instead, best-efforts recovery recovers each
+ // column family to a point in the MANIFEST that corresponds to a version. In
+ // such a version, all valid table/blob files referenced have the expected
+ // file size. For table files, their unique id table property match the
+ // MANIFEST.
+ //
+ // Best-efforts recovery does not need a valid CURRENT file, and tries to
+ // recover the database using one of the available MANIFEST files in the db
+ // directory.
+ // Best-efforts recovery tries the available MANIFEST files from high file
+ // numbers (newer) to low file numbers (older), and stops after finding the
+ // first MANIFEST file from which the db can be recovered to a state without
+ // invalid (missing/filesize-mismatch/unique-id-mismatch) table and blob
+ // files. It is possible that the database can be restored to an empty state
+ // with no table or blob files.
+ //
+ // Regardless of this option, the IDENTITY file
+ // is updated if needed during recovery to match the DB ID in the MANIFEST (if
+ // previously using write_dbid_to_manifest) or to be in some valid state
+ // (non-empty DB ID). Currently, not compatible with atomic flush.
+ // Furthermore, WAL files will not be used for recovery if
+ // best_efforts_recovery is true. Also requires either 1) LOCK file exists or
+ // 2) underlying env's LockFile() call returns ok even for non-existing LOCK
+ // file.
+ //
// Default: false
bool best_efforts_recovery = false;
// writing a file, by tracing back to the writing host. These corruptions
// may not be caught by the checksum since they happen before checksumming.
// If left as default, the table writer will substitute it with the actual
- // hostname when writing the SST file. If set to an empty stirng, the
+ // hostname when writing the SST file. If set to an empty string, the
// property will not be written to the SST file.
//
// Default: hostname
std::string db_host_id = kHostnameForDbHostId;
+
+ // Use this if your DB wants to enable checksum handoff for writes of
+ // specific file types. Make sure that the file system you use supports
+ // crc32c checksum verification.
+ // Currently supported file types: kWALFile, kTableFile, kDescriptorFile.
+ // NOTE: currently RocksDB only generates a crc32c-based checksum for the
+ // handoff. If the storage layer has different checksum support, the user
+ // should leave this set empty. Otherwise, it may cause unexpected
+ // write failures.
+ FileTypeSet checksum_handoff_file_types;
+
+ // EXPERIMENTAL
+ // CompactionService is a feature that allows the user to run compactions on
+ // a
+ // different host or process, which offloads the background load from the
+ // primary host.
+ // It's an experimental feature, the interface will be changed without
+ // backward/forward compatibility support for now. Some known issues are still
+ // under development.
+ std::shared_ptr<CompactionService> compaction_service = nullptr;
+
+ // It indicates which lowest cache tier we want to
+ // use for a certain DB. Currently we support volatile_tier and
+ // non_volatile_tier. They are layered. By setting it to kVolatileTier, only
+ // the block cache (current implemented volatile_tier) is used. So
+ // cache entries will not spill to secondary cache (current
+ // implemented non_volatile_tier), and block cache lookup misses will not
+ // lookup in the secondary cache. When kNonVolatileBlockTier is used, we use
+ // both block cache and secondary cache.
+ //
+ // Default: kNonVolatileBlockTier
+ CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier;
+
+ // If set to false, when compaction or flush sees a SingleDelete followed by
+ // a Delete for the same user key, compaction job will not fail.
+ // Otherwise, compaction job will fail.
+ // This is a temporary option to help existing use cases migrate, and
+ // will be removed in a future release.
+ // Warning: do not set to false unless you are trying to migrate existing
+ // data in which the contract of single delete
+ // (https://github.com/facebook/rocksdb/wiki/Single-Delete) is not enforced,
+ // thus has Delete mixed with SingleDelete for the same user key. Violation
+ // of the contract leads to undefined behaviors with high possibility of data
+ // inconsistency, e.g. deleted old data become visible again, etc.
+ bool enforce_single_del_contracts = true;
};
// Options to control the behavior of a database (passed to DB::Open)
const ColumnFamilyOptions& column_family_options)
: DBOptions(db_options), ColumnFamilyOptions(column_family_options) {}
- // The function recovers options to the option as in version 4.6.
+ // Change to some default settings from an older version.
+ // NOT MAINTAINED: This function has not been and is not maintained.
+ // DEPRECATED: This function might be removed in a future release.
+ // In general, defaults are changed to suit broad interests. Opting
+ // out of a change on upgrade should be deliberate and considered.
Options* OldDefaults(int rocksdb_major_version = 4,
int rocksdb_minor_version = 6);
// Use this if your DB is very small (like under 1GB) and you don't want to
// spend lots of memory for memtables.
Options* OptimizeForSmallDb();
+
+ // Disable some checks that should not be necessary in the absence of
+ // software logic errors or CPU+memory hardware errors. This can improve
+ // write speeds but is only recommended for temporary use. Does not
+ // change protection against corrupt storage (e.g. verify_checksums).
+ Options* DisableExtraChecks();
};
-//
// An application can issue a read request (via Get/Iterators) and specify
// if that read should process data that ALREADY resides on a specified cache
// level. For example, if an application specifies kBlockCacheTier then the
// need to have the same prefix. This is because ordering is not guaranteed
// outside of prefix domain.
//
+ // In case of user_defined timestamp, if enabled, iterate_lower_bound should
+ // point to key without timestamp part.
// Default: nullptr
const Slice* iterate_lower_bound;
- // "iterate_upper_bound" defines the extent upto which the forward iterator
- // can returns entries. Once the bound is reached, Valid() will be false.
+ // "iterate_upper_bound" defines the extent up to which the forward iterator
+ // can return entries. Once the bound is reached, Valid() will be false.
// "iterate_upper_bound" is exclusive ie the bound value is
- // not a valid entry. If prefix_extractor is not null, the Seek target
- // and iterate_upper_bound need to have the same prefix.
- // This is because ordering is not guaranteed outside of prefix domain.
- //
+ // not a valid entry. If prefix_extractor is not null:
+ // 1. If options.auto_prefix_mode = true, iterate_upper_bound will be used
+ // to infer whether prefix iterating (e.g. applying prefix bloom filter)
+ // can be used within RocksDB. This is done by comparing
+ // iterate_upper_bound with the seek key.
+ // 2. If options.auto_prefix_mode = false, iterate_upper_bound only takes
+ // effect if it shares the same prefix as the seek key. If
+ // iterate_upper_bound is outside the prefix of the seek key, then keys
+ // returned outside the prefix range will be undefined, just as if
+ // iterate_upper_bound = null.
+ // If iterate_upper_bound is not null, SeekToLast() will position the iterator
+ // at the first key smaller than iterate_upper_bound.
+ //
+ // In case of user_defined timestamp, if enabled, iterate_upper_bound should
+ // point to key without timestamp part.
// Default: nullptr
const Slice* iterate_upper_bound;
// RocksDB does auto-readahead for iterators on noticing more than two reads
// for a table file. The readahead starts at 8KB and doubles on every
- // additional read upto 256KB.
+ // additional read up to 256KB.
// This option can help if most of the range scans are large, and if it is
// determined that a larger readahead than that enabled by auto-readahead is
// needed.
// Default: true
bool verify_checksums;
- // Should the "data block"/"index block"" read for this iteration be placed in
+ // Should the "data block"/"index block" read for this iteration be placed in
// block cache?
// Callers may wish to set this field to false for bulk scans.
// This would help not to the change eviction order of existing items in the
// used in the table. Some table format (e.g. plain table) may not support
// this option.
// If true when calling Get(), we also skip prefix bloom when reading from
- // block based table. It provides a way to read existing data after
- // changing implementation of prefix extractor.
+ // block based table, which only affects Get() performance.
// Default: false
bool total_order_seek;
// When true, by default use total_order_seek = true, and RocksDB can
// selectively enable prefix seek mode if won't generate a different result
// from total_order_seek, based on seek key, and iterator upper bound.
- // Not suppported in ROCKSDB_LITE mode, in the way that even with value true
+ // Not supported in ROCKSDB_LITE mode, in the way that even with value true
// prefix mode is not used.
+ // BUG: Using Comparator::IsSameLengthImmediateSuccessor and
+ // SliceTransform::FullLengthEnabled to enable prefix mode in cases where
+ // prefix of upper bound differs from prefix of seek key has a flaw.
+ // If present in the DB, "short keys" (shorter than "full length" prefix)
+ // can be omitted from auto_prefix_mode iteration when they would be present
+ // in total_order_seek iteration, regardless of whether the short keys are
+ // "in domain" of the prefix extractor. This is not an issue if no short
+ // keys are added to DB or are not expected to be returned by such
+ // iterators. (We are also assuming the new condition on
+ // IsSameLengthImmediateSuccessor is satisfied; see its BUG section).
+ // A bug example is in DBTest2::AutoPrefixMode1, search for "BUG".
// Default: false
bool auto_prefix_mode;
// Default: false
bool background_purge_on_iterator_cleanup;
- // If true, keys deleted using the DeleteRange() API will be visible to
- // readers until they are naturally deleted during compaction. This improves
- // read performance in DBs with many range deletions.
+ // If true, range tombstones handling will be skipped in key lookup paths.
+ // For DB instances that don't use DeleteRange() calls, this setting can
+ // be used to optimize the read performance.
+ // Note that, if this assumption (of no previous DeleteRange() calls) is
+ // broken, stale keys could be served in read paths.
// Default: false
bool ignore_range_deletions;
// Default: empty (every table will be scanned)
std::function<bool(const TableProperties&)> table_filter;
- // Needed to support differential snapshots. Has 2 effects:
- // 1) Iterator will skip all internal keys with seqnum < iter_start_seqnum
- // 2) if this param > 0 iterator will return INTERNAL keys instead of
- // user keys; e.g. return tombstones as well.
- // Default: 0 (don't filter by seqnum, return user keys)
- SequenceNumber iter_start_seqnum;
-
// Timestamp of operation. Read should return the latest data visible to the
// specified timestamp. All timestamps of the same database must be of the
// same length and format. The user is responsible for providing a customized
// A timeout in microseconds to be passed to the underlying FileSystem for
// reads. As opposed to deadline, this determines the timeout for each
// individual file read request. If a MultiGet/Get/Seek/Next etc call
- // results in multiple reads, each read can last upto io_timeout us.
+ // results in multiple reads, each read can last up to io_timeout us.
std::chrono::microseconds io_timeout;
// It limits the maximum cumulative value size of the keys in batch while
// Default: std::numeric_limits<uint64_t>::max()
uint64_t value_size_soft_limit;
+ // For iterators, RocksDB does auto-readahead on noticing more than two
+ // sequential reads for a table file if user doesn't provide readahead_size.
+ // The readahead starts at 8KB and doubles on every additional read up to
+ // max_auto_readahead_size only when reads are sequential. However, at each
+ // level, if the iterator moves over to the next file, readahead_size starts
+ // again from 8KB.
+ //
+ // By enabling this option, RocksDB will do some enhancements for
+ // prefetching the data.
+ //
+ // Default: false
+ bool adaptive_readahead;
+
+ // For file reads associated with this option, charge the internal rate
+ // limiter (see `DBOptions::rate_limiter`) at the specified priority. The
+ // special value `Env::IO_TOTAL` disables charging the rate limiter.
+ //
+ // The rate limiting is bypassed, no matter this option's value, for file reads
+ // on plain tables (these can exist when `ColumnFamilyOptions::table_factory`
+ // is a `PlainTableFactory`) and cuckoo tables (these can exist when
+ // `ColumnFamilyOptions::table_factory` is a `CuckooTableFactory`).
+ //
+ // The bytes charged to rate limiter may not exactly match the file read bytes
+ // since there are some seemingly insignificant reads, like for file
+ // headers/footers, that we currently do not charge to rate limiter.
+ //
+ // Default: `Env::IO_TOTAL`.
+ Env::IOPriority rate_limiter_priority = Env::IO_TOTAL;
+
+ // Experimental
+ //
+ // If async_io is enabled, RocksDB will prefetch some of the data
+ // asynchronously. RocksDB applies it when the reads are sequential, as part
+ // of its internal automatic prefetching.
+ //
+ // Default: false
+ bool async_io;
+
+ // Experimental
+ //
+ // If async_io is set, then this flag controls whether we read SST files
+ // in multiple levels asynchronously. Enabling this flag can help reduce
+ // MultiGet latency by maximizing the number of SST files read in
+ // parallel if the keys in the MultiGet batch are in different levels. It
+ // comes at the expense of slightly higher CPU overhead.
+ //
+ // Default: true
+ bool optimize_multiget_for_io;
+
ReadOptions();
ReadOptions(bool cksum, bool cache);
};
bool no_slowdown;
// If true, this write request is of lower priority if compaction is
- // behind. In this case, no_slowdown = true, the request will be cancelled
+ // behind. In this case, no_slowdown = true, the request will be canceled
// immediately with Status::Incomplete() returned. Otherwise, it will be
// slowed down. The slowdown value is determined by RocksDB to guarantee
// it introduces minimum impacts to high priority writes.
// Default: false
bool memtable_insert_hint_per_batch;
- // Timestamp of write operation, e.g. Put. All timestamps of the same
- // database must share the same length and format. The user is also
- // responsible for providing a customized compare function via Comparator to
- // order <key, timestamp> tuples. If the user wants to enable timestamp, then
- // all write operations must be associated with timestamp because RocksDB, as
- // a single-node storage engine currently has no knowledge of global time,
- // thus has to rely on the application.
- // The user-specified timestamp feature is still under active development,
- // and the API is subject to change.
- const Slice* timestamp;
+ // For writes associated with this option, charge the internal rate
+ // limiter (see `DBOptions::rate_limiter`) at the specified priority. The
+ // special value `Env::IO_TOTAL` disables charging the rate limiter.
+ //
+ // Currently the support covers automatic WAL flushes, which happen during
+ // live updates (`Put()`, `Write()`, `Delete()`, etc.)
+ // when `WriteOptions::disableWAL == false`
+ // and `DBOptions::manual_wal_flush == false`.
+ //
+ // Only `Env::IO_USER` and `Env::IO_TOTAL` are allowed
+ // due to implementation constraints.
+ //
+ // Default: `Env::IO_TOTAL`
+ Env::IOPriority rate_limiter_priority;
+
+ // `protection_bytes_per_key` is the number of bytes used to store
+ // protection information for each key entry. Currently supported values are
+ // zero (disabled) and eight.
+ //
+ // Default: zero (disabled).
+ size_t protection_bytes_per_key;
WriteOptions()
: sync(false),
no_slowdown(false),
low_pri(false),
memtable_insert_hint_per_batch(false),
- timestamp(nullptr) {}
+ rate_limiter_priority(Env::IO_TOTAL),
+ protection_bytes_per_key(0) {}
};
// Options that control flush operations
kForceOptimized,
};
+// For manual compaction, we can configure if we want to skip/force garbage
+// collection of blob files.
+enum class BlobGarbageCollectionPolicy {
+ // Force blob file garbage collection.
+ kForce,
+ // Skip blob file garbage collection.
+ kDisable,
+ // Inherit blob file garbage collection policy from ColumnFamilyOptions.
+ kUseDefault,
+};
+
// CompactRangeOptions is used by CompactRange() call.
struct CompactRangeOptions {
// If true, no other compaction will run at the same time as this
- // manual compaction
- bool exclusive_manual_compaction = true;
+ // manual compaction.
+ //
+ // Default: false
+ bool exclusive_manual_compaction = false;
+
// If true, compacted files will be moved to the minimum level capable
// of holding the data or given level (specified non-negative target_level).
bool change_level = false;
bool allow_write_stall = false;
// If > 0, it will replace the option in the DBOptions for this compaction.
uint32_t max_subcompactions = 0;
+ // Set the user-defined timestamp low bound; data with a timestamp older than
+ // the low bound may be GCed by compaction. Default: nullptr
+ const Slice* full_history_ts_low = nullptr;
+
+ // Allows cancellation of an in-progress manual compaction.
+ //
+ // Cancellation can be delayed waiting on automatic compactions when used
+ // together with `exclusive_manual_compaction == true`.
+ std::atomic<bool>* canceled = nullptr;
+ // NOTE: Calling DisableManualCompaction() overwrites the user-provided
+ // canceled variable in CompactRangeOptions.
+ // Typically, when CompactRange is being called in one thread (t1) with
+ // canceled = false, and DisableManualCompaction is being called in the
+ // other thread (t2), manual compaction is disabled normally, even if the
+ // compaction iterator may still scan a few items before *canceled is
+ // set to true.
+
+ // If set to kForce, RocksDB will override enable_blob_file_garbage_collection
+ // to true; if set to kDisable, RocksDB will override it to false, and
+ // kUseDefault leaves the setting in effect. This enables customers to both
+ // force-enable and force-disable GC when calling CompactRange.
+ BlobGarbageCollectionPolicy blob_garbage_collection_policy =
+ BlobGarbageCollectionPolicy::kUseDefault;
+
+ // If set to < 0 or > 1, RocksDB leaves blob_garbage_collection_age_cutoff
+ // from ColumnFamilyOptions in effect. Otherwise, it will override the
+ // user-provided setting. This enables customers to selectively override the
+ // age cutoff.
+ double blob_garbage_collection_age_cutoff = -1;
};
// IngestExternalFileOptions is used by IngestExternalFile()
bool allow_blocking_flush = true;
// Set to true if you would like duplicate keys in the file being ingested
// to be skipped rather than overwriting existing data under that key.
- // Usecase: back-fill of some historical data in the database without
+ // Use case: back-fill of some historical data in the database without
// over-writing existing newer version of data.
// This option could only be used if the DB has been running
// with allow_ingest_behind=true since the dawn of time.
// will be ignored; 2) If DB enable the checksum function, we calculate the
// sst file checksum after the file is moved or copied and compare the
// checksum and checksum name. If checksum or checksum function name does
- // not match, ingestion will be failed. If the verification is sucessful,
+ // not match, ingestion will be failed. If the verification is successful,
// checksum and checksum function name will be stored in Manifest.
// If this option is set to FALSE, 1) if DB does not enable checksum,
// the ingested checksum information will be ignored; 2) if DB enable the
// ingestion. However, if no checksum information is provided with the
// ingested files, DB will generate the checksum and store in the Manifest.
bool verify_file_checksum = true;
+ // Set to TRUE if the user wants the file to be ingested into the bottommost
+ // level. An error of Status::TryAgain() will be returned if a file cannot
+ // fit in the bottommost level when calling
+ // DB::IngestExternalFile()/DB::IngestExternalFiles(). The user should clear
+ // the bottommost level in the overlapping range before re-attempting.
+ //
+ // ingest_behind takes precedence over fail_if_not_bottommost_level.
+ bool fail_if_not_bottommost_level = false;
};
enum TraceFilterType : uint64_t {
// Do not trace the get operations
kTraceFilterGet = 0x1 << 0,
// Do not trace the write operations
- kTraceFilterWrite = 0x1 << 1
+ kTraceFilterWrite = 0x1 << 1,
+ // Do not trace the `Iterator::Seek()` operations
+ kTraceFilterIteratorSeek = 0x1 << 2,
+ // Do not trace the `Iterator::SeekForPrev()` operations
+ kTraceFilterIteratorSeekForPrev = 0x1 << 3,
+ // Do not trace the `MultiGet()` operations
+ kTraceFilterMultiGet = 0x1 << 4,
};
// TraceOptions is used for StartTrace
uint64_t sampling_frequency = 1;
// Note: The filtering happens before sampling.
uint64_t filter = kTraceFilterNone;
+ // When true, the order of write records in the trace will match the order of
+ // the corresponding write records in the WAL and as applied to the DB. There
+ // may be a performance penalty associated with preserving this ordering.
+ //
+ // Default: false. This means write records in the trace may be in an order
+ // different from the WAL's order.
+ bool preserve_write_order = false;
};
// ImportColumnFamilyOptions is used by ImportColumnFamily()
// Options used with DB::GetApproximateSizes()
struct SizeApproximationOptions {
// Defines whether the returned size should include the recently written
- // data in the mem-tables. If set to false, include_files must be true.
- bool include_memtabtles = false;
+ // data in the memtables. If set to false, include_files must be true.
+ bool include_memtables = false;
// Defines whether the returned size should include data serialized to disk.
- // If set to false, include_memtabtles must be true.
+ // If set to false, include_memtables must be true.
bool include_files = true;
// When approximating the files total size that is used to store a keys range
// using DB::GetApproximateSizes, allow approximation with an error margin of
double files_size_error_margin = -1.0;
};
+struct CompactionServiceOptionsOverride {
+ // Currently, pointer configurations are not passed to the compaction
+ // service, so the user needs to set them here. This will be removed once
+ // pointer configuration passing is supported.
+ Env* env = Env::Default();
+ std::shared_ptr<FileChecksumGenFactory> file_checksum_gen_factory = nullptr;
+
+ const Comparator* comparator = BytewiseComparator();
+ std::shared_ptr<MergeOperator> merge_operator = nullptr;
+ const CompactionFilter* compaction_filter = nullptr;
+ std::shared_ptr<CompactionFilterFactory> compaction_filter_factory = nullptr;
+ std::shared_ptr<const SliceTransform> prefix_extractor = nullptr;
+ std::shared_ptr<TableFactory> table_factory;
+ std::shared_ptr<SstPartitionerFactory> sst_partitioner_factory = nullptr;
+
+ // Only subsets of events are triggered in remote compaction worker, like:
+ // `OnTableFileCreated`, `OnTableFileCreationStarted`,
+ // `ShouldBeNotifiedOnFileIO`, `OnSubcompactionBegin`,
+ // `OnSubcompactionCompleted`, etc. Worth mentioning, `OnCompactionBegin` and
+ // `OnCompactionCompleted` won't be triggered. They will be triggered on the
+ // primary DB side.
+ std::vector<std::shared_ptr<EventListener>> listeners;
+
+ // statistics is used to collect DB operation metrics. The metrics won't be
+ // returned to the CompactionService primary host; to collect them, the user
+ // needs to set this here.
+ std::shared_ptr<Statistics> statistics = nullptr;
+
+ // Only compaction-generated SST files use this user-defined table properties
+ // collector.
+ std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+ table_properties_collector_factories;
+};
+
+struct OpenAndCompactOptions {
+ // Allows cancellation of an in-progress compaction.
+ std::atomic<bool>* canceled = nullptr;
+};
+
+#ifndef ROCKSDB_LITE
+struct LiveFilesStorageInfoOptions {
+ // Whether to populate FileStorageInfo::file_checksum* or leave blank
+ bool include_checksum_info = false;
+ // Flushes memtables if total size in bytes of live WAL files is >= this
+ // number (and DB is not read-only).
+ // Default: always force a flush without checking sizes.
+ uint64_t wal_size_for_flush = 0;
+};
+#endif // !ROCKSDB_LITE
+
} // namespace ROCKSDB_NAMESPACE