#include "rocksdb/advanced_options.h"
#include "rocksdb/comparator.h"
#include "rocksdb/compression_type.h"
+#include "rocksdb/customizable.h"
+#include "rocksdb/data_structure.h"
#include "rocksdb/env.h"
#include "rocksdb/file_checksum.h"
#include "rocksdb/listener.h"
struct Options;
struct DbPath;
+using FileTypeSet = SmallEnumSet<FileType, FileType::kBlobFile>;
+
struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
// The function recovers options to a previous version. Only 4.6 or later
// versions are supported.
+ // NOT MAINTAINED: This function has not been and is not maintained.
+ // DEPRECATED: This function might be removed in a future release.
+ // In general, defaults are changed to suit broad interests. Opting
+ // out of a change on upgrade should be deliberate and considered.
ColumnFamilyOptions* OldDefaults(int rocksdb_major_version = 4,
int rocksdb_minor_version = 6);
// Allows an application to modify/delete a key-value during background
// compaction.
//
- // If the client requires a new compaction filter to be used for different
- // compaction runs, it can specify compaction_filter_factory instead of this
- // option. The client should specify only one of the two.
+ // If the client requires a new `CompactionFilter` to be used for different
+ // compaction runs and/or requires a `CompactionFilter` for table file
+ // creations outside of compaction, it can specify compaction_filter_factory
+ // instead of this option. The client should specify only one of the two.
// compaction_filter takes precedence over compaction_filter_factory if
// client specifies both.
//
// Default: nullptr
const CompactionFilter* compaction_filter = nullptr;
- // This is a factory that provides compaction filter objects which allow
- // an application to modify/delete a key-value during background compaction.
+ // This is a factory that provides `CompactionFilter` objects which allow
+ // an application to modify/delete a key-value during table file creation.
+ //
+ // Unlike the `compaction_filter` option, which is used when compaction
+ // creates a table file, this factory allows using a `CompactionFilter` when a
+ // table file is created for various reasons. The factory can decide what
+ // `TableFileCreationReason`s use a `CompactionFilter`. For compatibility, by
+ // default the decision is to use a `CompactionFilter` for
+ // `TableFileCreationReason::kCompaction` only.
//
- // A new filter will be created on each compaction run. If multithreaded
- // compaction is being used, each created CompactionFilter will only be used
- // from a single thread and so does not need to be thread-safe.
+ // Each thread of work involving creating table files will create a new
+ // `CompactionFilter` when it will be used according to the above
+ // `TableFileCreationReason`-based decision. This allows the application to
+ // know about the different ongoing threads of work and makes it unnecessary
+ // for `CompactionFilter` to provide thread-safety.
//
// Default: nullptr
std::shared_ptr<CompactionFilterFactory> compaction_filter_factory = nullptr;
CompressionType compression;
// Compression algorithm that will be used for the bottommost level that
- // contain files.
+ // contains files. The behavior for num_levels = 1 is not well defined.
+ // Right now, with num_levels = 1, all compaction outputs will use
+ // bottommost_compression and all flush outputs still use options.compression,
+ // but the behavior is subject to change.
//
// Default: kDisableCompressionOption (Disabled)
CompressionType bottommost_compression = kDisableCompressionOption;
// different options for compression algorithms used by bottommost_compression
// if it is enabled. To enable it, please see the definition of
- // CompressionOptions.
+ // CompressionOptions. Behavior for num_levels = 1 is the same as
+ // options.bottommost_compression.
CompressionOptions bottommost_compression_opts;
// different options for compression algorithms
// Dynamically changeable through SetOptions() API
int level0_file_num_compaction_trigger = 4;
- // If non-nullptr, use the specified function to determine the
- // prefixes for keys. These prefixes will be placed in the filter.
- // Depending on the workload, this can reduce the number of read-IOP
- // cost for scans when a prefix is passed via ReadOptions to
- // db.NewIterator(). For prefix filtering to work properly,
- // "prefix_extractor" and "comparator" must be such that the following
- // properties hold:
- //
- // 1) key.starts_with(prefix(key))
- // 2) Compare(prefix(key), key) <= 0.
- // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
- // 4) prefix(prefix(key)) == prefix(key)
+ // If non-nullptr, use the specified function to put keys in contiguous
+ // groups called "prefixes". These prefixes are used to place one
+ // representative entry for the group into the Bloom filter
+ // rather than an entry for each key (see whole_key_filtering).
+ // Under certain conditions, this enables optimizing some range queries
+ // (Iterators) in addition to some point lookups (Get/MultiGet).
+ //
+ // Together `prefix_extractor` and `comparator` must satisfy one essential
+ // property for valid prefix filtering of range queries:
+ // If Compare(k1, k2) <= 0 and Compare(k2, k3) <= 0 and
+ // InDomain(k1) and InDomain(k3) and prefix(k1) == prefix(k3),
+ // Then InDomain(k2) and prefix(k2) == prefix(k1)
+ //
+ // In other words, all keys with the same prefix must be in a contiguous
+ // group by comparator order, and cannot be interrupted by keys with no
+ // prefix ("out of domain"). (This makes it valid to conclude that no
+ // entries within some bounds are present if the upper and lower bounds
+ // have a common prefix and no entries with that same prefix are present.)
+ //
+ // Some other properties are recommended but not strictly required. Under
+ // most sensible comparators, the following will need to hold true to
+ // satisfy the essential property above:
+ // * "Prefix is a prefix": key.starts_with(prefix(key))
+ // * "Prefixes preserve ordering": If Compare(k1, k2) <= 0, then
+ // Compare(prefix(k1), prefix(k2)) <= 0
+ //
+ // The next two properties ensure that seeking to a prefix allows
+ // enumerating all entries with that prefix:
+ // * "Prefix starts the group": Compare(prefix(key), key) <= 0
+ // * "Prefix idempotent": prefix(prefix(key)) == prefix(key)
//
// Default: nullptr
std::shared_ptr<const SliceTransform> prefix_extractor = nullptr;
extern const char* kHostnameForDbHostId;
+// Status a remote CompactionService reports back for a compaction job.
+enum class CompactionServiceJobStatus : char {
+  kSuccess,   // the remote compaction finished successfully
+  kFailure,   // the remote compaction failed
+  kUseLocal,  // decline the job; run the compaction locally on the primary
+};
+
+// Information about a compaction job passed to a CompactionService
+// implementation (see CompactionService::StartV2 / WaitForCompleteV2).
+struct CompactionServiceJobInfo {
+  // Name (path) of the DB that scheduled the compaction.
+  std::string db_name;
+  // Id of the DB.
+  std::string db_id;
+  // Id of the current DB session.
+  std::string db_session_id;
+  uint64_t job_id;  // job_id is only unique within the current DB and session;
+                    // restarting the DB will reset the job_id. `db_id` and
+                    // `db_session_id` could help you build a unique id across
+                    // different DBs and sessions.
+
+  // Priority at which the compaction job was scheduled on the primary host.
+  Env::Priority priority;
+
+  // String arguments are taken by value and moved into the members.
+  CompactionServiceJobInfo(std::string db_name_, std::string db_id_,
+                           std::string db_session_id_, uint64_t job_id_,
+                           Env::Priority priority_)
+      : db_name(std::move(db_name_)),
+        db_id(std::move(db_id_)),
+        db_session_id(std::move(db_session_id_)),
+        job_id(job_id_),
+        priority(priority_) {}
+};
+
+// Interface for offloading a compaction to a different host or process
+// ("remote compaction"). See DBOptions::compaction_service.
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
+class CompactionService : public Customizable {
+ public:
+  // Type id used by the Customizable framework; always "CompactionService".
+  static const char* Type() { return "CompactionService"; }
+
+  // Returns the name of this compaction service.
+  const char* Name() const override = 0;
+
+  // Start the remote compaction with `compaction_service_input`, which can be
+  // passed to `DB::OpenAndCompact()` on the remote side. `info` provides the
+  // information the user might want to know, which includes `job_id`.
+  // The default implementation returns `kUseLocal`, i.e. the compaction is
+  // run locally on the primary host.
+  virtual CompactionServiceJobStatus StartV2(
+      const CompactionServiceJobInfo& /*info*/,
+      const std::string& /*compaction_service_input*/) {
+    return CompactionServiceJobStatus::kUseLocal;
+  }
+
+  // Wait for the remote compaction to finish; on success the result is
+  // returned via `compaction_service_result`. The default implementation
+  // returns `kUseLocal`, i.e. fall back to local compaction.
+  virtual CompactionServiceJobStatus WaitForCompleteV2(
+      const CompactionServiceJobInfo& /*info*/,
+      std::string* /*compaction_service_result*/) {
+    return CompactionServiceJobStatus::kUseLocal;
+  }
+
+  ~CompactionService() override = default;
+};
+
struct DBOptions {
// The function recovers options to the option as in version 4.6.
+ // NOT MAINTAINED: This function has not been and is not maintained.
+ // DEPRECATED: This function might be removed in a future release.
+ // In general, defaults are changed to suit broad interests. Opting
+ // out of a change on upgrade should be deliberate and considered.
DBOptions* OldDefaults(int rocksdb_major_version = 4,
int rocksdb_minor_version = 6);
// Default: true
bool paranoid_checks = true;
- // If true, track WALs in MANIFEST and verify them on recovery.
+ // If true, during memtable flush, RocksDB will validate the total number of
+ // entries read in flush against the counter of entries inserted into it.
+ // The option is here to turn the feature off in case this new validation
+ // feature has a bug.
+ // Default: true
+ bool flush_verify_memtable_count = true;
+
+ // If true, the log numbers and sizes of the synced WALs are tracked
+ // in MANIFEST. During DB recovery, if a synced WAL is missing
+ // from disk, or the WAL's size does not match the recorded size in
+ // MANIFEST, an error will be reported and the recovery will be aborted.
//
- // If a WAL is tracked in MANIFEST but is missing from disk on recovery,
- // or the size of the tracked WAL is larger than the WAL's on-disk size,
- // an error is reported and recovery is aborted.
+ // This is one additional protection against WAL corruption besides the
+ // per-WAL-entry checksum.
//
- // If a WAL is not tracked in MANIFEST, then no verification will happen
- // during recovery.
+ // Note that this option does not work with secondary instances.
+ // Currently, only the syncing of closed WALs is tracked. Calling
+ // `DB::SyncWAL()`, etc. or writing with `WriteOptions::sync=true` to sync
+ // the live WAL is not tracked for performance/efficiency reasons.
//
// Default: false
- // FIXME(cheng): This option is part of a work in progress and does not yet
- // work
bool track_and_verify_wals_in_manifest = false;
+ // If true, verifies the SST unique id between MANIFEST and actual file
+ // each time an SST file is opened. This check ensures an SST file is not
+ // overwritten or misplaced. A corruption error will be reported if mismatch
+ // detected, but only when MANIFEST tracks the unique id, which starts from
+ // RocksDB version 7.3. Although the tracked internal unique id is related
+ // to the one returned by GetUniqueIdFromTableProperties, that is subject to
+ // change.
+ // NOTE: verification is currently only done on SST files using block-based
+ // table format.
+ //
+ // Setting to false should only be needed in case of unexpected problems.
+ //
+ // Although an early version of this option opened all SST files for
+ // verification on DB::Open, that is no longer guaranteed. However, as
+ // documented in an above option, if max_open_files is -1, DB will open all
+ // files on DB::Open().
+ //
+ // Default: true
+ bool verify_sst_unique_id_in_manifest = true;
+
// Use the specified object to interact with the environment,
// e.g. to read/write files, schedule background work, etc. In the near
// future, support for doing storage operations such as read/write files
// Default: Env::Default()
Env* env = Env::Default();
- // Use to control write rate of flush and compaction. Flush has higher
- // priority than compaction. Rate limiting is disabled if nullptr.
- // If rate limiter is enabled, bytes_per_sync is set to 1MB by default.
+ // Limits internal file read/write bandwidth:
+ //
+ // - Flush requests write bandwidth at `Env::IOPriority::IO_HIGH`
+ // - Compaction requests read and write bandwidth at
+ // `Env::IOPriority::IO_LOW`
+ // - Reads associated with a `ReadOptions` can be charged at
+ // `ReadOptions::rate_limiter_priority` (see that option's API doc for usage
+ // and limitations).
+ // - Writes associated with a `WriteOptions` can be charged at
+ // `WriteOptions::rate_limiter_priority` (see that option's API doc for
+ // usage and limitations).
+ //
+ // Rate limiting is disabled if nullptr. If rate limiter is enabled,
+ // bytes_per_sync is set to 1MB by default.
+ //
// Default: nullptr
std::shared_ptr<RateLimiter> rate_limiter = nullptr;
// on target_file_size_base and target_file_size_multiplier for level-based
// compaction. For universal-style compaction, you can usually set it to -1.
//
+ // A high value or -1 for this option can cause high memory usage.
+ // See BlockBasedTableOptions::cache_usage_options to constrain
+ // memory usage in case of block based table format.
+ //
// Default: -1
//
// Dynamically changeable through SetDBOptions() API.
// (i.e. the ones that are causing all the space amplification). If set to 0
// (default), we will dynamically choose the WAL size limit to be
// [sum of all write_buffer_size * max_write_buffer_number] * 4
- // This option takes effect only when there are more than one column family as
- // otherwise the wal size is dictated by the write_buffer_size.
+ //
+ // For example, with 15 column families, each with
+ // write_buffer_size = 128 MB
+ // max_write_buffer_number = 6
+ // max_total_wal_size will be calculated to be [15 * 128MB * 6] * 4 = 45GB
+ //
+ // The RocksDB wiki has some discussion about how the WAL interacts
+ // with memtables and flushing of column families.
+ // https://github.com/facebook/rocksdb/wiki/Column-Families
+ //
+ // This option takes effect only when there are more than one column
+ // family as otherwise the wal size is dictated by the write_buffer_size.
//
// Default: 0
//
// Dynamically changeable through SetDBOptions() API.
int max_background_jobs = 2;
- // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
- // value of max_background_jobs. This option is ignored.
- //
- // Dynamically changeable through SetDBOptions() API.
- int base_background_compactions = -1;
-
- // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
+ // DEPRECATED: RocksDB automatically decides this based on the
// value of max_background_jobs. For backwards compatibility we will set
// `max_background_jobs = max_background_compactions + max_background_flushes`
// in the case where user sets at least one of `max_background_compactions` or
// Dynamically changeable through SetDBOptions() API.
uint32_t max_subcompactions = 1;
- // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
+ // DEPRECATED: RocksDB automatically decides this based on the
// value of max_background_jobs. For backwards compatibility we will set
// `max_background_jobs = max_background_compactions + max_background_flushes`
// in the case where user sets at least one of `max_background_compactions` or
// Number of shards used for table cache.
int table_cache_numshardbits = 6;
- // NOT SUPPORTED ANYMORE
- // int table_cache_remove_scan_count_limit;
-
// The following two fields affect how archived logs will be deleted.
// 1. If both set to 0, logs will be deleted asap and will not get into
// the archive.
// large amounts of data (such as xfs's allocsize option).
size_t manifest_preallocation_size = 4 * 1024 * 1024;
- // Allow the OS to mmap file for reading sst tables. Default: false
+ // Allow the OS to mmap file for reading sst tables.
+ // Not recommended for 32-bit OS.
+ // When the option is set to true and compression is disabled, the blocks
+ // will not be copied and will be read directly from the mmap-ed memory
+ // area, and the block will not be inserted into the block cache. However,
+ // checksums will still be checked if ReadOptions.verify_checksums is set
+ // to be true. It means a checksum check every time a block is read, more
+ // than the setup where the option is set to false and the block cache is
+ // used. The common use of the options is to run RocksDB on ramfs, where
+ // checksum verification is usually not needed.
+ // Default: false
bool allow_mmap_reads = false;
// Allow the OS to mmap file for writing.
// be used. Memory mapped files are not impacted by these parameters.
// Use O_DIRECT for user and compaction reads.
- // When true, we also force new_table_reader_for_compaction_inputs to true.
// Default: false
// Not supported in ROCKSDB_LITE mode!
bool use_direct_reads = false;
// Not supported in ROCKSDB_LITE mode!
bool use_direct_io_for_flush_and_compaction = false;
- // If false, fallocate() calls are bypassed
+ // If false, fallocate() calls are bypassed, which disables file
+ // preallocation. File space preallocation is used to increase file
+ // write/append performance. By default, RocksDB preallocates space for WAL,
+ // SST, and Manifest files; the extra space is truncated when the file is
+ // written.
+ // Warning: if you're using btrfs, we would recommend setting
+ // `allow_fallocate=false` to disable preallocation. As on btrfs, the extra
+ // allocated space cannot be freed, which could be significant if you have
+ // lots of files. More details about this limitation:
+ // https://github.com/btrfs/btrfs-dev-docs/blob/471c5699336e043114d4bca02adcd57d9dab9c44/data-extent-reference-counts.md
bool allow_fallocate = true;
// Disable child process inherit open files. Default: true
bool is_fd_close_on_exec = true;
- // NOT SUPPORTED ANYMORE -- this options is no longer used
- bool skip_log_error_on_recovery = false;
-
// if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
//
// Default: 600 (10 min)
// can be passed into multiple DBs and it will track the sum of size of all
// the DBs. If the total size of all live memtables of all the DBs exceeds
// a limit, a flush will be triggered in the next DB to which the next write
- // is issued.
+ // is issued, as long as there is one or more column family not already
+ // flushing.
//
// If the object is only passed to one DB, the behavior is the same as
// db_write_buffer_size. When write_buffer_manager is set, the value set will
enum AccessHint { NONE, NORMAL, SEQUENTIAL, WILLNEED };
AccessHint access_hint_on_compaction_start = NORMAL;
- // If true, always create a new file descriptor and new table reader
- // for compaction inputs. Turn this parameter on may introduce extra
- // memory usage in the table reader, if it allocates extra memory
- // for indexes. This will allow file descriptor prefetch options
- // to be set for compaction input files and not to impact file
- // descriptors for the same file used by user queries.
- // Suggest to enable BlockBasedTableOptions.cache_index_and_filter_blocks
- // for this mode if using block-based table.
- //
- // Default: false
- // This flag has no affect on the behavior of compaction and plan to delete
- // in the future.
- bool new_table_reader_for_compaction_inputs = false;
-
// If non-zero, we perform bigger reads when doing compaction. If you're
// running RocksDB on spinning disks, you should set this to at least 2MB.
// That way RocksDB's compaction is doing sequential instead of random reads.
//
- // When non-zero, we also force new_table_reader_for_compaction_inputs to
- // true.
- //
// Default: 0
//
// Dynamically changeable through SetDBOptions() API.
size_t random_access_max_buffer_size = 1024 * 1024;
// This is the maximum buffer size that is used by WritableFileWriter.
- // On Windows, we need to maintain an aligned buffer for writes.
+ // With direct IO, we need to maintain an aligned buffer for writes.
// We allow the buffer to grow until it's size hits the limit in buffered
// IO and fix the buffer size when using direct IO to ensure alignment of
// write requests if the logical sector size is unusual
// Allows OS to incrementally sync files to disk while they are being
// written, asynchronously, in the background. This operation can be used
// to smooth out write I/Os over time. Users shouldn't rely on it for
- // persistency guarantee.
+ // persistence guarantee.
// Issue one request for every bytes_per_sync written. 0 turns it off.
//
// You may consider using rate_limiter to regulate write rate to device.
#endif // ROCKSDB_LITE
// If true, then DB::Open / CreateColumnFamily / DropColumnFamily
- // / SetOptions will fail if options file is not detected or properly
- // persisted.
+ // SetOptions will fail if options file is not properly persisted.
//
// DEFAULT: false
bool fail_if_options_file_error = false;
// Immutable.
bool allow_ingest_behind = false;
- // Needed to support differential snapshots.
- // If set to true then DB will only process deletes with sequence number
- // less than what was set by SetPreserveDeletesSequenceNumber(uint64_t ts).
- // Clients are responsible to periodically call this method to advance
- // the cutoff time. If this method is never called and preserve_deletes
- // is set to true NO deletes will ever be processed.
- // At the moment this only keeps normal deletes, SingleDeletes will
- // not be preserved.
- // DEFAULT: false
- // Immutable (TODO: make it dynamically changeable)
- bool preserve_deletes = false;
-
// If enabled it uses two queues for writes, one for the ones with
// disable_memtable and one for the ones that also write to memtable. This
// allows the memtable writes not to lag behind other writes. It can be used
// file.
bool manual_wal_flush = false;
+ // This feature is WORK IN PROGRESS.
+ // If enabled, WAL records will be compressed before they are written.
+ // Only zstd is supported. Compressed WAL records will be read in supported
+ // versions regardless of the wal_compression settings.
+ CompressionType wal_compression = kNoCompression;
+
// If true, RocksDB supports flushing multiple column families and committing
// their results atomically to MANIFEST. Note that it is not
// necessary to set atomic_flush to true if WAL is always enabled since WAL
// Default: nullptr
std::shared_ptr<FileChecksumGenFactory> file_checksum_gen_factory = nullptr;
- // By default, RocksDB recovery fails if any table file referenced in
- // MANIFEST are missing after scanning the MANIFEST.
- // Best-efforts recovery is another recovery mode that
- // tries to restore the database to the most recent point in time without
- // missing file.
- // Currently not compatible with atomic flush. Furthermore, WAL files will
- // not be used for recovery if best_efforts_recovery is true.
+ // By default, RocksDB recovery fails if any table/blob file referenced in the
+ // final version reconstructed from the
+ // MANIFEST are missing after scanning the MANIFEST pointed to by the
+ // CURRENT file. It can also fail if verification of unique SST id fails.
+ // Best-efforts recovery is another recovery mode that does not necessarily
+ // fail when certain table/blob files are missing/corrupted or have mismatched
+ // unique id table property. Instead, best-efforts recovery recovers each
+ // column family to a point in the MANIFEST that corresponds to a version. In
+ // such a version, all valid table/blob files referenced have the expected
+ // file size. For table files, their unique id table property match the
+ // MANIFEST.
+ //
+ // Best-efforts recovery does not need a valid CURRENT file, and tries to
+ // recover the database using one of the available MANIFEST files in the db
+ // directory.
+ // Best-efforts recovery tries the available MANIFEST files from high file
+ // numbers (newer) to low file numbers (older), and stops after finding the
+ // first MANIFEST file from which the db can be recovered to a state without
+ // invalid (missing/filesize-mismatch/unique-id-mismatch) table and blob
+ // files. It is possible that the database can be restored to an empty state
+ // with no table or blob files.
+ //
+ // Regardless of this option, the IDENTITY file
+ // is updated if needed during recovery to match the DB ID in the MANIFEST (if
+ // previously using write_dbid_to_manifest) or to be in some valid state
+ // (non-empty DB ID). Currently, not compatible with atomic flush.
+ // Furthermore, WAL files will not be used for recovery if
+ // best_efforts_recovery is true. Also requires either 1) LOCK file exists or
+ // 2) underlying env's LockFile() call returns ok even for non-existing LOCK
+ // file.
+ //
// Default: false
bool best_efforts_recovery = false;
// writing a file, by tracing back to the writing host. These corruptions
// may not be caught by the checksum since they happen before checksumming.
// If left as default, the table writer will substitute it with the actual
- // hostname when writing the SST file. If set to an empty stirng, the
+ // hostname when writing the SST file. If set to an empty string, the
// property will not be written to the SST file.
//
// Default: hostname
std::string db_host_id = kHostnameForDbHostId;
+
+ // Use this if your DB wants to enable checksum handoff for writes of
+ // specific file types. Make sure that the file system you use supports
+ // crc32c checksum verification.
+ // Currently supported file types: kWALFile, kTableFile, kDescriptorFile.
+ // NOTE: currently RocksDB only generates a crc32c-based checksum for the
+ // handoff. If the storage layer has different checksum support, the user
+ // should leave this set empty. Otherwise, it may cause unexpected
+ // write failures.
+ FileTypeSet checksum_handoff_file_types;
+
+ // EXPERIMENTAL
+ // CompactionService is a feature that allows the user to run compactions on
+ // a
+ // different host or process, which offloads the background load from the
+ // primary host.
+ // It's an experimental feature, the interface will be changed without
+ // backward/forward compatibility support for now. Some known issues are still
+ // under development.
+ std::shared_ptr<CompactionService> compaction_service = nullptr;
+
+ // It indicates which lowest cache tier we want to
+ // use for a certain DB. Currently we support volatile_tier and
+ // non_volatile_tier. They are layered. By setting it to kVolatileTier, only
+ // the block cache (current implemented volatile_tier) is used. So
+ // cache entries will not spill to secondary cache (current
+ // implemented non_volatile_tier), and block cache lookup misses will not
+ // lookup in the secondary cache. When kNonVolatileBlockTier is used, we use
+ // both block cache and secondary cache.
+ //
+ // Default: kNonVolatileBlockTier
+ CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier;
+
+ // If set to false, when compaction or flush sees a SingleDelete followed by
+ // a Delete for the same user key, compaction job will not fail.
+ // Otherwise, compaction job will fail.
+ // This is a temporary option to help existing use cases migrate, and
+ // will be removed in a future release.
+ // Warning: do not set to false unless you are trying to migrate existing
+ // data in which the contract of single delete
+ // (https://github.com/facebook/rocksdb/wiki/Single-Delete) is not enforced,
+ // thus has Delete mixed with SingleDelete for the same user key. Violation
+ // of the contract leads to undefined behaviors with high possibility of data
+ // inconsistency, e.g. deleted old data become visible again, etc.
+ bool enforce_single_del_contracts = true;
};
// Options to control the behavior of a database (passed to DB::Open)
const ColumnFamilyOptions& column_family_options)
: DBOptions(db_options), ColumnFamilyOptions(column_family_options) {}
- // The function recovers options to the option as in version 4.6.
+ // Change to some default settings from an older version.
+ // NOT MAINTAINED: This function has not been and is not maintained.
+ // DEPRECATED: This function might be removed in a future release.
+ // In general, defaults are changed to suit broad interests. Opting
+ // out of a change on upgrade should be deliberate and considered.
Options* OldDefaults(int rocksdb_major_version = 4,
int rocksdb_minor_version = 6);
// Use this if your DB is very small (like under 1GB) and you don't want to
// spend lots of memory for memtables.
Options* OptimizeForSmallDb();
+
+ // Disable some checks that should not be necessary in the absence of
+ // software logic errors or CPU+memory hardware errors. This can improve
+ // write speeds but is only recommended for temporary use. Does not
+ // change protection against corrupt storage (e.g. verify_checksums).
+ Options* DisableExtraChecks();
};
-//
// An application can issue a read request (via Get/Iterators) and specify
// if that read should process data that ALREADY resides on a specified cache
// level. For example, if an application specifies kBlockCacheTier then the
// need to have the same prefix. This is because ordering is not guaranteed
// outside of prefix domain.
//
+ // In case of user_defined timestamp, if enabled, iterate_lower_bound should
+ // point to key without timestamp part.
// Default: nullptr
const Slice* iterate_lower_bound;
- // "iterate_upper_bound" defines the extent upto which the forward iterator
- // can returns entries. Once the bound is reached, Valid() will be false.
+ // "iterate_upper_bound" defines the extent up to which the forward iterator
+ // can return entries. Once the bound is reached, Valid() will be false.
// "iterate_upper_bound" is exclusive ie the bound value is
- // not a valid entry. If prefix_extractor is not null, the Seek target
- // and iterate_upper_bound need to have the same prefix.
- // This is because ordering is not guaranteed outside of prefix domain.
- //
+ // not a valid entry. If prefix_extractor is not null:
+ // 1. If options.auto_prefix_mode = true, iterate_upper_bound will be used
+ // to infer whether prefix iterating (e.g. applying prefix bloom filter)
+ // can be used within RocksDB. This is done by comparing
+ // iterate_upper_bound with the seek key.
+ // 2. If options.auto_prefix_mode = false, iterate_upper_bound only takes
+ // effect if it shares the same prefix as the seek key. If
+ // iterate_upper_bound is outside the prefix of the seek key, then keys
+ // returned outside the prefix range will be undefined, just as if
+ // iterate_upper_bound = null.
+ // If iterate_upper_bound is not null, SeekToLast() will position the iterator
+ // at the first key smaller than iterate_upper_bound.
+ //
+ // In case of user_defined timestamp, if enabled, iterate_upper_bound should
+ // point to key without timestamp part.
// Default: nullptr
const Slice* iterate_upper_bound;
// RocksDB does auto-readahead for iterators on noticing more than two reads
// for a table file. The readahead starts at 8KB and doubles on every
- // additional read upto 256KB.
+ // additional read up to 256KB.
// This option can help if most of the range scans are large, and if it is
// determined that a larger readahead than that enabled by auto-readahead is
// needed.
// Default: true
bool verify_checksums;
- // Should the "data block"/"index block"" read for this iteration be placed in
+ // Should the "data block"/"index block" read for this iteration be placed in
// block cache?
// Callers may wish to set this field to false for bulk scans.
// This would help not to the change eviction order of existing items in the
// used in the table. Some table format (e.g. plain table) may not support
// this option.
// If true when calling Get(), we also skip prefix bloom when reading from
- // block based table. It provides a way to read existing data after
- // changing implementation of prefix extractor.
+ // block based table, which only affects Get() performance.
// Default: false
bool total_order_seek;
// When true, by default use total_order_seek = true, and RocksDB can
// selectively enable prefix seek mode if won't generate a different result
// from total_order_seek, based on seek key, and iterator upper bound.
- // Not suppported in ROCKSDB_LITE mode, in the way that even with value true
+ // Not supported in ROCKSDB_LITE mode, in the way that even with value true
// prefix mode is not used.
+ // BUG: Using Comparator::IsSameLengthImmediateSuccessor and
+ // SliceTransform::FullLengthEnabled to enable prefix mode in cases where
+ // prefix of upper bound differs from prefix of seek key has a flaw.
+ // If present in the DB, "short keys" (shorter than "full length" prefix)
+ // can be omitted from auto_prefix_mode iteration when they would be present
+ // in total_order_seek iteration, regardless of whether the short keys are
+ // "in domain" of the prefix extractor. This is not an issue if no short
+ // keys are added to DB or are not expected to be returned by such
+ // iterators. (We are also assuming the new condition on
+ // IsSameLengthImmediateSuccessor is satisfied; see its BUG section).
+ // A bug example is in DBTest2::AutoPrefixMode1, search for "BUG".
// Default: false
bool auto_prefix_mode;
// Default: false
bool background_purge_on_iterator_cleanup;
- // If true, keys deleted using the DeleteRange() API will be visible to
- // readers until they are naturally deleted during compaction. This improves
- // read performance in DBs with many range deletions.
+ // If true, range tombstones handling will be skipped in key lookup paths.
+ // For DB instances that don't use DeleteRange() calls, this setting can
+ // be used to optimize the read performance.
+ // Note that, if this assumption (of no previous DeleteRange() calls) is
+ // broken, stale keys could be served in read paths.
// Default: false
bool ignore_range_deletions;
// Default: empty (every table will be scanned)
std::function<bool(const TableProperties&)> table_filter;
- // Needed to support differential snapshots. Has 2 effects:
- // 1) Iterator will skip all internal keys with seqnum < iter_start_seqnum
- // 2) if this param > 0 iterator will return INTERNAL keys instead of
- // user keys; e.g. return tombstones as well.
- // Default: 0 (don't filter by seqnum, return user keys)
- SequenceNumber iter_start_seqnum;
-
// Timestamp of operation. Read should return the latest data visible to the
// specified timestamp. All timestamps of the same database must be of the
// same length and format. The user is responsible for providing a customized
// A timeout in microseconds to be passed to the underlying FileSystem for
// reads. As opposed to deadline, this determines the timeout for each
// individual file read request. If a MultiGet/Get/Seek/Next etc call
- // results in multiple reads, each read can last upto io_timeout us.
+ // results in multiple reads, each read can last up to io_timeout us.
std::chrono::microseconds io_timeout;
// It limits the maximum cumulative value size of the keys in batch while
// Default: std::numeric_limits<uint64_t>::max()
uint64_t value_size_soft_limit;
+ // For iterators, RocksDB does auto-readahead on noticing more than two
+ // sequential reads for a table file if user doesn't provide readahead_size.
+ // The readahead starts at 8KB and doubles on every additional read up to
+ // max_auto_readahead_size only when reads are sequential. However, at each
+ // level, if the iterator moves over to the next file, readahead_size starts
+ // again from 8KB.
+ //
+ // By enabling this option, RocksDB will do some enhancements for
+ // prefetching the data.
+ //
+ // Default: false
+ bool adaptive_readahead;
+
+ // For file reads associated with this option, charge the internal rate
+ // limiter (see `DBOptions::rate_limiter`) at the specified priority. The
+ // special value `Env::IO_TOTAL` disables charging the rate limiter.
+ //
+ // The rate limiting is bypassed, no matter this option's value, for file reads
+ // on plain tables (these can exist when `ColumnFamilyOptions::table_factory`
+ // is a `PlainTableFactory`) and cuckoo tables (these can exist when
+ // `ColumnFamilyOptions::table_factory` is a `CuckooTableFactory`).
+ //
+ // The bytes charged to rate limiter may not exactly match the file read bytes
+ // since there are some seemingly insignificant reads, like for file
+ // headers/footers, that we currently do not charge to rate limiter.
+ //
+ // Default: `Env::IO_TOTAL`.
+ Env::IOPriority rate_limiter_priority = Env::IO_TOTAL;
+
+ // Experimental
+ //
+ // If async_io is enabled, RocksDB will prefetch some of the data
+ // asynchronously. RocksDB applies it when the reads are sequential, as part
+ // of its internal automatic prefetching.
+ //
+ // Default: false
+ bool async_io;
+
+ // Experimental
+ //
+ // If async_io is set, then this flag controls whether we read SST files
+ // in multiple levels asynchronously. Enabling this flag can help reduce
+ // MultiGet latency by maximizing the number of SST files read in
+ // parallel if the keys in the MultiGet batch are in different levels. It
+ // comes at the expense of slightly higher CPU overhead.
+ //
+ // Default: true
+ bool optimize_multiget_for_io;
+
ReadOptions();
ReadOptions(bool cksum, bool cache);
};
bool no_slowdown;
// If true, this write request is of lower priority if compaction is
- // behind. In this case, no_slowdown = true, the request will be cancelled
+ // behind. In this case, no_slowdown = true, the request will be canceled
// immediately with Status::Incomplete() returned. Otherwise, it will be
// slowed down. The slowdown value is determined by RocksDB to guarantee
// it introduces minimum impacts to high priority writes.
// Default: false
bool memtable_insert_hint_per_batch;
- // Timestamp of write operation, e.g. Put. All timestamps of the same
- // database must share the same length and format. The user is also
- // responsible for providing a customized compare function via Comparator to
- // order <key, timestamp> tuples. If the user wants to enable timestamp, then
- // all write operations must be associated with timestamp because RocksDB, as
- // a single-node storage engine currently has no knowledge of global time,
- // thus has to rely on the application.
- // The user-specified timestamp feature is still under active development,
- // and the API is subject to change.
- const Slice* timestamp;
+ // For writes associated with this option, charge the internal rate
+ // limiter (see `DBOptions::rate_limiter`) at the specified priority. The
+ // special value `Env::IO_TOTAL` disables charging the rate limiter.
+ //
+ // Currently the support covers automatic WAL flushes, which happen during
+ // live updates (`Put()`, `Write()`, `Delete()`, etc.)
+ // when `WriteOptions::disableWAL == false`
+ // and `DBOptions::manual_wal_flush == false`.
+ //
+ // Only `Env::IO_USER` and `Env::IO_TOTAL` are allowed
+ // due to implementation constraints.
+ //
+ // Default: `Env::IO_TOTAL`
+ Env::IOPriority rate_limiter_priority;
+
+ // `protection_bytes_per_key` is the number of bytes used to store
+ // protection information for each key entry. Currently supported values are
+ // zero (disabled) and eight.
+ //
+ // Default: zero (disabled).
+ size_t protection_bytes_per_key;
WriteOptions()
: sync(false),
no_slowdown(false),
low_pri(false),
memtable_insert_hint_per_batch(false),
- timestamp(nullptr) {}
+ rate_limiter_priority(Env::IO_TOTAL),
+ protection_bytes_per_key(0) {}
};
// Options that control flush operations
kForceOptimized,
};
+// For manual compaction, we can configure if we want to skip/force garbage
+// collection of blob files.
+enum class BlobGarbageCollectionPolicy {
+ // Force blob file garbage collection.
+ kForce,
+ // Skip blob file garbage collection.
+ kDisable,
+ // Inherit blob file garbage collection policy from ColumnFamilyOptions.
+ kUseDefault,
+};
+
// CompactRangeOptions is used by CompactRange() call.
struct CompactRangeOptions {
// If true, no other compaction will run at the same time as this
- // manual compaction
- bool exclusive_manual_compaction = true;
+ // manual compaction.
+ //
+ // Default: false
+ bool exclusive_manual_compaction = false;
+
// If true, compacted files will be moved to the minimum level capable
// of holding the data or given level (specified non-negative target_level).
bool change_level = false;
bool allow_write_stall = false;
// If > 0, it will replace the option in the DBOptions for this compaction.
uint32_t max_subcompactions = 0;
+ // Set the user-defined timestamp low bound; data with a timestamp older than
+ // the low bound may be GCed by compaction. Default: nullptr
+ const Slice* full_history_ts_low = nullptr;
+
+ // Allows cancellation of an in-progress manual compaction.
+ //
+ // Cancellation can be delayed waiting on automatic compactions when used
+ // together with `exclusive_manual_compaction == true`.
+ std::atomic<bool>* canceled = nullptr;
+ // NOTE: Calling DisableManualCompaction() overwrites the user-provided
+ // canceled variable in CompactRangeOptions.
+ // Typically, when CompactRange is being called in one thread (t1) with
+ // canceled = false, and DisableManualCompaction is being called in the
+ // other thread (t2), manual compaction is disabled normally, even if the
+ // compaction iterator may still scan a few items before *canceled is
+ // set to true.
+
+ // If set to kForce, RocksDB will override enable_blob_file_garbage_collection
+ // to true; if set to kDisable, RocksDB will override it to false, and
+ // kUseDefault leaves the setting in effect. This enables customers to both
+ // force-enable and force-disable GC when calling CompactRange.
+ BlobGarbageCollectionPolicy blob_garbage_collection_policy =
+ BlobGarbageCollectionPolicy::kUseDefault;
+
+ // If set to < 0 or > 1, RocksDB leaves blob_garbage_collection_age_cutoff
+ // from ColumnFamilyOptions in effect. Otherwise, it will override the
+ // user-provided setting. This enables customers to selectively override the
+ // age cutoff.
+ double blob_garbage_collection_age_cutoff = -1;
};
// IngestExternalFileOptions is used by IngestExternalFile()
bool allow_blocking_flush = true;
// Set to true if you would like duplicate keys in the file being ingested
// to be skipped rather than overwriting existing data under that key.
- // Usecase: back-fill of some historical data in the database without
+ // Use case: back-fill of some historical data in the database without
// over-writing existing newer version of data.
// This option could only be used if the DB has been running
// with allow_ingest_behind=true since the dawn of time.
// will be ignored; 2) If DB enable the checksum function, we calculate the
// sst file checksum after the file is moved or copied and compare the
// checksum and checksum name. If checksum or checksum function name does
- // not match, ingestion will be failed. If the verification is sucessful,
+ // not match, ingestion will be failed. If the verification is successful,
// checksum and checksum function name will be stored in Manifest.
// If this option is set to FALSE, 1) if DB does not enable checksum,
// the ingested checksum information will be ignored; 2) if DB enable the
// ingestion. However, if no checksum information is provided with the
// ingested files, DB will generate the checksum and store in the Manifest.
bool verify_file_checksum = true;
+ // Set to TRUE if the user wants the file to be ingested into the bottommost
+ // level. An error of Status::TryAgain() will be returned if a file cannot
+ // fit in the bottommost level when calling
+ // DB::IngestExternalFile()/DB::IngestExternalFiles(). The user should clear
+ // the bottommost level in the overlapping range before re-attempting.
+ //
+ // ingest_behind takes precedence over fail_if_not_bottommost_level.
+ bool fail_if_not_bottommost_level = false;
};
enum TraceFilterType : uint64_t {
// Do not trace the get operations
kTraceFilterGet = 0x1 << 0,
// Do not trace the write operations
- kTraceFilterWrite = 0x1 << 1
+ kTraceFilterWrite = 0x1 << 1,
+ // Do not trace the `Iterator::Seek()` operations
+ kTraceFilterIteratorSeek = 0x1 << 2,
+ // Do not trace the `Iterator::SeekForPrev()` operations
+ kTraceFilterIteratorSeekForPrev = 0x1 << 3,
+ // Do not trace the `MultiGet()` operations
+ kTraceFilterMultiGet = 0x1 << 4,
};
// TraceOptions is used for StartTrace
uint64_t sampling_frequency = 1;
// Note: The filtering happens before sampling.
uint64_t filter = kTraceFilterNone;
+ // When true, the order of write records in the trace will match the order of
+ // the corresponding write records in the WAL and as applied to the DB. There
+ // may be a performance penalty associated with preserving this ordering.
+ //
+ // Default: false. This means write records in the trace may be in an order
+ // different from the WAL's order.
+ bool preserve_write_order = false;
};
// ImportColumnFamilyOptions is used by ImportColumnFamily()
// Options used with DB::GetApproximateSizes()
struct SizeApproximationOptions {
// Defines whether the returned size should include the recently written
- // data in the mem-tables. If set to false, include_files must be true.
- bool include_memtabtles = false;
+ // data in the memtables. If set to false, include_files must be true.
+ bool include_memtables = false;
// Defines whether the returned size should include data serialized to disk.
- // If set to false, include_memtabtles must be true.
+ // If set to false, include_memtables must be true.
bool include_files = true;
// When approximating the files total size that is used to store a keys range
// using DB::GetApproximateSizes, allow approximation with an error margin of
double files_size_error_margin = -1.0;
};
+struct CompactionServiceOptionsOverride {
+ // Currently, pointer configurations are not passed to the compaction
+ // service, so the user needs to set them here. This will be removed once
+ // pointer configuration passing is supported.
+ Env* env = Env::Default();
+ std::shared_ptr<FileChecksumGenFactory> file_checksum_gen_factory = nullptr;
+
+ const Comparator* comparator = BytewiseComparator();
+ std::shared_ptr<MergeOperator> merge_operator = nullptr;
+ const CompactionFilter* compaction_filter = nullptr;
+ std::shared_ptr<CompactionFilterFactory> compaction_filter_factory = nullptr;
+ std::shared_ptr<const SliceTransform> prefix_extractor = nullptr;
+ std::shared_ptr<TableFactory> table_factory;
+ std::shared_ptr<SstPartitionerFactory> sst_partitioner_factory = nullptr;
+
+ // Only subsets of events are triggered in remote compaction worker, like:
+ // `OnTableFileCreated`, `OnTableFileCreationStarted`,
+ // `ShouldBeNotifiedOnFileIO`, `OnSubcompactionBegin`,
+ // `OnSubcompactionCompleted`, etc. Worth mentioning, `OnCompactionBegin` and
+ // `OnCompactionCompleted` won't be triggered. They will be triggered on the
+ // primary DB side.
+ std::vector<std::shared_ptr<EventListener>> listeners;
+
+ // statistics is used to collect DB operation metrics. The metrics won't be
+ // returned to the CompactionService primary host; to collect them, the user
+ // needs to set this here.
+ std::shared_ptr<Statistics> statistics = nullptr;
+
+ // Only compaction-generated SST files use this user-defined table properties
+ // collector.
+ std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+ table_properties_collector_factories;
+};
+
+struct OpenAndCompactOptions {
+ // Allows cancellation of an in-progress compaction.
+ std::atomic<bool>* canceled = nullptr;
+};
+
+#ifndef ROCKSDB_LITE
+struct LiveFilesStorageInfoOptions {
+ // Whether to populate FileStorageInfo::file_checksum* or leave blank
+ bool include_checksum_info = false;
+ // Flushes memtables if total size in bytes of live WAL files is >= this
+ // number (and DB is not read-only).
+ // Default: always force a flush without checking sizes.
+ uint64_t wal_size_for_flush = 0;
+};
+#endif // !ROCKSDB_LITE
+
} // namespace ROCKSDB_NAMESPACE