1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE file. See the AUTHORS file for names of contributors.
8
9 #pragma once
10
11 #include <stddef.h>
12 #include <stdint.h>
13
14 #include <limits>
15 #include <memory>
16 #include <string>
17 #include <unordered_map>
18 #include <vector>
19
20 #include "rocksdb/advanced_options.h"
21 #include "rocksdb/comparator.h"
22 #include "rocksdb/compression_type.h"
23 #include "rocksdb/env.h"
24 #include "rocksdb/file_checksum.h"
25 #include "rocksdb/listener.h"
26 #include "rocksdb/sst_partitioner.h"
27 #include "rocksdb/types.h"
28 #include "rocksdb/universal_compaction.h"
29 #include "rocksdb/version.h"
30 #include "rocksdb/write_buffer_manager.h"
31
32 #ifdef max
33 #undef max
34 #endif
35
36 namespace ROCKSDB_NAMESPACE {
37
38 class Cache;
39 class CompactionFilter;
40 class CompactionFilterFactory;
41 class Comparator;
42 class ConcurrentTaskLimiter;
43 class Env;
44 enum InfoLogLevel : unsigned char;
45 class SstFileManager;
46 class FilterPolicy;
47 class Logger;
48 class MergeOperator;
49 class Snapshot;
50 class MemTableRepFactory;
51 class RateLimiter;
52 class Slice;
53 class Statistics;
54 class InternalKeyComparator;
55 class WalFilter;
56 class FileSystem;
57
58 struct Options;
59 struct DbPath;
60
61 struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
62 // The function recovers options to a previous version. Only 4.6 or later
63 // versions are supported.
64 ColumnFamilyOptions* OldDefaults(int rocksdb_major_version = 4,
65 int rocksdb_minor_version = 6);
66
67 // Some functions that make it easier to optimize RocksDB
68 // Use this if your DB is very small (like under 1GB) and you don't want to
69 // spend lots of memory for memtables.
70 // An optional cache object is passed in to be used as the block cache
71 ColumnFamilyOptions* OptimizeForSmallDb(
72 std::shared_ptr<Cache>* cache = nullptr);
73
74 // Use this if you don't need to keep the data sorted, i.e. you'll never use
75 // an iterator, only Put() and Get() API calls
76 //
77 // Not supported in ROCKSDB_LITE
78 ColumnFamilyOptions* OptimizeForPointLookup(uint64_t block_cache_size_mb);
79
80 // Default values for some parameters in ColumnFamilyOptions are not
81 // optimized for heavy workloads and big datasets, which means you might
82 // observe write stalls under some conditions. As a starting point for tuning
83 // RocksDB options, use the following two functions:
84 // * OptimizeLevelStyleCompaction -- optimizes level style compaction
85 // * OptimizeUniversalStyleCompaction -- optimizes universal style compaction
86 // Universal style compaction is focused on reducing Write Amplification
87 // Factor for big data sets, but increases Space Amplification. You can learn
88 // more about the different styles here:
89 // https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide
90 // Make sure to also call IncreaseParallelism(), which will provide the
91 // biggest performance gains.
92 // Note: we might use more memory than memtable_memory_budget during high
93 // write rate period
94 //
95 // OptimizeUniversalStyleCompaction is not supported in ROCKSDB_LITE
96 ColumnFamilyOptions* OptimizeLevelStyleCompaction(
97 uint64_t memtable_memory_budget = 512 * 1024 * 1024);
98 ColumnFamilyOptions* OptimizeUniversalStyleCompaction(
99 uint64_t memtable_memory_budget = 512 * 1024 * 1024);
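  // Illustrative sketch (not part of the original header, placeholder path):
  // a typical tuning sequence combining the helpers above with
  // IncreaseParallelism() from DBOptions.
  //
  //   Options options;
  //   options.create_if_missing = true;
  //   options.IncreaseParallelism();           // size thread pools first
  //   options.OptimizeLevelStyleCompaction();  // default 512MB memtable budget
  //   DB* db = nullptr;
  //   Status s = DB::Open(options, "/tmp/example_db", &db);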
100
101 // -------------------
102 // Parameters that affect behavior
103
104 // Comparator used to define the order of keys in the table.
105 // Default: a comparator that uses lexicographic byte-wise ordering
106 //
107 // REQUIRES: The client must ensure that the comparator supplied
108 // here has the same name and orders keys *exactly* the same as the
109 // comparator provided to previous open calls on the same DB.
110 const Comparator* comparator = BytewiseComparator();
111
112 // REQUIRES: The client must provide a merge operator if Merge operation
113 // needs to be accessed. Calling Merge on a DB without a merge operator
114 // would result in Status::NotSupported. The client must ensure that the
115 // merge operator supplied here has the same name and *exactly* the same
116 // semantics as the merge operator provided to previous open calls on
117 // the same DB. The only exception is reserved for upgrade, where a DB
118 // previously without a merge operator is introduced to Merge operation
119 // for the first time. It's necessary to specify a merge operator when
120 // opening the DB in this case.
121 // Default: nullptr
122 std::shared_ptr<MergeOperator> merge_operator = nullptr;
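  // Illustrative sketch (not part of the original header): a minimal
  // associative merge operator that appends values with a comma; the class
  // name and separator are hypothetical.
  //
  //   class AppendOperator : public AssociativeMergeOperator {
  //    public:
  //     bool Merge(const Slice& /*key*/, const Slice* existing_value,
  //                const Slice& value, std::string* new_value,
  //                Logger* /*logger*/) const override {
  //       new_value->clear();
  //       if (existing_value != nullptr) {
  //         new_value->assign(existing_value->data(), existing_value->size());
  //         new_value->append(",");
  //       }
  //       new_value->append(value.data(), value.size());
  //       return true;
  //     }
  //     const char* Name() const override { return "AppendOperator"; }
  //   };
  //
  //   ColumnFamilyOptions cf_options;
  //   cf_options.merge_operator = std::make_shared<AppendOperator>();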
123
124 // A single CompactionFilter instance to call into during compaction.
125 // Allows an application to modify/delete a key-value during background
126 // compaction.
127 //
128 // If the client requires a new compaction filter to be used for different
129 // compaction runs, it can specify compaction_filter_factory instead of this
130 // option. The client should specify only one of the two.
131 // compaction_filter takes precedence over compaction_filter_factory if
132 // client specifies both.
133 //
134 // If multithreaded compaction is being used, the supplied CompactionFilter
135 // instance may be used from different threads concurrently and so should be
136 // thread-safe.
137 //
138 // Default: nullptr
139 const CompactionFilter* compaction_filter = nullptr;
140
141 // This is a factory that provides compaction filter objects which allow
142 // an application to modify/delete a key-value during background compaction.
143 //
144 // A new filter will be created on each compaction run. If multithreaded
145 // compaction is being used, each created CompactionFilter will only be used
146 // from a single thread and so does not need to be thread-safe.
147 //
148 // Default: nullptr
149 std::shared_ptr<CompactionFilterFactory> compaction_filter_factory = nullptr;
150
151 // -------------------
152 // Parameters that affect performance
153
154 // Amount of data to build up in memory (backed by an unsorted log
155 // on disk) before converting to a sorted on-disk file.
156 //
157 // Larger values increase performance, especially during bulk loads.
158 // Up to max_write_buffer_number write buffers may be held in memory
159 // at the same time,
160 // so you may wish to adjust this parameter to control memory usage.
161 // Also, a larger write buffer will result in a longer recovery time
162 // the next time the database is opened.
163 //
164 // Note that write_buffer_size is enforced per column family.
165 // See db_write_buffer_size for sharing memory across column families.
166 //
167 // Default: 64MB
168 //
169 // Dynamically changeable through SetOptions() API
170 size_t write_buffer_size = 64 << 20;
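  // Illustrative sketch (not part of the original header): because this option
  // is dynamically changeable, it can be adjusted on a live DB; the 128MB
  // value is only an example.
  //
  //   db->SetOptions({{"write_buffer_size", "134217728"}});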
171
172 // Compress blocks using the specified compression algorithm.
173 //
174 // Default: kSnappyCompression, if it's supported. If snappy is not linked
175 // with the library, the default is kNoCompression.
176 //
177 // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
178 // ~200-500MB/s compression
179 // ~400-800MB/s decompression
180 //
181 // Note that these speeds are significantly faster than most
182 // persistent storage speeds, and therefore it is typically never
183 // worth switching to kNoCompression. Even if the input data is
184 // incompressible, the kSnappyCompression implementation will
185 // efficiently detect that and will switch to uncompressed mode.
186 //
187 // If you do not set `compression_opts.level`, or set it to
188 // `CompressionOptions::kDefaultCompressionLevel`, we will attempt to pick the
189 // default corresponding to `compression` as follows:
190 //
191 // - kZSTD: 3
192 // - kZlibCompression: Z_DEFAULT_COMPRESSION (currently -1)
193 // - kLZ4HCCompression: 0
194 // - For all others, we do not specify a compression level
195 //
196 // Dynamically changeable through SetOptions() API
197 CompressionType compression;
198
199 // Compression algorithm that will be used for the bottommost level that
200 // contains files.
201 //
202 // Default: kDisableCompressionOption (Disabled)
203 CompressionType bottommost_compression = kDisableCompressionOption;
204
205 // different options for compression algorithms used by bottommost_compression
206 // if it is enabled. To enable it, please see the definition of
207 // CompressionOptions.
208 CompressionOptions bottommost_compression_opts;
209
210 // different options for compression algorithms
211 CompressionOptions compression_opts;
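  // Illustrative sketch (not part of the original header): a per-level
  // compression setup; the chosen algorithms are examples and must be
  // compiled into the library, and the `enabled` flag follows the
  // CompressionOptions definition.
  //
  //   ColumnFamilyOptions cf_options;
  //   cf_options.compression = kLZ4Compression;   // non-bottommost levels
  //   cf_options.bottommost_compression = kZSTD;  // bottommost level
  //   cf_options.bottommost_compression_opts.enabled = true;
  //   cf_options.bottommost_compression_opts.level = 3;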
212
213 // Number of files to trigger level-0 compaction. A value <0 means that
214 // level-0 compaction will not be triggered by number of files at all.
215 //
216 // Default: 4
217 //
218 // Dynamically changeable through SetOptions() API
219 int level0_file_num_compaction_trigger = 4;
220
221 // If non-nullptr, use the specified function to determine the
222 // prefixes for keys. These prefixes will be placed in the filter.
223 // Depending on the workload, this can reduce the read-IOP cost of
224 // scans when a prefix is passed via ReadOptions to
225 // db.NewIterator(). For prefix filtering to work properly,
226 // "prefix_extractor" and "comparator" must be such that the following
227 // properties hold:
228 //
229 // 1) key.starts_with(prefix(key))
230 // 2) Compare(prefix(key), key) <= 0.
231 // 3) If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
232 // 4) prefix(prefix(key)) == prefix(key)
233 //
234 // Default: nullptr
235 std::shared_ptr<const SliceTransform> prefix_extractor = nullptr;
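  // Illustrative sketch (not part of the original header): using the built-in
  // fixed-prefix transform from rocksdb/slice_transform.h; the prefix length
  // of 4 is arbitrary.
  //
  //   ColumnFamilyOptions cf_options;
  //   cf_options.prefix_extractor.reset(NewFixedPrefixTransform(4));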
236
237 // Control maximum total data size for a level.
238 // max_bytes_for_level_base is the max total for level-1.
239 // Maximum number of bytes for level L can be calculated as
240 // (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1))
241 // For example, if max_bytes_for_level_base is 200MB, and if
242 // max_bytes_for_level_multiplier is 10, total data size for level-1
243 // will be 200MB, total file size for level-2 will be 2GB,
244 // and total file size for level-3 will be 20GB.
245 //
246 // Default: 256MB.
247 //
248 // Dynamically changeable through SetOptions() API
249 uint64_t max_bytes_for_level_base = 256 * 1048576;
250
251 // Deprecated.
252 uint64_t snap_refresh_nanos = 0;
253
254 // Disable automatic compactions. Manual compactions can still
255 // be issued on this column family
256 //
257 // Dynamically changeable through SetOptions() API
258 bool disable_auto_compactions = false;
259
260 // This is a factory that provides TableFactory objects.
261 // Default: a block-based table factory that provides a default
262 // implementation of TableBuilder and TableReader with default
263 // BlockBasedTableOptions.
264 std::shared_ptr<TableFactory> table_factory;
265
266 // A list of paths where SST files for this column family
267 // can be put into, with its target size. Similar to db_paths,
268 // newer data is placed into paths specified earlier in the
269 // vector while older data gradually moves to paths specified
270 // later in the vector.
271 // Note that, if a path is supplied to multiple column
272 // families, it would have files and total size from all
273 // the column families combined. User should provision for the
274 // total size (from all the column families) in such cases.
275 //
276 // If left empty, db_paths will be used.
277 // Default: empty
278 std::vector<DbPath> cf_paths;
279
280 // Compaction concurrent thread limiter for the column family.
281 // If non-nullptr, use given concurrent thread limiter to control
282 // the max outstanding compaction tasks. Limiter can be shared with
283 // multiple column families across db instances.
284 //
285 // Default: nullptr
286 std::shared_ptr<ConcurrentTaskLimiter> compaction_thread_limiter = nullptr;
287
288 // If non-nullptr, use the specified factory for a function to determine the
289 // partitioning of sst files. This helps compaction to split the files
290 // on interesting boundaries (key prefixes) to make propagation of sst
291 // files less write amplifying (covering the whole key space).
292 // THE FEATURE IS STILL EXPERIMENTAL
293 //
294 // Default: nullptr
295 std::shared_ptr<SstPartitionerFactory> sst_partitioner_factory = nullptr;
296
297 // Create ColumnFamilyOptions with default values for all fields
298 ColumnFamilyOptions();
299 // Create ColumnFamilyOptions from Options
300 explicit ColumnFamilyOptions(const Options& options);
301
302 void Dump(Logger* log) const;
303 };
304
305 enum class WALRecoveryMode : char {
306 // Original levelDB recovery
307 //
308 // We tolerate the last record in any log to be incomplete due to a crash
309 // while writing it. Zeroed bytes from preallocation are also tolerated in the
310 // trailing data of any log.
311 //
312 // Use case: Applications for which updates, once applied, must not be rolled
313 // back even after a crash-recovery. In this recovery mode, RocksDB guarantees
314 // this as long as `WritableFile::Append()` writes are durable. In case the
315 // user needs the guarantee in more situations (e.g., when
316 // `WritableFile::Append()` writes to page cache, but the user desires this
317 // guarantee in face of power-loss crash-recovery), RocksDB offers various
318 // mechanisms to additionally invoke `WritableFile::Sync()` in order to
319 // strengthen the guarantee.
320 //
321 // This differs from `kPointInTimeRecovery` in that, in case a corruption is
322 // detected during recovery, this mode will refuse to open the DB. Whereas,
323 // `kPointInTimeRecovery` will stop recovery just before the corruption since
324 // that is a valid point-in-time to which to recover.
325 kTolerateCorruptedTailRecords = 0x00,
326 // Recover from clean shutdown
327 // We don't expect to find any corruption in the WAL
328 // Use case : This is ideal for unit tests and rare applications that
329 // can require high consistency guarantee
330 kAbsoluteConsistency = 0x01,
331 // Recover to point-in-time consistency (default)
332 // We stop the WAL playback on discovering WAL inconsistency
333 // Use case : Ideal for systems that have disk controller cache like
334 // hard disk, SSD without super capacitor that store related data
335 kPointInTimeRecovery = 0x02,
336 // Recovery after a disaster
337 // We ignore any corruption in the WAL and try to salvage as much data as
338 // possible
339 // Use case : Ideal for last ditch effort to recover data or systems that
340 // operate with low grade unrelated data
341 kSkipAnyCorruptedRecords = 0x03,
342 };
343
344 struct DbPath {
345 std::string path;
346   uint64_t target_size;  // Target size of total files under the path, in bytes.
347
348 DbPath() : target_size(0) {}
349 DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {}
350 };
351
352 extern const char* kHostnameForDbHostId;
353
354 struct DBOptions {
355 // The function recovers options to their default values as of the given version (4.6 by default).
356 DBOptions* OldDefaults(int rocksdb_major_version = 4,
357 int rocksdb_minor_version = 6);
358
359 // Some functions that make it easier to optimize RocksDB
360
361 // Use this if your DB is very small (like under 1GB) and you don't want to
362 // spend lots of memory for memtables.
363 // An optional cache object is passed in, to which the memory used by the
364 // memtable will be charged.
365 DBOptions* OptimizeForSmallDb(std::shared_ptr<Cache>* cache = nullptr);
366
367 #ifndef ROCKSDB_LITE
368 // By default, RocksDB uses only one background thread for flush and
369 // compaction. Calling this function will set it up such that total of
370 // `total_threads` is used. Good value for `total_threads` is the number of
371 // cores. You almost definitely want to call this function if your system is
372 // bottlenecked by RocksDB.
373 DBOptions* IncreaseParallelism(int total_threads = 16);
374 #endif // ROCKSDB_LITE
375
376 // If true, the database will be created if it is missing.
377 // Default: false
378 bool create_if_missing = false;
379
380 // If true, missing column families will be automatically created.
381 // Default: false
382 bool create_missing_column_families = false;
383
384 // If true, an error is raised if the database already exists.
385 // Default: false
386 bool error_if_exists = false;
387
388 // If true, RocksDB will aggressively check consistency of the data.
389 // Also, if any of the writes to the database fails (Put, Delete, Merge,
390 // Write), the database will switch to read-only mode and fail all other
391 // Write operations.
392 // In most cases you want this to be set to true.
393 // Default: true
394 bool paranoid_checks = true;
395
396 // If true, track WALs in MANIFEST and verify them on recovery.
397 //
398 // If a WAL is tracked in MANIFEST but is missing from disk on recovery,
399 // or the size of the tracked WAL is larger than the WAL's on-disk size,
400 // an error is reported and recovery is aborted.
401 //
402 // If a WAL is not tracked in MANIFEST, then no verification will happen
403 // during recovery.
404 //
405 // Default: false
406 // FIXME(cheng): This option is part of a work in progress and does not yet
407 // work
408 bool track_and_verify_wals_in_manifest = false;
409
410 // Use the specified object to interact with the environment,
411 // e.g. to read/write files, schedule background work, etc. In the near
412 // future, support for doing storage operations such as read/write files
413 // through env will be deprecated in favor of file_system (see below)
414 // Default: Env::Default()
415 Env* env = Env::Default();
416
417 // Use to control write rate of flush and compaction. Flush has higher
418 // priority than compaction. Rate limiting is disabled if nullptr.
419 // If rate limiter is enabled, bytes_per_sync is set to 1MB by default.
420 // Default: nullptr
421 std::shared_ptr<RateLimiter> rate_limiter = nullptr;
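  // Illustrative sketch (not part of the original header): limiting background
  // write I/O with the generic rate limiter from rocksdb/rate_limiter.h; the
  // 16MB/s rate is arbitrary.
  //
  //   DBOptions db_options;
  //   db_options.rate_limiter.reset(NewGenericRateLimiter(16 * 1024 * 1024));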
422
423 // Use to track SST files and control their file deletion rate.
424 //
425 // Features:
426 // - Throttle the deletion rate of the SST files.
427 //  - Keep track of the total size of all SST files.
428 //  - Set a maximum allowed space limit for SST files that, when reached,
429 //    the DB won't do any further flushes or compactions and will set the
430 // background error.
431 // - Can be shared between multiple dbs.
432 // Limitations:
433 // - Only track and throttle deletes of SST files in
434 // first db_path (db_name if db_paths is empty).
435 //
436 // Default: nullptr
437 std::shared_ptr<SstFileManager> sst_file_manager = nullptr;
438
439 // Any internal progress/error information generated by the db will
440 // be written to info_log if it is non-nullptr, or to a file stored
441 // in the same directory as the DB contents if info_log is nullptr.
442 // Default: nullptr
443 std::shared_ptr<Logger> info_log = nullptr;
444
445 #ifdef NDEBUG
446 InfoLogLevel info_log_level = INFO_LEVEL;
447 #else
448 InfoLogLevel info_log_level = DEBUG_LEVEL;
449 #endif // NDEBUG
450
451 // Number of open files that can be used by the DB. You may need to
452 // increase this if your database has a large working set. Value -1 means
453 // files opened are always kept open. You can estimate number of files based
454 // on target_file_size_base and target_file_size_multiplier for level-based
455 // compaction. For universal-style compaction, you can usually set it to -1.
456 //
457 // Default: -1
458 //
459 // Dynamically changeable through SetDBOptions() API.
460 int max_open_files = -1;
461
462 // If max_open_files is -1, DB will open all files on DB::Open(). You can
463 // use this option to increase the number of threads used to open the files.
464 // Default: 16
465 int max_file_opening_threads = 16;
466
467 // Once write-ahead logs exceed this size, we will start forcing the flush of
468 // column families whose memtables are backed by the oldest live WAL file
469 // (i.e. the ones that are causing all the space amplification). If set to 0
470 // (default), we will dynamically choose the WAL size limit to be
471 // [sum of all write_buffer_size * max_write_buffer_number] * 4.
472 // This option takes effect only when there is more than one column family,
473 // as otherwise the WAL size is dictated by the write_buffer_size.
474 //
475 // Default: 0
476 //
477 // Dynamically changeable through SetDBOptions() API.
478 uint64_t max_total_wal_size = 0;
479
480 // If non-null, then we should collect metrics about database operations
481 std::shared_ptr<Statistics> statistics = nullptr;
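  // Illustrative sketch (not part of the original header): enabling statistics
  // via the factory in rocksdb/statistics.h and dumping them as a string.
  //
  //   DBOptions db_options;
  //   db_options.statistics = CreateDBStatistics();
  //   // ... run the workload ...
  //   std::string report = db_options.statistics->ToString();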
482
483 // By default, writes to stable storage use fdatasync (on platforms
484 // where this function is available). If this option is true,
485 // fsync is used instead.
486 //
487 // fsync and fdatasync are equally safe for our purposes and fdatasync is
488 // faster, so it is rarely necessary to set this option. It is provided
489 // as a workaround for kernel/filesystem bugs, such as one that affected
490 // fdatasync with ext4 in kernel versions prior to 3.7.
491 bool use_fsync = false;
492
493 // A list of paths where SST files can be put into, with its target size.
494 // Newer data is placed into paths specified earlier in the vector while
495 // older data gradually moves to paths specified later in the vector.
496 //
497 // For example, if you have a flash device with 10GB allocated for the DB,
498 // as well as a hard drive of 2TB, you should configure it as:
499 // [{"/flash_path", 10GB}, {"/hard_drive", 2TB}]
500 //
501 // The system will try to guarantee data under each path is close to but
502 // not larger than the target size. But current and future file sizes used
503 // when determining where to place a file are based on best-effort estimation,
504 // which means there is a chance that the actual size under the directory
505 // is slightly more than target size under some workloads. User should give
506 // some buffer room for those cases.
507 //
508 // If none of the paths has sufficient room to place a file, the file will
509 // be placed in the last path anyway, regardless of the target size.
510 //
511 // Placing newer data in earlier paths is also best-effort. Users should
512 // expect user files to be placed in higher levels in some extreme cases.
513 //
514 // If left empty, only one path will be used, which is db_name passed when
515 // opening the DB.
516 // Default: empty
517 std::vector<DbPath> db_paths;
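  // Illustrative sketch (not part of the original header): the flash + hard
  // drive layout described above, with placeholder mount points.
  //
  //   DBOptions db_options;
  //   db_options.db_paths.emplace_back("/flash_path", 10ULL << 30);  // 10GB
  //   db_options.db_paths.emplace_back("/hard_drive", 2ULL << 40);   // 2TB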
518
519 // This specifies the info LOG dir.
520 // If it is empty, the log files will be in the same dir as data.
521 // If it is non empty, the log files will be in the specified dir,
522 // and the db data dir's absolute path will be used as the log file
523 // name's prefix.
524 std::string db_log_dir = "";
525
526 // This specifies the absolute dir path for write-ahead logs (WAL).
527 // If it is empty, the log files will be in the same dir as data
528 // (dbname is used as the data dir by default).
529 // If it is non empty, the log files will be kept in the specified dir.
530 // When destroying the db,
531 // all log files in wal_dir and the dir itself are deleted.
532 std::string wal_dir = "";
533
534 // The periodicity when obsolete files get deleted. The default
535 // value is 6 hours. The files that go out of scope during the compaction
536 // process will still get automatically deleted on every compaction,
537 // regardless of this setting.
538 //
539 // Default: 6 hours
540 //
541 // Dynamically changeable through SetDBOptions() API.
542 uint64_t delete_obsolete_files_period_micros = 6ULL * 60 * 60 * 1000000;
543
544 // Maximum number of concurrent background jobs (compactions and flushes).
545 //
546 // Default: 2
547 //
548 // Dynamically changeable through SetDBOptions() API.
549 int max_background_jobs = 2;
550
551 // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
552 // value of max_background_jobs. This option is ignored.
553 //
554 // Dynamically changeable through SetDBOptions() API.
555 int base_background_compactions = -1;
556
557 // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
558 // value of max_background_jobs. For backwards compatibility we will set
559 // `max_background_jobs = max_background_compactions + max_background_flushes`
560 // in the case where user sets at least one of `max_background_compactions` or
561 // `max_background_flushes` (we replace -1 by 1 in case one option is unset).
562 //
563 // Maximum number of concurrent background compaction jobs, submitted to
564 // the default LOW priority thread pool.
565 //
566 // If you're increasing this, also consider increasing number of threads in
567 // LOW priority thread pool. For more information, see
568 // Env::SetBackgroundThreads
569 //
570 // Default: -1
571 //
572 // Dynamically changeable through SetDBOptions() API.
573 int max_background_compactions = -1;
574
575 // This value represents the maximum number of threads that will
576 // concurrently perform a compaction job by breaking it into multiple,
577 // smaller ones that are run simultaneously.
578 // Default: 1 (i.e. no subcompactions)
579 //
580 // Dynamically changeable through SetDBOptions() API.
581 uint32_t max_subcompactions = 1;
582
583 // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
584 // value of max_background_jobs. For backwards compatibility we will set
585 // `max_background_jobs = max_background_compactions + max_background_flushes`
586 // in the case where user sets at least one of `max_background_compactions` or
587 // `max_background_flushes`.
588 //
589 // Maximum number of concurrent background memtable flush jobs, submitted by
590 // default to the HIGH priority thread pool. If the HIGH priority thread pool
591 // is configured to have zero threads, flush jobs will share the LOW priority
592 // thread pool with compaction jobs.
593 //
594 // It is important to use both thread pools when the same Env is shared by
595 // multiple db instances. Without a separate pool, long running compaction
596 // jobs could potentially block memtable flush jobs of other db instances,
597 // leading to unnecessary Put stalls.
598 //
599 // If you're increasing this, also consider increasing number of threads in
600 // HIGH priority thread pool. For more information, see
601 // Env::SetBackgroundThreads
602 // Default: -1
603 int max_background_flushes = -1;
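  // Illustrative sketch (not part of the original header): sizing the shared
  // Env thread pools to match the background job limits; the counts are
  // arbitrary examples.
  //
  //   DBOptions db_options;
  //   db_options.max_background_compactions = 4;
  //   db_options.max_background_flushes = 2;
  //   db_options.env->SetBackgroundThreads(4, Env::Priority::LOW);   // compactions
  //   db_options.env->SetBackgroundThreads(2, Env::Priority::HIGH);  // flushes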
604
605 // Specify the maximal size of the info log file. If the log file
606 // is larger than `max_log_file_size`, a new info log file will
607 // be created.
608 // If max_log_file_size == 0, all logs will be written to one
609 // log file.
610 size_t max_log_file_size = 0;
611
612 // Time for the info log file to roll (in seconds).
613 // If specified with non-zero value, log file will be rolled
614 // if it has been active longer than `log_file_time_to_roll`.
615 // Default: 0 (disabled)
616 // Not supported in ROCKSDB_LITE mode!
617 size_t log_file_time_to_roll = 0;
618
619 // Maximal info log files to be kept.
620 // Default: 1000
621 size_t keep_log_file_num = 1000;
622
623 // Recycle log files.
624 // If non-zero, we will reuse previously written log files for new
625 // logs, overwriting the old data. The value indicates how many
626 // such files we will keep around at any point in time for later
627 // use. This is more efficient because the blocks are already
628 // allocated and fdatasync does not need to update the inode after
629 // each write.
630 // Default: 0
631 size_t recycle_log_file_num = 0;
632
633 // The manifest file is rolled over upon reaching this limit.
634 // The older manifest file will be deleted.
635 // The default value is 1GB so that the manifest file can grow, but not
636 // reach the limit of storage capacity.
637 uint64_t max_manifest_file_size = 1024 * 1024 * 1024;
638
639 // Number of shards used for table cache.
640 int table_cache_numshardbits = 6;
641
642 // NOT SUPPORTED ANYMORE
643 // int table_cache_remove_scan_count_limit;
644
645 // The following two fields affect how archived logs will be deleted.
646 // 1. If both set to 0, logs will be deleted asap and will not get into
647 // the archive.
648 // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
649 // WAL files will be checked every 10 min and if total size is greater
650 //    than WAL_size_limit_MB, they will be deleted starting with the
651 // earliest until size_limit is met. All empty files will be deleted.
652 // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
653 // WAL files will be checked every WAL_ttl_seconds / 2 and those that
654 // are older than WAL_ttl_seconds will be deleted.
655 // 4. If both are not 0, WAL files will be checked every 10 min and both
656 // checks will be performed with ttl being first.
657 uint64_t WAL_ttl_seconds = 0;
658 uint64_t WAL_size_limit_MB = 0;
659
660 // Number of bytes to preallocate (via fallocate) the manifest
661 // files. Default is 4mb, which is reasonable to reduce random IO
662 // as well as prevent overallocation for mounts that preallocate
663 // large amounts of data (such as xfs's allocsize option).
664 size_t manifest_preallocation_size = 4 * 1024 * 1024;
665
666 // Allow the OS to mmap file for reading sst tables. Default: false
667 bool allow_mmap_reads = false;
668
669 // Allow the OS to mmap file for writing.
670 // DB::SyncWAL() only works if this is set to false.
671 // Default: false
672 bool allow_mmap_writes = false;
673
674 // Enable direct I/O mode for reads/writes.
675 // This may or may not improve performance depending on the use case.
676 //
677 // Files will be opened in "direct I/O" mode
678 // which means that data r/w from the disk will not be cached or
679 // buffered. The hardware buffer of the devices may however still
680 // be used. Memory mapped files are not impacted by these parameters.
681
682 // Use O_DIRECT for user and compaction reads.
683 // When true, we also force new_table_reader_for_compaction_inputs to true.
684 // Default: false
685 // Not supported in ROCKSDB_LITE mode!
686 bool use_direct_reads = false;
687
688 // Use O_DIRECT for writes in background flush and compactions.
689 // Default: false
690 // Not supported in ROCKSDB_LITE mode!
691 bool use_direct_io_for_flush_and_compaction = false;
692
693 // If false, fallocate() calls are bypassed
694 bool allow_fallocate = true;
695
696 // Disable child process inherit open files. Default: true
697 bool is_fd_close_on_exec = true;
698
699 // NOT SUPPORTED ANYMORE -- this options is no longer used
700 bool skip_log_error_on_recovery = false;
701
702 // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
703 //
704 // Default: 600 (10 min)
705 //
706 // Dynamically changeable through SetDBOptions() API.
707 unsigned int stats_dump_period_sec = 600;
708
709 // if not zero, dump rocksdb.stats to RocksDB every stats_persist_period_sec
710 // Default: 600
711 unsigned int stats_persist_period_sec = 600;
712
713 // If true, automatically persist stats to a hidden column family (column
714 // family name: ___rocksdb_stats_history___) every
715 // stats_persist_period_sec seconds; otherwise, write to an in-memory
716 // struct. User can query through `GetStatsHistory` API.
717 // If user attempts to create a column family with the same name on a DB
718 // which has previously set persist_stats_to_disk to true, the column family
719 // creation will fail, but the hidden column family will survive, as well as
720 // the previously persisted statistics.
721 // When persisting stats to disk, the stat name will be limited to 100 bytes.
722 // Default: false
723 bool persist_stats_to_disk = false;
724
725 // if not zero, periodically take stats snapshots and store in memory, the
726 // memory size for stats snapshots is capped at stats_history_buffer_size
727 // Default: 1MB
728 size_t stats_history_buffer_size = 1024 * 1024;
729
730 // If set to true, RocksDB will hint the underlying file system that the file
731 // access pattern is random when an SST file is opened.
732 // Default: true
733 bool advise_random_on_open = true;
734
735 // Amount of data to build up in memtables across all column
736 // families before writing to disk.
737 //
738 // This is distinct from write_buffer_size, which enforces a limit
739 // for a single memtable.
740 //
741 // This feature is disabled by default. Specify a non-zero value
742 // to enable it.
743 //
744 // Default: 0 (disabled)
745 size_t db_write_buffer_size = 0;
746
747 // The memory usage of memtables will be reported to this object. The same
748 // object can be passed into multiple DBs and it will track the sum of sizes of all
749 // the DBs. If the total size of all live memtables of all the DBs exceeds
750 // a limit, a flush will be triggered in the next DB to which the next write
751 // is issued.
752 //
753 // If the object is only passed to one DB, the behavior is the same as
754 // db_write_buffer_size. When write_buffer_manager is set, the value set will
755 // override db_write_buffer_size.
756 //
757 // This feature is disabled by default. Specify a non-zero value
758 // to enable it.
759 //
760 // Default: null
761 std::shared_ptr<WriteBufferManager> write_buffer_manager = nullptr;
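  // Illustrative sketch (not part of the original header): sharing a single
  // 512MB memtable budget across two DB instances; the size is arbitrary.
  //
  //   auto wbm = std::make_shared<WriteBufferManager>(512 << 20);
  //   DBOptions db_options_1, db_options_2;
  //   db_options_1.write_buffer_manager = wbm;
  //   db_options_2.write_buffer_manager = wbm;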
762
763 // Specify the file access pattern once a compaction is started.
764 // It will be applied to all input files of a compaction.
765 // Default: NORMAL
766 enum AccessHint { NONE, NORMAL, SEQUENTIAL, WILLNEED };
767 AccessHint access_hint_on_compaction_start = NORMAL;
768
769 // If true, always create a new file descriptor and new table reader
770 // for compaction inputs. Turning this parameter on may introduce extra
771 // memory usage in the table reader, if it allocates extra memory
772 // for indexes. This will allow file descriptor prefetch options
773 // to be set for compaction input files and not to impact file
774 // descriptors for the same file used by user queries.
775 // We suggest enabling BlockBasedTableOptions.cache_index_and_filter_blocks
776 // for this mode if using block-based table.
777 //
778 // Default: false
779 // This flag has no effect on the behavior of compaction and is planned for
780 // removal in the future.
781 bool new_table_reader_for_compaction_inputs = false;
782
783 // If non-zero, we perform bigger reads when doing compaction. If you're
784 // running RocksDB on spinning disks, you should set this to at least 2MB.
785 // That way RocksDB's compaction is doing sequential instead of random reads.
786 //
787 // When non-zero, we also force new_table_reader_for_compaction_inputs to
788 // true.
789 //
790 // Default: 0
791 //
792 // Dynamically changeable through SetDBOptions() API.
793 size_t compaction_readahead_size = 0;
794
795 // This is a maximum buffer size that is used by WinMmapReadableFile in
796 // unbuffered disk I/O mode. We need to maintain an aligned buffer for
797 // reads. We allow the buffer to grow until the specified value and then
798 // for bigger requests allocate one shot buffers. In unbuffered mode we
799 // always bypass read-ahead buffer at ReadaheadRandomAccessFile
800 // When read-ahead is required we then make use of compaction_readahead_size
801 // value and always try to read ahead. With read-ahead we always
802 // pre-allocate buffer to the size instead of growing it up to a limit.
803 //
804 // This option is currently honored only on Windows
805 //
806 // Default: 1 Mb
807 //
808 // Special value: 0 - means do not maintain per instance buffer. Allocate
809 // per request buffer and avoid locking.
810 size_t random_access_max_buffer_size = 1024 * 1024;
811
812 // This is the maximum buffer size that is used by WritableFileWriter.
813 // On Windows, we need to maintain an aligned buffer for writes.
814 // We allow the buffer to grow until its size hits the limit in buffered
815 // IO and fix the buffer size when using direct IO to ensure alignment of
816 // write requests if the logical sector size is unusual.
817 //
818 // Default: 1024 * 1024 (1 MB)
819 //
820 // Dynamically changeable through SetDBOptions() API.
821 size_t writable_file_max_buffer_size = 1024 * 1024;
822
823 // Use adaptive mutex, which spins in the user space before resorting
824 // to kernel. This could reduce context switch when the mutex is not
825 // heavily contended. However, if the mutex is hot, we could end up
826 // wasting spin time.
827 // Default: false
828 bool use_adaptive_mutex = false;
829
830 // Create DBOptions with default values for all fields
831 DBOptions();
832 // Create DBOptions from Options
833 explicit DBOptions(const Options& options);
834
835 void Dump(Logger* log) const;
836
837 // Allows OS to incrementally sync files to disk while they are being
838 // written, asynchronously, in the background. This operation can be used
839 // to smooth out write I/Os over time. Users shouldn't rely on it for
840 // persistence guarantees.
841 // Issue one request for every bytes_per_sync written. 0 turns it off.
842 //
843 // You may consider using rate_limiter to regulate write rate to device.
844 // When the rate limiter is enabled, it automatically sets bytes_per_sync
845 // to 1MB.
846 //
847 // This option applies to table files
848 //
849 // Default: 0, turned off
850 //
851 // Note: DOES NOT apply to WAL files. See wal_bytes_per_sync instead
852 // Dynamically changeable through SetDBOptions() API.
853 uint64_t bytes_per_sync = 0;
854
855 // Same as bytes_per_sync, but applies to WAL files
856 //
857 // Default: 0, turned off
858 //
859 // Dynamically changeable through SetDBOptions() API.
860 uint64_t wal_bytes_per_sync = 0;
861
862 // When true, guarantees WAL files have at most `wal_bytes_per_sync`
863 // bytes submitted for writeback at any given time, and SST files have at most
864 // `bytes_per_sync` bytes pending writeback at any given time. This can be
865 // used to handle cases where processing speed exceeds I/O speed during file
866 // generation, which can lead to a huge sync when the file is finished, even
867 // with `bytes_per_sync` / `wal_bytes_per_sync` properly configured.
868 //
869 // - If `sync_file_range` is supported it achieves this by waiting for any
870 // prior `sync_file_range`s to finish before proceeding. In this way,
871 // processing (compression, etc.) can proceed uninhibited in the gap
872 // between `sync_file_range`s, and we block only when I/O falls behind.
873 // - Otherwise the `WritableFile::Sync` method is used. Note this mechanism
874 // always blocks, thus preventing the interleaving of I/O and processing.
875 //
876 // Note: Enabling this option does not provide any additional persistence
877 // guarantees, as it may use `sync_file_range`, which does not write out
878 // metadata.
879 //
880 // Default: false
881 bool strict_bytes_per_sync = false;
882
883 // A vector of EventListeners whose callback functions will be called
884 // when specific RocksDB event happens.
885 std::vector<std::shared_ptr<EventListener>> listeners;
886
887 // If true, then the status of the threads involved in this DB will
888 // be tracked and available via GetThreadList() API.
889 //
890 // Default: false
891 bool enable_thread_tracking = false;
892
893 // The limited write rate to DB if soft_pending_compaction_bytes_limit or
894 // level0_slowdown_writes_trigger is triggered, or we are writing to the
895 // last mem table allowed and we allow more than 3 mem tables. It is
896 // calculated using size of user write requests before compression.
897 // RocksDB may decide to slow down more if the compaction still
898 // gets behind further.
899 // If the value is 0, we will infer a value from the `rate_limiter` value
900 // if it is not empty, or 16MB if `rate_limiter` is empty. Note that
901 // if users change the rate in `rate_limiter` after DB is opened,
902 // `delayed_write_rate` won't be adjusted.
903 //
904 // Unit: byte per second.
905 //
906 // Default: 0
907 //
908 // Dynamically changeable through SetDBOptions() API.
909 uint64_t delayed_write_rate = 0;
910
911 // By default, a single write thread queue is maintained. The thread that gets
912 // to the head of the queue becomes the write batch group leader, responsible
913 // for writing to the WAL and memtable for the batch group.
914 //
915 // If enable_pipelined_write is true, separate write thread queues are
916 // maintained for WAL writes and memtable writes. A write thread first enters
917 // the WAL writer queue and then the memtable writer queue. A pending thread on
918 // the WAL writer queue thus only has to wait for previous writers to finish
919 // their WAL writing but not the memtable writing. Enabling the feature may improve
920 // write throughput and reduce latency of the prepare phase of two-phase
921 // commit.
922 //
923 // Default: false
924 bool enable_pipelined_write = false;
925
926 // Setting unordered_write to true trades higher write throughput with
927 // relaxing the immutability guarantee of snapshots. This violates the
928 // repeatability one expects from ::Get from a snapshot, as well as
929 // ::MultiGet and Iterator's consistent-point-in-time view property.
930 // If the application cannot tolerate the relaxed guarantees, it can implement
931 // its own mechanisms to work around that and yet benefit from the higher
932 // throughput. Using TransactionDB with WRITE_PREPARED write policy and
933 // two_write_queues=true is one way to achieve immutable snapshots despite
934 // unordered_write.
935 //
936 // By default, i.e., when it is false, rocksdb does not advance the sequence
937 // number for new snapshots unless all the writes with lower sequence numbers
938 // are already finished. This provides the immutability that we expect from
939 // snapshots. Moreover, since Iterator and MultiGet internally depend on
940 // snapshots, the snapshot immutability results in Iterator and MultiGet
941 // offering a consistent-point-in-time view. If set to true, although the
942 // Read-Your-Own-Write property is still provided, the snapshot immutability
943 // property is relaxed: the writes issued after the snapshot is obtained (with
944 // larger sequence numbers) will still not be visible to reads from that
945 // snapshot; however, there still might be pending writes (with lower sequence
946 // numbers) that will change the state visible to the snapshot after they
947 // land in the memtable.
948 //
949 // Default: false
950 bool unordered_write = false;
951
952 // If true, allow multi-writers to update mem tables in parallel.
953 // Only some memtable_factory-s support concurrent writes; currently it
954 // is implemented only for SkipListFactory. Concurrent memtable writes
955 // are not compatible with inplace_update_support or filter_deletes.
956 // It is strongly recommended to set enable_write_thread_adaptive_yield
957 // if you are going to use this feature.
958 //
959 // Default: true
960 bool allow_concurrent_memtable_write = true;
961
962 // If true, threads synchronizing with the write batch group leader will
963 // wait for up to write_thread_max_yield_usec before blocking on a mutex.
964 // This can substantially improve throughput for concurrent workloads,
965 // regardless of whether allow_concurrent_memtable_write is enabled.
966 //
967 // Default: true
968 bool enable_write_thread_adaptive_yield = true;
969
970 // The maximum limit of the number of bytes that are written in a single batch
971 // of WAL or memtable writes. It is applied when the leader's write size
972 // is larger than 1/8 of this limit.
973 //
974 // Default: 1 MB
975 uint64_t max_write_batch_group_size_bytes = 1 << 20;
976
977 // The maximum number of microseconds that a write operation will use
978 // a yielding spin loop to coordinate with other write threads before
979 // blocking on a mutex. (Assuming write_thread_slow_yield_usec is
980 // set properly) increasing this value is likely to increase RocksDB
981 // throughput at the expense of increased CPU usage.
982 //
983 // Default: 100
984 uint64_t write_thread_max_yield_usec = 100;
985
986 // The latency in microseconds after which a std::this_thread::yield
987 // call (sched_yield on Linux) is considered to be a signal that
988 // other processes or threads would like to use the current core.
989 // Increasing this makes writer threads more likely to take CPU
990 // by spinning, which will show up as an increase in the number of
991 // involuntary context switches.
992 //
993 // Default: 3
994 uint64_t write_thread_slow_yield_usec = 3;
995
996 // If true, then DB::Open() will not update the statistics used to optimize
997 // compaction decisions by loading table properties from many files.
998 // Skipping this update (i.e. setting this option to true) will improve
999 // DB::Open time, especially in a disk environment.
1000 //
1001 // Default: false
1002 bool skip_stats_update_on_db_open = false;
1003
1004 // If true, then DB::Open() will not fetch and check sizes of all sst files.
1005 // This may significantly speed up startup if there are many sst files,
1006 // especially when using non-default Env with expensive GetFileSize().
1007 // We'll still check that all required sst files exist.
1008 // If paranoid_checks is false, this option is ignored, and sst files are
1009 // not checked at all.
1010 //
1011 // Default: false
1012 bool skip_checking_sst_file_sizes_on_db_open = false;
1013
1014 // Recovery mode to control the consistency while replaying WAL
1015 // Default: kPointInTimeRecovery
1016 WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
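  // Illustrative sketch (not part of the original header): selecting one of
  // the modes documented in WALRecoveryMode above.
  //
  //   DBOptions db_options;
  //   db_options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;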
1017
1018 // if set to false then recovery will fail when a prepared
1019 // transaction is encountered in the WAL
1020 bool allow_2pc = false;
1021
1022 // A global cache for table-level rows.
1023 // Default: nullptr (disabled)
1024 // Not supported in ROCKSDB_LITE mode!
1025 std::shared_ptr<Cache> row_cache = nullptr;
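  // Illustrative sketch (not part of the original header): backing the row
  // cache with the LRU cache from rocksdb/cache.h; the 64MB capacity is
  // arbitrary.
  //
  //   DBOptions db_options;
  //   db_options.row_cache = NewLRUCache(64 << 20);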
1026
1027 #ifndef ROCKSDB_LITE
1028 // A filter object supplied to be invoked while processing write-ahead-logs
1029 // (WALs) during recovery. The filter provides a way to inspect log
1030 // records, ignoring a particular record or skipping replay.
1031 // The filter is invoked at startup and is currently invoked from a
1032 // single thread.
1033 WalFilter* wal_filter = nullptr;
1034 #endif // ROCKSDB_LITE
1035
1036 // If true, then DB::Open / CreateColumnFamily / DropColumnFamily
1037 // / SetOptions will fail if the options file is not detected or not
1038 // properly persisted.
1039 //
1040 // DEFAULT: false
1041 bool fail_if_options_file_error = false;
1042
1043 // If true, then print malloc stats together with rocksdb.stats
1044 // when printing to LOG.
1045 // DEFAULT: false
1046 bool dump_malloc_stats = false;
1047
1048 // By default RocksDB replays WAL logs and flushes them on DB open, which may
1049 // create very small SST files. If this option is enabled, RocksDB will try
1050 // to avoid (but not guarantee not to) flushing during recovery. Also, existing
1051 // WAL logs will be kept, so that if a crash happened before the flush, we still
1052 // have logs to recover from.
1053 //
1054 // DEFAULT: false
1055 bool avoid_flush_during_recovery = false;
1056
1057 // By default RocksDB will flush all memtables on DB close if there is
1058 // unpersisted data (i.e. with WAL disabled). The flush can be skipped to
1059 // speed up DB close. Unpersisted data WILL BE LOST.
1060 //
1061 // DEFAULT: false
1062 //
1063 // Dynamically changeable through SetDBOptions() API.
1064 bool avoid_flush_during_shutdown = false;
1065
1066 // Set this option to true during creation of database if you want
1067 // to be able to ingest behind (call IngestExternalFile() skipping keys
1068 // that already exist, rather than overwriting matching keys).
1069 // Setting this option to true will affect the following:
1070 // 1) Disable some internal optimizations around SST file compression
1071 // 2) Reserve bottom-most level for ingested files only.
1072 // 3) Note that num_levels should be >= 3 if this option is turned on.
1073 //
1074 // DEFAULT: false
1075 // Immutable.
1076 bool allow_ingest_behind = false;
1077
1078 // Needed to support differential snapshots.
1079 // If set to true then DB will only process deletes with sequence number
1080 // less than what was set by SetPreserveDeletesSequenceNumber(uint64_t ts).
1081 // Clients are responsible for periodically calling this method to advance
1082 // the cutoff time. If this method is never called and preserve_deletes
1083 // is set to true, NO deletes will ever be processed.
1084 // At the moment this only keeps normal deletes, SingleDeletes will
1085 // not be preserved.
1086 // DEFAULT: false
1087 // Immutable (TODO: make it dynamically changeable)
1088 bool preserve_deletes = false;
1089
1090 // If enabled it uses two queues for writes, one for the ones with
1091 // disable_memtable and one for the ones that also write to memtable. This
1092 // allows the memtable writes not to lag behind other writes. It can be used
1093 // to optimize MySQL 2PC in which only the commits, which are serial, write to
1094 // memtable.
1095 bool two_write_queues = false;
1096
1097 // If true WAL is not flushed automatically after each write. Instead it
1098 // relies on manual invocation of FlushWAL to write the WAL buffer to its
1099 // file.
1100 bool manual_wal_flush = false;
1101
1102 // If true, RocksDB supports flushing multiple column families and committing
1103 // their results atomically to MANIFEST. Note that it is not
1104 // necessary to set atomic_flush to true if WAL is always enabled since WAL
1105 // allows the database to be restored to the last persistent state in WAL.
1106 // This option is useful when there are column families with writes NOT
1107 // protected by WAL.
1108 // For manual flush, application has to specify which column families to
1109 // flush atomically in DB::Flush.
1110 // For auto-triggered flush, RocksDB atomically flushes ALL column families.
1111 //
1112 // Currently, any WAL-enabled writes after atomic flush may be replayed
1113 // independently if the process crashes later and tries to recover.
1114 bool atomic_flush = false;
1115
1116 // If true, working threads may avoid doing unnecessary and long-latency
1117 // operations (such as deleting obsolete files directly or deleting memtables)
1118 // and will instead schedule a background job to do it.
1119 // Use it if you're latency-sensitive.
1120 // If set to true, takes precedence over
1121 // ReadOptions::background_purge_on_iterator_cleanup.
1122 bool avoid_unnecessary_blocking_io = false;
1123
1124 // Historically DB ID has always been stored in Identity File in DB folder.
1125 // If this flag is true, the DB ID is written to Manifest file in addition
1126 // to the Identity file. By doing this, two problems are solved:
1127 // 1. We don't checksum the Identity file, whereas the Manifest file is.
1128 // 2. Since the source of truth for the DB is the Manifest file, the DB ID
1129 //    will sit with the source of truth. Previously the Identity file could be
1130 //    copied independently of the Manifest, and that can result in a wrong DB ID.
1131 // We recommend setting this flag to true.
1132 // Default: false
1133 bool write_dbid_to_manifest = false;
1134
1135 // The number of bytes to prefetch when reading the log. This is mostly useful
1136 // for reading a remotely located log, as it can reduce the number of
1137 // round-trips. If 0, then the prefetching is disabled.
1138 //
1139 // Default: 0
1140 size_t log_readahead_size = 0;
1141
1142 // If user does NOT provide the checksum generator factory, the file checksum
1143 // will NOT be used. A new file checksum generator object will be created
1144 // when an SST file is created. Therefore, each created FileChecksumGenerator
1145 // will only be used from a single thread and so does not need to be
1146 // thread-safe.
1147 //
1148 // Default: nullptr
1149 std::shared_ptr<FileChecksumGenFactory> file_checksum_gen_factory = nullptr;
1150
1151 // By default, RocksDB recovery fails if any table file referenced in
1152 // the MANIFEST is missing after scanning the MANIFEST.
1153 // Best-efforts recovery is another recovery mode that
1154 // tries to restore the database to the most recent point in time without
1155 // missing files.
1156 // Currently not compatible with atomic flush. Furthermore, WAL files will
1157 // not be used for recovery if best_efforts_recovery is true.
1158 // Default: false
1159 bool best_efforts_recovery = false;
1160
1161 // It defines how many times db resume is called by a separate thread when
1162 // background retryable IO Error happens. When background retryable IO
1163 // Error happens, SetBGError is called to deal with the error. If the error
1164 // can be auto-recovered (e.g., retryable IO Error during Flush or WAL write),
1165 // then db resume is called in background to recover from the error. If this
1166 // value is 0 or negative, db resume will not be called.
1167 //
1168 // Default: INT_MAX
1169 int max_bgerror_resume_count = INT_MAX;
1170
1171 // If max_bgerror_resume_count is >= 2, db resume is called multiple times.
1172 // This option decides how long to wait to retry the next resume if the
1173 // previous resume fails and the redo-resume conditions are satisfied.
1174 //
1175 // Default: 1000000 (microseconds).
1176 uint64_t bgerror_resume_retry_interval = 1000000;
1177
1178 // It allows users to opt in to error messages containing corrupted
1179 // keys/values. Corrupt keys and values will be logged in the
1180 // messages/logs/status, providing users with useful information about
1181 // the affected data. By default the value is set to false to prevent
1182 // user data from being exposed in the logs/messages etc.
1183 //
1184 // Default: false
1185 bool allow_data_in_errors = false;
1186
1187 // A string identifying the machine hosting the DB. This
1188 // will be written as a property in every SST file written by the DB (or
1189 // by offline writers such as SstFileWriter and RepairDB). It can be useful
1190 // for troubleshooting memory corruption caused by a failing host when
1191 // writing a file, by tracing back to the writing host. These corruptions
1192 // may not be caught by the checksum since they happen before checksumming.
1193 // If left as default, the table writer will substitute it with the actual
1194 // hostname when writing the SST file. If set to an empty string, the
1195 // property will not be written to the SST file.
1196 //
1197 // Default: hostname
1198 std::string db_host_id = kHostnameForDbHostId;
1199 };
1200
1201 // Options to control the behavior of a database (passed to DB::Open)
1202 struct Options : public DBOptions, public ColumnFamilyOptions {
1203 // Create an Options object with default values for all fields.
1204 Options() : DBOptions(), ColumnFamilyOptions() {}
1205
1206 Options(const DBOptions& db_options,
1207 const ColumnFamilyOptions& column_family_options)
1208 : DBOptions(db_options), ColumnFamilyOptions(column_family_options) {}
1209
1210   // The function recovers options to their default values as of the given version (4.6 by default).
1211 Options* OldDefaults(int rocksdb_major_version = 4,
1212 int rocksdb_minor_version = 6);
1213
1214 void Dump(Logger* log) const;
1215
1216 void DumpCFOptions(Logger* log) const;
1217
1218 // Some functions that make it easier to optimize RocksDB
1219
1220 // Set appropriate parameters for bulk loading.
1221 // The reason that this is a function that returns "this" instead of a
1222 // constructor is to enable chaining of multiple similar calls in the future.
1223 //
1224
1225 // All data will be in level 0 without any automatic compaction.
1226 // It's recommended to manually call CompactRange(NULL, NULL) before reading
1227 // from the database, because otherwise the read can be very slow.
1228 Options* PrepareForBulkLoad();
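  // Illustrative sketch (not part of the original header): a bulk-load
  // sequence followed by the recommended full-range compaction, assuming a
  // placeholder path and data source.
  //
  //   Options options;
  //   options.create_if_missing = true;
  //   options.PrepareForBulkLoad();
  //   DB* db = nullptr;
  //   Status s = DB::Open(options, "/tmp/bulk_db", &db);
  //   // ... load data with Put() or IngestExternalFile() ...
  //   db->CompactRange(CompactRangeOptions(), nullptr, nullptr);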
1229
1230 // Use this if your DB is very small (like under 1GB) and you don't want to
1231 // spend lots of memory for memtables.
1232 Options* OptimizeForSmallDb();
1233 };
1234
1235 //
1236 // An application can issue a read request (via Get/Iterators) and specify
1237 // if that read should process data that ALREADY resides on a specified cache
1238 // level. For example, if an application specifies kBlockCacheTier then the
1239 // Get call will only process data that already resides in the memtable or
1240 // the block cache. It will not page in data from the OS cache or data that
1241 // resides in storage.
1242 enum ReadTier {
1243 kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage
1244 kBlockCacheTier = 0x1, // data in memtable or block cache
1245 kPersistedTier = 0x2, // persisted data. When WAL is disabled, this option
1246 // will skip data in memtable.
1247 // Note that this ReadTier currently only supports
1248 // Get and MultiGet and does not support iterators.
1249 kMemtableTier = 0x3 // data in memtable. used for memtable-only iterators.
1250 };
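//
// Illustrative sketch (not part of the API) of a cache-only read using the
// ReadOptions struct defined below; the key "foo" and `db` are placeholders.
//
//   ReadOptions ro;
//   ro.read_tier = kBlockCacheTier;   // only look in memtable/block cache
//   std::string value;
//   Status s = db->Get(ro, "foo", &value);
//   if (s.IsIncomplete()) {
//     // Data was not already in memory; a kReadAllTier read would page it in.
//   }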
1251
1252 // Options that control read operations
1253 struct ReadOptions {
1254 // If "snapshot" is non-nullptr, read as of the supplied snapshot
1255 // (which must belong to the DB that is being read and which must
1256 // not have been released). If "snapshot" is nullptr, use an implicit
1257 // snapshot of the state at the beginning of this read operation.
1258 // Default: nullptr
1259 const Snapshot* snapshot;
1260
1261 // `iterate_lower_bound` defines the smallest key at which the backward
1262 // iterator can return an entry. Once the bound is passed, Valid() will be
1263 // false. `iterate_lower_bound` is inclusive, i.e. the bound value is a valid
1264 // entry.
1265 //
1266 // If prefix_extractor is not null, the Seek target and `iterate_lower_bound`
1267 // need to have the same prefix. This is because ordering is not guaranteed
1268 // outside of prefix domain.
1269 //
1270 // Default: nullptr
1271 const Slice* iterate_lower_bound;
1272
1273 // "iterate_upper_bound" defines the extent upto which the forward iterator
1274 // can returns entries. Once the bound is reached, Valid() will be false.
1275 // "iterate_upper_bound" is exclusive ie the bound value is
1276 // not a valid entry. If prefix_extractor is not null, the Seek target
1277 // and iterate_upper_bound need to have the same prefix.
1278 // This is because ordering is not guaranteed outside of prefix domain.
1279 //
1280 // Default: nullptr
1281 const Slice* iterate_upper_bound;
1282
1283 // RocksDB does auto-readahead for iterators on noticing more than two reads
1284 // for a table file. The readahead starts at 8KB and doubles on every
1285 // additional read up to 256KB.
1286 // This option can help if most of the range scans are large, and if it is
1287 // determined that a larger readahead than that enabled by auto-readahead is
1288 // needed.
1289 // Using a large readahead size (> 2MB) can typically improve the performance
1290 // of forward iteration on spinning disks.
1291 // Default: 0
1292 size_t readahead_size;
1293
1294 // A threshold for the number of keys that can be skipped before failing an
1295 // iterator seek as incomplete. The default value of 0 means a request is
1296 // never failed as incomplete, no matter how many keys are skipped.
1297 // Default: 0
1298 uint64_t max_skippable_internal_keys;
1299
1300 // Specify if this read request should process data that ALREADY
1301 // resides on a particular cache. If the required data is not
1302 // found at the specified cache, then Status::Incomplete is returned.
1303 // Default: kReadAllTier
1304 ReadTier read_tier;
1305
1306 // If true, all data read from underlying storage will be
1307 // verified against corresponding checksums.
1308 // Default: true
1309 bool verify_checksums;
1310
1311 // Should the "data block"/"index block" read for this iteration be placed in
1312 // block cache?
1313 // Callers may wish to set this field to false for bulk scans.
1314 // This helps avoid changing the eviction order of existing items in the
1315 // block cache.
1316 // Default: true
1317 bool fill_cache;
1318
1319 // Specify to create a tailing iterator -- a special iterator that has a
1320 // view of the complete database (i.e. it can also be used to read newly
1321 // added data) and is optimized for sequential reads. It will return records
1322 // that were inserted into the database after the creation of the iterator.
1323 // Default: false
1324 // Not supported in ROCKSDB_LITE mode!
1325 bool tailing;
1326
1327 // This option is no longer used. It was used to turn on functionality that
1328 // has been removed.
1329 bool managed;
1330
1331 // Enable a total order seek regardless of index format (e.g. hash index)
1332 // used in the table. Some table formats (e.g. plain table) may not support
1333 // this option.
1334 // If true when calling Get(), we also skip prefix bloom when reading from
1335 // block based table. It provides a way to read existing data after
1336 // changing implementation of prefix extractor.
1337 // Default: false
1338 bool total_order_seek;
1339
1340 // When true, total_order_seek = true is used by default, and RocksDB can
1341 // selectively enable prefix seek mode if it won't generate a different result
1342 // from total_order_seek, based on the seek key and iterator upper bound.
1343 // Not supported in ROCKSDB_LITE mode: even when set to true,
1344 // prefix mode is not used.
1345 // Default: false
1346 bool auto_prefix_mode;
1347
1348 // Enforce that the iterator only iterates over the same prefix as the seek.
1349 // This option is effective only for prefix seeks, i.e. prefix_extractor is
1350 // non-null for the column family and total_order_seek is false. Unlike
1351 // iterate_upper_bound, prefix_same_as_start only works within a prefix
1352 // but in both directions.
1353 // Default: false
1354 bool prefix_same_as_start;
1355
1356 // Keep the blocks loaded by the iterator pinned in memory as long as the
1357 // iterator is not deleted. If used when reading from tables created with
1358 // BlockBasedTableOptions::use_delta_encoding = false,
1359 // Iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed to
1360 // return 1.
1361 // Default: false
1362 bool pin_data;
1363
1364 // If true, when PurgeObsoleteFile is called in CleanupIteratorState, we
1365 // schedule a background job in the flush job queue and delete obsolete files
1366 // in background.
1367 // Default: false
1368 bool background_purge_on_iterator_cleanup;
1369
1370 // If true, keys deleted using the DeleteRange() API will be visible to
1371 // readers until they are naturally deleted during compaction. This improves
1372 // read performance in DBs with many range deletions.
1373 // Default: false
1374 bool ignore_range_deletions;
1375
1376 // A callback to determine whether relevant keys for this scan exist in a
1377 // given table based on the table's properties. The callback is passed the
1378 // properties of each table during iteration. If the callback returns false,
1379 // the table will not be scanned. This option only affects Iterators and has
1380 // no impact on point lookups.
1381 // Default: empty (every table will be scanned)
1382 std::function<bool(const TableProperties&)> table_filter;
1383
1384 // Needed to support differential snapshots. Has 2 effects:
1385 // 1) Iterator will skip all internal keys with seqnum < iter_start_seqnum
1386 // 2) if this param > 0 iterator will return INTERNAL keys instead of
1387 // user keys; e.g. return tombstones as well.
1388 // Default: 0 (don't filter by seqnum, return user keys)
1389 SequenceNumber iter_start_seqnum;
1390
1391 // Timestamp of operation. Read should return the latest data visible to the
1392 // specified timestamp. All timestamps of the same database must be of the
1393 // same length and format. The user is responsible for providing a customized
1394 // compare function via Comparator to order <key, timestamp> tuples.
1395 // For iterator, iter_start_ts is the lower bound (older) and timestamp
1396 // serves as the upper bound. Versions of the same record that fall in
1397 // the timestamp range will be returned. If iter_start_ts is nullptr,
1398 // only the most recent version visible to timestamp is returned.
1399 // The user-specified timestamp feature is still under active development,
1400 // and the API is subject to change.
1401 // Default: nullptr
1402 const Slice* timestamp;
1403 const Slice* iter_start_ts;
1404
1405 // Deadline for completing an API call (Get/MultiGet/Seek/Next for now)
1406 // in microseconds.
1407 // It should be set to microseconds since epoch, i.e., gettimeofday or
1408 // equivalent plus allowed duration in microseconds. The best way is to use
1409 // env->NowMicros() + some timeout.
1410 // This is best effort. The call may exceed the deadline if there is IO
1411 // involved and the file system doesn't support deadlines, or because the
1412 // deadline is checked periodically rather than for every key when
1413 // processing a batch.
1414 std::chrono::microseconds deadline;
1415
1416 // A timeout in microseconds to be passed to the underlying FileSystem for
1417 // reads. As opposed to deadline, this determines the timeout for each
1418 // individual file read request. If a MultiGet/Get/Seek/Next etc call
1419 // results in multiple reads, each read can last up to io_timeout us.
1420 std::chrono::microseconds io_timeout;
1421
1422 // It limits the maximum cumulative value size of the keys in a batch while
1423 // reading through MultiGet. Once the cumulative value size exceeds this
1424 // soft limit, all the remaining keys are returned with status Aborted.
1425 //
1426 // Default: std::numeric_limits<uint64_t>::max()
1427 uint64_t value_size_soft_limit;
1428
1429 ReadOptions();
1430 ReadOptions(bool cksum, bool cache);
1431 };
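//
// A hedged sketch (illustration only) of a bounded range scan using some of
// the fields above. The key names, the 100ms deadline and the assumption that
// `db` is an open DB are all placeholders for the example.
//
//   Slice upper("user_9999");              // must outlive the iterator
//   ReadOptions ro;
//   ro.iterate_upper_bound = &upper;       // stop before "user_9999"
//   ro.fill_cache = false;                 // don't disturb the block cache
//   ro.deadline = std::chrono::microseconds(db->GetEnv()->NowMicros() + 100000);
//   std::unique_ptr<Iterator> it(db->NewIterator(ro));
//   for (it->Seek("user_0000"); it->Valid(); it->Next()) {
//     // process it->key() / it->value()
//   }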
1432
1433 // Options that control write operations
1434 struct WriteOptions {
1435 // If true, the write will be flushed from the operating system
1436 // buffer cache (by calling WritableFile::Sync()) before the write
1437 // is considered complete. If this flag is true, writes will be
1438 // slower.
1439 //
1440 // If this flag is false, and the machine crashes, some recent
1441 // writes may be lost. Note that if it is just the process that
1442 // crashes (i.e., the machine does not reboot), no writes will be
1443 // lost even if sync==false.
1444 //
1445 // In other words, a DB write with sync==false has similar
1446 // crash semantics as the "write()" system call. A DB write
1447 // with sync==true has similar crash semantics to a "write()"
1448 // system call followed by "fdatasync()".
1449 //
1450 // Default: false
1451 bool sync;
1452
1453 // If true, writes will not first go to the write ahead log,
1454 // and the write may get lost after a crash. The backup engine
1455 // relies on write-ahead logs to back up the memtable, so if
1456 // you disable write-ahead logs, you must create backups with
1457 // flush_before_backup=true to avoid losing unflushed memtable data.
1458 // Default: false
1459 bool disableWAL;
1460
1461 // If true and the user is trying to write to column families that don't exist
1462 // (they were dropped), ignore the write (don't return an error). If there
1463 // are multiple writes in a WriteBatch, other writes will succeed.
1464 // Default: false
1465 bool ignore_missing_column_families;
1466
1467 // If true and the write request would need to wait or sleep, it fails
1468 // immediately with Status::Incomplete().
1469 // Default: false
1470 bool no_slowdown;
1471
1472 // If true, this write request is of lower priority if compaction is
1473 // behind. In this case, if no_slowdown = true, the request will be cancelled
1474 // immediately with Status::Incomplete() returned. Otherwise, it will be
1475 // slowed down. The slowdown value is determined by RocksDB to guarantee
1476 // it introduces minimum impacts to high priority writes.
1477 //
1478 // Default: false
1479 bool low_pri;
1480
1481 // If true, this writebatch will maintain the last insert positions of each
1482 // memtable as hints for concurrent writes. It can improve write performance
1483 // in concurrent writes if keys in one writebatch are sequential. In
1484 // non-concurrent writes (when concurrent_memtable_writes is false) this
1485 // option will be ignored.
1486 //
1487 // Default: false
1488 bool memtable_insert_hint_per_batch;
1489
1490 // Timestamp of write operation, e.g. Put. All timestamps of the same
1491 // database must share the same length and format. The user is also
1492 // responsible for providing a customized compare function via Comparator to
1493 // order <key, timestamp> tuples. If the user wants to enable timestamp, then
1494 // all write operations must be associated with timestamp because RocksDB, as
1495 // a single-node storage engine, currently has no knowledge of global time,
1496 // thus has to rely on the application.
1497 // The user-specified timestamp feature is still under active development,
1498 // and the API is subject to change.
1499 const Slice* timestamp;
1500
1501 WriteOptions()
1502 : sync(false),
1503 disableWAL(false),
1504 ignore_missing_column_families(false),
1505 no_slowdown(false),
1506 low_pri(false),
1507 memtable_insert_hint_per_batch(false),
1508 timestamp(nullptr) {}
1509 };
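//
// Illustrative sketch (not part of the API) of the two most common
// WriteOptions trade-offs described above. Key/value literals and `db` are
// placeholders.
//
//   WriteOptions durable;
//   durable.sync = true;                 // fdatasync-like durability, slower
//   Status s = db->Put(durable, "key1", "value1");
//
//   WriteOptions fast;
//   fast.disableWAL = true;              // may lose recent writes on crash
//   s = db->Put(fast, "key2", "value2");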
1510
1511 // Options that control flush operations
1512 struct FlushOptions {
1513 // If true, the call will wait until the flush is done.
1514 // Default: true
1515 bool wait;
1516 // If true, the flush will proceed immediately even if it means writes will
1517 // stall for its duration; if false, the operation will wait until it is
1518 // possible to flush without causing a stall, or until the required flush is
1519 // performed by someone else (foreground call or background thread).
1520 // Default: false
1521 bool allow_write_stall;
1522 FlushOptions() : wait(true), allow_write_stall(false) {}
1523 };
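//
// A minimal sketch (illustration only) of triggering a manual flush with the
// options above, assuming `db` is an open DB.
//
//   FlushOptions fo;
//   fo.wait = true;                 // block until the memtable is flushed
//   fo.allow_write_stall = false;   // defer if flushing would stall writes
//   Status s = db->Flush(fo);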
1524
1525 // Create a Logger from provided DBOptions
1526 extern Status CreateLoggerFromOptions(const std::string& dbname,
1527 const DBOptions& options,
1528 std::shared_ptr<Logger>* logger);
1529
1530 // CompactionOptions are used in CompactFiles() call.
1531 struct CompactionOptions {
1532 // Compaction output compression type
1533 // Default: snappy
1534 // If set to `kDisableCompressionOption`, RocksDB will choose compression type
1535 // according to the `ColumnFamilyOptions`, taking into account the output
1536 // level if `compression_per_level` is specified.
1537 CompressionType compression;
1538 // Compaction will create files of size `output_file_size_limit`.
1539 // Default: MAX, which means that compaction will create a single file
1540 uint64_t output_file_size_limit;
1541 // If > 0, it will replace the option in the DBOptions for this compaction.
1542 uint32_t max_subcompactions;
1543
1544 CompactionOptions()
1545 : compression(kSnappyCompression),
1546 output_file_size_limit(std::numeric_limits<uint64_t>::max()),
1547 max_subcompactions(0) {}
1548 };
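//
// A hedged sketch (illustration only) of a CompactFiles() call using these
// options. It assumes an open DB `db`; picking all L0 files and an output
// level of 1 are assumptions for the example.
//
//   ColumnFamilyMetaData meta;
//   db->GetColumnFamilyMetaData(&meta);
//   std::vector<std::string> input_files;
//   for (const auto& file : meta.levels[0].files) {
//     input_files.push_back(file.name);
//   }
//   CompactionOptions copts;
//   copts.compression = kDisableCompressionOption;  // defer to CF options
//   Status s = db->CompactFiles(copts, input_files, /*output_level=*/1);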
1549
1550 // For level based compaction, we can configure if we want to skip/force
1551 // bottommost level compaction.
1552 enum class BottommostLevelCompaction {
1553 // Skip bottommost level compaction
1554 kSkip,
1555 // Only compact bottommost level if there is a compaction filter
1556 // This is the default option
1557 kIfHaveCompactionFilter,
1558 // Always compact bottommost level
1559 kForce,
1560 // Always compact bottommost level but in bottommost level avoid
1561 // double-compacting files created in the same compaction
1562 kForceOptimized,
1563 };
1564
1565 // CompactRangeOptions is used by CompactRange() call.
1566 struct CompactRangeOptions {
1567 // If true, no other compaction will run at the same time as this
1568 // manual compaction
1569 bool exclusive_manual_compaction = true;
1570 // If true, compacted files will be moved to the minimum level capable
1571 // of holding the data, or to the given level if target_level is non-negative.
1572 bool change_level = false;
1573 // If change_level is true and target_level has a non-negative value, compacted
1574 // files will be moved to target_level.
1575 int target_level = -1;
1576 // Compaction outputs will be placed in options.db_paths[target_path_id].
1577 // Behavior is undefined if target_path_id is out of range.
1578 uint32_t target_path_id = 0;
1579 // By default level based compaction will only compact the bottommost level
1580 // if there is a compaction filter
1581 BottommostLevelCompaction bottommost_level_compaction =
1582 BottommostLevelCompaction::kIfHaveCompactionFilter;
1583 // If true, will execute immediately even if doing so would cause the DB to
1584 // enter write stall mode. Otherwise, it'll sleep until load is low enough.
1585 bool allow_write_stall = false;
1586 // If > 0, it will replace the option in the DBOptions for this compaction.
1587 uint32_t max_subcompactions = 0;
1588 };
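//
// Illustrative sketch (not part of the API): a full manual compaction that
// also pushes data down to the lowest level it fits in, assuming `db` is open.
//
//   CompactRangeOptions cro;
//   cro.change_level = true;    // move output to the minimum level that fits
//   cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
//   Status s = db->CompactRange(cro, nullptr, nullptr);  // whole key space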
1589
1590 // IngestExternalFileOptions is used by IngestExternalFile()
1591 struct IngestExternalFileOptions {
1592 // Can be set to true to move the files instead of copying them.
1593 bool move_files = false;
1594 // If set to true, ingestion falls back to copy when move fails.
1595 bool failed_move_fall_back_to_copy = true;
1596 // If set to false, an ingested file's keys could appear in existing snapshots
1597 // that were created before the file was ingested.
1598 bool snapshot_consistency = true;
1599 // If set to false, IngestExternalFile() will fail if the file key range
1600 // overlaps with existing keys or tombstones in the DB.
1601 bool allow_global_seqno = true;
1602 // If set to false and the file key range overlaps with the memtable key range
1603 // (memtable flush required), IngestExternalFile will fail.
1604 bool allow_blocking_flush = true;
1605 // Set to true if you would like duplicate keys in the file being ingested
1606 // to be skipped rather than overwriting existing data under that key.
1607 // Usecase: back-fill of some historical data in the database without
1608 // over-writing existing newer version of data.
1609 // This option could only be used if the DB has been running
1610 // with allow_ingest_behind=true since the dawn of time.
1611 // All files will be ingested at the bottommost level with seqno=0.
1612 bool ingest_behind = false;
1613 // Set to true if you would like to write global_seqno to a given offset in
1614 // the external SST file for backward compatibility. Older versions of
1615 // RocksDB write a global_seqno to a given offset within ingested SST files,
1616 // and new versions of RocksDB do not. If you ingest an external SST using a
1617 // new version of RocksDB and would like to be able to downgrade to an
1618 // older version of RocksDB, you should set 'write_global_seqno' to true. If
1619 // your service is just starting to use the new RocksDB, we recommend that
1620 // you set this option to false, which brings two benefits:
1621 // 1. No extra random write for global_seqno during ingestion.
1622 // 2. Without writing to the external SST file, it is still possible to verify its checksum.
1623 // We have a plan to set this option to false by default in the future.
1624 bool write_global_seqno = true;
1625 // Set to true if you would like to verify the checksums of each block of the
1626 // external SST file before ingestion.
1627 // Warning: setting this to true causes slowdown in file ingestion because
1628 // the external SST file has to be read.
1629 bool verify_checksums_before_ingest = false;
1630 // When verify_checksums_before_ingest = true, RocksDB uses default
1631 // readahead setting to scan the file while verifying checksums before
1632 // ingestion.
1633 // Users can override the default value using this option.
1634 // Using a large readahead size (> 2MB) can typically improve the performance
1635 // of forward iteration on spinning disks.
1636 size_t verify_checksums_readahead_size = 0;
1637 // Set to true if the user wants to verify the sst file checksum of ingested
1638 // files. The DB checksum function will generate the checksum of each
1639 // ingested file (if file_checksum_gen_factory is set) and compare the
1640 // checksum function name and checksum with the ingested checksum information.
1641 //
1642 // If this option is set to true: 1) if the DB does not enable checksums
1643 // (file_checksum_gen_factory == nullptr), the ingested checksum information
1644 // will be ignored; 2) if the DB enables the checksum function, we calculate
1645 // the sst file checksum after the file is moved or copied and compare the
1646 // checksum and checksum function name. If either does not match, ingestion
1647 // will fail. If the verification is successful, the checksum and checksum
1648 // function name will be stored in the Manifest.
1649 // If this option is set to false: 1) if the DB does not enable checksums,
1650 // the ingested checksum information will be ignored; 2) if the DB enables
1651 // checksums, we only verify the ingested checksum function name and we
1652 // trust the ingested checksum. If the checksum function name matches, we
1653 // store the checksum in the Manifest. The DB does not calculate the checksum
1654 // during ingestion. However, if no checksum information is provided with the
1655 // ingested files, the DB will generate the checksum and store it in the Manifest.
1656 bool verify_file_checksum = true;
1657 };
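//
// A hedged sketch (illustration only) of creating an SST file with
// SstFileWriter and ingesting it with the options above. The file path
// "/tmp/file1.sst", the keys, and `db` are placeholders.
//
//   Options options;
//   SstFileWriter writer(EnvOptions(), options);
//   Status s = writer.Open("/tmp/file1.sst");
//   s = writer.Put("k1", "v1");
//   s = writer.Finish();
//
//   IngestExternalFileOptions ifo;
//   ifo.move_files = true;                    // move instead of copy
//   ifo.verify_checksums_before_ingest = true;
//   s = db->IngestExternalFile({"/tmp/file1.sst"}, ifo);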
1658
1659 enum TraceFilterType : uint64_t {
1660 // Trace all the operations
1661 kTraceFilterNone = 0x0,
1662 // Do not trace the get operations
1663 kTraceFilterGet = 0x1 << 0,
1664 // Do not trace the write operations
1665 kTraceFilterWrite = 0x1 << 1
1666 };
1667
1668 // TraceOptions is used for StartTrace
1669 struct TraceOptions {
1670 // To prevent the trace file from growing larger than the available storage
1671 // space, the user can set the max trace file size in bytes. Default is 64GB.
1672 uint64_t max_trace_file_size = uint64_t{64} * 1024 * 1024 * 1024;
1673 // Specify the trace sampling option, i.e. capture one out of every N requests.
1674 // Defaults to 1 (capture every request).
1675 uint64_t sampling_frequency = 1;
1676 // Note: The filtering happens before sampling.
1677 uint64_t filter = kTraceFilterNone;
1678 };
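//
// A hedged sketch (illustration only) of tracing with these options. It
// assumes NewFileTraceWriter() from rocksdb/trace_reader_writer.h, an open
// DB `db`, and a placeholder trace path.
//
//   TraceOptions trace_opts;
//   trace_opts.sampling_frequency = 10;   // capture 1 out of every 10 requests
//   std::unique_ptr<TraceWriter> trace_writer;
//   Status s = NewFileTraceWriter(db->GetEnv(), EnvOptions(), "/tmp/db.trace",
//                                 &trace_writer);
//   s = db->StartTrace(trace_opts, std::move(trace_writer));
//   // ... run the workload to be traced ...
//   s = db->EndTrace();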
1679
1680 // ImportColumnFamilyOptions is used by ImportColumnFamily()
1681 struct ImportColumnFamilyOptions {
1682 // Can be set to true to move the files instead of copying them.
1683 bool move_files = false;
1684 };
1685
1686 // Options used with DB::GetApproximateSizes()
1687 struct SizeApproximationOptions {
1688 // Defines whether the returned size should include the recently written
1689 // data in the mem-tables. If set to false, include_files must be true.
1690 bool include_memtabtles = false;
1691 // Defines whether the returned size should include data serialized to disk.
1692 // If set to false, include_memtabtles must be true.
1693 bool include_files = true;
1694 // When approximating the total size of files used to store a range of keys
1695 // via DB::GetApproximateSizes, allow approximation with an error margin of
1696 // up to total_files_size * files_size_error_margin. This allows taking
1697 // shortcuts in file size approximation, resulting in better performance,
1698 // while guaranteeing the resulting error is within a reasonable margin.
1699 // E.g., if the value is 0.1, then the error margin of the returned files size
1700 // approximation will be within 10%.
1701 // If the value is non-positive, a more precise yet more CPU-intensive
1702 // estimation is performed.
1703 double files_size_error_margin = -1.0;
1704 };
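//
// Illustrative sketch (not part of the API) of using these options with
// DB::GetApproximateSizes(); the key range and `db` are placeholders.
//
//   SizeApproximationOptions sao;
//   sao.include_memtabtles = true;       // also count unflushed data
//   sao.files_size_error_margin = 0.1;   // accept up to ~10% error
//   Range r("a", "z");
//   uint64_t size = 0;
//   Status s = db->GetApproximateSizes(sao, db->DefaultColumnFamily(), &r, 1,
//                                      &size);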
1705
1706 } // namespace ROCKSDB_NAMESPACE