// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once

#include <memory>
#include <string>
#include <vector>

#include "rocksdb/memtablerep.h"
#include "rocksdb/universal_compaction.h"

namespace rocksdb {

class Slice;
class SliceTransform;
enum CompressionType : unsigned char;
class TablePropertiesCollectorFactory;
class TableFactory;
struct Options;

enum CompactionStyle : char {
  // Level-based compaction style
  kCompactionStyleLevel = 0x0,
  // Universal compaction style
  // Not supported in ROCKSDB_LITE.
  kCompactionStyleUniversal = 0x1,
  // FIFO compaction style
  // Not supported in ROCKSDB_LITE.
  kCompactionStyleFIFO = 0x2,
  // Disable background compaction. Compaction jobs are submitted
  // via CompactFiles().
  // Not supported in ROCKSDB_LITE.
  kCompactionStyleNone = 0x3,
};

// In level-based compaction, this determines which file from a level gets
// picked to merge into the next level. We suggest trying
// kMinOverlappingRatio first when you tune your database.
enum CompactionPri : char {
  // Slightly prioritize larger files, by size compensated by #deletes.
  kByCompensatedSize = 0x0,
  // First compact files whose data's latest update time is oldest.
  // Try this if you only update some hot keys in small ranges.
  kOldestLargestSeqFirst = 0x1,
  // First compact files whose range hasn't been compacted to the next level
  // for the longest. If your updates are random across the key space,
  // write amplification is slightly better with this option.
  kOldestSmallestSeqFirst = 0x2,
  // First compact files whose ratio between overlapping size in the next
  // level and its own size is the smallest. In many cases this reduces
  // write amplification.
  kMinOverlappingRatio = 0x3,
};
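
// Example: a minimal sketch of selecting the suggested priority (the options
// object here is illustrative; compaction_pri itself is declared further
// down in AdvancedColumnFamilyOptions):
//
//   rocksdb::AdvancedColumnFamilyOptions opts;
//   opts.compaction_pri = rocksdb::kMinOverlappingRatio;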

struct CompactionOptionsFIFO {
  // Once the total sum of table files reaches this, we will delete the
  // oldest table file.
  // Default: 1GB
  uint64_t max_table_files_size;

  CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {}
  CompactionOptionsFIFO(uint64_t _max_table_files_size)
      : max_table_files_size(_max_table_files_size) {}
};
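
// Example: a minimal sketch of enabling FIFO compaction with a 512MB cap
// (the cap value is an illustrative assumption, not a recommendation):
//
//   rocksdb::Options options;
//   options.compaction_style = rocksdb::kCompactionStyleFIFO;
//   options.compaction_options_fifo.max_table_files_size = 512 * 1024 * 1024;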

// Compression options for different compression algorithms like Zlib
struct CompressionOptions {
  int window_bits;
  int level;
  int strategy;
  // Maximum size of dictionary used to prime the compression library.
  // Currently this dictionary will be constructed by sampling the first
  // output file in a subcompaction when the target level is bottommost.
  // This dictionary will be loaded into the compression library before
  // compressing/uncompressing each data block of subsequent files in the
  // subcompaction. Effectively, this improves compression ratios when there
  // are repetitions across data blocks.
  // A value of 0 indicates the feature is disabled.
  // Default: 0.
  uint32_t max_dict_bytes;

  CompressionOptions()
      : window_bits(-14), level(-1), strategy(0), max_dict_bytes(0) {}
  CompressionOptions(int wbits, int _lev, int _strategy, int _max_dict_bytes)
      : window_bits(wbits),
        level(_lev),
        strategy(_strategy),
        max_dict_bytes(_max_dict_bytes) {}
};
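
// Example: a minimal sketch of enabling a 16KB compression dictionary for
// bottommost-level output (the size is an illustrative assumption; the
// compression_opts field lives on ColumnFamilyOptions/Options):
//
//   rocksdb::Options options;
//   options.compression_opts.max_dict_bytes = 16 * 1024;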

enum UpdateStatus {     // Return status for inplace update callback
  UPDATE_FAILED   = 0,  // Nothing to update
  UPDATED_INPLACE = 1,  // Value updated inplace
  UPDATED         = 2,  // No inplace update. Merged value set
};

struct AdvancedColumnFamilyOptions {
  // The maximum number of write buffers that are built up in memory.
  // The default and the minimum number is 2, so that when 1 write buffer
  // is being flushed to storage, new writes can continue to the other
  // write buffer.
  // If max_write_buffer_number > 3, writing will be slowed down to
  // options.delayed_write_rate if we are writing to the last write buffer
  // allowed.
  //
  // Default: 2
  //
  // Dynamically changeable through SetOptions() API
  int max_write_buffer_number = 2;
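
  // Example: a minimal sketch of changing a dynamically changeable option at
  // runtime through the SetOptions() API (assumes an open rocksdb::DB* db):
  //
  //   db->SetOptions({{"max_write_buffer_number", "4"}});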

  // The minimum number of write buffers that will be merged together
  // before writing to storage. If set to 1, then
  // all write buffers are flushed to L0 as individual files and this
  // increases read amplification because a get request has to check all of
  // these files. Also, an in-memory merge may result in writing less
  // data to storage if there are duplicate records in each of these
  // individual write buffers. Default: 1
  int min_write_buffer_number_to_merge = 1;

  // The total maximum number of write buffers to maintain in memory
  // including copies of buffers that have already been flushed. Unlike
  // max_write_buffer_number, this parameter does not affect flushing.
  // This controls the minimum amount of write history that will be
  // available in memory for conflict checking when Transactions are used.
  //
  // When using an OptimisticTransactionDB:
  // If this value is too low, some transactions may fail at commit time due
  // to not being able to determine whether there were any write conflicts.
  //
  // When using a TransactionDB:
  // If Transaction::SetSnapshot is used, TransactionDB will read either
  // in-memory write buffers or SST files to do write-conflict checking.
  // Increasing this value can reduce the number of reads to SST files
  // done for conflict detection.
  //
  // Setting this value to 0 will cause write buffers to be freed immediately
  // after they are flushed.
  // If this value is set to -1, 'max_write_buffer_number' will be used.
  //
  // Default:
  // If using a TransactionDB/OptimisticTransactionDB, the default value will
  // be set to the value of 'max_write_buffer_number' if it is not explicitly
  // set by the user. Otherwise, the default is 0.
  int max_write_buffer_number_to_maintain = 0;

  // Allows thread-safe inplace updates. If this is true, there is no way to
  // achieve point-in-time consistency using snapshot or iterator (assuming
  // concurrent updates). Hence iterator and multi-get will return results
  // which are not consistent as of any point-in-time.
  // If the inplace_callback function is not set,
  // Put(key, new_value) will update the existing_value inplace iff
  // * key exists in current memtable
  // * sizeof(new_value) <= sizeof(existing_value)
  // * existing_value for that key is a put i.e. kTypeValue
  // If the inplace_callback function is set, see the doc for
  // inplace_callback.
  // Default: false.
  bool inplace_update_support = false;

  // Number of locks used for inplace update.
  // Default: 10000, if inplace_update_support = true, else 0.
  //
  // Dynamically changeable through SetOptions() API
  size_t inplace_update_num_locks = 10000;

  // existing_value - pointer to previous value (from both memtable and sst).
  //                  nullptr if key doesn't exist
  // existing_value_size - pointer to the size of existing_value.
  //                       nullptr if key doesn't exist
  // delta_value - Delta value to be merged with the existing_value.
  //               Stored in transaction logs.
  // merged_value - Set when delta is applied on the previous value.
  //
  // Applicable only when inplace_update_support is true, this callback
  // function is called at the time of updating the memtable as part of a
  // Put operation, say Put(key, delta_value). It allows the 'delta_value'
  // specified as part of the Put operation to be merged with an
  // 'existing_value' of the key in the database.
  //
  // If the merged value is smaller in size than the 'existing_value',
  // then this function can update the 'existing_value' buffer inplace and
  // the corresponding 'existing_value_size' pointer, if it wishes to.
  // The callback should return UpdateStatus::UPDATED_INPLACE in this case.
  // (In this case, the snapshot-semantics of the rocksdb Iterator are no
  // longer atomic.)
  //
  // If the merged value is larger in size than the 'existing_value' or the
  // application does not wish to modify the 'existing_value' buffer inplace,
  // then the merged value should be returned via *merged_value. It is set by
  // merging the 'existing_value' and the Put 'delta_value'. The callback
  // should return UpdateStatus::UPDATED in this case. This merged value will
  // be added to the memtable.
  //
  // If merging fails or the application does not wish to take any action,
  // then the callback should return UpdateStatus::UPDATE_FAILED.
  //
  // Please remember that the original call from the application is Put(key,
  // delta_value). So the transaction log (if enabled) will still contain
  // (key, delta_value). The 'merged_value' is not stored in the transaction
  // log. Hence the inplace_callback function should be consistent across db
  // reopens.
  //
  // Default: nullptr
  UpdateStatus (*inplace_callback)(char* existing_value,
                                   uint32_t* existing_value_size,
                                   Slice delta_value,
                                   std::string* merged_value) = nullptr;
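
  // Example: a minimal sketch of an inplace callback that overwrites the
  // existing value with the delta when it fits, and otherwise hands RocksDB
  // a merged value to insert. The name and the overwrite policy are
  // illustrative assumptions, not part of this API (memcpy is from
  // <cstring>):
  //
  //   UpdateStatus OverwriteIfFits(char* existing_value,
  //                                uint32_t* existing_value_size,
  //                                Slice delta_value,
  //                                std::string* merged_value) {
  //     if (existing_value != nullptr &&
  //         delta_value.size() <= *existing_value_size) {
  //       memcpy(existing_value, delta_value.data(), delta_value.size());
  //       *existing_value_size = static_cast<uint32_t>(delta_value.size());
  //       return UPDATED_INPLACE;  // updated in place; see caveats above
  //     }
  //     merged_value->assign(delta_value.data(), delta_value.size());
  //     return UPDATED;  // merged value will be inserted into the memtable
  //   }
  //
  //   // Then: options.inplace_callback = OverwriteIfFits;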

  // If prefix_extractor is set and memtable_prefix_bloom_size_ratio is not
  // 0, create a prefix bloom for the memtable with the size of
  // write_buffer_size * memtable_prefix_bloom_size_ratio.
  // If it is larger than 0.25, it is sanitized to 0.25.
  //
  // Default: 0 (disable)
  //
  // Dynamically changeable through SetOptions() API
  double memtable_prefix_bloom_size_ratio = 0.0;
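
  // For illustration: with a 64MB write_buffer_size, a ratio of 0.1 yields
  // a ~6.4MB memtable prefix bloom, and a configured ratio of 0.5 would be
  // sanitized down to 0.25, i.e. ~16MB (numbers are a sketch of the formula
  // above, not recommendations).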

  // Page size for huge page for the arena used by the memtable. If <= 0, it
  // won't allocate from huge pages but from malloc.
  // Users are responsible for reserving huge pages for it to allocate. For
  // example:
  //   sysctl -w vm.nr_hugepages=20
  // See the linux doc Documentation/vm/hugetlbpage.txt.
  // If there aren't enough free huge pages available, it will fall back to
  // malloc.
  //
  // Dynamically changeable through SetOptions() API
  size_t memtable_huge_page_size = 0;

  // If non-nullptr, the memtable will use the specified function to extract
  // prefixes for keys, and for each prefix maintain a hint of insert
  // location to reduce CPU usage for inserting keys with the prefix. Keys
  // out of the domain of the prefix extractor will be inserted without
  // using hints.
  //
  // Currently only the default skiplist-based memtable implements the
  // feature. All other memtable implementations will ignore the option. It
  // incurs ~250 additional bytes of memory overhead to store a hint for
  // each prefix. Concurrent writes (when allow_concurrent_memtable_write is
  // true) will also ignore the option.
  //
  // The option is best suited for workloads where keys are likely to be
  // inserted at a location close to the last inserted key with the same
  // prefix. One example is inserting keys of the form (prefix + timestamp),
  // where keys of the same prefix always arrive in time order. Another
  // example is updating the same key over and over again, in which case the
  // prefix can be the key itself.
  //
  // Default: nullptr (disable)
  std::shared_ptr<const SliceTransform>
      memtable_insert_with_hint_prefix_extractor = nullptr;
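
  // Example: a minimal sketch using a fixed-size prefix extractor (the
  // 4-byte prefix length is an illustrative assumption;
  // NewFixedPrefixTransform is declared in rocksdb/slice_transform.h):
  //
  //   rocksdb::Options options;
  //   options.memtable_insert_with_hint_prefix_extractor.reset(
  //       rocksdb::NewFixedPrefixTransform(4));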

  // Control locality of bloom filter probes to improve cache miss rate.
  // This option only applies to the memtable prefix bloom and the plain
  // table prefix bloom. It essentially limits every bloom check to one
  // cache line. Setting it to 0 turns this optimization off, and any
  // positive number turns it on.
  // Default: 0
  uint32_t bloom_locality = 0;

  // Size of one block in arena memory allocation.
  // If <= 0, a proper value is automatically calculated (usually 1/8 of
  // write_buffer_size, rounded up to a multiple of 4KB).
  //
  // There are two additional restrictions on the specified size:
  // (1) size should be in the range of [4096, 2 << 30] and
  // (2) it should be a multiple of the CPU word size (which helps with the
  //     memory alignment).
  //
  // We'll automatically check and adjust the size number to make sure it
  // conforms to the restrictions.
  //
  // Default: 0
  //
  // Dynamically changeable through SetOptions() API
  size_t arena_block_size = 0;

  // Different levels can have different compression policies. There are
  // cases where most lower levels would like to use quick compression
  // algorithms while the higher levels (which have more data) use
  // compression algorithms that have better compression but could be
  // slower. This array, if non-empty, should have an entry for each level
  // of the database; these override the value specified in the previous
  // field 'compression'.
  //
  // NOTICE: if level_compaction_dynamic_level_bytes=true,
  // compression_per_level[0] still determines L0, but the other elements
  // of the array are based on the base level (the level L0 files are merged
  // to), and may not match the level users see from the info log for
  // metadata. If L0 files are merged to level-n, then, for i>0,
  // compression_per_level[i] determines the compression type for level
  // n+i-1.
  // For example, if we have 5 levels, and we determine to merge L0 data to
  // L4 (which means L1..L3 will be empty), then new files written to L4 use
  // compression type compression_per_level[1].
  // If L0 is instead merged to L2, data going to L2 will be compressed
  // according to compression_per_level[1], L3 using compression_per_level[2]
  // and L4 using compression_per_level[3]. The compression type for each
  // level can change as data grows.
  std::vector<CompressionType> compression_per_level;
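
  // Example: a minimal sketch of a per-level policy for a 7-level DB,
  // leaving the upper levels uncompressed (the algorithm choices are
  // illustrative assumptions, not recommendations):
  //
  //   rocksdb::Options options;
  //   options.compression_per_level = {
  //       rocksdb::kNoCompression,     rocksdb::kNoCompression,
  //       rocksdb::kSnappyCompression, rocksdb::kSnappyCompression,
  //       rocksdb::kSnappyCompression, rocksdb::kZlibCompression,
  //       rocksdb::kZlibCompression};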

  // Number of levels for this database
  int num_levels = 7;

  // Soft limit on number of level-0 files. We start slowing down writes at
  // this point. A value < 0 means that no write slowdown will be triggered
  // by the number of files in level-0.
  //
  // Default: 20
  //
  // Dynamically changeable through SetOptions() API
  int level0_slowdown_writes_trigger = 20;

  // Maximum number of level-0 files. We stop writes at this point.
  //
  // Default: 36
  //
  // Dynamically changeable through SetOptions() API
  int level0_stop_writes_trigger = 36;

  // Target file size for compaction.
  // target_file_size_base is per-file size for level-1.
  // Target file size for level L can be calculated by
  // target_file_size_base * (target_file_size_multiplier ^ (L-1)).
  // For example, if target_file_size_base is 2MB and
  // target_file_size_multiplier is 10, then each file on level-1 will
  // be 2MB, each file on level-2 will be 20MB,
  // and each file on level-3 will be 200MB.
  //
  // Default: 64MB.
  //
  // Dynamically changeable through SetOptions() API
  uint64_t target_file_size_base = 64 * 1048576;

  // By default target_file_size_multiplier is 1, which means
  // by default files in different levels will have similar size.
  //
  // Dynamically changeable through SetOptions() API
  int target_file_size_multiplier = 1;

  // If true, RocksDB will pick the target size of each level dynamically.
  // We will pick a base level b >= 1. L0 will be directly merged into level
  // b, instead of always into level 1. Levels 1 to b-1 need to be empty.
  // We try to pick b and its target size so that
  // 1. target size is in the range of
  //   (max_bytes_for_level_base / max_bytes_for_level_multiplier,
  //    max_bytes_for_level_base]
  // 2. target size of the last level (level num_levels-1) equals the actual
  //    size of the level.
  // At the same time max_bytes_for_level_multiplier and
  // max_bytes_for_level_multiplier_additional are still satisfied.
  //
  // With this option on, from an empty DB, we make the last level the base
  // level, which means merging L0 data into the last level, until it
  // exceeds max_bytes_for_level_base. Then we make the second last level
  // the base level, and start merging L0 data into it, with its target size
  // set to 1/max_bytes_for_level_multiplier of the last level's actual
  // size. As more data accumulates, we move the base level to the third
  // last level, and so on.
  //
  // For example, assume max_bytes_for_level_multiplier=10, num_levels=6,
  // and max_bytes_for_level_base=10MB.
  // Target sizes of levels 1 to 5 start with:
  // [- - - - 10MB]
  // with the base level being level 5. Target sizes of levels 1 to 4 are
  // not applicable because they will not be used.
  // Once the size of level 5 grows to more than 10MB, say 11MB, we make
  // level 4 the base level and the targets look like:
  // [- - - 1.1MB 11MB]
  // As data accumulates, the size targets are tuned based on the actual
  // data in level 5. When level 5 has 50MB of data, the targets are:
  // [- - - 5MB 50MB]
  // This continues until level 5's actual size exceeds 100MB, say 101MB.
  // If we kept level 4 as the base level, its target size would need to be
  // 10.1MB, which doesn't satisfy the target size range. So we make level 3
  // the base level and the target sizes of the levels look like:
  // [- - 1.01MB 10.1MB 101MB]
  // In the same way, as level 5 grows further, all levels' targets grow,
  // like
  // [- - 5MB 50MB 500MB]
  // Once level 5 exceeds 1000MB and becomes 1001MB, we make level 2 the
  // base level and the levels' target sizes become:
  // [- 1.001MB 10.01MB 100.1MB 1001MB]
  // and so on...
  //
  // By doing this, we give max_bytes_for_level_multiplier priority over
  // max_bytes_for_level_base, for a more predictable LSM tree shape. It is
  // useful to limit worst-case space amplification.
  //
  // max_bytes_for_level_multiplier_additional is ignored with this flag on.
  //
  // Turning this feature on or off for an existing DB can cause an
  // unexpected LSM tree structure, so it's not recommended.
  //
  // NOTE: this option is experimental
  //
  // Default: false
  bool level_compaction_dynamic_level_bytes = false;

  // Default: 10.
  //
  // Dynamically changeable through SetOptions() API
  double max_bytes_for_level_multiplier = 10;

  // Different max-size multipliers for different levels.
  // These are multiplied by max_bytes_for_level_multiplier to arrive
  // at the max-size of each level.
  //
  // Default: 1
  //
  // Dynamically changeable through SetOptions() API
  std::vector<int> max_bytes_for_level_multiplier_additional =
      std::vector<int>(num_levels, 1);
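
  // For illustration: with the additional per-level multipliers left at
  // their default of 1, the max size of level L is roughly
  //
  //   max_bytes_for_level_base * (max_bytes_for_level_multiplier ^ (L-1))
  //
  // e.g. a 256MB base and a multiplier of 10 give ~256MB for level-1,
  // ~2.5GB for level-2 and ~25GB for level-3 (a sketch; the actual sizing
  // also depends on level_compaction_dynamic_level_bytes above).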

  // We try to limit the number of bytes in one compaction to be lower than
  // this threshold. But it's not guaranteed.
  // Value 0 will be sanitized.
  //
  // Default: result.target_file_size_base * 25
  uint64_t max_compaction_bytes = 0;

  // All writes will be slowed down to at least delayed_write_rate if the
  // estimated bytes awaiting compaction exceed this threshold.
  //
  // Default: 64GB
  uint64_t soft_pending_compaction_bytes_limit = 64 * 1073741824ull;

  // All writes are stopped if the estimated bytes awaiting compaction
  // exceed this threshold.
  //
  // Default: 256GB
  uint64_t hard_pending_compaction_bytes_limit = 256 * 1073741824ull;

  // The compaction style. Default: kCompactionStyleLevel
  CompactionStyle compaction_style = kCompactionStyleLevel;

  // If compaction_style == kCompactionStyleLevel, for each level,
  // this determines which files are prioritized to be picked to compact.
  // Default: kByCompensatedSize
  CompactionPri compaction_pri = kByCompensatedSize;

  // The options needed to support Universal Style compactions
  CompactionOptionsUniversal compaction_options_universal;

  // The options for FIFO compaction style
  CompactionOptionsFIFO compaction_options_fifo;

  // An iterator's Next() call sequentially skips over keys with the same
  // user-key unless this option is set. This number specifies the number
  // of keys (with the same userkey) that will be sequentially
  // skipped before a reseek is issued.
  //
  // Default: 8
  //
  // Dynamically changeable through SetOptions() API
  uint64_t max_sequential_skip_in_iterations = 8;

  // This is a factory that provides MemTableRep objects.
  // Default: a factory that provides a skip-list-based implementation of
  // MemTableRep.
  std::shared_ptr<MemTableRepFactory> memtable_factory =
      std::shared_ptr<SkipListFactory>(new SkipListFactory);
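
  // Example: a minimal sketch of installing a memtable factory explicitly
  // (shown with the default skip-list factory purely for illustration):
  //
  //   rocksdb::Options options;
  //   options.memtable_factory.reset(new rocksdb::SkipListFactory());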

  // Block-based table related options are moved to BlockBasedTableOptions.
  // Related options that were originally here but are now moved include:
  //   no_block_cache
  //   block_cache
  //   block_cache_compressed
  //   block_size
  //   block_size_deviation
  //   block_restart_interval
  //   filter_policy
  //   whole_key_filtering
  // If you'd like to customize some of these options, you will need to
  // use NewBlockBasedTableFactory() to construct a new table factory.

  // This option allows users to collect their own statistics of interest
  // from the tables.
  // Default: empty vector -- no user-defined statistics collection will be
  // performed.
  typedef std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
      TablePropertiesCollectorFactories;
  TablePropertiesCollectorFactories table_properties_collector_factories;

  // Maximum number of successive merge operations on a key in the memtable.
  //
  // When a merge operation is added to the memtable and the maximum number
  // of successive merges is reached, the value of the key will be
  // calculated and inserted into the memtable instead of the merge
  // operation. This will ensure that there are never more than
  // max_successive_merges merge operations in the memtable.
  //
  // Default: 0 (disabled)
  //
  // Dynamically changeable through SetOptions() API
  size_t max_successive_merges = 0;

  // This flag specifies that the implementation should optimize the filters
  // mainly for cases where keys are found rather than also optimizing for
  // keys missed. This would be used in cases where the application knows
  // that there are very few misses or the performance in the case of
  // misses is not important.
  //
  // For now, this flag allows us to not store filters for the last level,
  // i.e. the largest level which contains data of the LSM store. For keys
  // which are hits, the filters in this level are not useful because we
  // will search for the data anyway. NOTE: the filters in other levels are
  // still useful even for key hits because they tell us whether to look in
  // that level or go to the higher level.
  //
  // Default: false
  bool optimize_filters_for_hits = false;

  // After writing every SST file, reopen it and read all the keys.
  // Default: false
  bool paranoid_file_checks = false;

  // In debug mode, RocksDB runs consistency checks on the LSM every time
  // the LSM changes (Flush, Compaction, AddFile). These checks are disabled
  // in release mode; use this option to enable them in release mode as
  // well.
  // Default: false
  bool force_consistency_checks = false;

  // Measure IO stats in compactions and flushes, if true.
  // Default: false
  bool report_bg_io_stats = false;

  // Create AdvancedColumnFamilyOptions with default values for all fields
  AdvancedColumnFamilyOptions();
  // Create AdvancedColumnFamilyOptions from Options
  explicit AdvancedColumnFamilyOptions(const Options& options);

  // ---------------- OPTIONS NOT SUPPORTED ANYMORE ----------------

  // NOT SUPPORTED ANYMORE
  // This does not do anything anymore.
  int max_mem_compaction_level;

  // NOT SUPPORTED ANYMORE -- this option is no longer used.
  // Puts are delayed to options.delayed_write_rate when any level has a
  // compaction score that exceeds soft_rate_limit. This is ignored when
  // == 0.0.
  //
  // Default: 0 (disabled)
  //
  // Dynamically changeable through SetOptions() API
  double soft_rate_limit = 0.0;

  // NOT SUPPORTED ANYMORE -- this option is no longer used.
  double hard_rate_limit = 0.0;

  // NOT SUPPORTED ANYMORE -- this option is no longer used.
  unsigned int rate_limit_delay_max_milliseconds = 100;

  // NOT SUPPORTED ANYMORE
  // Does not have any effect.
  bool purge_redundant_kvs_while_flush = true;
};

}  // namespace rocksdb