]>
Commit | Line | Data |
---|---|---|
7c673cae | 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
11fdf7f2 TL |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License | |
4 | // (found in the LICENSE.Apache file in the root directory). | |
7c673cae FG |
5 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
6 | // Use of this source code is governed by a BSD-style license that can be | |
7 | // found in the LICENSE file. See the AUTHORS file for names of contributors. | |
8 | ||
9 | #pragma once | |
10 | ||
11 | #include <memory> | |
12 | ||
13 | #include "rocksdb/memtablerep.h" | |
14 | #include "rocksdb/universal_compaction.h" | |
15 | ||
f67539c2 | 16 | namespace ROCKSDB_NAMESPACE { |
7c673cae FG |
17 | |
18 | class Slice; | |
19 | class SliceTransform; | |
20 | enum CompressionType : unsigned char; | |
21 | class TablePropertiesCollectorFactory; | |
22 | class TableFactory; | |
23 | struct Options; | |
24 | ||
// Strategy used to pick files for background compaction.
enum CompactionStyle : char {
  // Classic level-based compaction.
  kCompactionStyleLevel = 0,
  // Universal compaction style.
  // Not supported in ROCKSDB_LITE.
  kCompactionStyleUniversal = 1,
  // FIFO compaction style.
  // Not supported in ROCKSDB_LITE
  kCompactionStyleFIFO = 2,
  // No background compaction at all; compaction jobs must be submitted
  // manually via CompactFiles().
  // Not supported in ROCKSDB_LITE
  kCompactionStyleNone = 3,
};
39 | ||
11fdf7f2 | 40 | // In Level-based compaction, it determines which file from a level will be |
7c673cae FG |
41 | // picked to merge into the next level. We suggest people try |
42 | // kMinOverlappingRatio first when you tune your database. |
enum CompactionPri : char {
  // Give a slight preference to larger files, using size compensated by
  // the number of deletes.
  kByCompensatedSize = 0,
  // Compact first the files whose data has gone the longest without being
  // updated. Try this if you only update some hot keys in small ranges.
  kOldestLargestSeqFirst = 1,
  // Compact first the files whose key range has gone the longest without
  // being compacted to the next level. If your updates are random across
  // the key space, write amplification is slightly better with this option.
  kOldestSmallestSeqFirst = 2,
  // Compact first the files with the smallest ratio of overlapping size in
  // the next level to their own size. In many cases this optimizes write
  // amplification.
  kMinOverlappingRatio = 3,
};
58 | ||
// Options for the FIFO compaction style.
struct CompactionOptionsFIFO {
  // Once the total size of all table files reaches this, we will delete the
  // oldest table file.
  // Default: 1GB
  //
  // NOTE: previously this default lived only in the default constructor while
  // `allow_compaction` used an in-class initializer; both defaults are now
  // in-class initializers for consistency, and the constant is computed in
  // 64-bit arithmetic to make the width explicit.
  uint64_t max_table_files_size = 1024ull * 1024 * 1024;

  // If true, try to do compaction to compact smaller files into larger ones.
  // The minimum number of files to compact follows
  // options.level0_file_num_compaction_trigger, and compaction won't trigger
  // if the average compacted bytes per deleted file is larger than
  // options.write_buffer_size. This is to protect large files from being
  // compacted again.
  // Default: false
  bool allow_compaction = false;

  CompactionOptionsFIFO() = default;
  CompactionOptionsFIFO(uint64_t _max_table_files_size, bool _allow_compaction)
      : max_table_files_size(_max_table_files_size),
        allow_compaction(_allow_compaction) {}
};
78 | ||
79 | // Compression options for different compression algorithms like Zlib | |
// Compression options for different compression algorithms like Zlib
struct CompressionOptions {
  // RocksDB's generic default compression level. Internally it'll be
  // translated to the default compression level specific to the library being
  // used (see comment above `ColumnFamilyOptions::compression`).
  //
  // The default value is the max 16-bit int as it'll be written out in the
  // OPTIONS file, which should be portable.
  static constexpr int kDefaultCompressionLevel = 32767;

  // Parameters passed through to the underlying compression library.
  // Defaults are now expressed as in-class initializers (instead of being
  // duplicated in the default constructor) so there is a single source of
  // truth for each default value.
  int window_bits = -14;
  int level = kDefaultCompressionLevel;
  int strategy = 0;

  // Maximum size of dictionaries used to prime the compression library.
  // Enabling dictionary can improve compression ratios when there are
  // repetitions across data blocks.
  //
  // The dictionary is created by sampling the SST file data. If
  // `zstd_max_train_bytes` is nonzero, the samples are passed through zstd's
  // dictionary generator. Otherwise, the random samples are used directly as
  // the dictionary.
  //
  // When compression dictionary is disabled, we compress and write each block
  // before buffering data for the next one. When compression dictionary is
  // enabled, we buffer all SST file data in-memory so we can sample it, as
  // data can only be compressed and written after the dictionary has been
  // finalized. So users of this feature may see increased memory usage.
  //
  // Default: 0.
  uint32_t max_dict_bytes = 0;

  // Maximum size of training data passed to zstd's dictionary trainer. Using
  // zstd's dictionary trainer can achieve even better compression ratio
  // improvements than using `max_dict_bytes` alone.
  //
  // The training data will be used to generate a dictionary of
  // max_dict_bytes.
  //
  // Default: 0.
  uint32_t zstd_max_train_bytes = 0;

  // When the compression options are set by the user, it will be set to
  // "true". For bottommost_compression_opts, to enable it, user must set
  // enabled=true. Otherwise, bottommost compression will use compression_opts
  // as default compression options.
  //
  // For compression_opts, if compression_opts.enabled=false, it is still
  // used as compression options for the compression process.
  //
  // Default: false.
  bool enabled = false;

  CompressionOptions() = default;
  CompressionOptions(int wbits, int _lev, int _strategy, int _max_dict_bytes,
                     int _zstd_max_train_bytes, bool _enabled)
      : window_bits(wbits),
        level(_lev),
        strategy(_strategy),
        // The members are unsigned while the (historically `int`) parameters
        // are signed; cast explicitly rather than rely on an implicit
        // sign conversion.
        max_dict_bytes(static_cast<uint32_t>(_max_dict_bytes)),
        zstd_max_train_bytes(static_cast<uint32_t>(_zstd_max_train_bytes)),
        enabled(_enabled) {}
};
147 | ||
// Return status for the inplace-update callback.
enum UpdateStatus {
  UPDATE_FAILED = 0,    // Nothing to update
  UPDATED_INPLACE = 1,  // Value updated in place
  UPDATED = 2,          // No inplace update; merged value set
};
153 | ||
7c673cae FG |
154 | struct AdvancedColumnFamilyOptions { |
155 | // The maximum number of write buffers that are built up in memory. | |
156 | // The default and the minimum number is 2, so that when 1 write buffer | |
157 | // is being flushed to storage, new writes can continue to the other | |
158 | // write buffer. | |
159 | // If max_write_buffer_number > 3, writing will be slowed down to | |
160 | // options.delayed_write_rate if we are writing to the last write buffer | |
161 | // allowed. | |
162 | // | |
163 | // Default: 2 | |
164 | // | |
165 | // Dynamically changeable through SetOptions() API | |
166 | int max_write_buffer_number = 2; | |
167 | ||
168 | // The minimum number of write buffers that will be merged together | |
169 | // before writing to storage. If set to 1, then | |
170 | // all write buffers are flushed to L0 as individual files and this increases | |
171 | // read amplification because a get request has to check in all of these | |
172 | // files. Also, an in-memory merge may result in writing lesser | |
173 | // data to storage if there are duplicate records in each of these | |
174 | // individual write buffers. Default: 1 | |
175 | int min_write_buffer_number_to_merge = 1; | |
176 | ||
f67539c2 | 177 | // DEPRECATED |
7c673cae FG |
178 | // The total maximum number of write buffers to maintain in memory including |
179 | // copies of buffers that have already been flushed. Unlike | |
180 | // max_write_buffer_number, this parameter does not affect flushing. | |
f67539c2 TL |
181 | // This parameter is being replaced by max_write_buffer_size_to_maintain. |
182 | // If both parameters are set to non-zero values, this parameter will be | |
183 | // ignored. | |
184 | int max_write_buffer_number_to_maintain = 0; | |
185 | ||
186 | // The total maximum size(bytes) of write buffers to maintain in memory | |
187 | // including copies of buffers that have already been flushed. This parameter | |
188 | // only affects trimming of flushed buffers and does not affect flushing. | |
189 | // This controls the maximum amount of write history that will be available | |
190 | // in memory for conflict checking when Transactions are used. The actual | |
191 | // size of write history (flushed Memtables) might be higher than this limit | |
192 | // if further trimming will reduce write history total size below this | |
193 | // limit. For example, if max_write_buffer_size_to_maintain is set to 64MB, | |
194 | // and there are three flushed Memtables, with sizes of 32MB, 20MB, 20MB. | |
195 | // Because trimming the next Memtable of size 20MB will reduce total memory | |
196 | // usage to 52MB which is below the limit, RocksDB will stop trimming. | |
7c673cae FG |
197 | // |
198 | // When using an OptimisticTransactionDB: | |
199 | // If this value is too low, some transactions may fail at commit time due | |
200 | // to not being able to determine whether there were any write conflicts. | |
201 | // | |
202 | // When using a TransactionDB: | |
203 | // If Transaction::SetSnapshot is used, TransactionDB will read either | |
204 | // in-memory write buffers or SST files to do write-conflict checking. | |
205 | // Increasing this value can reduce the number of reads to SST files | |
206 | // done for conflict detection. | |
207 | // | |
208 | // Setting this value to 0 will cause write buffers to be freed immediately | |
f67539c2 TL |
209 | // after they are flushed. If this value is set to -1, |
210 | // 'max_write_buffer_number * write_buffer_size' will be used. | |
7c673cae FG |
211 | // |
212 | // Default: | |
213 | // If using a TransactionDB/OptimisticTransactionDB, the default value will | |
f67539c2 TL |
214 | // be set to the value of 'max_write_buffer_number * write_buffer_size' |
215 | // if it is not explicitly set by the user. Otherwise, the default is 0. | |
216 | int64_t max_write_buffer_size_to_maintain = 0; | |
7c673cae FG |
217 | |
218 | // Allows thread-safe inplace updates. If this is true, there is no way to | |
219 | // achieve point-in-time consistency using snapshot or iterator (assuming | |
220 | // concurrent updates). Hence iterator and multi-get will return results | |
221 | // which are not consistent as of any point-in-time. | |
222 | // If inplace_callback function is not set, | |
223 | // Put(key, new_value) will update inplace the existing_value iff | |
224 | // * key exists in current memtable | |
225 | // * new sizeof(new_value) <= sizeof(existing_value) | |
226 | // * existing_value for that key is a put i.e. kTypeValue | |
227 | // If inplace_callback function is set, check doc for inplace_callback. | |
228 | // Default: false. | |
229 | bool inplace_update_support = false; | |
230 | ||
231 | // Number of locks used for inplace update | |
232 | // Default: 10000, if inplace_update_support = true, else 0. | |
233 | // | |
234 | // Dynamically changeable through SetOptions() API | |
235 | size_t inplace_update_num_locks = 10000; | |
236 | ||
237 | // existing_value - pointer to previous value (from both memtable and sst). | |
238 | // nullptr if key doesn't exist | |
239 | // existing_value_size - pointer to size of existing_value). | |
240 | // nullptr if key doesn't exist | |
241 | // delta_value - Delta value to be merged with the existing_value. | |
242 | // Stored in transaction logs. | |
243 | // merged_value - Set when delta is applied on the previous value. | |
244 | ||
245 | // Applicable only when inplace_update_support is true, | |
246 | // this callback function is called at the time of updating the memtable | |
247 | // as part of a Put operation, lets say Put(key, delta_value). It allows the | |
248 | // 'delta_value' specified as part of the Put operation to be merged with | |
249 | // an 'existing_value' of the key in the database. | |
250 | ||
251 | // If the merged value is smaller in size than the 'existing_value', |
252 | // then this function can update the 'existing_value' buffer inplace and | |
253 | // the corresponding 'existing_value'_size pointer, if it wishes to. | |
254 | // The callback should return UpdateStatus::UPDATED_INPLACE. | |
255 | // In this case. (In this case, the snapshot-semantics of the rocksdb | |
256 | // Iterator is not atomic anymore). | |
257 | ||
258 | // If the merged value is larger in size than the 'existing_value' or the | |
259 | // application does not wish to modify the 'existing_value' buffer inplace, | |
260 | // then the merged value should be returned via *merge_value. It is set by | |
261 | // merging the 'existing_value' and the Put 'delta_value'. The callback should | |
262 | // return UpdateStatus::UPDATED in this case. This merged value will be added | |
263 | // to the memtable. | |
264 | ||
265 | // If merging fails or the application does not wish to take any action, | |
266 | // then the callback should return UpdateStatus::UPDATE_FAILED. | |
267 | ||
268 | // Please remember that the original call from the application is Put(key, | |
269 | // delta_value). So the transaction log (if enabled) will still contain (key, | |
270 | // delta_value). The 'merged_value' is not stored in the transaction log. | |
271 | // Hence the inplace_callback function should be consistent across db reopens. | |
272 | ||
273 | // Default: nullptr | |
274 | UpdateStatus (*inplace_callback)(char* existing_value, | |
275 | uint32_t* existing_value_size, | |
276 | Slice delta_value, | |
277 | std::string* merged_value) = nullptr; | |
278 | ||
279 | // if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0, | |
280 | // create prefix bloom for memtable with the size of | |
281 | // write_buffer_size * memtable_prefix_bloom_size_ratio. | |
11fdf7f2 | 282 | // If it is larger than 0.25, it is sanitized to 0.25. |
7c673cae FG |
283 | // |
284 | // Default: 0 (disable) | |
285 | // | |
286 | // Dynamically changeable through SetOptions() API | |
287 | double memtable_prefix_bloom_size_ratio = 0.0; | |
288 | ||
494da23a TL |
289 | // Enable whole key bloom filter in memtable. Note this will only take effect |
290 | // if memtable_prefix_bloom_size_ratio is not 0. Enabling whole key filtering | |
291 | // can potentially reduce CPU usage for point-look-ups. | |
292 | // | |
293 | // Default: false (disable) | |
294 | // | |
295 | // Dynamically changeable through SetOptions() API | |
296 | bool memtable_whole_key_filtering = false; | |
297 | ||
7c673cae FG |
298 | // Page size for huge page for the arena used by the memtable. If <=0, it |
299 | // won't allocate from huge page but from malloc. | |
300 | // Users are responsible to reserve huge pages for it to be allocated. For | |
301 | // example: | |
302 | // sysctl -w vm.nr_hugepages=20 | |
303 | // See linux doc Documentation/vm/hugetlbpage.txt | |
304 | // If there isn't enough free huge page available, it will fall back to | |
305 | // malloc. | |
306 | // | |
307 | // Dynamically changeable through SetOptions() API | |
308 | size_t memtable_huge_page_size = 0; | |
309 | ||
310 | // If non-nullptr, memtable will use the specified function to extract | |
311 | // prefixes for keys, and for each prefix maintain a hint of insert location | |
312 | // to reduce CPU usage for inserting keys with the prefix. Keys out of | |
313 | // domain of the prefix extractor will be inserted without using hints. |
314 | // | |
315 | // Currently only the default skiplist based memtable implements the feature. | |
316 | // All other memtable implementation will ignore the option. It incurs ~250 | |
317 | // additional bytes of memory overhead to store a hint for each prefix. | |
318 | // Also concurrent writes (when allow_concurrent_memtable_write is true) will | |
319 | // ignore the option. | |
320 | // | |
321 | // The option is best suited for workloads where keys will likely to insert | |
11fdf7f2 | 322 | // to a location close the last inserted key with the same prefix. |
7c673cae FG |
323 | // One example could be inserting keys of the form (prefix + timestamp), |
324 | // and keys of the same prefix always comes in with time order. Another | |
325 | // example would be updating the same key over and over again, in which case | |
326 | // the prefix can be the key itself. | |
327 | // | |
328 | // Default: nullptr (disable) | |
329 | std::shared_ptr<const SliceTransform> | |
330 | memtable_insert_with_hint_prefix_extractor = nullptr; | |
331 | ||
f67539c2 TL |
332 | // Control locality of bloom filter probes to improve CPU cache hit rate. |
333 | // This option now only applies to plaintable prefix bloom. This | |
334 | // optimization is turned off when set to 0, and positive number to turn | |
7c673cae FG |
335 | // it on. |
336 | // Default: 0 | |
337 | uint32_t bloom_locality = 0; | |
338 | ||
339 | // size of one block in arena memory allocation. | |
340 | // If <= 0, a proper value is automatically calculated (usually 1/8 of | |
341 | // writer_buffer_size, rounded up to a multiple of 4KB). | |
342 | // | |
11fdf7f2 | 343 | // There are two additional restrictions on the specified size: |
7c673cae FG |
344 | // (1) size should be in the range of [4096, 2 << 30] and |
345 | // (2) be the multiple of the CPU word (which helps with the memory | |
346 | // alignment). | |
347 | // | |
348 | // We'll automatically check and adjust the size number to make sure it | |
349 | // conforms to the restrictions. | |
350 | // | |
351 | // Default: 0 | |
352 | // | |
353 | // Dynamically changeable through SetOptions() API | |
354 | size_t arena_block_size = 0; | |
355 | ||
356 | // Different levels can have different compression policies. There | |
357 | // are cases where most lower levels would like to use quick compression | |
358 | // algorithms while the higher levels (which have more data) use | |
359 | // compression algorithms that have better compression but could | |
360 | // be slower. This array, if non-empty, should have an entry for | |
361 | // each level of the database; these override the value specified in | |
362 | // the previous field 'compression'. | |
363 | // | |
364 | // NOTICE if level_compaction_dynamic_level_bytes=true, | |
365 | // compression_per_level[0] still determines L0, but other elements | |
366 | // of the array are based on base level (the level L0 files are merged | |
367 | // to), and may not match the level users see from info log for metadata. | |
368 | // If L0 files are merged to level-n, then, for i>0, compression_per_level[i] | |
369 | // determines compaction type for level n+i-1. | |
370 | // For example, if we have 5 levels, and we determine to merge L0 |
371 | // data to L4 (which means L1..L3 will be empty), then the new files go to | |
372 | // L4 uses compression type compression_per_level[1]. | |
373 | // If now L0 is merged to L2. Data goes to L2 will be compressed | |
374 | // according to compression_per_level[1], L3 using compression_per_level[2] | |
375 | // and L4 using compression_per_level[3]. Compaction for each level can | |
376 | // change when data grows. | |
377 | std::vector<CompressionType> compression_per_level; | |
378 | ||
379 | // Number of levels for this database | |
380 | int num_levels = 7; | |
381 | ||
382 | // Soft limit on number of level-0 files. We start slowing down writes at this | |
383 | // point. A value <0 means that no writing slow down will be triggered by | |
384 | // number of files in level-0. | |
385 | // | |
386 | // Default: 20 | |
387 | // | |
388 | // Dynamically changeable through SetOptions() API | |
389 | int level0_slowdown_writes_trigger = 20; | |
390 | ||
391 | // Maximum number of level-0 files. We stop writes at this point. | |
392 | // | |
393 | // Default: 36 | |
394 | // | |
395 | // Dynamically changeable through SetOptions() API | |
396 | int level0_stop_writes_trigger = 36; | |
397 | ||
398 | // Target file size for compaction. | |
399 | // target_file_size_base is per-file size for level-1. | |
400 | // Target file size for level L can be calculated by | |
401 | // target_file_size_base * (target_file_size_multiplier ^ (L-1)) | |
402 | // For example, if target_file_size_base is 2MB and | |
403 | // target_file_size_multiplier is 10, then each file on level-1 will | |
404 | // be 2MB, and each file on level 2 will be 20MB, | |
405 | // and each file on level-3 will be 200MB. | |
406 | // | |
407 | // Default: 64MB. | |
408 | // | |
409 | // Dynamically changeable through SetOptions() API | |
410 | uint64_t target_file_size_base = 64 * 1048576; | |
411 | ||
412 | // By default target_file_size_multiplier is 1, which means | |
413 | // by default files in different levels will have similar size. | |
414 | // | |
415 | // Dynamically changeable through SetOptions() API | |
416 | int target_file_size_multiplier = 1; | |
417 | ||
418 | // If true, RocksDB will pick target size of each level dynamically. | |
419 | // We will pick a base level b >= 1. L0 will be directly merged into level b, | |
420 | // instead of always into level 1. Level 1 to b-1 need to be empty. | |
421 | // We try to pick b and its target size so that | |
422 | // 1. target size is in the range of | |
423 | // (max_bytes_for_level_base / max_bytes_for_level_multiplier, | |
424 | // max_bytes_for_level_base] | |
425 | // 2. target size of the last level (level num_levels-1) equals to extra size | |
426 | // of the level. | |
427 | // At the same time max_bytes_for_level_multiplier and | |
428 | // max_bytes_for_level_multiplier_additional are still satisfied. | |
494da23a | 429 | // (When L0 is too large, we make some adjustment. See below.) |
7c673cae FG |
430 | // |
431 | // With this option on, from an empty DB, we make last level the base level, | |
432 | // which means merging L0 data into the last level, until it exceeds | |
433 | // max_bytes_for_level_base. And then we make the second last level to be | |
434 | // base level, to start to merge L0 data to second last level, with its | |
435 | // target size to be 1/max_bytes_for_level_multiplier of the last level's | |
436 | // extra size. After the data accumulates more so that we need to move the | |
437 | // base level to the third last one, and so on. | |
438 | // | |
439 | // For example, assume max_bytes_for_level_multiplier=10, num_levels=6, | |
440 | // and max_bytes_for_level_base=10MB. | |
441 | // Target sizes of level 1 to 5 starts with: | |
442 | // [- - - - 10MB] | |
443 | // with base level is level. Target sizes of level 1 to 4 are not applicable | |
444 | // because they will not be used. | |
445 | // Until the size of Level 5 grows to more than 10MB, say 11MB, we make | |
446 | // base target to level 4 and now the targets looks like: | |
447 | // [- - - 1.1MB 11MB] | |
448 | // While data are accumulated, size targets are tuned based on actual data | |
449 | // of level 5. When level 5 has 50MB of data, the target is like: | |
450 | // [- - - 5MB 50MB] | |
451 | // Until level 5's actual size is more than 100MB, say 101MB. Now if we keep | |
452 | // level 4 to be the base level, its target size needs to be 10.1MB, which | |
453 | // doesn't satisfy the target size range. So now we make level 3 the target | |
454 | // size and the target sizes of the levels look like: | |
455 | // [- - 1.01MB 10.1MB 101MB] | |
456 | // In the same way, while level 5 further grows, all levels' targets grow, | |
457 | // like | |
458 | // [- - 5MB 50MB 500MB] | |
459 | // Until level 5 exceeds 1000MB and becomes 1001MB, we make level 2 the | |
460 | // base level and make levels' target sizes like this: | |
461 | // [- 1.001MB 10.01MB 100.1MB 1001MB] | |
462 | // and go on... | |
463 | // | |
464 | // By doing it, we give max_bytes_for_level_multiplier a priority against | |
465 | // max_bytes_for_level_base, for a more predictable LSM tree shape. It is | |
466 | // useful to limit worse case space amplification. | |
467 | // | |
494da23a TL |
468 | // |
469 | // If the compaction from L0 is lagged behind, a special mode will be turned | |
470 | // on to prioritize write amplification against max_bytes_for_level_multiplier | |
471 | // or max_bytes_for_level_base. The L0 compaction is lagged behind by looking | |
472 | // at number of L0 files and total L0 size. If number of L0 files is at least | |
473 | // the double of level0_file_num_compaction_trigger, or the total size is | |
474 | // at least max_bytes_for_level_base, this mode is on. The target of L1 grows | |
475 | // to the actual data size in L0, and then determine the target for each level | |
476 | // so that each level will have the same level multiplier. | |
477 | // | |
478 | // For example, when L0 size is 100MB, the size of last level is 1600MB, | |
479 | // max_bytes_for_level_base = 80MB, and max_bytes_for_level_multiplier = 10. | |
480 | // Since L0 size is larger than max_bytes_for_level_base, this is a L0 | |
481 | // compaction backlogged mode. So that the L1 size is determined to be 100MB. | |
482 | // Based on max_bytes_for_level_multiplier = 10, at least 3 non-0 levels will | |
483 | // be needed. The level multiplier will be calculated to be 4 and the three | |
484 | // levels' target to be [100MB, 400MB, 1600MB]. | |
485 | // | |
486 | // In this mode, The number of levels will be no more than the normal mode, | |
487 | // and the level multiplier will be lower. The write amplification will | |
488 | // likely to be reduced. | |
489 | // | |
490 | // | |
7c673cae FG |
491 | // max_bytes_for_level_multiplier_additional is ignored with this flag on. |
492 | // | |
493 | // Turning this feature on or off for an existing DB can cause unexpected | |
494 | // LSM tree structure so it's not recommended. | |
495 | // | |
7c673cae FG |
496 | // Default: false |
497 | bool level_compaction_dynamic_level_bytes = false; | |
498 | ||
499 | // Default: 10. | |
500 | // | |
501 | // Dynamically changeable through SetOptions() API | |
502 | double max_bytes_for_level_multiplier = 10; | |
503 | ||
504 | // Different max-size multipliers for different levels. | |
505 | // These are multiplied by max_bytes_for_level_multiplier to arrive | |
506 | // at the max-size of each level. | |
507 | // | |
508 | // Default: 1 | |
509 | // | |
510 | // Dynamically changeable through SetOptions() API | |
511 | std::vector<int> max_bytes_for_level_multiplier_additional = | |
512 | std::vector<int>(num_levels, 1); | |
513 | ||
514 | // We try to limit number of bytes in one compaction to be lower than this | |
515 | // threshold. But it's not guaranteed. | |
516 | // Value 0 will be sanitized. | |
517 | // | |
494da23a TL |
518 | // Default: target_file_size_base * 25 |
519 | // | |
520 | // Dynamically changeable through SetOptions() API | |
7c673cae FG |
521 | uint64_t max_compaction_bytes = 0; |
522 | ||
523 | // All writes will be slowed down to at least delayed_write_rate if estimated | |
524 | // bytes needed to be compaction exceed this threshold. | |
525 | // | |
526 | // Default: 64GB | |
494da23a TL |
527 | // |
528 | // Dynamically changeable through SetOptions() API | |
7c673cae FG |
529 | uint64_t soft_pending_compaction_bytes_limit = 64 * 1073741824ull; |
530 | ||
531 | // All writes are stopped if estimated bytes needed to be compaction exceed | |
532 | // this threshold. | |
533 | // | |
534 | // Default: 256GB | |
494da23a TL |
535 | // |
536 | // Dynamically changeable through SetOptions() API | |
7c673cae FG |
537 | uint64_t hard_pending_compaction_bytes_limit = 256 * 1073741824ull; |
538 | ||
539 | // The compaction style. Default: kCompactionStyleLevel | |
540 | CompactionStyle compaction_style = kCompactionStyleLevel; | |
541 | ||
542 | // If level compaction_style = kCompactionStyleLevel, for each level, | |
543 | // which files are prioritized to be picked to compact. | |
494da23a TL |
544 | // Default: kMinOverlappingRatio |
545 | CompactionPri compaction_pri = kMinOverlappingRatio; | |
7c673cae FG |
546 | |
547 | // The options needed to support Universal Style compactions | |
494da23a TL |
548 | // |
549 | // Dynamically changeable through SetOptions() API | |
550 | // Dynamic change example: | |
551 | // SetOptions("compaction_options_universal", "{size_ratio=2;}") | |
7c673cae FG |
552 | CompactionOptionsUniversal compaction_options_universal; |
553 | ||
554 | // The options for FIFO compaction style | |
11fdf7f2 TL |
555 | // |
556 | // Dynamically changeable through SetOptions() API | |
557 | // Dynamic change example: | |
494da23a | 558 | // SetOptions("compaction_options_fifo", "{max_table_files_size=100;}") |
7c673cae FG |
559 | CompactionOptionsFIFO compaction_options_fifo; |
560 | ||
561 | // An iteration->Next() sequentially skips over keys with the same | |
562 | // user-key unless this option is set. This number specifies the number | |
563 | // of keys (with the same userkey) that will be sequentially | |
564 | // skipped before a reseek is issued. | |
565 | // | |
566 | // Default: 8 | |
567 | // | |
568 | // Dynamically changeable through SetOptions() API | |
569 | uint64_t max_sequential_skip_in_iterations = 8; | |
570 | ||
571 | // This is a factory that provides MemTableRep objects. | |
572 | // Default: a factory that provides a skip-list-based implementation of | |
573 | // MemTableRep. | |
574 | std::shared_ptr<MemTableRepFactory> memtable_factory = | |
575 | std::shared_ptr<SkipListFactory>(new SkipListFactory); | |
576 | ||
577 | // Block-based table related options are moved to BlockBasedTableOptions. | |
578 | // Related options that were originally here but now moved include: | |
579 | // no_block_cache | |
580 | // block_cache | |
581 | // block_cache_compressed | |
582 | // block_size | |
583 | // block_size_deviation | |
584 | // block_restart_interval | |
585 | // filter_policy | |
586 | // whole_key_filtering | |
587 | // If you'd like to customize some of these options, you will need to | |
588 | // use NewBlockBasedTableFactory() to construct a new table factory. | |
589 | ||
590 | // This option allows user to collect their own interested statistics of | |
591 | // the tables. | |
592 | // Default: empty vector -- no user-defined statistics collection will be | |
593 | // performed. | |
594 | typedef std::vector<std::shared_ptr<TablePropertiesCollectorFactory>> | |
595 | TablePropertiesCollectorFactories; | |
596 | TablePropertiesCollectorFactories table_properties_collector_factories; | |
597 | ||
598 | // Maximum number of successive merge operations on a key in the memtable. | |
599 | // | |
600 | // When a merge operation is added to the memtable and the maximum number of | |
601 | // successive merges is reached, the value of the key will be calculated and | |
602 | // inserted into the memtable instead of the merge operation. This will | |
603 | // ensure that there are never more than max_successive_merges merge | |
604 | // operations in the memtable. | |
605 | // | |
606 | // Default: 0 (disabled) | |
607 | // | |
608 | // Dynamically changeable through SetOptions() API | |
609 | size_t max_successive_merges = 0; | |
610 | ||
611 | // This flag specifies that the implementation should optimize the filters | |
612 | // mainly for cases where keys are found rather than also optimize for keys | |
613 | // missed. This would be used in cases where the application knows that | |
614 | // there are very few misses or the performance in the case of misses is not | |
615 | // important. | |
616 | // | |
617 | // For now, this flag allows us to not store filters for the last level i.e | |
618 | // the largest level which contains data of the LSM store. For keys which | |
619 | // are hits, the filters in this level are not useful because we will search | |
620 | // for the data anyway. NOTE: the filters in other levels are still useful | |
621 | // even for key hit because they tell us whether to look in that level or go | |
622 | // to the higher level. | |
623 | // | |
624 | // Default: false | |
625 | bool optimize_filters_for_hits = false; | |
626 | ||
627 | // After writing every SST file, reopen it and read all the keys. | |
494da23a | 628 | // |
7c673cae | 629 | // Default: false |
494da23a TL |
630 | // |
631 | // Dynamically changeable through SetOptions() API | |
7c673cae FG |
632 | bool paranoid_file_checks = false; |
633 | ||
11fdf7f2 | 634 | // In debug mode, RocksDB runs consistency checks on the LSM every time the LSM |
7c673cae FG |
635 | // changes (Flush, Compaction, AddFile). These checks are disabled in release |
636 | // mode, use this option to enable them in release mode as well. | |
637 | // Default: false | |
638 | bool force_consistency_checks = false; | |
639 | ||
640 | // Measure IO stats in compactions and flushes, if true. | |
494da23a | 641 | // |
7c673cae | 642 | // Default: false |
494da23a TL |
643 | // |
644 | // Dynamically changeable through SetOptions() API | |
7c673cae FG |
645 | bool report_bg_io_stats = false; |
646 | ||
494da23a | 647 | // Files older than TTL will go through the compaction process. |
494da23a TL |
648 | // Pre-req: This needs max_open_files to be set to -1. |
649 | // In Level: Non-bottom-level files older than TTL will go through the | |
650 | // compaction process. | |
651 | // In FIFO: Files older than TTL will be deleted. | |
652 | // unit: seconds. Ex: 1 day = 1 * 24 * 60 * 60 | |
f67539c2 TL |
653 | // In FIFO, this option will have the same meaning as |
654 | // periodic_compaction_seconds. Whichever is stricter will be used. | |
655 | // 0 means disabling. | |
656 | // UINT64_MAX - 1 (0xfffffffffffffffe) is special flag to allow RocksDB to | |
657 | // pick default. | |
11fdf7f2 | 658 | // |
f67539c2 TL |
659 | // Default: 30 days for leveled compaction + block based table. disable |
660 | // otherwise. | |
661 | // | |
662 | // Dynamically changeable through SetOptions() API | |
663 | uint64_t ttl = 0xfffffffffffffffe; | |
664 | ||
665 | // Files older than this value will be picked up for compaction, and | |
666 | // re-written to the same level as they were before. | |
667 | // | |
668 | // A file's age is computed by looking at file_creation_time or creation_time | |
669 | // table properties in order, if they have valid non-zero values; if not, the | |
670 | // age is based on the file's last modified time (given by the underlying | |
671 | // Env). | |
672 | // | |
673 | // Supported in Level and FIFO compaction. | |
674 | // In FIFO compaction, this option has the same meaning as TTL and whichever | |
675 | // stricter will be used. | |
676 | // Pre-req: max_open_files == -1. | |
677 | // unit: seconds. Ex: 7 days = 7 * 24 * 60 * 60 | |
678 | // | |
679 | // Values: | |
680 | // 0: Turn off Periodic compactions. | |
681 | // UINT64_MAX - 1 (i.e 0xfffffffffffffffe): Let RocksDB control this feature | |
682 | // as needed. For now, RocksDB will change this value to 30 days | |
683 | // (i.e 30 * 24 * 60 * 60) so that every file goes through the compaction | |
684 | // process at least once every 30 days if not compacted sooner. | |
685 | // In FIFO compaction, since the option has the same meaning as ttl, | |
686 | // when this value is left at its default and ttl is left at 0, 30 days will | |
687 | // used. Otherwise, min(ttl, periodic_compaction_seconds) will be used. | |
688 | // | |
689 | // Default: UINT64_MAX - 1 (allow RocksDB to auto-tune) | |
11fdf7f2 TL |
690 | // |
691 | // Dynamically changeable through SetOptions() API | |
f67539c2 | 692 | uint64_t periodic_compaction_seconds = 0xfffffffffffffffe; |
11fdf7f2 | 693 | |
494da23a TL |
694 | // If this option is set then 1 in N blocks are compressed |
695 | // using a fast (lz4) and slow (zstd) compression algorithm. | |
696 | // The compressibility is reported as stats and the stored | |
697 | // data is left uncompressed (unless compression is also requested). | |
698 | uint64_t sample_for_compression = 0; | |
699 | ||
7c673cae FG |
700 | // Create ColumnFamilyOptions with default values for all fields |
701 | AdvancedColumnFamilyOptions(); | |
702 | // Create ColumnFamilyOptions from Options | |
703 | explicit AdvancedColumnFamilyOptions(const Options& options); | |
704 | ||
705 | // ---------------- OPTIONS NOT SUPPORTED ANYMORE ---------------- | |
706 | ||
707 | // NOT SUPPORTED ANYMORE | |
708 | // This does not do anything anymore. | |
709 | int max_mem_compaction_level; | |
710 | ||
711 | // NOT SUPPORTED ANYMORE -- this option is no longer used | |
712 | // Puts are delayed to options.delayed_write_rate when any level has a | |
713 | // compaction score that exceeds soft_rate_limit. This is ignored when == 0.0. | |
714 | // | |
715 | // Default: 0 (disabled) | |
716 | // | |
717 | // Dynamically changeable through SetOptions() API | |
718 | double soft_rate_limit = 0.0; | |
719 | ||
720 | // NOT SUPPORTED ANYMORE -- this option is no longer used | |
721 | double hard_rate_limit = 0.0; | |
722 | ||
723 | // NOT SUPPORTED ANYMORE -- this option is no longer used | |
724 | unsigned int rate_limit_delay_max_milliseconds = 100; | |
725 | ||
726 | // NOT SUPPORTED ANYMORE | |
727 | // Does not have any effect. | |
728 | bool purge_redundant_kvs_while_flush = true; | |
729 | }; | |
730 | ||
f67539c2 | 731 | } // namespace ROCKSDB_NAMESPACE |