// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

package org.rocksdb;

import java.util.List;

/**
 * Advanced Column Family Options which are not
 * mutable (i.e. not present in
 * {@link AdvancedMutableColumnFamilyOptionsInterface}).
 *
 * Taken from include/rocksdb/advanced_options.h
 */
public interface AdvancedColumnFamilyOptionsInterface<
    T extends AdvancedColumnFamilyOptionsInterface<T>> {
  /**
   * The minimum number of write buffers that will be merged together
   * before writing to storage. If set to 1, then
   * all write buffers are flushed to L0 as individual files and this increases
   * read amplification because a get request has to check all of these
   * files. Also, an in-memory merge may result in writing less
   * data to storage if there are duplicate records in each of these
   * individual write buffers. Default: 1
   *
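   * <p>Illustrative usage sketch only (assumes the concrete
   * {@code ColumnFamilyOptions} implementation of this interface):</p>
   * <pre>{@code
   * try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) {
   *   // require at least two write buffers to be merged before a flush to L0
   *   cfOpts.setMinWriteBufferNumberToMerge(2);
   * }
   * }</pre>
   *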
   * @param minWriteBufferNumberToMerge the minimum number of write buffers
   *     that will be merged together.
   * @return the reference to the current options.
   */
  T setMinWriteBufferNumberToMerge(
      int minWriteBufferNumberToMerge);

  /**
   * The minimum number of write buffers that will be merged together
   * before writing to storage. If set to 1, then
   * all write buffers are flushed to L0 as individual files and this increases
   * read amplification because a get request has to check all of these
   * files. Also, an in-memory merge may result in writing less
   * data to storage if there are duplicate records in each of these
   * individual write buffers. Default: 1
   *
   * @return the minimum number of write buffers that will be merged together.
   */
  int minWriteBufferNumberToMerge();

  /**
   * The total maximum number of write buffers to maintain in memory including
   * copies of buffers that have already been flushed. Unlike
   * {@link AdvancedMutableColumnFamilyOptionsInterface#maxWriteBufferNumber()},
   * this parameter does not affect flushing.
   * This controls the minimum amount of write history that will be available
   * in memory for conflict checking when Transactions are used.
   *
   * When using an OptimisticTransactionDB:
   * If this value is too low, some transactions may fail at commit time due
   * to not being able to determine whether there were any write conflicts.
   *
   * When using a TransactionDB:
   * If Transaction::SetSnapshot is used, TransactionDB will read either
   * in-memory write buffers or SST files to do write-conflict checking.
   * Increasing this value can reduce the number of reads to SST files
   * done for conflict detection.
   *
   * Setting this value to 0 will cause write buffers to be freed immediately
   * after they are flushed.
   * If this value is set to -1,
   * {@link AdvancedMutableColumnFamilyOptionsInterface#maxWriteBufferNumber()}
   * will be used.
   *
   * Default:
   * If using a TransactionDB/OptimisticTransactionDB, the default value will
   * be set to the value of
   * {@link AdvancedMutableColumnFamilyOptionsInterface#maxWriteBufferNumber()}
   * if it is not explicitly set by the user. Otherwise, the default is 0.
   *
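   * <p>Illustrative sketch only (assumes the concrete
   * {@code ColumnFamilyOptions} implementation of this interface):</p>
   * <pre>{@code
   * try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) {
   *   // keep a few flushed write buffers around for transaction
   *   // write-conflict checking
   *   cfOpts.setMaxWriteBufferNumberToMaintain(4);
   * }
   * }</pre>
   *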
   * @param maxWriteBufferNumberToMaintain The maximum number of write
   *     buffers to maintain
   *
   * @return the reference to the current options.
   */
  T setMaxWriteBufferNumberToMaintain(
      int maxWriteBufferNumberToMaintain);

  /**
   * The total maximum number of write buffers to maintain in memory including
   * copies of buffers that have already been flushed.
   *
   * @return the maximum number of write buffers to maintain
   */
  int maxWriteBufferNumberToMaintain();

  /**
   * Allows thread-safe inplace updates.
   * If inplace_callback function is not set,
   * Put(key, new_value) will update the existing_value in place iff
   * * key exists in current memtable
   * * new sizeof(new_value) &le; sizeof(existing_value)
   * * existing_value for that key is a put i.e. kTypeValue
   * If inplace_callback function is set, check doc for inplace_callback.
   * Default: false.
   *
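   * <p>Illustrative sketch only (assumes the concrete
   * {@code ColumnFamilyOptions} implementation of this interface):</p>
   * <pre>{@code
   * try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) {
   *   // allow same-size-or-smaller values to be overwritten in the memtable
   *   cfOpts.setInplaceUpdateSupport(true);
   * }
   * }</pre>
   *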
   * @param inplaceUpdateSupport true if thread-safe inplace updates
   *     are allowed.
   * @return the reference to the current options.
   */
  T setInplaceUpdateSupport(
      boolean inplaceUpdateSupport);

  /**
   * Allows thread-safe inplace updates.
   * If inplace_callback function is not set,
   * Put(key, new_value) will update the existing_value in place iff
   * * key exists in current memtable
   * * new sizeof(new_value) &le; sizeof(existing_value)
   * * existing_value for that key is a put i.e. kTypeValue
   * If inplace_callback function is set, check doc for inplace_callback.
   * Default: false.
   *
   * @return true if thread-safe inplace updates are allowed.
   */
  boolean inplaceUpdateSupport();

  /**
   * Control locality of bloom filter probes to improve cache miss rate.
   * This option only applies to memtable prefix bloom and plaintable
   * prefix bloom. It essentially limits the max number of cache lines each
   * bloom filter check can touch.
   * This optimization is turned off when set to 0. The number should never
   * be greater than the number of probes. This option can boost performance
   * for in-memory workloads but should be used with care since it can cause
   * a higher false positive rate.
   * Default: 0
   *
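   * <p>Illustrative sketch only (assumes the concrete
   * {@code ColumnFamilyOptions} implementation of this interface):</p>
   * <pre>{@code
   * try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) {
   *   // limit each memtable prefix-bloom check to a single cache line
   *   cfOpts.setBloomLocality(1);
   * }
   * }</pre>
   *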
   * @param bloomLocality the level of locality of bloom-filter probes.
   * @return the reference to the current options.
   */
  T setBloomLocality(int bloomLocality);

  /**
   * Control locality of bloom filter probes to improve cache miss rate.
   * This option only applies to memtable prefix bloom and plaintable
   * prefix bloom. It essentially limits the max number of cache lines each
   * bloom filter check can touch.
   * This optimization is turned off when set to 0. The number should never
   * be greater than the number of probes. This option can boost performance
   * for in-memory workloads but should be used with care since it can cause
   * a higher false positive rate.
   * Default: 0
   *
   * @return the level of locality of bloom-filter probes.
   * @see #setBloomLocality(int)
   */
  int bloomLocality();

  /**
   * <p>Different levels can have different compression
   * policies. There are cases where most lower levels
   * would like to use quick compression algorithms while
   * the higher levels (which have more data) use
   * compression algorithms that have better compression
   * but could be slower. This array, if non-empty, should
   * have an entry for each level of the database;
   * these override the value specified in the previous
   * field 'compression'.</p>
   *
   * <strong>NOTICE</strong>
   * <p>If {@code level_compaction_dynamic_level_bytes=true},
   * {@code compression_per_level[0]} still determines {@code L0},
   * but other elements of the array are based on the base level
   * (the level {@code L0} files are merged to), and may not
   * match the level users see from the info log for metadata.
   * </p>
   * <p>If {@code L0} files are merged to {@code level-n},
   * then, for {@code i&gt;0}, {@code compression_per_level[i]}
   * determines the compression type for level {@code n+i-1}.</p>
   *
   * <strong>Example</strong>
   * <p>For example, if we have 5 levels, and we determine to
   * merge {@code L0} data to {@code L4} (which means {@code L1..L3}
   * will be empty), then new files written to {@code L4} use
   * compression type {@code compression_per_level[1]}.</p>
   *
   * <p>If {@code L0} is now merged to {@code L2}, data going to
   * {@code L2} will be compressed according to
   * {@code compression_per_level[1]}, {@code L3} using
   * {@code compression_per_level[2]} and {@code L4} using
   * {@code compression_per_level[3]}. Compression for each
   * level can change as data grows.</p>
   *
   * <p><strong>Default:</strong> empty</p>
   *
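   * <p>Illustrative sketch only (assumes the concrete
   * {@code ColumnFamilyOptions} implementation of this interface): no
   * compression for the lower levels, stronger compression further down.</p>
   * <pre>{@code
   * try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) {
   *   cfOpts.setCompressionPerLevel(java.util.Arrays.asList(
   *       CompressionType.NO_COMPRESSION,     // L0
   *       CompressionType.NO_COMPRESSION,
   *       CompressionType.LZ4_COMPRESSION,
   *       CompressionType.LZ4_COMPRESSION,
   *       CompressionType.ZSTD_COMPRESSION)); // last level
   * }
   * }</pre>
   *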
   * @param compressionLevels list of
   *     {@link org.rocksdb.CompressionType} instances.
   *
   * @return the reference to the current options.
   */
  T setCompressionPerLevel(
      List<CompressionType> compressionLevels);

  /**
   * <p>Return the currently set list of {@link org.rocksdb.CompressionType}
   * instances, one per level.</p>
   *
   * <p>See: {@link #setCompressionPerLevel(java.util.List)}</p>
   *
   * @return list of {@link org.rocksdb.CompressionType}
   *     instances.
   */
  List<CompressionType> compressionPerLevel();

  /**
   * Set the number of levels for this database.
   * If level-styled compaction is used, then this number determines
   * the total number of levels.
   *
   * @param numLevels the number of levels.
   * @return the reference to the current options.
   */
  T setNumLevels(int numLevels);

  /**
   * If level-styled compaction is used, then this number determines
   * the total number of levels.
   *
   * @return the number of levels.
   */
  int numLevels();

  /**
   * <p>If {@code true}, RocksDB will pick the target size of each level
   * dynamically. We will pick a base level b &gt;= 1. L0 will be
   * directly merged into level b, instead of always into level 1.
   * Levels 1 to b-1 need to be empty. We try to pick b and its target
   * size so that</p>
   *
   * <ol>
   * <li>the target size is in the range of
   * (max_bytes_for_level_base / max_bytes_for_level_multiplier,
   * max_bytes_for_level_base]</li>
   * <li>the target size of the last level (level num_levels-1) equals the
   * actual size of the level.</li>
   * </ol>
   *
   * <p>At the same time max_bytes_for_level_multiplier and
   * max_bytes_for_level_multiplier_additional are still satisfied.</p>
   *
   * <p>With this option on, starting from an empty DB, we make the last level
   * the base level, which means merging L0 data into the last level, until it
   * exceeds max_bytes_for_level_base. Then we make the second-to-last level
   * the base level and start merging L0 data into it, with its target size set
   * to {@code 1/max_bytes_for_level_multiplier} of the last level's actual
   * size. As the data accumulates further, the base level moves to the
   * third-to-last level, and so on.</p>
   *
   * <p><b>Example</b></p>
   *
   * <p>For example, assume {@code max_bytes_for_level_multiplier=10},
   * {@code num_levels=6}, and {@code max_bytes_for_level_base=10MB}.</p>
   *
   * <p>Target sizes of levels 1 to 5 start with:</p>
   * {@code [- - - - 10MB]}
   * <p>with the base level being level 5. Target sizes of levels 1 to 4 are
   * not applicable because they will not be used.
   * When the size of level 5 grows to more than 10MB, say 11MB, we move the
   * base level to level 4 and the targets become:</p>
   * {@code [- - - 1.1MB 11MB]}
   * <p>While data are accumulated, size targets are tuned based on the actual
   * data of level 5. When level 5 has 50MB of data, the targets are:</p>
   * {@code [- - - 5MB 50MB]}
   * <p>This continues until level 5's actual size is more than 100MB, say
   * 101MB. Now if we keep level 4 as the base level, its target size needs to
   * be 10.1MB, which doesn't satisfy the target size range. So now we make
   * level 3 the base level and the target sizes of the levels look like:</p>
   * {@code [- - 1.01MB 10.1MB 101MB]}
   * <p>In the same way, while level 5 further grows, all levels' targets grow,
   * like</p>
   * {@code [- - 5MB 50MB 500MB]}
   * <p>When level 5 exceeds 1000MB and becomes 1001MB, we make level 2 the
   * base level and make the levels' target sizes like this:</p>
   * {@code [- 1.001MB 10.01MB 100.1MB 1001MB]}
   * <p>and so on...</p>
   *
   * <p>By doing this, we give {@code max_bytes_for_level_multiplier} priority
   * over {@code max_bytes_for_level_base}, for a more predictable LSM tree
   * shape. It is useful for limiting worst-case space amplification.</p>
   *
   * <p>{@code max_bytes_for_level_multiplier_additional} is ignored with
   * this flag on.</p>
   *
   * <p>Turning this feature on or off for an existing DB can cause an
   * unexpected LSM tree structure, so it's not recommended.</p>
   *
   * <p><strong>Caution</strong>: this option is experimental</p>
   *
   * <p>Default: false</p>
   *
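   * <p>Illustrative sketch only (assumes the concrete
   * {@code ColumnFamilyOptions} implementation of this interface):</p>
   * <pre>{@code
   * try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) {
   *   // let RocksDB pick per-level target sizes dynamically
   *   cfOpts.setLevelCompactionDynamicLevelBytes(true);
   * }
   * }</pre>
   *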
   * @param enableLevelCompactionDynamicLevelBytes boolean value indicating
   *     if {@code LevelCompactionDynamicLevelBytes} shall be enabled.
   * @return the reference to the current options.
   */
  @Experimental("Turning this feature on or off for an existing DB can cause" +
      " unexpected LSM tree structure so it's not recommended")
  T setLevelCompactionDynamicLevelBytes(
      boolean enableLevelCompactionDynamicLevelBytes);

  /**
   * <p>Return if {@code LevelCompactionDynamicLevelBytes} is enabled.
   * </p>
   *
   * <p>For further information see
   * {@link #setLevelCompactionDynamicLevelBytes(boolean)}</p>
   *
   * @return boolean value indicating if
   *     {@code levelCompactionDynamicLevelBytes} is enabled.
   */
  @Experimental("Caution: this option is experimental")
  boolean levelCompactionDynamicLevelBytes();

  /**
   * Maximum size of each compaction (not guaranteed).
   *
   * @param maxCompactionBytes the compaction size limit
   * @return the reference to the current options.
   */
  T setMaxCompactionBytes(
      long maxCompactionBytes);

  /**
   * Control maximum size of each compaction (not guaranteed)
   *
   * @return compaction size threshold
   */
  long maxCompactionBytes();

  /**
   * Set compaction style for DB.
   *
   * Default: LEVEL.
   *
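   * <p>Illustrative sketch only (assumes the concrete
   * {@code ColumnFamilyOptions} implementation of this interface):</p>
   * <pre>{@code
   * try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) {
   *   // switch this column family to universal compaction
   *   cfOpts.setCompactionStyle(CompactionStyle.UNIVERSAL);
   * }
   * }</pre>
   *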
   * @param compactionStyle Compaction style.
   * @return the reference to the current options.
   */
  ColumnFamilyOptionsInterface setCompactionStyle(
      CompactionStyle compactionStyle);

  /**
   * Compaction style for DB.
   *
   * @return Compaction style.
   */
  CompactionStyle compactionStyle();

  /**
   * If {@link #compactionStyle()} == {@link CompactionStyle#LEVEL},
   * this determines, for each level, which files are prioritized to be
   * picked for compaction.
   *
   * Default: {@link CompactionPriority#ByCompensatedSize}
   *
   * @param compactionPriority The compaction priority
   *
   * @return the reference to the current options.
   */
  T setCompactionPriority(
      CompactionPriority compactionPriority);

  /**
   * Get the compaction priority used if level compaction
   * is used for all levels.
   *
   * @return The compaction priority
   */
  CompactionPriority compactionPriority();

  /**
   * Set the options needed to support Universal Style compactions
   *
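   * <p>Illustrative sketch only (assumes the concrete
   * {@code ColumnFamilyOptions} implementation of this interface and the
   * {@code CompactionOptionsUniversal} option holder):</p>
   * <pre>{@code
   * try (final CompactionOptionsUniversal univOpts = new CompactionOptionsUniversal();
   *      final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) {
   *   univOpts.setSizeRatio(5);
   *   cfOpts.setCompactionStyle(CompactionStyle.UNIVERSAL);
   *   cfOpts.setCompactionOptionsUniversal(univOpts);
   * }
   * }</pre>
   *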
   * @param compactionOptionsUniversal The Universal Style compaction options
   *
   * @return the reference to the current options.
   */
  T setCompactionOptionsUniversal(
      CompactionOptionsUniversal compactionOptionsUniversal);

  /**
   * The options needed to support Universal Style compactions
   *
   * @return The Universal Style compaction options
   */
  CompactionOptionsUniversal compactionOptionsUniversal();

  /**
   * The options for FIFO compaction style
   *
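   * <p>Illustrative sketch only (assumes the concrete
   * {@code ColumnFamilyOptions} implementation of this interface and the
   * {@code CompactionOptionsFIFO} option holder):</p>
   * <pre>{@code
   * try (final CompactionOptionsFIFO fifoOpts = new CompactionOptionsFIFO();
   *      final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) {
   *   fifoOpts.setMaxTableFilesSize(1024 * 1024 * 1024L); // keep ~1GB of SSTs
   *   cfOpts.setCompactionStyle(CompactionStyle.FIFO);
   *   cfOpts.setCompactionOptionsFIFO(fifoOpts);
   * }
   * }</pre>
   *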
   * @param compactionOptionsFIFO The FIFO compaction options
   *
   * @return the reference to the current options.
   */
  T setCompactionOptionsFIFO(
      CompactionOptionsFIFO compactionOptionsFIFO);

  /**
   * The options for FIFO compaction style
   *
   * @return The FIFO compaction options
   */
  CompactionOptionsFIFO compactionOptionsFIFO();

  /**
   * <p>This flag specifies that the implementation should optimize the filters
   * mainly for cases where keys are found rather than also optimizing for keys
   * missed. This would be used in cases where the application knows that
   * there are very few misses or the performance in the case of misses is not
   * important.</p>
   *
   * <p>For now, this flag allows us to not store filters for the last level,
   * i.e. the largest level which contains data of the LSM store. For keys
   * which are hits, the filters in this level are not useful because we will
   * search for the data anyway.</p>
   *
   * <p><strong>NOTE</strong>: the filters in other levels are still useful
   * even for key hits because they tell us whether to look in that level or
   * go to the higher level.</p>
   *
   * <p>Default: false</p>
   *
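   * <p>Illustrative sketch only (assumes the concrete
   * {@code ColumnFamilyOptions} implementation of this interface):</p>
   * <pre>{@code
   * try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) {
   *   // skip filters on the last level when point lookups almost always hit
   *   cfOpts.setOptimizeFiltersForHits(true);
   * }
   * }</pre>
   *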
   * @param optimizeFiltersForHits boolean value indicating if this flag is set.
   * @return the reference to the current options.
   */
  T setOptimizeFiltersForHits(
      boolean optimizeFiltersForHits);

  /**
   * <p>Returns the current state of the {@code optimize_filters_for_hits}
   * setting.</p>
   *
   * @return boolean value indicating if the flag
   *     {@code optimize_filters_for_hits} was set.
   */
  boolean optimizeFiltersForHits();

  /**
   * By default, RocksDB runs consistency checks on the LSM every time the LSM
   * changes (Flush, Compaction, AddFile). Use this option if you need to
   * disable them.
   *
   * Default: true
   *
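   * <p>Illustrative sketch only (assumes the concrete
   * {@code ColumnFamilyOptions} implementation of this interface):</p>
   * <pre>{@code
   * try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) {
   *   // consistency checks are on by default; only disable them deliberately
   *   cfOpts.setForceConsistencyChecks(false);
   * }
   * }</pre>
   *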
   * @param forceConsistencyChecks false to disable consistency checks
   *
   * @return the reference to the current options.
   */
  T setForceConsistencyChecks(
      boolean forceConsistencyChecks);

  /**
   * By default, RocksDB runs consistency checks on the LSM every time the LSM
   * changes (Flush, Compaction, AddFile).
   *
   * @return true if consistency checks are enforced
   */
  boolean forceConsistencyChecks();
}