1 # -*- mode: YAML -*-
2 ---
3
4 options:
5 - name: osd_numa_prefer_iface
6 type: bool
7 level: advanced
8 desc: prefer IP on network interface on same numa node as storage
9 default: true
10 see_also:
11 - osd_numa_auto_affinity
12 flags:
13 - startup
14 - name: osd_numa_auto_affinity
15 type: bool
16 level: advanced
17 desc: automatically set affinity to numa node when storage and network match
18 default: true
19 flags:
20 - startup
21 - name: osd_numa_node
22 type: int
23 level: advanced
24 desc: set affinity to a numa node (-1 for none)
25 default: -1
26 see_also:
27 - osd_numa_auto_affinity
28 flags:
29 - startup
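# Illustrative example (not part of the schema): NUMA pinning is usually applied
# per daemon; the daemon id osd.0 and node id 0 below are arbitrary, and because
# the option carries the startup flag, a daemon restart is assumed afterwards
# (via systemd or the orchestrator, depending on the deployment).
#   ceph config set osd.0 osd_numa_node 0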
30 - name: osd_smart_report_timeout
31 type: uint
32 level: advanced
33 desc: Timeout (in seconds) for smartctl to run; default is 5
34 default: 5
35 # verify backend can support configured max object name length
36 - name: osd_check_max_object_name_len_on_startup
37 type: bool
38 level: dev
39 default: true
40 with_legacy: true
41 - name: osd_max_backfills
42 type: uint
43 level: advanced
44 desc: Maximum number of concurrent local and remote backfills or recoveries per
45 OSD
46 long_desc: There can be osd_max_backfills local reservations AND the same number
47 of remote reservations per OSD. So a value of 1 lets this OSD participate as 1
48 PG primary in recovery and as 1 shard of another recovering PG.
49 fmt_desc: The maximum number of backfills allowed to or from a single OSD.
50 Note that this is applied separately for read and write operations.
51 default: 1
52 flags:
53 - runtime
54 with_legacy: true
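# Illustrative example (not part of the schema): osd_max_backfills carries the
# runtime flag, so it can be raised temporarily to speed up backfill and lowered
# again afterwards; the value 3 below is arbitrary.
#   ceph config set osd osd_max_backfills 3
#   ceph config get osd osd_max_backfills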
55 # Minimum recovery priority (255 = max, smaller = lower)
56 - name: osd_min_recovery_priority
57 type: int
58 level: advanced
59 desc: Minimum priority below which recovery is not performed
60 long_desc: The purpose here is to prevent the cluster from doing *any* lower priority
61 work (e.g., rebalancing) below this threshold and focus solely on higher priority
62 work (e.g., replicating degraded objects).
63 default: 0
64 with_legacy: true
65 - name: osd_backfill_retry_interval
66 type: float
67 level: advanced
68 desc: how frequently to retry backfill reservations after being denied (e.g., due
69 to a full OSD)
70 fmt_desc: The number of seconds to wait before retrying backfill requests.
71 default: 30
72 with_legacy: true
73 - name: osd_recovery_retry_interval
74 type: float
75 level: advanced
76 desc: how frequently to retry recovery reservations after being denied (e.g., due
77 to a full OSD)
78 default: 30
79 with_legacy: true
80 - name: osd_recovery_sleep
81 type: float
82 level: advanced
83 desc: Time in seconds to sleep before next recovery or backfill op
84 fmt_desc: Time in seconds to sleep before the next recovery or backfill op.
85 Increasing this value will slow down recovery operations while
86 client operations will be less impacted.
87 default: 0
88 flags:
89 - runtime
90 with_legacy: true
91 - name: osd_recovery_sleep_hdd
92 type: float
93 level: advanced
94 desc: Time in seconds to sleep before next recovery or backfill op for HDDs
95 fmt_desc: Time in seconds to sleep before next recovery or backfill op
96 for HDDs.
97 default: 0.1
98 flags:
99 - runtime
100 with_legacy: true
101 - name: osd_recovery_sleep_ssd
102 type: float
103 level: advanced
104 desc: Time in seconds to sleep before next recovery or backfill op for SSDs
105 fmt_desc: Time in seconds to sleep before the next recovery or backfill op
106 for SSDs.
107 default: 0
108 see_also:
109 - osd_recovery_sleep
110 flags:
111 - runtime
112 with_legacy: true
113 - name: osd_recovery_sleep_hybrid
114 type: float
115 level: advanced
116 desc: Time in seconds to sleep before next recovery or backfill op when data is
117 on HDD and journal is on SSD
118 fmt_desc: Time in seconds to sleep before the next recovery or backfill op
119 when OSD data is on HDD and OSD journal / WAL+DB is on SSD.
120 default: 0.025
121 see_also:
122 - osd_recovery_sleep
123 flags:
124 - runtime
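# Illustrative example (not part of the schema): the recovery sleep options are
# runtime options, so recovery can be throttled on hybrid (HDD data + SSD WAL/DB)
# OSDs without a restart; the value 0.1 below is arbitrary.
#   ceph config set osd osd_recovery_sleep_hybrid 0.1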
125 - name: osd_snap_trim_sleep
126 type: float
127 level: advanced
128 desc: Time in seconds to sleep before next snap trim (overrides values below)
129 fmt_desc: Time in seconds to sleep before next snap trim op.
130 Increasing this value will slow down snap trimming.
131 This option overrides backend specific variants.
132 default: 0
133 flags:
134 - runtime
135 with_legacy: true
136 - name: osd_snap_trim_sleep_hdd
137 type: float
138 level: advanced
139 desc: Time in seconds to sleep before next snap trim for HDDs
140 default: 5
141 flags:
142 - runtime
143 - name: osd_snap_trim_sleep_ssd
144 type: float
145 level: advanced
146 desc: Time in seconds to sleep before next snap trim for SSDs
147 fmt_desc: Time in seconds to sleep before next snap trim op
148 for SSD OSDs (including NVMe).
149 default: 0
150 flags:
151 - runtime
152 - name: osd_snap_trim_sleep_hybrid
153 type: float
154 level: advanced
155 desc: Time in seconds to sleep before next snap trim when data is on HDD and journal
156 is on SSD
157 fmt_desc: Time in seconds to sleep before next snap trim op
158 when OSD data is on an HDD and the OSD journal or WAL+DB is on an SSD.
159 default: 2
160 flags:
161 - runtime
162 - name: osd_scrub_invalid_stats
163 type: bool
164 level: advanced
165 default: true
166 with_legacy: true
167 - name: osd_max_scrubs
168 type: int
169 level: advanced
170 desc: Maximum concurrent scrubs on a single OSD
171 fmt_desc: The maximum number of simultaneous scrub operations for
172 a Ceph OSD Daemon.
173 default: 1
174 with_legacy: true
175 - name: osd_scrub_during_recovery
176 type: bool
177 level: advanced
178 desc: Allow scrubbing when PGs on the OSD are undergoing recovery
179 fmt_desc: Allow scrub during recovery. Setting this to ``false`` will disable
180 scheduling new scrubs (and deep-scrubs) while there is active recovery.
181 Scrubs that are already running will continue. This might be useful to reduce
182 load on busy clusters.
183 default: false
184 with_legacy: true
185 - name: osd_repair_during_recovery
186 type: bool
187 level: advanced
188 desc: Allow requested repairing when PGs on the OSD are undergoing recovery
189 default: false
190 with_legacy: true
191 - name: osd_scrub_begin_hour
192 type: int
193 level: advanced
194 desc: Restrict scrubbing to this hour of the day or later
195 long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day.
196 fmt_desc: This restricts scrubbing to this hour of the day or later.
197 Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0``
198 to allow scrubbing the entire day. Along with ``osd_scrub_end_hour``, this
199 defines a time window in which scrubs can happen.
200 But a scrub will be performed
201 regardless of the time window, as long as the placement
202 group's scrub interval exceeds ``osd_scrub_max_interval``.
203 default: 0
204 see_also:
205 - osd_scrub_end_hour
206 min: 0
207 max: 23
208 with_legacy: true
209 - name: osd_scrub_end_hour
210 type: int
211 level: advanced
212 desc: Restrict scrubbing to hours of the day earlier than this
213 long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day.
214 fmt_desc: This restricts scrubbing to hours of the day earlier than this.
215 Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0`` to allow scrubbing
216 for the entire day. Along with ``osd_scrub_begin_hour``, this defines a time
217 window in which scrubs can happen. But a scrub will be performed
218 regardless of the time window, as long as the placement
219 group's scrub interval exceeds ``osd_scrub_max_interval``.
220 default: 0
221 see_also:
222 - osd_scrub_begin_hour
223 min: 0
224 max: 23
225 with_legacy: true
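# Illustrative example (not part of the schema): confining scrubs to a nightly
# window, assuming a window that wraps past midnight (begin > end) is acceptable
# in your environment; the hours below are arbitrary. A PG whose scrub interval
# exceeds osd_scrub_max_interval is still scrubbed outside the window.
#   ceph config set osd osd_scrub_begin_hour 23
#   ceph config set osd osd_scrub_end_hour 6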
226 - name: osd_scrub_begin_week_day
227 type: int
228 level: advanced
229 desc: Restrict scrubbing to this day of the week or later
230 long_desc: 0 = Sunday, 1 = Monday, etc. Use osd_scrub_begin_week_day=0 osd_scrub_end_week_day=0
231 for the entire week.
232 fmt_desc: This restricts scrubbing to this day of the week or later.
233 0 = Sunday, 1 = Monday, etc. Use ``osd_scrub_begin_week_day = 0``
234 and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week.
235 Along with ``osd_scrub_end_week_day``, this defines a time window in which
236 scrubs can happen. But a scrub will be performed
237 regardless of the time window, when the PG's
238 scrub interval exceeds ``osd_scrub_max_interval``.
239 default: 0
240 see_also:
241 - osd_scrub_end_week_day
242 min: 0
243 max: 6
244 with_legacy: true
245 - name: osd_scrub_end_week_day
246 type: int
247 level: advanced
248 desc: Restrict scrubbing to days of the week earlier than this
249 long_desc: 0 = Sunday, 1 = Monday, etc. Use osd_scrub_begin_week_day=0 osd_scrub_end_week_day=0
250 for the entire week.
251 fmt_desc: This restricts scrubbing to days of the week earlier than this.
252 0 = Sunday, 1 = Monday, etc. Use ``osd_scrub_begin_week_day = 0``
253 and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week.
254 Along with ``osd_scrub_begin_week_day``, this defines a time
255 window in which scrubs can happen. But a scrub will be performed
256 regardless of the time window, as long as the placement
257 group's scrub interval exceeds ``osd_scrub_max_interval``.
258 default: 0
259 see_also:
260 - osd_scrub_begin_week_day
261 min: 0
262 max: 6
263 with_legacy: true
264 - name: osd_scrub_load_threshold
265 type: float
266 level: advanced
267 desc: Allow scrubbing when system load divided by number of CPUs is below this value
268 fmt_desc: The normalized maximum load. Ceph will not scrub when the system load
269 (as defined by ``getloadavg() / number of online CPUs``) is higher than this number.
270 Default is ``0.5``.
271 default: 0.5
272 with_legacy: true
273 # if load is low
274 - name: osd_scrub_min_interval
275 type: float
276 level: advanced
277 desc: Scrub each PG no more often than this interval
278 fmt_desc: The minimal interval in seconds for scrubbing the Ceph OSD Daemon
279 when the Ceph Storage Cluster load is low.
280 default: 1_day
281 see_also:
282 - osd_scrub_max_interval
283 with_legacy: true
284 # regardless of load
285 - name: osd_scrub_max_interval
286 type: float
287 level: advanced
288 desc: Scrub each PG no less often than this interval
289 fmt_desc: The maximum interval in seconds for scrubbing the Ceph OSD Daemon
290 irrespective of cluster load.
291 default: 7_day
292 see_also:
293 - osd_scrub_min_interval
294 with_legacy: true
295 # randomize the scheduled scrub in the span of [min,min*(1+randomize_ratio))
296 - name: osd_scrub_interval_randomize_ratio
297 type: float
298 level: advanced
299 desc: Ratio of scrub interval to randomly vary
300 long_desc: This prevents a scrub 'stampede' by randomly varying the scrub intervals
301 so that they are soon uniformly distributed over the week
302 fmt_desc: Add a random delay to ``osd_scrub_min_interval`` when scheduling
303 the next scrub job for a PG. The delay is a random
304 value less than ``osd_scrub_min_interval`` \*
305 ``osd_scrub_interval_randomize_ratio``. The default setting
306 spreads scrubs throughout the allowed time
307 window of ``[1, 1.5]`` \* ``osd_scrub_min_interval``.
308 default: 0.5
309 see_also:
310 - osd_scrub_min_interval
311 with_legacy: true
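# Worked example (illustrative): with the defaults osd_scrub_min_interval = 1 day
# and osd_scrub_interval_randomize_ratio = 0.5, the scheduled interval is drawn
# from [min, min * (1 + ratio)), i.e. between 24 and 36 hours for each PG.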
312 # the probability to back off the scheduled scrub
313 - name: osd_scrub_backoff_ratio
314 type: float
315 level: dev
316 desc: Backoff ratio for scheduling scrubs
317 long_desc: This is the percentage of ticks that do NOT schedule scrubs; 66% means
318 that 1 out of 3 ticks will schedule scrubs
319 default: 0.66
320 with_legacy: true
321 - name: osd_scrub_chunk_min
322 type: int
323 level: advanced
324 desc: Minimum number of objects to scrub in a single chunk
325 fmt_desc: The minimal number of object store chunks to scrub during a single operation.
326 Ceph blocks writes to a single chunk during scrub.
327 default: 5
328 see_also:
329 - osd_scrub_chunk_max
330 with_legacy: true
331 - name: osd_scrub_chunk_max
332 type: int
333 level: advanced
334 desc: Maximum number of objects to scrub in a single chunk
335 fmt_desc: The maximum number of object store chunks to scrub during a single operation.
336 default: 25
337 see_also:
338 - osd_scrub_chunk_min
339 with_legacy: true
340 # sleep between [deep]scrub ops
341 - name: osd_scrub_sleep
342 type: float
343 level: advanced
344 desc: Duration to inject a delay during scrubbing
345 fmt_desc: Time to sleep before scrubbing the next group of chunks. Increasing this value will slow
346 down the overall rate of scrubbing so that client operations will be less impacted.
347 default: 0
348 flags:
349 - runtime
350 with_legacy: true
351 # more sleep between [deep]scrub ops
352 - name: osd_scrub_extended_sleep
353 type: float
354 level: advanced
355 desc: Duration to inject a delay during scrubbing out of scrubbing hours
356 default: 0
357 see_also:
358 - osd_scrub_begin_hour
359 - osd_scrub_end_hour
360 - osd_scrub_begin_week_day
361 - osd_scrub_end_week_day
362 with_legacy: true
363 # whether auto-repair inconsistencies upon deep-scrubbing
364 - name: osd_scrub_auto_repair
365 type: bool
366 level: advanced
367 desc: Automatically repair damaged objects detected during scrub
368 fmt_desc: Setting this to ``true`` will enable automatic PG repair when errors
369 are found by scrubs or deep-scrubs. However, if more than
370 ``osd_scrub_auto_repair_num_errors`` errors are found, a repair is NOT performed.
371 default: false
372 with_legacy: true
373 # only auto-repair when number of errors is below this threshold
374 - name: osd_scrub_auto_repair_num_errors
375 type: uint
376 level: advanced
377 desc: Maximum number of detected errors to automatically repair
378 fmt_desc: Auto repair will not occur if more than this many errors are found.
379 default: 5
380 see_also:
381 - osd_scrub_auto_repair
382 with_legacy: true
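# Illustrative example (not part of the schema): enabling automatic repair while
# keeping the default cap of 5 errors above which no auto repair is attempted.
#   ceph config set osd osd_scrub_auto_repair true
#   ceph config get osd osd_scrub_auto_repair_num_errors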
383 - name: osd_scrub_max_preemptions
384 type: uint
385 level: advanced
386 desc: Set the maximum number of times we will preempt a deep scrub due to a client
387 operation before blocking client IO to complete the scrub
388 default: 5
389 min: 0
390 max: 30
391 - name: osd_deep_scrub_interval
392 type: float
393 level: advanced
394 desc: Deep scrub each PG (i.e., verify data checksums) at least this often
395 fmt_desc: The interval for "deep" scrubbing (fully reading all data). The
396 ``osd_scrub_load_threshold`` does not affect this setting.
397 default: 7_day
398 with_legacy: true
399 - name: osd_deep_scrub_randomize_ratio
400 type: float
401 level: advanced
402 desc: Scrubs will randomly become deep scrubs at this rate (0.15 -> 15% of scrubs
403 are deep)
404 long_desc: This prevents a deep scrub 'stampede' by spreading deep scrubs so they
405 are uniformly distributed over the week
406 default: 0.15
407 with_legacy: true
408 - name: osd_deep_scrub_stride
409 type: size
410 level: advanced
411 desc: Number of bytes to read from an object at a time during deep scrub
412 fmt_desc: Read size when doing a deep scrub.
413 default: 512_K
414 with_legacy: true
415 - name: osd_deep_scrub_keys
416 type: int
417 level: advanced
418 desc: Number of keys to read from an object at a time during deep scrub
419 default: 1024
420 with_legacy: true
421 # objects must be this old (seconds) before we update the whole-object digest on scrub
422 - name: osd_deep_scrub_update_digest_min_age
423 type: int
424 level: advanced
425 desc: Update overall object digest only if object was last modified longer ago than
426 this
427 default: 2_hr
428 with_legacy: true
429 - name: osd_deep_scrub_large_omap_object_key_threshold
430 type: uint
431 level: advanced
432 desc: Warn when we encounter an object with more omap keys than this
433 default: 200000
434 services:
435 - osd
436 - mds
437 see_also:
438 - osd_deep_scrub_large_omap_object_value_sum_threshold
439 with_legacy: true
440 - name: osd_deep_scrub_large_omap_object_value_sum_threshold
441 type: size
442 level: advanced
443 desc: Warn when we encounter an object with more omap key bytes than this
444 default: 1_G
445 services:
446 - osd
447 see_also:
448 - osd_deep_scrub_large_omap_object_key_threshold
449 with_legacy: true
450 # where rados plugins are stored
451 - name: osd_class_dir
452 type: str
453 level: advanced
454 default: @CMAKE_INSTALL_LIBDIR@/rados-classes
455 fmt_desc: The class path for RADOS class plug-ins.
456 with_legacy: true
457 - name: osd_open_classes_on_start
458 type: bool
459 level: advanced
460 default: true
461 with_legacy: true
462 # list of object classes allowed to be loaded (allow all: *)
463 - name: osd_class_load_list
464 type: str
465 level: advanced
466 default: cephfs hello journal lock log numops otp rbd refcount rgw rgw_gc timeindex
467 user version cas cmpomap queue 2pc_queue fifo
468 with_legacy: true
469 # list of object classes with default execute perm (allow all: *)
470 - name: osd_class_default_list
471 type: str
472 level: advanced
473 default: cephfs hello journal lock log numops otp rbd refcount rgw rgw_gc timeindex
474 user version cas cmpomap queue 2pc_queue fifo
475 with_legacy: true
476 - name: osd_agent_max_ops
477 type: int
478 level: advanced
479 desc: maximum concurrent tiering operations for tiering agent
480 fmt_desc: The maximum number of simultaneous flushing ops per tiering agent
481 in the high speed mode.
482 default: 4
483 with_legacy: true
484 - name: osd_agent_max_low_ops
485 type: int
486 level: advanced
487 desc: maximum concurrent low-priority tiering operations for tiering agent
488 fmt_desc: The maximum number of simultaneous flushing ops per tiering agent
489 in the low speed mode.
490 default: 2
491 with_legacy: true
492 - name: osd_agent_min_evict_effort
493 type: float
494 level: advanced
495 desc: minimum effort to expend evicting clean objects
496 default: 0.1
497 min: 0
498 max: 0.99
499 with_legacy: true
500 - name: osd_agent_quantize_effort
501 type: float
502 level: advanced
503 desc: size of quantize unit for eviction effort
504 default: 0.1
505 with_legacy: true
506 - name: osd_agent_delay_time
507 type: float
508 level: advanced
509 desc: how long agent should sleep if it has no work to do
510 default: 5
511 with_legacy: true
512 # decay atime and hist histograms after how many objects go by
513 - name: osd_agent_hist_halflife
514 type: int
515 level: advanced
516 desc: halflife of agent atime and temp histograms
517 default: 1000
518 with_legacy: true
520 - name: osd_agent_slop
521 type: float
522 level: advanced
523 desc: slop factor to avoid switching tiering flush and eviction mode
524 default: 0.02
525 with_legacy: true
526 - name: osd_find_best_info_ignore_history_les
527 type: bool
528 level: dev
529 desc: ignore last_epoch_started value when peering AND PROBABLY LOSE DATA
530 long_desc: THIS IS AN EXTREMELY DANGEROUS OPTION THAT SHOULD ONLY BE USED AT THE
531 DIRECTION OF A DEVELOPER. It makes peering ignore the last_epoch_started value,
532 which can allow the OSD to believe another OSD has an authoritative view
533 of a PG's contents even when it is in fact old and stale, typically leading to
534 data loss (by believing a stale PG is up to date).
535 default: false
536 with_legacy: true
537 - name: osd_uuid
538 type: uuid
539 level: advanced
540 desc: uuid label for a new OSD
541 fmt_desc: The universally unique identifier (UUID) for the Ceph OSD Daemon.
542 note: The ``osd_uuid`` applies to a single Ceph OSD Daemon. The ``fsid``
543 applies to the entire cluster.
544 flags:
545 - create
546 with_legacy: true
547 - name: osd_data
548 type: str
549 level: advanced
550 desc: path to OSD data
551 fmt_desc: The path to the OSD's data. You must create the directory when
552 deploying Ceph. You should mount a drive for OSD data at this
553 mount point. We do not recommend changing the default.
554 default: /var/lib/ceph/osd/$cluster-$id
555 flags:
556 - no_mon_update
557 with_legacy: true
558 - name: osd_journal
559 type: str
560 level: advanced
561 desc: path to OSD journal (when FileStore backend is in use)
562 fmt_desc: The path to the OSD's journal. This may be a path to a file or a
563 block device (such as a partition of an SSD). If it is a file,
564 you must create the directory to contain it. We recommend using a
565 separate fast device when the ``osd_data`` drive is an HDD.
566 default: /var/lib/ceph/osd/$cluster-$id/journal
567 flags:
568 - no_mon_update
569 with_legacy: true
570 - name: osd_journal_size
571 type: size
572 level: advanced
573 desc: size of FileStore journal (in MiB)
574 fmt_desc: The size of the journal in MiB.
575 default: 5_K
576 flags:
577 - create
578 with_legacy: true
579 - name: osd_journal_flush_on_shutdown
580 type: bool
581 level: advanced
582 desc: flush FileStore journal contents during clean OSD shutdown
583 default: true
584 with_legacy: true
585 - name: osd_compact_on_start
586 type: bool
587 level: advanced
588 desc: compact the OMAP of the OSD's object store on start
589 default: false
590 # flags for specific control purpose during osd mount() process.
591 # e.g., can be 1 to skip over replaying journal
592 # or 2 to skip over mounting omap or 3 to skip over both.
593 # This might be helpful in case the journal is totally corrupted
594 # and we still want to bring the osd daemon back normally, etc.
595 - name: osd_os_flags
596 type: uint
597 level: dev
598 desc: flags to skip filestore omap or journal initialization
599 default: 0
600 - name: osd_max_write_size
601 type: size
602 level: advanced
603 desc: Maximum size of a RADOS write operation in megabytes
604 long_desc: This setting prevents clients from doing very large writes to RADOS. If
605 you set this to a value below what clients expect, they will receive an error
606 when attempting to write to the cluster.
607 fmt_desc: The maximum size of a write in megabytes.
608 default: 90
609 min: 4
610 with_legacy: true
611 - name: osd_max_pgls
612 type: uint
613 level: advanced
614 desc: maximum number of results when listing objects in a pool
615 fmt_desc: The maximum number of objects to return when listing objects in a pool. A client
616 requesting a large number can tie up the Ceph OSD Daemon.
617 default: 1_K
618 with_legacy: true
619 - name: osd_client_message_size_cap
620 type: size
621 level: advanced
622 desc: maximum memory to devote to in-flight client requests
623 long_desc: If this value is exceeded, the OSD will not read any new client data
624 off of the network until memory is freed.
625 fmt_desc: The largest client data message allowed in memory.
626 default: 500_M
627 with_legacy: true
628 - name: osd_client_message_cap
629 type: uint
630 level: advanced
631 desc: maximum number of in-flight client requests
632 default: 256
633 with_legacy: true
634 - name: osd_crush_update_on_start
635 type: bool
636 level: advanced
637 desc: update OSD CRUSH location on startup
638 default: true
639 with_legacy: true
640 - name: osd_class_update_on_start
641 type: bool
642 level: advanced
643 desc: set OSD device class on startup
644 default: true
645 with_legacy: true
646 - name: osd_crush_initial_weight
647 type: float
648 level: advanced
649 desc: if >= 0, initial CRUSH weight for newly created OSDs
650 long_desc: If this value is negative, the size of the OSD in TiB is used.
651 fmt_desc: The initial CRUSH weight for newly added OSDs. With the default
652 value of ``-1``, the initial CRUSH weight for a newly added OSD is set to
653 its device size in TiB.
654 See `Weighting Bucket Items`_ for details.
655 default: -1
656 with_legacy: true
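# Illustrative example (not part of the schema): adding new OSDs with zero CRUSH
# weight so that data migration can be triggered manually later; 0 is an arbitrary
# choice and the option is only consulted when an OSD is created.
#   ceph config set osd osd_crush_initial_weight 0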
657 # Allows the "peered" state for recovery and backfill below min_size
658 - name: osd_allow_recovery_below_min_size
659 type: bool
660 level: dev
661 desc: allow replicated pools to recover with < min_size active members
662 default: true
663 services:
664 - osd
665 with_legacy: true
666 # cap on # of inc maps we send to peers, clients
667 - name: osd_map_share_max_epochs
668 type: int
669 level: advanced
670 default: 40
671 with_legacy: true
672 - name: osd_map_cache_size
673 type: int
674 level: advanced
675 default: 50
676 fmt_desc: The number of OSD maps to keep cached.
677 with_legacy: true
678 - name: osd_pg_epoch_max_lag_factor
679 type: float
680 level: advanced
681 desc: Max multiple of the map cache that PGs can lag before we throttle map ingest
682 default: 2
683 see_also:
684 - osd_map_cache_size
685 - name: osd_inject_bad_map_crc_probability
686 type: float
687 level: dev
688 default: 0
689 with_legacy: true
690 - name: osd_inject_failure_on_pg_removal
691 type: bool
692 level: dev
693 default: false
694 with_legacy: true
695 # shut down the OSD if its status flips more than max_markdown_count times in the most recent max_markdown_period seconds
696 - name: osd_max_markdown_period
697 type: int
698 level: advanced
699 default: 10_min
700 with_legacy: true
701 - name: osd_max_markdown_count
702 type: int
703 level: advanced
704 default: 5
705 with_legacy: true
706 - name: osd_op_thread_timeout
707 type: int
708 level: advanced
709 default: 15
710 fmt_desc: The Ceph OSD Daemon operation thread timeout in seconds.
711 with_legacy: true
712 - name: osd_op_thread_suicide_timeout
713 type: int
714 level: advanced
715 default: 150
716 with_legacy: true
717 - name: osd_op_pq_max_tokens_per_priority
718 type: uint
719 level: advanced
720 default: 4_M
721 with_legacy: true
722 - name: osd_op_pq_min_cost
723 type: size
724 level: advanced
725 default: 64_K
726 with_legacy: true
727 # preserve clone_overlap during recovery/migration
728 - name: osd_recover_clone_overlap
729 type: bool
730 level: advanced
731 default: true
732 fmt_desc: Preserves clone overlap during recovery. Should always be set
733 to ``true``.
734 with_legacy: true
735 - name: osd_num_cache_shards
736 type: size
737 level: advanced
738 desc: The number of cache shards to use in the object store.
739 default: 32
740 flags:
741 - startup
742 - name: osd_aggregated_slow_ops_logging
743 type: bool
744 level: advanced
745 desc: Allow OSD daemon to send an aggregated slow ops to the cluster log
746 fmt_desc: If set to ``true``, the OSD daemon will send slow ops information in
747 an aggregated format to the cluster log; otherwise, it sends every slow op to
748 the cluster log.
749 default: true
750 with_legacy: true
751 - name: osd_op_num_threads_per_shard
752 type: int
753 level: advanced
754 default: 0
755 flags:
756 - startup
757 with_legacy: true
758 - name: osd_op_num_threads_per_shard_hdd
759 type: int
760 level: advanced
761 default: 1
762 see_also:
763 - osd_op_num_threads_per_shard
764 flags:
765 - startup
766 with_legacy: true
767 - name: osd_op_num_threads_per_shard_ssd
768 type: int
769 level: advanced
770 default: 2
771 see_also:
772 - osd_op_num_threads_per_shard
773 flags:
774 - startup
775 with_legacy: true
776 - name: osd_op_num_shards
777 type: int
778 level: advanced
779 fmt_desc: The number of shards allocated for a given OSD. Each shard has its own processing queue.
780 PGs on the OSD are distributed evenly among the shards. This setting overrides _ssd and _hdd if
781 non-zero.
782 default: 0
783 flags:
784 - startup
785 with_legacy: true
786 - name: osd_op_num_shards_hdd
787 type: int
788 level: advanced
789 fmt_desc: the number of shards allocated for a given OSD (for rotational media).
790 default: 5
791 see_also:
792 - osd_op_num_shards
793 flags:
794 - startup
795 with_legacy: true
796 - name: osd_op_num_shards_ssd
797 type: int
798 level: advanced
799 fmt_desc: the number of shards allocated for a given OSD (for solid state media).
800 default: 8
801 see_also:
802 - osd_op_num_shards
803 flags:
804 - startup
805 with_legacy: true
806 - name: osd_skip_data_digest
807 type: bool
808 level: dev
809 desc: Do not store full-object checksums if the backend (bluestore) does its own
810 checksums. Only usable with all BlueStore OSDs.
811 default: false
812 # PrioritizedQueue (prio), Weighted Priority Queue (wpq; default),
813 # mclock_opclass, mclock_client, or debug_random. "mclock_opclass"
814 # and "mclock_client" are based on the mClock/dmClock algorithm
815 # (Gulati, et al. 2010). "mclock_opclass" prioritizes based on the
816 # class the operation belongs to. "mclock_client" does the same but
817 # also works to enforce fairness between clients. "debug_random"
818 # chooses among all four with equal probability.
819 - name: osd_op_queue
820 type: str
821 level: advanced
822 desc: which operation priority queue algorithm to use
823 long_desc: which operation priority queue algorithm to use
824 fmt_desc: This sets the type of queue to be used for prioritizing ops
825 within each OSD. Both queues feature a strict sub-queue which is
826 dequeued before the normal queue. The normal queue is different
827 between implementations. The WeightedPriorityQueue (``wpq``)
828 dequeues operations in relation to their priorities to prevent
829 starvation of any queue. WPQ should help in cases where a few OSDs
830 are more overloaded than others. The mClockQueue
831 (``mclock_scheduler``) prioritizes operations based on which class
832 they belong to (recovery, scrub, snaptrim, client op, osd subop).
833 See `QoS Based on mClock`_. Requires a restart.
834 default: mclock_scheduler
835 see_also:
836 - osd_op_queue_cut_off
837 enum_values:
838 - wpq
839 - mclock_scheduler
840 - debug_random
841 with_legacy: true
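# Illustrative example (not part of the schema): switching the scheduler back to
# wpq; as the fmt_desc above notes, the option requires a restart, so the OSD
# daemons are assumed to be restarted afterwards (systemd or orchestrator,
# depending on the deployment).
#   ceph config set osd osd_op_queue wpq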
842 # Min priority to go to strict queue. (low, high)
843 - name: osd_op_queue_cut_off
844 type: str
845 level: advanced
846 desc: the threshold between high priority ops and low priority ops
847 long_desc: the threshold between high priority ops that use strict priority ordering
848 and low priority ops that use a fairness algorithm that may or may not incorporate
849 priority
850 fmt_desc: This selects which priority ops will be sent to the strict
851 queue versus the normal queue. The ``low`` setting sends all
852 replication ops and higher to the strict queue, while the ``high``
853 option sends only replication acknowledgment ops and higher to
854 the strict queue. Setting this to ``high`` should help when a few
855 OSDs in the cluster are very busy especially when combined with
856 ``wpq`` in the ``osd_op_queue`` setting. OSDs that are very busy
857 handling replication traffic could starve primary client traffic
858 on these OSDs without these settings. Requires a restart.
859 default: high
860 see_also:
861 - osd_op_queue
862 enum_values:
863 - low
864 - high
865 - debug_random
866 with_legacy: true
867 - name: osd_mclock_scheduler_client_res
868 type: uint
869 level: advanced
870 desc: IO proportion reserved for each client (default)
871 long_desc: Only considered for osd_op_queue = mclock_scheduler
872 fmt_desc: IO proportion reserved for each client (default).
873 default: 1
874 see_also:
875 - osd_op_queue
876 - name: osd_mclock_scheduler_client_wgt
877 type: uint
878 level: advanced
879 desc: IO share for each client (default) over reservation
880 long_desc: Only considered for osd_op_queue = mclock_scheduler
881 fmt_desc: IO share for each client (default) over reservation.
882 default: 1
883 see_also:
884 - osd_op_queue
885 - name: osd_mclock_scheduler_client_lim
886 type: uint
887 level: advanced
888 desc: IO limit for each client (default) over reservation
889 long_desc: Only considered for osd_op_queue = mclock_scheduler
890 fmt_desc: IO limit for each client (default) over reservation.
891 default: 999999
892 see_also:
893 - osd_op_queue
894 - name: osd_mclock_scheduler_background_recovery_res
895 type: uint
896 level: advanced
897 desc: IO proportion reserved for background recovery (default)
898 long_desc: Only considered for osd_op_queue = mclock_scheduler
899 fmt_desc: IO proportion reserved for background recovery (default).
900 default: 1
901 see_also:
902 - osd_op_queue
903 - name: osd_mclock_scheduler_background_recovery_wgt
904 type: uint
905 level: advanced
906 desc: IO share for each background recovery over reservation
907 long_desc: Only considered for osd_op_queue = mclock_scheduler
908 fmt_desc: IO share for each background recovery over reservation.
909 default: 1
910 see_also:
911 - osd_op_queue
912 - name: osd_mclock_scheduler_background_recovery_lim
913 type: uint
914 level: advanced
915 desc: IO limit for background recovery over reservation
916 long_desc: Only considered for osd_op_queue = mclock_scheduler
917 fmt_desc: IO limit for background recovery over reservation.
918 default: 999999
919 see_also:
920 - osd_op_queue
921 - name: osd_mclock_scheduler_background_best_effort_res
922 type: uint
923 level: advanced
924 desc: IO proportion reserved for background best_effort (default)
925 long_desc: Only considered for osd_op_queue = mclock_scheduler
926 fmt_desc: IO proportion reserved for background best_effort (default).
927 default: 1
928 see_also:
929 - osd_op_queue
930 - name: osd_mclock_scheduler_background_best_effort_wgt
931 type: uint
932 level: advanced
933 desc: IO share for each background best_effort over reservation
934 long_desc: Only considered for osd_op_queue = mclock_scheduler
935 fmt_desc: IO share for each background best_effort over reservation.
936 default: 1
937 see_also:
938 - osd_op_queue
939 - name: osd_mclock_scheduler_background_best_effort_lim
940 type: uint
941 level: advanced
942 desc: IO limit for background best_effort over reservation
943 long_desc: Only considered for osd_op_queue = mclock_scheduler
944 fmt_desc: IO limit for background best_effort over reservation.
945 default: 999999
946 see_also:
947 - osd_op_queue
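# Illustrative example (not part of the schema): the osd_mclock_scheduler_*
# options above are only honored with osd_op_queue = mclock_scheduler, and per
# the profile description further below they are normally only set directly when
# osd_mclock_profile = custom; the numbers are arbitrary.
#   ceph config set osd osd_mclock_profile custom
#   ceph config set osd osd_mclock_scheduler_client_res 2
#   ceph config set osd osd_mclock_scheduler_client_wgt 4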
948 - name: osd_mclock_scheduler_anticipation_timeout
949 type: float
950 level: advanced
951 desc: mclock anticipation timeout in seconds
952 long_desc: the amount of time that mclock waits until the unused resource is forfeited
953 default: 0
954 - name: osd_mclock_cost_per_io_usec
955 type: float
956 level: dev
957 desc: Cost per IO in microseconds to consider per OSD (overrides _ssd and _hdd if
958 non-zero)
959 long_desc: This option specifies the cost factor to consider in usec per OSD. This
960 is considered by the mclock scheduler to set an additional cost factor in QoS
961 calculations. Only considered for osd_op_queue = mclock_scheduler
962 fmt_desc: Cost per IO in microseconds to consider per OSD (overrides _ssd
963 and _hdd if non-zero)
964 default: 0
965 flags:
966 - runtime
967 - name: osd_mclock_cost_per_io_usec_hdd
968 type: float
969 level: dev
970 desc: Cost per IO in microseconds to consider per OSD (for rotational media)
971 long_desc: This option specifies the cost factor to consider in usec per OSD for
972 rotational device type. This is considered by the mclock_scheduler to set an additional
973 cost factor in QoS calculations. Only considered for osd_op_queue = mclock_scheduler
974 fmt_desc: Cost per IO in microseconds to consider per OSD (for rotational
975 media)
976 default: 25000
977 flags:
978 - runtime
979 - name: osd_mclock_cost_per_io_usec_ssd
980 type: float
981 level: dev
982 desc: Cost per IO in microseconds to consider per OSD (for solid state media)
983 long_desc: This option specifies the cost factor to consider in usec per OSD for
984 solid state device type. This is considered by the mclock_scheduler to set an
985 additional cost factor in QoS calculations. Only considered for osd_op_queue =
986 mclock_scheduler
987 fmt_desc: Cost per IO in microseconds to consider per OSD (for solid state
988 media)
989 default: 50
990 flags:
991 - runtime
992 - name: osd_mclock_cost_per_byte_usec
993 type: float
994 level: dev
995 desc: Cost per byte in microseconds to consider per OSD (overrides _ssd and _hdd
996 if non-zero)
997 long_desc: This option specifies the cost per byte to consider in microseconds per
998 OSD. This is considered by the mclock scheduler to set an additional cost factor
999 in QoS calculations. Only considered for osd_op_queue = mclock_scheduler
1000 fmt_desc: Cost per byte in microseconds to consider per OSD (overrides _ssd
1001 and _hdd if non-zero)
1002 default: 0
1003 flags:
1004 - runtime
1005 - name: osd_mclock_cost_per_byte_usec_hdd
1006 type: float
1007 level: dev
1008 desc: Cost per byte in microseconds to consider per OSD (for rotational media)
1009 long_desc: This option specifies the cost per byte to consider in microseconds per
1010 OSD for rotational device type. This is considered by the mclock_scheduler to
1011 set an additional cost factor in QoS calculations. Only considered for osd_op_queue
1012 = mclock_scheduler
1013 fmt_desc: Cost per byte in microseconds to consider per OSD (for rotational
1014 media)
1015 default: 5.2
1016 flags:
1017 - runtime
1018 - name: osd_mclock_cost_per_byte_usec_ssd
1019 type: float
1020 level: dev
1021 desc: Cost per byte in microseconds to consider per OSD (for solid state media)
1022 long_desc: This option specifies the cost per byte to consider in microseconds per
1023 OSD for solid state device type. This is considered by the mclock_scheduler to
1024 set an additional cost factor in QoS calculations. Only considered for osd_op_queue
1025 = mclock_scheduler
1026 fmt_desc: Cost per byte in microseconds to consider per OSD (for solid state
1027 media)
1028 default: 0.011
1029 flags:
1030 - runtime
1031 - name: osd_mclock_max_capacity_iops_hdd
1032 type: float
1033 level: basic
1034 desc: Max IOPs capacity (at 4KiB block size) to consider per OSD (for rotational
1035 media)
1036 long_desc: This option specifies the max OSD capacity in iops per OSD. Helps in
1037 QoS calculations when enabling a dmclock profile. Only considered for osd_op_queue
1038 = mclock_scheduler
1039 fmt_desc: Max IOPS capacity (at 4KiB block size) to consider per OSD (for
1040 rotational media)
1041 default: 315
1042 flags:
1043 - runtime
1044 - name: osd_mclock_max_capacity_iops_ssd
1045 type: float
1046 level: basic
1047 desc: Max IOPs capacity (at 4KiB block size) to consider per OSD (for solid state
1048 media)
1049 long_desc: This option specifies the max OSD capacity in iops per OSD. Helps in
1050 QoS calculations when enabling a dmclock profile. Only considered for osd_op_queue
1051 = mclock_scheduler
1052 fmt_desc: Max IOPS capacity (at 4KiB block size) to consider per OSD (for
1053 solid state media)
1054 default: 21500
1055 flags:
1056 - runtime
1057 - name: osd_mclock_force_run_benchmark_on_init
1058 type: bool
1059 level: advanced
1060 desc: Force run the OSD benchmark on OSD initialization/boot-up
1061 long_desc: This option specifies whether the OSD benchmark must be run during
1062 the OSD boot-up sequence even if historical data about the OSD iops capacity
1063 is available in the MON config store. Enable this to refresh the OSD iops
1064 capacity if the underlying device's performance characteristics have changed
1065 significantly. Only considered for osd_op_queue = mclock_scheduler.
1066 fmt_desc: Force run the OSD benchmark on OSD initialization/boot-up
1067 default: false
1068 see_also:
1069 - osd_mclock_max_capacity_iops_hdd
1070 - osd_mclock_max_capacity_iops_ssd
1071 flags:
1072 - startup
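# Illustrative example (not part of the schema): checking the IOPS capacity an
# OSD currently uses for mclock and forcing a fresh benchmark on the next boot;
# osd.0 is an arbitrary daemon id.
#   ceph config show osd.0 osd_mclock_max_capacity_iops_ssd
#   ceph config set osd osd_mclock_force_run_benchmark_on_init true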
1073 - name: osd_mclock_skip_benchmark
1074 type: bool
1075 level: dev
1076 desc: Skip the OSD benchmark on OSD initialization/boot-up
1077 long_desc: This option specifies whether the OSD benchmark must be skipped during
1078 the OSD boot-up sequence. Only considered for osd_op_queue = mclock_scheduler.
1079 fmt_desc: Skip the OSD benchmark on OSD initialization/boot-up
1080 default: false
1081 see_also:
1082 - osd_mclock_max_capacity_iops_hdd
1083 - osd_mclock_max_capacity_iops_ssd
1084 flags:
1085 - runtime
1086 - name: osd_mclock_profile
1087 type: str
1088 level: advanced
1089 desc: Which mclock profile to use
1090 long_desc: This option specifies the mclock profile to enable - one among the set
1091 of built-in profiles or a custom profile. Only considered for osd_op_queue = mclock_scheduler
1092 fmt_desc: |
1093 This sets the type of mclock profile to use for providing QoS
1094 based on operations belonging to different classes (background
1095 recovery, scrub, snaptrim, client op, osd subop). Once a built-in
1096 profile is enabled, the lower level mclock resource control
1097 parameters [*reservation, weight, limit*] and some Ceph
1098 configuration parameters are set transparently. Note that the
1099 above does not apply for the *custom* profile.
1100 default: high_client_ops
1101 see_also:
1102 - osd_op_queue
1103 enum_values:
1104 - balanced
1105 - high_recovery_ops
1106 - high_client_ops
1107 - custom
1108 flags:
1109 - runtime
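# Illustrative example (not part of the schema): temporarily favoring recovery
# over client traffic, then returning to the default profile; both values are
# built-in profiles from the enum above.
#   ceph config set osd osd_mclock_profile high_recovery_ops
#   ceph config set osd osd_mclock_profile high_client_ops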
1110 # Set to true for testing. Users should NOT set this.
1111 # If set to true even after reading enough shards to
1112 # decode the object, any error will be reported.
1113 - name: osd_read_ec_check_for_errors
1114 type: bool
1115 level: advanced
1116 default: false
1117 with_legacy: true
1118 - name: osd_recovery_delay_start
1119 type: float
1120 level: advanced
1121 default: 0
1122 fmt_desc: After peering completes, Ceph will delay for the specified number
1123 of seconds before starting to recover RADOS objects.
1124 with_legacy: true
1125 - name: osd_recovery_max_active
1126 type: uint
1127 level: advanced
1128 desc: Number of simultaneous active recovery operations per OSD (overrides _ssd
1129 and _hdd if non-zero)
1130 fmt_desc: The number of active recovery requests per OSD at one time. More
1131 requests will accelerate recovery, but the requests place an
1132 increased load on the cluster.
1133 note: This value is only used if it is non-zero. Normally it
1134 is ``0``, which means that the ``hdd`` or ``ssd`` values
1135 (below) are used, depending on the type of the primary
1136 device backing the OSD.
1137 default: 0
1138 see_also:
1139 - osd_recovery_max_active_hdd
1140 - osd_recovery_max_active_ssd
1141 flags:
1142 - runtime
1143 with_legacy: true
1144 - name: osd_recovery_max_active_hdd
1145 type: uint
1146 level: advanced
1147 desc: Number of simultaneous active recovery operations per OSD (for rotational
1148 devices)
1149 fmt_desc: The number of active recovery requests per OSD at one time, if the
1150 primary device is rotational.
1151 default: 3
1152 see_also:
1153 - osd_recovery_max_active
1154 - osd_recovery_max_active_ssd
1155 flags:
1156 - runtime
1157 with_legacy: true
1158 - name: osd_recovery_max_active_ssd
1159 type: uint
1160 level: advanced
1161 desc: Number of simultaneous active recovery operations per OSD (for non-rotational
1162 solid state devices)
1163 fmt_desc: The number of active recovery requests per OSD at one time, if the
1164 primary device is non-rotational (i.e., an SSD).
1165 default: 10
1166 see_also:
1167 - osd_recovery_max_active
1168 - osd_recovery_max_active_hdd
1169 flags:
1170 - runtime
1171 with_legacy: true
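# Illustrative example (not part of the schema): because osd_recovery_max_active
# defaults to 0, the _hdd/_ssd variants apply; raising the HDD value speeds up
# recovery on rotational OSDs at the cost of client I/O. The value 5 is arbitrary.
#   ceph config set osd osd_recovery_max_active_hdd 5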
1172 - name: osd_recovery_max_single_start
1173 type: uint
1174 level: advanced
1175 default: 1
1176 fmt_desc: The maximum number of recovery operations per OSD that will be
1177 newly started when an OSD is recovering.
1178 with_legacy: true
1179 # max size of push chunk
1180 - name: osd_recovery_max_chunk
1181 type: size
1182 level: advanced
1183 default: 8_M
1184 fmt_desc: the maximum total size of data chunks a recovery op can carry.
1185 with_legacy: true
1186 # max number of omap entries per chunk; 0 to disable limit
1187 - name: osd_recovery_max_omap_entries_per_chunk
1188 type: uint
1189 level: advanced
1190 default: 8096
1191 with_legacy: true
1192 # max size of a COPYFROM chunk
1193 - name: osd_copyfrom_max_chunk
1194 type: size
1195 level: advanced
1196 default: 8_M
1197 with_legacy: true
1198 # push cost per object
1199 - name: osd_push_per_object_cost
1200 type: size
1201 level: advanced
1202 default: 1000
1203 fmt_desc: the overhead for serving a push op
1204 with_legacy: true
1205 # max size of push message
1206 - name: osd_max_push_cost
1207 type: size
1208 level: advanced
1209 default: 8_M
1210 with_legacy: true
1211 # max objects in single push op
1212 - name: osd_max_push_objects
1213 type: uint
1214 level: advanced
1215 default: 10
1216 with_legacy: true
1217 # Only use clone_overlap for recovery if there are fewer than
1218 # osd_recover_clone_overlap_limit entries in the overlap set
1219 - name: osd_recover_clone_overlap_limit
1220 type: uint
1221 level: advanced
1222 default: 10
1223 flags:
1224 - runtime
1225 - name: osd_debug_feed_pullee
1226 type: int
1227 level: dev
1228 desc: Feed a pullee, and force primary to pull a currently missing object from it
1229 default: -1
1230 with_legacy: true
1231 - name: osd_backfill_scan_min
1232 type: int
1233 level: advanced
1234 default: 64
1235 fmt_desc: The minimum number of objects per backfill scan.
1236 with_legacy: true
1237 - name: osd_backfill_scan_max
1238 type: int
1239 level: advanced
1240 default: 512
1241 fmt_desc: The maximum number of objects per backfill scan.
1242 with_legacy: true
1243 # minimum number of peers
1244 - name: osd_heartbeat_min_peers
1245 type: int
1246 level: advanced
1247 default: 10
1248 with_legacy: true
1249 - name: osd_delete_sleep
1250 type: float
1251 level: advanced
1252 desc: Time in seconds to sleep before next removal transaction (overrides values
1253 below)
1254 fmt_desc: Time in seconds to sleep before the next removal transaction. This
1255 throttles the PG deletion process.
1256 default: 0
1257 flags:
1258 - runtime
1259 - name: osd_delete_sleep_hdd
1260 type: float
1261 level: advanced
1262 desc: Time in seconds to sleep before next removal transaction for HDDs
1263 default: 5
1264 flags:
1265 - runtime
1266 - name: osd_delete_sleep_ssd
1267 type: float
1268 level: advanced
1269 desc: Time in seconds to sleep before next removal transaction for SSDs
1270 default: 1
1271 flags:
1272 - runtime
1273 - name: osd_delete_sleep_hybrid
1274 type: float
1275 level: advanced
1276 desc: Time in seconds to sleep before next removal transaction when OSD data is on HDD
1277 and OSD journal or WAL+DB is on SSD
1278 default: 1
1279 flags:
1280 - runtime
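# Illustrative example (not part of the schema): throttling PG deletion on HDD
# OSDs when removal traffic competes with client I/O; 10 seconds is an arbitrary
# value.
#   ceph config set osd osd_delete_sleep_hdd 10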