]> git.proxmox.com Git - ceph.git/blob - ceph/src/common/options/global.yaml.in
3bba588296951c7002032366f8dcc5f4f5b74888
[ceph.git] / ceph / src / common / options / global.yaml.in
1 # -*- mode: YAML -*-
2 ---
3
4 options:
5 - name: host
6 type: str
7 level: basic
8 desc: local hostname
9 long_desc: if blank, ceph assumes the short hostname (hostname -s)
10 tags:
11 - network
12 services:
13 - common
14 flags:
15 - no_mon_update
16 with_legacy: true
17 - name: fsid
18 type: uuid
19 level: basic
20 desc: cluster fsid (uuid)
21 fmt_desc: The cluster ID. One per cluster.
22 May be generated by a deployment tool if not specified.
23 note: Do not set this value if you use a deployment tool that does
24 it for you.
25 tags:
26 - service
27 services:
28 - common
29 flags:
30 - no_mon_update
31 - startup
32 - name: public_addr
33 type: addr
34 level: basic
35 desc: public-facing address to bind to
36 fmt_desc: The IP address for the public (front-side) network.
37 Set for each daemon.
38 services:
39 - mon
40 - mds
41 - osd
42 - mgr
43 flags:
44 - startup
45 with_legacy: true
46 - name: public_addrv
47 type: addrvec
48 level: basic
49 desc: public-facing address to bind to
50 services:
51 - mon
52 - mds
53 - osd
54 - mgr
55 flags:
56 - startup
57 with_legacy: true
58 - name: public_bind_addr
59 type: addr
60 level: advanced
61 services:
62 - mon
63 flags:
64 - startup
65 fmt_desc: In some dynamic deployments the Ceph MON daemon might bind
66 to an IP address locally that is different from the ``public_addr``
67 advertised to other peers in the network. The environment must ensure
68 that routing rules are set correctly. If ``public_bind_addr`` is set
69 the Ceph Monitor daemon will bind to it locally and use ``public_addr``
70 in the monmaps to advertise its address to peers. This behavior is limited
71 to the Monitor daemon.
72 with_legacy: true
73 - name: cluster_addr
74 type: addr
75 level: basic
76 desc: cluster-facing address to bind to
77 fmt_desc: The IP address for the cluster (back-side) network.
78 Set for each daemon.
79 tags:
80 - network
81 services:
82 - osd
83 flags:
84 - startup
85 with_legacy: true
86 - name: public_network
87 type: str
88 level: advanced
89 desc: Network(s) from which to choose a public address to bind to
90 fmt_desc: The IP address and netmask of the public (front-side) network
91 (e.g., ``192.168.0.0/24``). Set in ``[global]``. You may specify
92 comma-separated subnets. The format of it looks like
93 ``{ip-address}/{netmask} [, {ip-address}/{netmask}]``
94 tags:
95 - network
96 services:
97 - mon
98 - mds
99 - osd
100 - mgr
101 flags:
102 - startup
103 with_legacy: true
104 - name: public_network_interface
105 type: str
106 level: advanced
107 desc: Interface name(s) from which to choose an address from a public_network to
108 bind to; public_network must also be specified.
109 tags:
110 - network
111 services:
112 - mon
113 - mds
114 - osd
115 - mgr
116 see_also:
117 - public_network
118 flags:
119 - startup
120 - name: cluster_network
121 type: str
122 level: advanced
123 desc: Network(s) from which to choose a cluster address to bind to
124 fmt_desc: The IP address and netmask of the cluster (back-side) network
125 (e.g., ``10.0.0.0/24``). Set in ``[global]``. You may specify
126 comma-separated subnets. The format of it looks like
127 ``{ip-address}/{netmask} [, {ip-address}/{netmask}]``
128 tags:
129 - network
130 services:
131 - osd
132 flags:
133 - startup
134 with_legacy: true
135 - name: cluster_network_interface
136 type: str
137 level: advanced
138 desc: Interface name(s) from which to choose an address from a cluster_network to
139 bind to; cluster_network must also be specified.
140 tags:
141 - network
142 services:
143 - mon
144 - mds
145 - osd
146 - mgr
147 see_also:
148 - cluster_network
149 flags:
150 - startup
151 - name: monmap
152 type: str
153 level: advanced
154 desc: path to MonMap file
155 long_desc: This option is normally used during mkfs, but can also be used to identify
156 which monitors to connect to.
157 services:
158 - mon
159 flags:
160 - no_mon_update
161 - create
162 - name: mon_host
163 type: str
164 level: basic
165 desc: list of hosts or addresses to search for a monitor
166 long_desc: This is a list of IP addresses or hostnames that are separated by commas, whitespace, or semicolons. Hostnames are resolved via DNS. All A and AAAA records are included in the search list.
167 services:
168 - common
169 flags:
170 - no_mon_update
171 - startup
172 - name: mon_host_override
173 type: str
174 level: advanced
175 desc: monitor(s) to use overriding the MonMap
176 fmt_desc: This is the list of monitors that the Ceph process **initially** contacts when first establishing communication with the Ceph cluster. This overrides the known monitor list that is derived from MonMap updates sent to older Ceph instances (like librados cluster handles). This option is expected to be useful primarily for debugging.
177 services:
178 - common
179 flags:
180 - no_mon_update
181 - startup
182 - name: mon_dns_srv_name
183 type: str
184 level: advanced
185 desc: name of DNS SRV record to check for monitor addresses
186 fmt_desc: the service name used when querying the DNS for the monitor hosts/addresses
187 default: ceph-mon
188 tags:
189 - network
190 services:
191 - common
192 see_also:
193 - mon_host
194 flags:
195 - startup
196 - name: container_image
197 type: str
198 level: basic
199 desc: container image (used by cephadm orchestrator)
200 default: docker.io/ceph/daemon-base:latest-master-devel
201 flags:
202 - startup
203 - name: no_config_file
204 type: bool
205 level: advanced
206 desc: signal that we don't require a config file to be present
207 long_desc: When specified, we won't be looking for a configuration file, and will
208 instead expect that whatever options or values are required for us to work will
209 be passed as arguments.
210 default: false
211 tags:
212 - config
213 services:
214 - common
215 flags:
216 - no_mon_update
217 - startup
218 - name: lockdep
219 type: bool
220 level: dev
221 desc: enable lockdep lock dependency analyzer
222 default: false
223 services:
224 - common
225 flags:
226 - no_mon_update
227 - startup
228 with_legacy: true
229 - name: lockdep_force_backtrace
230 type: bool
231 level: dev
232 desc: always gather current backtrace at every lock
233 default: false
234 services:
235 - common
236 see_also:
237 - lockdep
238 flags:
239 - startup
240 with_legacy: true
241 - name: run_dir
242 type: str
243 level: advanced
244 desc: path for the 'run' directory for storing pid and socket files
245 default: /var/run/ceph
246 services:
247 - common
248 see_also:
249 - admin_socket
250 flags:
251 - startup
252 with_legacy: true
253 - name: admin_socket
254 type: str
255 level: advanced
256 desc: path for the runtime control socket file, used by the 'ceph daemon' command
257 fmt_desc: The socket for executing administrative commands on a daemon,
258 irrespective of whether Ceph Monitors have established a quorum.
259 daemon_default: $run_dir/$cluster-$name.asok
260 services:
261 - common
262 flags:
263 - startup
264 # default changed by common_preinit()
265 with_legacy: true
266 - name: admin_socket_mode
267 type: str
268 level: advanced
269 desc: file mode to set for the admin socket file, e.g., '0755'
270 services:
271 - common
272 see_also:
273 - admin_socket
274 flags:
275 - startup
276 with_legacy: true
277 - name: daemonize
278 type: bool
279 level: advanced
280 desc: whether to daemonize (background) after startup
281 default: false
282 daemon_default: true
283 tags:
284 - service
285 services:
286 - mon
287 - mgr
288 - osd
289 - mds
290 see_also:
291 - pid_file
292 - chdir
293 flags:
294 - no_mon_update
295 - startup
296 # default changed by common_preinit()
297 with_legacy: true
298 - name: setuser
299 type: str
300 level: advanced
301 desc: uid or user name to switch to on startup
302 long_desc: This is normally specified by the systemd unit file.
303 tags:
304 - service
305 services:
306 - mon
307 - mgr
308 - osd
309 - mds
310 see_also:
311 - setgroup
312 flags:
313 - startup
314 with_legacy: true
315 - name: setgroup
316 type: str
317 level: advanced
318 desc: gid or group name to switch to on startup
319 long_desc: This is normally specified by the systemd unit file.
320 tags:
321 - service
322 services:
323 - mon
324 - mgr
325 - osd
326 - mds
327 see_also:
328 - setuser
329 flags:
330 - startup
331 with_legacy: true
332 - name: setuser_match_path
333 type: str
334 level: advanced
335 desc: if set, setuser/setgroup is conditional on this path matching ownership
336 long_desc: If setuser or setgroup are specified, and this option is non-empty, then
337 the uid/gid of the daemon will only be changed if the file or directory specified
338 by this option has a matching uid and/or gid. This exists primarily to allow
339 switching to user ceph for OSDs to be conditional on whether the osd data contents
340 have also been chowned after an upgrade. This is normally specified by the systemd
341 unit file.
342 tags:
343 - service
344 services:
345 - mon
346 - mgr
347 - osd
348 - mds
349 see_also:
350 - setuser
351 - setgroup
352 flags:
353 - startup
354 with_legacy: true
355 - name: pid_file
356 type: str
357 level: advanced
358 desc: path to write a pid file (if any)
359 fmt_desc: The file in which the mon, osd or mds will write its
360 PID. For instance, ``/var/run/$cluster/$type.$id.pid``
361 will create /var/run/ceph/mon.a.pid for the ``mon`` with
362 id ``a`` running in the ``ceph`` cluster. The ``pid
363 file`` is removed when the daemon stops gracefully. If
364 the process is not daemonized (i.e. runs with the ``-f``
365 or ``-d`` option), the ``pid file`` is not created.
366 tags:
367 - service
368 services:
369 - mon
370 - mgr
371 - osd
372 - mds
373 flags:
374 - startup
375 with_legacy: true
376 - name: chdir
377 type: str
378 level: advanced
379 desc: path to chdir(2) to after daemonizing
380 fmt_desc: The directory Ceph daemons change to once they are
381 up and running. Default ``/`` directory recommended.
382 tags:
383 - service
384 services:
385 - mon
386 - mgr
387 - osd
388 - mds
389 see_also:
390 - daemonize
391 flags:
392 - no_mon_update
393 - startup
394 with_legacy: true
395 - name: fatal_signal_handlers
396 type: bool
397 level: advanced
398 desc: whether to register signal handlers for SIGABRT etc that dump a stack trace
399 long_desc: This is normally true for daemons and false for libraries.
400 fmt_desc: If set, we will install signal handlers for SEGV, ABRT, BUS, ILL,
401 FPE, XCPU, XFSZ, SYS signals to generate a useful log message
402 default: true
403 tags:
404 - service
405 services:
406 - mon
407 - mgr
408 - osd
409 - mds
410 flags:
411 - startup
412 with_legacy: true
413 - name: crash_dir
414 type: str
415 level: advanced
416 desc: Directory where crash reports are archived
417 default: /var/lib/ceph/crash
418 flags:
419 - startup
420 with_legacy: true
421 - name: restapi_log_level
422 type: str
423 level: advanced
424 desc: default set by python code
425 with_legacy: true
426 - name: restapi_base_url
427 type: str
428 level: advanced
429 desc: default set by python code
430 with_legacy: true
431 - name: erasure_code_dir
432 type: str
433 level: advanced
434 desc: directory where erasure-code plugins can be found
435 default: @CEPH_INSTALL_FULL_PKGLIBDIR@/erasure-code
436 services:
437 - mon
438 - osd
439 flags:
440 - startup
441 with_legacy: true
442 - name: log_file
443 type: str
444 level: basic
445 desc: path to log file
446 fmt_desc: The location of the logging file for your cluster.
447 daemon_default: /var/log/ceph/$cluster-$name.log
448 see_also:
449 - log_to_file
450 - log_to_stderr
451 - err_to_stderr
452 - log_to_syslog
453 - err_to_syslog
454 # default changed by common_preinit()
455 with_legacy: true
456 - name: log_max_new
457 type: int
458 level: advanced
459 desc: max unwritten log entries to allow before waiting to flush to the log
460 fmt_desc: The maximum number of new (unwritten) log entries allowed before waiting to flush to the log.
461 default: 1000
462 see_also:
463 - log_max_recent
464 # default changed by common_preinit()
465 with_legacy: true
466 - name: log_max_recent
467 type: int
468 level: advanced
469 desc: recent log entries to keep in memory to dump in the event of a crash
470 long_desc: The purpose of this option is to log at a higher debug level only to
471 the in-memory buffer, and write out the detailed log messages only if there is
472 a crash. Only log entries below the lower log level will be written unconditionally
473 to the log. For example, debug_osd=1/5 will write everything <= 1 to the log
474 unconditionally but keep entries at levels 2-5 in memory. If there is a seg fault
475 or assertion failure, all entries will be dumped to the log.
476 default: 500
477 daemon_default: 10000
478 # default changed by common_preinit()
479 with_legacy: true
480 - name: log_to_file
481 type: bool
482 level: basic
483 desc: send log lines to a file
484 fmt_desc: Determines if logging messages should appear in a file.
485 default: true
486 see_also:
487 - log_file
488 with_legacy: true
489 - name: log_to_stderr
490 type: bool
491 level: basic
492 desc: send log lines to stderr
493 fmt_desc: Determines if logging messages should appear in ``stderr``.
494 default: true
495 daemon_default: false
496 with_legacy: true
497 - name: err_to_stderr
498 type: bool
499 level: basic
500 desc: send critical error log lines to stderr
501 fmt_desc: Determines if error messages should appear in ``stderr``.
502 default: false
503 daemon_default: true
504 with_legacy: true
505 - name: log_stderr_prefix
506 type: str
507 level: advanced
508 desc: String to prefix log messages with when sent to stderr
509 long_desc: This is useful in container environments when combined with mon_cluster_log_to_stderr. The
510 mon log prefixes each line with the channel name (e.g., 'default', 'audit'), while
511 log_stderr_prefix can be set to 'debug '.
512 see_also:
513 - mon_cluster_log_to_stderr
514 - name: log_to_syslog
515 type: bool
516 level: basic
517 desc: send log lines to syslog facility
518 fmt_desc: Determines if logging messages should appear in ``syslog``.
519 default: false
520 with_legacy: true
521 - name: err_to_syslog
522 type: bool
523 level: basic
524 desc: send critical error log lines to syslog facility
525 fmt_desc: Determines if error messages should appear in ``syslog``.
526 default: false
527 with_legacy: true
528 - name: log_flush_on_exit
529 type: bool
530 level: advanced
531 desc: set a process exit handler to ensure the log is flushed on exit
532 fmt_desc: Determines if Ceph should flush the log files after exit.
533 default: false
534 with_legacy: true
535 - name: log_stop_at_utilization
536 type: float
537 level: basic
538 desc: stop writing to the log file when device utilization reaches this ratio
539 default: 0.97
540 see_also:
541 - log_file
542 min: 0
543 max: 1
544 with_legacy: true
545 - name: log_to_graylog
546 type: bool
547 level: basic
548 desc: send log lines to remote graylog server
549 default: false
550 see_also:
551 - err_to_graylog
552 - log_graylog_host
553 - log_graylog_port
554 with_legacy: true
555 - name: err_to_graylog
556 type: bool
557 level: basic
558 desc: send critical error log lines to remote graylog server
559 default: false
560 see_also:
561 - log_to_graylog
562 - log_graylog_host
563 - log_graylog_port
564 with_legacy: true
565 - name: log_graylog_host
566 type: str
567 level: basic
568 desc: address or hostname of graylog server to log to
569 default: 127.0.0.1
570 see_also:
571 - log_to_graylog
572 - err_to_graylog
573 - log_graylog_port
574 with_legacy: true
575 - name: log_graylog_port
576 type: int
577 level: basic
578 desc: port number for the remote graylog server
579 default: 12201
580 see_also:
581 - log_graylog_host
582 with_legacy: true
583 - name: log_to_journald
584 type: bool
585 level: basic
586 desc: send log lines to journald
587 default: false
588 see_also:
589 - err_to_journald
590 - name: err_to_journald
591 type: bool
592 level: basic
593 desc: send critical error log lines to journald
594 default: false
595 see_also:
596 - log_to_journald
597 - name: log_coarse_timestamps
598 type: bool
599 level: advanced
600 desc: timestamp log entries from coarse system clock to improve performance
601 default: true
602 tags:
603 - performance
604 - service
605 services:
606 - common
607 # options will take k/v pairs, or single-item that will be assumed as general
608 # default for all, regardless of channel.
609 # e.g., "info" would be taken as the same as "default=info"
610 # also, "default=daemon audit=local0" would mean
611 # "default all to 'daemon', override 'audit' with 'local0'"
612 - name: clog_to_monitors
613 type: str
614 level: advanced
615 desc: Make daemons send cluster log messages to monitors
616 fmt_desc: Determines if ``clog`` messages should be sent to monitors.
617 default: default=true
618 flags:
619 - runtime
620 with_legacy: true
621 services:
622 - mgr
623 - osd
624 - mds
625 - name: clog_to_syslog
626 type: str
627 level: advanced
628 desc: Make daemons send cluster log messages to syslog
629 fmt_desc: Determines if ``clog`` messages should be sent to syslog.
630 default: 'false'
631 flags:
632 - runtime
633 with_legacy: true
634 services:
635 - mon
636 - mgr
637 - osd
638 - mds
639 - name: clog_to_syslog_level
640 type: str
641 level: advanced
642 desc: Syslog level for cluster log messages
643 default: info
644 see_also:
645 - clog_to_syslog
646 flags:
647 - runtime
648 with_legacy: true
649 services:
650 - mon
651 - mgr
652 - osd
653 - mds
654 - name: clog_to_syslog_facility
655 type: str
656 level: advanced
657 desc: Syslog facility for cluster log messages
658 default: default=daemon audit=local0
659 see_also:
660 - clog_to_syslog
661 flags:
662 - runtime
663 with_legacy: true
664 services:
665 - mon
666 - mgr
667 - osd
668 - mds
669 - name: clog_to_graylog
670 type: str
671 level: advanced
672 desc: Make daemons send cluster log to graylog
673 default: 'false'
674 flags:
675 - runtime
676 services:
677 - mon
678 - mgr
679 - osd
680 - mds
681 - name: clog_to_graylog_host
682 type: str
683 level: advanced
684 desc: Graylog host for cluster log messages
685 default: 127.0.0.1
686 see_also:
687 - clog_to_graylog
688 flags:
689 - runtime
690 with_legacy: true
691 services:
692 - mon
693 - mgr
694 - osd
695 - mds
696 - name: clog_to_graylog_port
697 type: str
698 level: advanced
699 desc: Graylog port number for cluster log messages
700 default: '12201'
701 see_also:
702 - clog_to_graylog
703 flags:
704 - runtime
705 with_legacy: true
706 services:
707 - mon
708 - mgr
709 - osd
710 - mds
711 - name: enable_experimental_unrecoverable_data_corrupting_features
712 type: str
713 level: advanced
714 desc: Enable named (or all with '*') experimental features that may be untested,
715 dangerous, and/or cause permanent data loss
716 flags:
717 - runtime
718 with_legacy: true
719 - name: plugin_dir
720 type: str
721 level: advanced
722 desc: Base directory for dynamically loaded plugins
723 default: @CEPH_INSTALL_FULL_PKGLIBDIR@
724 services:
725 - mon
726 - osd
727 flags:
728 - startup
729 - name: compressor_zlib_isal
730 type: bool
731 level: advanced
732 desc: Use Intel ISA-L accelerated zlib implementation if available
733 default: false
734 with_legacy: true
735 # regular zlib compression level, not applicable to isa-l optimized version
736 - name: compressor_zlib_level
737 type: int
738 level: advanced
739 desc: Zlib compression level to use
740 default: 5
741 with_legacy: true
742 # regular zlib compression winsize, not applicable to isa-l optimized version
743 - name: compressor_zlib_winsize
744 type: int
745 level: advanced
746 desc: Zlib compression winsize to use
747 default: -15
748 min: -15
749 max: 32
750 with_legacy: true
751 # regular zstd compression level
752 - name: compressor_zstd_level
753 type: int
754 level: advanced
755 desc: Zstd compression level to use
756 default: 1
757 with_legacy: true
758 - name: qat_compressor_enabled
759 type: bool
760 level: advanced
761 desc: Enable Intel QAT acceleration support for compression if available
762 default: false
763 with_legacy: true
764 - name: plugin_crypto_accelerator
765 type: str
766 level: advanced
767 desc: Crypto accelerator library to use
768 default: crypto_isal
769 with_legacy: true
770 - name: openssl_engine_opts
771 type: str
772 level: advanced
773 desc: Use engine for specific openssl algorithm
774 long_desc: 'Pass opts in this way: engine_id=engine1,dynamic_path=/some/path/engine1.so,default_algorithms=DIGESTS:engine_id=engine2,dynamic_path=/some/path/engine2.so,default_algorithms=CIPHERS,other_ctrl=other_value'
775 flags:
776 - startup
777 with_legacy: true
778 - name: mempool_debug
779 type: bool
780 level: dev
781 default: false
782 flags:
783 - no_mon_update
784 with_legacy: true
785 - name: thp
786 type: bool
787 level: dev
788 desc: enable transparent huge page (THP) support
789 long_desc: Ceph is known to suffer from memory fragmentation due to THP use. This
790 is indicated by RSS usage above configured memory targets. Enabling THP is currently
791 discouraged until selective use of THP by Ceph is implemented.
792 default: false
793 flags:
794 - startup
795 - name: key
796 type: str
797 level: advanced
798 desc: Authentication key
799 long_desc: A CephX authentication key, base64 encoded. It normally looks something
800 like 'AQAtut9ZdMbNJBAAHz6yBAWyJyz2yYRyeMWDag=='.
801 fmt_desc: The key (i.e., the text string of the key itself). Not recommended.
802 see_also:
803 - keyfile
804 - keyring
805 flags:
806 - no_mon_update
807 - startup
808 with_legacy: true
809 - name: keyfile
810 type: str
811 level: advanced
812 desc: Path to a file containing a key
813 long_desc: The file should contain a CephX authentication key and optionally a trailing
814 newline, but nothing else.
815 fmt_desc: The path to a key file (i.e., a file containing only the key).
816 see_also:
817 - key
818 flags:
819 - no_mon_update
820 - startup
821 with_legacy: true
822 - name: keyring
823 type: str
824 level: advanced
825 desc: Path to a keyring file.
826 long_desc: A keyring file is an INI-style formatted file where the section names
827 are client or daemon names (e.g., 'osd.0') and each section contains a 'key' property
828 with CephX authentication key as the value.
829 # Please note: the documentation is generated without access to the CMake
830 # variables, so please update the document manually with a representative
831 # default value using the ":default:" option of the ".. confval::" directive.
832 default: @keyring_paths@
833 see_also:
834 - key
835 - keyfile
836 flags:
837 - no_mon_update
838 - startup
839 with_legacy: true
840 - name: heartbeat_interval
841 type: int
842 level: advanced
843 desc: Frequency of internal heartbeat checks (seconds)
844 default: 5
845 flags:
846 - startup
847 with_legacy: true
848 - name: heartbeat_file
849 type: str
850 level: advanced
851 desc: File to touch on successful internal heartbeat
852 long_desc: If set, this file will be touched every time an internal heartbeat check
853 succeeds.
854 see_also:
855 - heartbeat_interval
856 flags:
857 - startup
858 with_legacy: true
859 - name: heartbeat_inject_failure
860 type: int
861 level: dev
862 default: 0
863 with_legacy: true
864 - name: perf
865 type: bool
866 level: advanced
867 desc: Enable internal performance metrics
868 long_desc: If enabled, collect and expose internal health metrics
869 default: true
870 with_legacy: true
871 - name: ms_type
872 type: str
873 level: advanced
874 desc: Messenger implementation to use for network communication
875 fmt_desc: Transport type used by Async Messenger. Can be ``async+posix``,
876 ``async+dpdk`` or ``async+rdma``. Posix uses standard TCP/IP networking and is
877 default. Other transports may be experimental and support may be limited.
878 default: async+posix
879 flags:
880 - startup
881 with_legacy: true
882 - name: ms_public_type
883 type: str
884 level: advanced
885 desc: Messenger implementation to use for the public network
886 long_desc: If not specified, use ms_type
887 see_also:
888 - ms_type
889 flags:
890 - startup
891 with_legacy: true
892 - name: ms_cluster_type
893 type: str
894 level: advanced
895 desc: Messenger implementation to use for the internal cluster network
896 long_desc: If not specified, use ms_type
897 see_also:
898 - ms_type
899 flags:
900 - startup
901 with_legacy: true
902 - name: ms_mon_cluster_mode
903 type: str
904 level: basic
905 desc: Connection modes (crc, secure) for intra-mon connections in order of preference
906 fmt_desc: the connection mode (or permitted modes) to use between monitors.
907 default: secure crc
908 see_also:
909 - ms_mon_service_mode
910 - ms_mon_client_mode
911 - ms_service_mode
912 - ms_cluster_mode
913 - ms_client_mode
914 flags:
915 - startup
916 - name: ms_mon_service_mode
917 type: str
918 level: basic
919 desc: Allowed connection modes (crc, secure) for connections to mons
920 fmt_desc: a list of permitted modes for clients or
921 other Ceph daemons to use when connecting to monitors.
922 default: secure crc
923 see_also:
924 - ms_service_mode
925 - ms_mon_cluster_mode
926 - ms_mon_client_mode
927 - ms_cluster_mode
928 - ms_client_mode
929 flags:
930 - startup
931 - name: ms_mon_client_mode
932 type: str
933 level: basic
934 desc: Connection modes (crc, secure) for connections from clients to monitors in
935 order of preference
936 fmt_desc: a list of connection modes, in order of
937 preference, for clients or non-monitor daemons to use when
938 connecting to monitors.
939 default: secure crc
940 see_also:
941 - ms_mon_service_mode
942 - ms_mon_cluster_mode
943 - ms_service_mode
944 - ms_cluster_mode
945 - ms_client_mode
946 flags:
947 - startup
948 - name: ms_cluster_mode
949 type: str
950 level: basic
951 desc: Connection modes (crc, secure) for intra-cluster connections in order of preference
952 fmt_desc: connection mode (or permitted modes) used
953 for intra-cluster communication between Ceph daemons. If multiple
954 modes are listed, the modes listed first are preferred.
955 default: crc secure
956 see_also:
957 - ms_service_mode
958 - ms_client_mode
959 flags:
960 - startup
961 - name: ms_service_mode
962 type: str
963 level: basic
964 desc: Allowed connection modes (crc, secure) for connections to daemons
965 fmt_desc: a list of permitted modes for clients to use
966 when connecting to the cluster.
967 default: crc secure
968 see_also:
969 - ms_cluster_mode
970 - ms_client_mode
971 flags:
972 - startup
973 - name: ms_client_mode
974 type: str
975 level: basic
976 desc: Connection modes (crc, secure) for connections from clients in order of preference
977 fmt_desc: a list of connection modes, in order of
978 preference, for clients to use (or allow) when talking to a Ceph
979 cluster.
980 default: crc secure
981 see_also:
982 - ms_cluster_mode
983 - ms_service_mode
984 flags:
985 - startup
986 - name: ms_osd_compress_mode
987 type: str
988 level: advanced
989 desc: Compression policy to use in Messenger for communicating with OSD
990 default: none
991 services:
992 - osd
993 enum_values:
994 - none
995 - force
996 see_also:
997 - ms_compress_secure
998 flags:
999 - runtime
1000 - name: ms_osd_compress_min_size
1001 type: uint
1002 level: advanced
1003 desc: Minimal message size eligible for on-wire compression
1004 default: 1_K
1005 services:
1006 - osd
1007 see_also:
1008 - ms_osd_compress_mode
1009 flags:
1010 - runtime
1011 - name: ms_osd_compression_algorithm
1012 type: str
1013 level: advanced
1014 desc: Compression algorithm to use in Messenger when communicating with OSD
1015 long_desc: Compression algorithm for connections with OSD in order of preference
1016 default: snappy zlib zstd lz4
1017 services:
1018 - osd
1019 see_also:
1020 - ms_osd_compress_mode
1021 flags:
1022 - runtime
1023 - name: ms_compress_secure
1024 type: bool
1025 level: advanced
1026 desc: Allow compression when on-wire encryption is enabled
1027 long_desc: Combining encryption with compression reduces the level of security of
1028 messages between peers. In case both encryption and compression are enabled,
1029 compression setting will be ignored and message will not be compressed.
1030 This behaviour can be overridden using this setting.
1031 default: false
1032 see_also:
1033 - ms_osd_compress_mode
1034 flags:
1035 - runtime
1036 - name: ms_learn_addr_from_peer
1037 type: bool
1038 level: advanced
1039 desc: Learn address from what IP our first peer thinks we connect from
1040 long_desc: Use the IP address our first peer (usually a monitor) sees that we are
1041 connecting from. This is useful if a client is behind some sort of NAT and we
1042 want to see it identified by its local (not NATed) address.
1043 default: true
1044 with_legacy: true
1045 - name: ms_tcp_nodelay
1046 type: bool
1047 level: advanced
1048 desc: Disable Nagle's algorithm and send queued network traffic immediately
1049 fmt_desc: Ceph enables ``ms_tcp_nodelay`` so that each request is sent
1050 immediately (no buffering). Disabling `Nagle's algorithm`_
1051 increases network traffic, which can introduce latency. If you
1052 experience large numbers of small packets, you may try
1053 disabling ``ms_tcp_nodelay``.
1054 default: true
1055 with_legacy: true
1056 - name: ms_tcp_rcvbuf
1057 type: size
1058 level: advanced
1059 desc: Size of TCP socket receive buffer
1060 fmt_desc: The size of the socket buffer on the receiving end of a network
1061 connection. Disabled by default.
1062 default: 0
1063 with_legacy: true
1064 - name: ms_tcp_prefetch_max_size
1065 type: size
1066 level: advanced
1067 desc: Maximum amount of data to prefetch out of the socket receive buffer
1068 default: 4_K
1069 with_legacy: true
1070 - name: ms_initial_backoff
1071 type: float
1072 level: advanced
1073 desc: Initial backoff after a network error is detected (seconds)
1074 fmt_desc: The initial time to wait before reconnecting on a fault.
1075 default: 0.2
1076 with_legacy: true
1077 - name: ms_max_backoff
1078 type: float
1079 level: advanced
1080 desc: Maximum backoff after a network error before retrying (seconds)
1081 fmt_desc: The maximum time to wait before reconnecting on a fault.
1082 default: 15
1083 see_also:
1084 - ms_initial_backoff
1085 with_legacy: true
1086 - name: ms_crc_data
1087 type: bool
1088 level: dev
1089 desc: Set and/or verify crc32c checksum on data payload sent over network
1090 default: true
1091 with_legacy: true
1092 - name: ms_crc_header
1093 type: bool
1094 level: dev
1095 desc: Set and/or verify crc32c checksum on header payload sent over network
1096 default: true
1097 with_legacy: true
1098 - name: ms_die_on_bad_msg
1099 type: bool
1100 level: dev
1101 desc: Induce a daemon crash/exit when a bad network message is received
1102 fmt_desc: Debug option; do not configure.
1103 default: false
1104 with_legacy: true
1105 - name: ms_die_on_unhandled_msg
1106 type: bool
1107 level: dev
1108 desc: Induce a daemon crash/exit when an unrecognized message is received
1109 default: false
1110 with_legacy: true
1111 - name: ms_die_on_old_message
1112 type: bool
1113 level: dev
1114 desc: Induce a daemon crash/exit when an old, undecodable message is received
1115 default: false
1116 with_legacy: true
1117 - name: ms_die_on_skipped_message
1118 type: bool
1119 level: dev
1120 desc: Induce a daemon crash/exit if sender skips a message sequence number
1121 default: false
1122 with_legacy: true
1123 - name: ms_die_on_bug
1124 type: bool
1125 level: dev
1126 desc: Induce a crash/exit on various bugs (for testing purposes)
1127 default: false
1128 with_legacy: true
1129 - name: ms_dispatch_throttle_bytes
1130 type: size
1131 level: advanced
1132 desc: Limit messages that are read off the network but still being processed
1133 fmt_desc: Throttles total size of messages waiting to be dispatched.
1134 default: 100_M
1135 with_legacy: true
1136 - name: ms_bind_ipv4
1137 type: bool
1138 level: advanced
1139 desc: Bind servers to IPv4 address(es)
1140 fmt_desc: Enables Ceph daemons to bind to IPv4 addresses.
1141 default: true
1142 see_also:
1143 - ms_bind_ipv6
1144 - name: ms_bind_ipv6
1145 type: bool
1146 level: advanced
1147 desc: Bind servers to IPv6 address(es)
1148 fmt_desc: Enables Ceph daemons to bind to IPv6 addresses.
1149 default: false
1150 see_also:
1151 - ms_bind_ipv4
1152 with_legacy: true
1153 - name: ms_bind_prefer_ipv4
1154 type: bool
1155 level: advanced
1156 desc: Prefer IPv4 over IPv6 address(es)
1157 default: false
1158 - name: ms_bind_msgr1
1159 type: bool
1160 level: advanced
1161 desc: Bind servers to msgr1 (legacy) protocol address(es)
1162 default: true
1163 see_also:
1164 - ms_bind_msgr2
1165 - name: ms_bind_msgr2
1166 type: bool
1167 level: advanced
1168 desc: Bind servers to msgr2 (nautilus+) protocol address(es)
1169 default: true
1170 see_also:
1171 - ms_bind_msgr1
1172 - name: ms_bind_port_min
1173 type: int
1174 level: advanced
1175 desc: Lowest port number to bind daemon(s) to
1176 fmt_desc: The minimum port number to which an OSD or MDS daemon will bind.
1177 default: 6800
1178 with_legacy: true
1179 - name: ms_bind_port_max
1180 type: int
1181 level: advanced
1182 desc: Highest port number to bind daemon(s) to
1183 fmt_desc: The maximum port number to which an OSD or MDS daemon will bind.
1184 default: 7568
1185 with_legacy: true
1186 # FreeBSD does not use SO_REUSEADDR so allow for a bit more time per default
1187 - name: ms_bind_retry_count
1188 type: int
1189 level: advanced
1190 desc: Number of attempts to make while bind(2)ing to a port
1191 default: @ms_bind_retry_count@
1192 with_legacy: true
1193 # FreeBSD does not use SO_REUSEADDR so allow for a bit more time per default
1194 - name: ms_bind_retry_delay
1195 type: int
1196 level: advanced
1197 desc: Delay between bind(2) attempts (seconds)
1198 default: @ms_bind_retry_delay@
1199 with_legacy: true
1200 - name: ms_bind_before_connect
1201 type: bool
1202 level: advanced
1203 desc: Call bind(2) on client sockets
1204 default: false
1205 with_legacy: true
1206 - name: ms_tcp_listen_backlog
1207 type: int
1208 level: advanced
1209 desc: Size of queue of incoming connections for accept(2)
1210 default: 512
1211 with_legacy: true
1212 - name: ms_connection_ready_timeout
1213 type: uint
1214 level: advanced
1215 desc: Time before we declare a not yet ready connection as dead (seconds)
1216 default: 10
1217 with_legacy: true
1218 - name: ms_connection_idle_timeout
1219 type: uint
1220 level: advanced
1221 desc: Time before an idle connection is closed (seconds)
1222 default: 900
1223 with_legacy: true
1224 - name: ms_pq_max_tokens_per_priority
1225 type: uint
1226 level: dev
1227 default: 16_M
1228 with_legacy: true
1229 - name: ms_pq_min_cost
1230 type: size
1231 level: dev
1232 default: 64_K
1233 with_legacy: true
1234 - name: ms_inject_socket_failures
1235 type: uint
1236 level: dev
1237 desc: Inject a socket failure every Nth socket operation
1238 fmt_desc: Debug option; do not configure.
1239 default: 0
1240 with_legacy: true
1241 - name: ms_inject_delay_type
1242 type: str
1243 level: dev
1244 desc: Entity type to inject delays for
1245 flags:
1246 - runtime
1247 with_legacy: true
1248 - name: ms_inject_delay_max
1249 type: float
1250 level: dev
1251 desc: Max delay to inject
1252 default: 1
1253 with_legacy: true
1254 - name: ms_inject_delay_probability
1255 type: float
1256 level: dev
1257 default: 0
1258 with_legacy: true
1259 - name: ms_inject_internal_delays
1260 type: float
1261 level: dev
1262 desc: Inject various internal delays to induce races (seconds)
1263 default: 0
1264 with_legacy: true
1265 - name: ms_blackhole_osd
1266 type: bool
1267 level: dev
1268 default: false
1269 with_legacy: true
1270 - name: ms_blackhole_mon
1271 type: bool
1272 level: dev
1273 default: false
1274 with_legacy: true
1275 - name: ms_blackhole_mds
1276 type: bool
1277 level: dev
1278 default: false
1279 with_legacy: true
1280 - name: ms_blackhole_mgr
1281 type: bool
1282 level: dev
1283 default: false
1284 with_legacy: true
1285 - name: ms_blackhole_client
1286 type: bool
1287 level: dev
1288 default: false
1289 with_legacy: true
1290 - name: ms_dump_on_send
1291 type: bool
1292 level: advanced
1293 desc: Hexdump message to debug log on message send
1294 default: false
1295 with_legacy: true
1296 - name: ms_dump_corrupt_message_level
1297 type: int
1298 level: advanced
1299 desc: Log level at which to hexdump corrupt messages we receive
1300 default: 1
1301 with_legacy: true
1302 # number of worker processing threads for async messenger created on init
1303 - name: ms_async_op_threads
1304 type: uint
1305 level: advanced
1306 desc: Threadpool size for AsyncMessenger (ms_type=async)
1307 fmt_desc: Initial number of worker threads used by each Async Messenger instance.
1308 Should be at least equal to highest number of replicas, but you can
1309 decrease it if you are low on CPU core count and/or you host a lot of
1310 OSDs on single server.
1311 default: 3
1312 min: 1
1313 max: 24
1314 with_legacy: true
1315 - name: ms_async_reap_threshold
1316 type: uint
1317 level: dev
1318 desc: number of deleted connections before we reap
1319 default: 5
1320 min: 1
1321 with_legacy: true
1322 - name: ms_async_rdma_device_name
1323 type: str
1324 level: advanced
1325 with_legacy: true
1326 - name: ms_async_rdma_enable_hugepage
1327 type: bool
1328 level: advanced
1329 default: false
1330 with_legacy: true
1331 - name: ms_async_rdma_buffer_size
1332 type: size
1333 level: advanced
1334 default: 128_K
1335 with_legacy: true
1336 - name: ms_async_rdma_send_buffers
1337 type: uint
1338 level: advanced
1339 default: 1_K
1340 with_legacy: true
1341 # size of the receive buffer pool, 0 is unlimited
1342 - name: ms_async_rdma_receive_buffers
1343 type: uint
1344 level: advanced
1345 default: 32_K
1346 with_legacy: true
1347 # max number of wr in srq
1348 - name: ms_async_rdma_receive_queue_len
1349 type: uint
1350 level: advanced
1351 default: 4_K
1352 with_legacy: true
1353 # support srq
1354 - name: ms_async_rdma_support_srq
1355 type: bool
1356 level: advanced
1357 default: true
1358 with_legacy: true
1359 - name: ms_async_rdma_port_num
1360 type: uint
1361 level: advanced
1362 default: 1
1363 with_legacy: true
1364 - name: ms_async_rdma_polling_us
1365 type: uint
1366 level: advanced
1367 default: 1000
1368 with_legacy: true
1369 - name: ms_async_rdma_gid_idx
1370 type: int
1371 level: advanced
1372 desc: use gid_idx to select GID for choosing RoCEv1 or RoCEv2
1373 default: 0
1374 with_legacy: true
1375 # GID format: "fe80:0000:0000:0000:7efe:90ff:fe72:6efe", no zero folding
1376 - name: ms_async_rdma_local_gid
1377 type: str
1378 level: advanced
1379 with_legacy: true
1380 # 0=RoCEv1, 1=RoCEv2, 2=RoCEv1.5
1381 - name: ms_async_rdma_roce_ver
1382 type: int
1383 level: advanced
1384 default: 1
1385 with_legacy: true
1386 # in RoCE, this means PCP
1387 - name: ms_async_rdma_sl
1388 type: int
1389 level: advanced
1390 default: 3
1391 with_legacy: true
1392 # in RoCE, this means DSCP
1393 - name: ms_async_rdma_dscp
1394 type: int
1395 level: advanced
1396 default: 96
1397 with_legacy: true
1398 # when there are enough accept failures, indicating there are unrecoverable failures,
1399 # just do ceph_abort() . Here we make it configurable.
1400 - name: ms_max_accept_failures
1401 type: int
1402 level: advanced
1403 desc: The maximum number of consecutive failed accept() calls before considering
1404 the daemon misconfigured and aborting it.
1405 default: 4
1406 with_legacy: true
1407 # rdma connection management
1408 - name: ms_async_rdma_cm
1409 type: bool
1410 level: advanced
1411 default: false
1412 with_legacy: true
1413 - name: ms_async_rdma_type
1414 type: str
1415 level: advanced
1416 default: ib
1417 with_legacy: true
1418 - name: ms_dpdk_port_id
1419 type: int
1420 level: advanced
1421 default: 0
1422 with_legacy: true
1423 # it is modified in unittest so that use SAFE_OPTION to declare
1424 - name: ms_dpdk_coremask
1425 type: str
1426 level: advanced
1427 default: '0xF'
1428 see_also:
1429 - ms_async_op_threads
1430 with_legacy: true
1431 - name: ms_dpdk_memory_channel
1432 type: str
1433 level: advanced
1434 default: '4'
1435 with_legacy: true
1436 - name: ms_dpdk_hugepages
1437 type: str
1438 level: advanced
1439 with_legacy: true
1440 - name: ms_dpdk_pmd
1441 type: str
1442 level: advanced
1443 with_legacy: true
1444 - name: ms_dpdk_devs_allowlist
1445 type: str
1446 level: advanced
1447 desc: PCIe addresses of the NICs that are allowed to be used
1448 long_desc: for a single NIC use ms_dpdk_devs_allowlist=-a 0000:7d:01.0 or --allow=0000:7d:01.0;
1449 for bonded NICs use ms_dpdk_devs_allowlist=--allow=0000:7d:01.0 --allow=0000:7d:02.6
1450 --vdev=net_bonding0,mode=2,slave=0000:7d:01.0,slave=0000:7d:02.6.
1451 - name: ms_dpdk_host_ipv4_addr
1452 type: str
1453 level: advanced
1454 with_legacy: true
1455 - name: ms_dpdk_gateway_ipv4_addr
1456 type: str
1457 level: advanced
1458 with_legacy: true
1459 - name: ms_dpdk_netmask_ipv4_addr
1460 type: str
1461 level: advanced
1462 with_legacy: true
1463 - name: ms_dpdk_lro
1464 type: bool
1465 level: advanced
1466 default: true
1467 with_legacy: true
1468 - name: ms_dpdk_enable_tso
1469 type: bool
1470 level: advanced
1471 default: true
1472 - name: ms_dpdk_hw_flow_control
1473 type: bool
1474 level: advanced
1475 default: true
1476 with_legacy: true
1477 # Weighting of a hardware network queue relative to a software queue (0=no work, 1=equal share)
1478 - name: ms_dpdk_hw_queue_weight
1479 type: float
1480 level: advanced
1481 default: 1
1482 with_legacy: true
1483 - name: ms_dpdk_debug_allow_loopback
1484 type: bool
1485 level: dev
1486 default: false
1487 with_legacy: true
1488 - name: ms_dpdk_rx_buffer_count_per_core
1489 type: int
1490 level: advanced
1491 default: 8192
1492 with_legacy: true
1493 - name: inject_early_sigterm
1494 type: bool
1495 level: dev
1496 desc: send ourselves a SIGTERM early during startup
1497 default: false
1498 with_legacy: true
1499 # list of initial cluster mon ids; if specified, need majority to form initial quorum and create new cluster
1500 - name: mon_initial_members
1501 type: str
1502 level: advanced
1503 fmt_desc: The IDs of initial monitors in a cluster during startup. If
1504 specified, Ceph requires an odd number of monitors to form an
1505 initial quorum (e.g., 3).
1506 note: A *majority* of monitors in your cluster must be able to reach
1507 each other in order to establish a quorum. You can decrease the initial
1508 number of monitors to establish a quorum with this setting.
1509 services:
1510 - mon
1511 flags:
1512 - no_mon_update
1513 - cluster_create
1514 with_legacy: true
1515 - name: mon_max_pg_per_osd
1516 type: uint
1517 level: advanced
1518 desc: Max number of PGs per OSD the cluster will allow
1519 long_desc: If the number of PGs per OSD exceeds this, a health warning will be visible
1520 in `ceph status`. This is also used in automated PG management, as the threshold
1521 at which some pools' pg_num may be shrunk in order to enable increasing the pg_num
1522 of others.
1523 default: 250
1524 flags:
1525 - runtime
1526 services:
1527 - mgr
1528 - mon
1529 min: 1
1530 - name: mon_osd_full_ratio
1531 type: float
1532 level: advanced
1533 desc: full ratio of OSDs to be set during initial creation of the cluster
1534 default: 0.95
1535 flags:
1536 - no_mon_update
1537 - cluster_create
1538 with_legacy: true
1539 - name: mon_osd_backfillfull_ratio
1540 type: float
1541 level: advanced
1542 default: 0.9
1543 flags:
1544 - no_mon_update
1545 - cluster_create
1546 with_legacy: true
1547 - name: mon_osd_nearfull_ratio
1548 type: float
1549 level: advanced
1550 desc: nearfull ratio for OSDs to be set during initial creation of cluster
1551 default: 0.85
1552 flags:
1553 - no_mon_update
1554 - cluster_create
1555 with_legacy: true
1556 - name: mon_osd_initial_require_min_compat_client
1557 type: str
1558 level: advanced
1559 default: luminous
1560 flags:
1561 - no_mon_update
1562 - cluster_create
1563 with_legacy: true
1564 - name: mon_allow_pool_delete
1565 type: bool
1566 level: advanced
1567 desc: allow pool deletions
1568 fmt_desc: Should monitors allow pools to be removed, regardless of what the pool flags say?
1569 default: false
1570 services:
1571 - mon
1572 with_legacy: true
1573 - name: mon_fake_pool_delete
1574 type: bool
1575 level: advanced
1576 desc: fake pool deletions by renaming the rados pool
1577 default: false
1578 services:
1579 - mon
1580 with_legacy: true
1581 - name: mon_globalid_prealloc
1582 type: uint
1583 level: advanced
1584 desc: number of globalid values to preallocate
1585 long_desc: This setting caps how many new clients can authenticate with the cluster
1586 before the monitors have to perform a write to preallocate more. Large values
1587 burn through the 64-bit ID space more quickly.
1588 fmt_desc: The number of global IDs to pre-allocate for clients and daemons in the cluster.
1589 default: 10000
1590 services:
1591 - mon
1592 with_legacy: true
1593 - name: mon_osd_report_timeout
1594 type: int
1595 level: advanced
1596 desc: time before OSDs who do not report to the mons are marked down (seconds)
1597 fmt_desc: The grace period in seconds before declaring
1598 unresponsive Ceph OSD Daemons ``down``.
1599 default: 15_min
1600 services:
1601 - mon
1602 with_legacy: true
1603 - name: mon_warn_on_insecure_global_id_reclaim
1604 type: bool
1605 level: advanced
1606 desc: issue AUTH_INSECURE_GLOBAL_ID_RECLAIM health warning if any connected
1607 clients are insecurely reclaiming global_id
1608 default: true
1609 services:
1610 - mon
1611 see_also:
1612 - mon_warn_on_insecure_global_id_reclaim_allowed
1613 - auth_allow_insecure_global_id_reclaim
1614 - auth_expose_insecure_global_id_reclaim
1615 - name: mon_warn_on_insecure_global_id_reclaim_allowed
1616 type: bool
1617 level: advanced
1618 desc: issue AUTH_INSECURE_GLOBAL_ID_RECLAIM_ALLOWED health warning if insecure
1619 global_id reclaim is allowed
1620 default: true
1621 services:
1622 - mon
1623 see_also:
1624 - mon_warn_on_insecure_global_id_reclaim
1625 - auth_allow_insecure_global_id_reclaim
1626 - auth_expose_insecure_global_id_reclaim
1627 - name: mon_warn_on_msgr2_not_enabled
1628 type: bool
1629 level: advanced
1630 desc: issue MON_MSGR2_NOT_ENABLED health warning if monitors are all running Nautilus
1631 but not all binding to a msgr2 port
1632 default: true
1633 services:
1634 - mon
1635 see_also:
1636 - ms_bind_msgr2
1637 - name: mon_warn_on_slow_ping_time
1638 type: float
1639 level: advanced
1640 desc: Override mon_warn_on_slow_ping_ratio with specified threshold in milliseconds
1641 fmt_desc: Override ``mon_warn_on_slow_ping_ratio`` with a specific value.
1642 Raise ``HEALTH_WARN`` if any heartbeat between OSDs exceeds
1643 ``mon_warn_on_slow_ping_time`` milliseconds. The default is 0 (disabled).
1644 default: 0
1645 services:
1646 - mgr
1647 - osd
1648 see_also:
1649 - mon_warn_on_slow_ping_ratio
1650 - name: mon_warn_on_slow_ping_ratio
1651 type: float
1652 level: advanced
1653 desc: Issue a health warning if heartbeat ping longer than percentage of osd_heartbeat_grace
1654 fmt_desc: Raise ``HEALTH_WARN`` when any heartbeat between OSDs exceeds
1655 ``mon_warn_on_slow_ping_ratio`` of ``osd_heartbeat_grace``.
1656 default: 0.05
1657 services:
1658 - mgr
1659 - osd
1660 see_also:
1661 - osd_heartbeat_grace
1662 - mon_warn_on_slow_ping_time
1663 - name: mon_max_snap_prune_per_epoch
1664 type: uint
1665 level: advanced
1666 desc: max number of pruned snaps we will process in a single OSDMap epoch
1667 default: 100
1668 services:
1669 - mon
1670 - name: mon_min_osdmap_epochs
1671 type: int
1672 level: advanced
1673 desc: min number of OSDMaps to store
1674 fmt_desc: Minimum number of OSD map epochs to keep at all times.
1675 default: 500
1676 services:
1677 - mon
1678 with_legacy: true
1679 - name: mon_max_log_epochs
1680 type: int
1681 level: advanced
1682 desc: max number of past cluster log epochs to store
1683 fmt_desc: Maximum number of Log epochs the monitor should keep.
1684 default: 500
1685 services:
1686 - mon
1687 with_legacy: true
1688 - name: mon_max_mdsmap_epochs
1689 type: int
1690 level: advanced
1691 desc: max number of FSMaps/MDSMaps to store
1692 fmt_desc: The maximum number of mdsmap epochs to trim during a single proposal.
1693 default: 500
1694 services:
1695 - mon
1696 with_legacy: true
1697 - name: mon_max_mgrmap_epochs
1698 type: int
1699 level: advanced
1700 desc: max number of MgrMaps to store
1701 default: 500
1702 services:
1703 - mon
1704 - name: mon_max_osd
1705 type: int
1706 level: advanced
1707 desc: max number of OSDs in a cluster
1708 fmt_desc: The maximum number of OSDs allowed in the cluster.
1709 default: 10000
1710 services:
1711 - mon
1712 with_legacy: true
1713 - name: mon_probe_timeout
1714 type: float
1715 level: advanced
1716 desc: timeout for querying other mons during bootstrap pre-election phase (seconds)
1717 fmt_desc: Number of seconds the monitor will wait to find peers before bootstrapping.
1718 default: 2
1719 services:
1720 - mon
1721 with_legacy: true
1722 - name: mon_client_bytes
1723 type: size
1724 level: advanced
1725 desc: max bytes of outstanding client messages mon will read off the network
1726 fmt_desc: The amount of client message data allowed in memory (in bytes).
1727 default: 100_M
1728 services:
1729 - mon
1730 with_legacy: true
1731 - name: mon_warn_pg_not_scrubbed_ratio
1732 type: float
1733 level: advanced
1734 desc: Percentage of the scrub max interval past the scrub max interval to warn
1735 default: 0.5
1736 see_also:
1737 - osd_scrub_max_interval
1738 min: 0
1739 with_legacy: true
1740 - name: mon_warn_pg_not_deep_scrubbed_ratio
1741 type: float
1742 level: advanced
1743 desc: Percentage of the deep scrub interval past the deep scrub interval to warn
1744 default: 0.75
1745 see_also:
1746 - osd_deep_scrub_interval
1747 min: 0
1748 with_legacy: true
1749 - name: mon_scrub_interval
1750 type: secs
1751 level: advanced
1752 desc: frequency for scrubbing mon database
1753 fmt_desc: How often the monitor scrubs its store by comparing
1754 the stored checksums with the computed ones for all stored
1755 keys. (0 disables it. dangerous, use with care)
1756 default: 1_day
1757 services:
1758 - mon
1759 - name: mon_scrub_timeout
1760 type: int
1761 level: advanced
1762 desc: timeout to restart scrub if mon quorum participant does not respond for the
1763 latest chunk
1764 default: 5_min
1765 services:
1766 - mon
1767 with_legacy: true
1768 - name: mon_scrub_max_keys
1769 type: int
1770 level: advanced
1771 desc: max keys per scrub chunk/step
1772 fmt_desc: The maximum number of keys to scrub each time.
1773 default: 100
1774 services:
1775 - mon
1776 with_legacy: true
1777 # probability of injected crc mismatch [0.0, 1.0]
1778 - name: mon_scrub_inject_crc_mismatch
1779 type: float
1780 level: dev
1781 desc: probability for injecting crc mismatches into mon scrub
1782 default: 0
1783 services:
1784 - mon
1785 with_legacy: true
1786 # probability of injected missing keys [0.0, 1.0]
1787 - name: mon_scrub_inject_missing_keys
1788 type: float
1789 level: dev
1790 desc: probability for injecting missing keys into mon scrub
1791 default: 0
1792 services:
1793 - mon
1794 with_legacy: true
1795 - name: mon_config_key_max_entry_size
1796 type: size
1797 level: advanced
1798 desc: Defines the number of bytes allowed to be held in a single config-key entry
1799 fmt_desc: The maximum size of config-key entry (in bytes)
1800 default: 64_K
1801 services:
1802 - mon
1803 with_legacy: true
1804 - name: mon_sync_timeout
1805 type: float
1806 level: advanced
1807 desc: timeout before canceling sync if syncing mon does not respond
1808 fmt_desc: Number of seconds the monitor will wait for the next update
1809 message from its sync provider before it gives up and bootstraps
1810 again.
1811 default: 1_min
1812 services:
1813 - mon
1814 with_legacy: true
1815 - name: mon_sync_max_payload_size
1816 type: size
1817 level: advanced
1818 desc: target max message payload for mon sync
1819 fmt_desc: The maximum size for a sync payload (in bytes).
1820 default: 1_M
1821 services:
1822 - mon
1823 with_legacy: true
1824 - name: mon_sync_max_payload_keys
1825 type: int
1826 level: advanced
1827 desc: target max keys in message payload for mon sync
1828 default: 2000
1829 services:
1830 - mon
1831 with_legacy: true
1832 - name: mon_sync_debug
1833 type: bool
1834 level: dev
1835 desc: enable extra debugging during mon sync
1836 default: false
1837 services:
1838 - mon
1839 with_legacy: true
1840 - name: mon_inject_sync_get_chunk_delay
1841 type: float
1842 level: dev
1843 desc: inject delay during sync (seconds)
1844 default: 0
1845 services:
1846 - mon
1847 with_legacy: true
1848 - name: mon_osd_min_down_reporters
1849 type: uint
1850 level: advanced
1851 desc: number of OSDs from different subtrees who need to report a down OSD for it
1852 to count
1853 fmt_desc: The minimum number of Ceph OSD Daemons required to report a
1854 ``down`` Ceph OSD Daemon.
1855 default: 2
1856 services:
1857 - mon
1858 see_also:
1859 - mon_osd_reporter_subtree_level
1860 - name: mon_osd_reporter_subtree_level
1861 type: str
1862 level: advanced
1863 desc: in which level of parent bucket the reporters are counted
1864 fmt_desc: In which level of parent bucket the reporters are counted. The OSDs
1865 send failure reports to monitors if they find a peer that is not responsive.
1866 Monitors mark the reported ``OSD`` ``down`` and then ``out`` after a grace period.
1867 default: host
1868 services:
1869 - mon
1870 flags:
1871 - runtime
1872 - name: mon_osd_snap_trim_queue_warn_on
1873 type: int
1874 level: advanced
1875 desc: Warn when snap trim queue is that large (or larger).
1876 long_desc: Warn when snap trim queue length for at least one PG crosses this value,
1877 as this is indicator of snap trimmer not keeping up, wasting disk space
1878 default: 32768
1879 services:
1880 - mon
1881 with_legacy: true
1882 # force mon to trim maps to this point, regardless of min_last_epoch_clean (dangerous)
1883 - name: mon_osd_force_trim_to
1884 type: int
1885 level: dev
1886 desc: force mons to trim osdmaps through this epoch
1887 fmt_desc: Force monitor to trim osdmaps to this point, even if there are
1888 PGs not clean at the specified epoch (0 disables it. dangerous,
1889 use with care)
1890 default: 0
1891 services:
1892 - mon
1893 with_legacy: true
1894 - name: mon_debug_extra_checks
1895 type: bool
1896 level: dev
1897 desc: Enable some additional monitor checks
1898 long_desc: Enable some additional monitor checks that would be too expensive to
1899 run on production systems, or would only be relevant while testing or debugging.
1900 default: false
1901 services:
1902 - mon
1903 - name: mon_debug_block_osdmap_trim
1904 type: bool
1905 level: dev
1906 desc: Block OSDMap trimming while the option is enabled.
1907 long_desc: Blocking OSDMap trimming may be quite helpful to easily reproduce states
1908 in which the monitor keeps (hundreds of) thousands of osdmaps.
1909 default: false
1910 services:
1911 - mon
1912 - name: mon_debug_deprecated_as_obsolete
1913 type: bool
1914 level: dev
1915 desc: treat deprecated mon commands as obsolete
1916 default: false
1917 services:
1918 - mon
1919 with_legacy: true
1920 - name: mon_debug_dump_transactions
1921 type: bool
1922 level: dev
1923 desc: dump paxos transactions to log
1924 default: false
1925 services:
1926 - mon
1927 see_also:
1928 - mon_debug_dump_location
1929 with_legacy: true
1930 - name: mon_debug_dump_json
1931 type: bool
1932 level: dev
1933 desc: dump paxos transactions to log as json
1934 default: false
1935 services:
1936 - mon
1937 see_also:
1938 - mon_debug_dump_transactions
1939 with_legacy: true
1940 - name: mon_debug_dump_location
1941 type: str
1942 level: dev
1943 desc: file to dump paxos transactions to
1944 default: /var/log/ceph/$cluster-$name.tdump
1945 services:
1946 - mon
1947 see_also:
1948 - mon_debug_dump_transactions
1949 with_legacy: true
1950 - name: mon_debug_no_require_pacific
1951 type: bool
1952 level: dev
1953 desc: do not set pacific feature for new mon clusters
1954 default: false
1955 services:
1956 - mon
1957 flags:
1958 - cluster_create
1959 - name: mon_debug_no_require_quincy
1960 type: bool
1961 level: dev
1962 desc: do not set quincy feature for new mon clusters
1963 default: false
1964 services:
1965 - mon
1966 flags:
1967 - cluster_create
1968 - name: mon_debug_no_require_bluestore_for_ec_overwrites
1969 type: bool
1970 level: dev
1971 desc: do not require bluestore OSDs to enable EC overwrites on a rados pool
1972 default: false
1973 services:
1974 - mon
1975 with_legacy: true
1976 - name: mon_debug_no_initial_persistent_features
1977 type: bool
1978 level: dev
1979 desc: do not set any monmap features for new mon clusters
1980 default: false
1981 services:
1982 - mon
1983 flags:
1984 - cluster_create
1985 with_legacy: true
1986 - name: mon_inject_transaction_delay_max
1987 type: float
1988 level: dev
1989 desc: max duration of injected delay in paxos
1990 default: 10
1991 services:
1992 - mon
1993 with_legacy: true
1994 # range [0, 1]
1995 - name: mon_inject_transaction_delay_probability
1996 type: float
1997 level: dev
1998 desc: probability of injecting a delay in paxos
1999 default: 0
2000 services:
2001 - mon
2002 with_legacy: true
2003 - name: mon_inject_pg_merge_bounce_probability
2004 type: float
2005 level: dev
2006 desc: probability of failing and reverting a pg_num decrement
2007 default: 0
2008 services:
2009 - mon
2010 # kill the sync provider at a specific point in the work flow
2011 - name: mon_sync_provider_kill_at
2012 type: int
2013 level: dev
2014 desc: kill mon sync provider at specific point
2015 default: 0
2016 services:
2017 - mon
2018 with_legacy: true
2019 # kill the sync requester at a specific point in the work flow
2020 - name: mon_sync_requester_kill_at
2021 type: int
2022 level: dev
2023 desc: kill mon sync requester at specific point
2024 default: 0
2025 services:
2026 - mon
2027 with_legacy: true
2028 # force monitor to join quorum even if it has been previously removed from the map
2029 - name: mon_force_quorum_join
2030 type: bool
2031 level: advanced
2032 desc: force mon to rejoin quorum even though it was just removed
2033 fmt_desc: Force monitor to join quorum even if it has been previously removed from the map
2034 default: false
2035 services:
2036 - mon
2037 with_legacy: true
2038 # type of keyvaluedb backend
2039 - name: mon_keyvaluedb
2040 type: str
2041 level: advanced
2042 desc: database backend to use for the mon database
2043 default: rocksdb
2044 services:
2045 - mon
2046 enum_values:
2047 - leveldb
2048 - rocksdb
2049 flags:
2050 - create
2051 with_legacy: true
2052 # UNSAFE -- TESTING ONLY! Allows addition of a cache tier with preexisting snaps
2053 - name: mon_debug_unsafe_allow_tier_with_nonempty_snaps
2054 type: bool
2055 level: dev
2056 default: false
2057 services:
2058 - mon
2059 with_legacy: true
2060 # required of mon, mds, osd daemons
2061 - name: auth_cluster_required
2062 type: str
2063 level: advanced
2064 desc: authentication methods required by the cluster
2065 fmt_desc: If enabled, the Ceph Storage Cluster daemons (i.e., ``ceph-mon``,
2066 ``ceph-osd``, ``ceph-mds`` and ``ceph-mgr``) must authenticate with
2067 each other. Valid settings are ``cephx`` or ``none``.
2068 default: cephx
2069 with_legacy: true
2070 # required by daemons of clients
2071 - name: auth_service_required
2072 type: str
2073 level: advanced
2074 desc: authentication methods required by service daemons
2075 fmt_desc: If enabled, the Ceph Storage Cluster daemons require Ceph Clients
2076 to authenticate with the Ceph Storage Cluster in order to access
2077 Ceph services. Valid settings are ``cephx`` or ``none``.
2078 default: cephx
2079 with_legacy: true
2080 # what clients require of daemons
2081 - name: auth_client_required
2082 type: str
2083 level: advanced
2084 desc: authentication methods allowed by clients
2085 fmt_desc: If enabled, the Ceph Client requires the Ceph Storage Cluster to
2086 authenticate with the Ceph Client. Valid settings are ``cephx``
2087 or ``none``.
2088 default: cephx, none
2089 with_legacy: true
2090 # deprecated; default value for above if they are not defined.
2091 - name: auth_supported
2092 type: str
2093 level: advanced
2094 desc: authentication methods required (deprecated)
2095 with_legacy: true
2096 - name: max_rotating_auth_attempts
2097 type: int
2098 level: advanced
2099 desc: number of attempts to initialize rotating keys before giving up
2100 default: 10
2101 with_legacy: true
2102 - name: rotating_keys_bootstrap_timeout
2103 type: int
2104 level: advanced
2105 desc: timeout for obtaining rotating keys during bootstrap phase (seconds)
2106 default: 30
2107 - name: rotating_keys_renewal_timeout
2108 type: int
2109 level: advanced
2110 desc: timeout for updating rotating keys (seconds)
2111 default: 10
2112 - name: cephx_require_signatures
2113 type: bool
2114 level: advanced
2115 default: false
2116 fmt_desc: If set to ``true``, Ceph requires signatures on all message
2117 traffic between the Ceph Client and the Ceph Storage Cluster, and
2118 between daemons comprising the Ceph Storage Cluster.
2119
2120 Ceph Argonaut and Linux kernel versions prior to 3.19 do
2121 not support signatures; if such clients are in use this
2122 option can be turned off to allow them to connect.
2123 with_legacy: true
2124 - name: cephx_require_version
2125 type: int
2126 level: advanced
2127 desc: Cephx version required (1 = pre-mimic, 2 = mimic+)
2128 default: 2
2129 with_legacy: true
2130 - name: cephx_cluster_require_signatures
2131 type: bool
2132 level: advanced
2133 default: false
2134 fmt_desc: If set to ``true``, Ceph requires signatures on all message
2135 traffic between Ceph daemons comprising the Ceph Storage Cluster.
2136 with_legacy: true
2137 - name: cephx_cluster_require_version
2138 type: int
2139 level: advanced
2140 desc: Cephx version required by the cluster from clients (1 = pre-mimic, 2 = mimic+)
2141 default: 2
2142 with_legacy: true
2143 - name: cephx_service_require_signatures
2144 type: bool
2145 level: advanced
2146 default: false
2147 fmt_desc: If set to ``true``, Ceph requires signatures on all message
2148 traffic between Ceph Clients and the Ceph Storage Cluster.
2149 with_legacy: true
2150 - name: cephx_service_require_version
2151 type: int
2152 level: advanced
2153 desc: Cephx version required from ceph services (1 = pre-mimic, 2 = mimic+)
2154 default: 2
2155 with_legacy: true
2156 # Default to signing session messages if supported
2157 - name: cephx_sign_messages
2158 type: bool
2159 level: advanced
2160 default: true
2161 fmt_desc: If the Ceph version supports message signing, Ceph will sign
2162 all messages so they are more difficult to spoof.
2163 with_legacy: true
2164 - name: auth_mon_ticket_ttl
2165 type: float
2166 level: advanced
2167 default: 72_hr
2168 with_legacy: true
2169 - name: auth_service_ticket_ttl
2170 type: float
2171 level: advanced
2172 default: 1_hr
2173 fmt_desc: When the Ceph Storage Cluster sends a Ceph Client a ticket for
2174 authentication, the Ceph Storage Cluster assigns the ticket a
2175 time to live.
2176 with_legacy: true
2177 - name: auth_allow_insecure_global_id_reclaim
2178 type: bool
2179 level: advanced
2180 desc: Allow reclaiming global_id without presenting a valid ticket proving
2181 previous possession of that global_id
2182 long_desc: Allowing unauthorized global_id (re)use poses a security risk.
2183 Unfortunately, older clients may omit their ticket on reconnects and
2184 therefore rely on this being allowed for preserving their global_id for
2185 the lifetime of the client instance. Setting this value to false would
2186 immediately prevent new connections from those clients (assuming
2187 auth_expose_insecure_global_id_reclaim set to true) and eventually break
2188 existing sessions as well (regardless of auth_expose_insecure_global_id_reclaim
2189 setting).
2190 default: true
2191 see_also:
2192 - mon_warn_on_insecure_global_id_reclaim
2193 - mon_warn_on_insecure_global_id_reclaim_allowed
2194 - auth_expose_insecure_global_id_reclaim
2195 with_legacy: true
2196 - name: auth_expose_insecure_global_id_reclaim
2197 type: bool
2198 level: advanced
2199 desc: Force older clients that may omit their ticket on reconnects to
2200 reconnect as part of establishing a session
2201 long_desc: 'In permissive mode (auth_allow_insecure_global_id_reclaim set
2202 to true), this helps with identifying clients that are not patched. In
2203 enforcing mode (auth_allow_insecure_global_id_reclaim set to false), this
2204 is a fail-fast mechanism: don''t establish a session that will almost
2205 inevitably be broken later.'
2206 default: true
2207 see_also:
2208 - mon_warn_on_insecure_global_id_reclaim
2209 - mon_warn_on_insecure_global_id_reclaim_allowed
2210 - auth_allow_insecure_global_id_reclaim
2211 with_legacy: true
2212 # if true, assert when weird things happen
2213 - name: auth_debug
2214 type: bool
2215 level: dev
2216 default: false
2217 with_legacy: true
2218 # how many mons to try to connect to in parallel during hunt
2219 - name: mon_client_hunt_parallel
2220 type: uint
2221 level: advanced
2222 default: 3
2223 with_legacy: true
2224 # try new mon every N seconds until we connect
2225 - name: mon_client_hunt_interval
2226 type: float
2227 level: advanced
2228 default: 3
2229 fmt_desc: The client will try a new monitor every ``N`` seconds until it
2230 establishes a connection.
2231 with_legacy: true
2232 # send logs every N seconds
2233 - name: mon_client_log_interval
2234 type: float
2235 level: advanced
2236 desc: How frequently we send queued cluster log messages to mon
2237 default: 1
2238 with_legacy: true
2239 # ping every N seconds
2240 - name: mon_client_ping_interval
2241 type: float
2242 level: advanced
2243 default: 10
2244 fmt_desc: The client will ping the monitor every ``N`` seconds.
2245 with_legacy: true
2246 # fail if we don't hear back
2247 - name: mon_client_ping_timeout
2248 type: float
2249 level: advanced
2250 default: 30
2251 with_legacy: true
2252 - name: mon_client_hunt_interval_backoff
2253 type: float
2254 level: advanced
2255 default: 1.5
2256 with_legacy: true
2257 - name: mon_client_hunt_interval_min_multiple
2258 type: float
2259 level: advanced
2260 default: 1
2261 with_legacy: true
2262 - name: mon_client_hunt_interval_max_multiple
2263 type: float
2264 level: advanced
2265 default: 10
2266 with_legacy: true
2267 - name: mon_client_max_log_entries_per_message
2268 type: int
2269 level: advanced
2270 default: 1000
2271 fmt_desc: The maximum number of log entries a monitor will generate
2272 per client message.
2273 with_legacy: true
2274 - name: mon_client_directed_command_retry
2275 type: int
2276 level: dev
2277 desc: Number of times to try sending a command directed at a specific monitor
2278 default: 2
2279 with_legacy: true
2280 # whitespace-separated list of key=value pairs describing crush location
2281 - name: crush_location
2282 type: str
2283 level: advanced
2284 with_legacy: true
2285 - name: crush_location_hook
2286 type: str
2287 level: advanced
2288 with_legacy: true
2289 - name: crush_location_hook_timeout
2290 type: int
2291 level: advanced
2292 default: 10
2293 with_legacy: true
2294 - name: objecter_tick_interval
2295 type: float
2296 level: dev
2297 default: 5
2298 with_legacy: true
2299 # before we ask for a map
2300 - name: objecter_timeout
2301 type: float
2302 level: advanced
2303 desc: Seconds before in-flight op is considered 'laggy' and we query mon for the
2304 latest OSDMap
2305 default: 10
2306 with_legacy: true
2307 - name: objecter_inflight_op_bytes
2308 type: size
2309 level: advanced
2310 desc: Max in-flight data in bytes (both directions)
2311 default: 100_M
2312 with_legacy: true
2313 - name: objecter_inflight_ops
2314 type: uint
2315 level: advanced
2316 desc: Max in-flight operations
2317 default: 1_K
2318 with_legacy: true
2319 # num of completion locks per each session, for serializing same object responses
2320 - name: objecter_completion_locks_per_session
2321 type: uint
2322 level: dev
2323 default: 32
2324 with_legacy: true
2325 # suppress watch pings
2326 - name: objecter_inject_no_watch_ping
2327 type: bool
2328 level: dev
2329 default: false
2330 with_legacy: true
2331 # ignore the first reply for each write, and resend the osd op instead
2332 - name: objecter_retry_writes_after_first_reply
2333 type: bool
2334 level: dev
2335 default: false
2336 with_legacy: true
2337 - name: objecter_debug_inject_relock_delay
2338 type: bool
2339 level: dev
2340 default: false
2341 with_legacy: true
2342 - name: filer_max_purge_ops
2343 type: uint
2344 level: advanced
2345 desc: Max in-flight operations for purging a striped range (e.g., MDS journal)
2346 default: 10
2347 with_legacy: true
2348 - name: filer_max_truncate_ops
2349 type: uint
2350 level: advanced
2351 desc: Max in-flight operations for truncating/deleting a striped sequence (e.g.,
2352 MDS journal)
2353 default: 128
2354 with_legacy: true
2355 - name: journaler_write_head_interval
2356 type: int
2357 level: advanced
2358 desc: Interval in seconds between journal header updates (to help bound replay time)
2359 default: 15
2360 # * journal object size
2361 - name: journaler_prefetch_periods
2362 type: uint
2363 level: advanced
2364 desc: Number of striping periods to prefetch while reading MDS journal
2365 default: 10
2366 # we need at least 2 periods to make progress.
2367 min: 2
2368 # * journal object size
2369 - name: journaler_prezero_periods
2370 type: uint
2371 level: advanced
2372 desc: Number of striping periods to zero head of MDS journal write position
2373 default: 5
2374 # we need to zero at least two periods, minimum, to ensure that we
2375 # have a full empty object/period in front of us.
2376 min: 2
2377 - name: osd_calc_pg_upmaps_aggressively
2378 type: bool
2379 level: advanced
2380 desc: try to calculate PG upmaps more aggressively, e.g., by doing a fairly exhaustive
2381 search of existing PGs that can be unmapped or upmapped
2382 default: true
2383 flags:
2384 - runtime
2385 - name: osd_calc_pg_upmaps_local_fallback_retries
2386 type: uint
2387 level: advanced
2388 desc: 'Maximum number of PGs we can attempt to unmap or upmap for a specific overfull
2389 or underfull osd per iteration '
2390 default: 100
2391 flags:
2392 - runtime
2393 # 1 = host
2394 - name: osd_crush_chooseleaf_type
2395 type: int
2396 level: dev
2397 desc: default chooseleaf type for osdmaptool --create
2398 fmt_desc: The bucket type to use for ``chooseleaf`` in a CRUSH rule. Uses
2399 ordinal rank rather than name.
2400 default: 1
2401 flags:
2402 - cluster_create
2403 with_legacy: true
2404 # try to use gmt for hitset archive names if all osds in cluster support it
2405 - name: osd_pool_use_gmt_hitset
2406 type: bool
2407 level: dev
2408 desc: use UTC for hitset timestamps
2409 long_desc: This setting only exists for compatibility with hammer (and older) clusters.
2410 default: true
2411 with_legacy: true
2412 # whether to turn on fast read on the pool or not
2413 - name: osd_pool_default_ec_fast_read
2414 type: bool
2415 level: advanced
2416 desc: set ec_fast_read for new erasure-coded pools
2417 fmt_desc: Whether to turn on fast read on the pool or not. It will be used as
2418 the default setting of newly created erasure coded pools if ``fast_read``
2419 is not specified at create time.
2420 default: false
2421 services:
2422 - mon
2423 with_legacy: true
2424 - name: osd_pool_default_crush_rule
2425 type: int
2426 level: advanced
2427 desc: CRUSH rule for newly created pools
2428 fmt_desc: The default CRUSH rule to use when creating a replicated pool. The
2429 default value of ``-1`` means "pick the rule with the lowest numerical ID and
2430 use that". This is to make pool creation work in the absence of rule 0.
2431 default: -1
2432 services:
2433 - mon
2434 - name: osd_pool_default_size
2435 type: uint
2436 level: advanced
2437 desc: the number of copies of an object for new replicated pools
2438 fmt_desc: Sets the number of replicas for objects in the pool. The default
2439 value is the same as
2440 ``ceph osd pool set {pool-name} size {size}``.
2441 default: 3
2442 services:
2443 - mon
2444 min: 0
2445 max: 10
2446 flags:
2447 - runtime
2448 - name: osd_pool_default_min_size
2449 type: uint
2450 level: advanced
2451 desc: the minimal number of copies allowed to write to a degraded pool for new replicated
2452 pools
2453 long_desc: 0 means no specific default; ceph will use size-size/2
2454 fmt_desc: Sets the minimum number of written replicas for objects in the
2455 pool in order to acknowledge an I/O operation to the client. If
2456 minimum is not met, Ceph will not acknowledge the I/O to the
2457 client, **which may result in data loss**. This setting ensures
2458 a minimum number of replicas when operating in ``degraded`` mode.
2459 The default value is ``0`` which means no particular minimum. If ``0``,
2460 minimum is ``size - (size / 2)``.
2461 default: 0
2462 services:
2463 - mon
2464 see_also:
2465 - osd_pool_default_size
2466 min: 0
2467 max: 255
2468 flags:
2469 - runtime
2470 - name: osd_pool_default_pg_num
2471 type: uint
2472 level: advanced
2473 desc: number of PGs for new pools
2474 fmt_desc: The default number of placement groups for a pool. The default
2475 value is the same as ``pg_num`` with ``mkpool``.
2476 long_desc: With default value of `osd_pool_default_pg_autoscale_mode` being
2477 `on` the number of PGs for new pools will start out with 1 pg, unless the
2478 user specifies the pg_num.
2479 default: 32
2480 services:
2481 - mon
2482 see_also:
2483 - osd_pool_default_pg_autoscale_mode
2484 flags:
2485 - runtime
2486 - name: osd_pool_default_pgp_num
2487 type: uint
2488 level: advanced
2489 desc: number of PGs for placement purposes (0 to match pg_num)
2490 fmt_desc: The default number of placement groups for placement for a pool.
2491 The default value is the same as ``pgp_num`` with ``mkpool``.
2492 PG and PGP should be equal (for now).
2493 default: 0
2494 services:
2495 - mon
2496 see_also:
2497 - osd_pool_default_pg_num
2498 flags:
2499 - runtime
2500 - name: osd_pool_default_type
2501 type: str
2502 level: advanced
2503 desc: default type of pool to create
2504 default: replicated
2505 services:
2506 - mon
2507 enum_values:
2508 - replicated
2509 - erasure
2510 flags:
2511 - runtime
2512 - name: osd_pool_default_erasure_code_profile
2513 type: str
2514 level: advanced
2515 desc: default erasure code profile for new erasure-coded pools
2516 default: plugin=jerasure technique=reed_sol_van k=2 m=2
2517 services:
2518 - mon
2519 flags:
2520 - runtime
2521 - name: osd_erasure_code_plugins
2522 type: str
2523 level: advanced
2524 desc: erasure code plugins to load
2525 default: @osd_erasure_code_plugins@
2526 services:
2527 - mon
2528 - osd
2529 flags:
2530 - startup
2531 with_legacy: true
2532 - name: osd_pool_default_flags
2533 type: int
2534 level: dev
2535 desc: (integer) flags to set on new pools
2536 fmt_desc: The default flags for new pools.
2537 default: 0
2538 services:
2539 - mon
2540 with_legacy: true
2541 # use new pg hashing to prevent pool/pg overlap
2542 - name: osd_pool_default_flag_hashpspool
2543 type: bool
2544 level: advanced
2545 desc: set hashpspool (better hashing scheme) flag on new pools
2546 default: true
2547 services:
2548 - mon
2549 with_legacy: true
2550 # pool can't be deleted
2551 - name: osd_pool_default_flag_nodelete
2552 type: bool
2553 level: advanced
2554 desc: set nodelete flag on new pools
2555 fmt_desc: Set the ``nodelete`` flag on new pools, which prevents pool removal.
2556 default: false
2557 services:
2558 - mon
2559 with_legacy: true
2560 # pool's pg and pgp num can't be changed
2561 - name: osd_pool_default_flag_nopgchange
2562 type: bool
2563 level: advanced
2564 desc: set nopgchange flag on new pools
2565 fmt_desc: Set the ``nopgchange`` flag on new pools. Does not allow the number of PGs to be changed.
2566 default: false
2567 services:
2568 - mon
2569 with_legacy: true
2570 # pool's size and min size can't be changed
2571 - name: osd_pool_default_flag_nosizechange
2572 type: bool
2573 level: advanced
2574 desc: set nosizechange flag on new pools
2575 fmt_desc: Set the ``nosizechange`` flag on new pools. Does not allow the ``size`` to be changed.
2576 default: false
2577 services:
2578 - mon
2579 with_legacy: true
2580 - name: osd_pool_default_flag_bulk
2581 type: bool
2582 level: advanced
2583 desc: set bulk flag on new pools
2584 fmt_desc: Set the ``bulk`` flag on new pools, allowing the autoscaler to use scale-down mode.
2585 default: false
2586 services:
2587 - mon
2588 with_legacy: true
2589 - name: osd_pool_default_hit_set_bloom_fpp
2590 type: float
2591 level: advanced
2592 default: 0.05
2593 services:
2594 - mon
2595 see_also:
2596 - osd_tier_default_cache_hit_set_type
2597 with_legacy: true
2598 - name: osd_pool_default_cache_target_dirty_ratio
2599 type: float
2600 level: advanced
2601 default: 0.4
2602 with_legacy: true
2603 - name: osd_pool_default_cache_target_dirty_high_ratio
2604 type: float
2605 level: advanced
2606 default: 0.6
2607 with_legacy: true
2608 - name: osd_pool_default_cache_target_full_ratio
2609 type: float
2610 level: advanced
2611 default: 0.8
2612 with_legacy: true
2613 # seconds
2614 - name: osd_pool_default_cache_min_flush_age
2615 type: int
2616 level: advanced
2617 default: 0
2618 with_legacy: true
2619 # seconds
2620 - name: osd_pool_default_cache_min_evict_age
2621 type: int
2622 level: advanced
2623 default: 0
2624 with_legacy: true
2625 # max size to check for eviction
2626 - name: osd_pool_default_cache_max_evict_check_size
2627 type: int
2628 level: advanced
2629 default: 10
2630 with_legacy: true
2631 - name: osd_pool_default_pg_autoscale_mode
2632 type: str
2633 level: advanced
2634 desc: Default PG autoscaling behavior for new pools
2635 long_desc: With default value `on`, the autoscaler starts a new pool with 1
2636 pg, unless the user specifies the pg_num.
2637 default: 'on'
2638 enum_values:
2639 - 'off'
2640 - 'warn'
2641 - 'on'
2642 flags:
2643 - runtime
2644 - name: osd_pool_default_read_lease_ratio
2645 type: float
2646 level: dev
2647 desc: Default read_lease_ratio for a pool, as a multiple of osd_heartbeat_grace
2648 long_desc: This should be <= 1.0 so that the read lease will have expired by the
2649 time we decide to mark a peer OSD down.
2650 default: 0.8
2651 see_also:
2652 - osd_heartbeat_grace
2653 flags:
2654 - runtime
2655 with_legacy: true
2656 # min target size for a HitSet
2657 - name: osd_hit_set_min_size
2658 type: int
2659 level: advanced
2660 default: 1000
2661 with_legacy: true
2662 # max target size for a HitSet
2663 - name: osd_hit_set_max_size
2664 type: int
2665 level: advanced
2666 default: 100000
2667 with_legacy: true
2668 # rados namespace for hit_set tracking
2669 - name: osd_hit_set_namespace
2670 type: str
2671 level: advanced
2672 default: .ceph-internal
2673 with_legacy: true
2674 # conservative default throttling values
2675 - name: osd_tier_promote_max_objects_sec
2676 type: uint
2677 level: advanced
2678 default: 25
2679 with_legacy: true
2680 - name: osd_tier_promote_max_bytes_sec
2681 type: size
2682 level: advanced
2683 default: 5_M
2684 with_legacy: true
2685 - name: osd_tier_default_cache_mode
2686 type: str
2687 level: advanced
2688 default: writeback
2689 enum_values:
2690 - none
2691 - writeback
2692 - forward
2693 - readonly
2694 - readforward
2695 - readproxy
2696 - proxy
2697 flags:
2698 - runtime
2699 - name: osd_tier_default_cache_hit_set_count
2700 type: uint
2701 level: advanced
2702 default: 4
2703 - name: osd_tier_default_cache_hit_set_period
2704 type: uint
2705 level: advanced
2706 default: 1200
2707 - name: osd_tier_default_cache_hit_set_type
2708 type: str
2709 level: advanced
2710 default: bloom
2711 enum_values:
2712 - bloom
2713 - explicit_hash
2714 - explicit_object
2715 flags:
2716 - runtime
2717 - name: osd_tier_default_cache_min_read_recency_for_promote
2718 type: uint
2719 level: advanced
2720 desc: number of recent HitSets the object must appear in to be promoted (on read)
2721 default: 1
2722 - name: osd_tier_default_cache_min_write_recency_for_promote
2723 type: uint
2724 level: advanced
2725 desc: number of recent HitSets the object must appear in to be promoted (on write)
2726 default: 1
2727 - name: osd_tier_default_cache_hit_set_grade_decay_rate
2728 type: uint
2729 level: advanced
2730 default: 20
2731 - name: osd_tier_default_cache_hit_set_search_last_n
2732 type: uint
2733 level: advanced
2734 default: 1
2735 - name: osd_objecter_finishers
2736 type: int
2737 level: advanced
2738 default: 1
2739 flags:
2740 - startup
2741 with_legacy: true
2742 - name: osd_map_dedup
2743 type: bool
2744 level: advanced
2745 default: true
2746 fmt_desc: Enable removing duplicates in the OSD map.
2747 with_legacy: true
2748 - name: osd_map_message_max
2749 type: int
2750 level: advanced
2751 desc: maximum number of OSDMaps to include in a single message
2752 fmt_desc: The maximum map entries allowed per MOSDMap message.
2753 default: 40
2754 services:
2755 - osd
2756 - mon
2757 with_legacy: true
2758 - name: osd_map_message_max_bytes
2759 type: size
2760 level: advanced
2761 desc: maximum number of bytes worth of OSDMaps to include in a single message
2762 default: 10_M
2763 services:
2764 - osd
2765 - mon
2766 with_legacy: true
2767 # do not assert on divergent_prior entries which aren't in the log and whose on-disk objects are newer
2768 - name: osd_ignore_stale_divergent_priors
2769 type: bool
2770 level: advanced
2771 default: false
2772 with_legacy: true
2773 - name: osd_heartbeat_interval
2774 type: int
2775 level: dev
2776 desc: Interval (in seconds) between peer pings
2777 fmt_desc: How often a Ceph OSD Daemon pings its peers (in seconds).
2778 default: 6
2779 min: 1
2780 max: 1_min
2781 with_legacy: true
2782 # (seconds) how long before we decide a peer has failed
2783 # This setting is read by both the MONs and the OSDs, so it must be set to the same value for both in the configuration
2784 - name: osd_heartbeat_grace
2785 type: int
2786 level: advanced
2787 default: 20
2788 fmt_desc: The elapsed time without a heartbeat from a Ceph OSD Daemon
2789 after which the Ceph Storage Cluster considers it ``down``.
2790 This setting must be set in both the [mon] and [osd] or [global]
2791 sections so that it is read by both monitor and OSD daemons.
2792 with_legacy: true
2793 - name: osd_heartbeat_stale
2794 type: int
2795 level: advanced
2796 desc: Interval (in seconds) we mark an unresponsive heartbeat peer as stale.
2797 long_desc: Automatically mark unresponsive heartbeat sessions as stale and tear
2798 them down. The primary benefit is that OSD doesn't need to keep a flood of blocked
2799 heartbeat messages around in memory.
2800 default: 10_min
2801 # prio the heartbeat tcp socket and set dscp as CS6 on it if true
2802 - name: osd_heartbeat_use_min_delay_socket
2803 type: bool
2804 level: advanced
2805 default: false
2806 with_legacy: true
2807 # the minimum size of OSD heartbeat messages to send
2808 - name: osd_heartbeat_min_size
2809 type: size
2810 level: advanced
2811 desc: Minimum heartbeat packet size in bytes. Will add dummy payload if heartbeat
2812 packet is smaller than this.
2813 default: 2000
2814 with_legacy: true
2815 # max number of parallel snap trims/pg
2816 - name: osd_pg_max_concurrent_snap_trims
2817 type: uint
2818 level: advanced
2819 default: 2
2820 min: 1
2821 with_legacy: true
2822 # max number of trimming pgs
2823 - name: osd_max_trimming_pgs
2824 type: uint
2825 level: advanced
2826 default: 2
2827 with_legacy: true
2828 # minimum number of peers that must be reachable to mark ourselves
2829 # back up after being wrongly marked down.
2830 - name: osd_heartbeat_min_healthy_ratio
2831 type: float
2832 level: advanced
2833 default: 0.33
2834 with_legacy: true
2835 # (seconds) how often to ping monitor if no peers
2836 - name: osd_mon_heartbeat_interval
2837 type: int
2838 level: advanced
2839 default: 30
2840 fmt_desc: How often the Ceph OSD Daemon pings a Ceph Monitor if it has no
2841 Ceph OSD Daemon peers.
2842 with_legacy: true
2843 - name: osd_mon_heartbeat_stat_stale
2844 type: int
2845 level: advanced
2846 desc: Stop reporting on heartbeat ping times not updated for this many seconds.
2847 long_desc: Stop reporting on old heartbeat information unless this is set to zero
2848 fmt_desc: Stop reporting on heartbeat ping times which haven't been updated for
2849 this many seconds. Set to zero to disable this action.
2850 default: 1_hr
2851 # failures, up_thru, boot.
2852 - name: osd_mon_report_interval
2853 type: int
2854 level: advanced
2855 desc: Frequency of OSD reports to mon for peer failures, fullness status changes
2856 fmt_desc: The number of seconds a Ceph OSD Daemon may wait
2857 from startup or another reportable event before reporting
2858 to a Ceph Monitor.
2859 default: 5
2860 with_legacy: true
2861 # max updates in flight
2862 - name: osd_mon_report_max_in_flight
2863 type: int
2864 level: advanced
2865 default: 2
2866 with_legacy: true
2867 # (seconds) how often to send beacon message to monitor
2868 - name: osd_beacon_report_interval
2869 type: int
2870 level: advanced
2871 default: 5_min
2872 with_legacy: true
2873 # report pg stats for any given pg at least this often
2874 - name: osd_pg_stat_report_interval_max
2875 type: int
2876 level: advanced
2877 default: 500
2878 with_legacy: true
2879 # Max number of snap intervals to report to mgr in pg_stat_t
2880 - name: osd_max_snap_prune_intervals_per_epoch
2881 type: uint
2882 level: dev
2883 desc: Max number of snap intervals to report to mgr in pg_stat_t
2884 default: 512
2885 with_legacy: true
2886 - name: osd_default_data_pool_replay_window
2887 type: int
2888 level: advanced
2889 default: 45
2890 fmt_desc: The time (in seconds) for an OSD to wait for a client to replay
2891 a request.
2892 - name: osd_auto_mark_unfound_lost
2893 type: bool
2894 level: advanced
2895 default: false
2896 with_legacy: true
2897 - name: osd_check_for_log_corruption
2898 type: bool
2899 level: advanced
2900 default: false
2901 fmt_desc: Check log files for corruption. Can be computationally expensive.
2902 with_legacy: true
2903 - name: osd_use_stale_snap
2904 type: bool
2905 level: advanced
2906 default: false
2907 with_legacy: true
2908 - name: osd_rollback_to_cluster_snap
2909 type: str
2910 level: advanced
2911 with_legacy: true
2912 - name: osd_default_notify_timeout
2913 type: uint
2914 level: advanced
2915 desc: default number of seconds after which notify propagation times out. Used if
2916 a client has not specified another value
2917 fmt_desc: The OSD default notification timeout (in seconds).
2918 default: 30
2919 with_legacy: true
2920 - name: osd_kill_backfill_at
2921 type: int
2922 level: dev
2923 default: 0
2924 with_legacy: true
2925 # Bounds how infrequently a new map epoch will be persisted for a pg
2926 # make this < map_cache_size!
2927 - name: osd_pg_epoch_persisted_max_stale
2928 type: uint
2929 level: advanced
2930 default: 40
2931 with_legacy: true
2932 - name: osd_target_pg_log_entries_per_osd
2933 type: uint
2934 level: dev
2935 desc: target number of PG log entries total on an OSD - limited per pg by the min and
2936 max options below
2937 default: 300000
2938 see_also:
2939 - osd_max_pg_log_entries
2940 - osd_min_pg_log_entries
2941 with_legacy: true
2942 - name: osd_min_pg_log_entries
2943 type: uint
2944 level: dev
2945 desc: minimum number of entries to maintain in the PG log
2946 fmt_desc: The minimum number of placement group log entries to maintain
2947 when trimming log files.
2948 default: 250
2949 services:
2950 - osd
2951 see_also:
2952 - osd_max_pg_log_entries
2953 - osd_pg_log_dups_tracked
2954 - osd_target_pg_log_entries_per_osd
2955 with_legacy: true
2956 - name: osd_max_pg_log_entries
2957 type: uint
2958 level: dev
2959 desc: maximum number of entries to maintain in the PG log
2960 fmt_desc: The maximum number of placement group log entries to maintain
2961 when trimming log files.
2962 default: 10000
2963 services:
2964 - osd
2965 see_also:
2966 - osd_min_pg_log_entries
2967 - osd_pg_log_dups_tracked
2968 - osd_target_pg_log_entries_per_osd
2969 with_legacy: true
2970 - name: osd_pg_log_dups_tracked
2971 type: uint
2972 level: dev
2973 desc: how many versions back to track in order to detect duplicate ops; this is
2974 combined with both the regular pg log entries and additional minimal dup detection
2975 entries
2976 default: 3000
2977 services:
2978 - osd
2979 see_also:
2980 - osd_min_pg_log_entries
2981 - osd_max_pg_log_entries
2982 with_legacy: true
2983 - name: osd_object_clean_region_max_num_intervals
2984 type: int
2985 level: dev
2986 desc: number of intervals in clean_offsets
2987 long_desc: partial recovery uses multiple intervals to record the clean part of
2988 the object. When the number of intervals is greater than osd_object_clean_region_max_num_intervals,
2989 the minimum interval will be trimmed (0 will recover the entire object data interval)
2990 default: 10
2991 services:
2992 - osd
2993 with_legacy: true
2994 # max entries factor before force recovery
2995 - name: osd_force_recovery_pg_log_entries_factor
2996 type: float
2997 level: dev
2998 default: 1.3
2999 with_legacy: true
3000 - name: osd_pg_log_trim_min
3001 type: uint
3002 level: dev
3003 desc: Minimum number of log entries to trim at once. This lets us trim in larger
3004 batches rather than with each write.
3005 default: 100
3006 see_also:
3007 - osd_max_pg_log_entries
3008 - osd_min_pg_log_entries
3009 with_legacy: true
3010 - name: osd_force_auth_primary_missing_objects
3011 type: uint
3012 level: advanced
3013 desc: Approximate missing objects above which to force auth_log_shard to be primary
3014 temporarily
3015 default: 100
3016 - name: osd_async_recovery_min_cost
3017 type: uint
3018 level: advanced
3019 desc: A mixture measure of number of current log entries difference and historical
3020 missing objects, above which we switch to use asynchronous recovery when appropriate
3021 default: 100
3022 flags:
3023 - runtime
3024 - name: osd_max_pg_per_osd_hard_ratio
3025 type: float
3026 level: advanced
3027 desc: Maximum number of PG per OSD, a factor of 'mon_max_pg_per_osd'
3028 long_desc: OSD will refuse to instantiate PG if the number of PG it serves exceeds
3029 this number.
3030 fmt_desc: The ratio of number of PGs per OSD allowed by the cluster before the
3031 OSD refuses to create new PGs. An OSD stops creating new PGs if the number
3032 of PGs it serves exceeds
3033 ``osd_max_pg_per_osd_hard_ratio`` \* ``mon_max_pg_per_osd``.
3034 default: 3
3035 see_also:
3036 - mon_max_pg_per_osd
3037 min: 1
3038 - name: osd_pg_log_trim_max
3039 type: uint
3040 level: advanced
3041 desc: maximum number of entries to remove at once from the PG log
3042 default: 10000
3043 services:
3044 - osd
3045 see_also:
3046 - osd_min_pg_log_entries
3047 - osd_max_pg_log_entries
3048 with_legacy: true
3049 # how many seconds old makes an op complaint-worthy
3050 - name: osd_op_complaint_time
3051 type: float
3052 level: advanced
3053 default: 30
3054 fmt_desc: An operation becomes complaint worthy after the specified number
3055 of seconds have elapsed.
3056 with_legacy: true
3057 - name: osd_command_max_records
3058 type: int
3059 level: advanced
3060 default: 256
3061 fmt_desc: Limits the number of lost objects to return.
3062 with_legacy: true
3063 # max peer osds to report that are blocking our progress
3064 - name: osd_max_pg_blocked_by
3065 type: uint
3066 level: advanced
3067 default: 16
3068 with_legacy: true
3069 - name: osd_op_log_threshold
3070 type: int
3071 level: advanced
3072 default: 5
3073 fmt_desc: How many operation logs to display at once.
3074 with_legacy: true
3075 - name: osd_backoff_on_unfound
3076 type: bool
3077 level: advanced
3078 default: true
3079 with_legacy: true
3080 # [mainly for debug?] object unreadable/writeable
3081 - name: osd_backoff_on_degraded
3082 type: bool
3083 level: advanced
3084 default: false
3085 with_legacy: true
3086 # [debug] pg peering
3087 - name: osd_backoff_on_peering
3088 type: bool
3089 level: advanced
3090 default: false
3091 with_legacy: true
3092 - name: osd_debug_shutdown
3093 type: bool
3094 level: dev
3095 desc: Turn up debug levels during shutdown
3096 default: false
3097 with_legacy: true
3098 # crash osd if client ignores a backoff; useful for debugging
3099 - name: osd_debug_crash_on_ignored_backoff
3100 type: bool
3101 level: dev
3102 default: false
3103 with_legacy: true
3104 - name: osd_debug_inject_dispatch_delay_probability
3105 type: float
3106 level: dev
3107 default: 0
3108 with_legacy: true
3109 - name: osd_debug_inject_dispatch_delay_duration
3110 type: float
3111 level: dev
3112 default: 0.1
3113 with_legacy: true
3114 - name: osd_debug_drop_ping_probability
3115 desc: N/A
3116 type: float
3117 level: dev
3118 default: 0
3119 with_legacy: true
3120 - name: osd_debug_drop_ping_duration
3121 desc: N/A
3122 type: int
3123 level: dev
3124 default: 0
3125 with_legacy: true
3126 - name: osd_debug_op_order
3127 type: bool
3128 level: dev
3129 default: false
3130 with_legacy: true
3131 - name: osd_debug_verify_missing_on_start
3132 type: bool
3133 level: dev
3134 default: false
3135 with_legacy: true
3136 - name: osd_debug_verify_snaps
3137 type: bool
3138 level: dev
3139 default: false
3140 with_legacy: true
3141 - name: osd_debug_verify_stray_on_activate
3142 type: bool
3143 level: dev
3144 default: false
3145 with_legacy: true
3146 - name: osd_debug_skip_full_check_in_backfill_reservation
3147 type: bool
3148 level: dev
3149 default: false
3150 with_legacy: true
3151 - name: osd_debug_reject_backfill_probability
3152 type: float
3153 level: dev
3154 default: 0
3155 with_legacy: true
3156 # inject failure during copyfrom completion
3157 - name: osd_debug_inject_copyfrom_error
3158 type: bool
3159 level: dev
3160 default: false
3161 with_legacy: true
3162 - name: osd_debug_misdirected_ops
3163 type: bool
3164 level: dev
3165 default: false
3166 with_legacy: true
3167 - name: osd_debug_skip_full_check_in_recovery
3168 type: bool
3169 level: dev
3170 default: false
3171 with_legacy: true
3172 - name: osd_debug_random_push_read_error
3173 type: float
3174 level: dev
3175 default: 0
3176 with_legacy: true
3177 - name: osd_debug_verify_cached_snaps
3178 type: bool
3179 level: dev
3180 default: false
3181 with_legacy: true
3182 - name: osd_debug_deep_scrub_sleep
3183 type: float
3184 level: dev
3185 desc: Inject an expensive sleep during deep scrub IO to make it easier to induce
3186 preemption
3187 default: 0
3188 with_legacy: true
3189 - name: osd_debug_no_acting_change
3190 type: bool
3191 level: dev
3192 default: false
3193 with_legacy: true
3194 - name: osd_debug_no_purge_strays
3195 type: bool
3196 level: dev
3197 default: false
3198 with_legacy: true
3199 - name: osd_debug_pretend_recovery_active
3200 type: bool
3201 level: dev
3202 default: false
3203 with_legacy: true
3204 # enable/disable OSD op tracking
3205 - name: osd_enable_op_tracker
3206 type: bool
3207 level: advanced
3208 default: true
3209 with_legacy: true
3210 # The number of shards for holding the ops
3211 - name: osd_num_op_tracker_shard
3212 type: uint
3213 level: advanced
3214 default: 32
3215 with_legacy: true
3216 # Max number of completed ops to track
3217 - name: osd_op_history_size
3218 type: uint
3219 level: advanced
3220 default: 20
3221 fmt_desc: The maximum number of completed operations to track.
3222 with_legacy: true
3223 # Oldest completed op to track
3224 - name: osd_op_history_duration
3225 type: uint
3226 level: advanced
3227 default: 600
3228 fmt_desc: The oldest completed operation to track.
3229 with_legacy: true
3230 # Max number of slow ops to track
3231 - name: osd_op_history_slow_op_size
3232 type: uint
3233 level: advanced
3234 default: 20
3235 with_legacy: true
3236 # track the op if over this threshold
3237 - name: osd_op_history_slow_op_threshold
3238 type: float
3239 level: advanced
3240 default: 10
3241 with_legacy: true
3242 # to adjust various transactions that batch smaller items
3243 - name: osd_target_transaction_size
3244 type: int
3245 level: advanced
3246 default: 30
3247 with_legacy: true
3248 # what % full makes an OSD "full" (failsafe)
3249 - name: osd_failsafe_full_ratio
3250 type: float
3251 level: advanced
3252 default: 0.97
3253 with_legacy: true
3254 - name: osd_fast_shutdown
3255 type: bool
3256 level: advanced
3257 desc: Fast, immediate shutdown
3258 long_desc: Setting this to false makes the OSD do a slower teardown of all state
3259 when it receives a SIGINT or SIGTERM or when shutting down for any other reason. That
3260 slow shutdown is primarily useful for doing memory leak checking with valgrind.
3261 default: true
3262 with_legacy: true
3263 - name: osd_fast_shutdown_timeout
3264 type: int
3265 level: advanced
3266 desc: timeout in seconds for osd fast-shutdown (0 is unlimited)
3267 default: 15
3268 with_legacy: true
3269 min: 0
3270 - name: osd_fast_shutdown_notify_mon
3271 type: bool
3272 level: advanced
3273 desc: Tell mon about OSD shutdown on immediate shutdown
3274 long_desc: Tell the monitor the OSD is shutting down on immediate shutdown. This
3275 helps with cluster log messages from other OSDs reporting it immediately failed.
3276 default: true
3277 see_also:
3278 - osd_fast_shutdown
3279 - osd_mon_shutdown_timeout
3280 with_legacy: true
3281 # immediately mark OSDs as down once they refuse to accept connections
3282 - name: osd_fast_fail_on_connection_refused
3283 type: bool
3284 level: advanced
3285 default: true
3286 fmt_desc: If this option is enabled, crashed OSDs are marked down
3287 immediately by connected peers and MONs (assuming that the
3288 crashed OSD host survives). Disable it to restore old
3289 behavior, at the expense of possible long I/O stalls when
3290 OSDs crash in the middle of I/O operations.
3291 with_legacy: true
3292 - name: osd_pg_object_context_cache_count
3293 type: int
3294 level: advanced
3295 default: 64
3296 with_legacy: true
3297 # true if LTTng-UST tracepoints should be enabled
3298 - name: osd_tracing
3299 type: bool
3300 level: advanced
3301 default: false
3302 with_legacy: true
3303 # true if function instrumentation should use LTTng
3304 - name: osd_function_tracing
3305 type: bool
3306 level: advanced
3307 default: false
3308 with_legacy: true
3309 # use fast info attr, if we can
3310 - name: osd_fast_info
3311 type: bool
3312 level: advanced
3313 default: true
3314 with_legacy: true
3315 # determines whether PGLog::check() compares written out log to stored log
3316 - name: osd_debug_pg_log_writeout
3317 type: bool
3318 level: dev
3319 default: false
3320 with_legacy: true
3321 # Max number of loops before we reset thread-pool's handle
3322 - name: osd_loop_before_reset_tphandle
3323 type: uint
3324 level: advanced
3325 default: 64
3326 with_legacy: true
3327 # default timeout while calling WaitInterval on an empty queue
3328 - name: threadpool_default_timeout
3329 type: int
3330 level: advanced
3331 default: 1_min
3332 with_legacy: true
3333 # default wait time for an empty queue before pinging the hb timeout
3334 - name: threadpool_empty_queue_max_wait
3335 type: int
3336 level: advanced
3337 default: 2
3338 with_legacy: true
3339 - name: leveldb_log_to_ceph_log
3340 type: bool
3341 level: advanced
3342 default: true
3343 with_legacy: true
3344 - name: leveldb_write_buffer_size
3345 type: size
3346 level: advanced
3347 default: 8_M
3348 with_legacy: true
3349 - name: leveldb_cache_size
3350 type: size
3351 level: advanced
3352 default: 128_M
3353 with_legacy: true
3354 - name: leveldb_block_size
3355 type: size
3356 level: advanced
3357 default: 0
3358 with_legacy: true
3359 - name: leveldb_bloom_size
3360 type: int
3361 level: advanced
3362 default: 0
3363 with_legacy: true
3364 - name: leveldb_max_open_files
3365 type: int
3366 level: advanced
3367 default: 0
3368 with_legacy: true
3369 - name: leveldb_compression
3370 type: bool
3371 level: advanced
3372 default: true
3373 with_legacy: true
3374 - name: leveldb_paranoid
3375 type: bool
3376 level: advanced
3377 default: false
3378 with_legacy: true
3379 - name: leveldb_log
3380 type: str
3381 level: advanced
3382 default: /dev/null
3383 with_legacy: true
3384 - name: leveldb_compact_on_mount
3385 type: bool
3386 level: advanced
3387 default: false
3388 with_legacy: true
3389 - name: rocksdb_log_to_ceph_log
3390 type: bool
3391 level: advanced
3392 default: true
3393 with_legacy: true
3394 - name: rocksdb_cache_size
3395 type: size
3396 level: advanced
3397 default: 512_M
3398 flags:
3399 - runtime
3400 with_legacy: true
3401 # ratio of cache for row (vs block)
3402 - name: rocksdb_cache_row_ratio
3403 type: float
3404 level: advanced
3405 default: 0
3406 with_legacy: true
3407 # rocksdb block cache shard bits, 4 bit -> 16 shards
3408 - name: rocksdb_cache_shard_bits
3409 type: int
3410 level: advanced
3411 default: 4
3412 with_legacy: true
3413 # 'lru' or 'clock'
3414 - name: rocksdb_cache_type
3415 type: str
3416 level: advanced
3417 default: binned_lru
3418 with_legacy: true
3419 - name: rocksdb_block_size
3420 type: size
3421 level: advanced
3422 default: 4_K
3423 with_legacy: true
3424 # Enabling this will have 5-10% impact on performance for the stats collection
3425 - name: rocksdb_perf
3426 type: bool
3427 level: advanced
3428 default: false
3429 with_legacy: true
3430 # For rocksdb, this behavior will be an overhead of 5%~10%, collected only if rocksdb_perf is enabled.
3431 - name: rocksdb_collect_compaction_stats
3432 type: bool
3433 level: advanced
3434 default: false
3435 with_legacy: true
3436 # For rocksdb, this behavior will be an overhead of 5%~10%, collected only if rocksdb_perf is enabled.
3437 - name: rocksdb_collect_extended_stats
3438 type: bool
3439 level: advanced
3440 default: false
3441 with_legacy: true
3442 # For rocksdb, this behavior will be an overhead of 5%~10%, collected only if rocksdb_perf is enabled.
3443 - name: rocksdb_collect_memory_stats
3444 type: bool
3445 level: advanced
3446 default: false
3447 with_legacy: true
3448 - name: rocksdb_delete_range_threshold
3449 type: uint
3450 level: advanced
3451 desc: The number of keys required to invoke DeleteRange when deleting multiple keys.
3452 default: 1_M
3453 - name: rocksdb_bloom_bits_per_key
3454 type: uint
3455 level: advanced
3456 desc: Number of bits per key to use for RocksDB's bloom filters.
3457 long_desc: 'RocksDB bloom filters can be used to quickly answer the question of
3458 whether or not a key may exist or definitely does not exist in a given RocksDB
3459 SST file without having to read all keys into memory. Using a higher bit value
3460 decreases the likelihood of false positives at the expense of additional disk
3461 space and memory consumption when the filter is loaded into RAM. The current
3462 default value of 20 was found to provide significant performance gains when getattr
3463 calls are made (such as during new object creation in bluestore) without significant
3464 memory overhead or cache pollution when combined with rocksdb partitioned index
3465 filters. See: https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters
3466 for more information.'
3467 default: 20
3468 - name: rocksdb_cache_index_and_filter_blocks
3469 type: bool
3470 level: dev
3471 desc: Whether to cache indices and filters in block cache
3472 long_desc: By default RocksDB will load an SST file's index and bloom filters into
3473 memory when it is opened and remove them from memory when an SST file is closed. Thus,
3474 memory consumption by indices and bloom filters is directly tied to the number
3475 of concurrent SST files allowed to be kept open. This option instead stores cached
3476 indices and filters in the block cache where they directly compete with other
3477 cached data. By default we set this option to true to better account for and
3478 bound rocksdb memory usage and keep filters in memory even when an SST file is
3479 closed.
3480 default: true
3481 - name: rocksdb_cache_index_and_filter_blocks_with_high_priority
3482 type: bool
3483 level: dev
3484 desc: Whether to cache indices and filters in the block cache with high priority
3485 long_desc: A downside of setting rocksdb_cache_index_and_filter_blocks to true is
3486 that regular data can push indices and filters out of memory. Setting this option
3487 to true means they are cached with higher priority than other data and should
3488 typically stay in the block cache.
3489 default: false
3490 - name: rocksdb_pin_l0_filter_and_index_blocks_in_cache
3491 type: bool
3492 level: dev
3493 desc: Whether to pin Level 0 indices and bloom filters in the block cache
3494 long_desc: A downside of setting rocksdb_cache_index_and_filter_blocks to true is
3495 that regular data can push indices and filters out of memory. Setting this option
3496 to true means that level 0 SST files will always have their indices and filters
3497 pinned in the block cache.
3498 default: false
3499 - name: rocksdb_index_type
3500 type: str
3501 level: dev
3502 desc: 'Type of index for SST files: binary_search, hash_search, two_level'
3503 long_desc: 'This option controls the table index type. binary_search is a space
3504 efficient index block that is optimized for block-search-based index. hash_search
3505 may improve prefix lookup performance at the expense of higher disk and memory
3506 usage and potentially slower compactions. two_level is an experimental index
3507 type that uses two binary search indexes and works in conjunction with partition
3508 filters. See: http://rocksdb.org/blog/2017/05/12/partitioned-index-filter.html'
3509 default: binary_search
3510 - name: rocksdb_partition_filters
3511 type: bool
3512 level: dev
3513 desc: (experimental) partition SST index/filters into smaller blocks
3514 long_desc: 'This is an experimental option for rocksdb that works in conjunction
3515 with two_level indices to avoid having to keep the entire filter/index in cache
3516 when cache_index_and_filter_blocks is true. The idea is to keep a much smaller
3517 top-level index in heap/cache and then opportunistically cache the lower level
3518 indices. See: https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters'
3519 default: false
3520 - name: rocksdb_metadata_block_size
3521 type: size
3522 level: dev
3523 desc: The block size for index partitions. (0 = rocksdb default)
3524 default: 4_K
3525 # osd_*_priority adjust the relative priority of client io, recovery io,
3526 # snaptrim io, etc
3527 #
3528 # osd_*_priority determines the ratio of available io between client and
3529 # recovery. Each option may be set between
3530 # 1..63.
3531 - name: osd_client_op_priority
3532 type: uint
3533 level: advanced
3534 default: 63
3535 fmt_desc: The priority set for client operations. This value is relative
3536 to that of ``osd_recovery_op_priority`` below. The default
3537 strongly favors client ops over recovery.
3538 with_legacy: true
3539 - name: osd_recovery_op_priority
3540 type: uint
3541 level: advanced
3542 desc: Priority to use for recovery operations if not specified for the pool
3543 fmt_desc: The priority of recovery operations vs client operations, if not specified by the
3544 pool's ``recovery_op_priority``. The default value prioritizes client
3545 ops (see above) over recovery ops. You may adjust the tradeoff of client
3546 impact against the time to restore cluster health by lowering this value
3547 for increased prioritization of client ops, or by increasing it to favor
3548 recovery.
3549 default: 3
3550 with_legacy: true
3551 - name: osd_peering_op_priority
3552 type: uint
3553 level: dev
3554 default: 255
3555 with_legacy: true
3556 - name: osd_snap_trim_priority
3557 type: uint
3558 level: advanced
3559 default: 5
3560 fmt_desc: The priority set for the snap trim work queue.
3561 with_legacy: true
3562 - name: osd_snap_trim_cost
3563 type: size
3564 level: advanced
3565 default: 1_M
3566 with_legacy: true
3567 - name: osd_pg_delete_priority
3568 type: uint
3569 level: advanced
3570 default: 5
3571 with_legacy: true
3572 - name: osd_pg_delete_cost
3573 type: size
3574 level: advanced
3575 default: 1_M
3576 with_legacy: true
3577 - name: osd_scrub_priority
3578 type: uint
3579 level: advanced
3580 desc: Priority for scrub operations in work queue
3581 fmt_desc: The default work queue priority for scheduled scrubs when the
3582 pool doesn't specify a value of ``scrub_priority``. This can be
3583 boosted to the value of ``osd_client_op_priority`` when scrubs are
3584 blocking client operations.
3585 default: 5
3586 with_legacy: true
3587 - name: osd_scrub_cost
3588 type: size
3589 level: advanced
3590 desc: Cost for scrub operations in work queue
3591 default: 50_M
3592 with_legacy: true
3593 # set requested scrub priority higher than scrub priority to make the
3594 # requested scrubs jump the queue of scheduled scrubs
3595 - name: osd_requested_scrub_priority
3596 type: uint
3597 level: advanced
3598 default: 120
3599 fmt_desc: The priority set for user requested scrub on the work queue. If
3600 this value were to be smaller than ``osd_client_op_priority`` it
3601 can be boosted to the value of ``osd_client_op_priority`` when
3602 scrub is blocking client operations.
3603 with_legacy: true
3604 - name: osd_recovery_priority
3605 type: uint
3606 level: advanced
3607 desc: Priority of recovery in the work queue
3608 long_desc: Not related to a pool's recovery_priority
3609 fmt_desc: The default priority set for recovery work queue. Not
3610 related to a pool's ``recovery_priority``.
3611 default: 5
3612 with_legacy: true
3613 # set default cost equal to 20MB io
3614 - name: osd_recovery_cost
3615 type: size
3616 level: advanced
3617 default: 20_M
3618 with_legacy: true
3619 # osd_recovery_op_warn_multiple scales the normal warning threshold,
3620 # osd_op_complaint_time, so that slow recovery ops won't cause noise
3621 - name: osd_recovery_op_warn_multiple
3622 type: uint
3623 level: advanced
3624 default: 16
3625 with_legacy: true
3626 # Max time to wait between notifying mon of shutdown and shutting down
3627 - name: osd_mon_shutdown_timeout
3628 type: float
3629 level: advanced
3630 default: 5
3631 with_legacy: true
3632 # crash if the OSD has stray PG refs on shutdown
3633 - name: osd_shutdown_pgref_assert
3634 type: bool
3635 level: advanced
3636 default: false
3637 with_legacy: true
3638 # OSD's maximum object size
3639 - name: osd_max_object_size
3640 type: size
3641 level: advanced
3642 default: 128_M
3643 fmt_desc: The maximum size of a RADOS object in bytes.
3644 with_legacy: true
3645 # max rados object name len
3646 - name: osd_max_object_name_len
3647 type: uint
3648 level: advanced
3649 default: 2_K
3650 with_legacy: true
3651 # max rados object namespace len
3652 - name: osd_max_object_namespace_len
3653 type: uint
3654 level: advanced
3655 default: 256
3656 with_legacy: true
3657 # max rados attr name len; cannot go higher than 100 chars for file system backends
3658 - name: osd_max_attr_name_len
3659 type: uint
3660 level: advanced
3661 default: 100
3662 with_legacy: true
3663 - name: osd_max_attr_size
3664 type: uint
3665 level: advanced
3666 default: 0
3667 with_legacy: true
3668 - name: osd_max_omap_entries_per_request
3669 type: uint
3670 level: advanced
3671 default: 1_K
3672 with_legacy: true
3673 - name: osd_max_omap_bytes_per_request
3674 type: size
3675 level: advanced
3676 default: 1_G
3677 with_legacy: true
3678 # osd_max_write_op_reply_len caps the per-op reply payload for requests
3679 # with the RETURNVEC flag set (sent to the client and recorded in the PG log)
3680 - name: osd_max_write_op_reply_len
3681 type: size
3682 level: advanced
3683 desc: Max size of the per-op payload for requests with the RETURNVEC flag set
3684 long_desc: This value caps the amount of data (per op; a request may have many ops)
3685 that will be sent back to the client and recorded in the PG log.
3686 default: 64
3687 with_legacy: true
3688 - name: osd_objectstore
3689 type: str
3690 level: advanced
3691 desc: backend type for an OSD (like filestore or bluestore)
3692 default: bluestore
3693 enum_values:
3694 - bluestore
3695 - filestore
3696 - memstore
3697 - kstore
3698 - seastore
3699 - cyanstore
3700 flags:
3701 - create
3702 with_legacy: true
3703 # true if LTTng-UST tracepoints should be enabled
3704 - name: osd_objectstore_tracing
3705 type: bool
3706 level: advanced
3707 default: false
3708 with_legacy: true
3709 - name: osd_objectstore_fuse
3710 type: bool
3711 level: advanced
3712 default: false
3713 with_legacy: true
3714 - name: osd_bench_small_size_max_iops
3715 type: uint
3716 level: advanced
3717 default: 100
3718 with_legacy: true
3719 - name: osd_bench_large_size_max_throughput
3720 type: size
3721 level: advanced
3722 default: 100_M
3723 with_legacy: true
3724 - name: osd_bench_max_block_size
3725 type: size
3726 level: advanced
3727 default: 64_M
3728 with_legacy: true
3729 # duration of 'osd bench', capped at 30s to avoid triggering timeouts
3730 - name: osd_bench_duration
3731 type: uint
3732 level: advanced
3733 default: 30
3734 with_legacy: true
3735 # create a blkin trace for all osd requests
3736 - name: osd_blkin_trace_all
3737 type: bool
3738 level: advanced
3739 default: false
3740 with_legacy: true
3741 # create a blkin trace for all objecter requests
3742 - name: osdc_blkin_trace_all
3743 type: bool
3744 level: advanced
3745 default: false
3746 with_legacy: true
3747 - name: osd_discard_disconnected_ops
3748 type: bool
3749 level: advanced
3750 default: true
3751 with_legacy: true
3752 - name: osd_memory_target
3753 type: size
3754 level: basic
3755 desc: When tcmalloc and cache autotuning is enabled, try to keep this many bytes
3756 mapped in memory.
3757 long_desc: The minimum value must be at least equal to osd_memory_base + osd_memory_cache_min.
3758 fmt_desc: |
3759 When TCMalloc is available and cache autotuning is enabled, try to
3760 keep this many bytes mapped in memory. Note: This may not exactly
3761 match the RSS memory usage of the process. While the total amount
3762 of heap memory mapped by the process should usually be close
3763 to this target, there is no guarantee that the kernel will actually
3764 reclaim memory that has been unmapped. During initial development,
3765 it was found that some kernels result in the OSD's RSS memory
3766 exceeding the mapped memory by up to 20%. It is hypothesised
3767 however, that the kernel generally may be more aggressive about
3768 reclaiming unmapped memory when there is a high amount of memory
3769 pressure. Your mileage may vary.
3770 default: 4_G
3771 see_also:
3772 - bluestore_cache_autotune
3773 - osd_memory_cache_min
3774 - osd_memory_base
3775 - osd_memory_target_autotune
3776 min: 896_M
3777 flags:
3778 - runtime
3779 - name: osd_memory_target_autotune
3780 type: bool
3781 default: false
3782 level: advanced
3783 desc: If enabled, allow orchestrator to automatically tune osd_memory_target
3784 see_also:
3785 - osd_memory_target
3786 - name: osd_memory_target_cgroup_limit_ratio
3787 type: float
3788 level: advanced
3789 desc: Set the default value for osd_memory_target to the cgroup memory limit (if
3790 set) times this value
3791 long_desc: A value of 0 disables this feature.
3792 default: 0.8
3793 see_also:
3794 - osd_memory_target
3795 min: 0
3796 max: 1
3797 - name: osd_memory_base
3798 type: size
3799 level: dev
3800 desc: When tcmalloc and cache autotuning is enabled, estimate the minimum amount
3801 of memory in bytes the OSD will need.
3802 fmt_desc: When TCMalloc and cache autotuning are enabled, estimate the minimum
3803 amount of memory in bytes the OSD will need. This is used to help
3804 the autotuner estimate the expected aggregate memory consumption of
3805 the caches.
3806 default: 768_M
3807 see_also:
3808 - bluestore_cache_autotune
3809 flags:
3810 - runtime
3811 - name: osd_memory_expected_fragmentation
3812 type: float
3813 level: dev
3814 desc: When tcmalloc and cache autotuning is enabled, estimate the percent of memory
3815 fragmentation.
3816 fmt_desc: When TCMalloc and cache autotuning is enabled, estimate the
3817 percentage of memory fragmentation. This is used to help the
3818 autotuner estimate the expected aggregate memory consumption
3819 of the caches.
3820 default: 0.15
3821 see_also:
3822 - bluestore_cache_autotune
3823 min: 0
3824 max: 1
3825 flags:
3826 - runtime
3827 - name: osd_memory_cache_min
3828 type: size
3829 level: dev
3830 desc: When tcmalloc and cache autotuning is enabled, set the minimum amount of memory
3831 used for caches.
3832 fmt_desc: |
3833 When TCMalloc and cache autotuning are enabled, set the minimum
3834 amount of memory used for caches. Note: Setting this value too
3835 low can result in significant cache thrashing.
3836 default: 128_M
3837 see_also:
3838 - bluestore_cache_autotune
3839 min: 128_M
3840 flags:
3841 - runtime
3842 - name: osd_memory_cache_resize_interval
3843 type: float
3844 level: dev
3845 desc: When tcmalloc and cache autotuning is enabled, wait this many seconds between
3846 resizing caches.
3847 fmt_desc: When TCMalloc and cache autotuning are enabled, wait this many
3848 seconds between resizing caches. This setting changes the total
3849 amount of memory available for BlueStore to use for caching. Note
3850 that setting this interval too small can result in memory allocator
3851 thrashing and lower performance.
3852 default: 1
3853 see_also:
3854 - bluestore_cache_autotune
3855 - name: memstore_device_bytes
3856 type: size
3857 level: advanced
3858 default: 1_G
3859 with_legacy: true
3860 - name: memstore_page_set
3861 type: bool
3862 level: advanced
3863 default: false
3864 with_legacy: true
3865 - name: memstore_page_size
3866 type: size
3867 level: advanced
3868 default: 64_K
3869 with_legacy: true
3870 - name: memstore_debug_omit_block_device_write
3871 type: bool
3872 level: dev
3873 desc: write metadata only
3874 default: false
3875 see_also:
3876 - bluestore_debug_omit_block_device_write
3877 with_legacy: true
3878 - name: objectstore_blackhole
3879 type: bool
3880 level: advanced
3881 default: false
3882 with_legacy: true
3883 - name: bdev_debug_inflight_ios
3884 type: bool
3885 level: dev
3886 default: false
3887 with_legacy: true
3888 # if N>0, then ~ 1/N IOs will complete before we crash on flush
3889 - name: bdev_inject_crash
3890 type: int
3891 level: dev
3892 default: 0
3893 with_legacy: true
3894 # wait N more seconds on flush
3895 - name: bdev_inject_crash_flush_delay
3896 type: int
3897 level: dev
3898 default: 2
3899 with_legacy: true
3900 - name: bdev_aio
3901 type: bool
3902 level: advanced
3903 default: true
3904 with_legacy: true
3905 # milliseconds
3906 - name: bdev_aio_poll_ms
3907 type: int
3908 level: advanced
3909 default: 250
3910 with_legacy: true
3911 - name: bdev_aio_max_queue_depth
3912 type: int
3913 level: advanced
3914 default: 1024
3915 with_legacy: true
3916 - name: bdev_aio_reap_max
3917 type: int
3918 level: advanced
3919 default: 16
3920 with_legacy: true
3921 - name: bdev_block_size
3922 type: size
3923 level: advanced
3924 default: 4_K
3925 with_legacy: true
3926 - name: bdev_read_buffer_alignment
3927 type: size
3928 level: advanced
3929 default: 4_K
3930 with_legacy: true
3931 - name: bdev_read_preallocated_huge_buffers
3932 type: str
3933 level: advanced
3934 desc: description of pools arrangement for huge page-based read buffers
3935 long_desc: Arrangement of preallocated, huge pages-based pools for reading
3936 from a KernelDevice. Applied to minimize size of scatter-gather lists
3937 sent to NICs. Targets really big buffers (>= 2 or 4 MBs).
3938 Keep in mind the system must be configured accordingly (see /proc/sys/vm/nr_hugepages).
3939 Otherwise the OSD will fail early.
3940 Beware BlueStore, by default, stores large chunks across many smaller blobs.
3941 Increasing bluestore_max_blob_size changes that, and thus allows the data to
3942 be read back into a small number of huge page-backed buffers.
3943 fmt_desc: List of key=value pairs delimited by comma, semicolon or tab.
3944 key specifies the targeted read size and must be expressed in bytes.
3945 value specifies the number of preallocated buffers.
3946 For instance, to preallocate 64 buffers that will be used to serve
3947 2 MB-sized read requests and 128 for 4 MB, someone needs to set
3948 "2097152=64,4194304=128".
3949 see_also:
3950 - bluestore_max_blob_size
3951 - name: bdev_debug_aio
3952 type: bool
3953 level: dev
3954 default: false
3955 with_legacy: true
3956 - name: bdev_debug_aio_suicide_timeout
3957 type: float
3958 level: dev
3959 default: 1_min
3960 with_legacy: true
3961 - name: bdev_debug_aio_log_age
3962 type: float
3963 level: dev
3964 default: 5
3965 with_legacy: true
3966 # if yes, osd will unbind all NVMe devices from kernel driver and bind them
3967 # to the uio_pci_generic driver. The purpose is to prevent the case where
3968 # NVMe driver is loaded while osd is running.
3969 - name: bdev_nvme_unbind_from_kernel
3970 type: bool
3971 level: advanced
3972 default: false
3973 with_legacy: true
3974 - name: bdev_enable_discard
3975 type: bool
3976 level: advanced
3977 default: false
3978 with_legacy: true
3979 - name: bdev_async_discard
3980 type: bool
3981 level: advanced
3982 default: false
3983 with_legacy: true
3984 - name: bdev_flock_retry_interval
3985 type: float
3986 level: advanced
3987 desc: interval to retry the flock
3988 default: 0.1
3989 - name: bdev_flock_retry
3990 type: uint
3991 level: advanced
3992 desc: times to retry the flock
3993 long_desc: The number of times to retry on getting the block device lock. Programs
3994 such as systemd-udevd may compete with Ceph for this lock. 0 means 'unlimited'.
3995 default: 3
3996 - name: bluefs_alloc_size
3997 type: size
3998 level: advanced
3999 desc: Allocation unit size for DB and WAL devices
4000 default: 1_M
4001 with_legacy: true
4002 - name: bluefs_shared_alloc_size
4003 type: size
4004 level: advanced
4005 desc: Allocation unit size for primary/shared device
4006 default: 64_K
4007 with_legacy: true
4008 - name: bluefs_max_prefetch
4009 type: size
4010 level: advanced
4011 default: 1_M
4012 with_legacy: true
4013 # alloc when we get this low
4014 - name: bluefs_min_log_runway
4015 type: size
4016 level: advanced
4017 default: 1_M
4018 with_legacy: true
4019 # alloc this much at a time
4020 - name: bluefs_max_log_runway
4021 type: size
4022 level: advanced
4023 default: 4_M
4024 with_legacy: true
4025 # before we consider
4026 - name: bluefs_log_compact_min_ratio
4027 type: float
4028 level: advanced
4029 default: 5
4030 with_legacy: true
4031 # before we consider
4032 - name: bluefs_log_compact_min_size
4033 type: size
4034 level: advanced
4035 default: 16_M
4036 with_legacy: true
4037 # ignore flush until it's this big
4038 - name: bluefs_min_flush_size
4039 type: size
4040 level: advanced
4041 default: 512_K
4042 with_legacy: true
4043 # sync or async log compaction
4044 - name: bluefs_compact_log_sync
4045 type: bool
4046 level: advanced
4047 default: false
4048 with_legacy: true
4049 - name: bluefs_buffered_io
4050 type: bool
4051 level: advanced
4052 desc: Enable buffered IO for bluefs reads.
4053 long_desc: When this option is enabled, bluefs will in some cases perform buffered
4054 reads. This allows the kernel page cache to act as a secondary cache for things
4055 like RocksDB block reads. For example, if the rocksdb block cache isn't large
4056 enough to hold all blocks during OMAP iteration, it may be possible to read them
4057 from page cache instead of from the disk. This can dramatically improve
4058 performance when the osd_memory_target is too small to hold all entries in block
4059 cache but it does come with downsides. It has been reported to occasionally
4060 cause excessive kernel swapping (and associated stalls) under certain workloads.
4061 Currently the best and most consistent performing combination appears to be
4062 enabling bluefs_buffered_io and disabling system level swap. It is possible
4063 that this recommendation may change in the future however.
4064 default: true
4065 with_legacy: true
4066 - name: bluefs_sync_write
4067 type: bool
4068 level: advanced
4069 default: false
4070 with_legacy: true
4071 - name: bluefs_allocator
4072 type: str
4073 level: dev
4074 default: hybrid
4075 enum_values:
4076 - bitmap
4077 - stupid
4078 - avl
4079 - hybrid
4080 with_legacy: true
4081 - name: bluefs_log_replay_check_allocations
4082 type: bool
4083 level: advanced
4084 desc: Enables checks for allocations consistency during log replay
4085 default: true
4086 with_legacy: true
4087 - name: bluefs_replay_recovery
4088 type: bool
4089 level: dev
4090 desc: Attempt to read bluefs log so large that it became unreadable.
4091 long_desc: If BlueFS log grows to extreme sizes (200GB+) it is likely that it becomes
4092 unreadable. This option enables heuristics that scan devices for missing data.
4093 DO NOT ENABLE BY DEFAULT
4094 default: false
4095 with_legacy: true
4096 - name: bluefs_replay_recovery_disable_compact
4097 type: bool
4098 level: advanced
4099 default: false
4100 with_legacy: true
4101 - name: bluefs_check_for_zeros
4102 type: bool
4103 level: dev
4104 desc: Check data read for suspicious pages
4105 long_desc: Looks into data read to check if there is a 4K block entirely filled
4106 with zeros. If this happens, we re-read data. If there is difference, we print
4107 error to log.
4108 default: false
4109 see_also:
4110 - bluestore_retry_disk_reads
4111 flags:
4112 - runtime
4113 with_legacy: true
4114 - name: bluefs_check_volume_selector_on_umount
4115 type: bool
4116 level: dev
4117 desc: Check validity of volume selector on umount
4118 long_desc: Checks if volume selector did not diverge from the state it should be in.
4119 Reference is constructed from bluefs inode table. Asserts on inconsistency.
4120 default: false
4121 flags:
4122 - runtime
4123 with_legacy: true
4124 - name: bluefs_check_volume_selector_often
4125 type: bool
4126 level: dev
4127 desc: Periodically check validity of volume selector
4128 long_desc: Periodically checks if current volume selector does not diverge from the valid state.
4129 Reference is constructed from bluefs inode table. Asserts on inconsistency. This is a debug feature.
4130 default: false
4131 see_also:
4132 - bluefs_check_volume_selector_on_umount
4133 flags:
4134 - startup
4135 with_legacy: true
4136 - name: bluestore_bluefs
4137 type: bool
4138 level: dev
4139 desc: Use BlueFS to back rocksdb
4140 long_desc: BlueFS allows rocksdb to share the same physical device(s) as the rest
4141 of BlueStore. It should be used in all cases unless testing/developing an alternative
4142 metadata database for BlueStore.
4143 default: true
4144 flags:
4145 - create
4146 with_legacy: true
4147 # mirror to normal Env for debug
4148 - name: bluestore_bluefs_env_mirror
4149 type: bool
4150 level: dev
4151 desc: Mirror bluefs data to file system for testing/validation
4152 default: false
4153 flags:
4154 - create
4155 with_legacy: true
4156 - name: bluestore_bluefs_max_free
4157 type: size
4158 level: advanced
4159 default: 10_G
4160 desc: Maximum free space allocated to BlueFS
4161 - name: bluestore_bluefs_alloc_failure_dump_interval
4162 type: float
4163 level: advanced
4164 desc: How frequently (in seconds) to dump allocator on BlueFS space allocation failure
4165 default: 0
4166 with_legacy: true
4167 - name: bluestore_spdk_mem
4168 type: size
4169 level: dev
4170 desc: Amount of dpdk memory size in MB
4171 long_desc: If running multiple SPDK instances per node, you must specify the amount
4172 of dpdk memory size in MB each instance will use, to make sure each instance uses
4173 its own dpdk memory
4174 default: 512
4175 - name: bluestore_spdk_coremask
4176 type: str
4177 level: dev
4178 desc: A hexadecimal bit mask of the cores to run on. Note the core numbering can
4179 change between platforms and should be determined beforehand
4180 default: '0x1'
4181 - name: bluestore_spdk_max_io_completion
4182 type: uint
4183 level: dev
4184 desc: Maximum number of I/Os to be completed in one batch while checking queue pair completions,
4185 0 means let spdk library determine it
4186 default: 0
4187 - name: bluestore_spdk_io_sleep
4188 type: uint
4189 level: dev
4190 desc: Time period to wait if there is no completed I/O from polling
4191 default: 5
4192 # If you want to use spdk driver, you need to specify NVMe serial number here
4193 # with "spdk:" prefix.
4194 # Users can use 'lspci -vvv -d 8086:0953 | grep "Device Serial Number"' to
4195 # get the serial number of Intel(R) Fultondale NVMe controllers.
4196 # Example:
4197 # bluestore_block_path = spdk:55cd2e404bd73932
4198 - name: bluestore_block_path
4199 type: str
4200 level: dev
4201 desc: Path to block device/file
4202 flags:
4203 - create
4204 with_legacy: true
4205 - name: bluestore_block_size
4206 type: size
4207 level: dev
4208 desc: Size of file to create for backing bluestore
4209 default: 100_G
4210 flags:
4211 - create
4212 with_legacy: true
4213 - name: bluestore_block_create
4214 type: bool
4215 level: dev
4216 desc: Create bluestore_block_path if it doesn't exist
4217 default: true
4218 see_also:
4219 - bluestore_block_path
4220 - bluestore_block_size
4221 flags:
4222 - create
4223 with_legacy: true
4224 - name: bluestore_block_db_path
4225 type: str
4226 level: dev
4227 desc: Path for db block device
4228 flags:
4229 - create
4230 with_legacy: true
4231 # rocksdb ssts (hot/warm)
4232 - name: bluestore_block_db_size
4233 type: size
4234 level: dev
4235 desc: Size of file to create for bluestore_block_db_path
4236 default: 0
4237 flags:
4238 - create
4239 with_legacy: true
4240 - name: bluestore_block_db_create
4241 type: bool
4242 level: dev
4243 desc: Create bluestore_block_db_path if it doesn't exist
4244 default: false
4245 see_also:
4246 - bluestore_block_db_path
4247 - bluestore_block_db_size
4248 flags:
4249 - create
4250 with_legacy: true
4251 - name: bluestore_block_wal_path
4252 type: str
4253 level: dev
4254 desc: Path to block device/file backing bluefs wal
4255 flags:
4256 - create
4257 with_legacy: true
4258 # rocksdb wal
4259 - name: bluestore_block_wal_size
4260 type: size
4261 level: dev
4262 desc: Size of file to create for bluestore_block_wal_path
4263 default: 96_M
4264 flags:
4265 - create
4266 with_legacy: true
4267 - name: bluestore_block_wal_create
4268 type: bool
4269 level: dev
4270 desc: Create bluestore_block_wal_path if it doesn't exist
4271 default: false
4272 see_also:
4273 - bluestore_block_wal_path
4274 - bluestore_block_wal_size
4275 flags:
4276 - create
4277 with_legacy: true
4278 # whether to preallocate space if block/db_path/wal_path is a file rather than a block device.
4279 - name: bluestore_block_preallocate_file
4280 type: bool
4281 level: dev
4282 desc: Preallocate file created via bluestore_block*_create
4283 default: false
4284 flags:
4285 - create
4286 with_legacy: true
4287 - name: bluestore_ignore_data_csum
4288 type: bool
4289 level: dev
4290 desc: Ignore checksum errors on read and do not generate an EIO error
4291 default: false
4292 flags:
4293 - runtime
4294 with_legacy: true
4295 - name: bluestore_csum_type
4296 type: str
4297 level: advanced
4298 desc: Default checksum algorithm to use
4299 long_desc: crc32c, xxhash32, and xxhash64 are available. The _16 and _8 variants
4300 use only a subset of the bits for more compact (but less reliable) checksumming.
4301 fmt_desc: The default checksum algorithm to use.
4302 default: crc32c
4303 enum_values:
4304 - none
4305 - crc32c
4306 - crc32c_16
4307 - crc32c_8
4308 - xxhash32
4309 - xxhash64
4310 flags:
4311 - runtime
4312 with_legacy: true
4313 - name: bluestore_retry_disk_reads
4314 type: uint
4315 level: advanced
4316 desc: Number of read retries on checksum validation error
4317 long_desc: Retries to read data from the disk this many times when checksum validation
4318 fails to handle spurious read errors gracefully.
4319 default: 3
4320 min: 0
4321 max: 255
4322 flags:
4323 - runtime
4324 with_legacy: true
4325 - name: bluestore_min_alloc_size
4326 type: uint
4327 level: advanced
4328 desc: Minimum allocation size to allocate for an object
4329 long_desc: A smaller allocation size generally means less data is read and then
4330 rewritten when a copy-on-write operation is triggered (e.g., when writing to something
4331 that was recently snapshotted). Similarly, less data is journaled before performing
4332 an overwrite (writes smaller than min_alloc_size must first pass through the BlueStore
4333 journal). Larger values of min_alloc_size reduce the amount of metadata required
4334 to describe the on-disk layout and reduce overall fragmentation.
4335 default: 0
4336 flags:
4337 - create
4338 with_legacy: true
4339 - name: bluestore_min_alloc_size_hdd
4340 type: size
4341 level: advanced
4342 desc: Default min_alloc_size value for rotational media
4343 default: 4_K
4344 see_also:
4345 - bluestore_min_alloc_size
4346 flags:
4347 - create
4348 with_legacy: true
4349 - name: bluestore_min_alloc_size_ssd
4350 type: size
4351 level: advanced
4352 desc: Default min_alloc_size value for non-rotational (solid state) media
4353 default: 4_K
4354 see_also:
4355 - bluestore_min_alloc_size
4356 flags:
4357 - create
4358 with_legacy: true
4359 - name: bluestore_use_optimal_io_size_for_min_alloc_size
4360 type: bool
4361 level: advanced
4362 desc: Discover media optimal IO Size and use for min_alloc_size
4363 default: false
4364 see_also:
4365 - bluestore_min_alloc_size
4366 flags:
4367 - create
4368 with_legacy: true
4369 - name: bluestore_max_alloc_size
4370 type: size
4371 level: advanced
4372 desc: Maximum size of a single allocation (0 for no max)
4373 default: 0
4374 flags:
4375 - create
4376 with_legacy: true
4377 - name: bluestore_prefer_deferred_size
4378 type: size
4379 level: advanced
4380 desc: Writes smaller than this size will be written to the journal and then asynchronously
4381 written to the device. This can be beneficial when using rotational media where
4382 seeks are expensive, and is helpful both with and without solid state journal/wal
4383 devices.
4384 default: 0
4385 flags:
4386 - runtime
4387 with_legacy: true
4388 - name: bluestore_prefer_deferred_size_hdd
4389 type: size
4390 level: advanced
4391 desc: Default bluestore_prefer_deferred_size for rotational media
4392 default: 64_K
4393 see_also:
4394 - bluestore_prefer_deferred_size
4395 flags:
4396 - runtime
4397 with_legacy: true
4398 - name: bluestore_prefer_deferred_size_ssd
4399 type: size
4400 level: advanced
4401 desc: Default bluestore_prefer_deferred_size for non-rotational (solid state) media
4402 default: 0
4403 see_also:
4404 - bluestore_prefer_deferred_size
4405 flags:
4406 - runtime
4407 with_legacy: true
4408 - name: bluestore_compression_mode
4409 type: str
4410 level: advanced
4411 desc: Default policy for using compression when pool does not specify
4412 long_desc: '''none'' means never use compression. ''passive'' means use compression
4413 when clients hint that data is compressible. ''aggressive'' means use compression
4414 unless clients hint that data is not compressible. This option is used when the
4415 per-pool property for the compression mode is not present.'
4416 fmt_desc: The default policy for using compression if the per-pool property
4417 ``compression_mode`` is not set. ``none`` means never use
4418 compression. ``passive`` means use compression when
4419 :c:func:`clients hint <rados_set_alloc_hint>` that data is
4420 compressible. ``aggressive`` means use compression unless
4421 clients hint that data is not compressible. ``force`` means use
4422 compression under all circumstances even if the clients hint that
4423 the data is not compressible.
4424 default: none
4425 enum_values:
4426 - none
4427 - passive
4428 - aggressive
4429 - force
4430 flags:
4431 - runtime
4432 with_legacy: true
4433 - name: bluestore_compression_algorithm
4434 type: str
4435 level: advanced
4436 desc: Default compression algorithm to use when writing object data
4437 long_desc: This controls the default compressor to use (if any) if the per-pool
4438 property is not set. Note that zstd is *not* recommended for bluestore due to
4439 high CPU overhead when compressing small amounts of data.
4440 fmt_desc: The default compressor to use (if any) if the per-pool property
4441 ``compression_algorithm`` is not set. Note that ``zstd`` is *not*
4442 recommended for BlueStore due to high CPU overhead when
4443 compressing small amounts of data.
4444 default: snappy
4445 enum_values:
4446 - ''
4447 - snappy
4448 - zlib
4449 - zstd
4450 - lz4
4451 flags:
4452 - runtime
4453 with_legacy: true
4454 - name: bluestore_compression_min_blob_size
4455 type: size
4456 level: advanced
4457 desc: Maximum chunk size to apply compression to when random access is expected
4458 for an object.
4459 long_desc: Chunks larger than this are broken into smaller chunks before being compressed
4460 fmt_desc: Chunks smaller than this are never compressed.
4461 The per-pool property ``compression_min_blob_size`` overrides
4462 this setting.
4463 default: 0
4464 flags:
4465 - runtime
4466 with_legacy: true
4467 - name: bluestore_compression_min_blob_size_hdd
4468 type: size
4469 level: advanced
4470 desc: Default value of bluestore_compression_min_blob_size for rotational media
4471 fmt_desc: Default value of ``bluestore compression min blob size``
4472 for rotational media.
4473 default: 8_K
4474 see_also:
4475 - bluestore_compression_min_blob_size
4476 flags:
4477 - runtime
4478 with_legacy: true
4479 - name: bluestore_compression_min_blob_size_ssd
4480 type: size
4481 level: advanced
4482 desc: Default value of bluestore_compression_min_blob_size for non-rotational (solid
4483 state) media
4484 fmt_desc: Default value of ``bluestore compression min blob size``
4485 for non-rotational (solid state) media.
4486 default: 64_K
4487 see_also:
4488 - bluestore_compression_min_blob_size
4489 flags:
4490 - runtime
4491 with_legacy: true
4492 - name: bluestore_compression_max_blob_size
4493 type: size
4494 level: advanced
4495 desc: Maximum chunk size to apply compression to when non-random access is expected
4496 for an object.
4497 long_desc: Chunks larger than this are broken into smaller chunks before being compressed
4498 fmt_desc: Chunks larger than this value are broken into smaller blobs of at most
4499 ``bluestore_compression_max_blob_size`` bytes before being compressed.
4500 The per-pool property ``compression_max_blob_size`` overrides
4501 this setting.
4502 default: 0
4503 flags:
4504 - runtime
4505 with_legacy: true
4506 - name: bluestore_compression_max_blob_size_hdd
4507 type: size
4508 level: advanced
4509 desc: Default value of bluestore_compression_max_blob_size for rotational media
4510 fmt_desc: Default value of ``bluestore compression max blob size``
4511 for rotational media.
4512 default: 64_K
4513 see_also:
4514 - bluestore_compression_max_blob_size
4515 flags:
4516 - runtime
4517 with_legacy: true
4518 - name: bluestore_compression_max_blob_size_ssd
4519 type: size
4520 level: advanced
4521 desc: Default value of bluestore_compression_max_blob_size for non-rotational (solid
4522 state) media
4523 fmt_desc: Default value of ``bluestore compression max blob size``
4524 for non-rotational (SSD, NVMe) media.
4525 default: 64_K
4526 see_also:
4527 - bluestore_compression_max_blob_size
4528 flags:
4529 - runtime
4530 with_legacy: true
4531 # Specifies minimum expected amount of saved allocation units
4532 # per single blob to enable compressed blobs garbage collection
4533 - name: bluestore_gc_enable_blob_threshold
4534 type: int
4535 level: dev
4536 default: 0
4537 flags:
4538 - runtime
4539 with_legacy: true
4540 # Specifies minimum expected amount of saved allocation units
4541 # per all blobs to enable compressed blobs garbage collection
4542 - name: bluestore_gc_enable_total_threshold
4543 type: int
4544 level: dev
4545 default: 0
4546 flags:
4547 - runtime
4548 with_legacy: true
4549 - name: bluestore_max_blob_size
4550 type: size
4551 level: dev
4552 long_desc: Bluestore blobs are collections of extents (ie on-disk data) originating
4553 from one or more objects. Blobs can be compressed, typically have checksum data,
4554 may be overwritten, may be shared (with an extent ref map), or split. This setting
4555 controls the maximum size a blob is allowed to be.
4556 default: 0
4557 flags:
4558 - runtime
4559 with_legacy: true
4560 - name: bluestore_max_blob_size_hdd
4561 type: size
4562 level: dev
4563 default: 64_K
4564 see_also:
4565 - bluestore_max_blob_size
4566 flags:
4567 - runtime
4568 with_legacy: true
4569 - name: bluestore_max_blob_size_ssd
4570 type: size
4571 level: dev
4572 default: 64_K
4573 see_also:
4574 - bluestore_max_blob_size
4575 flags:
4576 - runtime
4577 with_legacy: true
4578 # Require the net gain of compression at least to be at this ratio,
4579 # otherwise we don't compress.
4580 # And ask for compressing at least 12.5%(1/8) off, by default.
4581 - name: bluestore_compression_required_ratio
4582 type: float
4583 level: advanced
4584 desc: Compression ratio required to store compressed data
4585 long_desc: If we compress data and get less than this we discard the result and
4586 store the original uncompressed data.
4587 fmt_desc: The ratio of the size of the data chunk after
4588 compression relative to the original size must be at
4589 least this small in order to store the compressed
4590 version.
4591 default: 0.875
4592 flags:
4593 - runtime
4594 with_legacy: true
4595 - name: bluestore_extent_map_shard_max_size
4596 type: size
4597 level: dev
4598 desc: Max size (bytes) for a single extent map shard before splitting
4599 default: 1200
4600 with_legacy: true
4601 - name: bluestore_extent_map_shard_target_size
4602 type: size
4603 level: dev
4604 desc: Target size (bytes) for a single extent map shard
4605 default: 500
4606 with_legacy: true
4607 - name: bluestore_extent_map_shard_min_size
4608 type: size
4609 level: dev
4610 desc: Min size (bytes) for a single extent map shard before merging
4611 default: 150
4612 with_legacy: true
4613 - name: bluestore_extent_map_shard_target_size_slop
4614 type: float
4615 level: dev
4616 desc: Ratio above/below target for a shard when trying to align to an existing extent
4617 or blob boundary
4618 default: 0.2
4619 with_legacy: true
4620 - name: bluestore_extent_map_inline_shard_prealloc_size
4621 type: size
4622 level: dev
4623 desc: Preallocated buffer for inline shards
4624 default: 256
4625 with_legacy: true
4626 - name: bluestore_cache_trim_interval
4627 type: float
4628 level: advanced
4629 desc: How frequently we trim the bluestore cache
4630 default: 0.05
4631 with_legacy: true
4632 - name: bluestore_cache_trim_max_skip_pinned
4633 type: uint
4634 level: dev
4635 desc: Max pinned cache entries we consider before giving up
4636 default: 1000
4637 with_legacy: true
4638 - name: bluestore_cache_type
4639 type: str
4640 level: dev
4641 desc: Cache replacement algorithm
4642 default: 2q
4643 enum_values:
4644 - 2q
4645 - lru
4646 with_legacy: true
4647 - name: bluestore_2q_cache_kin_ratio
4648 type: float
4649 level: dev
4650 desc: 2Q paper suggests .5
4651 default: 0.5
4652 with_legacy: true
4653 - name: bluestore_2q_cache_kout_ratio
4654 type: float
4655 level: dev
4656 desc: 2Q paper suggests .5
4657 default: 0.5
4658 with_legacy: true
4659 - name: bluestore_cache_size
4660 type: size
4661 level: dev
4662 desc: Cache size (in bytes) for BlueStore
4663 long_desc: This includes data and metadata cached by BlueStore as well as memory
4664 devoted to rocksdb's cache(s).
4665 fmt_desc: The amount of memory BlueStore will use for its cache. If zero,
4666 ``bluestore_cache_size_hdd`` or ``bluestore_cache_size_ssd`` will
4667 be used instead.
4668 default: 0
4669 with_legacy: true
4670 - name: bluestore_cache_size_hdd
4671 type: size
4672 level: dev
4673 desc: Default bluestore_cache_size for rotational media
4674 fmt_desc: The default amount of memory BlueStore will use for its cache when
4675 backed by an HDD.
4676 default: 1_G
4677 see_also:
4678 - bluestore_cache_size
4679 with_legacy: true
4680 - name: bluestore_cache_size_ssd
4681 type: size
4682 level: dev
4683 desc: Default bluestore_cache_size for non-rotational (solid state) media
4684 fmt_desc: The default amount of memory BlueStore will use for its cache when
4685 backed by an SSD.
4686 default: 3_G
4687 see_also:
4688 - bluestore_cache_size
4689 with_legacy: true
4690 - name: bluestore_cache_meta_ratio
4691 type: float
4692 level: dev
4693 desc: Ratio of bluestore cache to devote to metadata
4694 default: 0.45
4695 see_also:
4696 - bluestore_cache_size
4697 with_legacy: true
4698 - name: bluestore_cache_kv_ratio
4699 type: float
4700 level: dev
4701 desc: Ratio of bluestore cache to devote to key/value database (RocksDB)
4702 default: 0.45
4703 see_also:
4704 - bluestore_cache_size
4705 with_legacy: true
4706 - name: bluestore_cache_kv_onode_ratio
4707 type: float
4708 level: dev
4709 desc: Ratio of bluestore cache to devote to kv onode column family (rocksdb)
4710 default: 0.04
4711 see_also:
4712 - bluestore_cache_size
4713 - name: bluestore_cache_autotune
4714 type: bool
4715 level: dev
4716 desc: Automatically tune the ratio of caches while respecting min values.
4717 fmt_desc: Automatically tune the space ratios assigned to various BlueStore
4718 caches while respecting minimum values.
4719 default: true
4720 see_also:
4721 - bluestore_cache_size
4722 - bluestore_cache_meta_ratio
4723 - name: bluestore_cache_autotune_interval
4724 type: float
4725 level: dev
4726 desc: The number of seconds to wait between rebalances when cache autotune is enabled.
4727 fmt_desc: |
4728 The number of seconds to wait between rebalances when cache autotune
4729 is enabled. This setting changes how quickly the allocation ratios of
4730 various caches are recomputed. Note: Setting this interval too small
4731 can result in high CPU usage and lower performance.
4732 default: 5
4733 see_also:
4734 - bluestore_cache_autotune
4735 - name: bluestore_cache_age_bin_interval
4736 type: float
4737 level: dev
4738 desc: The duration (in seconds) represented by a single cache age bin.
4739 fmt_desc: |
4740 The caches used by bluestore will assign cache entries to an 'age bin'
4741 that represents a period of time during which that cache entry was most
4742 recently updated. By binning the caches in this way, Ceph's priority
4743 cache balancing code can make better decisions about which caches should
4744 receive priority based on the relative ages of items in the caches. By
4745 default, a single cache age bin represents 1 second of time. Note:
4746 Setting this interval too small can result in high CPU usage and lower
4747 performance.
4748 default: 1
4749 see_also:
4750 - bluestore_cache_age_bins_kv
4751 - bluestore_cache_age_bins_kv_onode
4752 - bluestore_cache_age_bins_meta
4753 - bluestore_cache_age_bins_data
4754 - name: bluestore_cache_age_bins_kv
4755 type: str
4756 level: dev
4757 desc: A 10 element, space separated list of age bins for kv cache
4758 fmt_desc: |
4759 A 10 element, space separated list of cache age bins grouped by
4760 priority such that PRI1=[0,n), PRI2=[n,n+1), PRI3=[n+1,n+2) ...
4761 PRI10=[n+8,n+9). Values represent the starting and ending bin for each
4762 priority level. A 0 in the 2nd term will prevent any items from being
4763 associated with that priority. bin duration is based on the
4764 bluestore_cache_age_bin_interval value. For example,
4765 "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1
4766 contains 1 age bin. Assuming the default age bin interval of 1 second,
4767 PRI1 represents cache items that are less than 1 second old. PRI2 has 4
4768 bins representing cache items that are 1 to less than 5 seconds old. All
4769 other cache items in this example are associated with the lowest priority
4770 level as PRI3-PRI10 all have 0s in their second term.
4771 default: "1 2 6 24 120 720 0 0 0 0"
4772 see_also:
4773 - bluestore_cache_age_bin_interval
4774 - name: bluestore_cache_age_bins_kv_onode
4775 type: str
4776 level: dev
4777 desc: A 10 element, space separated list of age bins for kv onode cache
4778 fmt_desc: |
4779 A 10 element, space separated list of cache age bins grouped by
4780 priority such that PRI1=[0,n), PRI2=[n,n+1), PRI3=[n+1,n+2) ...
4781 PRI10=[n+8,n+9). Values represent the starting and ending bin for each
4782 priority level. A 0 in the 2nd term will prevent any items from being
4783 associated with that priority. bin duration is based on the
4784 bluestore_cache_age_bin_interval value. For example,
4785 "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1
4786 contains 1 age bin. Assuming the default age bin interval of 1 second,
4787 PRI1 represents cache items that are less than 1 second old. PRI2 has 4
4788 bins representing cache items that are 1 to less than 5 seconds old. All
4789 other cache items in this example are associated with the lowest priority
4790 level as PRI3-PRI10 all have 0s in their second term.
4791 default: "0 0 0 0 0 0 0 0 0 720"
4792 see_also:
4793 - bluestore_cache_age_bin_interval
4794 - name: bluestore_cache_age_bins_meta
4795 type: str
4796 level: dev
4797 desc: A 10 element, space separated list of age bins for onode cache
4798 fmt_desc: |
4799 A 10 element, space separated list of cache age bins grouped by
4800 priority such that PRI1=[0,n), PRI2=[n,n+1), PRI3=[n+1,n+2) ...
4801 PRI10=[n+8,n+9). Values represent the starting and ending bin for each
4802 priority level. A 0 in the 2nd term will prevent any items from being
4803 associated with that priority. bin duration is based on the
4804 bluestore_cache_age_bin_interval value. For example,
4805 "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1
4806 contains 1 age bin. Assuming the default age bin interval of 1 second,
4807 PRI1 represents cache items that are less than 1 second old. PRI2 has 4
4808 bins representing cache items that are 1 to less than 5 seconds old. All
4809 other cache items in this example are associated with the lowest priority
4810 level as PRI3-PRI10 all have 0s in their second term.
4811 default: "1 2 6 24 120 720 0 0 0 0"
4812 see_also:
4813 - bluestore_cache_age_bin_interval
4814 - name: bluestore_cache_age_bins_data
4815 type: str
4816 level: dev
4817 desc: A 10 element, space separated list of age bins for data cache
4818 fmt_desc: |
4819 A 10 element, space separated list of cache age bins grouped by
4820 priority such that PRI1=[0,n), PRI2=[n,n+1), PRI3=[n+1,n+2) ...
4821 PRI10=[n+8,n+9). Values represent the starting and ending bin for each
4822 priority level. A 0 in the 2nd term will prevent any items from being
4823 associated with that priority. bin duration is based on the
4824 bluestore_cache_age_bin_interval value. For example,
4825 "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1
4826 contains 1 age bin. Assuming the default age bin interval of 1 second,
4827 PRI1 represents cache items that are less than 1 second old. PRI2 has 4
4828 bins representing cache items that are 1 to less than 5 seconds old. All
4829 other cache items in this example are associated with the lowest priority
4830 level as PRI3-PRI10 all have 0s in their second term.
4831 default: "1 2 6 24 120 720 0 0 0 0"
4832 see_also:
4833 - bluestore_cache_age_bin_interval
4834 - name: bluestore_alloc_stats_dump_interval
4835 type: float
4836 level: dev
4837 desc: The period (in seconds) for logging allocation statistics.
4838 default: 1_day
4839 with_legacy: true
4840 - name: bluestore_kvbackend
4841 type: str
4842 level: dev
4843 desc: Key value database to use for bluestore
4844 default: rocksdb
4845 flags:
4846 - create
4847 with_legacy: true
4848 - name: bluestore_allocator
4849 type: str
4850 level: advanced
4851 desc: Allocator policy
4852 long_desc: Allocator to use for bluestore. Stupid should only be used for testing.
4853 default: hybrid
4854 enum_values:
4855 - bitmap
4856 - stupid
4857 - avl
4858 - hybrid
4859 - zoned
4860 with_legacy: true
4861 - name: bluestore_freelist_blocks_per_key
4862 type: size
4863 level: dev
4864 desc: Block (and bits) per database key
4865 default: 128
4866 with_legacy: true
4867 - name: bluestore_bitmapallocator_blocks_per_zone
4868 type: size
4869 level: dev
4870 default: 1_K
4871 with_legacy: true
4872 - name: bluestore_bitmapallocator_span_size
4873 type: size
4874 level: dev
4875 default: 1_K
4876 with_legacy: true
4877 - name: bluestore_max_deferred_txc
4878 type: uint
4879 level: advanced
4880 desc: Max transactions with deferred writes that can accumulate before we force
4881 flush deferred writes
4882 default: 32
4883 with_legacy: true
4884 - name: bluestore_max_defer_interval
4885 type: float
4886 level: advanced
4887 desc: max duration to force deferred submit
4888 default: 3
4889 with_legacy: true
4890 - name: bluestore_rocksdb_options
4891 type: str
4892 level: advanced
4893 desc: Full set of rocksdb settings to override
4894 default: compression=kNoCompression,max_write_buffer_number=4,min_write_buffer_number_to_merge=1,recycle_log_file_num=4,write_buffer_size=268435456,writable_file_max_buffer_size=0,compaction_readahead_size=2097152,max_background_compactions=2,max_total_wal_size=1073741824
4895 with_legacy: true
4896 - name: bluestore_rocksdb_options_annex
4897 type: str
4898 level: advanced
4899 desc: An addition to bluestore_rocksdb_options. Allows setting rocksdb options without
4900 repeating the existing defaults.
4901 with_legacy: true
4902 - name: bluestore_rocksdb_cf
4903 type: bool
4904 level: advanced
4905 desc: Enable use of rocksdb column families for bluestore metadata
4906 fmt_desc: Enables sharding of BlueStore's RocksDB.
4907 When ``true``, ``bluestore_rocksdb_cfs`` is used.
4908 Only applied when OSD is doing ``--mkfs``.
4909 default: true
4910 verbatim: |
4911 #ifdef WITH_SEASTAR
4912 // This is necessary as the Seastar's allocator imposes restrictions
4913 // on the number of threads that entered malloc/free/*. Unfortunately,
4914 // RocksDB sharding in BlueStore dramatically lifted the number of
4915 // threads spawn during RocksDB's init.
4916 .set_validator([](std::string *value, std::string *error_message) {
4917 if (const bool parsed_value = strict_strtob(value->c_str(), error_message);
4918 error_message->empty() && parsed_value) {
4919 *error_message = "invalid BlueStore sharding configuration."
4920 " Be aware any change takes effect only on mkfs!";
4921 return -EINVAL;
4922 } else {
4923 return 0;
4924 }
4925 })
4926 #endif
4927 - name: bluestore_rocksdb_cfs
4928 type: str
4929 level: dev
4930 desc: Definition of column families and their sharding
4931 long_desc: 'Space separated list of elements: column_def [ ''='' rocksdb_options
4932 ]. column_def := column_name [ ''('' shard_count [ '','' hash_begin ''-'' [ hash_end
4933 ] ] '')'' ]. Example: ''I=write_buffer_size=1048576 O(6) m(7,10-)''. Interval
4934 [hash_begin..hash_end) defines characters to use for hash calculation. Recommended
4935 hash ranges: O(0-13) P(0-8) m(0-16). Sharding of S,T,C,M,B prefixes is not advised'
4936 fmt_desc: Definition of BlueStore's RocksDB sharding.
4937 The optimal value depends on multiple factors, and modification is inadvisable.
4938 This setting is used only when OSD is doing ``--mkfs``.
4939 Next runs of OSD retrieve sharding from disk.
4940 default: m(3) p(3,0-12) O(3,0-13)=block_cache={type=binned_lru} L P
4941 - name: bluestore_qfsck_on_mount
4942 type: bool
4943 level: dev
4944 desc: Run quick-fsck at mount comparing allocation-file to RocksDB allocation state
4945 default: true
4946 with_legacy: true
4947 - name: bluestore_fsck_on_mount
4948 type: bool
4949 level: dev
4950 desc: Run fsck at mount
4951 default: false
4952 with_legacy: true
4953 - name: bluestore_fsck_on_mount_deep
4954 type: bool
4955 level: dev
4956 desc: Run deep fsck at mount when bluestore_fsck_on_mount is set to true
4957 default: false
4958 with_legacy: true
4959 - name: bluestore_fsck_quick_fix_on_mount
4960 type: bool
4961 level: dev
4962 desc: Do quick-fix for the store at mount
4963 default: false
4964 with_legacy: true
4965 - name: bluestore_fsck_on_umount
4966 type: bool
4967 level: dev
4968 desc: Run fsck at umount
4969 default: false
4970 with_legacy: true
4971 - name: bluestore_allocation_from_file
4972 type: bool
4973 level: dev
4974 desc: Remove allocation info from RocksDB and store the info in a new allocation file
4975 default: true
4976 with_legacy: true
4977 - name: bluestore_fsck_on_umount_deep
4978 type: bool
4979 level: dev
4980 desc: Run deep fsck at umount when bluestore_fsck_on_umount is set to true
4981 default: false
4982 with_legacy: true
4983 - name: bluestore_fsck_on_mkfs
4984 type: bool
4985 level: dev
4986 desc: Run fsck after mkfs
4987 default: true
4988 with_legacy: true
4989 - name: bluestore_fsck_on_mkfs_deep
4990 type: bool
4991 level: dev
4992 desc: Run deep fsck after mkfs
4993 default: false
4994 with_legacy: true
4995 - name: bluestore_sync_submit_transaction
4996 type: bool
4997 level: dev
4998 desc: Try to submit metadata transaction to rocksdb in queuing thread context
4999 default: false
5000 with_legacy: true
5001 - name: bluestore_fsck_read_bytes_cap
5002 type: size
5003 level: advanced
5004 desc: Maximum bytes read at once by deep fsck
5005 default: 64_M
5006 flags:
5007 - runtime
5008 with_legacy: true
5009 - name: bluestore_fsck_quick_fix_threads
5010 type: int
5011 level: advanced
5012 desc: Number of additional threads to perform quick-fix (shallow fsck) command
5013 default: 2
5014 with_legacy: true
5015 - name: bluestore_fsck_shared_blob_tracker_size
5016 type: float
5017 level: dev
5018 desc: Size (a fraction of osd_memory_target, defaults to 128MB) of a hash table to track shared blobs ref counts. The larger the size, the more precise the tracker -> less overhead during the repair.
5019 default: 0.03125
5020 see_also:
5021 - osd_memory_target
5022 flags:
5023 - runtime
5024 - name: bluestore_throttle_bytes
5025 type: size
5026 level: advanced
5027 desc: Maximum bytes in flight before we throttle IO submission
5028 default: 64_M
5029 flags:
5030 - runtime
5031 with_legacy: true
5032 - name: bluestore_throttle_deferred_bytes
5033 type: size
5034 level: advanced
5035 desc: Maximum bytes for deferred writes before we throttle IO submission
5036 default: 128_M
5037 flags:
5038 - runtime
5039 with_legacy: true
5040 - name: bluestore_throttle_cost_per_io
5041 type: size
5042 level: advanced
5043 desc: Overhead added to transaction cost (in bytes) for each IO
5044 default: 0
5045 flags:
5046 - runtime
5047 with_legacy: true
5048 - name: bluestore_throttle_cost_per_io_hdd
5049 type: uint
5050 level: advanced
5051 desc: Default bluestore_throttle_cost_per_io for rotational media
5052 default: 670000
5053 see_also:
5054 - bluestore_throttle_cost_per_io
5055 flags:
5056 - runtime
5057 with_legacy: true
5058 - name: bluestore_throttle_cost_per_io_ssd
5059 type: uint
5060 level: advanced
5061 desc: Default bluestore_throttle_cost_per_io for non-rotational (solid state) media
5062 default: 4000
5063 see_also:
5064 - bluestore_throttle_cost_per_io
5065 flags:
5066 - runtime
5067 with_legacy: true
5068 - name: bluestore_deferred_batch_ops
5069 type: uint
5070 level: advanced
5071 desc: Max number of deferred writes before we flush the deferred write queue
5072 default: 0
5073 min: 0
5074 max: 65535
5075 flags:
5076 - runtime
5077 with_legacy: true
5078 - name: bluestore_deferred_batch_ops_hdd
5079 type: uint
5080 level: advanced
5081 desc: Default bluestore_deferred_batch_ops for rotational media
5082 default: 64
5083 see_also:
5084 - bluestore_deferred_batch_ops
5085 min: 0
5086 max: 65535
5087 flags:
5088 - runtime
5089 with_legacy: true
5090 - name: bluestore_deferred_batch_ops_ssd
5091 type: uint
5092 level: advanced
5093 desc: Default bluestore_deferred_batch_ops for non-rotational (solid state) media
5094 default: 16
5095 see_also:
5096 - bluestore_deferred_batch_ops
5097 min: 0
5098 max: 65535
5099 flags:
5100 - runtime
5101 with_legacy: true
5102 - name: bluestore_nid_prealloc
5103 type: int
5104 level: dev
5105 desc: Number of unique object ids to preallocate at a time
5106 default: 1024
5107 with_legacy: true
5108 - name: bluestore_blobid_prealloc
5109 type: uint
5110 level: dev
5111 desc: Number of unique blob ids to preallocate at a time
5112 default: 10_K
5113 with_legacy: true
5114 - name: bluestore_clone_cow
5115 type: bool
5116 level: advanced
5117 desc: Use copy-on-write when cloning objects (versus reading and rewriting them
5118 at clone time)
5119 default: true
5120 flags:
5121 - runtime
5122 with_legacy: true
5123 - name: bluestore_default_buffered_read
5124 type: bool
5125 level: advanced
5126 desc: Cache read results by default (unless hinted NOCACHE or WONTNEED)
5127 default: true
5128 flags:
5129 - runtime
5130 with_legacy: true
5131 - name: bluestore_default_buffered_write
5132 type: bool
5133 level: advanced
5134 desc: Cache writes by default (unless hinted NOCACHE or WONTNEED)
5135 default: false
5136 flags:
5137 - runtime
5138 with_legacy: true
5139 - name: bluestore_debug_no_reuse_blocks
5140 type: bool
5141 level: dev
5142 default: false
5143 with_legacy: true
5144 - name: bluestore_debug_small_allocations
5145 type: int
5146 level: dev
5147 default: 0
5148 with_legacy: true
5149 - name: bluestore_debug_too_many_blobs_threshold
5150 type: int
5151 level: dev
5152 default: 24576
5153 with_legacy: true
5154 - name: bluestore_debug_freelist
5155 type: bool
5156 level: dev
5157 default: false
5158 with_legacy: true
5159 - name: bluestore_debug_prefill
5160 type: float
5161 level: dev
5162 desc: simulate fragmentation
5163 default: 0
5164 with_legacy: true
5165 - name: bluestore_debug_prefragment_max
5166 type: size
5167 level: dev
5168 default: 1_M
5169 with_legacy: true
5170 - name: bluestore_debug_inject_read_err
5171 type: bool
5172 level: dev
5173 default: false
5174 with_legacy: true
5175 - name: bluestore_debug_randomize_serial_transaction
5176 type: int
5177 level: dev
5178 default: 0
5179 with_legacy: true
5180 - name: bluestore_debug_omit_block_device_write
5181 type: bool
5182 level: dev
5183 default: false
5184 with_legacy: true
5185 - name: bluestore_debug_fsck_abort
5186 type: bool
5187 level: dev
5188 default: false
5189 with_legacy: true
5190 - name: bluestore_debug_omit_kv_commit
5191 type: bool
5192 level: dev
5193 default: false
5194 with_legacy: true
5195 - name: bluestore_debug_permit_any_bdev_label
5196 type: bool
5197 level: dev
5198 default: false
5199 with_legacy: true
5200 - name: bluestore_debug_random_read_err
5201 type: float
5202 level: dev
5203 default: 0
5204 with_legacy: true
5205 - name: bluestore_debug_inject_bug21040
5206 type: bool
5207 level: dev
5208 default: false
5209 with_legacy: true
5210 - name: bluestore_debug_inject_csum_err_probability
5211 type: float
5212 level: dev
5213 desc: inject crc verification errors into bluestore device reads
5214 default: 0
5215 with_legacy: true
5216 - name: bluestore_debug_legacy_omap
5217 type: bool
5218 level: dev
5219 desc: Allows mkfs to create OSD in legacy OMAP naming mode (neither per-pool nor per-pg).
5220 This is intended primarily for developers' purposes. The resulting OSD might/would
5221 be transformed to the currently default 'per-pg' format when BlueStore's quick-fix or
5222 repair are applied.
5223 default: false
5224 with_legacy: true
5225 - name: bluestore_fsck_error_on_no_per_pool_stats
5226 type: bool
5227 level: advanced
5228 desc: Make fsck error (instead of warn) when bluestore lacks per-pool stats, e.g.,
5229 after an upgrade
5230 default: false
5231 with_legacy: true
5232 - name: bluestore_warn_on_bluefs_spillover
5233 type: bool
5234 level: advanced
5235 desc: Enable health indication on bluefs slow device usage
5236 default: true
5237 with_legacy: true
5238 - name: bluestore_warn_on_legacy_statfs
5239 type: bool
5240 level: advanced
5241 desc: Enable health indication on lack of per-pool statfs reporting from bluestore
5242 default: true
5243 with_legacy: true
5244 - name: bluestore_warn_on_spurious_read_errors
5245 type: bool
5246 level: advanced
5247 desc: Enable health indication when spurious read errors are observed by OSD
5248 default: true
5249 with_legacy: true
5250 - name: bluestore_fsck_error_on_no_per_pool_omap
5251 type: bool
5252 level: advanced
5253 desc: Make fsck error (instead of warn) when objects without per-pool omap are found
5254 default: false
5255 with_legacy: true
5256 - name: bluestore_fsck_error_on_no_per_pg_omap
5257 type: bool
5258 level: advanced
5259 desc: Make fsck error (instead of warn) when objects without per-pg omap are found
5260 default: false
5261 with_legacy: true
5262 - name: bluestore_warn_on_no_per_pool_omap
5263 type: bool
5264 level: advanced
5265 desc: Enable health indication on lack of per-pool omap
5266 default: true
5267 with_legacy: true
5268 - name: bluestore_warn_on_no_per_pg_omap
5269 type: bool
5270 level: advanced
5271 desc: Enable health indication on lack of per-pg omap
5272 default: false
5273 with_legacy: true
5274 - name: bluestore_log_op_age
5275 type: float
5276 level: advanced
5277 desc: log operation if it's slower than this age (seconds)
5278 default: 5
5279 with_legacy: true
5280 - name: bluestore_log_omap_iterator_age
5281 type: float
5282 level: advanced
5283 desc: log omap iteration operation if it's slower than this age (seconds)
5284 default: 5
5285 with_legacy: true
5286 - name: bluestore_log_collection_list_age
5287 type: float
5288 level: advanced
5289 desc: log collection list operation if it's slower than this age (seconds)
5290 default: 1_min
5291 with_legacy: true
5292 - name: bluestore_debug_enforce_settings
5293 type: str
5294 level: dev
5295 desc: Enforces specific hw profile settings
5296 long_desc: '''hdd'' enforces settings intended for BlueStore above a rotational
5297 drive. ''ssd'' enforces settings intended for BlueStore above a solid state drive. ''default''
5298 - using settings for the actual hardware.'
5299 default: default
5300 enum_values:
5301 - default
5302 - hdd
5303 - ssd
5304 with_legacy: true
5305 - name: bluestore_avl_alloc_ff_max_search_count
5306 type: uint
5307 level: dev
5308 desc: Search for this many ranges in first-fit mode before switching over to
5309 best-fit mode. 0 to iterate through all ranges for required chunk.
5310 default: 100
5311 - name: bluestore_avl_alloc_ff_max_search_bytes
5312 type: size
5313 level: dev
5314 desc: Maximum distance to search in first-fit mode before switching over to
5315 best-fit mode. 0 to iterate through all ranges for required chunk.
5316 default: 16_M
5317 - name: bluestore_avl_alloc_bf_threshold
5318 type: uint
5319 level: dev
5320 desc: Sets threshold at which shrinking max free chunk size triggers enabling best-fit
5321 mode.
5322 long_desc: 'AVL allocator works in two modes: near-fit and best-fit. By default,
5323 it uses very fast near-fit mode, in which it tries to fit a new block near the
5324 last allocated block of similar size. The second mode is much slower best-fit
5325 mode, in which it tries to find an exact match for the requested allocation. This
5326 mode is used when either the device gets fragmented or when it is low on free
5327 space. When the largest free block is smaller than ''bluestore_avl_alloc_bf_threshold'',
5328 best-fit mode is used.'
5329 default: 128_K
5330 see_also:
5331 - bluestore_avl_alloc_bf_free_pct
5332 - name: bluestore_avl_alloc_bf_free_pct
5333 type: uint
5334 level: dev
5335 desc: Sets threshold at which shrinking free space (in %, integer) triggers enabling
5336 best-fit mode.
5337 long_desc: 'AVL allocator works in two modes: near-fit and best-fit. By default,
5338 it uses very fast near-fit mode, in which it tries to fit a new block near the
5339 last allocated block of similar size. The second mode is much slower best-fit
5340 mode, in which it tries to find an exact match for the requested allocation. This
5341 mode is used when either the device gets fragmented or when it is low on free
5342 space. When free space is smaller than ''bluestore_avl_alloc_bf_free_pct'', best-fit
5343 mode is used.'
5344 default: 4
5345 see_also:
5346 - bluestore_avl_alloc_bf_threshold
5347 - name: bluestore_hybrid_alloc_mem_cap
5348 type: uint
5349 level: dev
5350 desc: Maximum RAM hybrid allocator should use before enabling bitmap supplement
5351 default: 64_M
5352 - name: bluestore_volume_selection_policy
5353 type: str
5354 level: dev
5355 desc: Determines bluefs volume selection policy
5356 long_desc: Determines bluefs volume selection policy. 'use_some_extra' policy allows
5357 to override RocksDB level granularity and put high level's data to faster device
5358 even when the level doesn't completely fit there. 'fit_to_fast' policy enables
5359 using 100% of faster disk capacity and allows the user to turn on 'level_compaction_dynamic_level_bytes'
5360 option in RocksDB options.
5361 default: use_some_extra
5362 enum_values:
5363 - rocksdb_original
5364 - use_some_extra
5365 - fit_to_fast
5366 with_legacy: true
5367 - name: bluestore_volume_selection_reserved_factor
5368 type: float
5369 level: advanced
5370 desc: DB level size multiplier. Determines amount of space at DB device to bar from
5371 the usage when 'use some extra' policy is in action. Reserved size is determined
5372 as sum(L_max_size[0], L_max_size[L-1]) + L_max_size[L] * this_factor
5373 default: 2
5374 flags:
5375 - startup
5376 with_legacy: true
5377 - name: bluestore_volume_selection_reserved
5378 type: int
5379 level: advanced
5380 desc: Space reserved at DB device and not allowed for 'use some extra' policy usage.
5381 Overrides 'bluestore_volume_selection_reserved_factor' setting and introduces
5382 straightforward limit.
5383 default: 0
5384 flags:
5385 - startup
5386 with_legacy: true
5387 - name: bdev_ioring
5388 type: bool
5389 level: advanced
5390 desc: Enables Linux io_uring API instead of libaio
5391 default: false
5392 - name: bdev_ioring_hipri
5393 type: bool
5394 level: advanced
5395 desc: Use polled IO completions with the Linux io_uring API
5396 default: false
5397 - name: bdev_ioring_sqthread_poll
5398 type: bool
5399 level: advanced
5400 desc: Offload submission/completion to a kernel thread with the Linux io_uring API
5401 default: false
5402 - name: bluestore_kv_sync_util_logging_s
5403 type: float
5404 level: advanced
5405 desc: KV sync thread utilization logging period
5406 long_desc: How often (in seconds) to print KV sync thread utilization, not logged
5407 when set to 0 or when utilization is 0%
5408 default: 10
5409 flags:
5410 - runtime
5411 with_legacy: true
5412 - name: bluestore_fail_eio
5413 type: bool
5414 level: dev
5415 desc: fail/crash on EIO
5416 long_desc: whether bluestore osd fails on eio
5417 default: false
5418 flags:
5419 - runtime
5420 with_legacy: true
5421 - name: bluestore_zero_block_detection
5422 type: bool
5423 level: dev
5424 desc: punch holes instead of writing zeros
5425 long_desc: Intended for large-scale synthetic testing. Currently this is implemented
5426 with punch hole semantics, affecting the logical extent map of the object. This does
5427 not interact well with some RBD and CephFS features.
5428 default: false
5429 flags:
5430 - runtime
5431 with_legacy: true
5432 - name: kstore_max_ops
5433 type: uint
5434 level: advanced
5435 default: 512
5436 with_legacy: true
5437 - name: kstore_max_bytes
5438 type: size
5439 level: advanced
5440 default: 64_M
5441 with_legacy: true
5442 - name: kstore_backend
5443 type: str
5444 level: advanced
5445 default: rocksdb
5446 with_legacy: true
5447 - name: kstore_rocksdb_options
5448 type: str
5449 level: advanced
5450 desc: Options to pass through when RocksDB is used as the KeyValueDB for kstore.
5451 default: compression=kNoCompression
5452 with_legacy: true
5453 - name: kstore_fsck_on_mount
5454 type: bool
5455 level: advanced
5456 desc: Whether or not to run fsck on mount for kstore.
5457 default: false
5458 with_legacy: true
5459 - name: kstore_fsck_on_mount_deep
5460 type: bool
5461 level: advanced
5462 desc: Whether or not to run deep fsck on mount for kstore
5463 default: true
5464 with_legacy: true
5465 - name: kstore_nid_prealloc
5466 type: uint
5467 level: advanced
5468 default: 1_K
5469 with_legacy: true
5470 - name: kstore_sync_transaction
5471 type: bool
5472 level: advanced
5473 default: false
5474 with_legacy: true
5475 - name: kstore_sync_submit_transaction
5476 type: bool
5477 level: advanced
5478 default: false
5479 with_legacy: true
5480 - name: kstore_onode_map_size
5481 type: uint
5482 level: advanced
5483 default: 1_K
5484 with_legacy: true
5485 - name: kstore_default_stripe_size
5486 type: size
5487 level: advanced
5488 default: 64_K
5489 with_legacy: true
5490 # rocksdb options that will be used for omap(if omap_backend is rocksdb)
5491 - name: filestore_rocksdb_options
5492 type: str
5493 level: dev
5494 desc: Options to pass through when RocksDB is used as the KeyValueDB for filestore.
5495 default: max_background_jobs=10,compaction_readahead_size=2097152,compression=kNoCompression
5496 with_legacy: true
5497 - name: filestore_omap_backend
5498 type: str
5499 level: dev
5500 desc: The KeyValueDB to use for filestore metadata (ie omap).
5501 default: rocksdb
5502 enum_values:
5503 - leveldb
5504 - rocksdb
5505 with_legacy: true
5506 - name: filestore_omap_backend_path
5507 type: str
5508 level: dev
5509 desc: The path where the filestore KeyValueDB should store its database(s).
5510 with_legacy: true
5511 # filestore wb throttle limits
5512 - name: filestore_wbthrottle_enable
5513 type: bool
5514 level: advanced
5515 desc: Enabling throttling of operations to backing file system
5516 default: true
5517 with_legacy: true
5518 - name: filestore_wbthrottle_btrfs_bytes_start_flusher
5519 type: size
5520 level: advanced
5521 desc: Start flushing (fsyncing) when this many bytes are written (btrfs)
5522 default: 40_M
5523 with_legacy: true
5524 - name: filestore_wbthrottle_btrfs_bytes_hard_limit
5525 type: size
5526 level: advanced
5527 desc: Block writes when this many bytes haven't been flushed (fsynced) (btrfs)
5528 default: 400_M
5529 with_legacy: true
5530 - name: filestore_wbthrottle_btrfs_ios_start_flusher
5531 type: uint
5532 level: advanced
5533 desc: Start flushing (fsyncing) when this many IOs are written (btrfs)
5534 default: 500
5535 with_legacy: true
5536 - name: filestore_wbthrottle_btrfs_ios_hard_limit
5537 type: uint
5538 level: advanced
5539 desc: Block writes when this many IOs haven't been flushed (fsynced) (btrfs)
5540 default: 5000
5541 with_legacy: true
5542 - name: filestore_wbthrottle_btrfs_inodes_start_flusher
5543 type: uint
5544 level: advanced
5545 desc: Start flushing (fsyncing) when this many distinct inodes have been modified
5546 (btrfs)
5547 default: 500
5548 with_legacy: true
5549 - name: filestore_wbthrottle_xfs_bytes_start_flusher
5550 type: size
5551 level: advanced
5552 desc: Start flushing (fsyncing) when this many bytes are written (xfs)
5553 default: 40_M
5554 with_legacy: true
5555 - name: filestore_wbthrottle_xfs_bytes_hard_limit
5556 type: size
5557 level: advanced
5558 desc: Block writes when this many bytes haven't been flushed (fsynced) (xfs)
5559 default: 400_M
5560 with_legacy: true
5561 - name: filestore_wbthrottle_xfs_ios_start_flusher
5562 type: uint
5563 level: advanced
5564 desc: Start flushing (fsyncing) when this many IOs are written (xfs)
5565 default: 500
5566 with_legacy: true
5567 - name: filestore_wbthrottle_xfs_ios_hard_limit
5568 type: uint
5569 level: advanced
5570 desc: Block writes when this many IOs haven't been flushed (fsynced) (xfs)
5571 default: 5000
5572 with_legacy: true
5573 - name: filestore_wbthrottle_xfs_inodes_start_flusher
5574 type: uint
5575 level: advanced
5576 desc: Start flushing (fsyncing) when this many distinct inodes have been modified
5577 (xfs)
5578 default: 500
5579 with_legacy: true
5580 # These must be less than the fd limit
5581 - name: filestore_wbthrottle_btrfs_inodes_hard_limit
5582 type: uint
5583 level: advanced
5584 desc: Block writing when this many inodes have outstanding writes (btrfs)
5585 default: 5000
5586 with_legacy: true
5587 - name: filestore_wbthrottle_xfs_inodes_hard_limit
5588 type: uint
5589 level: advanced
5590 desc: Block writing when this many inodes have outstanding writes (xfs)
5591 default: 5000
5592 with_legacy: true
5593 # Introduce a O_DSYNC write in the filestore
5594 - name: filestore_odsync_write
5595 type: bool
5596 level: dev
5597 desc: Write with O_DSYNC
5598 default: false
5599 with_legacy: true
5600 # Tests index failure paths
5601 - name: filestore_index_retry_probability
5602 type: float
5603 level: dev
5604 default: 0
5605 with_legacy: true
5606 # Allow object read error injection
5607 - name: filestore_debug_inject_read_err
5608 type: bool
5609 level: dev
5610 default: false
5611 with_legacy: true
5612 - name: filestore_debug_random_read_err
5613 type: float
5614 level: dev
5615 default: 0
5616 with_legacy: true
5617 # Expensive debugging check on sync
5618 - name: filestore_debug_omap_check
5619 type: bool
5620 level: dev
5621 default: false
5622 fmt_desc: Debugging check on synchronization. This is an expensive operation.
5623
5624 with_legacy: true
5625 - name: filestore_omap_header_cache_size
5626 type: size
5627 level: dev
5628 default: 1_K
5629 with_legacy: true
5630 # Use omap for xattrs for attrs over
5631 # filestore_max_inline_xattr_size or
5632 - name: filestore_max_inline_xattr_size
5633 type: size
5634 level: dev
5635 default: 0
5636 with_legacy: true
5637 - name: filestore_max_inline_xattr_size_xfs
5638 type: size
5639 level: dev
5640 default: 64_K
5641 with_legacy: true
5642 - name: filestore_max_inline_xattr_size_btrfs
5643 type: size
5644 level: dev
5645 default: 2_K
5646 with_legacy: true
5647 - name: filestore_max_inline_xattr_size_other
5648 type: size
5649 level: dev
5650 default: 512
5651 with_legacy: true
5652 # for more than filestore_max_inline_xattrs attrs
5653 - name: filestore_max_inline_xattrs
5654 type: uint
5655 level: dev
5656 default: 0
5657 with_legacy: true
5658 - name: filestore_max_inline_xattrs_xfs
5659 type: uint
5660 level: dev
5661 default: 10
5662 with_legacy: true
5663 - name: filestore_max_inline_xattrs_btrfs
5664 type: uint
5665 level: dev
5666 default: 10
5667 with_legacy: true
5668 - name: filestore_max_inline_xattrs_other
5669 type: uint
5670 level: dev
5671 default: 2
5672 with_legacy: true
5673 - name: filestore_max_xattr_value_size
5674 type: size
5675 level: dev
5676 default: 0
5677 with_legacy: true
5678 - name: filestore_max_xattr_value_size_xfs
5679 type: size
5680 level: dev
5681 default: 64_K
5682 with_legacy: true
5683 - name: filestore_max_xattr_value_size_btrfs
5684 type: size
5685 level: dev
5686 default: 64_K
5687 with_legacy: true
5688 # ext4 allows 4k xattrs total including some smallish extra fields and the
5689 # keys. We're allowing 2 512 inline attrs in addition to some filestore
5690 # replay attrs. After accounting for those, we still need to fit up to
5691 # two attrs of this value. That means we need this value to be around 1k
5692 # to be safe. This is hacky, but it's not worth complicating the code
5693 # to work around ext4's total xattr limit.
5694 - name: filestore_max_xattr_value_size_other
5695 type: size
5696 level: dev
5697 default: 1_K
5698 with_legacy: true
5699 # track sloppy crcs
5700 - name: filestore_sloppy_crc
5701 type: bool
5702 level: dev
5703 default: false
5704 with_legacy: true
5705 - name: filestore_sloppy_crc_block_size
5706 type: size
5707 level: dev
5708 default: 64_K
5709 with_legacy: true
5710 - name: filestore_max_alloc_hint_size
5711 type: size
5712 level: dev
5713 default: 1_M
5714 with_legacy: true
5715 # seconds
5716 - name: filestore_max_sync_interval
5717 type: float
5718 level: advanced
5719 desc: Period between calls to syncfs(2) and journal trims (seconds)
5720 default: 5
5721 with_legacy: true
5722 # seconds
5723 - name: filestore_min_sync_interval
5724 type: float
5725 level: dev
5726 desc: Minimum period between calls to syncfs(2)
5727 default: 0.01
5728 with_legacy: true
5729 - name: filestore_btrfs_snap
5730 type: bool
5731 level: dev
5732 default: true
5733 with_legacy: true
5734 - name: filestore_btrfs_clone_range
5735 type: bool
5736 level: advanced
5737 desc: Use btrfs clone_range ioctl to efficiently duplicate objects
5738 default: true
5739 with_legacy: true
5740 # zfsonlinux is still unstable
5741 - name: filestore_zfs_snap
5742 type: bool
5743 level: dev
5744 default: false
5745 with_legacy: true
5746 - name: filestore_fsync_flushes_journal_data
5747 type: bool
5748 level: dev
5749 default: false
5750 with_legacy: true
5751 # (try to) use fiemap
5752 - name: filestore_fiemap
5753 type: bool
5754 level: advanced
5755 desc: Use fiemap ioctl(2) to determine which parts of objects are sparse
5756 default: false
5757 with_legacy: true
5758 - name: filestore_punch_hole
5759 type: bool
5760 level: advanced
5761 desc: Use fallocate(2) FALLOC_FL_PUNCH_HOLE to efficiently zero ranges of objects
5762 default: false
5763 with_legacy: true
5764 # (try to) use seek_data/hole
5765 - name: filestore_seek_data_hole
5766 type: bool
5767 level: advanced
5768 desc: Use lseek(2) SEEK_HOLE and SEEK_DATA to determine which parts of objects are
5769 sparse
5770 default: false
5771 with_legacy: true
5772 - name: filestore_splice
5773 type: bool
5774 level: advanced
5775 desc: Use splice(2) to more efficiently copy data between files
5776 default: false
5777 with_legacy: true
5778 - name: filestore_fadvise
5779 type: bool
5780 level: advanced
5781 desc: Use posix_fadvise(2) to pass hints to file system
5782 default: true
5783 with_legacy: true
5784 # collect device partition information for management application to use
5785 - name: filestore_collect_device_partition_information
5786 type: bool
5787 level: advanced
5788 desc: Collect metadata about the backing file system on OSD startup
5789 default: true
5790 with_legacy: true
5791 # (try to) use extsize for alloc hint NOTE: extsize seems to trigger
5792 # data corruption in xfs prior to kernel 3.5. filestore will
5793 # implicitly disable this if it cannot confirm the kernel is newer
5794 # than that.
5795 # NOTE: This option involves a tradeoff: When disabled, fragmentation is
5796 # worse, but large sequential writes are faster. When enabled, large
5797 # sequential writes are slower, but fragmentation is reduced.
5798 - name: filestore_xfs_extsize
5799 type: bool
5800 level: advanced
5801 desc: Use XFS extsize ioctl(2) to hint allocator about expected write sizes
5802 default: false
5803 with_legacy: true
5804 - name: filestore_journal_parallel
5805 type: bool
5806 level: dev
5807 default: false
5808 with_legacy: true
5809 - name: filestore_journal_writeahead
5810 type: bool
5811 level: dev
5812 default: false
5813 with_legacy: true
5814 - name: filestore_journal_trailing
5815 type: bool
5816 level: dev
5817 default: false
5818 with_legacy: true
5819 - name: filestore_queue_max_ops
5820 type: uint
5821 level: advanced
5822 desc: Max IO operations in flight
5823 default: 50
5824 with_legacy: true
5825 - name: filestore_queue_max_bytes
5826 type: size
5827 level: advanced
5828 desc: Max (written) bytes in flight
5829 default: 100_M
5830 with_legacy: true
5831 - name: filestore_caller_concurrency
5832 type: int
5833 level: dev
5834 default: 10
5835 with_legacy: true
5836 # Expected filestore throughput in B/s
5837 - name: filestore_expected_throughput_bytes
5838 type: float
5839 level: advanced
5840 desc: Expected throughput of backend device (aids throttling calculations)
5841 default: 209715200
5842 with_legacy: true
5843 # Expected filestore throughput in ops/s
5844 - name: filestore_expected_throughput_ops
5845 type: float
5846 level: advanced
5847 desc: Expected throughput of backend device in IOPS (aids throttling calculations)
5848 default: 200
5849 with_legacy: true
5850 # Filestore max delay multiple. Defaults to 0 (disabled)
5851 - name: filestore_queue_max_delay_multiple
5852 type: float
5853 level: dev
5854 default: 0
5855 with_legacy: true
5856 # Filestore high delay multiple. Defaults to 0 (disabled)
5857 - name: filestore_queue_high_delay_multiple
5858 type: float
5859 level: dev
5860 default: 0
5861 with_legacy: true
5862 # Filestore max delay multiple bytes. Defaults to 0 (disabled)
5863 - name: filestore_queue_max_delay_multiple_bytes
5864 type: float
5865 level: dev
5866 default: 0
5867 with_legacy: true
5868 # Filestore high delay multiple bytes. Defaults to 0 (disabled)
5869 - name: filestore_queue_high_delay_multiple_bytes
5870 type: float
5871 level: dev
5872 default: 0
5873 with_legacy: true
5874 # Filestore max delay multiple ops. Defaults to 0 (disabled)
5875 - name: filestore_queue_max_delay_multiple_ops
5876 type: float
5877 level: dev
5878 default: 0
5879 with_legacy: true
5880 # Filestore high delay multiple ops. Defaults to 0 (disabled)
5881 - name: filestore_queue_high_delay_multiple_ops
5882 type: float
5883 level: dev
5884 default: 0
5885 with_legacy: true
5886 - name: filestore_queue_low_threshhold
5887 type: float
5888 level: dev
5889 default: 0.3
5890 with_legacy: true
5891 - name: filestore_queue_high_threshhold
5892 type: float
5893 level: dev
5894 with_legacy: true
5895 default: 0.9
5896 - name: filestore_op_threads
5897 type: int
5898 level: advanced
5899 desc: Threads used to apply changes to backing file system
5900 default: 2
5901 with_legacy: true
5902 - name: filestore_op_thread_timeout
5903 type: int
5904 level: advanced
5905 desc: Seconds before a worker thread is considered stalled
5906 default: 1_min
5907 with_legacy: true
5908 - name: filestore_op_thread_suicide_timeout
5909 type: int
5910 level: advanced
5911 desc: Seconds before a worker thread is considered dead
5912 default: 3_min
5913 with_legacy: true
5914 - name: filestore_commit_timeout
5915 type: float
5916 level: advanced
5917 desc: Seconds before backing file system is considered hung
5918 default: 10_min
5919 with_legacy: true
5920 - name: filestore_fiemap_threshold
5921 type: size
5922 level: dev
5923 default: 4_K
5924 with_legacy: true
5925 - name: filestore_merge_threshold
5926 type: int
5927 level: dev
5928 default: -10
5929 with_legacy: true
5930 - name: filestore_split_multiple
5931 type: int
5932 level: dev
5933 default: 2
5934 with_legacy: true
5935 - name: filestore_split_rand_factor
5936 type: uint
5937 level: dev
5938 default: 20
5939 with_legacy: true
5940 - name: filestore_update_to
5941 type: int
5942 level: dev
5943 default: 1000
5944 with_legacy: true
5945 - name: filestore_blackhole
5946 type: bool
5947 level: dev
5948 default: false
5949 with_legacy: true
5950 - name: filestore_fd_cache_size
5951 type: int
5952 level: dev
5953 default: 128
5954 with_legacy: true
5955 - name: filestore_fd_cache_shards
5956 type: int
5957 level: dev
5958 default: 16
5959 with_legacy: true
5960 - name: filestore_ondisk_finisher_threads
5961 type: int
5962 level: dev
5963 default: 1
5964 with_legacy: true
5965 - name: filestore_apply_finisher_threads
5966 type: int
5967 level: dev
5968 default: 1
5969 with_legacy: true
5970 # file onto which store transaction dumps
5971 - name: filestore_dump_file
5972 type: str
5973 level: dev
5974 with_legacy: true
5975 # inject a failure at the n'th opportunity
5976 - name: filestore_kill_at
5977 type: int
5978 level: dev
5979 default: 0
5980 with_legacy: true
5981 # artificially stall for N seconds in op queue thread
5982 - name: filestore_inject_stall
5983 type: int
5984 level: dev
5985 default: 0
5986 with_legacy: true
5987 # fail/crash on EIO
5988 - name: filestore_fail_eio
5989 type: bool
5990 level: dev
5991 default: true
5992 with_legacy: true
5993 - name: filestore_debug_verify_split
5994 type: bool
5995 level: dev
5996 default: false
5997 with_legacy: true
5998 - name: journal_dio
5999 type: bool
6000 level: dev
6001 default: true
6002 fmt_desc: Enables direct i/o to the journal. Requires ``journal block
6003 align`` set to ``true``.
6004 with_legacy: true
6005 - name: journal_aio
6006 type: bool
6007 level: dev
6008 default: true
6009 fmt_desc: Enables using ``libaio`` for asynchronous writes to the journal.
6010 Requires ``journal dio`` set to ``true``. Version 0.61 and later, ``true``.
6011 Version 0.60 and earlier, ``false``.
6012 with_legacy: true
6013 - name: journal_force_aio
6014 type: bool
6015 level: dev
6016 default: false
6017 with_legacy: true
6018 - name: journal_block_size
6019 type: size
6020 level: dev
6021 default: 4_K
6022 with_legacy: true
6023 - name: journal_block_align
6024 type: bool
6025 level: dev
6026 default: true
6027 fmt_desc: Block aligns write operations. Required for ``dio`` and ``aio``.
6028 with_legacy: true
6029 - name: journal_write_header_frequency
6030 type: uint
6031 level: dev
6032 default: 0
6033 with_legacy: true
6034 - name: journal_max_write_bytes
6035 type: size
6036 level: advanced
6037 desc: Max bytes in flight to journal
6038 fmt_desc: The maximum number of bytes the journal will write at
6039 any one time.
6040 default: 10_M
6041 with_legacy: true
6042 - name: journal_max_write_entries
6043 type: int
6044 level: advanced
6045 desc: Max IOs in flight to journal
6046 fmt_desc: The maximum number of entries the journal will write at
6047 any one time.
6048 default: 100
6049 with_legacy: true
6050 # Target range for journal fullness
6051 - name: journal_throttle_low_threshhold
6052 type: float
6053 level: dev
6054 default: 0.6
6055 with_legacy: true
6056 - name: journal_throttle_high_threshhold
6057 type: float
6058 level: dev
6059 default: 0.9
6060 with_legacy: true
6061 # Multiple over expected at high_threshhold. Defaults to 0 (disabled).
6062 - name: journal_throttle_high_multiple
6063 type: float
6064 level: dev
6065 default: 0
6066 with_legacy: true
6067 # Multiple over expected at max. Defaults to 0 (disabled).
6068 - name: journal_throttle_max_multiple
6069 type: float
6070 level: dev
6071 default: 0
6072 with_legacy: true
6073 # align data payloads >= this.
6074 - name: journal_align_min_size
6075 type: size
6076 level: dev
6077 default: 64_K
6078 fmt_desc: Align data payloads greater than the specified minimum.
6079 with_legacy: true
6080 - name: journal_replay_from
6081 type: int
6082 level: dev
6083 default: 0
6084 with_legacy: true
6085 - name: journal_zero_on_create
6086 type: bool
6087 level: dev
6088 default: false
6089 fmt_desc: |
6090 Causes the file store to overwrite the entire journal with
6091 ``0``'s during ``mkfs``.
6092 with_legacy: true
6093 # assume journal is not corrupt
6094 - name: journal_ignore_corruption
6095 type: bool
6096 level: dev
6097 default: false
6098 with_legacy: true
6099 # when using an SSD as the journal, whether to discard unused journal data.
6100 - name: journal_discard
6101 type: bool
6102 level: dev
6103 default: false
6104 with_legacy: true
6105 # fio data directory for fio-objectstore
6106 - name: fio_dir
6107 type: str
6108 level: advanced
6109 default: /tmp/fio
6110 with_legacy: true
6111 - name: rados_mon_op_timeout
6112 type: secs
6113 level: advanced
6114 desc: timeout for operations handled by monitors such as statfs (0 is unlimited)
6115 default: 0
6116 min: 0
6117 flags:
6118 - runtime
6119 - name: rados_osd_op_timeout
6120 type: secs
6121 level: advanced
6122 desc: timeout for operations handled by osds such as write (0 is unlimited)
6123 default: 0
6124 min: 0
6125 flags:
6126 - runtime
# Set to true to enable the LTTng-UST tracepoints; disabled by default.
- name: rados_tracing
  type: bool
  level: advanced
  default: false
  with_legacy: true
# NOTE(review): no description is provided; by its name this is presumably the
# delay between attempts to connect to the mgr (default 1, unit not stated
# here) — confirm against the code that reads it.
- name: mgr_connect_retry_interval
  type: float
  level: dev
  default: 1
  services:
  - common
# How long shutdown waits for the service to be deregistered with the mgr;
# default 1 (unit not stated here — TODO confirm).
- name: mgr_client_service_daemon_unregister_timeout
  type: float
  level: dev
  desc: Time to wait during shutdown to deregister service with mgr
  default: 1
# NOTE(review): no description is provided; by its name this presumably
# toggles perf counters for throttlers (enabled by default) — confirm.
- name: throttler_perf_counter
  type: bool
  level: advanced
  default: true
  with_legacy: true
# NOTE(review): no description is provided; by its name this presumably
# switches event tracing on (disabled by default) — confirm.
- name: event_tracing
  type: bool
  level: advanced
  default: false
  with_legacy: true
# Switch for bluestore event tracing; disabled by default.
- name: bluestore_tracing
  type: bool
  level: advanced
  desc: Enable bluestore event tracing.
  default: false
# Per-second sampling rate for bluestore transactions; default 0.
- name: bluestore_throttle_trace_rate
  type: float
  level: advanced
  desc: Rate at which to sample bluestore transactions (per second)
  default: 0
# NOTE(review): no description is provided; by its name this is a debugging
# switch that presumably leaks memory on purpose (disabled by default) —
# confirm.
- name: debug_deliberately_leak_memory
  type: bool
  level: dev
  default: false
  with_legacy: true
# Turns on shutdown-time asserts that look for refcounting bugs; disabled by
# default. The desc links the related tracker issue.
- name: debug_asserts_on_shutdown
  type: bool
  level: dev
  desc: Enable certain asserts to check for refcounting bugs on shutdown; see http://tracker.ceph.com/issues/21738
  default: false
# Permits the 'assert' and 'abort' asok commands, intended for testing crash
# dumps; disabled by default.
- name: debug_asok_assert_abort
  type: bool
  level: dev
  desc: allow commands 'assert' and 'abort' via asok for testing crash dumps etc
  default: false
  with_legacy: true
# Target ceiling on the misplaced-object ratio while data rebalancing is
# throttled; default 0.05.
# The desc is written as a folded scalar (>-) so the line continuation is
# explicit; the resulting string is the same single line as before.
- name: target_max_misplaced_ratio
  type: float
  level: basic
  desc: >-
    Max ratio of misplaced objects to target when throttling data rebalancing
    activity
  default: 0.05
# Selects how device failures are predicted; must be one of enum_values.
# The default, 'none', turns prediction off.
# long_desc is a folded scalar (>-): the lines below join into one paragraph.
- name: device_failure_prediction_mode
  type: str
  level: basic
  desc: Method used to predict device failures
  long_desc: >-
    To disable prediction, use 'none'. 'local' uses a prediction model that
    runs inside the mgr daemon. 'cloud' will share metrics with a cloud service
    and query the service for device life expectancy.
  default: none
  enum_values:
  - none
  - local
  - cloud
  flags:
  - runtime
# Full path of the GSS/Kerberos client keytab file used for client
# authentication; the default path is built from $name.
- name: gss_ktab_client_file
  type: str
  level: advanced
  desc: GSS/KRB5 Keytab file for client authentication
  long_desc: This sets the full path for the GSS/Kerberos client keytab file location.
  default: /var/lib/ceph/$name/gss_client_$name.ktab
  services:
  - mon
  - osd
# GSS target service name; defaults to "ceph".
# A short desc is supplied so this entry matches gss_ktab_client_file, which
# carries both desc and long_desc.
- name: gss_target_name
  type: str
  level: advanced
  desc: GSS target service name
  long_desc: This sets the GSS target service name.
  default: ceph
  services:
  - mon
  - osd
# Testing aid: turns off randomization of heartbeat pings; disabled by default.
- name: debug_disable_randomized_ping
  type: bool
  level: dev
  desc: Disable heartbeat ping randomization for testing purposes
  default: false
# Testing-only override for the 60 second periods mentioned in desc; default 0
# (presumably meaning "no override" — TODO confirm).
- name: debug_heartbeat_testing_span
  type: int
  level: dev
  desc: Override 60 second periods for testing only
  default: 0
# Number of threads in the Objecter thread pool; default 2, declared
# minimum 1.
- name: librados_thread_count
  type: uint
  level: advanced
  desc: Size of thread pool for Objecter
  default: 2
  tags:
  - client
  min: 1
# Number of threads in the pool that handles ASIO completions; default 2,
# declared minimum 1.
- name: osd_asio_thread_count
  type: uint
  level: advanced
  desc: Size of thread pool for ASIO completions
  default: 2
  tags:
  - osd
  min: 1
# Milliseconds before the lock is renewed; default 2000, declared minimum 100.
# Cross-referenced with cephsqlite_lock_renewal_timeout via see_also.
- name: cephsqlite_lock_renewal_interval
  type: millisecs
  level: advanced
  desc: number of milliseconds before lock is renewed
  default: 2000
  tags:
  - client
  see_also:
  - cephsqlite_lock_renewal_timeout
  min: 100
# Milliseconds before the transaction lock times out; default 30000, declared
# minimum 100. Cross-referenced with cephsqlite_lock_renewal_interval.
# long_desc is a folded scalar (>-): the lines join into one paragraph, the
# same string the original multi-line plain scalar produced.
- name: cephsqlite_lock_renewal_timeout
  type: millisecs
  level: advanced
  desc: number of milliseconds before transaction lock times out
  long_desc: >-
    The amount of time before a running libcephsqlite VFS connection has
    to renew a lock on the database before the lock is automatically lost. If the
    lock is lost, the VFS will abort the process to prevent database corruption.
  default: 30000
  tags:
  - client
  see_also:
  - cephsqlite_lock_renewal_interval
  min: 100
# Whether the Ceph SQLite VFS blocklists the last dead owner of the database
# lock; enabled by default. The long_desc warns that changing it may lead to
# corruption.
# long_desc is a folded scalar (>-): the lines join into one paragraph, the
# same string the original multi-line plain scalar produced.
- name: cephsqlite_blocklist_dead_locker
  type: bool
  level: advanced
  desc: blocklist the last dead owner of the database lock
  long_desc: >-
    Require that the Ceph SQLite VFS blocklist the last dead owner of the
    database when cleanup was incomplete. DO NOT CHANGE THIS UNLESS YOU UNDERSTAND
    THE RAMIFICATIONS. CORRUPTION MAY RESULT.
  default: true
  tags:
  - client
# Explicit device type used to select the driver when that is needed.
# No default is declared; accepted values are listed in enum_values.
- name: bdev_type
  type: str
  level: advanced
  desc: Explicitly set the device type to select the driver if it's needed
  enum_values:
  - aio
  - spdk
  - pmem
  - hm_smr
# Sleep time of the cleaner between utilization re-checks; default 5
# (unit not stated here — TODO confirm).
- name: bluestore_cleaner_sleep_interval
  type: float
  level: advanced
  desc: How long cleaner should sleep before re-checking utilization
  default: 5
  with_legacy: true
# Switch for use of the jaeger tracing system; disabled by default.
# Declared for the rgw and osd services.
- name: jaeger_tracing_enable
  type: bool
  level: advanced
  desc: Ceph should use jaeger tracing system
  default: false
  services:
  - rgw
  - osd
  with_legacy: true
6300 - name: mgr_ttl_cache_expire_seconds
6301 type: uint
6302 level: dev
6303 desc: Set the time to live in seconds - set to 0 to disable the cache.
6304 default: 0
6305 services:
6306 - mgr