ceph/src/common/options/global.yaml.in

   1 # -*- mode: YAML -*-
   2 ---
   3
   4 options:
   5 - name: host
   6   type: str
   7   level: basic
   8   desc: local hostname
   9   long_desc: if blank, ceph assumes the short hostname (hostname -s)
  10   tags:
  11   - network
  12   services:
  13   - common
  14   flags:
  15   - no_mon_update
  16   with_legacy: true
  17 - name: fsid
  18   type: uuid
  19   level: basic
  20   desc: cluster fsid (uuid)
  21   fmt_desc: The cluster ID. One per cluster.
  22     May be generated by a deployment tool if not specified.
  23   note: Do not set this value if you use a deployment tool that does
  24     it for you.
  25   tags:
  26   - service
  27   services:
  28   - common
  29   flags:
  30   - no_mon_update
  31   - startup
  32 - name: public_addr
  33   type: addr
  34   level: basic
  35   desc: public-facing address to bind to
  36   fmt_desc: The IP address for the public (front-side) network.
  37    Set for each daemon.
  38   services:
  39   - mon
  40   - mds
  41   - osd
  42   - mgr
  43   flags:
  44   - startup
  45   with_legacy: true
  46 - name: public_addrv
  47   type: addrvec
  48   level: basic
  49   desc: public-facing address to bind to
  50   services:
  51   - mon
  52   - mds
  53   - osd
  54   - mgr
  55   flags:
  56   - startup
  57   with_legacy: true
  58 - name: public_bind_addr
  59   type: addr
  60   level: advanced
  61   services:
  62   - mon
  63   flags:
  64   - startup
  65   fmt_desc: In some dynamic deployments the Ceph MON daemon might bind
  66    to an IP address locally that is different from the ``public_addr``
  67    advertised to other peers in the network. The environment must ensure
  68    that routing rules are set correctly. If ``public_bind_addr`` is set
  69    the Ceph Monitor daemon will bind to it locally and use ``public_addr``
  70    in the monmaps to advertise its address to peers. This behavior is limited
  71    to the Monitor daemon.
  72   with_legacy: true
  73 - name: cluster_addr
  74   type: addr
  75   level: basic
  76   desc: cluster-facing address to bind to
  77   fmt_desc: The IP address for the cluster (back-side) network.
  78    Set for each daemon.
  79   tags:
  80   - network
  81   services:
  82   - osd
  83   flags:
  84   - startup
  85   with_legacy: true
  86 - name: public_network
  87   type: str
  88   level: advanced
  89   desc: Network(s) from which to choose a public address to bind to
  90   fmt_desc: The IP address and netmask of the public (front-side) network
  91    (e.g., ``192.168.0.0/24``). Set in ``[global]``. You may specify
  92    comma-separated subnets. The format of it looks like
  93    ``{ip-address}/{netmask} [, {ip-address}/{netmask}]``
  94   tags:
  95   - network
  96   services:
  97   - mon
  98   - mds
  99   - osd
 100   - mgr
 101   flags:
 102   - startup
 103   with_legacy: true
 104 - name: public_network_interface
 105   type: str
 106   level: advanced
 107   desc: Interface name(s) from which to choose an address from a public_network to
 108     bind to; public_network must also be specified.
 109   tags:
 110   - network
 111   services:
 112   - mon
 113   - mds
 114   - osd
 115   - mgr
 116   see_also:
 117   - public_network
 118   flags:
 119   - startup
 120 - name: cluster_network
 121   type: str
 122   level: advanced
 123   desc: Network(s) from which to choose a cluster address to bind to
 124   fmt_desc: The IP address and netmask of the cluster (back-side) network
 125    (e.g., ``10.0.0.0/24``).  Set in ``[global]``. You may specify
 126    comma-separated subnets. The format of it looks like
 127    ``{ip-address}/{netmask} [, {ip-address}/{netmask}]``
 128   tags:
 129   - network
 130   services:
 131   - osd
 132   flags:
 133   - startup
 134   with_legacy: true
 135 - name: cluster_network_interface
 136   type: str
 137   level: advanced
 138   desc: Interface name(s) from which to choose an address from a cluster_network to
 139     bind to; cluster_network must also be specified.
 140   tags:
 141   - network
 142   services:
 143   - mon
 144   - mds
 145   - osd
 146   - mgr
 147   see_also:
 148   - cluster_network
 149   flags:
 150   - startup
 151 - name: monmap
 152   type: str
 153   level: advanced
 154   desc: path to MonMap file
 155   long_desc: This option is normally used during mkfs, but can also be used to identify
 156     which monitors to connect to.
 157   services:
 158   - mon
 159   flags:
 160   - no_mon_update
 161   - create
 162 - name: mon_host
 163   type: str
 164   level: basic
 165   desc: list of hosts or addresses to search for a monitor
 166   long_desc: This is a list of IP addresses or hostnames that are separated by commas, whitespace, or semicolons. Hostnames are resolved via DNS. All A and AAAA records are included in the search list.
 167   services:
 168   - common
 169   flags:
 170   - no_mon_update
 171   - startup
 172 - name: mon_host_override
 173   type: str
 174   level: advanced
 175   desc: monitor(s) to use overriding the MonMap
 176   fmt_desc: This is the list of monitors that the Ceph process **initially** contacts when first establishing communication with the Ceph cluster. This overrides the known monitor list that is derived from MonMap updates sent to older Ceph instances (like librados cluster handles). This option is expected to be useful primarily for debugging.
 177   services:
 178   - common
 179   flags:
 180   - no_mon_update
 181   - startup
 182 - name: mon_dns_srv_name
 183   type: str
 184   level: advanced
 185   desc: name of DNS SRV record to check for monitor addresses
 186   fmt_desc: the service name used when querying the DNS for the monitor hosts/addresses
 187   default: ceph-mon
 188   tags:
 189   - network
 190   services:
 191   - common
 192   see_also:
 193   - mon_host
 194   flags:
 195   - startup
 196 - name: container_image
 197   type: str
 198   level: basic
 199   desc: container image (used by cephadm orchestrator)
 200   default: docker.io/ceph/daemon-base:latest-master-devel
 201   flags:
 202   - startup
 203 - name: no_config_file
 204   type: bool
 205   level: advanced
 206   desc: signal that we don't require a config file to be present
 207   long_desc: When specified, we won't be looking for a configuration file, and will
 208     instead expect that whatever options or values are required for us to work will
 209     be passed as arguments.
 210   default: false
 211   tags:
 212   - config
 213   services:
 214   - common
 215   flags:
 216   - no_mon_update
 217   - startup
 218 - name: lockdep
 219   type: bool
 220   level: dev
 221   desc: enable lockdep lock dependency analyzer
 222   default: false
 223   services:
 224   - common
 225   flags:
 226   - no_mon_update
 227   - startup
 228   with_legacy: true
 229 - name: lockdep_force_backtrace
 230   type: bool
 231   level: dev
 232   desc: always gather current backtrace at every lock
 233   default: false
 234   services:
 235   - common
 236   see_also:
 237   - lockdep
 238   flags:
 239   - startup
 240   with_legacy: true
 241 - name: run_dir
 242   type: str
 243   level: advanced
 244   desc: path for the 'run' directory for storing pid and socket files
 245   default: /var/run/ceph
 246   services:
 247   - common
 248   see_also:
 249   - admin_socket
 250   flags:
 251   - startup
 252   with_legacy: true
 253 - name: admin_socket
 254   type: str
 255   level: advanced
 256   desc: path for the runtime control socket file, used by the 'ceph daemon' command
 257   fmt_desc: The socket for executing administrative commands on a daemon,
 258     irrespective of whether Ceph Monitors have established a quorum.
 259   daemon_default: $run_dir/$cluster-$name.asok
 260   services:
 261   - common
 262   flags:
 263   - startup
 264   # default changed by common_preinit()
 265   with_legacy: true
 266 - name: admin_socket_mode
 267   type: str
 268   level: advanced
 269   desc: file mode to set for the admin socket file, e.g., '0755'
 270   services:
 271   - common
 272   see_also:
 273   - admin_socket
 274   flags:
 275   - startup
 276   with_legacy: true
 277 - name: daemonize
 278   type: bool
 279   level: advanced
 280   desc: whether to daemonize (background) after startup
 281   default: false
 282   daemon_default: true
 283   tags:
 284   - service
 285   services:
 286   - mon
 287   - mgr
 288   - osd
 289   - mds
 290   see_also:
 291   - pid_file
 292   - chdir
 293   flags:
 294   - no_mon_update
 295   - startup
 296   # default changed by common_preinit()
 297   with_legacy: true
 298 - name: setuser
 299   type: str
 300   level: advanced
 301   desc: uid or user name to switch to on startup
 302   long_desc: This is normally specified by the systemd unit file.
 303   tags:
 304   - service
 305   services:
 306   - mon
 307   - mgr
 308   - osd
 309   - mds
 310   see_also:
 311   - setgroup
 312   flags:
 313   - startup
 314   with_legacy: true
 315 - name: setgroup
 316   type: str
 317   level: advanced
 318   desc: gid or group name to switch to on startup
 319   long_desc: This is normally specified by the systemd unit file.
 320   tags:
 321   - service
 322   services:
 323   - mon
 324   - mgr
 325   - osd
 326   - mds
 327   see_also:
 328   - setuser
 329   flags:
 330   - startup
 331   with_legacy: true
 332 - name: setuser_match_path
 333   type: str
 334   level: advanced
 335   desc: if set, setuser/setgroup is conditional on this path matching ownership
 336   long_desc: If setuser or setgroup are specified, and this option is non-empty, then
 337     the uid/gid of the daemon will only be changed if the file or directory specified
 338     by this option has a matching uid and/or gid.  This exists primarily to allow
 339     switching to user ceph for OSDs to be conditional on whether the osd data contents
 340     have also been chowned after an upgrade.  This is normally specified by the systemd
 341     unit file.
 342   tags:
 343   - service
 344   services:
 345   - mon
 346   - mgr
 347   - osd
 348   - mds
 349   see_also:
 350   - setuser
 351   - setgroup
 352   flags:
 353   - startup
 354   with_legacy: true
 355 - name: pid_file
 356   type: str
 357   level: advanced
 358   desc: path to write a pid file (if any)
 359   fmt_desc: The file in which the mon, osd or mds will write its
 360     PID.  For instance, ``/var/run/$cluster/$type.$id.pid``
 361     will create /var/run/ceph/mon.a.pid for the ``mon`` with
 362     id ``a`` running in the ``ceph`` cluster. The ``pid
 363     file`` is removed when the daemon stops gracefully. If
 364     the process is not daemonized (i.e. runs with the ``-f``
 365     or ``-d`` option), the ``pid file`` is not created.
 366   tags:
 367   - service
 368   services:
 369   - mon
 370   - mgr
 371   - osd
 372   - mds
 373   flags:
 374   - startup
 375   with_legacy: true
 376 - name: chdir
 377   type: str
 378   level: advanced
 379   desc: path to chdir(2) to after daemonizing
 380   fmt_desc: The directory Ceph daemons change to once they are
 381     up and running. Default ``/`` directory recommended.
 382   tags:
 383   - service
 384   services:
 385   - mon
 386   - mgr
 387   - osd
 388   - mds
 389   see_also:
 390   - daemonize
 391   flags:
 392   - no_mon_update
 393   - startup
 394   with_legacy: true
 395 - name: fatal_signal_handlers
 396   type: bool
 397   level: advanced
 398   desc: whether to register signal handlers for SIGABRT etc that dump a stack trace
 399   long_desc: This is normally true for daemons and false for libraries.
 400   fmt_desc: If set, we will install signal handlers for SEGV, ABRT, BUS, ILL,
 401     FPE, XCPU, XFSZ, SYS signals to generate a useful log message
 402   default: true
 403   tags:
 404   - service
 405   services:
 406   - mon
 407   - mgr
 408   - osd
 409   - mds
 410   flags:
 411   - startup
 412   with_legacy: true
 413 - name: crash_dir
 414   type: str
 415   level: advanced
 416   desc: Directory where crash reports are archived
 417   default: /var/lib/ceph/crash
 418   flags:
 419   - startup
 420   with_legacy: true
 421 - name: restapi_log_level
 422   type: str
 423   level: advanced
 424   desc: default set by python code
 425   with_legacy: true
 426 - name: restapi_base_url
 427   type: str
 428   level: advanced
 429   desc: default set by python code
 430   with_legacy: true
 431 - name: erasure_code_dir
 432   type: str
 433   level: advanced
 434   desc: directory where erasure-code plugins can be found
 435   default: @CEPH_INSTALL_FULL_PKGLIBDIR@/erasure-code
 436   services:
 437   - mon
 438   - osd
 439   flags:
 440   - startup
 441   with_legacy: true
 442 - name: log_file
 443   type: str
 444   level: basic
 445   desc: path to log file
 446   fmt_desc: The location of the logging file for your cluster.
 447   daemon_default: /var/log/ceph/$cluster-$name.log
 448   see_also:
 449   - log_to_file
 450   - log_to_stderr
 451   - err_to_stderr
 452   - log_to_syslog
 453   - err_to_syslog
 454   # default changed by common_preinit()
 455   with_legacy: true
 456 - name: log_max_new
 457   type: int
 458   level: advanced
 459   desc: max unwritten log entries to allow before waiting to flush to the log
 460   fmt_desc: The maximum number of unwritten log entries allowed before waiting for the log to flush.
 461   default: 1000
 462   see_also:
 463   - log_max_recent
 464   # default changed by common_preinit()
 465   with_legacy: true
 466 - name: log_max_recent
 467   type: int
 468   level: advanced
 469   desc: recent log entries to keep in memory to dump in the event of a crash
 470   long_desc: The purpose of this option is to log at a higher debug level only to
 471     the in-memory buffer, and write out the detailed log messages only if there is
 472     a crash.  Only log entries below the lower log level will be written unconditionally
 473     to the log.  For example, debug_osd=1/5 will write everything <= 1 to the log
 474     unconditionally but keep entries at levels 2-5 in memory.  If there is a seg fault
 475     or assertion failure, all entries will be dumped to the log.
 476   default: 500
 477   daemon_default: 10000
 478   # default changed by common_preinit()
 479   with_legacy: true
 480 - name: log_to_file
 481   type: bool
 482   level: basic
 483   desc: send log lines to a file
 484   fmt_desc: Determines if logging messages should appear in a file.
 485   default: true
 486   see_also:
 487   - log_file
 488   with_legacy: true
 489 - name: log_to_stderr
 490   type: bool
 491   level: basic
 492   desc: send log lines to stderr
 493   fmt_desc: Determines if logging messages should appear in ``stderr``.
 494   default: true
 495   daemon_default: false
 496   with_legacy: true
 497 - name: err_to_stderr
 498   type: bool
 499   level: basic
 500   desc: send critical error log lines to stderr
 501   fmt_desc: Determines if error messages should appear in ``stderr``.
 502   default: false
 503   daemon_default: true
 504   with_legacy: true
 505 - name: log_stderr_prefix
 506   type: str
 507   level: advanced
 508   desc: String to prefix log messages with when sent to stderr
 509   long_desc: This is useful in container environments when combined with mon_cluster_log_to_stderr.  The
 510     mon log prefixes each line with the channel name (e.g., 'default', 'audit'), while
 511     log_stderr_prefix can be set to 'debug '.
 512   see_also:
 513   - mon_cluster_log_to_stderr
 514 - name: log_to_syslog
 515   type: bool
 516   level: basic
 517   desc: send log lines to syslog facility
 518   fmt_desc: Determines if logging messages should appear in ``syslog``.
 519   default: false
 520   with_legacy: true
 521 - name: err_to_syslog
 522   type: bool
 523   level: basic
 524   desc: send critical error log lines to syslog facility
 525   fmt_desc: Determines if error messages should appear in ``syslog``.
 526   default: false
 527   with_legacy: true
 528 - name: log_flush_on_exit
 529   type: bool
 530   level: advanced
 531   desc: set a process exit handler to ensure the log is flushed on exit
 532   fmt_desc: Determines if Ceph should flush the log files after exit.
 533   default: false
 534   with_legacy: true
 535 - name: log_stop_at_utilization
 536   type: float
 537   level: basic
 538   desc: stop writing to the log file when device utilization reaches this ratio
 539   default: 0.97
 540   see_also:
 541   - log_file
 542   min: 0
 543   max: 1
 544   with_legacy: true
 545 - name: log_to_graylog
 546   type: bool
 547   level: basic
 548   desc: send log lines to remote graylog server
 549   default: false
 550   see_also:
 551   - err_to_graylog
 552   - log_graylog_host
 553   - log_graylog_port
 554   with_legacy: true
 555 - name: err_to_graylog
 556   type: bool
 557   level: basic
 558   desc: send critical error log lines to remote graylog server
 559   default: false
 560   see_also:
 561   - log_to_graylog
 562   - log_graylog_host
 563   - log_graylog_port
 564   with_legacy: true
 565 - name: log_graylog_host
 566   type: str
 567   level: basic
 568   desc: address or hostname of graylog server to log to
 569   default: 127.0.0.1
 570   see_also:
 571   - log_to_graylog
 572   - err_to_graylog
 573   - log_graylog_port
 574   with_legacy: true
 575 - name: log_graylog_port
 576   type: int
 577   level: basic
 578   desc: port number for the remote graylog server
 579   default: 12201
 580   see_also:
 581   - log_graylog_host
 582   with_legacy: true
 583 - name: log_to_journald
 584   type: bool
 585   level: basic
 586   desc: send log lines to journald
 587   default: false
 588   see_also:
 589   - err_to_journald
 590 - name: err_to_journald
 591   type: bool
 592   level: basic
 593   desc: send critical error log lines to journald
 594   default: false
 595   see_also:
 596   - log_to_journald
 597 - name: log_coarse_timestamps
 598   type: bool
 599   level: advanced
 600   desc: timestamp log entries from coarse system clock to improve performance
 601   default: true
 602   tags:
 603   - performance
 604   - service
 605   services:
 606   - common
 607 # options will take k/v pairs, or a single item that will be assumed as the
 608 # general default for all, regardless of channel.
 609 # e.g., "info" would be taken as the same as "default=info"
 610 # also, "default=daemon audit=local0" would mean
 611 #    "default all to 'daemon', override 'audit' with 'local0'"
 612 - name: clog_to_monitors
 613   type: str
 614   level: advanced
 615   desc: Make daemons send cluster log messages to monitors
 616   fmt_desc: Determines if ``clog`` messages should be sent to monitors.
 617   default: default=true
 618   flags:
 619   - runtime
 620   with_legacy: true
 621   services:
 622   - mgr
 623   - osd
 624   - mds
 625 - name: clog_to_syslog
 626   type: str
 627   level: advanced
 628   desc: Make daemons send cluster log messages to syslog
 629   fmt_desc: Determines if ``clog`` messages should be sent to syslog.
 630   default: 'false'
 631   flags:
 632   - runtime
 633   with_legacy: true
 634   services:
 635   - mon
 636   - mgr
 637   - osd
 638   - mds
 639 - name: clog_to_syslog_level
 640   type: str
 641   level: advanced
 642   desc: Syslog level for cluster log messages
 643   default: info
 644   see_also:
 645   - clog_to_syslog
 646   flags:
 647   - runtime
 648   with_legacy: true
 649   services:
 650   - mon
 651   - mgr
 652   - osd
 653   - mds
 654 - name: clog_to_syslog_facility
 655   type: str
 656   level: advanced
 657   desc: Syslog facility for cluster log messages
 658   default: default=daemon audit=local0
 659   see_also:
 660   - clog_to_syslog
 661   flags:
 662   - runtime
 663   with_legacy: true
 664   services:
 665   - mon
 666   - mgr
 667   - osd
 668   - mds
 669 - name: clog_to_graylog
 670   type: str
 671   level: advanced
 672   desc: Make daemons send cluster log to graylog
 673   default: 'false'
 674   flags:
 675   - runtime
 676   services:
 677   - mon
 678   - mgr
 679   - osd
 680   - mds
 681 - name: clog_to_graylog_host
 682   type: str
 683   level: advanced
 684   desc: Graylog host to send cluster log messages to
 685   default: 127.0.0.1
 686   see_also:
 687   - clog_to_graylog
 688   flags:
 689   - runtime
 690   with_legacy: true
 691   services:
 692   - mon
 693   - mgr
 694   - osd
 695   - mds
 696 - name: clog_to_graylog_port
 697   type: str
 698   level: advanced
 699   desc: Graylog port number for cluster log messages
 700   default: '12201'
 701   see_also:
 702   - clog_to_graylog
 703   flags:
 704   - runtime
 705   with_legacy: true
 706   services:
 707   - mon
 708   - mgr
 709   - osd
 710   - mds
 711 - name: enable_experimental_unrecoverable_data_corrupting_features
 712   type: str
 713   level: advanced
 714   desc: Enable named (or all with '*') experimental features that may be untested,
 715     dangerous, and/or cause permanent data loss
 716   flags:
 717   - runtime
 718   with_legacy: true
 719 - name: plugin_dir
 720   type: str
 721   level: advanced
 722   desc: Base directory for dynamically loaded plugins
 723   default: @CEPH_INSTALL_FULL_PKGLIBDIR@
 724   services:
 725   - mon
 726   - osd
 727   flags:
 728   - startup
 729 - name: compressor_zlib_isal
 730   type: bool
 731   level: advanced
 732   desc: Use Intel ISA-L accelerated zlib implementation if available
 733   default: false
 734   with_legacy: true
 735 # regular zlib compression level, not applicable to isa-l optimized version
 736 - name: compressor_zlib_level
 737   type: int
 738   level: advanced
 739   desc: Zlib compression level to use
 740   default: 5
 741   with_legacy: true
 742 # regular zlib compression winsize, not applicable to isa-l optimized version
 743 - name: compressor_zlib_winsize
 744   type: int
 745   level: advanced
 746   desc: Zlib compression winsize to use
 747   default: -15
 748   min: -15
 749   max: 32
 750   with_legacy: true
 751 # regular zstd compression level
 752 - name: compressor_zstd_level
 753   type: int
 754   level: advanced
 755   desc: Zstd compression level to use
 756   default: 1
 757   with_legacy: true
 758 - name: qat_compressor_enabled
 759   type: bool
 760   level: advanced
 761   desc: Enable Intel QAT acceleration support for compression if available
 762   default: false
 763   with_legacy: true
 764 - name: plugin_crypto_accelerator
 765   type: str
 766   level: advanced
 767   desc: Crypto accelerator library to use
 768   default: crypto_isal
 769   with_legacy: true
 770 - name: openssl_engine_opts
 771   type: str
 772   level: advanced
 773   desc: Use engine for specific openssl algorithm
 774   long_desc: 'Pass opts in this way: engine_id=engine1,dynamic_path=/some/path/engine1.so,default_algorithms=DIGESTS:engine_id=engine2,dynamic_path=/some/path/engine2.so,default_algorithms=CIPHERS,other_ctrl=other_value'
 775   flags:
 776   - startup
 777   with_legacy: true
 778 - name: mempool_debug
 779   type: bool
 780   level: dev
 781   default: false
 782   flags:
 783   - no_mon_update
 784   with_legacy: true
 785 - name: thp
 786   type: bool
 787   level: dev
 788   desc: enable transparent huge page (THP) support
 789   long_desc: Ceph is known to suffer from memory fragmentation due to THP use. This
 790     is indicated by RSS usage above configured memory targets. Enabling THP is currently
 791     discouraged until selective use of THP by Ceph is implemented.
 792   default: false
 793   flags:
 794   - startup
 795 - name: key
 796   type: str
 797   level: advanced
 798   desc: Authentication key
 799   long_desc: A CephX authentication key, base64 encoded.  It normally looks something
 800     like 'AQAtut9ZdMbNJBAAHz6yBAWyJyz2yYRyeMWDag=='.
 801   fmt_desc: The key (i.e., the text string of the key itself). Not recommended.
 802   see_also:
 803   - keyfile
 804   - keyring
 805   flags:
 806   - no_mon_update
 807   - startup
 808   with_legacy: true
 809 - name: keyfile
 810   type: str
 811   level: advanced
 812   desc: Path to a file containing a key
 813   long_desc: The file should contain a CephX authentication key and optionally a trailing
 814     newline, but nothing else.
 815   fmt_desc: The path to a key file (i.e., a file containing only the key).
 816   see_also:
 817   - key
 818   flags:
 819   - no_mon_update
 820   - startup
 821   with_legacy: true
 822 - name: keyring
 823   type: str
 824   level: advanced
 825   desc: Path to a keyring file.
 826   long_desc: A keyring file is an INI-style formatted file where the section names
 827     are client or daemon names (e.g., 'osd.0') and each section contains a 'key' property
 828     with CephX authentication key as the value.
 829   # please note: the docs are generated without access to the CMake
 830   # variables, so please update the docs manually with a representative
 831   # default value using the ":default:" option of the ".. confval::" directive.
 832   default: @keyring_paths@
 833   see_also:
 834   - key
 835   - keyfile
 836   flags:
 837   - no_mon_update
 838   - startup
 839   with_legacy: true
 840 - name: heartbeat_interval
 841   type: int
 842   level: advanced
 843   desc: Frequency of internal heartbeat checks (seconds)
 844   default: 5
 845   flags:
 846   - startup
 847   with_legacy: true
 848 - name: heartbeat_file
 849   type: str
 850   level: advanced
 851   desc: File to touch on successful internal heartbeat
 852   long_desc: If set, this file will be touched every time an internal heartbeat check
 853     succeeds.
 854   see_also:
 855   - heartbeat_interval
 856   flags:
 857   - startup
 858   with_legacy: true
 859 - name: heartbeat_inject_failure
 860   type: int
 861   level: dev
 862   default: 0
 863   with_legacy: true
 864 - name: perf
 865   type: bool
 866   level: advanced
 867   desc: Enable internal performance metrics
 868   long_desc: If enabled, collect and expose internal health metrics
 869   default: true
 870   with_legacy: true
 871 - name: ms_type
 872   type: str
 873   level: advanced
 874   desc: Messenger implementation to use for network communication
 875   fmt_desc: Transport type used by Async Messenger. Can be ``async+posix``,
 876     ``async+dpdk`` or ``async+rdma``. Posix uses standard TCP/IP networking and is
 877     default. Other transports may be experimental and support may be limited.
 878   default: async+posix
 879   flags:
 880   - startup
 881   with_legacy: true
 882 - name: ms_public_type
 883   type: str
 884   level: advanced
 885   desc: Messenger implementation to use for the public network
 886   long_desc: If not specified, use ms_type
 887   see_also:
 888   - ms_type
 889   flags:
 890   - startup
 891   with_legacy: true
 892 - name: ms_cluster_type
 893   type: str
 894   level: advanced
 895   desc: Messenger implementation to use for the internal cluster network
 896   long_desc: If not specified, use ms_type
 897   see_also:
 898   - ms_type
 899   flags:
 900   - startup
 901   with_legacy: true
 902 - name: ms_mon_cluster_mode
 903   type: str
 904   level: basic
 905   desc: Connection modes (crc, secure) for intra-mon connections in order of preference
 906   fmt_desc: the connection mode (or permitted modes) to use between monitors.
 907   default: secure crc
 908   see_also:
 909   - ms_mon_service_mode
 910   - ms_mon_client_mode
 911   - ms_service_mode
 912   - ms_cluster_mode
 913   - ms_client_mode
 914   flags:
 915   - startup
 916 - name: ms_mon_service_mode
 917   type: str
 918   level: basic
 919   desc: Allowed connection modes (crc, secure) for connections to mons
 920   fmt_desc: a list of permitted modes for clients or
 921     other Ceph daemons to use when connecting to monitors.
 922   default: secure crc
 923   see_also:
 924   - ms_service_mode
 925   - ms_mon_cluster_mode
 926   - ms_mon_client_mode
 927   - ms_cluster_mode
 928   - ms_client_mode
 929   flags:
 930   - startup
 931 - name: ms_mon_client_mode
 932   type: str
 933   level: basic
 934   desc: Connection modes (crc, secure) for connections from clients to monitors in
 935     order of preference
 936   fmt_desc: a list of connection modes, in order of
 937     preference, for clients or non-monitor daemons to use when
 938     connecting to monitors.
 939   default: secure crc
 940   see_also:
 941   - ms_mon_service_mode
 942   - ms_mon_cluster_mode
 943   - ms_service_mode
 944   - ms_cluster_mode
 945   - ms_client_mode
 946   flags:
 947   - startup
 948 - name: ms_cluster_mode
 949   type: str
 950   level: basic
 951   desc: Connection modes (crc, secure) for intra-cluster connections in order of preference
 952   fmt_desc: connection mode (or permitted modes) used
 953     for intra-cluster communication between Ceph daemons.  If multiple
 954     modes are listed, the modes listed first are preferred.
 955   default: crc secure
 956   see_also:
 957   - ms_service_mode
 958   - ms_client_mode
 959   flags:
 960   - startup
 961 - name: ms_service_mode
 962   type: str
 963   level: basic
 964   desc: Allowed connection modes (crc, secure) for connections to daemons
 965   fmt_desc: a list of permitted modes for clients to use
 966     when connecting to the cluster.
 967   default: crc secure
 968   see_also:
 969   - ms_cluster_mode
 970   - ms_client_mode
 971   flags:
 972   - startup
 973 - name: ms_client_mode
 974   type: str
 975   level: basic
 976   desc: Connection modes (crc, secure) for connections from clients in order of preference
 977   fmt_desc: a list of connection modes, in order of
 978     preference, for clients to use (or allow) when talking to a Ceph
 979     cluster.
 980   default: crc secure
 981   see_also:
 982   - ms_cluster_mode
 983   - ms_service_mode
 984   flags:
 985   - startup
 986 - name: ms_osd_compress_mode
 987   type: str
 988   level: advanced
 989   desc: Compression policy to use in Messenger for communicating with OSD
 990   default: none
 991   services:
 992   - osd
 993   enum_values:
 994   - none
 995   - force
 996   see_also:
 997   - ms_compress_secure
 998   flags:
 999   - runtime
1000 - name: ms_osd_compress_min_size
1001   type: uint
1002   level: advanced
1003   desc: Minimal message size eligible for on-wire compression
1004   default: 1_K
1005   services:
1006   - osd
1007   see_also:
1008   - ms_osd_compress_mode
1009   flags:
1010   - runtime
1011 - name: ms_osd_compression_algorithm
1012   type: str
1013   level: advanced
1014   desc: Compression algorithm to use in Messenger when communicating with OSD
1015   long_desc: Compression algorithm for connections with OSD in order of preference
1016   default: snappy zlib zstd lz4
1017   services:
1018   - osd
1019   see_also:
1020   - ms_osd_compress_mode
1021   flags:
1022   - runtime
1023 - name: ms_compress_secure
1024   type: bool
1025   level: advanced
1026   desc: Allow compression when on-wire encryption is enabled
1027   long_desc: Combining encryption with compression reduces the level of security of
1028     messages between peers. In case both encryption and compression are enabled,
1029     the compression setting will be ignored and messages will not be compressed.
1030     This behaviour can be overridden using this setting.
1031   default: false
1032   see_also:
1033   - ms_osd_compress_mode
1034   flags:
1035   - runtime
1036 - name: ms_learn_addr_from_peer
1037   type: bool
1038   level: advanced
1039   desc: Learn address from what IP our first peer thinks we connect from
1040   long_desc: Use the IP address our first peer (usually a monitor) sees that we are
1041     connecting from.  This is useful if a client is behind some sort of NAT and we
1042     want to see it identified by its local (not NATed) address.
1043   default: true
1044   with_legacy: true
1045 - name: ms_tcp_nodelay
1046   type: bool
1047   level: advanced
1048   desc: Disable Nagle's algorithm and send queued network traffic immediately
1049   fmt_desc: Ceph enables ``ms_tcp_nodelay`` so that each request is sent
1050    immediately (no buffering). Disabling `Nagle's algorithm`_
1051    increases network traffic, which can introduce latency. If you
1052    experience large numbers of small packets, you may try
1053    disabling ``ms_tcp_nodelay``.
1054   default: true
1055   with_legacy: true
1056 - name: ms_tcp_rcvbuf
1057   type: size
1058   level: advanced
1059   desc: Size of TCP socket receive buffer
1060   fmt_desc: The size of the socket buffer on the receiving end of a network
1061    connection. Disabled by default.
1062   default: 0
1063   with_legacy: true
1064 - name: ms_tcp_prefetch_max_size
1065   type: size
1066   level: advanced
1067   desc: Maximum amount of data to prefetch out of the socket receive buffer
1068   default: 4_K
1069   with_legacy: true
1070 - name: ms_initial_backoff
1071   type: float
1072   level: advanced
1073   desc: Initial backoff after a network error is detected (seconds)
1074   fmt_desc: The initial time to wait before reconnecting on a fault.
1075   default: 0.2
1076   with_legacy: true
1077 - name: ms_max_backoff
1078   type: float
1079   level: advanced
1080   desc: Maximum backoff after a network error before retrying (seconds)
1081   fmt_desc: The maximum time to wait before reconnecting on a fault.
1082   default: 15
1083   see_also:
1084   - ms_initial_backoff
1085   with_legacy: true
1086 - name: ms_crc_data
1087   type: bool
1088   level: dev
1089   desc: Set and/or verify crc32c checksum on data payload sent over network
1090   default: true
1091   with_legacy: true
1092 - name: ms_crc_header
1093   type: bool
1094   level: dev
1095   desc: Set and/or verify crc32c checksum on header payload sent over network
1096   default: true
1097   with_legacy: true
1098 - name: ms_die_on_bad_msg
1099   type: bool
1100   level: dev
1101   desc: Induce a daemon crash/exit when a bad network message is received
1102   fmt_desc: Debug option; do not configure.
1103   default: false
1104   with_legacy: true
1105 - name: ms_die_on_unhandled_msg
1106   type: bool
1107   level: dev
1108   desc: Induce a daemon crash/exit when an unrecognized message is received
1109   default: false
1110   with_legacy: true
1111 - name: ms_die_on_old_message
1112   type: bool
1113   level: dev
1114   desc: Induce a daemon crash/exit when an old, undecodable message is received
1115   default: false
1116   with_legacy: true
1117 - name: ms_die_on_skipped_message
1118   type: bool
1119   level: dev
1120   desc: Induce a daemon crash/exit if sender skips a message sequence number
1121   default: false
1122   with_legacy: true
1123 - name: ms_die_on_bug
1124   type: bool
1125   level: dev
1126   desc: Induce a crash/exit on various bugs (for testing purposes)
1127   default: false
1128   with_legacy: true
1129 - name: ms_dispatch_throttle_bytes
1130   type: size
1131   level: advanced
1132   desc: Limit messages that are read off the network but still being processed
1133   fmt_desc: Throttles total size of messages waiting to be dispatched.
1134   default: 100_M
1135   with_legacy: true
1136 - name: ms_bind_ipv4
1137   type: bool
1138   level: advanced
1139   desc: Bind servers to IPv4 address(es)
1140   fmt_desc: Enables Ceph daemons to bind to IPv4 addresses.
1141   default: true
1142   see_also:
1143   - ms_bind_ipv6
1144 - name: ms_bind_ipv6
1145   type: bool
1146   level: advanced
1147   desc: Bind servers to IPv6 address(es)
1148   fmt_desc: Enables Ceph daemons to bind to IPv6 addresses.
1149   default: false
1150   see_also:
1151   - ms_bind_ipv4
1152   with_legacy: true
1153 - name: ms_bind_prefer_ipv4
1154   type: bool
1155   level: advanced
1156   desc: Prefer IPv4 over IPv6 address(es)
1157   default: false
1158 - name: ms_bind_msgr1
1159   type: bool
1160   level: advanced
1161   desc: Bind servers to msgr1 (legacy) protocol address(es)
1162   default: true
1163   see_also:
1164   - ms_bind_msgr2
1165 - name: ms_bind_msgr2
1166   type: bool
1167   level: advanced
1168   desc: Bind servers to msgr2 (nautilus+) protocol address(es)
1169   default: true
1170   see_also:
1171   - ms_bind_msgr1
1172 - name: ms_bind_port_min
1173   type: int
1174   level: advanced
1175   desc: Lowest port number to bind daemon(s) to
1176   fmt_desc: The minimum port number to which an OSD or MDS daemon will bind.
1177   default: 6800
1178   with_legacy: true
1179 - name: ms_bind_port_max
1180   type: int
1181   level: advanced
1182   desc: Highest port number to bind daemon(s) to
1183   fmt_desc: The maximum port number to which an OSD or MDS daemon will bind.
1184   default: 7568
1185   with_legacy: true
1186 # FreeBSD does not use SO_REUSEADDR so allow for a bit more time per default
1187 - name: ms_bind_retry_count
1188   type: int
1189   level: advanced
1190   desc: Number of attempts to make while bind(2)ing to a port
1191   default: @ms_bind_retry_count@
1192   with_legacy: true
1193 # FreeBSD does not use SO_REUSEADDR so allow for a bit more time per default
1194 - name: ms_bind_retry_delay
1195   type: int
1196   level: advanced
1197   desc: Delay between bind(2) attempts (seconds)
1198   default: @ms_bind_retry_delay@
1199   with_legacy: true
1200 - name: ms_bind_before_connect
1201   type: bool
1202   level: advanced
1203   desc: Call bind(2) on client sockets
1204   default: false
1205   with_legacy: true
1206 - name: ms_tcp_listen_backlog
1207   type: int
1208   level: advanced
1209   desc: Size of queue of incoming connections for accept(2)
1210   default: 512
1211   with_legacy: true
1212 - name: ms_connection_ready_timeout
1213   type: uint
1214   level: advanced
1215   desc: Time before we declare a not yet ready connection as dead (seconds)
1216   default: 10
1217   with_legacy: true
1218 - name: ms_connection_idle_timeout
1219   type: uint
1220   level: advanced
1221   desc: Time before an idle connection is closed (seconds)
1222   default: 900
1223   with_legacy: true
1224 - name: ms_pq_max_tokens_per_priority
1225   type: uint
1226   level: dev
1227   default: 16_M
1228   with_legacy: true
1229 - name: ms_pq_min_cost
1230   type: size
1231   level: dev
1232   default: 64_K
1233   with_legacy: true
1234 - name: ms_inject_socket_failures
1235   type: uint
1236   level: dev
1237   desc: Inject a socket failure every Nth socket operation
1238   fmt_desc: Debug option; do not configure.
1239   default: 0
1240   with_legacy: true
1241 - name: ms_inject_delay_type
1242   type: str
1243   level: dev
1244   desc: Entity type to inject delays for
1245   flags:
1246   - runtime
1247   with_legacy: true
1248 - name: ms_inject_delay_max
1249   type: float
1250   level: dev
1251   desc: Max delay to inject
1252   default: 1
1253   with_legacy: true
1254 - name: ms_inject_delay_probability
1255   type: float
1256   level: dev
1257   default: 0
1258   with_legacy: true
1259 - name: ms_inject_internal_delays
1260   type: float
1261   level: dev
1262   desc: Inject various internal delays to induce races (seconds)
1263   default: 0
1264   with_legacy: true
1265 - name: ms_blackhole_osd
1266   type: bool
1267   level: dev
1268   default: false
1269   with_legacy: true
1270 - name: ms_blackhole_mon
1271   type: bool
1272   level: dev
1273   default: false
1274   with_legacy: true
1275 - name: ms_blackhole_mds
1276   type: bool
1277   level: dev
1278   default: false
1279   with_legacy: true
1280 - name: ms_blackhole_mgr
1281   type: bool
1282   level: dev
1283   default: false
1284   with_legacy: true
1285 - name: ms_blackhole_client
1286   type: bool
1287   level: dev
1288   default: false
1289   with_legacy: true
1290 - name: ms_dump_on_send
1291   type: bool
1292   level: advanced
1293   desc: Hexdump message to debug log on message send
1294   default: false
1295   with_legacy: true
1296 - name: ms_dump_corrupt_message_level
1297   type: int
1298   level: advanced
1299   desc: Log level at which to hexdump corrupt messages we receive
1300   default: 1
1301   with_legacy: true
1302 # number of worker processing threads for async messenger created on init
1303 - name: ms_async_op_threads
1304   type: uint
1305   level: advanced
1306   desc: Threadpool size for AsyncMessenger (ms_type=async)
1307   fmt_desc: Initial number of worker threads used by each Async Messenger instance.
1308     Should be at least equal to highest number of replicas, but you can
1309     decrease it if you are low on CPU core count and/or you host a lot of
1310     OSDs on single server.
1311   default: 3
1312   min: 1
1313   max: 24
1314   with_legacy: true
1315 - name: ms_async_reap_threshold
1316   type: uint
1317   level: dev
1318   desc: number of deleted connections before we reap
1319   default: 5
1320   min: 1
1321   with_legacy: true
1322 - name: ms_async_rdma_device_name
1323   type: str
1324   level: advanced
1325   with_legacy: true
1326 - name: ms_async_rdma_enable_hugepage
1327   type: bool
1328   level: advanced
1329   default: false
1330   with_legacy: true
1331 - name: ms_async_rdma_buffer_size
1332   type: size
1333   level: advanced
1334   default: 128_K
1335   with_legacy: true
1336 - name: ms_async_rdma_send_buffers
1337   type: uint
1338   level: advanced
1339   default: 1_K
1340   with_legacy: true
1341 # size of the receive buffer pool, 0 is unlimited
1342 - name: ms_async_rdma_receive_buffers
1343   type: uint
1344   level: advanced
1345   default: 32_K
1346   with_legacy: true
1347 # max number of wr in srq
1348 - name: ms_async_rdma_receive_queue_len
1349   type: uint
1350   level: advanced
1351   default: 4_K
1352   with_legacy: true
1353 # support srq
1354 - name: ms_async_rdma_support_srq
1355   type: bool
1356   level: advanced
1357   default: true
1358   with_legacy: true
1359 - name: ms_async_rdma_port_num
1360   type: uint
1361   level: advanced
1362   default: 1
1363   with_legacy: true
1364 - name: ms_async_rdma_polling_us
1365   type: uint
1366   level: advanced
1367   default: 1000
1368   with_legacy: true
1369 - name: ms_async_rdma_gid_idx
1370   type: int
1371   level: advanced
1372   desc: use gid_idx to select GID for choosing RoCEv1 or RoCEv2
1373   default: 0
1374   with_legacy: true
1375 # GID format: "fe80:0000:0000:0000:7efe:90ff:fe72:6efe", no zero folding
1376 - name: ms_async_rdma_local_gid
1377   type: str
1378   level: advanced
1379   with_legacy: true
1380 # 0=RoCEv1, 1=RoCEv2, 2=RoCEv1.5
1381 - name: ms_async_rdma_roce_ver
1382   type: int
1383   level: advanced
1384   default: 1
1385   with_legacy: true
1386 # in RoCE, this means PCP
1387 - name: ms_async_rdma_sl
1388   type: int
1389   level: advanced
1390   default: 3
1391   with_legacy: true
1392 # in RoCE, this means DSCP
1393 - name: ms_async_rdma_dscp
1394   type: int
1395   level: advanced
1396   default: 96
1397   with_legacy: true
1398 # when there are enough accept failures, indicating there are unrecoverable failures,
1399 # just do ceph_abort() . Here we make it configurable.
1400 - name: ms_max_accept_failures
1401   type: int
1402   level: advanced
1403   desc: The maximum number of consecutive failed accept() calls before considering
1404     the daemon misconfigured and aborting it.
1405   default: 4
1406   with_legacy: true
1407 # rdma connection management
1408 - name: ms_async_rdma_cm
1409   type: bool
1410   level: advanced
1411   default: false
1412   with_legacy: true
1413 - name: ms_async_rdma_type
1414   type: str
1415   level: advanced
1416   default: ib
1417   with_legacy: true
1418 - name: ms_dpdk_port_id
1419   type: int
1420   level: advanced
1421   default: 0
1422   with_legacy: true
1423 # it is modified in unit tests, so SAFE_OPTION is used to declare it
1424 - name: ms_dpdk_coremask
1425   type: str
1426   level: advanced
1427   default: '0xF'
1428   see_also:
1429   - ms_async_op_threads
1430   with_legacy: true
1431 - name: ms_dpdk_memory_channel
1432   type: str
1433   level: advanced
1434   default: '4'
1435   with_legacy: true
1436 - name: ms_dpdk_hugepages
1437   type: str
1438   level: advanced
1439   with_legacy: true
1440 - name: ms_dpdk_pmd
1441   type: str
1442   level: advanced
1443   with_legacy: true
1444 - name: ms_dpdk_devs_allowlist
1445   type: str
1446   level: advanced
1447   desc: PCIe addresses of the NICs that are allowed to be used
1448   long_desc: for a single NIC use ms_dpdk_devs_allowlist=-a 0000:7d:01.0 or --allow=0000:7d:01.0;
1449     for bonded NICs use ms_dpdk_devs_allowlist=--allow=0000:7d:01.0 --allow=0000:7d:02.6
1450     --vdev=net_bonding0,mode=2,slave=0000:7d:01.0,slave=0000:7d:02.6.
1451 - name: ms_dpdk_host_ipv4_addr
1452   type: str
1453   level: advanced
1454   with_legacy: true
1455 - name: ms_dpdk_gateway_ipv4_addr
1456   type: str
1457   level: advanced
1458   with_legacy: true
1459 - name: ms_dpdk_netmask_ipv4_addr
1460   type: str
1461   level: advanced
1462   with_legacy: true
1463 - name: ms_dpdk_lro
1464   type: bool
1465   level: advanced
1466   default: true
1467   with_legacy: true
1468 - name: ms_dpdk_enable_tso
1469   type: bool
1470   level: advanced
1471   default: true
1472 - name: ms_dpdk_hw_flow_control
1473   type: bool
1474   level: advanced
1475   default: true
1476   with_legacy: true
1477 # Weighting of a hardware network queue relative to a software queue (0=no work, 1=equal share)
1478 - name: ms_dpdk_hw_queue_weight
1479   type: float
1480   level: advanced
1481   default: 1
1482   with_legacy: true
1483 - name: ms_dpdk_debug_allow_loopback
1484   type: bool
1485   level: dev
1486   default: false
1487   with_legacy: true
1488 - name: ms_dpdk_rx_buffer_count_per_core
1489   type: int
1490   level: advanced
1491   default: 8192
1492   with_legacy: true
1493 - name: inject_early_sigterm
1494   type: bool
1495   level: dev
1496   desc: send ourselves a SIGTERM early during startup
1497   default: false
1498   with_legacy: true
1499 # list of initial cluster mon ids; if specified, need majority to form initial quorum and create new cluster
1500 - name: mon_initial_members
1501   type: str
1502   level: advanced
1503   fmt_desc: The IDs of initial monitors in a cluster during startup. If
1504     specified, Ceph requires an odd number of monitors to form an
1505     initial quorum (e.g., 3).
1506   note: A *majority* of monitors in your cluster must be able to reach
1507     each other in order to establish a quorum. You can decrease the initial
1508     number of monitors to establish a quorum with this setting.
1509   services:
1510   - mon
1511   flags:
1512   - no_mon_update
1513   - cluster_create
1514   with_legacy: true
1515 - name: mon_max_pg_per_osd
1516   type: uint
1517   level: advanced
1518   desc: Max number of PGs per OSD the cluster will allow
1519   long_desc: If the number of PGs per OSD exceeds this, a health warning will be visible
1520     in `ceph status`.  This is also used in automated PG management, as the threshold
1521     at which some pools' pg_num may be shrunk in order to enable increasing the pg_num
1522     of others.
1523   default: 250
1524   flags:
1525   - runtime
1526   services:
1527   - mgr
1528   - mon
1529   min: 1
1530 - name: mon_osd_full_ratio
1531   type: float
1532   level: advanced
1533   desc: full ratio of OSDs to be set during initial creation of the cluster
1534   default: 0.95
1535   flags:
1536   - no_mon_update
1537   - cluster_create
1538   with_legacy: true
1539 - name: mon_osd_backfillfull_ratio
1540   type: float
1541   level: advanced
1542   default: 0.9
1543   flags:
1544   - no_mon_update
1545   - cluster_create
1546   with_legacy: true
1547 - name: mon_osd_nearfull_ratio
1548   type: float
1549   level: advanced
1550   desc: nearfull ratio for OSDs to be set during initial creation of cluster
1551   default: 0.85
1552   flags:
1553   - no_mon_update
1554   - cluster_create
1555   with_legacy: true
1556 - name: mon_osd_initial_require_min_compat_client
1557   type: str
1558   level: advanced
1559   default: luminous
1560   flags:
1561   - no_mon_update
1562   - cluster_create
1563   with_legacy: true
1564 - name: mon_allow_pool_delete
1565   type: bool
1566   level: advanced
1567   desc: allow pool deletions
1568   fmt_desc: Should monitors allow pools to be removed, regardless of what the pool flags say?
1569   default: false
1570   services:
1571   - mon
1572   with_legacy: true
1573 - name: mon_fake_pool_delete
1574   type: bool
1575   level: advanced
1576   desc: fake pool deletions by renaming the rados pool
1577   default: false
1578   services:
1579   - mon
1580   with_legacy: true
1581 - name: mon_globalid_prealloc
1582   type: uint
1583   level: advanced
1584   desc: number of globalid values to preallocate
1585   long_desc: This setting caps how many new clients can authenticate with the cluster
1586     before the monitors have to perform a write to preallocate more.  Large values
1587     burn through the 64-bit ID space more quickly.
1588   fmt_desc: The number of global IDs to pre-allocate for clients and daemons in the cluster.
1589   default: 10000
1590   services:
1591   - mon
1592   with_legacy: true
1593 - name: mon_osd_report_timeout
1594   type: int
1595   level: advanced
1596   desc: time before OSDs who do not report to the mons are marked down (seconds)
1597   fmt_desc: The grace period in seconds before declaring
1598               unresponsive Ceph OSD Daemons ``down``.
1599   default: 15_min
1600   services:
1601   - mon
1602   with_legacy: true
1603 - name: mon_warn_on_insecure_global_id_reclaim
1604   type: bool
1605   level: advanced
1606   desc: issue AUTH_INSECURE_GLOBAL_ID_RECLAIM health warning if any connected
1607     clients are insecurely reclaiming global_id
1608   default: true
1609   services:
1610   - mon
1611   see_also:
1612   - mon_warn_on_insecure_global_id_reclaim_allowed
1613   - auth_allow_insecure_global_id_reclaim
1614   - auth_expose_insecure_global_id_reclaim
1615 - name: mon_warn_on_insecure_global_id_reclaim_allowed
1616   type: bool
1617   level: advanced
1618   desc: issue AUTH_INSECURE_GLOBAL_ID_RECLAIM_ALLOWED health warning if insecure
1619     global_id reclaim is allowed
1620   default: true
1621   services:
1622   - mon
1623   see_also:
1624   - mon_warn_on_insecure_global_id_reclaim
1625   - auth_allow_insecure_global_id_reclaim
1626   - auth_expose_insecure_global_id_reclaim
1627 - name: mon_warn_on_msgr2_not_enabled
1628   type: bool
1629   level: advanced
1630   desc: issue MON_MSGR2_NOT_ENABLED health warning if monitors are all running Nautilus
1631     but not all binding to a msgr2 port
1632   default: true
1633   services:
1634   - mon
1635   see_also:
1636   - ms_bind_msgr2
1637 - name: mon_warn_on_slow_ping_time
1638   type: float
1639   level: advanced
1640   desc: Override mon_warn_on_slow_ping_ratio with specified threshold in milliseconds
1641   fmt_desc: Override ``mon_warn_on_slow_ping_ratio`` with a specific value.
1642     Raise ``HEALTH_WARN`` if any heartbeat between OSDs exceeds
1643     ``mon_warn_on_slow_ping_time`` milliseconds.  The default is 0 (disabled).
1644   default: 0
1645   services:
1646   - mgr
1647   - osd
1648   see_also:
1649   - mon_warn_on_slow_ping_ratio
1650 - name: mon_warn_on_slow_ping_ratio
1651   type: float
1652   level: advanced
1653   desc: Issue a health warning if a heartbeat ping takes longer than this ratio of osd_heartbeat_grace
1654   fmt_desc: Raise ``HEALTH_WARN`` when any heartbeat between OSDs exceeds
1655     ``mon_warn_on_slow_ping_ratio`` of ``osd_heartbeat_grace``.
1656   default: 0.05
1657   services:
1658   - mgr
1659   - osd
1660   see_also:
1661   - osd_heartbeat_grace
1662   - mon_warn_on_slow_ping_time
1663 - name: mon_max_snap_prune_per_epoch
1664   type: uint
1665   level: advanced
1666   desc: max number of pruned snaps we will process in a single OSDMap epoch
1667   default: 100
1668   services:
1669   - mon
1670 - name: mon_min_osdmap_epochs
1671   type: int
1672   level: advanced
1673   desc: min number of OSDMaps to store
1674   fmt_desc: Minimum number of OSD map epochs to keep at all times.
1675   default: 500
1676   services:
1677   - mon
1678   with_legacy: true
1679 - name: mon_max_log_epochs
1680   type: int
1681   level: advanced
1682   desc: max number of past cluster log epochs to store
1683   fmt_desc: Maximum number of Log epochs the monitor should keep.
1684   default: 500
1685   services:
1686   - mon
1687   with_legacy: true
1688 - name: mon_max_mdsmap_epochs
1689   type: int
1690   level: advanced
1691   desc: max number of FSMaps/MDSMaps to store
1692   fmt_desc: The maximum number of mdsmap epochs the monitor should keep.
1693   default: 500
1694   services:
1695   - mon
1696   with_legacy: true
1697 - name: mon_max_mgrmap_epochs
1698   type: int
1699   level: advanced
1700   desc: max number of MgrMaps to store
1701   default: 500
1702   services:
1703   - mon
1704 - name: mon_max_osd
1705   type: int
1706   level: advanced
1707   desc: max number of OSDs in a cluster
1708   fmt_desc: The maximum number of OSDs allowed in the cluster.
1709   default: 10000
1710   services:
1711   - mon
1712   with_legacy: true
1713 - name: mon_probe_timeout
1714   type: float
1715   level: advanced
1716   desc: timeout for querying other mons during bootstrap pre-election phase (seconds)
1717   fmt_desc: Number of seconds the monitor will wait to find peers before bootstrapping.
1718   default: 2
1719   services:
1720   - mon
1721   with_legacy: true
1722 - name: mon_client_bytes
1723   type: size
1724   level: advanced
1725   desc: max bytes of outstanding client messages mon will read off the network
1726   fmt_desc: The amount of client message data allowed in memory (in bytes).
1727   default: 100_M
1728   services:
1729   - mon
1730   with_legacy: true
1731 - name: mon_warn_pg_not_scrubbed_ratio
1732   type: float
1733   level: advanced
1734   desc: Percentage of the scrub max interval past the scrub max interval to warn
1735   default: 0.5
1736   see_also:
1737   - osd_scrub_max_interval
1738   min: 0
1739   with_legacy: true
1740 - name: mon_warn_pg_not_deep_scrubbed_ratio
1741   type: float
1742   level: advanced
1743   desc: Percentage of the deep scrub interval past the deep scrub interval to warn
1744   default: 0.75
1745   see_also:
1746   - osd_deep_scrub_interval
1747   min: 0
1748   with_legacy: true
1749 - name: mon_scrub_interval
1750   type: secs
1751   level: advanced
1752   desc: frequency for scrubbing mon database
1753   fmt_desc: How often the monitor scrubs its store by comparing
1754     the stored checksums with the computed ones for all stored
1755     keys. (0 disables it. dangerous, use with care)
1756   default: 1_day
1757   services:
1758   - mon
1759 - name: mon_scrub_timeout
1760   type: int
1761   level: advanced
1762   desc: timeout to restart scrub if a mon quorum participant does not respond for the
1763     latest chunk
1764   default: 5_min
1765   services:
1766   - mon
1767   with_legacy: true
1768 - name: mon_scrub_max_keys
1769   type: int
1770   level: advanced
1771   desc: max keys per scrub chunk/step
1772   fmt_desc: The maximum number of keys to scrub each time.
1773   default: 100
1774   services:
1775   - mon
1776   with_legacy: true
1777 # probability of injected crc mismatch [0.0, 1.0]
1778 - name: mon_scrub_inject_crc_mismatch
1779   type: float
1780   level: dev
1781   desc: probability for injecting crc mismatches into mon scrub
1782   default: 0
1783   services:
1784   - mon
1785   with_legacy: true
1786 # probability of injected missing keys [0.0, 1.0]
1787 - name: mon_scrub_inject_missing_keys
1788   type: float
1789   level: dev
1790   desc: probability for injecting missing keys into mon scrub
1791   default: 0
1792   services:
1793   - mon
1794   with_legacy: true
1795 - name: mon_config_key_max_entry_size
1796   type: size
1797   level: advanced
1798   desc: Defines the number of bytes allowed to be held in a single config-key entry
1799   fmt_desc: The maximum size of config-key entry (in bytes)
1800   default: 64_K
1801   services:
1802   - mon
1803   with_legacy: true
1804 - name: mon_sync_timeout
1805   type: float
1806   level: advanced
1807   desc: timeout before canceling sync if syncing mon does not respond
1808   fmt_desc: Number of seconds the monitor will wait for the next update
1809     message from its sync provider before it gives up and bootstraps
1810     again.
1811   default: 1_min
1812   services:
1813   - mon
1814   with_legacy: true
1815 - name: mon_sync_max_payload_size
1816   type: size
1817   level: advanced
1818   desc: target max message payload for mon sync
1819   fmt_desc: The maximum size for a sync payload (in bytes).
1820   default: 1_M
1821   services:
1822   - mon
1823   with_legacy: true
1824 - name: mon_sync_max_payload_keys
1825   type: int
1826   level: advanced
1827   desc: target max keys in message payload for mon sync
1828   default: 2000
1829   services:
1830   - mon
1831   with_legacy: true
1832 - name: mon_sync_debug
1833   type: bool
1834   level: dev
1835   desc: enable extra debugging during mon sync
1836   default: false
1837   services:
1838   - mon
1839   with_legacy: true
1840 - name: mon_inject_sync_get_chunk_delay
1841   type: float
1842   level: dev
1843   desc: inject delay during sync (seconds)
1844   default: 0
1845   services:
1846   - mon
1847   with_legacy: true
1848 - name: mon_osd_min_down_reporters
1849   type: uint
1850   level: advanced
1851   desc: number of OSDs from different subtrees who need to report a down OSD for it
1852     to count
1853   fmt_desc: The minimum number of Ceph OSD Daemons required to report a
1854               ``down`` Ceph OSD Daemon.
1855   default: 2
1856   services:
1857   - mon
1858   see_also:
1859   - mon_osd_reporter_subtree_level
1860 - name: mon_osd_reporter_subtree_level
1861   type: str
1862   level: advanced
1863   desc: in which level of parent bucket the reporters are counted
1864   fmt_desc: In which level of parent bucket the reporters are counted. The OSDs
1865               send failure reports to monitors if they find a peer that is not responsive.
1866               Monitors mark the reported ``OSD`` out and then ``down`` after a grace period.
1867   default: host
1868   services:
1869   - mon
1870   flags:
1871   - runtime
1872 - name: mon_osd_snap_trim_queue_warn_on
1873   type: int
1874   level: advanced
1875   desc: Warn when snap trim queue is that large (or larger).
1876   long_desc: Warn when snap trim queue length for at least one PG crosses this value,
1877     as this is an indicator of snap trimmer not keeping up, wasting disk space
1878   default: 32768
1879   services:
1880   - mon
1881   with_legacy: true
1882 # force mon to trim maps to this point, regardless of min_last_epoch_clean (dangerous)
1883 - name: mon_osd_force_trim_to
1884   type: int
1885   level: dev
1886   desc: force mons to trim osdmaps through this epoch
1887   fmt_desc: Force monitor to trim osdmaps to this point, even if there are
1888     PGs not clean at the specified epoch (0 disables it. dangerous,
1889     use with care)
1890   default: 0
1891   services:
1892   - mon
1893   with_legacy: true
1894 - name: mon_debug_extra_checks
1895   type: bool
1896   level: dev
1897   desc: Enable some additional monitor checks
1898   long_desc: Enable some additional monitor checks that would be too expensive to
1899     run on production systems, or would only be relevant while testing or debugging.
1900   default: false
1901   services:
1902   - mon
1903 - name: mon_debug_block_osdmap_trim
1904   type: bool
1905   level: dev
1906   desc: Block OSDMap trimming while the option is enabled.
1907   long_desc: Blocking OSDMap trimming may be quite helpful to easily reproduce states
1908     in which the monitor keeps (hundreds of) thousands of osdmaps.
1909   default: false
1910   services:
1911   - mon
1912 - name: mon_debug_deprecated_as_obsolete
1913   type: bool
1914   level: dev
1915   desc: treat deprecated mon commands as obsolete
1916   default: false
1917   services:
1918   - mon
1919   with_legacy: true
1920 - name: mon_debug_dump_transactions
1921   type: bool
1922   level: dev
1923   desc: dump paxos transactions to log
1924   default: false
1925   services:
1926   - mon
1927   see_also:
1928   - mon_debug_dump_location
1929   with_legacy: true
1930 - name: mon_debug_dump_json
1931   type: bool
1932   level: dev
1933   desc: dump paxos transactions to log as json
1934   default: false
1935   services:
1936   - mon
1937   see_also:
1938   - mon_debug_dump_transactions
1939   with_legacy: true
1940 - name: mon_debug_dump_location
1941   type: str
1942   level: dev
1943   desc: file to dump paxos transactions to
1944   default: /var/log/ceph/$cluster-$name.tdump
1945   services:
1946   - mon
1947   see_also:
1948   - mon_debug_dump_transactions
1949   with_legacy: true
1950 - name: mon_debug_no_require_pacific
1951   type: bool
1952   level: dev
1953   desc: do not set pacific feature for new mon clusters
1954   default: false
1955   services:
1956   - mon
1957   flags:
1958   - cluster_create
1959 - name: mon_debug_no_require_quincy
1960   type: bool
1961   level: dev
1962   desc: do not set quincy feature for new mon clusters
1963   default: false
1964   services:
1965   - mon
1966   flags:
1967   - cluster_create
1968 - name: mon_debug_no_require_bluestore_for_ec_overwrites
1969   type: bool
1970   level: dev
1971   desc: do not require bluestore OSDs to enable EC overwrites on a rados pool
1972   default: false
1973   services:
1974   - mon
1975   with_legacy: true
1976 - name: mon_debug_no_initial_persistent_features
1977   type: bool
1978   level: dev
1979   desc: do not set any monmap features for new mon clusters
1980   default: false
1981   services:
1982   - mon
1983   flags:
1984   - cluster_create
1985   with_legacy: true
1986 - name: mon_inject_transaction_delay_max
1987   type: float
1988   level: dev
1989   desc: max duration of injected delay in paxos
1990   default: 10
1991   services:
1992   - mon
1993   with_legacy: true
1994 # range [0, 1]
1995 - name: mon_inject_transaction_delay_probability
1996   type: float
1997   level: dev
1998   desc: probability of injecting a delay in paxos
1999   default: 0
2000   services:
2001   - mon
2002   with_legacy: true
2003 - name: mon_inject_pg_merge_bounce_probability
2004   type: float
2005   level: dev
2006   desc: probability of failing and reverting a pg_num decrement
2007   default: 0
2008   services:
2009   - mon
2010 # kill the sync provider at a specific point in the work flow
2011 - name: mon_sync_provider_kill_at
2012   type: int
2013   level: dev
2014   desc: kill mon sync provider at specific point
2015   default: 0
2016   services:
2017   - mon
2018   with_legacy: true
2019 # kill the sync requester at a specific point in the work flow
2020 - name: mon_sync_requester_kill_at
2021   type: int
2022   level: dev
2023   desc: kill mon sync requester at specific point
2024   default: 0
2025   services:
2026   - mon
2027   with_legacy: true
2028 # force monitor to join quorum even if it has been previously removed from the map
2029 - name: mon_force_quorum_join
2030   type: bool
2031   level: advanced
2032   desc: force mon to rejoin quorum even though it was just removed
2033   fmt_desc: Force monitor to join quorum even if it has been previously removed from the map
2034   default: false
2035   services:
2036   - mon
2037   with_legacy: true
2038 # type of keyvaluedb backend
2039 - name: mon_keyvaluedb
2040   type: str
2041   level: advanced
2042   desc: database backend to use for the mon database
2043   default: rocksdb
2044   services:
2045   - mon
2046   enum_values:
2047   - leveldb
2048   - rocksdb
2049   flags:
2050   - create
2051   with_legacy: true
2052 # UNSAFE -- TESTING ONLY! Allows addition of a cache tier with preexisting snaps
2053 - name: mon_debug_unsafe_allow_tier_with_nonempty_snaps
2054   type: bool
2055   level: dev
2056   default: false
2057   services:
2058   - mon
2059   with_legacy: true
2060 # required of mon, mds, osd daemons
2061 - name: auth_cluster_required
2062   type: str
2063   level: advanced
2064   desc: authentication methods required by the cluster
2065   fmt_desc: If enabled, the Ceph Storage Cluster daemons (i.e., ``ceph-mon``,
2066    ``ceph-osd``, ``ceph-mds`` and ``ceph-mgr``) must authenticate with
2067    each other. Valid settings are ``cephx`` or ``none``.
2068   default: cephx
2069   with_legacy: true
2070 # required by daemons of clients
2071 - name: auth_service_required
2072   type: str
2073   level: advanced
2074   desc: authentication methods required by service daemons
2075   fmt_desc: If enabled, the Ceph Storage Cluster daemons require Ceph Clients
2076    to authenticate with the Ceph Storage Cluster in order to access
2077    Ceph services. Valid settings are ``cephx`` or ``none``.
2078   default: cephx
2079   with_legacy: true
2080 # what clients require of daemons
2081 - name: auth_client_required
2082   type: str
2083   level: advanced
2084   desc: authentication methods allowed by clients
2085   fmt_desc: If enabled, the Ceph Client requires the Ceph Storage Cluster to
2086    authenticate with the Ceph Client. Valid settings are ``cephx``
2087    or ``none``.
2088   default: cephx, none
2089   with_legacy: true
2090 # deprecated; default value for above if they are not defined.
2091 - name: auth_supported
2092   type: str
2093   level: advanced
2094   desc: authentication methods required (deprecated)
2095   with_legacy: true
2096 - name: max_rotating_auth_attempts
2097   type: int
2098   level: advanced
2099   desc: number of attempts to initialize rotating keys before giving up
2100   default: 10
2101   with_legacy: true
2102 - name: rotating_keys_bootstrap_timeout
2103   type: int
2104   level: advanced
2105   desc: timeout for obtaining rotating keys during bootstrap phase (seconds)
2106   default: 30
2107 - name: rotating_keys_renewal_timeout
2108   type: int
2109   level: advanced
2110   desc: timeout for updating rotating keys (seconds)
2111   default: 10
2112 - name: cephx_require_signatures
2113   type: bool
2114   level: advanced
2115   default: false
2116   fmt_desc: If set to ``true``, Ceph requires signatures on all message
2117    traffic between the Ceph Client and the Ceph Storage Cluster, and
2118    between daemons comprising the Ceph Storage Cluster.
2119
2120    Ceph Argonaut and Linux kernel versions prior to 3.19 do
2121    not support signatures; if such clients are in use this
2122    option can be turned off to allow them to connect.
2123   with_legacy: true
2124 - name: cephx_require_version
2125   type: int
2126   level: advanced
2127   desc: Cephx version required (1 = pre-mimic, 2 = mimic+)
2128   default: 2
2129   with_legacy: true
2130 - name: cephx_cluster_require_signatures
2131   type: bool
2132   level: advanced
2133   default: false
2134   fmt_desc: If set to ``true``, Ceph requires signatures on all message
2135     traffic between Ceph daemons comprising the Ceph Storage Cluster.
2136   with_legacy: true
2137 - name: cephx_cluster_require_version
2138   type: int
2139   level: advanced
2140   desc: Cephx version required by the cluster from clients (1 = pre-mimic, 2 = mimic+)
2141   default: 2
2142   with_legacy: true
2143 - name: cephx_service_require_signatures
2144   type: bool
2145   level: advanced
2146   default: false
2147   fmt_desc: If set to ``true``, Ceph requires signatures on all message
2148    traffic between Ceph Clients and the Ceph Storage Cluster.
2149   with_legacy: true
2150 - name: cephx_service_require_version
2151   type: int
2152   level: advanced
2153   desc: Cephx version required from ceph services (1 = pre-mimic, 2 = mimic+)
2154   default: 2
2155   with_legacy: true
2156 # Default to signing session messages if supported
2157 - name: cephx_sign_messages
2158   type: bool
2159   level: advanced
2160   default: true
2161   fmt_desc: If the Ceph version supports message signing, Ceph will sign
2162    all messages so they are more difficult to spoof.
2163   with_legacy: true
2164 - name: auth_mon_ticket_ttl
2165   type: float
2166   level: advanced
2167   default: 72_hr
2168   with_legacy: true
2169 - name: auth_service_ticket_ttl
2170   type: float
2171   level: advanced
2172   default: 1_hr
2173   fmt_desc: When the Ceph Storage Cluster sends a Ceph Client a ticket for
2174    authentication, the Ceph Storage Cluster assigns the ticket a
2175    time to live.
2176   with_legacy: true
2177 - name: auth_allow_insecure_global_id_reclaim
2178   type: bool
2179   level: advanced
2180   desc: Allow reclaiming global_id without presenting a valid ticket proving
2181     previous possession of that global_id
2182   long_desc: Allowing unauthorized global_id (re)use poses a security risk.
2183     Unfortunately, older clients may omit their ticket on reconnects and
2184     therefore rely on this being allowed for preserving their global_id for
2185     the lifetime of the client instance. Setting this value to false would
2186     immediately prevent new connections from those clients (assuming
2187     auth_expose_insecure_global_id_reclaim set to true) and eventually break
2188     existing sessions as well (regardless of auth_expose_insecure_global_id_reclaim
2189     setting).
2190   default: true
2191   see_also:
2192   - mon_warn_on_insecure_global_id_reclaim
2193   - mon_warn_on_insecure_global_id_reclaim_allowed
2194   - auth_expose_insecure_global_id_reclaim
2195   with_legacy: true
2196 - name: auth_expose_insecure_global_id_reclaim
2197   type: bool
2198   level: advanced
2199   desc: Force older clients that may omit their ticket on reconnects to
2200     reconnect as part of establishing a session
2201   long_desc: 'In permissive mode (auth_allow_insecure_global_id_reclaim set
2202     to true), this helps with identifying clients that are not patched. In
2203     enforcing mode (auth_allow_insecure_global_id_reclaim set to false), this
2204     is a fail-fast mechanism: don''t establish a session that will almost
2205     inevitably be broken later.'
2206   default: true
2207   see_also:
2208   - mon_warn_on_insecure_global_id_reclaim
2209   - mon_warn_on_insecure_global_id_reclaim_allowed
2210   - auth_allow_insecure_global_id_reclaim
2211   with_legacy: true
2212 # if true, assert when weird things happen
2213 - name: auth_debug
2214   type: bool
2215   level: dev
2216   default: false
2217   with_legacy: true
2218 # how many mons to try to connect to in parallel during hunt
2219 - name: mon_client_hunt_parallel
2220   type: uint
2221   level: advanced
2222   default: 3
2223   with_legacy: true
2224 # try new mon every N seconds until we connect
2225 - name: mon_client_hunt_interval
2226   type: float
2227   level: advanced
2228   default: 3
2229   fmt_desc: The client will try a new monitor every ``N`` seconds until it
2230     establishes a connection.
2231   with_legacy: true
2232 # send logs every N seconds
2233 - name: mon_client_log_interval
2234   type: float
2235   level: advanced
2236   desc: How frequently we send queued cluster log messages to mon
2237   default: 1
2238   with_legacy: true
2239 # ping every N seconds
2240 - name: mon_client_ping_interval
2241   type: float
2242   level: advanced
2243   default: 10
2244   fmt_desc: The client will ping the monitor every ``N`` seconds.
2245   with_legacy: true
2246 # fail if we don't hear back
2247 - name: mon_client_ping_timeout
2248   type: float
2249   level: advanced
2250   default: 30
2251   with_legacy: true
2252 - name: mon_client_hunt_interval_backoff
2253   type: float
2254   level: advanced
2255   default: 1.5
2256   with_legacy: true
2257 - name: mon_client_hunt_interval_min_multiple
2258   type: float
2259   level: advanced
2260   default: 1
2261   with_legacy: true
2262 - name: mon_client_hunt_interval_max_multiple
2263   type: float
2264   level: advanced
2265   default: 10
2266   with_legacy: true
2267 - name: mon_client_max_log_entries_per_message
2268   type: int
2269   level: advanced
2270   default: 1000
2271   fmt_desc: The maximum number of log entries a monitor will generate
2272     per client message.
2273   with_legacy: true
2274 - name: mon_client_directed_command_retry
2275   type: int
2276   level: dev
2277   desc: Number of times to try sending a command directed at a specific monitor
2278   default: 2
2279   with_legacy: true
2280 # whitespace-separated list of key=value pairs describing crush location
2281 - name: crush_location
2282   type: str
2283   level: advanced
2284   with_legacy: true
2285 - name: crush_location_hook
2286   type: str
2287   level: advanced
2288   with_legacy: true
2289 - name: crush_location_hook_timeout
2290   type: int
2291   level: advanced
2292   default: 10
2293   with_legacy: true
2294 - name: objecter_tick_interval
2295   type: float
2296   level: dev
2297   default: 5
2298   with_legacy: true
2299 # before we ask for a map
2300 - name: objecter_timeout
2301   type: float
2302   level: advanced
2303   desc: Seconds before in-flight op is considered 'laggy' and we query mon for the
2304     latest OSDMap
2305   default: 10
2306   with_legacy: true
2307 - name: objecter_inflight_op_bytes
2308   type: size
2309   level: advanced
2310   desc: Max in-flight data in bytes (both directions)
2311   default: 100_M
2312   with_legacy: true
2313 - name: objecter_inflight_ops
2314   type: uint
2315   level: advanced
2316   desc: Max in-flight operations
2317   default: 1_K
2318   with_legacy: true
2319 # num of completion locks per each session, for serializing same object responses
2320 - name: objecter_completion_locks_per_session
2321   type: uint
2322   level: dev
2323   default: 32
2324   with_legacy: true
2325 # suppress watch pings
2326 - name: objecter_inject_no_watch_ping
2327   type: bool
2328   level: dev
2329   default: false
2330   with_legacy: true
2331 # ignore the first reply for each write, and resend the osd op instead
2332 - name: objecter_retry_writes_after_first_reply
2333   type: bool
2334   level: dev
2335   default: false
2336   with_legacy: true
2337 - name: objecter_debug_inject_relock_delay
2338   type: bool
2339   level: dev
2340   default: false
2341   with_legacy: true
2342 - name: filer_max_purge_ops
2343   type: uint
2344   level: advanced
2345   desc: Max in-flight operations for purging a striped range (e.g., MDS journal)
2346   default: 10
2347   with_legacy: true
2348 - name: filer_max_truncate_ops
2349   type: uint
2350   level: advanced
2351   desc: Max in-flight operations for truncating/deleting a striped sequence (e.g.,
2352     MDS journal)
2353   default: 128
2354   with_legacy: true
2355 - name: journaler_write_head_interval
2356   type: int
2357   level: advanced
2358   desc: Interval in seconds between journal header updates (to help bound replay time)
2359   default: 15
2360 # * journal object size
2361 - name: journaler_prefetch_periods
2362   type: uint
2363   level: advanced
2364   desc: Number of striping periods to prefetch while reading MDS journal
2365   default: 10
2366   # we need at least 2 periods to make progress.
2367   min: 2
2368 # * journal object size
2369 - name: journaler_prezero_periods
2370   type: uint
2371   level: advanced
2372   desc: Number of striping periods to zero head of MDS journal write position
2373   default: 5
2374   # we need to zero at least two periods, minimum, to ensure that we
2375   # have a full empty object/period in front of us.
2376   min: 2
2377 - name: osd_calc_pg_upmaps_aggressively
2378   type: bool
2379   level: advanced
2380   desc: try to calculate PG upmaps more aggressively, e.g., by doing a fairly exhaustive
2381     search of existing PGs that can be unmapped or upmapped
2382   default: true
2383   flags:
2384   - runtime
2385 - name: osd_calc_pg_upmaps_local_fallback_retries
2386   type: uint
2387   level: advanced
2388   desc: Maximum number of PGs we can attempt to unmap or upmap for a specific overfull
2389     or underfull osd per iteration
2390   default: 100
2391   flags:
2392   - runtime
2393 # 1 = host
2394 - name: osd_crush_chooseleaf_type
2395   type: int
2396   level: dev
2397   desc: default chooseleaf type for osdmaptool --create
2398   fmt_desc: The bucket type to use for ``chooseleaf`` in a CRUSH rule. Uses
2399     ordinal rank rather than name.
2400   default: 1
2401   flags:
2402   - cluster_create
2403   with_legacy: true
2404 # try to use gmt for hitset archive names if all osds in cluster support it
2405 - name: osd_pool_use_gmt_hitset
2406   type: bool
2407   level: dev
2408   desc: use UTC for hitset timestamps
2409   long_desc: This setting only exists for compatibility with hammer (and older) clusters.
2410   default: true
2411   with_legacy: true
2412 # whether to turn on fast read on the pool or not
2413 - name: osd_pool_default_ec_fast_read
2414   type: bool
2415   level: advanced
2416   desc: set ec_fast_read for new erasure-coded pools
2417   fmt_desc: Whether to turn on fast read on the pool or not. It will be used as
2418     the default setting of newly created erasure coded pools if ``fast_read``
2419     is not specified at create time.
2420   default: false
2421   services:
2422   - mon
2423   with_legacy: true
2424 - name: osd_pool_default_crush_rule
2425   type: int
2426   level: advanced
2427   desc: CRUSH rule for newly created pools
2428   fmt_desc: The default CRUSH rule to use when creating a replicated pool. The
2429     default value of ``-1`` means "pick the rule with the lowest numerical ID and
2430     use that".  This is to make pool creation work in the absence of rule 0.
2431   default: -1
2432   services:
2433   - mon
2434 - name: osd_pool_default_size
2435   type: uint
2436   level: advanced
2437   desc: the number of copies of an object for new replicated pools
2438   fmt_desc: Sets the number of replicas for objects in the pool. The default
2439     value is the same as
2440     ``ceph osd pool set {pool-name} size {size}``.
2441   default: 3
2442   services:
2443   - mon
2444   min: 0
2445   max: 10
2446   flags:
2447   - runtime
2448 - name: osd_pool_default_min_size
2449   type: uint
2450   level: advanced
2451   desc: the minimal number of copies allowed to write to a degraded pool for new replicated
2452     pools
2453   long_desc: 0 means no specific default; ceph will use size-size/2
2454   fmt_desc: Sets the minimum number of written replicas for objects in the
2455     pool in order to acknowledge an I/O operation to the client.  If
2456     minimum is not met, Ceph will not acknowledge the I/O to the
2457     client, **which may result in data loss**. This setting ensures
2458     a minimum number of replicas when operating in ``degraded`` mode.
2459     The default value is ``0`` which means no particular minimum. If ``0``,
2460     minimum is ``size - (size / 2)``.
2461   default: 0
2462   services:
2463   - mon
2464   see_also:
2465   - osd_pool_default_size
2466   min: 0
2467   max: 255
2468   flags:
2469   - runtime
2470 - name: osd_pool_default_pg_num
2471   type: uint
2472   level: advanced
2473   desc: number of PGs for new pools
2474   fmt_desc: The default number of placement groups for a pool. The default
2475     value is the same as ``pg_num`` with ``mkpool``.
2476   long_desc: With default value of `osd_pool_default_pg_autoscale_mode` being
2477     `on` the number of PGs for new pools will start out with 1 pg, unless the
2478     user specifies the pg_num.
2479   default: 32
2480   services:
2481   - mon
2482   see_also:
2483   - osd_pool_default_pg_autoscale_mode
2484   flags:
2485   - runtime
2486 - name: osd_pool_default_pgp_num
2487   type: uint
2488   level: advanced
2489   desc: number of PGs for placement purposes (0 to match pg_num)
2490   fmt_desc: The default number of placement groups for placement for a pool.
2491     The default value is the same as ``pgp_num`` with ``mkpool``.
2492     PG and PGP should be equal (for now).
2493   default: 0
2494   services:
2495   - mon
2496   see_also:
2497   - osd_pool_default_pg_num
2498   flags:
2499   - runtime
2500 - name: osd_pool_default_type
2501   type: str
2502   level: advanced
2503   desc: default type of pool to create
2504   default: replicated
2505   services:
2506   - mon
2507   enum_values:
2508   - replicated
2509   - erasure
2510   flags:
2511   - runtime
2512 - name: osd_pool_default_erasure_code_profile
2513   type: str
2514   level: advanced
2515   desc: default erasure code profile for new erasure-coded pools
2516   default: plugin=jerasure technique=reed_sol_van k=2 m=2
2517   services:
2518   - mon
2519   flags:
2520   - runtime
2521 - name: osd_erasure_code_plugins
2522   type: str
2523   level: advanced
2524   desc: erasure code plugins to load
2525   default: @osd_erasure_code_plugins@
2526   services:
2527   - mon
2528   - osd
2529   flags:
2530   - startup
2531   with_legacy: true
2532 - name: osd_pool_default_flags
2533   type: int
2534   level: dev
2535   desc: (integer) flags to set on new pools
2536   fmt_desc: The default flags for new pools.
2537   default: 0
2538   services:
2539   - mon
2540   with_legacy: true
2541 # use new pg hashing to prevent pool/pg overlap
2542 - name: osd_pool_default_flag_hashpspool
2543   type: bool
2544   level: advanced
2545   desc: set hashpspool (better hashing scheme) flag on new pools
2546   default: true
2547   services:
2548   - mon
2549   with_legacy: true
2550 # pool can't be deleted
2551 - name: osd_pool_default_flag_nodelete
2552   type: bool
2553   level: advanced
2554   desc: set nodelete flag on new pools
2555   fmt_desc: Set the ``nodelete`` flag on new pools, which prevents pool removal.
2556   default: false
2557   services:
2558   - mon
2559   with_legacy: true
2560 # pool's pg and pgp num can't be changed
2561 - name: osd_pool_default_flag_nopgchange
2562   type: bool
2563   level: advanced
2564   desc: set nopgchange flag on new pools
2565   fmt_desc: Set the ``nopgchange`` flag on new pools. Does not allow the number of PGs to be changed.
2566   default: false
2567   services:
2568   - mon
2569   with_legacy: true
2570 # pool's size and min size can't be changed
2571 - name: osd_pool_default_flag_nosizechange
2572   type: bool
2573   level: advanced
2574   desc: set nosizechange flag on new pools
2575   fmt_desc: Set the ``nosizechange`` flag on new pools. Does not allow the ``size`` to be changed.
2576   default: false
2577   services:
2578   - mon
2579   with_legacy: true
2580 - name: osd_pool_default_flag_bulk
2581   type: bool
2582   level: advanced
2583   desc: set bulk flag on new pools
2584   fmt_desc: Set the ``bulk`` flag on new pools. Allows the autoscaler to use scale-down mode.
2585   default: false
2586   services:
2587   - mon
2588   with_legacy: true
2589 - name: osd_pool_default_hit_set_bloom_fpp
2590   type: float
2591   level: advanced
2592   default: 0.05
2593   services:
2594   - mon
2595   see_also:
2596   - osd_tier_default_cache_hit_set_type
2597   with_legacy: true
2598 - name: osd_pool_default_cache_target_dirty_ratio
2599   type: float
2600   level: advanced
2601   default: 0.4
2602   with_legacy: true
2603 - name: osd_pool_default_cache_target_dirty_high_ratio
2604   type: float
2605   level: advanced
2606   default: 0.6
2607   with_legacy: true
2608 - name: osd_pool_default_cache_target_full_ratio
2609   type: float
2610   level: advanced
2611   default: 0.8
2612   with_legacy: true
2613 # seconds
2614 - name: osd_pool_default_cache_min_flush_age
2615   type: int
2616   level: advanced
2617   default: 0
2618   with_legacy: true
2619 # seconds
2620 - name: osd_pool_default_cache_min_evict_age
2621   type: int
2622   level: advanced
2623   default: 0
2624   with_legacy: true
2625 # max size to check for eviction
2626 - name: osd_pool_default_cache_max_evict_check_size
2627   type: int
2628   level: advanced
2629   default: 10
2630   with_legacy: true
2631 - name: osd_pool_default_pg_autoscale_mode
2632   type: str
2633   level: advanced
2634   desc: Default PG autoscaling behavior for new pools
2635   long_desc: With default value `on`, the autoscaler starts a new pool with 1
2636     pg, unless the user specifies the pg_num.
2637   default: 'on'
2638   enum_values:
2639   - 'off'
2640   - 'warn'
2641   - 'on'
2642   flags:
2643   - runtime
2644 - name: osd_pool_default_read_lease_ratio
2645   type: float
2646   level: dev
2647   desc: Default read_lease_ratio for a pool, as a multiple of osd_heartbeat_grace
2648   long_desc: This should be <= 1.0 so that the read lease will have expired by the
2649     time we decide to mark a peer OSD down.
2650   default: 0.8
2651   see_also:
2652   - osd_heartbeat_grace
2653   flags:
2654   - runtime
2655   with_legacy: true
2656 # min target size for a HitSet
2657 - name: osd_hit_set_min_size
2658   type: int
2659   level: advanced
2660   default: 1000
2661   with_legacy: true
2662 # max target size for a HitSet
2663 - name: osd_hit_set_max_size
2664   type: int
2665   level: advanced
2666   default: 100000
2667   with_legacy: true
2668 # rados namespace for hit_set tracking
2669 - name: osd_hit_set_namespace
2670   type: str
2671   level: advanced
2672   default: .ceph-internal
2673   with_legacy: true
2674 # conservative default throttling values
2675 - name: osd_tier_promote_max_objects_sec
2676   type: uint
2677   level: advanced
2678   default: 25
2679   with_legacy: true
2680 - name: osd_tier_promote_max_bytes_sec
2681   type: size
2682   level: advanced
2683   default: 5_M
2684   with_legacy: true
2685 - name: osd_tier_default_cache_mode
2686   type: str
2687   level: advanced
2688   default: writeback
2689   enum_values:
2690   - none
2691   - writeback
2692   - forward
2693   - readonly
2694   - readforward
2695   - readproxy
2696   - proxy
2697   flags:
2698   - runtime
2699 - name: osd_tier_default_cache_hit_set_count
2700   type: uint
2701   level: advanced
2702   default: 4
2703 - name: osd_tier_default_cache_hit_set_period
2704   type: uint
2705   level: advanced
2706   default: 1200
2707 - name: osd_tier_default_cache_hit_set_type
2708   type: str
2709   level: advanced
2710   default: bloom
2711   enum_values:
2712   - bloom
2713   - explicit_hash
2714   - explicit_object
2715   flags:
2716   - runtime
2717 - name: osd_tier_default_cache_min_read_recency_for_promote
2718   type: uint
2719   level: advanced
2720   desc: number of recent HitSets the object must appear in to be promoted (on read)
2721   default: 1
2722 - name: osd_tier_default_cache_min_write_recency_for_promote
2723   type: uint
2724   level: advanced
2725   desc: number of recent HitSets the object must appear in to be promoted (on write)
2726   default: 1
2727 - name: osd_tier_default_cache_hit_set_grade_decay_rate
2728   type: uint
2729   level: advanced
2730   default: 20
2731 - name: osd_tier_default_cache_hit_set_search_last_n
2732   type: uint
2733   level: advanced
2734   default: 1
2735 - name: osd_objecter_finishers
2736   type: int
2737   level: advanced
2738   default: 1
2739   flags:
2740   - startup
2741   with_legacy: true
2742 - name: osd_map_dedup
2743   type: bool
2744   level: advanced
2745   default: true
2746   fmt_desc: Enable removing duplicates in the OSD map.
2747   with_legacy: true
2748 - name: osd_map_message_max
2749   type: int
2750   level: advanced
2751   desc: maximum number of OSDMaps to include in a single message
2752   fmt_desc: The maximum map entries allowed per MOSDMap message.
2753   default: 40
2754   services:
2755   - osd
2756   - mon
2757   with_legacy: true
2758 - name: osd_map_message_max_bytes
2759   type: size
2760   level: advanced
2761   desc: maximum number of bytes worth of OSDMaps to include in a single message
2762   default: 10_M
2763   services:
2764   - osd
2765   - mon
2766   with_legacy: true
2767 # do not assert on divergent_prior entries which aren't in the log and whose on-disk objects are newer
2768 - name: osd_ignore_stale_divergent_priors
2769   type: bool
2770   level: advanced
2771   default: false
2772   with_legacy: true
2773 - name: osd_heartbeat_interval
2774   type: int
2775   level: dev
2776   desc: Interval (in seconds) between peer pings
2777   fmt_desc: How often a Ceph OSD Daemon pings its peers (in seconds).
2778   default: 6
2779   min: 1
2780   max: 1_min
2781   with_legacy: true
2782 # (seconds) how long before we decide a peer has failed
2783 # This setting is read by both the MONs and the OSDs, so it must be set to the same value in the [mon] and [osd] sections (or once in [global])
2784 - name: osd_heartbeat_grace
2785   type: int
2786   level: advanced
2787   default: 20
2788   fmt_desc: The elapsed time (in seconds) without a heartbeat from a Ceph OSD
2789     Daemon after which the Ceph Storage Cluster considers it ``down``.
2790     This setting must be set in both the [mon] and [osd] or [global]
2791     sections so that it is read by both monitor and OSD daemons.
2792   with_legacy: true
2793 - name: osd_heartbeat_stale
2794   type: int
2795   level: advanced
2796   desc: Interval (in seconds) we mark an unresponsive heartbeat peer as stale.
2797   long_desc: Automatically mark unresponsive heartbeat sessions as stale and tear
2798     them down. The primary benefit is that OSD doesn't need to keep a flood of blocked
2799     heartbeat messages around in memory.
2800   default: 10_min
2801 # prio the heartbeat tcp socket and set dscp as CS6 on it if true
2802 - name: osd_heartbeat_use_min_delay_socket
2803   type: bool
2804   level: advanced
2805   default: false
2806   with_legacy: true
2807 # the minimum size of OSD heartbeat messages to send
2808 - name: osd_heartbeat_min_size
2809   type: size
2810   level: advanced
2811   desc: Minimum heartbeat packet size in bytes. Will add dummy payload if heartbeat
2812     packet is smaller than this.
2813   default: 2000
2814   with_legacy: true
2815 # max number of parallel snap trims/pg
2816 - name: osd_pg_max_concurrent_snap_trims
2817   type: uint
2818   level: advanced
2819   default: 2
2820   min: 1
2821   with_legacy: true
2822 # max number of trimming pgs
2823 - name: osd_max_trimming_pgs
2824   type: uint
2825   level: advanced
2826   default: 2
2827   with_legacy: true
2828 # minimum number of peers that must be reachable to mark ourselves
2829 # back up after being wrongly marked down.
2830 - name: osd_heartbeat_min_healthy_ratio
2831   type: float
2832   level: advanced
2833   default: 0.33
2834   with_legacy: true
2835 # (seconds) how often to ping monitor if no peers
2836 - name: osd_mon_heartbeat_interval
2837   type: int
2838   level: advanced
2839   default: 30
2840   fmt_desc: How often the Ceph OSD Daemon pings a Ceph Monitor if it has no
2841     Ceph OSD Daemon peers.
2842   with_legacy: true
2843 - name: osd_mon_heartbeat_stat_stale
2844   type: int
2845   level: advanced
2846   desc: Stop reporting on heartbeat ping times not updated for this many seconds.
2847   long_desc: Stop reporting on old heartbeat information unless this is set to zero
2848   fmt_desc: Stop reporting on heartbeat ping times which haven't been updated for
2849     this many seconds.  Set to zero to disable this action.
2850   default: 1_hr
2851 # failures, up_thru, boot.
2852 - name: osd_mon_report_interval
2853   type: int
2854   level: advanced
2855   desc: Frequency of OSD reports to mon for peer failures, fullness status changes
2856   fmt_desc: The number of seconds a Ceph OSD Daemon may wait from startup
2857     or another reportable event before reporting to a
2858     Ceph Monitor.
2859   default: 5
2860   with_legacy: true
2861 # max updates in flight
2862 - name: osd_mon_report_max_in_flight
2863   type: int
2864   level: advanced
2865   default: 2
2866   with_legacy: true
2867 # (seconds) how often to send beacon message to monitor
2868 - name: osd_beacon_report_interval
2869   type: int
2870   level: advanced
2871   default: 5_min
2872   with_legacy: true
2873 # report pg stats for any given pg at least this often
2874 - name: osd_pg_stat_report_interval_max
2875   type: int
2876   level: advanced
2877   default: 500
2878   with_legacy: true
2879 # Max number of snap intervals to report to mgr in pg_stat_t
2880 - name: osd_max_snap_prune_intervals_per_epoch
2881   type: uint
2882   level: dev
2883   desc: Max number of snap intervals to report to mgr in pg_stat_t
2884   default: 512
2885   with_legacy: true
2886 - name: osd_default_data_pool_replay_window
2887   type: int
2888   level: advanced
2889   default: 45
2890   fmt_desc: The time (in seconds) for an OSD to wait for a client to replay
2891     a request.
2892 - name: osd_auto_mark_unfound_lost
2893   type: bool
2894   level: advanced
2895   default: false
2896   with_legacy: true
2897 - name: osd_check_for_log_corruption
2898   type: bool
2899   level: advanced
2900   default: false
2901   fmt_desc: Check log files for corruption. Can be computationally expensive.
2902   with_legacy: true
2903 - name: osd_use_stale_snap
2904   type: bool
2905   level: advanced
2906   default: false
2907   with_legacy: true
2908 - name: osd_rollback_to_cluster_snap
2909   type: str
2910   level: advanced
2911   with_legacy: true
2912 - name: osd_default_notify_timeout
2913   type: uint
2914   level: advanced
2915   desc: default number of seconds after which notify propagation times out. used if
2916     a client has not specified other value
2917   fmt_desc: The OSD default notification timeout (in seconds).
2918   default: 30
2919   with_legacy: true
2920 - name: osd_kill_backfill_at
2921   type: int
2922   level: dev
2923   default: 0
2924   with_legacy: true
2925 # Bounds how infrequently a new map epoch will be persisted for a pg
2926 # make this < map_cache_size!
2927 - name: osd_pg_epoch_persisted_max_stale
2928   type: uint
2929   level: advanced
2930   default: 40
2931   with_legacy: true
2932 - name: osd_target_pg_log_entries_per_osd
2933   type: uint
2934   level: dev
2935   desc: target number of PG entries total on an OSD - limited per pg by the min and
2936     max options below
2937   default: 300000
2938   see_also:
2939   - osd_max_pg_log_entries
2940   - osd_min_pg_log_entries
2941   with_legacy: true
2942 - name: osd_min_pg_log_entries
2943   type: uint
2944   level: dev
2945   desc: minimum number of entries to maintain in the PG log
2946   fmt_desc: The minimum number of placement group logs to maintain
2947     when trimming log files.
2948   default: 250
2949   services:
2950   - osd
2951   see_also:
2952   - osd_max_pg_log_entries
2953   - osd_pg_log_dups_tracked
2954   - osd_target_pg_log_entries_per_osd
2955   with_legacy: true
2956 - name: osd_max_pg_log_entries
2957   type: uint
2958   level: dev
2959   desc: maximum number of entries to maintain in the PG log
2960   fmt_desc: The maximum number of placement group logs to maintain
2961     when trimming log files.
2962   default: 10000
2963   services:
2964   - osd
2965   see_also:
2966   - osd_min_pg_log_entries
2967   - osd_pg_log_dups_tracked
2968   - osd_target_pg_log_entries_per_osd
2969   with_legacy: true
2970 - name: osd_pg_log_dups_tracked
2971   type: uint
2972   level: dev
2973   desc: how many versions back to track in order to detect duplicate ops; this is
2974     combined with both the regular pg log entries and additional minimal dup detection
2975     entries
2976   default: 3000
2977   services:
2978   - osd
2979   see_also:
2980   - osd_min_pg_log_entries
2981   - osd_max_pg_log_entries
2982   with_legacy: true
2983 - name: osd_object_clean_region_max_num_intervals
2984   type: int
2985   level: dev
2986   desc: number of intervals in clean_offsets
2987   long_desc: partial recovery uses multiple intervals to record the clean part of
2988     the object. When the number of intervals is greater than osd_object_clean_region_max_num_intervals,
2989     the minimum interval will be trimmed (0 will recover the entire object data interval)
2990   default: 10
2991   services:
2992   - osd
2993   with_legacy: true
2994 # max entries factor before force recovery
2995 - name: osd_force_recovery_pg_log_entries_factor
2996   type: float
2997   level: dev
2998   default: 1.3
2999   with_legacy: true
3000 - name: osd_pg_log_trim_min
3001   type: uint
3002   level: dev
3003   desc: Minimum number of log entries to trim at once. This lets us trim in larger
3004     batches rather than with each write.
3005   default: 100
3006   see_also:
3007   - osd_max_pg_log_entries
3008   - osd_min_pg_log_entries
3009   with_legacy: true
3010 - name: osd_force_auth_primary_missing_objects
3011   type: uint
3012   level: advanced
3013   desc: Approximate missing objects above which to force auth_log_shard to be primary
3014     temporarily
3015   default: 100
3016 - name: osd_async_recovery_min_cost
3017   type: uint
3018   level: advanced
3019   desc: A combined measure of the current log entry difference and the historical
3020     missing objects, above which we switch to asynchronous recovery when appropriate
3021   default: 100
3022   flags:
3023   - runtime
3024 - name: osd_max_pg_per_osd_hard_ratio
3025   type: float
3026   level: advanced
3027   desc: Maximum number of PG per OSD, a factor of 'mon_max_pg_per_osd'
3028   long_desc: OSD will refuse to instantiate PG if the number of PG it serves exceeds
3029     this number.
3030   fmt_desc: The ratio of number of PGs per OSD allowed by the cluster before the
3031     OSD refuses to create new PGs. An OSD stops creating new PGs if the number
3032     of PGs it serves exceeds
3033     ``osd_max_pg_per_osd_hard_ratio`` \* ``mon_max_pg_per_osd``.
3034   default: 3
3035   see_also:
3036   - mon_max_pg_per_osd
3037   min: 1
3038 - name: osd_pg_log_trim_max
3039   type: uint
3040   level: advanced
3041   desc: maximum number of entries to remove at once from the PG log
3042   default: 10000
3043   services:
3044   - osd
3045   see_also:
3046   - osd_min_pg_log_entries
3047   - osd_max_pg_log_entries
3048   with_legacy: true
3049 # how many seconds old makes an op complaint-worthy
3050 - name: osd_op_complaint_time
3051   type: float
3052   level: advanced
3053   default: 30
3054   fmt_desc: An operation becomes complaint worthy after the specified number
3055     of seconds have elapsed.
3056   with_legacy: true
3057 - name: osd_command_max_records
3058   type: int
3059   level: advanced
3060   default: 256
3061   fmt_desc: Limits the number of lost objects to return.
3062   with_legacy: true
3063 # max peer osds to report that are blocking our progress
3064 - name: osd_max_pg_blocked_by
3065   type: uint
3066   level: advanced
3067   default: 16
3068   with_legacy: true
3069 - name: osd_op_log_threshold
3070   type: int
3071   level: advanced
3072   default: 5
3073   fmt_desc: How many operation logs to display at once.
3074   with_legacy: true
3075 - name: osd_backoff_on_unfound
3076   type: bool
3077   level: advanced
3078   default: true
3079   with_legacy: true
3080 # [mainly for debug?] object unreadable/writeable
3081 - name: osd_backoff_on_degraded
3082   type: bool
3083   level: advanced
3084   default: false
3085   with_legacy: true
3086 # [debug] pg peering
3087 - name: osd_backoff_on_peering
3088   type: bool
3089   level: advanced
3090   default: false
3091   with_legacy: true
3092 - name: osd_debug_shutdown
3093   type: bool
3094   level: dev
3095   desc: Turn up debug levels during shutdown
3096   default: false
3097   with_legacy: true
3098 # crash osd if client ignores a backoff; useful for debugging
3099 - name: osd_debug_crash_on_ignored_backoff
3100   type: bool
3101   level: dev
3102   default: false
3103   with_legacy: true
3104 - name: osd_debug_inject_dispatch_delay_probability
3105   type: float
3106   level: dev
3107   default: 0
3108   with_legacy: true
3109 - name: osd_debug_inject_dispatch_delay_duration
3110   type: float
3111   level: dev
3112   default: 0.1
3113   with_legacy: true
3114 - name: osd_debug_drop_ping_probability
3115   desc: N/A
3116   type: float
3117   level: dev
3118   default: 0
3119   with_legacy: true
3120 - name: osd_debug_drop_ping_duration
3121   desc: N/A
3122   type: int
3123   level: dev
3124   default: 0
3125   with_legacy: true
3126 - name: osd_debug_op_order
3127   type: bool
3128   level: dev
3129   default: false
3130   with_legacy: true
3131 - name: osd_debug_verify_missing_on_start
3132   type: bool
3133   level: dev
3134   default: false
3135   with_legacy: true
3136 - name: osd_debug_verify_snaps
3137   type: bool
3138   level: dev
3139   default: false
3140   with_legacy: true
3141 - name: osd_debug_verify_stray_on_activate
3142   type: bool
3143   level: dev
3144   default: false
3145   with_legacy: true
3146 - name: osd_debug_skip_full_check_in_backfill_reservation
3147   type: bool
3148   level: dev
3149   default: false
3150   with_legacy: true
3151 - name: osd_debug_reject_backfill_probability
3152   type: float
3153   level: dev
3154   default: 0
3155   with_legacy: true
3156 # inject failure during copyfrom completion
3157 - name: osd_debug_inject_copyfrom_error
3158   type: bool
3159   level: dev
3160   default: false
3161   with_legacy: true
3162 - name: osd_debug_misdirected_ops
3163   type: bool
3164   level: dev
3165   default: false
3166   with_legacy: true
3167 - name: osd_debug_skip_full_check_in_recovery
3168   type: bool
3169   level: dev
3170   default: false
3171   with_legacy: true
3172 - name: osd_debug_random_push_read_error
3173   type: float
3174   level: dev
3175   default: 0
3176   with_legacy: true
3177 - name: osd_debug_verify_cached_snaps
3178   type: bool
3179   level: dev
3180   default: false
3181   with_legacy: true
3182 - name: osd_debug_deep_scrub_sleep
3183   type: float
3184   level: dev
3185   desc: Inject an expensive sleep during deep scrub IO to make it easier to induce
3186     preemption
3187   default: 0
3188   with_legacy: true
3189 - name: osd_debug_no_acting_change
3190   type: bool
3191   level: dev
3192   default: false
3193   with_legacy: true
3194 - name: osd_debug_no_purge_strays
3195   type: bool
3196   level: dev
3197   default: false
3198   with_legacy: true
3199 - name: osd_debug_pretend_recovery_active
3200   type: bool
3201   level: dev
3202   default: false
3203   with_legacy: true
3204 # enable/disable OSD op tracking
3205 - name: osd_enable_op_tracker
3206   type: bool
3207   level: advanced
3208   default: true
3209   with_legacy: true
3210 # The number of shards for holding the ops
3211 - name: osd_num_op_tracker_shard
3212   type: uint
3213   level: advanced
3214   default: 32
3215   with_legacy: true
3216 # Max number of completed ops to track
3217 - name: osd_op_history_size
3218   type: uint
3219   level: advanced
3220   default: 20
3221   fmt_desc: The maximum number of completed operations to track.
3222   with_legacy: true
3223 # Oldest completed op to track
3224 - name: osd_op_history_duration
3225   type: uint
3226   level: advanced
3227   default: 600
3228   fmt_desc: The oldest completed operation to track.
3229   with_legacy: true
3230 # Max number of slow ops to track
3231 - name: osd_op_history_slow_op_size
3232   type: uint
3233   level: advanced
3234   default: 20
3235   with_legacy: true
3236 # track the op if over this threshold
3237 - name: osd_op_history_slow_op_threshold
3238   type: float
3239   level: advanced
3240   default: 10
3241   with_legacy: true
3242 # to adjust various transactions that batch smaller items
3243 - name: osd_target_transaction_size
3244   type: int
3245   level: advanced
3246   default: 30
3247   with_legacy: true
3248 # what % full makes an OSD "full" (failsafe)
3249 - name: osd_failsafe_full_ratio
3250   type: float
3251   level: advanced
3252   default: 0.97
3253   with_legacy: true
3254 - name: osd_fast_shutdown
3255   type: bool
3256   level: advanced
3257   desc: Fast, immediate shutdown
3258   long_desc: Setting this to false makes the OSD do a slower teardown of all state
3259     when it receives a SIGINT or SIGTERM or when shutting down for any other reason.  That
3260     slow shutdown is primarily useful for doing memory leak checking with valgrind.
3261   default: true
3262   with_legacy: true
3263 - name: osd_fast_shutdown_timeout
3264   type: int
3265   level: advanced
3266   desc: timeout in seconds for osd fast-shutdown (0 is unlimited)
3267   default: 15
3268   with_legacy: true
3269   min: 0
3270 - name: osd_fast_shutdown_notify_mon
3271   type: bool
3272   level: advanced
3273   desc: Tell mon about OSD shutdown on immediate shutdown
3274   long_desc: Tell the monitor the OSD is shutting down on immediate shutdown. This
3275     helps with cluster log messages from other OSDs reporting it immediately failed.
3276   default: true
3277   see_also:
3278   - osd_fast_shutdown
3279   - osd_mon_shutdown_timeout
3280   with_legacy: true
3281 # immediately mark OSDs as down once they refuse to accept connections
3282 - name: osd_fast_fail_on_connection_refused
3283   type: bool
3284   level: advanced
3285   default: true
3286   fmt_desc: If this option is enabled, crashed OSDs are marked down
3287     immediately by connected peers and MONs (assuming that the
3288     crashed OSD host survives). Disable it to restore old
3289     behavior, at the expense of possible long I/O stalls when
3290     OSDs crash in the middle of I/O operations.
3291   with_legacy: true
3292 - name: osd_pg_object_context_cache_count
3293   type: int
3294   level: advanced
3295   default: 64
3296   with_legacy: true
3297 # true if LTTng-UST tracepoints should be enabled
3298 - name: osd_tracing
3299   type: bool
3300   level: advanced
3301   default: false
3302   with_legacy: true
3303 # true if function instrumentation should use LTTng
3304 - name: osd_function_tracing
3305   type: bool
3306   level: advanced
3307   default: false
3308   with_legacy: true
3309 # use fast info attr, if we can
3310 - name: osd_fast_info
3311   type: bool
3312   level: advanced
3313   default: true
3314   with_legacy: true
3315 # determines whether PGLog::check() compares written out log to stored log
3316 - name: osd_debug_pg_log_writeout
3317   type: bool
3318   level: dev
3319   default: false
3320   with_legacy: true
3321 # Max number of loops before we reset thread-pool's handle
3322 - name: osd_loop_before_reset_tphandle
3323   type: uint
3324   level: advanced
3325   default: 64
3326   with_legacy: true
3327 # default timeout while calling WaitInterval on an empty queue
3328 - name: threadpool_default_timeout
3329   type: int
3330   level: advanced
3331   default: 1_min
3332   with_legacy: true
3333 # default wait time for an empty queue before pinging the hb timeout
3334 - name: threadpool_empty_queue_max_wait
3335   type: int
3336   level: advanced
3337   default: 2
3338   with_legacy: true
3339 - name: leveldb_log_to_ceph_log
3340   type: bool
3341   level: advanced
3342   default: true
3343   with_legacy: true
3344 - name: leveldb_write_buffer_size
3345   type: size
3346   level: advanced
3347   default: 8_M
3348   with_legacy: true
3349 - name: leveldb_cache_size
3350   type: size
3351   level: advanced
3352   default: 128_M
3353   with_legacy: true
3354 - name: leveldb_block_size
3355   type: size
3356   level: advanced
3357   default: 0
3358   with_legacy: true
3359 - name: leveldb_bloom_size
3360   type: int
3361   level: advanced
3362   default: 0
3363   with_legacy: true
3364 - name: leveldb_max_open_files
3365   type: int
3366   level: advanced
3367   default: 0
3368   with_legacy: true
3369 - name: leveldb_compression
3370   type: bool
3371   level: advanced
3372   default: true
3373   with_legacy: true
3374 - name: leveldb_paranoid
3375   type: bool
3376   level: advanced
3377   default: false
3378   with_legacy: true
3379 - name: leveldb_log
3380   type: str
3381   level: advanced
3382   default: /dev/null
3383   with_legacy: true
3384 - name: leveldb_compact_on_mount
3385   type: bool
3386   level: advanced
3387   default: false
3388   with_legacy: true
3389 - name: rocksdb_log_to_ceph_log
3390   type: bool
3391   level: advanced
3392   default: true
3393   with_legacy: true
3394 - name: rocksdb_cache_size
3395   type: size
3396   level: advanced
3397   default: 512_M
3398   flags:
3399   - runtime
3400   with_legacy: true
3401 # ratio of cache for row (vs block)
3402 - name: rocksdb_cache_row_ratio
3403   type: float
3404   level: advanced
3405   default: 0
3406   with_legacy: true
3407 # rocksdb block cache shard bits, 4 bit -> 16 shards
3408 - name: rocksdb_cache_shard_bits
3409   type: int
3410   level: advanced
3411   default: 4
3412   with_legacy: true
3413 # 'binned_lru', 'lru' or 'clock'
3414 - name: rocksdb_cache_type
3415   type: str
3416   level: advanced
3417   default: binned_lru
3418   with_legacy: true
3419 - name: rocksdb_block_size
3420   type: size
3421   level: advanced
3422   default: 4_K
3423   with_legacy: true
3424 # Enabling this will have 5-10% impact on performance for the stats collection
3425 - name: rocksdb_perf
3426   type: bool
3427   level: advanced
3428   default: false
3429   with_legacy: true
3430 # For rocksdb, this behavior will add an overhead of 5%~10%; stats are collected only if rocksdb_perf is enabled.
3431 - name: rocksdb_collect_compaction_stats
3432   type: bool
3433   level: advanced
3434   default: false
3435   with_legacy: true
3436 # For rocksdb, this behavior will add an overhead of 5%~10%; stats are collected only if rocksdb_perf is enabled.
3437 - name: rocksdb_collect_extended_stats
3438   type: bool
3439   level: advanced
3440   default: false
3441   with_legacy: true
3442 # For rocksdb, this behavior will add an overhead of 5%~10%; stats are collected only if rocksdb_perf is enabled.
3443 - name: rocksdb_collect_memory_stats
3444   type: bool
3445   level: advanced
3446   default: false
3447   with_legacy: true
3448 - name: rocksdb_delete_range_threshold
3449   type: uint
3450   level: advanced
3451   desc: The number of keys required to invoke DeleteRange when deleting multiple keys.
3452   default: 1_M
3453 - name: rocksdb_bloom_bits_per_key
3454   type: uint
3455   level: advanced
3456   desc: Number of bits per key to use for RocksDB's bloom filters.
3457   long_desc: 'RocksDB bloom filters can be used to quickly answer the question of
3458     whether or not a key may exist or definitely does not exist in a given RocksDB
3459     SST file without having to read all keys into memory.  Using a higher bit value
3460     decreases the likelihood of false positives at the expense of additional disk
3461     space and memory consumption when the filter is loaded into RAM.  The current
3462     default value of 20 was found to provide significant performance gains when getattr
3463     calls are made (such as during new object creation in bluestore) without significant
3464     memory overhead or cache pollution when combined with rocksdb partitioned index
3465     filters.  See: https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters
3466     for more information.'
3467   default: 20
3468 - name: rocksdb_cache_index_and_filter_blocks
3469   type: bool
3470   level: dev
3471   desc: Whether to cache indices and filters in block cache
3472   long_desc: By default RocksDB will load an SST file's index and bloom filters into
3473     memory when it is opened and remove them from memory when an SST file is closed.  Thus,
3474     memory consumption by indices and bloom filters is directly tied to the number
3475     of concurrent SST files allowed to be kept open.  This option instead stores cached
3476     indices and filters in the block cache where they directly compete with other
3477     cached data.  By default we set this option to true to better account for and
3478     bound rocksdb memory usage and keep filters in memory even when an SST file is
3479     closed.
3480   default: true
3481 - name: rocksdb_cache_index_and_filter_blocks_with_high_priority
3482   type: bool
3483   level: dev
3484   desc: Whether to cache indices and filters in the block cache with high priority
3485   long_desc: A downside of setting rocksdb_cache_index_and_filter_blocks to true is
3486     that regular data can push indices and filters out of memory.  Setting this option
3487     to true means they are cached with higher priority than other data and should
3488     typically stay in the block cache.
3489   default: false
3490 - name: rocksdb_pin_l0_filter_and_index_blocks_in_cache
3491   type: bool
3492   level: dev
3493   desc: Whether to pin Level 0 indices and bloom filters in the block cache
3494   long_desc: A downside of setting rocksdb_cache_index_and_filter_blocks to true is
3495     that regular data can push indices and filters out of memory.  Setting this option
3496     to true means that level 0 SST files will always have their indices and filters
3497     pinned in the block cache.
3498   default: false
3499 - name: rocksdb_index_type
3500   type: str
3501   level: dev
3502   desc: 'Type of index for SST files: binary_search, hash_search, two_level'
3503   long_desc: 'This option controls the table index type.  binary_search is a space
3504     efficient index block that is optimized for block-search-based index. hash_search
3505     may improve prefix lookup performance at the expense of higher disk and memory
3506     usage and potentially slower compactions.  two_level is an experimental index
3507     type that uses two binary search indexes and works in conjunction with partition
3508     filters.  See: http://rocksdb.org/blog/2017/05/12/partitioned-index-filter.html'
3509   default: binary_search
3510 - name: rocksdb_partition_filters
3511   type: bool
3512   level: dev
3513   desc: (experimental) partition SST index/filters into smaller blocks
3514   long_desc: 'This is an experimental option for rocksdb that works in conjunction
3515     with two_level indices to avoid having to keep the entire filter/index in cache
3516     when cache_index_and_filter_blocks is true.  The idea is to keep a much smaller
3517     top-level index in heap/cache and then opportunistically cache the lower level
3518     indices.  See: https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters'
3519   default: false
3520 - name: rocksdb_metadata_block_size
3521   type: size
3522   level: dev
3523   desc: The block size for index partitions. (0 = rocksdb default)
3524   default: 4_K
3525 # osd_*_priority adjust the relative priority of client io, recovery io,
3526 # snaptrim io, etc
3527 #
3528 # osd_*_priority determines the ratio of available io between client and
3529 # recovery.  Each option may be set between
3530 # 1..63.
3531 - name: osd_client_op_priority
3532   type: uint
3533   level: advanced
3534   default: 63
3535   fmt_desc: The priority set for client operations.  This value is relative
3536     to that of ``osd_recovery_op_priority`` below.  The default
3537     strongly favors client ops over recovery.
3538   with_legacy: true
3539 - name: osd_recovery_op_priority
3540   type: uint
3541   level: advanced
3542   desc: Priority to use for recovery operations if not specified for the pool
3543   fmt_desc: The priority of recovery operations vs client operations, if not specified by the
3544     pool's ``recovery_op_priority``.  The default value prioritizes client
3545     ops (see above) over recovery ops.  You may adjust the tradeoff of client
3546     impact against the time to restore cluster health by lowering this value
3547     for increased prioritization of client ops, or by increasing it to favor
3548     recovery.
3549   default: 3
3550   with_legacy: true
3551 - name: osd_peering_op_priority
3552   type: uint
3553   level: dev
3554   default: 255
3555   with_legacy: true
3556 - name: osd_snap_trim_priority
3557   type: uint
3558   level: advanced
3559   default: 5
3560   fmt_desc: The priority set for the snap trim work queue.
3561   with_legacy: true
3562 - name: osd_snap_trim_cost
3563   type: size
3564   level: advanced
3565   default: 1_M
3566   with_legacy: true
3567 - name: osd_pg_delete_priority
3568   type: uint
3569   level: advanced
3570   default: 5
3571   with_legacy: true
3572 - name: osd_pg_delete_cost
3573   type: size
3574   level: advanced
3575   default: 1_M
3576   with_legacy: true
3577 - name: osd_scrub_priority
3578   type: uint
3579   level: advanced
3580   desc: Priority for scrub operations in work queue
3581   fmt_desc: The default work queue priority for scheduled scrubs when the
3582     pool doesn't specify a value of ``scrub_priority``.  This can be
3583     boosted to the value of ``osd_client_op_priority`` when scrubs are
3584     blocking client operations.
3585   default: 5
3586   with_legacy: true
3587 - name: osd_scrub_cost
3588   type: size
3589   level: advanced
3590   desc: Cost for scrub operations in work queue
3591   default: 50_M
3592   with_legacy: true
3593 # set requested scrub priority higher than scrub priority to make the
3594 # requested scrubs jump the queue of scheduled scrubs
3595 - name: osd_requested_scrub_priority
3596   type: uint
3597   level: advanced
3598   default: 120
3599   fmt_desc: The priority set for user requested scrub on the work queue.  If
3600     this value were to be smaller than ``osd_client_op_priority`` it
3601     can be boosted to the value of ``osd_client_op_priority`` when
3602     scrub is blocking client operations.
3603   with_legacy: true
3604 - name: osd_recovery_priority
3605   type: uint
3606   level: advanced
3607   desc: Priority of recovery in the work queue
3608   long_desc: Not related to a pool's recovery_priority
3609   fmt_desc: The default priority set for recovery work queue.  Not
3610     related to a pool's ``recovery_priority``.
3611   default: 5
3612   with_legacy: true
3613 # set default cost equal to 20MB io
3614 - name: osd_recovery_cost
3615   type: size
3616   level: advanced
3617   default: 20_M
3618   with_legacy: true
3619 # osd_recovery_op_warn_multiple scales the normal warning threshold,
3620 # osd_op_complaint_time, so that slow recovery ops won't cause noise
3621 - name: osd_recovery_op_warn_multiple
3622   type: uint
3623   level: advanced
3624   default: 16
3625   with_legacy: true
3626 # Max time to wait between notifying mon of shutdown and shutting down
3627 - name: osd_mon_shutdown_timeout
3628   type: float
3629   level: advanced
3630   default: 5
3631   with_legacy: true
3632 # crash if the OSD has stray PG refs on shutdown
3633 - name: osd_shutdown_pgref_assert
3634   type: bool
3635   level: advanced
3636   default: false
3637   with_legacy: true
3638 # OSD's maximum object size
3639 - name: osd_max_object_size
3640   type: size
3641   level: advanced
3642   default: 128_M
3643   fmt_desc: The maximum size of a RADOS object in bytes.
3644   with_legacy: true
3645 # max rados object name len
3646 - name: osd_max_object_name_len
3647   type: uint
3648   level: advanced
3649   default: 2_K
3650   with_legacy: true
3651 # max rados object namespace len
3652 - name: osd_max_object_namespace_len
3653   type: uint
3654   level: advanced
3655   default: 256
3656   with_legacy: true
3657 # max rados attr name len; cannot go higher than 100 chars for file system backends
3658 - name: osd_max_attr_name_len
3659   type: uint
3660   level: advanced
3661   default: 100
3662   with_legacy: true
3663 - name: osd_max_attr_size
3664   type: uint
3665   level: advanced
3666   default: 0
3667   with_legacy: true
3668 - name: osd_max_omap_entries_per_request
3669   type: uint
3670   level: advanced
3671   default: 1_K
3672   with_legacy: true
3673 - name: osd_max_omap_bytes_per_request
3674   type: size
3675   level: advanced
3676   default: 1_G
3677   with_legacy: true
3678 # osd_max_write_op_reply_len caps the per-op reply payload returned for
3679 # requests with the RETURNVEC flag set
3680 - name: osd_max_write_op_reply_len
3681   type: size
3682   level: advanced
3683   desc: Max size of the per-op payload for requests with the RETURNVEC flag set
3684   long_desc: This value caps the amount of data (per op; a request may have many ops)
3685     that will be sent back to the client and recorded in the PG log.
3686   default: 64
3687   with_legacy: true
3688 - name: osd_objectstore
3689   type: str
3690   level: advanced
3691   desc: backend type for an OSD (like filestore or bluestore)
3692   default: bluestore
3693   enum_values:
3694   - bluestore
3695   - filestore
3696   - memstore
3697   - kstore
3698   - seastore
3699   - cyanstore
3700   flags:
3701   - create
3702   with_legacy: true
3703 # true if LTTng-UST tracepoints should be enabled
3704 - name: osd_objectstore_tracing
3705   type: bool
3706   level: advanced
3707   default: false
3708   with_legacy: true
3709 - name: osd_objectstore_fuse
3710   type: bool
3711   level: advanced
3712   default: false
3713   with_legacy: true
3714 - name: osd_bench_small_size_max_iops
3715   type: uint
3716   level: advanced
3717   default: 100
3718   with_legacy: true
3719 - name: osd_bench_large_size_max_throughput
3720   type: size
3721   level: advanced
3722   default: 100_M
3723   with_legacy: true
3724 - name: osd_bench_max_block_size
3725   type: size
3726   level: advanced
3727   default: 64_M
3728   with_legacy: true
3729 # duration of 'osd bench', capped at 30s to avoid triggering timeouts
3730 - name: osd_bench_duration
3731   type: uint
3732   level: advanced
3733   default: 30
3734   with_legacy: true
3735 # create a blkin trace for all osd requests
3736 - name: osd_blkin_trace_all
3737   type: bool
3738   level: advanced
3739   default: false
3740   with_legacy: true
3741 # create a blkin trace for all objecter requests
3742 - name: osdc_blkin_trace_all
3743   type: bool
3744   level: advanced
3745   default: false
3746   with_legacy: true
3747 - name: osd_discard_disconnected_ops
3748   type: bool
3749   level: advanced
3750   default: true
3751   with_legacy: true
3752 - name: osd_memory_target
3753   type: size
3754   level: basic
3755   desc: When tcmalloc and cache autotuning is enabled, try to keep this many bytes
3756     mapped in memory.
3757   long_desc: The minimum value must be at least equal to osd_memory_base + osd_memory_cache_min.
3758   fmt_desc: |
3759     When TCMalloc is available and cache autotuning is enabled, try to
3760     keep this many bytes mapped in memory. Note: This may not exactly
3761     match the RSS memory usage of the process.  While the total amount
3762     of heap memory mapped by the process should usually be close
3763     to this target, there is no guarantee that the kernel will actually
3764     reclaim  memory that has been unmapped.  During initial development,
3765     it was found that some kernels result in the OSD's RSS memory
3766     exceeding the mapped memory by up to 20%.  It is hypothesised
3767     however, that the kernel generally may be more aggressive about
3768     reclaiming unmapped memory when there is a high amount of memory
3769     pressure.  Your mileage may vary.
3770   default: 4_G
3771   see_also:
3772   - bluestore_cache_autotune
3773   - osd_memory_cache_min
3774   - osd_memory_base
3775   - osd_memory_target_autotune
3776   min: 896_M
3777   flags:
3778   - runtime
3779 - name: osd_memory_target_autotune
3780   type: bool
3781   default: false
3782   level: advanced
3783   desc: If enabled, allow orchestrator to automatically tune osd_memory_target
3784   see_also:
3785   - osd_memory_target
3786 - name: osd_memory_target_cgroup_limit_ratio
3787   type: float
3788   level: advanced
3789   desc: Set the default value for osd_memory_target to the cgroup memory limit (if
3790     set) times this value
3791   long_desc: A value of 0 disables this feature.
3792   default: 0.8
3793   see_also:
3794   - osd_memory_target
3795   min: 0
3796   max: 1
3797 - name: osd_memory_base
3798   type: size
3799   level: dev
3800   desc: When tcmalloc and cache autotuning is enabled, estimate the minimum amount
3801     of memory in bytes the OSD will need.
3802   fmt_desc: When TCMalloc and cache autotuning are enabled, estimate the minimum
3803     amount of memory in bytes the OSD will need.  This is used to help
3804     the autotuner estimate the expected aggregate memory consumption of
3805     the caches.
3806   default: 768_M
3807   see_also:
3808   - bluestore_cache_autotune
3809   flags:
3810   - runtime
3811 - name: osd_memory_expected_fragmentation
3812   type: float
3813   level: dev
3814   desc: When tcmalloc and cache autotuning is enabled, estimate the percent of memory
3815     fragmentation.
3816   fmt_desc: When TCMalloc and cache autotuning is enabled, estimate the
3817     percentage of memory fragmentation.  This is used to help the
3818     autotuner estimate the expected aggregate memory consumption
3819     of the caches.
3820   default: 0.15
3821   see_also:
3822   - bluestore_cache_autotune
3823   min: 0
3824   max: 1
3825   flags:
3826   - runtime
3827 - name: osd_memory_cache_min
3828   type: size
3829   level: dev
3830   desc: When tcmalloc and cache autotuning is enabled, set the minimum amount of memory
3831     used for caches.
3832   fmt_desc: |
3833     When TCMalloc and cache autotuning are enabled, set the minimum
3834     amount of memory used for caches. Note: Setting this value too
3835     low can result in significant cache thrashing.
3836   default: 128_M
3837   see_also:
3838   - bluestore_cache_autotune
3839   min: 128_M
3840   flags:
3841   - runtime
3842 - name: osd_memory_cache_resize_interval
3843   type: float
3844   level: dev
3845   desc: When tcmalloc and cache autotuning is enabled, wait this many seconds between
3846     resizing caches.
3847   fmt_desc: When TCMalloc and cache autotuning are enabled, wait this many
3848     seconds between resizing caches.  This setting changes the total
3849     amount of memory available for BlueStore to use for caching.  Note
3850     that setting this interval too small can result in memory allocator
3851     thrashing and lower performance.
3852   default: 1
3853   see_also:
3854   - bluestore_cache_autotune
3855 - name: memstore_device_bytes
3856   type: size
3857   level: advanced
3858   default: 1_G
3859   with_legacy: true
3860 - name: memstore_page_set
3861   type: bool
3862   level: advanced
3863   default: false
3864   with_legacy: true
3865 - name: memstore_page_size
3866   type: size
3867   level: advanced
3868   default: 64_K
3869   with_legacy: true
3870 - name: memstore_debug_omit_block_device_write
3871   type: bool
3872   level: dev
3873   desc: write metadata only
3874   default: false
3875   see_also:
3876   - bluestore_debug_omit_block_device_write
3877   with_legacy: true
3878 - name: objectstore_blackhole
3879   type: bool
3880   level: advanced
3881   default: false
3882   with_legacy: true
3883 - name: bdev_debug_inflight_ios
3884   type: bool
3885   level: dev
3886   default: false
3887   with_legacy: true
3888 # if N>0, then ~ 1/N IOs will complete before we crash on flush
3889 - name: bdev_inject_crash
3890   type: int
3891   level: dev
3892   default: 0
3893   with_legacy: true
3894 # wait N more seconds on flush
3895 - name: bdev_inject_crash_flush_delay
3896   type: int
3897   level: dev
3898   default: 2
3899   with_legacy: true
3900 - name: bdev_aio
3901   type: bool
3902   level: advanced
3903   default: true
3904   with_legacy: true
3905 # milliseconds
3906 - name: bdev_aio_poll_ms
3907   type: int
3908   level: advanced
3909   default: 250
3910   with_legacy: true
3911 - name: bdev_aio_max_queue_depth
3912   type: int
3913   level: advanced
3914   default: 1024
3915   with_legacy: true
3916 - name: bdev_aio_reap_max
3917   type: int
3918   level: advanced
3919   default: 16
3920   with_legacy: true
3921 - name: bdev_block_size
3922   type: size
3923   level: advanced
3924   default: 4_K
3925   with_legacy: true
3926 - name: bdev_read_buffer_alignment
3927   type: size
3928   level: advanced
3929   default: 4_K
3930   with_legacy: true
3931 - name: bdev_read_preallocated_huge_buffers
3932   type: str
3933   level: advanced
3934   desc: description of pools arrangement for huge page-based read buffers
3935   long_desc: Arrangement of preallocated, huge pages-based pools for reading
3936     from a KernelDevice. Applied to minimize size of scatter-gather lists
3937     sent to NICs. Targets really big buffers (>= 2 or 4 MBs).
3938     Keep in mind the system must be configured accordingly (see /proc/sys/vm/nr_hugepages).
3939     Otherwise the OSD will fail early.
3940     Beware BlueStore, by default, stores large chunks across many smaller blobs.
3941     Increasing bluestore_max_blob_size changes that, and thus allows the data to
3942     be read back into a small number of huge page-backed buffers.
3943   fmt_desc: List of key=value pairs delimited by comma, semicolon or tab.
3944     key specifies the targeted read size and must be expressed in bytes.
3945     value specifies the number of preallocated buffers.
3946     For instance, to preallocate 64 buffers that will be used to serve
3947     2 MB-sized read requests and 128 for 4 MB, someone needs to set
3948     "2097152=64,4194304=128".
3949   see_also:
3950   - bluestore_max_blob_size
3951 - name: bdev_debug_aio
3952   type: bool
3953   level: dev
3954   default: false
3955   with_legacy: true
3956 - name: bdev_debug_aio_suicide_timeout
3957   type: float
3958   level: dev
3959   default: 1_min
3960   with_legacy: true
3961 - name: bdev_debug_aio_log_age
3962   type: float
3963   level: dev
3964   default: 5
3965   with_legacy: true
3966 # if yes, osd will unbind all NVMe devices from kernel driver and bind them
3967 # to the uio_pci_generic driver. The purpose is to prevent the case where
3968 # NVMe driver is loaded while osd is running.
3969 - name: bdev_nvme_unbind_from_kernel
3970   type: bool
3971   level: advanced
3972   default: false
3973   with_legacy: true
3974 - name: bdev_enable_discard
3975   type: bool
3976   level: advanced
3977   default: false
3978   with_legacy: true
3979 - name: bdev_async_discard
3980   type: bool
3981   level: advanced
3982   default: false
3983   with_legacy: true
3984 - name: bdev_flock_retry_interval
3985   type: float
3986   level: advanced
3987   desc: interval to retry the flock
3988   default: 0.1
3989 - name: bdev_flock_retry
3990   type: uint
3991   level: advanced
3992   desc: times to retry the flock
3993   long_desc: The number of times to retry on getting the block device lock. Programs
3994     such as systemd-udevd may compete with Ceph for this lock. 0 means 'unlimited'.
3995   default: 3
3996 - name: bluefs_alloc_size
3997   type: size
3998   level: advanced
3999   desc: Allocation unit size for DB and WAL devices
4000   default: 1_M
4001   with_legacy: true
4002 - name: bluefs_shared_alloc_size
4003   type: size
4004   level: advanced
4005   desc: Allocation unit size for primary/shared device
4006   default: 64_K
4007   with_legacy: true
4008 - name: bluefs_max_prefetch
4009   type: size
4010   level: advanced
4011   default: 1_M
4012   with_legacy: true
4013 # alloc when we get this low
4014 - name: bluefs_min_log_runway
4015   type: size
4016   level: advanced
4017   default: 1_M
4018   with_legacy: true
4019 # alloc this much at a time
4020 - name: bluefs_max_log_runway
4021   type: size
4022   level: advanced
4023   default: 4_M
4024   with_legacy: true
4025 # minimum ratio before we consider compacting the log
4026 - name: bluefs_log_compact_min_ratio
4027   type: float
4028   level: advanced
4029   default: 5
4030   with_legacy: true
4031 # minimum log size before we consider compacting it
4032 - name: bluefs_log_compact_min_size
4033   type: size
4034   level: advanced
4035   default: 16_M
4036   with_legacy: true
4037 # ignore flush until it's this big
4038 - name: bluefs_min_flush_size
4039   type: size
4040   level: advanced
4041   default: 512_K
4042   with_legacy: true
4043 # sync or async log compaction
4044 - name: bluefs_compact_log_sync
4045   type: bool
4046   level: advanced
4047   default: false
4048   with_legacy: true
4049 - name: bluefs_buffered_io
4050   type: bool
4051   level: advanced
4052   desc: Enable buffered IO for bluefs reads.
4053   long_desc: When this option is enabled, bluefs will in some cases perform buffered
4054     reads.  This allows the kernel page cache to act as a secondary cache for things
4055     like RocksDB block reads.  For example, if the rocksdb block cache isn't large
4056     enough to hold all blocks during OMAP iteration, it may be possible to read them
4057     from page cache instead of from the disk.  This can dramatically improve
4058     performance when the osd_memory_target is too small to hold all entries in block
4059     cache but it does come with downsides.  It has been reported to occasionally
4060     cause excessive kernel swapping (and associated stalls) under certain workloads.
4061     Currently the best and most consistent performing combination appears to be
4062     enabling bluefs_buffered_io and disabling system level swap.  It is possible
4063     that this recommendation may change in the future however.
4064   default: true
4065   with_legacy: true
4066 - name: bluefs_sync_write
4067   type: bool
4068   level: advanced
4069   default: false
4070   with_legacy: true
4071 - name: bluefs_allocator
4072   type: str
4073   level: dev
4074   default: hybrid
4075   enum_values:
4076   - bitmap
4077   - stupid
4078   - avl
4079   - hybrid
4080   with_legacy: true
4081 - name: bluefs_log_replay_check_allocations
4082   type: bool
4083   level: advanced
4084   desc: Enables checks for allocations consistency during log replay
4085   default: true
4086   with_legacy: true
4087 - name: bluefs_replay_recovery
4088   type: bool
4089   level: dev
4090   desc: Attempt to read bluefs log so large that it became unreadable.
4091   long_desc: If BlueFS log grows to extreme sizes (200GB+) it is likely that it becomes
4092     unreadable. This option enables heuristics that scan devices for missing data.
4093     DO NOT ENABLE BY DEFAULT
4094   default: false
4095   with_legacy: true
4096 - name: bluefs_replay_recovery_disable_compact
4097   type: bool
4098   level: advanced
4099   default: false
4100   with_legacy: true
4101 - name: bluefs_check_for_zeros
4102   type: bool
4103   level: dev
4104   desc: Check data read for suspicious pages
4105   long_desc: Looks into data read to check if there is a 4K block entirely filled
4106     with zeros. If this happens, we re-read data. If there is a difference, we print
4107     an error to the log.
4108   default: false
4109   see_also:
4110   - bluestore_retry_disk_reads
4111   flags:
4112   - runtime
4113   with_legacy: true
4114 - name: bluefs_check_volume_selector_on_umount
4115   type: bool
4116   level: dev
4117   desc: Check validity of volume selector on umount
4118   long_desc: Checks if volume selector did not diverge from the state it should be in.
4119     Reference is constructed from bluefs inode table. Asserts on inconsistency.
4120   default: false
4121   flags:
4122   - runtime
4123   with_legacy: true
4124 - name: bluefs_check_volume_selector_often
4125   type: bool
4126   level: dev
4127   desc: Periodically check validity of volume selector
4128   long_desc: Periodically checks if current volume selector does not diverge from the valid state.
4129     Reference is constructed from bluefs inode table. Asserts on inconsistency. This is a debug feature.
4130   default: false
4131   see_also:
4132   - bluefs_check_volume_selector_on_umount
4133   flags:
4134   - startup
4135   with_legacy: true
4136 - name: bluestore_bluefs
4137   type: bool
4138   level: dev
4139   desc: Use BlueFS to back rocksdb
4140   long_desc: BlueFS allows rocksdb to share the same physical device(s) as the rest
4141     of BlueStore.  It should be used in all cases unless testing/developing an alternative
4142     metadata database for BlueStore.
4143   default: true
4144   flags:
4145   - create
4146   with_legacy: true
4147 # mirror to normal Env for debug
4148 - name: bluestore_bluefs_env_mirror
4149   type: bool
4150   level: dev
4151   desc: Mirror bluefs data to file system for testing/validation
4152   default: false
4153   flags:
4154   - create
4155   with_legacy: true
4156 - name: bluestore_bluefs_max_free
4157   type: size
4158   level: advanced
4159   default: 10_G
4160   desc: Maximum free space allocated to BlueFS
4161 - name: bluestore_bluefs_alloc_failure_dump_interval
4162   type: float
4163   level: advanced
4164   desc: How frequently (in seconds) to dump allocator on BlueFS space allocation failure
4165   default: 0
4166   with_legacy: true
4167 - name: bluestore_spdk_mem
4168   type: size
4169   level: dev
4170   desc: Amount of dpdk memory size in MB
4171   long_desc: If running multiple SPDK instances per node, you must specify the amount
4172     of dpdk memory size in MB each instance will use, to make sure each instance uses
4173     its own dpdk memory
4174   default: 512
4175 - name: bluestore_spdk_coremask
4176   type: str
4177   level: dev
4178   desc: A hexadecimal bit mask of the cores to run on. Note the core numbering can
4179     change between platforms and should be determined beforehand
4180   default: '0x1'
4181 - name: bluestore_spdk_max_io_completion
4182   type: uint
4183   level: dev
4184   desc: Maximum number of I/Os to complete in one batch while checking queue pair
4185     completions, 0 means let spdk library determine it
4186   default: 0
4187 - name: bluestore_spdk_io_sleep
4188   type: uint
4189   level: dev
4190   desc: Time period to wait if there is no completed I/O from polling
4191   default: 5
4192 # If you want to use spdk driver, you need to specify NVMe serial number here
4193 # with "spdk:" prefix.
4194 # Users can use 'lspci -vvv -d 8086:0953 | grep "Device Serial Number"' to
4195 # get the serial number of Intel(R) Fultondale NVMe controllers.
4196 # Example:
4197 # bluestore_block_path = spdk:55cd2e404bd73932
4198 - name: bluestore_block_path
4199   type: str
4200   level: dev
4201   desc: Path to block device/file
4202   flags:
4203   - create
4204   with_legacy: true
4205 - name: bluestore_block_size
4206   type: size
4207   level: dev
4208   desc: Size of file to create for backing bluestore
4209   default: 100_G
4210   flags:
4211   - create
4212   with_legacy: true
4213 - name: bluestore_block_create
4214   type: bool
4215   level: dev
4216   desc: Create bluestore_block_path if it doesn't exist
4217   default: true
4218   see_also:
4219   - bluestore_block_path
4220   - bluestore_block_size
4221   flags:
4222   - create
4223   with_legacy: true
4224 - name: bluestore_block_db_path
4225   type: str
4226   level: dev
4227   desc: Path for db block device
4228   flags:
4229   - create
4230   with_legacy: true
4231 # rocksdb ssts (hot/warm)
4232 - name: bluestore_block_db_size
4233   type: size
4234   level: dev
4235   desc: Size of file to create for bluestore_block_db_path
4236   default: 0
4237   flags:
4238   - create
4239   with_legacy: true
4240 - name: bluestore_block_db_create
4241   type: bool
4242   level: dev
4243   desc: Create bluestore_block_db_path if it doesn't exist
4244   default: false
4245   see_also:
4246   - bluestore_block_db_path
4247   - bluestore_block_db_size
4248   flags:
4249   - create
4250   with_legacy: true
4251 - name: bluestore_block_wal_path
4252   type: str
4253   level: dev
4254   desc: Path to block device/file backing bluefs wal
4255   flags:
4256   - create
4257   with_legacy: true
4258 # rocksdb wal
4259 - name: bluestore_block_wal_size
4260   type: size
4261   level: dev
4262   desc: Size of file to create for bluestore_block_wal_path
4263   default: 96_M
4264   flags:
4265   - create
4266   with_legacy: true
4267 - name: bluestore_block_wal_create
4268   type: bool
4269   level: dev
4270   desc: Create bluestore_block_wal_path if it doesn't exist
4271   default: false
4272   see_also:
4273   - bluestore_block_wal_path
4274   - bluestore_block_wal_size
4275   flags:
4276   - create
4277   with_legacy: true
4278 # whether to preallocate space if block/db_path/wal_path is a file rather than a block device.
4279 - name: bluestore_block_preallocate_file
4280   type: bool
4281   level: dev
4282   desc: Preallocate file created via bluestore_block*_create
4283   default: false
4284   flags:
4285   - create
4286   with_legacy: true
4287 - name: bluestore_ignore_data_csum
4288   type: bool
4289   level: dev
4290   desc: Ignore checksum errors on read and do not generate an EIO error
4291   default: false
4292   flags:
4293   - runtime
4294   with_legacy: true
4295 - name: bluestore_csum_type
4296   type: str
4297   level: advanced
4298   desc: Default checksum algorithm to use
4299   long_desc: crc32c, xxhash32, and xxhash64 are available.  The _16 and _8 variants
4300     use only a subset of the bits for more compact (but less reliable) checksumming.
4301   fmt_desc: The default checksum algorithm to use.
4302   default: crc32c
4303   enum_values:
4304   - none
4305   - crc32c
4306   - crc32c_16
4307   - crc32c_8
4308   - xxhash32
4309   - xxhash64
4310   flags:
4311   - runtime
4312   with_legacy: true
4313 - name: bluestore_retry_disk_reads
4314   type: uint
4315   level: advanced
4316   desc: Number of read retries on checksum validation error
4317   long_desc: Retries to read data from the disk this many times when checksum validation
4318     fails to handle spurious read errors gracefully.
4319   default: 3
4320   min: 0
4321   max: 255
4322   flags:
4323   - runtime
4324   with_legacy: true
4325 - name: bluestore_min_alloc_size
4326   type: uint
4327   level: advanced
4328   desc: Minimum allocation size to allocate for an object
4329   long_desc: A smaller allocation size generally means less data is read and then
4330     rewritten when a copy-on-write operation is triggered (e.g., when writing to something
4331     that was recently snapshotted).  Similarly, less data is journaled before performing
4332     an overwrite (writes smaller than min_alloc_size must first pass through the BlueStore
4333     journal).  Larger values of min_alloc_size reduce the amount of metadata required
4334     to describe the on-disk layout and reduce overall fragmentation.
4335   default: 0
4336   flags:
4337   - create
4338   with_legacy: true
4339 - name: bluestore_min_alloc_size_hdd
4340   type: size
4341   level: advanced
4342   desc: Default min_alloc_size value for rotational media
4343   default: 4_K
4344   see_also:
4345   - bluestore_min_alloc_size
4346   flags:
4347   - create
4348   with_legacy: true
4349 - name: bluestore_min_alloc_size_ssd
4350   type: size
4351   level: advanced
4352   desc: Default min_alloc_size value for non-rotational (solid state) media
4353   default: 4_K
4354   see_also:
4355   - bluestore_min_alloc_size
4356   flags:
4357   - create
4358   with_legacy: true
4359 - name: bluestore_use_optimal_io_size_for_min_alloc_size
4360   type: bool
4361   level: advanced
4362   desc: Discover media optimal IO Size and use for min_alloc_size
4363   default: false
4364   see_also:
4365   - bluestore_min_alloc_size
4366   flags:
4367   - create
4368   with_legacy: true
4369 - name: bluestore_max_alloc_size
4370   type: size
4371   level: advanced
4372   desc: Maximum size of a single allocation (0 for no max)
4373   default: 0
4374   flags:
4375   - create
4376   with_legacy: true
4377 - name: bluestore_prefer_deferred_size
4378   type: size
4379   level: advanced
4380   desc: Writes smaller than this size will be written to the journal and then asynchronously
4381     written to the device.  This can be beneficial when using rotational media where
4382     seeks are expensive, and is helpful both with and without solid state journal/wal
4383     devices.
4384   default: 0
4385   flags:
4386   - runtime
4387   with_legacy: true
4388 - name: bluestore_prefer_deferred_size_hdd
4389   type: size
4390   level: advanced
4391   desc: Default bluestore_prefer_deferred_size for rotational media
4392   default: 64_K
4393   see_also:
4394   - bluestore_prefer_deferred_size
4395   flags:
4396   - runtime
4397   with_legacy: true
4398 - name: bluestore_prefer_deferred_size_ssd
4399   type: size
4400   level: advanced
4401   desc: Default bluestore_prefer_deferred_size for non-rotational (solid state) media
4402   default: 0
4403   see_also:
4404   - bluestore_prefer_deferred_size
4405   flags:
4406   - runtime
4407   with_legacy: true
4408 - name: bluestore_compression_mode
4409   type: str
4410   level: advanced
4411   desc: Default policy for using compression when pool does not specify
4412   long_desc: '''none'' means never use compression.  ''passive'' means use compression when clients hint
4413     that data is compressible.  ''aggressive'' means use compression unless clients hint that data
4414     is not compressible.  ''force'' means always use compression, even if clients hint that data is
4415     not compressible.  This option is used when the per-pool property for the compression mode is not present.'
4416   fmt_desc: The default policy for using compression if the per-pool property
4417     ``compression_mode`` is not set. ``none`` means never use
4418     compression. ``passive`` means use compression when
4419     :c:func:`clients hint <rados_set_alloc_hint>` that data is
4420     compressible.  ``aggressive`` means use compression unless
4421     clients hint that data is not compressible.  ``force`` means use
4422     compression under all circumstances even if the clients hint that
4423     the data is not compressible.
4424   default: none
4425   enum_values:
4426   - none
4427   - passive
4428   - aggressive
4429   - force
4430   flags:
4431   - runtime
4432   with_legacy: true
4433 - name: bluestore_compression_algorithm
4434   type: str
4435   level: advanced
4436   desc: Default compression algorithm to use when writing object data
4437   long_desc: This controls the default compressor to use (if any) if the per-pool
4438     property is not set.  Note that zstd is *not* recommended for bluestore due to
4439     high CPU overhead when compressing small amounts of data.
4440   fmt_desc: The default compressor to use (if any) if the per-pool property
4441     ``compression_algorithm`` is not set. Note that ``zstd`` is *not*
4442     recommended for BlueStore due to high CPU overhead when
4443     compressing small amounts of data.
4444   default: snappy
4445   enum_values:
4446   - ''
4447   - snappy
4448   - zlib
4449   - zstd
4450   - lz4
4451   flags:
4452   - runtime
4453   with_legacy: true
4454 - name: bluestore_compression_min_blob_size  # NOTE(review): desc (upper bound) vs fmt_desc (lower bound) disagree; confirm
4455   type: size
4456   level: advanced
4457   desc: Maximum chunk size to apply compression to when random access is expected
4458     for an object.
4459   long_desc: Chunks larger than this are broken into smaller chunks before being compressed
4460   fmt_desc: Chunks smaller than this are never compressed.
4461     The per-pool property ``compression_min_blob_size`` overrides
4462     this setting.
4463   default: 0
4464   flags:
4465   - runtime
4466   with_legacy: true
4467 - name: bluestore_compression_min_blob_size_hdd
4468   type: size
4469   level: advanced
4470   desc: Default value of bluestore_compression_min_blob_size for rotational media
4471   fmt_desc: Default value of ``bluestore compression min blob size``
4472     for rotational media.
4473   default: 8_K
4474   see_also:
4475   - bluestore_compression_min_blob_size
4476   flags:
4477   - runtime
4478   with_legacy: true
4479 - name: bluestore_compression_min_blob_size_ssd
4480   type: size
4481   level: advanced
4482   desc: Default value of bluestore_compression_min_blob_size for non-rotational (solid
4483     state) media
4484   fmt_desc: Default value of ``bluestore compression min blob size``
4485     for non-rotational (solid state) media.
4486   default: 64_K
4487   see_also:
4488   - bluestore_compression_min_blob_size
4489   flags:
4490   - runtime
4491   with_legacy: true
4492 - name: bluestore_compression_max_blob_size
4493   type: size
4494   level: advanced
4495   desc: Maximum chunk size to apply compression to when non-random access is expected
4496     for an object.
4497   long_desc: Chunks larger than this are broken into smaller chunks before being compressed
4498   fmt_desc: Chunks larger than this value are broken into smaller blobs of at most
4499     ``bluestore_compression_max_blob_size`` bytes before being compressed.
4500     The per-pool property ``compression_max_blob_size`` overrides
4501     this setting.
4502   default: 0
4503   flags:
4504   - runtime
4505   with_legacy: true
4506 - name: bluestore_compression_max_blob_size_hdd
4507   type: size
4508   level: advanced
4509   desc: Default value of bluestore_compression_max_blob_size for rotational media
4510   fmt_desc: Default value of ``bluestore compression max blob size``
4511     for rotational media.
4512   default: 64_K
4513   see_also:
4514   - bluestore_compression_max_blob_size
4515   flags:
4516   - runtime
4517   with_legacy: true
4518 - name: bluestore_compression_max_blob_size_ssd
4519   type: size
4520   level: advanced
4521   desc: Default value of bluestore_compression_max_blob_size for non-rotational (solid
4522     state) media
4523   fmt_desc: Default value of ``bluestore compression max blob size``
4524     for non-rotational (SSD, NVMe) media.
4525   default: 64_K
4526   see_also:
4527   - bluestore_compression_max_blob_size
4528   flags:
4529   - runtime
4530   with_legacy: true
4531 # Specifies minimum expected amount of saved allocation units
4532 # per single blob to enable compressed blobs garbage collection
4533 - name: bluestore_gc_enable_blob_threshold
4534   type: int
4535   level: dev
4536   default: 0
4537   flags:
4538   - runtime
4539   with_legacy: true
4540 # Specifies minimum expected amount of saved allocation units
4541 # per all blobs to enable compressed blobs garbage collection
4542 - name: bluestore_gc_enable_total_threshold
4543   type: int
4544   level: dev
4545   default: 0
4546   flags:
4547   - runtime
4548   with_legacy: true
4549 - name: bluestore_max_blob_size
4550   type: size
4551   level: dev
4552   long_desc: Bluestore blobs are collections of extents (ie on-disk data) originating
4553     from one or more objects.  Blobs can be compressed, typically have checksum data,
4554     may be overwritten, may be shared (with an extent ref map), or split.  This setting
4555     controls the maximum size a blob is allowed to be.
4556   default: 0
4557   flags:
4558   - runtime
4559   with_legacy: true
4560 - name: bluestore_max_blob_size_hdd
4561   type: size
4562   level: dev
4563   default: 64_K
4564   see_also:
4565   - bluestore_max_blob_size
4566   flags:
4567   - runtime
4568   with_legacy: true
4569 - name: bluestore_max_blob_size_ssd
4570   type: size
4571   level: dev
4572   default: 64_K
4573   see_also:
4574   - bluestore_max_blob_size
4575   flags:
4576   - runtime
4577   with_legacy: true
4578 # Require the net gain of compression at least to be at this ratio,
4579 # otherwise we don't compress.
4580 # And ask for compressing at least 12.5%(1/8) off, by default.
4581 - name: bluestore_compression_required_ratio
4582   type: float
4583   level: advanced
4584   desc: Compression ratio required to store compressed data
4585   long_desc: If we compress data and get less than this we discard the result and
4586     store the original uncompressed data.
4587   fmt_desc: The ratio of the size of the data chunk after
4588     compression relative to the original size must be at
4589     least this small in order to store the compressed
4590     version.
4591   default: 0.875
4592   flags:
4593   - runtime
4594   with_legacy: true
4595 - name: bluestore_extent_map_shard_max_size
4596   type: size
4597   level: dev
4598   desc: Max size (bytes) for a single extent map shard before splitting
4599   default: 1200
4600   with_legacy: true
4601 - name: bluestore_extent_map_shard_target_size
4602   type: size
4603   level: dev
4604   desc: Target size (bytes) for a single extent map shard
4605   default: 500
4606   with_legacy: true
4607 - name: bluestore_extent_map_shard_min_size
4608   type: size
4609   level: dev
4610   desc: Min size (bytes) for a single extent map shard before merging
4611   default: 150
4612   with_legacy: true
4613 - name: bluestore_extent_map_shard_target_size_slop
4614   type: float
4615   level: dev
4616   desc: Ratio above/below target for a shard when trying to align to an existing extent
4617     or blob boundary
4618   default: 0.2
4619   with_legacy: true
4620 - name: bluestore_extent_map_inline_shard_prealloc_size
4621   type: size
4622   level: dev
4623   desc: Preallocated buffer for inline shards
4624   default: 256
4625   with_legacy: true
4626 - name: bluestore_cache_trim_interval
4627   type: float
4628   level: advanced
4629   desc: How frequently we trim the bluestore cache
4630   default: 0.05
4631   with_legacy: true
4632 - name: bluestore_cache_trim_max_skip_pinned
4633   type: uint
4634   level: dev
4635   desc: Max pinned cache entries we consider before giving up
4636   default: 1000
4637   with_legacy: true
4638 - name: bluestore_cache_type
4639   type: str
4640   level: dev
4641   desc: Cache replacement algorithm
4642   default: 2q
4643   enum_values:
4644   - 2q
4645   - lru
4646   with_legacy: true
4647 - name: bluestore_2q_cache_kin_ratio
4648   type: float
4649   level: dev
4650   desc: 2Q paper suggests .5
4651   default: 0.5
4652   with_legacy: true
4653 - name: bluestore_2q_cache_kout_ratio
4654   type: float
4655   level: dev
4656   desc: 2Q paper suggests .5
4657   default: 0.5
4658   with_legacy: true
4659 - name: bluestore_cache_size
4660   type: size
4661   level: dev
4662   desc: Cache size (in bytes) for BlueStore
4663   long_desc: This includes data and metadata cached by BlueStore as well as memory
4664     devoted to rocksdb's cache(s).
4665   fmt_desc: The amount of memory BlueStore will use for its cache.  If zero,
4666     ``bluestore_cache_size_hdd`` or ``bluestore_cache_size_ssd`` will
4667     be used instead.
4668   default: 0
4669   with_legacy: true
4670 - name: bluestore_cache_size_hdd
4671   type: size
4672   level: dev
4673   desc: Default bluestore_cache_size for rotational media
4674   fmt_desc: The default amount of memory BlueStore will use for its cache when
4675     backed by an HDD.
4676   default: 1_G
4677   see_also:
4678   - bluestore_cache_size
4679   with_legacy: true
4680 - name: bluestore_cache_size_ssd
4681   type: size
4682   level: dev
4683   desc: Default bluestore_cache_size for non-rotational (solid state) media
4684   fmt_desc: The default amount of memory BlueStore will use for its cache when
4685     backed by an SSD.
4686   default: 3_G
4687   see_also:
4688   - bluestore_cache_size
4689   with_legacy: true
4690 - name: bluestore_cache_meta_ratio
4691   type: float
4692   level: dev
4693   desc: Ratio of bluestore cache to devote to metadata
4694   default: 0.45
4695   see_also:
4696   - bluestore_cache_size
4697   with_legacy: true
4698 - name: bluestore_cache_kv_ratio
4699   type: float
4700   level: dev
4701   desc: Ratio of bluestore cache to devote to key/value database (RocksDB)
4702   default: 0.45
4703   see_also:
4704   - bluestore_cache_size
4705   with_legacy: true
4706 - name: bluestore_cache_kv_onode_ratio
4707   type: float
4708   level: dev
4709   desc: Ratio of bluestore cache to devote to kv onode column family (rocksdb)
4710   default: 0.04
4711   see_also:
4712   - bluestore_cache_size
4713 - name: bluestore_cache_autotune
4714   type: bool
4715   level: dev
4716   desc: Automatically tune the ratio of caches while respecting min values.
4717   fmt_desc: Automatically tune the space ratios assigned to various BlueStore
4718     caches while respecting minimum values.
4719   default: true
4720   see_also:
4721   - bluestore_cache_size
4722   - bluestore_cache_meta_ratio
4723 - name: bluestore_cache_autotune_interval
4724   type: float
4725   level: dev
4726   desc: The number of seconds to wait between rebalances when cache autotune is enabled.
4727   fmt_desc: |
4728     The number of seconds to wait between rebalances when cache autotune
4729     is enabled.  This setting changes how quickly the allocation ratios of
4730     various caches are recomputed.  Note:  Setting this interval too small
4731     can result in high CPU usage and lower performance.
4732   default: 5
4733   see_also:
4734   - bluestore_cache_autotune
4735 - name: bluestore_cache_age_bin_interval
4736   type: float
4737   level: dev
4738   desc: The duration (in seconds) represented by a single cache age bin.
4739   fmt_desc: |
4740     The caches used by bluestore will assign cache entries to an 'age bin'
4741     that represents a period of time during which that cache entry was most
4742     recently updated.  By binning the caches in this way, Ceph's priority
4743     cache balancing code can make better decisions about which caches should
4744     receive priority based on the relative ages of items in the caches.  By
4745     default, a single cache age bin represents 1 second of time.  Note:
4746     Setting this interval too small can result in high CPU usage and lower
4747     performance.
4748   default: 1
4749   see_also:
4750   - bluestore_cache_age_bins_kv
4751   - bluestore_cache_age_bins_kv_onode
4752   - bluestore_cache_age_bins_meta
4753   - bluestore_cache_age_bins_data
4754 - name: bluestore_cache_age_bins_kv
4755   type: str
4756   level: dev
4757   desc: A 10 element, space separated list of age bins for kv cache
4758   fmt_desc: |
4759     A 10 element, space separated list of cache age bins grouped by
4760     priority such that PRI1=[0,n), PRI2=[n,n+1), PRI3=[n+1,n+2) ...
4761     PRI10=[n+8,n+9).  Values represent the starting and ending bin for each
4762     priority level.  A 0 in the 2nd term will prevent any items from being
4763     associated with that priority.  bin duration is based on the
4764     bluestore_cache_age_bin_interval value.  For example,
4765     "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1
4766     contains 1 age bin.  Assuming the default age bin interval of 1 second,
4767     PRI1 represents cache items that are less than 1 second old. PRI2 has 4
4768     bins representing cache items that are 1 to less than 5 seconds old. All
4769     other cache items in this example are associated with the lowest priority
4770     level as PRI3-PRI10 all have 0s in their second term.
4771   default: "1 2 6 24 120 720 0 0 0 0"
4772   see_also:
4773   - bluestore_cache_age_bin_interval
4774 - name: bluestore_cache_age_bins_kv_onode
4775   type: str
4776   level: dev
4777   desc: A 10 element, space separated list of age bins for kv onode cache
4778   fmt_desc: |
4779     A 10 element, space separated list of cache age bins grouped by
4780     priority such that PRI1=[0,n), PRI2=[n,n+1), PRI3=[n+1,n+2) ...
4781     PRI10=[n+8,n+9).  Values represent the starting and ending bin for each
4782     priority level.  A 0 in the 2nd term will prevent any items from being
4783     associated with that priority.  bin duration is based on the
4784     bluestore_cache_age_bin_interval value.  For example,
4785     "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1
4786     contains 1 age bin.  Assuming the default age bin interval of 1 second,
4787     PRI1 represents cache items that are less than 1 second old. PRI2 has 4
4788     bins representing cache items that are 1 to less than 5 seconds old. All
4789     other cache items in this example are associated with the lowest priority
4790     level as PRI3-PRI10 all have 0s in their second term.
4791   default: "0 0 0 0 0 0 0 0 0 720"
4792   see_also:
4793   - bluestore_cache_age_bin_interval
4794 - name: bluestore_cache_age_bins_meta
4795   type: str
4796   level: dev
4797   desc: A 10 element, space separated list of age bins for onode cache
4798   fmt_desc: |
4799     A 10 element, space separated list of cache age bins grouped by
4800     priority such that PRI1=[0,n), PRI2=[n,n+1), PRI3=[n+1,n+2) ...
4801     PRI10=[n+8,n+9).  Values represent the starting and ending bin for each
4802     priority level.  A 0 in the 2nd term will prevent any items from being
4803     associated with that priority.  bin duration is based on the
4804     bluestore_cache_age_bin_interval value.  For example,
4805     "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1
4806     contains 1 age bin.  Assuming the default age bin interval of 1 second,
4807     PRI1 represents cache items that are less than 1 second old. PRI2 has 4
4808     bins representing cache items that are 1 to less than 5 seconds old. All
4809     other cache items in this example are associated with the lowest priority
4810     level as PRI3-PRI10 all have 0s in their second term.
4811   default: "1 2 6 24 120 720 0 0 0 0"
4812   see_also:
4813   - bluestore_cache_age_bin_interval
4814 - name: bluestore_cache_age_bins_data
4815   type: str
4816   level: dev
4817   desc: A 10 element, space separated list of age bins for data cache
4818   fmt_desc: |
4819     A 10 element, space separated list of cache age bins grouped by
4820     priority such that PRI1=[0,n), PRI2=[n,n+1), PRI3=[n+1,n+2) ...
4821     PRI10=[n+8,n+9).  Values represent the starting and ending bin for each
4822     priority level.  A 0 in the 2nd term will prevent any items from being
4823     associated with that priority.  bin duration is based on the
4824     bluestore_cache_age_bin_interval value.  For example,
4825     "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1
4826     contains 1 age bin.  Assuming the default age bin interval of 1 second,
4827     PRI1 represents cache items that are less than 1 second old. PRI2 has 4
4828     bins representing cache items that are 1 to less than 5 seconds old. All
4829     other cache items in this example are associated with the lowest priority
4830     level as PRI3-PRI10 all have 0s in their second term.
4831   default: "1 2 6 24 120 720 0 0 0 0"
4832   see_also:
4833   - bluestore_cache_age_bin_interval
4834 - name: bluestore_alloc_stats_dump_interval
4835   type: float
4836   level: dev
4837   desc: The period (in seconds) for logging allocation statistics.
4838   default: 1_day
4839   with_legacy: true
4840 - name: bluestore_kvbackend
4841   type: str
4842   level: dev
4843   desc: Key value database to use for bluestore
4844   default: rocksdb
4845   flags:
4846   - create
4847   with_legacy: true
4848 - name: bluestore_allocator
4849   type: str
4850   level: advanced
4851   desc: Allocator policy
4852   long_desc: Allocator to use for bluestore.  Stupid should only be used for testing.
4853   default: hybrid
4854   enum_values:
4855   - bitmap
4856   - stupid
4857   - avl
4858   - hybrid
4859   - zoned
4860   with_legacy: true
4861 - name: bluestore_freelist_blocks_per_key
4862   type: size
4863   level: dev
4864   desc: Block (and bits) per database key
4865   default: 128
4866   with_legacy: true
4867 - name: bluestore_bitmapallocator_blocks_per_zone
4868   type: size
4869   level: dev
4870   default: 1_K
4871   with_legacy: true
4872 - name: bluestore_bitmapallocator_span_size
4873   type: size
4874   level: dev
4875   default: 1_K
4876   with_legacy: true
4877 - name: bluestore_max_deferred_txc
4878   type: uint
4879   level: advanced
4880   desc: Max transactions with deferred writes that can accumulate before we force
4881     flush deferred writes
4882   default: 32
4883   with_legacy: true
4884 - name: bluestore_max_defer_interval
4885   type: float
4886   level: advanced
4887   desc: max duration to force deferred submit
4888   default: 3
4889   with_legacy: true
4890 - name: bluestore_rocksdb_options
4891   type: str
4892   level: advanced
4893   desc: Full set of rocksdb settings to override
4894   default: compression=kNoCompression,max_write_buffer_number=4,min_write_buffer_number_to_merge=1,recycle_log_file_num=4,write_buffer_size=268435456,writable_file_max_buffer_size=0,compaction_readahead_size=2097152,max_background_compactions=2,max_total_wal_size=1073741824
4895   with_legacy: true
4896 - name: bluestore_rocksdb_options_annex
4897   type: str
4898   level: advanced
4899   desc: An addition to bluestore_rocksdb_options. Allows setting rocksdb options without
4900     repeating the existing defaults.
4901   with_legacy: true
4902 - name: bluestore_rocksdb_cf
4903   type: bool
4904   level: advanced
4905   desc: Enable use of rocksdb column families for bluestore metadata
4906   fmt_desc: Enables sharding of BlueStore's RocksDB.
4907     When ``true``, ``bluestore_rocksdb_cfs`` is used.
4908     Only applied when OSD is doing ``--mkfs``.
4909   default: true
4910   verbatim: |
4911     #ifdef WITH_SEASTAR
4912     // This is necessary as the Seastar's allocator imposes restrictions
4913     // on the number of threads that entered malloc/free/*. Unfortunately,
4914     // RocksDB sharding in BlueStore dramatically lifted the number of
4915     // threads spawn during RocksDB's init.
4916     .set_validator([](std::string *value, std::string *error_message) {
4917       if (const bool parsed_value = strict_strtob(value->c_str(), error_message);
4918         error_message->empty() && parsed_value) {
4919         *error_message = "invalid BlueStore sharding configuration."
4920                          " Be aware any change takes effect only on mkfs!";
4921         return -EINVAL;
4922       } else {
4923         return 0;
4924       }
4925     })
4926     #endif
4927 - name: bluestore_rocksdb_cfs
4928   type: str
4929   level: dev
4930   desc: Definition of column families and their sharding
4931   long_desc: 'Space separated list of elements: column_def [ ''='' rocksdb_options
4932     ]. column_def := column_name [ ''('' shard_count [ '','' hash_begin ''-'' [ hash_end
4933     ] ] '')'' ]. Example: ''I=write_buffer_size=1048576 O(6) m(7,10-)''. Interval
4934     [hash_begin..hash_end) defines characters to use for hash calculation. Recommended
4935     hash ranges: O(0-13) P(0-8) m(0-16). Sharding of S,T,C,M,B prefixes is inadvisable'
4936   fmt_desc: Definition of BlueStore's RocksDB sharding.
4937     The optimal value depends on multiple factors, and modification is inadvisable.
4938     This setting is used only when OSD is doing ``--mkfs``.
4939     Next runs of OSD retrieve sharding from disk.
4940   default: m(3) p(3,0-12) O(3,0-13)=block_cache={type=binned_lru} L P
4941 - name: bluestore_qfsck_on_mount
4942   type: bool
4943   level: dev
4944   desc: Run quick-fsck at mount comparing allocation-file to RocksDB allocation state
4945   default: true
4946   with_legacy: true
4947 - name: bluestore_fsck_on_mount
4948   type: bool
4949   level: dev
4950   desc: Run fsck at mount
4951   default: false
4952   with_legacy: true
4953 - name: bluestore_fsck_on_mount_deep
4954   type: bool
4955   level: dev
4956   desc: Run deep fsck at mount when bluestore_fsck_on_mount is set to true
4957   default: false
4958   with_legacy: true
4959 - name: bluestore_fsck_quick_fix_on_mount
4960   type: bool
4961   level: dev
4962   desc: Do quick-fix for the store at mount
4963   default: false
4964   with_legacy: true
4965 - name: bluestore_fsck_on_umount
4966   type: bool
4967   level: dev
4968   desc: Run fsck at umount
4969   default: false
4970   with_legacy: true
4971 - name: bluestore_allocation_from_file
4972   type: bool
4973   level: dev
4974   desc: Remove allocation info from RocksDB and store the info in a new allocation file
4975   default: true
4976   with_legacy: true
4977 - name: bluestore_fsck_on_umount_deep
4978   type: bool
4979   level: dev
4980   desc: Run deep fsck at umount when bluestore_fsck_on_umount is set to true
4981   default: false
4982   with_legacy: true
4983 - name: bluestore_fsck_on_mkfs
4984   type: bool
4985   level: dev
4986   desc: Run fsck after mkfs
4987   default: true
4988   with_legacy: true
4989 - name: bluestore_fsck_on_mkfs_deep
4990   type: bool
4991   level: dev
4992   desc: Run deep fsck after mkfs
4993   default: false
4994   with_legacy: true
4995 - name: bluestore_sync_submit_transaction
4996   type: bool
4997   level: dev
4998   desc: Try to submit metadata transaction to rocksdb in queuing thread context
4999   default: false
5000   with_legacy: true
5001 - name: bluestore_fsck_read_bytes_cap
5002   type: size
5003   level: advanced
5004   desc: Maximum bytes read at once by deep fsck
5005   default: 64_M
5006   flags:
5007   - runtime
5008   with_legacy: true
5009 - name: bluestore_fsck_quick_fix_threads
5010   type: int
5011   level: advanced
5012   desc: Number of additional threads to perform quick-fix (shallow fsck) command
5013   default: 2
5014   with_legacy: true
5015 - name: bluestore_fsck_shared_blob_tracker_size
5016   type: float
5017   level: dev
5018   desc: Size(a fraction of osd_memory_target, defaults to 128MB) of a hash table to track shared blobs ref counts. Higher the size, more precise is the tracker -> less overhead during the repair.
5019   default: 0.03125
5020   see_also:
5021   - osd_memory_target
5022   flags:
5023   - runtime
5024 - name: bluestore_throttle_bytes
5025   type: size
5026   level: advanced
5027   desc: Maximum bytes in flight before we throttle IO submission
5028   default: 64_M
5029   flags:
5030   - runtime
5031   with_legacy: true
5032 - name: bluestore_throttle_deferred_bytes
5033   type: size
5034   level: advanced
5035   desc: Maximum bytes for deferred writes before we throttle IO submission
5036   default: 128_M
5037   flags:
5038   - runtime
5039   with_legacy: true
5040 - name: bluestore_throttle_cost_per_io
5041   type: size
5042   level: advanced
5043   desc: Overhead added to transaction cost (in bytes) for each IO
5044   default: 0
5045   flags:
5046   - runtime
5047   with_legacy: true
5048 - name: bluestore_throttle_cost_per_io_hdd
5049   type: uint
5050   level: advanced
5051   desc: Default bluestore_throttle_cost_per_io for rotational media
5052   default: 670000
5053   see_also:
5054   - bluestore_throttle_cost_per_io
5055   flags:
5056   - runtime
5057   with_legacy: true
5058 - name: bluestore_throttle_cost_per_io_ssd
5059   type: uint
5060   level: advanced
5061   desc: Default bluestore_throttle_cost_per_io for non-rotational (solid state) media
5062   default: 4000
5063   see_also:
5064   - bluestore_throttle_cost_per_io
5065   flags:
5066   - runtime
5067   with_legacy: true
5068 - name: bluestore_deferred_batch_ops
5069   type: uint
5070   level: advanced
5071   desc: Max number of deferred writes before we flush the deferred write queue
5072   default: 0
5073   min: 0
5074   max: 65535
5075   flags:
5076   - runtime
5077   with_legacy: true
5078 - name: bluestore_deferred_batch_ops_hdd
5079   type: uint
5080   level: advanced
5081   desc: Default bluestore_deferred_batch_ops for rotational media
5082   default: 64
5083   see_also:
5084   - bluestore_deferred_batch_ops
5085   min: 0
5086   max: 65535
5087   flags:
5088   - runtime
5089   with_legacy: true
5090 - name: bluestore_deferred_batch_ops_ssd
5091   type: uint
5092   level: advanced
5093   desc: Default bluestore_deferred_batch_ops for non-rotational (solid state) media
5094   default: 16
5095   see_also:
5096   - bluestore_deferred_batch_ops
5097   min: 0
5098   max: 65535
5099   flags:
5100   - runtime
5101   with_legacy: true
5102 - name: bluestore_nid_prealloc
5103   type: int
5104   level: dev
5105   desc: Number of unique object ids to preallocate at a time
5106   default: 1024
5107   with_legacy: true
5108 - name: bluestore_blobid_prealloc
5109   type: uint
5110   level: dev
5111   desc: Number of unique blob ids to preallocate at a time
5112   default: 10_K
5113   with_legacy: true
5114 - name: bluestore_clone_cow
5115   type: bool
5116   level: advanced
5117   desc: Use copy-on-write when cloning objects (versus reading and rewriting them
5118     at clone time)
5119   default: true
5120   flags:
5121   - runtime
5122   with_legacy: true
5123 - name: bluestore_default_buffered_read
5124   type: bool
5125   level: advanced
5126   desc: Cache read results by default (unless hinted NOCACHE or WONTNEED)
5127   default: true
5128   flags:
5129   - runtime
5130   with_legacy: true
5131 - name: bluestore_default_buffered_write
5132   type: bool
5133   level: advanced
5134   desc: Cache writes by default (unless hinted NOCACHE or WONTNEED)
5135   default: false
5136   flags:
5137   - runtime
5138   with_legacy: true
5139 - name: bluestore_debug_no_reuse_blocks
5140   type: bool
5141   level: dev
5142   default: false
5143   with_legacy: true
5144 - name: bluestore_debug_small_allocations
5145   type: int
5146   level: dev
5147   default: 0
5148   with_legacy: true
5149 - name: bluestore_debug_too_many_blobs_threshold
5150   type: int
5151   level: dev
5152   default: 24576
5153   with_legacy: true
5154 - name: bluestore_debug_freelist
5155   type: bool
5156   level: dev
5157   default: false
5158   with_legacy: true
5159 - name: bluestore_debug_prefill
5160   type: float
5161   level: dev
5162   desc: simulate fragmentation
5163   default: 0
5164   with_legacy: true
5165 - name: bluestore_debug_prefragment_max
5166   type: size
5167   level: dev
5168   default: 1_M
5169   with_legacy: true
5170 - name: bluestore_debug_inject_read_err
5171   type: bool
5172   level: dev
5173   default: false
5174   with_legacy: true
5175 - name: bluestore_debug_randomize_serial_transaction
5176   type: int
5177   level: dev
5178   default: 0
5179   with_legacy: true
5180 - name: bluestore_debug_omit_block_device_write
5181   type: bool
5182   level: dev
5183   default: false
5184   with_legacy: true
5185 - name: bluestore_debug_fsck_abort
5186   type: bool
5187   level: dev
5188   default: false
5189   with_legacy: true
5190 - name: bluestore_debug_omit_kv_commit
5191   type: bool
5192   level: dev
5193   default: false
5194   with_legacy: true
5195 - name: bluestore_debug_permit_any_bdev_label
5196   type: bool
5197   level: dev
5198   default: false
5199   with_legacy: true
5200 - name: bluestore_debug_random_read_err
5201   type: float
5202   level: dev
5203   default: 0
5204   with_legacy: true
5205 - name: bluestore_debug_inject_bug21040
5206   type: bool
5207   level: dev
5208   default: false
5209   with_legacy: true
5210 - name: bluestore_debug_inject_csum_err_probability
5211   type: float
5212   level: dev
5213   desc: inject crc verification errors into bluestore device reads
5214   default: 0
5215   with_legacy: true
5216 - name: bluestore_debug_legacy_omap
5217   type: bool
5218   level: dev
5219   desc: Allows mkfs to create OSD in legacy OMAP naming mode (neither per-pool nor per-pg).
5220     This is intended primarily for developers' purposes. The resulting OSD might/would
5221     be transformed to the currently default 'per-pg' format when BlueStore's quick-fix or
5222     repair are applied.
5223   default: false
5224   with_legacy: true
5225 - name: bluestore_fsck_error_on_no_per_pool_stats
5226   type: bool
5227   level: advanced
5228   desc: Make fsck error (instead of warn) when bluestore lacks per-pool stats, e.g.,
5229     after an upgrade
5230   default: false
5231   with_legacy: true
5232 - name: bluestore_warn_on_bluefs_spillover
5233   type: bool
5234   level: advanced
5235   desc: Enable health indication on bluefs slow device usage
5236   default: true
5237   with_legacy: true
5238 - name: bluestore_warn_on_legacy_statfs
5239   type: bool
5240   level: advanced
5241   desc: Enable health indication on lack of per-pool statfs reporting from bluestore
5242   default: true
5243   with_legacy: true
5244 - name: bluestore_warn_on_spurious_read_errors
5245   type: bool
5246   level: advanced
5247   desc: Enable health indication when spurious read errors are observed by OSD
5248   default: true
5249   with_legacy: true
5250 - name: bluestore_fsck_error_on_no_per_pool_omap
5251   type: bool
5252   level: advanced
5253   desc: Make fsck error (instead of warn) when objects without per-pool omap are found
5254   default: false
5255   with_legacy: true
5256 - name: bluestore_fsck_error_on_no_per_pg_omap
5257   type: bool
5258   level: advanced
5259   desc: Make fsck error (instead of warn) when objects without per-pg omap are found
5260   default: false
5261   with_legacy: true
5262 - name: bluestore_warn_on_no_per_pool_omap
5263   type: bool
5264   level: advanced
5265   desc: Enable health indication on lack of per-pool omap
5266   default: true
5267   with_legacy: true
5268 - name: bluestore_warn_on_no_per_pg_omap
5269   type: bool
5270   level: advanced
5271   desc: Enable health indication on lack of per-pg omap
5272   default: false
5273   with_legacy: true
5274 - name: bluestore_log_op_age
5275   type: float
5276   level: advanced
5277   desc: log operation if it's slower than this age (seconds)
5278   default: 5
5279   with_legacy: true
5280 - name: bluestore_log_omap_iterator_age
5281   type: float
5282   level: advanced
5283   desc: log omap iteration operation if it's slower than this age (seconds)
5284   default: 5
5285   with_legacy: true
5286 - name: bluestore_log_collection_list_age
5287   type: float
5288   level: advanced
5289   desc: log collection list operation if it's slower than this age (seconds)
5290   default: 1_min
5291   with_legacy: true
5292 - name: bluestore_debug_enforce_settings
5293   type: str
5294   level: dev
5295   desc: Enforces specific hw profile settings
5296   long_desc: '''hdd'' enforces settings intended for BlueStore above a rotational
5297     drive. ''ssd'' enforces settings intended for BlueStore above a solid drive. ''default''
5298     - using settings for the actual hardware.'
5299   default: default
5300   enum_values:
5301   - default
5302   - hdd
5303   - ssd
5304   with_legacy: true
5305 - name: bluestore_avl_alloc_ff_max_search_count
5306   type: uint
5307   level: dev
5308   desc: Search for this many ranges in first-fit mode before switching over to
5309     best-fit mode. 0 to iterate through all ranges for required chunk.
5310   default: 100
5311 - name: bluestore_avl_alloc_ff_max_search_bytes
5312   type: size
5313   level: dev
5314   desc: Maximum distance to search in first-fit mode before switching over to
5315     best-fit mode. 0 to iterate through all ranges for required chunk.
5316   default: 16_M
5317 - name: bluestore_avl_alloc_bf_threshold
5318   type: uint
5319   level: dev
5320   desc: Sets threshold at which shrinking max free chunk size triggers enabling best-fit
5321     mode.
5322   long_desc: 'AVL allocator works in two modes: near-fit and best-fit. By default,
5323     it uses very fast near-fit mode, in which it tries to fit a new block near the
5324     last allocated block of similar size. The second mode is much slower best-fit
5325     mode, in which it tries to find an exact match for the requested allocation. This
5326     mode is used when either the device gets fragmented or when it is low on free
5327     space. When the largest free block is smaller than ''bluestore_avl_alloc_bf_threshold'',
5328     best-fit mode is used.'
5329   default: 128_K
5330   see_also:
5331   - bluestore_avl_alloc_bf_free_pct
5332 - name: bluestore_avl_alloc_bf_free_pct
5333   type: uint
5334   level: dev
5335   desc: Sets threshold at which shrinking free space (in %, integer) triggers enabling
5336     best-fit mode.
5337   long_desc: 'AVL allocator works in two modes: near-fit and best-fit. By default,
5338     it uses very fast near-fit mode, in which it tries to fit a new block near the
5339     last allocated block of similar size. The second mode is much slower best-fit
5340     mode, in which it tries to find an exact match for the requested allocation. This
5341     mode is used when either the device gets fragmented or when it is low on free
5342     space. When free space is smaller than ''bluestore_avl_alloc_bf_free_pct'', best-fit
5343     mode is used.'
5344   default: 4
5345   see_also:
5346   - bluestore_avl_alloc_bf_threshold
5347 - name: bluestore_hybrid_alloc_mem_cap
5348   type: uint
5349   level: dev
5350   desc: Maximum RAM hybrid allocator should use before enabling bitmap supplement
5351   default: 64_M
5352 - name: bluestore_volume_selection_policy
5353   type: str
5354   level: dev
5355   desc: Determines bluefs volume selection policy
5356   long_desc: Determines bluefs volume selection policy. 'use_some_extra' policy allows
5357     to override RocksDB level granularity and put high level's data to faster device
5358     even when the level doesn't completely fit there. 'fit_to_fast' policy enables
5359     using 100% of faster disk capacity and allows the user to turn on 'level_compaction_dynamic_level_bytes'
5360     option in RocksDB options.
5361   default: use_some_extra
5362   enum_values:
5363   - rocksdb_original
5364   - use_some_extra
5365   - fit_to_fast
5366   with_legacy: true
5367 - name: bluestore_volume_selection_reserved_factor
5368   type: float
5369   level: advanced
5370   desc: DB level size multiplier. Determines amount of space at DB device to bar from
5371     the usage when 'use some extra' policy is in action. Reserved size is determined
5372     as sum(L_max_size[0], L_max_size[L-1]) + L_max_size[L] * this_factor
5373   default: 2
5374   flags:
5375   - startup
5376   with_legacy: true
5377 - name: bluestore_volume_selection_reserved
5378   type: int
5379   level: advanced
5380   desc: Space reserved at DB device and not allowed for 'use some extra' policy usage.
5381     Overrides 'bluestore_volume_selection_reserved_factor' setting and introduces
5382     straightforward limit.
5383   default: 0
5384   flags:
5385   - startup
5386   with_legacy: true
5387 - name: bdev_ioring
5388   type: bool
5389   level: advanced
5390   desc: Enables Linux io_uring API instead of libaio
5391   default: false
5392 - name: bdev_ioring_hipri
5393   type: bool
5394   level: advanced
5395   desc: Use polled IO completions with the Linux io_uring API
5396   default: false
5397 - name: bdev_ioring_sqthread_poll
5398   type: bool
5399   level: advanced
5400   desc: Offload Linux io_uring submission/completion to a kernel thread
5401   default: false
5402 - name: bluestore_kv_sync_util_logging_s
5403   type: float
5404   level: advanced
5405   desc: KV sync thread utilization logging period
5406   long_desc: How often (in seconds) to print KV sync thread utilization, not logged
5407     when set to 0 or when utilization is 0%
5408   default: 10
5409   flags:
5410   - runtime
5411   with_legacy: true
5412 - name: bluestore_fail_eio
5413   type: bool
5414   level: dev
5415   desc: fail/crash on EIO
5416   long_desc: whether bluestore osd fails on eio
5417   default: false
5418   flags:
5419   - runtime
5420   with_legacy: true
5421 - name: bluestore_zero_block_detection
5422   type: bool
5423   level: dev
5424   desc: punch holes instead of writing zeros
5425   long_desc: Intended for large-scale synthetic testing. Currently this is implemented
5426     with punch hole semantics, affecting the logical extent map of the object. This does
5427     not interact well with some RBD and CephFS features.
5428   default: false
5429   flags:
5430   - runtime
5431   with_legacy: true
5432 - name: kstore_max_ops
5433   type: uint
5434   level: advanced
5435   default: 512
5436   with_legacy: true
5437 - name: kstore_max_bytes
5438   type: size
5439   level: advanced
5440   default: 64_M
5441   with_legacy: true
5442 - name: kstore_backend
5443   type: str
5444   level: advanced
5445   default: rocksdb
5446   with_legacy: true
5447 - name: kstore_rocksdb_options
5448   type: str
5449   level: advanced
5450   desc: Options to pass through when RocksDB is used as the KeyValueDB for kstore.
5451   default: compression=kNoCompression
5452   with_legacy: true
5453 - name: kstore_fsck_on_mount
5454   type: bool
5455   level: advanced
5456   desc: Whether or not to run fsck on mount for kstore.
5457   default: false
5458   with_legacy: true
5459 - name: kstore_fsck_on_mount_deep
5460   type: bool
5461   level: advanced
5462   desc: Whether or not to run deep fsck on mount for kstore
5463   default: true
5464   with_legacy: true
5465 - name: kstore_nid_prealloc
5466   type: uint
5467   level: advanced
5468   default: 1_K
5469   with_legacy: true
5470 - name: kstore_sync_transaction
5471   type: bool
5472   level: advanced
5473   default: false
5474   with_legacy: true
5475 - name: kstore_sync_submit_transaction
5476   type: bool
5477   level: advanced
5478   default: false
5479   with_legacy: true
5480 - name: kstore_onode_map_size
5481   type: uint
5482   level: advanced
5483   default: 1_K
5484   with_legacy: true
5485 - name: kstore_default_stripe_size
5486   type: size
5487   level: advanced
5488   default: 64_K
5489   with_legacy: true
5490 # rocksdb options that will be used for omap(if omap_backend is rocksdb)
5491 - name: filestore_rocksdb_options
5492   type: str
5493   level: dev
5494   desc: Options to pass through when RocksDB is used as the KeyValueDB for filestore.
5495   default: max_background_jobs=10,compaction_readahead_size=2097152,compression=kNoCompression
5496   with_legacy: true
5497 - name: filestore_omap_backend
5498   type: str
5499   level: dev
5500   desc: The KeyValueDB to use for filestore metadata (ie omap).
5501   default: rocksdb
5502   enum_values:
5503   - leveldb
5504   - rocksdb
5505   with_legacy: true
5506 - name: filestore_omap_backend_path
5507   type: str
5508   level: dev
5509   desc: The path where the filestore KeyValueDB should store its database(s).
5510   with_legacy: true
5511 # filestore wb throttle limits
5512 - name: filestore_wbthrottle_enable
5513   type: bool
5514   level: advanced
5515   desc: Enabling throttling of operations to backing file system
5516   default: true
5517   with_legacy: true
5518 - name: filestore_wbthrottle_btrfs_bytes_start_flusher
5519   type: size
5520   level: advanced
5521   desc: Start flushing (fsyncing) when this many bytes are written (btrfs)
5522   default: 40_M
5523   with_legacy: true
5524 - name: filestore_wbthrottle_btrfs_bytes_hard_limit
5525   type: size
5526   level: advanced
5527   desc: Block writes when this many bytes haven't been flushed (fsynced) (btrfs)
5528   default: 400_M
5529   with_legacy: true
5530 - name: filestore_wbthrottle_btrfs_ios_start_flusher
5531   type: uint
5532   level: advanced
5533   desc: Start flushing (fsyncing) when this many IOs are written (btrfs)
5534   default: 500
5535   with_legacy: true
5536 - name: filestore_wbthrottle_btrfs_ios_hard_limit
5537   type: uint
5538   level: advanced
5539   desc: Block writes when this many IOs haven't been flushed (fsynced) (btrfs)
5540   default: 5000
5541   with_legacy: true
5542 - name: filestore_wbthrottle_btrfs_inodes_start_flusher
5543   type: uint
5544   level: advanced
5545   desc: Start flushing (fsyncing) when this many distinct inodes have been modified
5546     (btrfs)
5547   default: 500
5548   with_legacy: true
5549 - name: filestore_wbthrottle_xfs_bytes_start_flusher
5550   type: size
5551   level: advanced
5552   desc: Start flushing (fsyncing) when this many bytes are written (xfs)
5553   default: 40_M
5554   with_legacy: true
5555 - name: filestore_wbthrottle_xfs_bytes_hard_limit
5556   type: size
5557   level: advanced
5558   desc: Block writes when this many bytes haven't been flushed (fsynced) (xfs)
5559   default: 400_M
5560   with_legacy: true
5561 - name: filestore_wbthrottle_xfs_ios_start_flusher
5562   type: uint
5563   level: advanced
5564   desc: Start flushing (fsyncing) when this many IOs are written (xfs)
5565   default: 500
5566   with_legacy: true
5567 - name: filestore_wbthrottle_xfs_ios_hard_limit
5568   type: uint
5569   level: advanced
5570   desc: Block writes when this many IOs haven't been flushed (fsynced) (xfs)
5571   default: 5000
5572   with_legacy: true
5573 - name: filestore_wbthrottle_xfs_inodes_start_flusher
5574   type: uint
5575   level: advanced
5576   desc: Start flushing (fsyncing) when this many distinct inodes have been modified
5577     (xfs)
5578   default: 500
5579   with_legacy: true
5580 # These must be less than the fd limit
5581 - name: filestore_wbthrottle_btrfs_inodes_hard_limit
5582   type: uint
5583   level: advanced
5584   desc: Block writing when this many inodes have outstanding writes (btrfs)
5585   default: 5000
5586   with_legacy: true
5587 - name: filestore_wbthrottle_xfs_inodes_hard_limit
5588   type: uint
5589   level: advanced
5590   desc: Block writing when this many inodes have outstanding writes (xfs)
5591   default: 5000
5592   with_legacy: true
5593 # Introduce a O_DSYNC write in the filestore
5594 - name: filestore_odsync_write
5595   type: bool
5596   level: dev
5597   desc: Write with O_DSYNC
5598   default: false
5599   with_legacy: true
5600 # Tests index failure paths
5601 - name: filestore_index_retry_probability
5602   type: float
5603   level: dev
5604   default: 0
5605   with_legacy: true
5606 # Allow object read error injection
5607 - name: filestore_debug_inject_read_err
5608   type: bool
5609   level: dev
5610   default: false
5611   with_legacy: true
5612 - name: filestore_debug_random_read_err
5613   type: float
5614   level: dev
5615   default: 0
5616   with_legacy: true
5617 # Expensive debugging check on sync
5618 - name: filestore_debug_omap_check
5619   type: bool
5620   level: dev
5621   default: false
5622   fmt_desc: Debugging check on synchronization. This is an expensive operation.
5623
5624   with_legacy: true
5625 - name: filestore_omap_header_cache_size
5626   type: size
5627   level: dev
5628   default: 1_K
5629   with_legacy: true
5630 # Use omap for xattrs for attrs over
5631 # filestore_max_inline_xattr_size or
5632 - name: filestore_max_inline_xattr_size
5633   type: size
5634   level: dev
5635   default: 0
5636   with_legacy: true
5637 - name: filestore_max_inline_xattr_size_xfs
5638   type: size
5639   level: dev
5640   default: 64_K
5641   with_legacy: true
5642 - name: filestore_max_inline_xattr_size_btrfs
5643   type: size
5644   level: dev
5645   default: 2_K
5646   with_legacy: true
5647 - name: filestore_max_inline_xattr_size_other
5648   type: size
5649   level: dev
5650   default: 512
5651   with_legacy: true
5652 # for more than filestore_max_inline_xattrs attrs
5653 - name: filestore_max_inline_xattrs
5654   type: uint
5655   level: dev
5656   default: 0
5657   with_legacy: true
5658 - name: filestore_max_inline_xattrs_xfs
5659   type: uint
5660   level: dev
5661   default: 10
5662   with_legacy: true
5663 - name: filestore_max_inline_xattrs_btrfs
5664   type: uint
5665   level: dev
5666   default: 10
5667   with_legacy: true
5668 - name: filestore_max_inline_xattrs_other
5669   type: uint
5670   level: dev
5671   default: 2
5672   with_legacy: true
5673 - name: filestore_max_xattr_value_size
5674   type: size
5675   level: dev
5676   default: 0
5677   with_legacy: true
5678 - name: filestore_max_xattr_value_size_xfs
5679   type: size
5680   level: dev
5681   default: 64_K
5682   with_legacy: true
5683 - name: filestore_max_xattr_value_size_btrfs
5684   type: size
5685   level: dev
5686   default: 64_K
5687   with_legacy: true
5688 # ext4 allows 4k xattrs total including some smallish extra fields and the
5689 # keys.  We're allowing 2 512 inline attrs in addition to some filestore
5690 # replay attrs.  After accounting for those, we still need to fit up to
5691 # two attrs of this value.  That means we need this value to be around 1k
5692 # to be safe.  This is hacky, but it's not worth complicating the code
5693 # to work around ext4's total xattr limit.
5694 - name: filestore_max_xattr_value_size_other
5695   type: size
5696   level: dev
5697   default: 1_K
5698   with_legacy: true
5699 # track sloppy crcs
5700 - name: filestore_sloppy_crc
5701   type: bool
5702   level: dev
5703   default: false
5704   with_legacy: true
5705 - name: filestore_sloppy_crc_block_size
5706   type: size
5707   level: dev
5708   default: 64_K
5709   with_legacy: true
5710 - name: filestore_max_alloc_hint_size
5711   type: size
5712   level: dev
5713   default: 1_M
5714   with_legacy: true
5715 # seconds
5716 - name: filestore_max_sync_interval
5717   type: float
5718   level: advanced
5719   desc: Period between calls to syncfs(2) and journal trims (seconds)
5720   default: 5
5721   with_legacy: true
5722 # seconds
5723 - name: filestore_min_sync_interval
5724   type: float
5725   level: dev
5726   desc: Minimum period between calls to syncfs(2)
5727   default: 0.01
5728   with_legacy: true
5729 - name: filestore_btrfs_snap
5730   type: bool
5731   level: dev
5732   default: true
5733   with_legacy: true
5734 - name: filestore_btrfs_clone_range
5735   type: bool
5736   level: advanced
5737   desc: Use btrfs clone_range ioctl to efficiently duplicate objects
5738   default: true
5739   with_legacy: true
5740 # zfsonlinux is still unstable
5741 - name: filestore_zfs_snap
5742   type: bool
5743   level: dev
5744   default: false
5745   with_legacy: true
5746 - name: filestore_fsync_flushes_journal_data
5747   type: bool
5748   level: dev
5749   default: false
5750   with_legacy: true
5751 # (try to) use fiemap
5752 - name: filestore_fiemap
5753   type: bool
5754   level: advanced
5755   desc: Use fiemap ioctl(2) to determine which parts of objects are sparse
5756   default: false
5757   with_legacy: true
5758 - name: filestore_punch_hole
5759   type: bool
5760   level: advanced
5761   desc: Use fallocate(2) FALLOC_FL_PUNCH_HOLE to efficiently zero ranges of objects
5762   default: false
5763   with_legacy: true
5764 # (try to) use seek_data/hole
5765 - name: filestore_seek_data_hole
5766   type: bool
5767   level: advanced
5768   desc: Use lseek(2) SEEK_HOLE and SEEK_DATA to determine which parts of objects are
5769     sparse
5770   default: false
5771   with_legacy: true
5772 - name: filestore_splice
5773   type: bool
5774   level: advanced
5775   desc: Use splice(2) to more efficiently copy data between files
5776   default: false
5777   with_legacy: true
5778 - name: filestore_fadvise
5779   type: bool
5780   level: advanced
5781   desc: Use posix_fadvise(2) to pass hints to file system
5782   default: true
5783   with_legacy: true
5784 # collect device partition information for management application to use
5785 - name: filestore_collect_device_partition_information
5786   type: bool
5787   level: advanced
5788   desc: Collect metadata about the backing file system on OSD startup
5789   default: true
5790   with_legacy: true
5791 # (try to) use extsize for alloc hint NOTE: extsize seems to trigger
5792 # data corruption in xfs prior to kernel 3.5.  filestore will
5793 # implicitly disable this if it cannot confirm the kernel is newer
5794 # than that.
5795 # NOTE: This option involves a tradeoff: When disabled, fragmentation is
5796 # worse, but large sequential writes are faster. When enabled, large
5797 # sequential writes are slower, but fragmentation is reduced.
5798 - name: filestore_xfs_extsize
5799   type: bool
5800   level: advanced
5801   desc: Use XFS extsize ioctl(2) to hint allocator about expected write sizes
5802   default: false
5803   with_legacy: true
5804 - name: filestore_journal_parallel
5805   type: bool
5806   level: dev
5807   default: false
5808   with_legacy: true
5809 - name: filestore_journal_writeahead
5810   type: bool
5811   level: dev
5812   default: false
5813   with_legacy: true
5814 - name: filestore_journal_trailing
5815   type: bool
5816   level: dev
5817   default: false
5818   with_legacy: true
5819 - name: filestore_queue_max_ops
5820   type: uint
5821   level: advanced
5822   desc: Max IO operations in flight
5823   default: 50
5824   with_legacy: true
5825 - name: filestore_queue_max_bytes
5826   type: size
5827   level: advanced
5828   desc: Max (written) bytes in flight
5829   default: 100_M
5830   with_legacy: true
5831 - name: filestore_caller_concurrency
5832   type: int
5833   level: dev
5834   default: 10
5835   with_legacy: true
5836 # Expected filestore throughput in B/s
5837 - name: filestore_expected_throughput_bytes
5838   type: float
5839   level: advanced
5840   desc: Expected throughput of backend device (aids throttling calculations)
5841   default: 209715200
5842   with_legacy: true
5843 # Expected filestore throughput in ops/s
5844 - name: filestore_expected_throughput_ops
5845   type: float
5846   level: advanced
5847   desc: Expected throughput of backend device in IOPS (aids throttling calculations)
5848   default: 200
5849   with_legacy: true
5850 # Filestore max delay multiple.  Defaults to 0 (disabled)
5851 - name: filestore_queue_max_delay_multiple
5852   type: float
5853   level: dev
5854   default: 0
5855   with_legacy: true
5856 # Filestore high delay multiple.  Defaults to 0 (disabled)
5857 - name: filestore_queue_high_delay_multiple
5858   type: float
5859   level: dev
5860   default: 0
5861   with_legacy: true
5862 # Filestore max delay multiple bytes.  Defaults to 0 (disabled)
5863 - name: filestore_queue_max_delay_multiple_bytes
5864   type: float
5865   level: dev
5866   default: 0
5867   with_legacy: true
5868 # Filestore high delay multiple bytes.  Defaults to 0 (disabled)
5869 - name: filestore_queue_high_delay_multiple_bytes
5870   type: float
5871   level: dev
5872   default: 0
5873   with_legacy: true
5874 # Filestore max delay multiple ops.  Defaults to 0 (disabled)
5875 - name: filestore_queue_max_delay_multiple_ops
5876   type: float
5877   level: dev
5878   default: 0
5879   with_legacy: true
5880 # Filestore high delay multiple ops.  Defaults to 0 (disabled)
5881 - name: filestore_queue_high_delay_multiple_ops
5882   type: float
5883   level: dev
5884   default: 0
5885   with_legacy: true
5886 - name: filestore_queue_low_threshhold
5887   type: float
5888   level: dev
5889   default: 0.3
5890   with_legacy: true
5891 - name: filestore_queue_high_threshhold
5892   type: float
5893   level: dev
5894   default: 0.9
5895   with_legacy: true
5896 - name: filestore_op_threads
5897   type: int
5898   level: advanced
5899   desc: Threads used to apply changes to backing file system
5900   default: 2
5901   with_legacy: true
5902 - name: filestore_op_thread_timeout
5903   type: int
5904   level: advanced
5905   desc: Seconds before a worker thread is considered stalled
5906   default: 1_min
5907   with_legacy: true
5908 - name: filestore_op_thread_suicide_timeout
5909   type: int
5910   level: advanced
5911   desc: Seconds before a worker thread is considered dead
5912   default: 3_min
5913   with_legacy: true
5914 - name: filestore_commit_timeout
5915   type: float
5916   level: advanced
5917   desc: Seconds before backing file system is considered hung
5918   default: 10_min
5919   with_legacy: true
5920 - name: filestore_fiemap_threshold
5921   type: size
5922   level: dev
5923   default: 4_K
5924   with_legacy: true
5925 - name: filestore_merge_threshold
5926   type: int
5927   level: dev
5928   default: -10
5929   with_legacy: true
5930 - name: filestore_split_multiple
5931   type: int
5932   level: dev
5933   default: 2
5934   with_legacy: true
5935 - name: filestore_split_rand_factor
5936   type: uint
5937   level: dev
5938   default: 20
5939   with_legacy: true
5940 - name: filestore_update_to
5941   type: int
5942   level: dev
5943   default: 1000
5944   with_legacy: true
5945 - name: filestore_blackhole
5946   type: bool
5947   level: dev
5948   default: false
5949   with_legacy: true
5950 - name: filestore_fd_cache_size
5951   type: int
5952   level: dev
5953   default: 128
5954   with_legacy: true
5955 - name: filestore_fd_cache_shards
5956   type: int
5957   level: dev
5958   default: 16
5959   with_legacy: true
5960 - name: filestore_ondisk_finisher_threads
5961   type: int
5962   level: dev
5963   default: 1
5964   with_legacy: true
5965 - name: filestore_apply_finisher_threads
5966   type: int
5967   level: dev
5968   default: 1
5969   with_legacy: true
5970 # file into which store transactions are dumped
5971 - name: filestore_dump_file
5972   type: str
5973   level: dev
5974   with_legacy: true
5975 # inject a failure at the n'th opportunity
5976 - name: filestore_kill_at
5977   type: int
5978   level: dev
5979   default: 0
5980   with_legacy: true
5981 # artificially stall for N seconds in op queue thread
5982 - name: filestore_inject_stall
5983   type: int
5984   level: dev
5985   default: 0
5986   with_legacy: true
5987 # fail/crash on EIO
5988 - name: filestore_fail_eio
5989   type: bool
5990   level: dev
5991   default: true
5992   with_legacy: true
5993 - name: filestore_debug_verify_split
5994   type: bool
5995   level: dev
5996   default: false
5997   with_legacy: true
5998 - name: journal_dio
5999   type: bool
6000   level: dev
6001   default: true
6002   fmt_desc: Enables direct i/o to the journal. Requires ``journal block
6003    align`` set to ``true``.
6004   with_legacy: true
6005 - name: journal_aio
6006   type: bool
6007   level: dev
6008   default: true
6009   fmt_desc: Enables using ``libaio`` for asynchronous writes to the journal.
6010    Requires ``journal dio`` set to ``true``. Version 0.61 and later, ``true``.
6011    Version 0.60 and earlier, ``false``.
6012   with_legacy: true
6013 - name: journal_force_aio
6014   type: bool
6015   level: dev
6016   default: false
6017   with_legacy: true
6018 - name: journal_block_size
6019   type: size
6020   level: dev
6021   default: 4_K
6022   with_legacy: true
6023 - name: journal_block_align
6024   type: bool
6025   level: dev
6026   default: true
6027   fmt_desc: Block aligns write operations. Required for ``dio`` and ``aio``.
6028   with_legacy: true
6029 - name: journal_write_header_frequency
6030   type: uint
6031   level: dev
6032   default: 0
6033   with_legacy: true
6034 - name: journal_max_write_bytes
6035   type: size
6036   level: advanced
6037   desc: Max bytes in flight to journal
6038   fmt_desc: The maximum number of bytes the journal will write at
6039    any one time.
6040   default: 10_M
6041   with_legacy: true
6042 - name: journal_max_write_entries
6043   type: int
6044   level: advanced
6045   desc: Max IOs in flight to journal
6046   fmt_desc: The maximum number of entries the journal will write at
6047    any one time.
6048   default: 100
6049   with_legacy: true
6050 # Target range for journal fullness
6051 - name: journal_throttle_low_threshhold
6052   type: float
6053   level: dev
6054   default: 0.6
6055   with_legacy: true
6056 - name: journal_throttle_high_threshhold
6057   type: float
6058   level: dev
6059   default: 0.9
6060   with_legacy: true
6061 # Multiple over expected at high_threshhold. Defaults to 0 (disabled).
6062 - name: journal_throttle_high_multiple
6063   type: float
6064   level: dev
6065   default: 0
6066   with_legacy: true
6067 # Multiple over expected at max.  Defaults to 0 (disabled).
6068 - name: journal_throttle_max_multiple
6069   type: float
6070   level: dev
6071   default: 0
6072   with_legacy: true
6073 # align data payloads >= this.
6074 - name: journal_align_min_size
6075   type: size
6076   level: dev
6077   default: 64_K
6078   fmt_desc: Align data payloads greater than the specified minimum.
6079   with_legacy: true
6080 - name: journal_replay_from
6081   type: int
6082   level: dev
6083   default: 0
6084   with_legacy: true
6085 - name: journal_zero_on_create
6086   type: bool
6087   level: dev
6088   default: false
6089   fmt_desc: |
6090     Causes the file store to overwrite the entire journal with
6091     ``0``'s during ``mkfs``.
6092   with_legacy: true
6093 # assume journal is not corrupt
6094 - name: journal_ignore_corruption
6095   type: bool
6096   level: dev
6097   default: false
6098   with_legacy: true
6099 # when using an SSD as the journal, whether to discard unused journal data.
6100 - name: journal_discard
6101   type: bool
6102   level: dev
6103   default: false
6104   with_legacy: true
6105 # fio data directory for fio-objectstore
6106 - name: fio_dir
6107   type: str
6108   level: advanced
6109   default: /tmp/fio
6110   with_legacy: true
6111 - name: rados_mon_op_timeout
6112   type: secs
6113   level: advanced
6114   desc: timeout for operations handled by monitors such as statfs (0 is unlimited)
6115   default: 0
6116   min: 0
6117   flags:
6118   - runtime
6119 - name: rados_osd_op_timeout
6120   type: secs
6121   level: advanced
6122   desc: timeout for operations handled by osds such as write (0 is unlimited)
6123   default: 0
6124   min: 0
6125   flags:
6126   - runtime
6127 # true if LTTng-UST tracepoints should be enabled
6128 - name: rados_tracing
6129   type: bool
6130   level: advanced
6131   default: false
6132   with_legacy: true
6133 - name: mgr_connect_retry_interval
6134   type: float
6135   level: dev
6136   default: 1
6137   services:
6138   - common
6139 - name: mgr_client_service_daemon_unregister_timeout
6140   type: float
6141   level: dev
6142   desc: Time to wait during shutdown to deregister service with mgr
6143   default: 1
6144 - name: throttler_perf_counter
6145   type: bool
6146   level: advanced
6147   default: true
6148   with_legacy: true
6149 - name: event_tracing
6150   type: bool
6151   level: advanced
6152   default: false
6153   with_legacy: true
6154 - name: bluestore_tracing
6155   type: bool
6156   level: advanced
6157   desc: Enable bluestore event tracing.
6158   default: false
6159 - name: bluestore_throttle_trace_rate
6160   type: float
6161   level: advanced
6162   desc: Rate at which to sample bluestore transactions (per second)
6163   default: 0
6164 - name: debug_deliberately_leak_memory
6165   type: bool
6166   level: dev
6167   default: false
6168   with_legacy: true
6169 - name: debug_asserts_on_shutdown
6170   type: bool
6171   level: dev
6172   desc: Enable certain asserts to check for refcounting bugs on shutdown; see http://tracker.ceph.com/issues/21738
6173   default: false
6174 - name: debug_asok_assert_abort
6175   type: bool
6176   level: dev
6177   desc: allow commands 'assert' and 'abort' via asok for testing crash dumps etc
6178   default: false
6179   with_legacy: true
6180 - name: target_max_misplaced_ratio
6181   type: float
6182   level: basic
6183   desc: Max ratio of misplaced objects to target when throttling data rebalancing
6184     activity
6185   default: 0.05
6186 - name: device_failure_prediction_mode
6187   type: str
6188   level: basic
6189   desc: Method used to predict device failures
6190   long_desc: To disable prediction, use 'none'.  'local' uses a prediction model that
6191     runs inside the mgr daemon.  'cloud' will share metrics with a cloud service and
6192     query the service for device life expectancy.
6193   default: none
6194   enum_values:
6195   - none
6196   - local
6197   - cloud
6198   flags:
6199   - runtime
6200 - name: gss_ktab_client_file
6201   type: str
6202   level: advanced
6203   desc: GSS/KRB5 Keytab file for client authentication
6204   long_desc: This sets the full path for the GSS/Kerberos client keytab file location.
6205   default: /var/lib/ceph/$name/gss_client_$name.ktab
6206   services:
6207   - mon
6208   - osd
6209 - name: gss_target_name
6210   type: str
6211   level: advanced
6212   long_desc: This sets the gss target service name.
6213   default: ceph
6214   services:
6215   - mon
6216   - osd
6217 - name: debug_disable_randomized_ping
6218   type: bool
6219   level: dev
6220   desc: Disable heartbeat ping randomization for testing purposes
6221   default: false
6222 - name: debug_heartbeat_testing_span
6223   type: int
6224   level: dev
6225   desc: Override 60 second periods for testing only
6226   default: 0
6227 - name: librados_thread_count
6228   type: uint
6229   level: advanced
6230   desc: Size of thread pool for Objecter
6231   default: 2
6232   tags:
6233   - client
6234   min: 1
6235 - name: osd_asio_thread_count
6236   type: uint
6237   level: advanced
6238   desc: Size of thread pool for ASIO completions
6239   default: 2
6240   tags:
6241   - osd
6242   min: 1
6243 - name: cephsqlite_lock_renewal_interval
6244   type: millisecs
6245   level: advanced
6246   desc: number of milliseconds before lock is renewed
6247   default: 2000
6248   tags:
6249   - client
6250   see_also:
6251   - cephsqlite_lock_renewal_timeout
6252   min: 100
6253 - name: cephsqlite_lock_renewal_timeout
6254   type: millisecs
6255   level: advanced
6256   desc: number of milliseconds before transaction lock times out
6257   long_desc: The amount of time before a running libcephsqlite VFS connection has
6258     to renew a lock on the database before the lock is automatically lost. If the
6259     lock is lost, the VFS will abort the process to prevent database corruption.
6260   default: 30000
6261   tags:
6262   - client
6263   see_also:
6264   - cephsqlite_lock_renewal_interval
6265   min: 100
6266 - name: cephsqlite_blocklist_dead_locker
6267   type: bool
6268   level: advanced
6269   desc: blocklist the last dead owner of the database lock
6270   long_desc: Require that the Ceph SQLite VFS blocklist the last dead owner of the
6271     database when cleanup was incomplete. DO NOT CHANGE THIS UNLESS YOU UNDERSTAND
6272     THE RAMIFICATIONS. CORRUPTION MAY RESULT.
6273   default: true
6274   tags:
6275   - client
6276 - name: bdev_type
6277   type: str
6278   level: advanced
6279   desc: Explicitly set the device type to select the driver if it's needed
6280   enum_values:
6281   - aio
6282   - spdk
6283   - pmem
6284   - hm_smr
6285 - name: bluestore_cleaner_sleep_interval
6286   type: float
6287   level: advanced
6288   desc: How long cleaner should sleep before re-checking utilization
6289   default: 5
6290   with_legacy: true
6291 - name: jaeger_tracing_enable
6292   type: bool
6293   level: advanced
6294   desc: Ceph should use jaeger tracing system
6295   default: false
6296   services:
6297   - rgw
6298   - osd
6299   with_legacy: true
6300 - name: mgr_ttl_cache_expire_seconds
6301   type: uint
6302   level: dev
6303   desc: Set the time to live in seconds - set to 0 to disable the cache.
6304   default: 0
6305   services:
6306   - mgr