1 # -*- mode: YAML -*-
2 ---
3
4 options:
5 - name: osd_numa_prefer_iface
6 type: bool
7 level: advanced
8 desc: prefer IP on network interface on same numa node as storage
9 default: true
10 see_also:
11 - osd_numa_auto_affinity
12 flags:
13 - startup
14 - name: osd_numa_auto_affinity
15 type: bool
16 level: advanced
17 desc: automatically set affinity to numa node when storage and network match
18 default: true
19 flags:
20 - startup
21 - name: osd_numa_node
22 type: int
23 level: advanced
24 desc: set affinity to a numa node (-1 for none)
25 default: -1
26 see_also:
27 - osd_numa_auto_affinity
28 flags:
29 - startup
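# Illustrative example (not part of the schema): NUMA pinning is usually applied
# per daemon; the daemon id osd.0 and node id 0 below are arbitrary, and because
# the option carries the startup flag, a daemon restart is assumed afterwards
# (via systemd or the orchestrator, depending on the deployment).
#   ceph config set osd.0 osd_numa_node 0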
30 - name: osd_smart_report_timeout
31 type: uint
32 level: advanced
33 desc: Timeout (in seconds) for smartctl to run; default is 5
34 default: 5
35 # verify backend can support configured max object name length
36 - name: osd_check_max_object_name_len_on_startup
37 type: bool
38 level: dev
39 default: true
40 with_legacy: true
41 - name: osd_max_backfills
42 type: uint
43 level: advanced
44 desc: Maximum number of concurrent local and remote backfills or recoveries per
45 OSD
46 long_desc: There can be osd_max_backfills local reservations AND the same number
47 of remote reservations per OSD. So a value of 1 lets this OSD participate as 1
48 PG primary in recovery and as 1 shard of another recovering PG.
49 fmt_desc: The maximum number of backfills allowed to or from a single OSD.
50 Note that this is applied separately for read and write operations.
51 default: 1
52 flags:
53 - runtime
54 with_legacy: true
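# Illustrative example (not part of the schema): osd_max_backfills carries the
# runtime flag, so it can be raised temporarily to speed up backfill and lowered
# again afterwards; the value 3 below is arbitrary.
#   ceph config set osd osd_max_backfills 3
#   ceph config get osd osd_max_backfills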
55 # Minimum recovery priority (255 = max, smaller = lower)
56 - name: osd_min_recovery_priority
57 type: int
58 level: advanced
59 desc: Minimum priority below which recovery is not performed
60 long_desc: The purpose here is to prevent the cluster from doing *any* lower priority
61 work (e.g., rebalancing) below this threshold and focus solely on higher priority
62 work (e.g., replicating degraded objects).
63 default: 0
64 with_legacy: true
65 - name: osd_backfill_retry_interval
66 type: float
67 level: advanced
68 desc: how frequently to retry backfill reservations after being denied (e.g., due
69 to a full OSD)
70 fmt_desc: The number of seconds to wait before retrying backfill requests.
71 default: 30
72 with_legacy: true
73 - name: osd_recovery_retry_interval
74 type: float
75 level: advanced
76 desc: how frequently to retry recovery reservations after being denied (e.g., due
77 to a full OSD)
78 default: 30
79 with_legacy: true
80 - name: osd_recovery_sleep
81 type: float
82 level: advanced
83 desc: Time in seconds to sleep before next recovery or backfill op
84 fmt_desc: Time in seconds to sleep before the next recovery or backfill op.
85 Increasing this value will slow down recovery operations while
86 client operations will be less impacted.
87 default: 0
88 flags:
89 - runtime
90 with_legacy: true
91 - name: osd_recovery_sleep_hdd
92 type: float
93 level: advanced
94 desc: Time in seconds to sleep before next recovery or backfill op for HDDs
95 fmt_desc: Time in seconds to sleep before next recovery or backfill op
96 for HDDs.
97 default: 0.1
98 flags:
99 - runtime
100 with_legacy: true
101 - name: osd_recovery_sleep_ssd
102 type: float
103 level: advanced
104 desc: Time in seconds to sleep before next recovery or backfill op for SSDs
105 fmt_desc: Time in seconds to sleep before the next recovery or backfill op
106 for SSDs.
107 default: 0
108 see_also:
109 - osd_recovery_sleep
110 flags:
111 - runtime
112 with_legacy: true
113 - name: osd_recovery_sleep_hybrid
114 type: float
115 level: advanced
116 desc: Time in seconds to sleep before next recovery or backfill op when data is
117 on HDD and journal is on SSD
118 fmt_desc: Time in seconds to sleep before the next recovery or backfill op
119 when OSD data is on HDD and OSD journal / WAL+DB is on SSD.
120 default: 0.025
121 see_also:
122 - osd_recovery_sleep
123 flags:
124 - runtime
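# Illustrative example (not part of the schema): the recovery sleep options are
# runtime options, so recovery can be throttled on hybrid (HDD data + SSD WAL/DB)
# OSDs without a restart; the value 0.1 below is arbitrary.
#   ceph config set osd osd_recovery_sleep_hybrid 0.1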
125 - name: osd_snap_trim_sleep
126 type: float
127 level: advanced
128 desc: Time in seconds to sleep before next snap trim (overrides values below)
129 fmt_desc: Time in seconds to sleep before next snap trim op.
130 Increasing this value will slow down snap trimming.
131 This option overrides backend specific variants.
132 default: 0
133 flags:
134 - runtime
135 with_legacy: true
136 - name: osd_snap_trim_sleep_hdd
137 type: float
138 level: advanced
139 desc: Time in seconds to sleep before next snap trim for HDDs
140 default: 5
141 flags:
142 - runtime
143 - name: osd_snap_trim_sleep_ssd
144 type: float
145 level: advanced
146 desc: Time in seconds to sleep before next snap trim for SSDs
147 fmt_desc: Time in seconds to sleep before next snap trim op
148 for SSD OSDs (including NVMe).
149 default: 0
150 flags:
151 - runtime
152 - name: osd_snap_trim_sleep_hybrid
153 type: float
154 level: advanced
155 desc: Time in seconds to sleep before next snap trim when data is on HDD and journal
156 is on SSD
157 fmt_desc: Time in seconds to sleep before next snap trim op
158 when OSD data is on an HDD and the OSD journal or WAL+DB is on an SSD.
159 default: 2
160 flags:
161 - runtime
162 - name: osd_scrub_invalid_stats
163 type: bool
164 level: advanced
165 default: true
166 with_legacy: true
167 - name: osd_max_scrubs
168 type: int
169 level: advanced
170 desc: Maximum concurrent scrubs on a single OSD
171 fmt_desc: The maximum number of simultaneous scrub operations for
172 a Ceph OSD Daemon.
173 default: 1
174 with_legacy: true
175 - name: osd_scrub_during_recovery
176 type: bool
177 level: advanced
178 desc: Allow scrubbing when PGs on the OSD are undergoing recovery
179 fmt_desc: Allow scrub during recovery. Setting this to ``false`` will disable
180 scheduling new scrubs (and deep-scrubs) while there is active recovery.
181 Scrubs that are already running will continue. This might be useful to reduce
182 load on busy clusters.
183 default: false
184 with_legacy: true
185 - name: osd_repair_during_recovery
186 type: bool
187 level: advanced
188 desc: Allow requested repairing when PGs on the OSD are undergoing recovery
189 default: false
190 with_legacy: true
191 - name: osd_scrub_begin_hour
192 type: int
193 level: advanced
194 desc: Restrict scrubbing to this hour of the day or later
195 long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day.
196 fmt_desc: This restricts scrubbing to this hour of the day or later.
197 Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0``
198 to allow scrubbing the entire day. Along with ``osd_scrub_end_hour``, this
199 defines a time window in which scrubs can happen.
200 But a scrub will be performed
201 regardless of the time window, as long as the placement
202 group's scrub interval exceeds ``osd_scrub_max_interval``.
203 default: 0
204 see_also:
205 - osd_scrub_end_hour
206 min: 0
207 max: 23
208 with_legacy: true
209 - name: osd_scrub_end_hour
210 type: int
211 level: advanced
212 desc: Restrict scrubbing to hours of the day earlier than this
213 long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day.
214 fmt_desc: This restricts scrubbing to hours of the day earlier than this.
215 Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0`` to allow scrubbing
216 for the entire day. Along with ``osd_scrub_begin_hour``, this defines a time
217 window in which scrubs can happen. But a scrub will be performed
218 regardless of the time window, as long as the placement
219 group's scrub interval exceeds ``osd_scrub_max_interval``.
220 default: 0
221 see_also:
222 - osd_scrub_begin_hour
223 min: 0
224 max: 23
225 with_legacy: true
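# Illustrative example (not part of the schema): confining scrubs to a nightly
# window, assuming a window that wraps past midnight (begin > end) is acceptable
# in your environment; the hours below are arbitrary. A PG whose scrub interval
# exceeds osd_scrub_max_interval is still scrubbed outside the window.
#   ceph config set osd osd_scrub_begin_hour 23
#   ceph config set osd osd_scrub_end_hour 6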
226 - name: osd_scrub_begin_week_day
227 type: int
228 level: advanced
229 desc: Restrict scrubbing to this day of the week or later
230 long_desc: 0 = Sunday, 1 = Monday, etc. Use osd_scrub_begin_week_day=0 osd_scrub_end_week_day=0
231 for the entire week.
232 fmt_desc: This restricts scrubbing to this day of the week or later.
233 0 = Sunday, 1 = Monday, etc. Use ``osd_scrub_begin_week_day = 0``
234 and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week.
235 Along with ``osd_scrub_end_week_day``, this defines a time window in which
236 scrubs can happen. But a scrub will be performed
237 regardless of the time window, when the PG's
238 scrub interval exceeds ``osd_scrub_max_interval``.
239 default: 0
240 see_also:
241 - osd_scrub_end_week_day
242 min: 0
243 max: 6
244 with_legacy: true
245 - name: osd_scrub_end_week_day
246 type: int
247 level: advanced
248 desc: Restrict scrubbing to days of the week earlier than this
249 long_desc: 0 = Sunday, 1 = Monday, etc. Use osd_scrub_begin_week_day=0 osd_scrub_end_week_day=0
250 for the entire week.
251 fmt_desc: This restricts scrubbing to days of the week earlier than this.
252 0 = Sunday, 1 = Monday, etc. Use ``osd_scrub_begin_week_day = 0``
253 and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week.
254 Along with ``osd_scrub_begin_week_day``, this defines a time
255 window in which scrubs can happen. But a scrub will be performed
256 regardless of the time window, as long as the placement
257 group's scrub interval exceeds ``osd_scrub_max_interval``.
258 default: 0
259 see_also:
260 - osd_scrub_begin_week_day
261 min: 0
262 max: 6
263 with_legacy: true
264 - name: osd_scrub_load_threshold
265 type: float
266 level: advanced
267 desc: Allow scrubbing when system load divided by number of CPUs is below this value
268 fmt_desc: The normalized maximum load. Ceph will not scrub when the system load
269 (as defined by ``getloadavg() / number of online CPUs``) is higher than this number.
270 Default is ``0.5``.
271 default: 0.5
272 with_legacy: true
273 # if load is low
274 - name: osd_scrub_min_interval
275 type: float
276 level: advanced
277 desc: Scrub each PG no more often than this interval
278 fmt_desc: The minimal interval in seconds for scrubbing the Ceph OSD Daemon
279 when the Ceph Storage Cluster load is low.
280 default: 1_day
281 see_also:
282 - osd_scrub_max_interval
283 with_legacy: true
284 # regardless of load
285 - name: osd_scrub_max_interval
286 type: float
287 level: advanced
288 desc: Scrub each PG no less often than this interval
289 fmt_desc: The maximum interval in seconds for scrubbing the Ceph OSD Daemon
290 irrespective of cluster load.
291 default: 7_day
292 see_also:
293 - osd_scrub_min_interval
294 with_legacy: true
295 # randomize the scheduled scrub in the span of [min,min*(1+randomize_ratio))
296 - name: osd_scrub_interval_randomize_ratio
297 type: float
298 level: advanced
299 desc: Ratio of scrub interval to randomly vary
300 long_desc: This prevents a scrub 'stampede' by randomly varying the scrub intervals
301 so that they are soon uniformly distributed over the week
302 fmt_desc: Add a random delay to ``osd_scrub_min_interval`` when scheduling
303 the next scrub job for a PG. The delay is a random
304 value less than ``osd_scrub_min_interval`` \*
305 ``osd_scrub_interval_randomize_ratio``. The default setting
306 spreads scrubs throughout the allowed time
307 window of ``[1, 1.5]`` \* ``osd_scrub_min_interval``.
308 default: 0.5
309 see_also:
310 - osd_scrub_min_interval
311 with_legacy: true
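# Worked example (illustrative): with the defaults osd_scrub_min_interval = 1 day
# and osd_scrub_interval_randomize_ratio = 0.5, the scheduled interval is drawn
# from [min, min * (1 + ratio)), i.e. between 24 and 36 hours for each PG.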
312 # the probability to back off the scheduled scrub
313 - name: osd_scrub_backoff_ratio
314 type: float
315 level: dev
316 desc: Backoff ratio for scheduling scrubs
317 long_desc: This is the percentage of ticks that do NOT schedule scrubs; 66% means
318 that 1 out of 3 ticks will schedule scrubs
319 default: 0.66
320 with_legacy: true
321 - name: osd_scrub_chunk_min
322 type: int
323 level: advanced
324 desc: Minimum number of objects to scrub in a single chunk
325 fmt_desc: The minimal number of object store chunks to scrub during a single operation.
326 Ceph blocks writes to a single chunk during scrub.
327 default: 5
328 see_also:
329 - osd_scrub_chunk_max
330 with_legacy: true
331 - name: osd_scrub_chunk_max
332 type: int
333 level: advanced
334 desc: Maximum number of objects to scrub in a single chunk
335 fmt_desc: The maximum number of object store chunks to scrub during a single operation.
336 default: 25
337 see_also:
338 - osd_scrub_chunk_min
339 with_legacy: true
340 # sleep between [deep]scrub ops
341 - name: osd_scrub_sleep
342 type: float
343 level: advanced
344 desc: Duration to inject a delay during scrubbing
345 fmt_desc: Time to sleep before scrubbing the next group of chunks. Increasing this value will slow
346 down the overall rate of scrubbing so that client operations will be less impacted.
347 default: 0
348 flags:
349 - runtime
350 with_legacy: true
351 # more sleep between [deep]scrub ops
352 - name: osd_scrub_extended_sleep
353 type: float
354 level: advanced
355 desc: Duration to inject a delay during scrubbing out of scrubbing hours
356 default: 0
357 see_also:
358 - osd_scrub_begin_hour
359 - osd_scrub_end_hour
360 - osd_scrub_begin_week_day
361 - osd_scrub_end_week_day
362 with_legacy: true
363 # whether auto-repair inconsistencies upon deep-scrubbing
364 - name: osd_scrub_auto_repair
365 type: bool
366 level: advanced
367 desc: Automatically repair damaged objects detected during scrub
368 fmt_desc: Setting this to ``true`` will enable automatic PG repair when errors
369 are found by scrubs or deep-scrubs. However, if more than
370 ``osd_scrub_auto_repair_num_errors`` errors are found, a repair is NOT performed.
371 default: false
372 with_legacy: true
373 # only auto-repair when number of errors is below this threshold
374 - name: osd_scrub_auto_repair_num_errors
375 type: uint
376 level: advanced
377 desc: Maximum number of detected errors to automatically repair
378 fmt_desc: Auto repair will not occur if more than this many errors are found.
379 default: 5
380 see_also:
381 - osd_scrub_auto_repair
382 with_legacy: true
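# Illustrative example (not part of the schema): enabling automatic repair while
# keeping the default cap of 5 errors above which no auto repair is attempted.
#   ceph config set osd osd_scrub_auto_repair true
#   ceph config get osd osd_scrub_auto_repair_num_errors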
383 - name: osd_scrub_max_preemptions
384 type: uint
385 level: advanced
386 desc: Set the maximum number of times we will preempt a deep scrub due to a client
387 operation before blocking client IO to complete the scrub
388 default: 5
389 min: 0
390 max: 30
391 - name: osd_deep_scrub_interval
392 type: float
393 level: advanced
394 desc: Deep scrub each PG (i.e., verify data checksums) at least this often
395 fmt_desc: The interval for "deep" scrubbing (fully reading all data). The
396 ``osd_scrub_load_threshold`` does not affect this setting.
397 default: 7_day
398 with_legacy: true
399 - name: osd_deep_scrub_randomize_ratio
400 type: float
401 level: advanced
402 desc: Scrubs will randomly become deep scrubs at this rate (0.15 -> 15% of scrubs
403 are deep)
404 long_desc: This prevents a deep scrub 'stampede' by spreading deep scrubs so they
405 are uniformly distributed over the week
406 default: 0.15
407 with_legacy: true
408 - name: osd_deep_scrub_stride
409 type: size
410 level: advanced
411 desc: Number of bytes to read from an object at a time during deep scrub
412 fmt_desc: Read size when doing a deep scrub.
413 default: 512_K
414 with_legacy: true
415 - name: osd_deep_scrub_keys
416 type: int
417 level: advanced
418 desc: Number of keys to read from an object at a time during deep scrub
419 default: 1024
420 with_legacy: true
421 # objects must be this old (seconds) before we update the whole-object digest on scrub
422 - name: osd_deep_scrub_update_digest_min_age
423 type: int
424 level: advanced
425 desc: Update overall object digest only if object was last modified longer ago than
426 this
427 default: 2_hr
428 with_legacy: true
429 - name: osd_deep_scrub_large_omap_object_key_threshold
430 type: uint
431 level: advanced
432 desc: Warn when we encounter an object with more omap keys than this
433 default: 200000
434 services:
435 - osd
436 - mds
437 see_also:
438 - osd_deep_scrub_large_omap_object_value_sum_threshold
439 with_legacy: true
440 - name: osd_deep_scrub_large_omap_object_value_sum_threshold
441 type: size
442 level: advanced
443 desc: Warn when we encounter an object with more omap key bytes than this
444 default: 1_G
445 services:
446 - osd
447 see_also:
448 - osd_deep_scrub_large_omap_object_key_threshold
449 with_legacy: true
450 # where rados plugins are stored
451 - name: osd_class_dir
452 type: str
453 level: advanced
454 default: @CMAKE_INSTALL_LIBDIR@/rados-classes
455 fmt_desc: The class path for RADOS class plug-ins.
456 with_legacy: true
457 - name: osd_open_classes_on_start
458 type: bool
459 level: advanced
460 default: true
461 with_legacy: true
462 # list of object classes allowed to be loaded (allow all: *)
463 - name: osd_class_load_list
464 type: str
465 level: advanced
466 default: cephfs hello journal lock log numops otp rbd refcount rgw rgw_gc timeindex
467 user version cas cmpomap queue 2pc_queue fifo
468 with_legacy: true
469 # list of object classes with default execute perm (allow all: *)
470 - name: osd_class_default_list
471 type: str
472 level: advanced
473 default: cephfs hello journal lock log numops otp rbd refcount rgw rgw_gc timeindex
474 user version cas cmpomap queue 2pc_queue fifo
475 with_legacy: true
476 - name: osd_agent_max_ops
477 type: int
478 level: advanced
479 desc: maximum concurrent tiering operations for tiering agent
480 fmt_desc: The maximum number of simultaneous flushing ops per tiering agent
481 in the high speed mode.
482 default: 4
483 with_legacy: true
484 - name: osd_agent_max_low_ops
485 type: int
486 level: advanced
487 desc: maximum concurrent low-priority tiering operations for tiering agent
488 fmt_desc: The maximum number of simultaneous flushing ops per tiering agent
489 in the low speed mode.
490 default: 2
491 with_legacy: true
492 - name: osd_agent_min_evict_effort
493 type: float
494 level: advanced
495 desc: minimum effort to expend evicting clean objects
496 default: 0.1
497 min: 0
498 max: 0.99
499 with_legacy: true
500 - name: osd_agent_quantize_effort
501 type: float
502 level: advanced
503 desc: size of quantize unit for eviction effort
504 default: 0.1
505 with_legacy: true
506 - name: osd_agent_delay_time
507 type: float
508 level: advanced
509 desc: how long agent should sleep if it has no work to do
510 default: 5
511 with_legacy: true
512 # decay atime and hist histograms after how many objects go by
513 - name: osd_agent_hist_halflife
514 type: int
515 level: advanced
516 desc: halflife of agent atime and temp histograms
517 default: 1000
518 with_legacy: true
520 - name: osd_agent_slop
521 type: float
522 level: advanced
523 desc: slop factor to avoid switching tiering flush and eviction mode
524 default: 0.02
525 with_legacy: true
526 - name: osd_find_best_info_ignore_history_les
527 type: bool
528 level: dev
529 desc: ignore last_epoch_started value when peering AND PROBABLY LOSE DATA
530 long_desc: THIS IS AN EXTREMELY DANGEROUS OPTION THAT SHOULD ONLY BE USED AT THE
531 DIRECTION OF A DEVELOPER. It makes peering ignore the last_epoch_started value,
532 which can allow the OSD to believe another OSD has an authoritative view
533 of a PG's contents even when it is in fact old and stale, typically leading to
534 data loss (by believing a stale PG is up to date).
535 default: false
536 with_legacy: true
537 - name: osd_uuid
538 type: uuid
539 level: advanced
540 desc: uuid label for a new OSD
541 fmt_desc: The universally unique identifier (UUID) for the Ceph OSD Daemon.
542 note: The ``osd_uuid`` applies to a single Ceph OSD Daemon. The ``fsid``
543 applies to the entire cluster.
544 flags:
545 - create
546 with_legacy: true
547 - name: osd_data
548 type: str
549 level: advanced
550 desc: path to OSD data
551 fmt_desc: The path to the OSD's data. You must create the directory when
552 deploying Ceph. You should mount a drive for OSD data at this
553 mount point. We do not recommend changing the default.
554 default: /var/lib/ceph/osd/$cluster-$id
555 flags:
556 - no_mon_update
557 with_legacy: true
558 - name: osd_journal
559 type: str
560 level: advanced
561 desc: path to OSD journal (when FileStore backend is in use)
562 fmt_desc: The path to the OSD's journal. This may be a path to a file or a
563 block device (such as a partition of an SSD). If it is a file,
564 you must create the directory to contain it. We recommend using a
565 separate fast device when the ``osd_data`` drive is an HDD.
566 default: /var/lib/ceph/osd/$cluster-$id/journal
567 flags:
568 - no_mon_update
569 with_legacy: true
570 - name: osd_journal_size
571 type: size
572 level: advanced
573 desc: size of FileStore journal (in MiB)
574 fmt_desc: The size of the journal in MiB.
575 default: 5_K
576 flags:
577 - create
578 with_legacy: true
579 - name: osd_journal_flush_on_shutdown
580 type: bool
581 level: advanced
582 desc: flush FileStore journal contents during clean OSD shutdown
583 default: true
584 with_legacy: true
585 - name: osd_compact_on_start
586 type: bool
587 level: advanced
588 desc: compact the OMAP of the OSD's object store on start
589 default: false
590 # flags for specific control purpose during osd mount() process.
591 # e.g., can be 1 to skip over replaying journal
592 # or 2 to skip over mounting omap or 3 to skip over both.
593 # This might be helpful in case the journal is totally corrupted
594 # and we still want to bring the osd daemon back normally, etc.
595 - name: osd_os_flags
596 type: uint
597 level: dev
598 desc: flags to skip filestore omap or journal initialization
599 default: 0
600 - name: osd_max_write_size
601 type: size
602 level: advanced
603 desc: Maximum size of a RADOS write operation in megabytes
604 long_desc: This setting prevents clients from doing very large writes to RADOS. If
605 you set this to a value below what clients expect, they will receive an error
606 when attempting to write to the cluster.
607 fmt_desc: The maximum size of a write in megabytes.
608 default: 90
609 min: 4
610 with_legacy: true
611 - name: osd_max_pgls
612 type: uint
613 level: advanced
614 desc: maximum number of results when listing objects in a pool
615 fmt_desc: The maximum number of objects to return when listing objects in a pool. A client
616 requesting a large number can tie up the Ceph OSD Daemon.
617 default: 1_K
618 with_legacy: true
619 - name: osd_client_message_size_cap
620 type: size
621 level: advanced
622 desc: maximum memory to devote to in-flight client requests
623 long_desc: If this value is exceeded, the OSD will not read any new client data
624 off of the network until memory is freed.
625 fmt_desc: The largest client data message allowed in memory.
626 default: 500_M
627 with_legacy: true
628 - name: osd_client_message_cap
629 type: uint
630 level: advanced
631 desc: maximum number of in-flight client requests
632 default: 256
633 with_legacy: true
634 - name: osd_crush_update_on_start
635 type: bool
636 level: advanced
637 desc: update OSD CRUSH location on startup
638 default: true
639 with_legacy: true
640 - name: osd_class_update_on_start
641 type: bool
642 level: advanced
643 desc: set OSD device class on startup
644 default: true
645 with_legacy: true
646 - name: osd_crush_initial_weight
647 type: float
648 level: advanced
649 desc: if >= 0, initial CRUSH weight for newly created OSDs
650 long_desc: If this value is negative, the size of the OSD in TiB is used.
651 fmt_desc: The initial CRUSH weight for newly added OSDs. With the default
652 value of ``-1``, the initial CRUSH weight for a newly added OSD is set to
653 its device size in TiB.
654 See `Weighting Bucket Items`_ for details.
655 default: -1
656 with_legacy: true
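# Illustrative example (not part of the schema): adding new OSDs with zero CRUSH
# weight so that data migration can be triggered manually later; 0 is an arbitrary
# choice and the option is only consulted when an OSD is created.
#   ceph config set osd osd_crush_initial_weight 0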
657 # Allows the "peered" state for recovery and backfill below min_size
658 - name: osd_allow_recovery_below_min_size
659 type: bool
660 level: dev
661 desc: allow replicated pools to recover with < min_size active members
662 default: true
663 services:
664 - osd
665 with_legacy: true
666 # cap on # of inc maps we send to peers, clients
667 - name: osd_map_share_max_epochs
668 type: int
669 level: advanced
670 default: 40
671 with_legacy: true
672 - name: osd_map_cache_size
673 type: int
674 level: advanced
675 default: 50
676 fmt_desc: The number of OSD maps to keep cached.
677 with_legacy: true
678 - name: osd_pg_epoch_max_lag_factor
679 type: float
680 level: advanced
681 desc: Max multiple of the map cache that PGs can lag before we throttle map ingest
682 default: 2
683 see_also:
684 - osd_map_cache_size
685 - name: osd_inject_bad_map_crc_probability
686 type: float
687 level: dev
688 default: 0
689 with_legacy: true
690 - name: osd_inject_failure_on_pg_removal
691 type: bool
692 level: dev
693 default: false
694 with_legacy: true
695 # shut down the OSD if its status flips more than max_markdown_count times in the most recent max_markdown_period seconds
696 - name: osd_max_markdown_period
697 type: int
698 level: advanced
699 default: 10_min
700 with_legacy: true
701 - name: osd_max_markdown_count
702 type: int
703 level: advanced
704 default: 5
705 with_legacy: true
706 - name: osd_op_thread_timeout
707 type: int
708 level: advanced
709 default: 15
710 fmt_desc: The Ceph OSD Daemon operation thread timeout in seconds.
711 with_legacy: true
712 - name: osd_op_thread_suicide_timeout
713 type: int
714 level: advanced
715 default: 150
716 with_legacy: true
717 - name: osd_op_pq_max_tokens_per_priority
718 type: uint
719 level: advanced
720 default: 4_M
721 with_legacy: true
722 - name: osd_op_pq_min_cost
723 type: size
724 level: advanced
725 default: 64_K
726 with_legacy: true
727 # preserve clone_overlap during recovery/migration
728 - name: osd_recover_clone_overlap
729 type: bool
730 level: advanced
731 default: true
732 fmt_desc: Preserves clone overlap during recovery. Should always be set
733 to ``true``.
734 with_legacy: true
735 - name: osd_num_cache_shards
736 type: size
737 level: advanced
738 desc: The number of cache shards to use in the object store.
739 default: 32
740 flags:
741 - startup
742 - name: osd_aggregated_slow_ops_logging
743 type: bool
744 level: advanced
745 desc: Allow OSD daemon to send an aggregated slow ops to the cluster log
746 fmt_desc: If set to ``true``, the OSD daemon will send slow ops information in
747 an aggregated format to the cluster log; otherwise, it sends every slow op to
748 the cluster log.
749 default: true
750 with_legacy: true
751 - name: osd_op_num_threads_per_shard
752 type: int
753 level: advanced
754 default: 0
755 flags:
756 - startup
757 with_legacy: true
758 - name: osd_op_num_threads_per_shard_hdd
759 type: int
760 level: advanced
761 default: 1
762 see_also:
763 - osd_op_num_threads_per_shard
764 flags:
765 - startup
766 with_legacy: true
767 - name: osd_op_num_threads_per_shard_ssd
768 type: int
769 level: advanced
770 default: 2
771 see_also:
772 - osd_op_num_threads_per_shard
773 flags:
774 - startup
775 with_legacy: true
776 - name: osd_op_num_shards
777 type: int
778 level: advanced
779 fmt_desc: The number of shards allocated for a given OSD. Each shard has its own processing queue.
780 PGs on the OSD are distributed evenly among the shards. This setting overrides _ssd and _hdd if
781 non-zero.
782 default: 0
783 flags:
784 - startup
785 with_legacy: true
786 - name: osd_op_num_shards_hdd
787 type: int
788 level: advanced
789 fmt_desc: the number of shards allocated for a given OSD (for rotational media).
790 default: 5
791 see_also:
792 - osd_op_num_shards
793 flags:
794 - startup
795 with_legacy: true
796 - name: osd_op_num_shards_ssd
797 type: int
798 level: advanced
799 fmt_desc: the number of shards allocated for a given OSD (for solid state media).
800 default: 8
801 see_also:
802 - osd_op_num_shards
803 flags:
804 - startup
805 with_legacy: true
806 - name: osd_skip_data_digest
807 type: bool
808 level: dev
809 desc: Do not store full-object checksums if the backend (bluestore) does its own
810 checksums. Only usable with all BlueStore OSDs.
811 default: false
812 # PrioritizedQueue (prio), Weighted Priority Queue (wpq; default),
813 # mclock_opclass, mclock_client, or debug_random. "mclock_opclass"
814 # and "mclock_client" are based on the mClock/dmClock algorithm
815 # (Gulati, et al. 2010). "mclock_opclass" prioritizes based on the
816 # class the operation belongs to. "mclock_client" does the same but
817 # also works to enforce fairness between clients. "debug_random"
818 # chooses among all four with equal probability.
819 - name: osd_op_queue
820 type: str
821 level: advanced
822 desc: which operation priority queue algorithm to use
823 long_desc: which operation priority queue algorithm to use
824 fmt_desc: This sets the type of queue to be used for prioritizing ops
825 within each OSD. Both queues feature a strict sub-queue which is
826 dequeued before the normal queue. The normal queue is different
827 between implementations. The WeightedPriorityQueue (``wpq``)
828 dequeues operations in relation to their priorities to prevent
829 starvation of any queue. WPQ should help in cases where a few OSDs
830 are more overloaded than others. The mClockQueue
831 (``mclock_scheduler``) prioritizes operations based on which class
832 they belong to (recovery, scrub, snaptrim, client op, osd subop).
833 See `QoS Based on mClock`_. Requires a restart.
834 default: mclock_scheduler
835 see_also:
836 - osd_op_queue_cut_off
837 enum_values:
838 - wpq
839 - mclock_scheduler
840 - debug_random
841 with_legacy: true
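# Illustrative example (not part of the schema): switching the scheduler back to
# wpq; as the fmt_desc above notes, the option requires a restart, so the OSD
# daemons are assumed to be restarted afterwards (systemd or orchestrator,
# depending on the deployment).
#   ceph config set osd osd_op_queue wpq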
842 # Min priority to go to strict queue. (low, high)
843 - name: osd_op_queue_cut_off
844 type: str
845 level: advanced
846 desc: the threshold between high priority ops and low priority ops
847 long_desc: the threshold between high priority ops that use strict priority ordering
848 and low priority ops that use a fairness algorithm that may or may not incorporate
849 priority
850 fmt_desc: This selects which priority ops will be sent to the strict
851 queue versus the normal queue. The ``low`` setting sends all
852 replication ops and higher to the strict queue, while the ``high``
853 option sends only replication acknowledgment ops and higher to
854 the strict queue. Setting this to ``high`` should help when a few
855 OSDs in the cluster are very busy especially when combined with
856 ``wpq`` in the ``osd_op_queue`` setting. OSDs that are very busy
857 handling replication traffic could starve primary client traffic
858 on these OSDs without these settings. Requires a restart.
859 default: high
860 see_also:
861 - osd_op_queue
862 enum_values:
863 - low
864 - high
865 - debug_random
866 with_legacy: true
867 - name: osd_mclock_scheduler_client_res
868 type: uint
869 level: advanced
870 desc: IO proportion reserved for each client (default)
871 long_desc: Only considered for osd_op_queue = mclock_scheduler
872 fmt_desc: IO proportion reserved for each client (default).
873 default: 1
874 see_also:
875 - osd_op_queue
876 - name: osd_mclock_scheduler_client_wgt
877 type: uint
878 level: advanced
879 desc: IO share for each client (default) over reservation
880 long_desc: Only considered for osd_op_queue = mclock_scheduler
881 fmt_desc: IO share for each client (default) over reservation.
882 default: 1
883 see_also:
884 - osd_op_queue
885 - name: osd_mclock_scheduler_client_lim
886 type: uint
887 level: advanced
888 desc: IO limit for each client (default) over reservation
889 long_desc: Only considered for osd_op_queue = mclock_scheduler
890 fmt_desc: IO limit for each client (default) over reservation.
891 default: 999999
892 see_also:
893 - osd_op_queue
894 - name: osd_mclock_scheduler_background_recovery_res
895 type: uint
896 level: advanced
897 desc: IO proportion reserved for background recovery (default)
898 long_desc: Only considered for osd_op_queue = mclock_scheduler
899 fmt_desc: IO proportion reserved for background recovery (default).
900 default: 1
901 see_also:
902 - osd_op_queue
903 - name: osd_mclock_scheduler_background_recovery_wgt
904 type: uint
905 level: advanced
906 desc: IO share for each background recovery over reservation
907 long_desc: Only considered for osd_op_queue = mclock_scheduler
908 fmt_desc: IO share for each background recovery over reservation.
909 default: 1
910 see_also:
911 - osd_op_queue
912 - name: osd_mclock_scheduler_background_recovery_lim
913 type: uint
914 level: advanced
915 desc: IO limit for background recovery over reservation
916 long_desc: Only considered for osd_op_queue = mclock_scheduler
917 fmt_desc: IO limit for background recovery over reservation.
918 default: 999999
919 see_also:
920 - osd_op_queue
921 - name: osd_mclock_scheduler_background_best_effort_res
922 type: uint
923 level: advanced
924 desc: IO proportion reserved for background best_effort (default)
925 long_desc: Only considered for osd_op_queue = mclock_scheduler
926 fmt_desc: IO proportion reserved for background best_effort (default).
927 default: 1
928 see_also:
929 - osd_op_queue
930 - name: osd_mclock_scheduler_background_best_effort_wgt
931 type: uint
932 level: advanced
933 desc: IO share for each background best_effort over reservation
934 long_desc: Only considered for osd_op_queue = mclock_scheduler
935 fmt_desc: IO share for each background best_effort over reservation.
936 default: 1
937 see_also:
938 - osd_op_queue
939 - name: osd_mclock_scheduler_background_best_effort_lim
940 type: uint
941 level: advanced
942 desc: IO limit for background best_effort over reservation
943 long_desc: Only considered for osd_op_queue = mclock_scheduler
944 fmt_desc: IO limit for background best_effort over reservation.
945 default: 999999
946 see_also:
947 - osd_op_queue
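# Illustrative example (not part of the schema): the osd_mclock_scheduler_*
# options above are only honored with osd_op_queue = mclock_scheduler, and per
# the profile description further below they are normally only set directly when
# osd_mclock_profile = custom; the numbers are arbitrary.
#   ceph config set osd osd_mclock_profile custom
#   ceph config set osd osd_mclock_scheduler_client_res 2
#   ceph config set osd osd_mclock_scheduler_client_wgt 4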
948 - name: osd_mclock_scheduler_anticipation_timeout
949 type: float
950 level: advanced
951 desc: mclock anticipation timeout in seconds
952 long_desc: the amount of time that mclock waits until the unused resource is forfeited
953 default: 0
954 - name: osd_mclock_cost_per_io_usec
955 type: float
956 level: dev
957 desc: Cost per IO in microseconds to consider per OSD (overrides _ssd and _hdd if
958 non-zero)
959 long_desc: This option specifies the cost factor to consider in usec per OSD. This
960 is considered by the mclock scheduler to set an additional cost factor in QoS
961 calculations. Only considered for osd_op_queue = mclock_scheduler
962 fmt_desc: Cost per IO in microseconds to consider per OSD (overrides _ssd
963 and _hdd if non-zero)
964 default: 0
965 flags:
966 - runtime
967 - name: osd_mclock_cost_per_io_usec_hdd
968 type: float
969 level: dev
970 desc: Cost per IO in microseconds to consider per OSD (for rotational media)
971 long_desc: This option specifies the cost factor to consider in usec per OSD for
972 rotational device type. This is considered by the mclock_scheduler to set an additional
973 cost factor in QoS calculations. Only considered for osd_op_queue = mclock_scheduler
974 fmt_desc: Cost per IO in microseconds to consider per OSD (for rotational
975 media)
976 default: 25000
977 flags:
978 - runtime
979 - name: osd_mclock_cost_per_io_usec_ssd
980 type: float
981 level: dev
982 desc: Cost per IO in microseconds to consider per OSD (for solid state media)
983 long_desc: This option specifies the cost factor to consider in usec per OSD for
984 solid state device type. This is considered by the mclock_scheduler to set an
985 additional cost factor in QoS calculations. Only considered for osd_op_queue =
986 mclock_scheduler
987 fmt_desc: Cost per IO in microseconds to consider per OSD (for solid state
988 media)
989 default: 50
990 flags:
991 - runtime
992 - name: osd_mclock_cost_per_byte_usec
993 type: float
994 level: dev
995 desc: Cost per byte in microseconds to consider per OSD (overrides _ssd and _hdd
996 if non-zero)
997 long_desc: This option specifies the cost per byte to consider in microseconds per
998 OSD. This is considered by the mclock scheduler to set an additional cost factor
999 in QoS calculations. Only considered for osd_op_queue = mclock_scheduler
1000 fmt_desc: Cost per byte in microseconds to consider per OSD (overrides _ssd
1001 and _hdd if non-zero)
1002 default: 0
1003 flags:
1004 - runtime
1005 - name: osd_mclock_cost_per_byte_usec_hdd
1006 type: float
1007 level: dev
1008 desc: Cost per byte in microseconds to consider per OSD (for rotational media)
1009 long_desc: This option specifies the cost per byte to consider in microseconds per
1010 OSD for rotational device type. This is considered by the mclock_scheduler to
1011 set an additional cost factor in QoS calculations. Only considered for osd_op_queue
1012 = mclock_scheduler
1013 fmt_desc: Cost per byte in microseconds to consider per OSD (for rotational
1014 media)
1015 default: 5.2
1016 flags:
1017 - runtime
1018 - name: osd_mclock_cost_per_byte_usec_ssd
1019 type: float
1020 level: dev
1021 desc: Cost per byte in microseconds to consider per OSD (for solid state media)
1022 long_desc: This option specifies the cost per byte to consider in microseconds per
1023 OSD for solid state device type. This is considered by the mclock_scheduler to
1024 set an additional cost factor in QoS calculations. Only considered for osd_op_queue
1025 = mclock_scheduler
1026 fmt_desc: Cost per byte in microseconds to consider per OSD (for solid state
1027 media)
1028 default: 0.011
1029 flags:
1030 - runtime
1031 - name: osd_mclock_max_capacity_iops_hdd
1032 type: float
1033 level: basic
1034 desc: Max IOPs capacity (at 4KiB block size) to consider per OSD (for rotational
1035 media)
1036 long_desc: This option specifies the max OSD capacity in iops per OSD. Helps in
1037 QoS calculations when enabling a dmclock profile. Only considered for osd_op_queue
1038 = mclock_scheduler
1039 fmt_desc: Max IOPS capacity (at 4KiB block size) to consider per OSD (for
1040 rotational media)
1041 default: 315
1042 flags:
1043 - runtime
1044 - name: osd_mclock_max_capacity_iops_ssd
1045 type: float
1046 level: basic
1047 desc: Max IOPs capacity (at 4KiB block size) to consider per OSD (for solid state
1048 media)
1049 long_desc: This option specifies the max OSD capacity in iops per OSD. Helps in
1050 QoS calculations when enabling a dmclock profile. Only considered for osd_op_queue
1051 = mclock_scheduler
1052 fmt_desc: Max IOPS capacity (at 4KiB block size) to consider per OSD (for
1053 solid state media)
1054 default: 21500
1055 flags:
1056 - runtime
1057 - name: osd_mclock_force_run_benchmark_on_init
1058 type: bool
1059 level: advanced
1060 desc: Force run the OSD benchmark on OSD initialization/boot-up
1061 long_desc: This option specifies whether the OSD benchmark must be run during
1062 the OSD boot-up sequence even if historical data about the OSD iops capacity
1063 is available in the MON config store. Enable this to refresh the OSD iops
1064 capacity if the underlying device's performance characteristics have changed
1065 significantly. Only considered for osd_op_queue = mclock_scheduler.
1066 fmt_desc: Force run the OSD benchmark on OSD initialization/boot-up
1067 default: false
1068 see_also:
1069 - osd_mclock_max_capacity_iops_hdd
1070 - osd_mclock_max_capacity_iops_ssd
1071 flags:
1072 - startup
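# Illustrative example (not part of the schema): checking the IOPS capacity an
# OSD currently uses for mclock and forcing a fresh benchmark on the next boot;
# osd.0 is an arbitrary daemon id.
#   ceph config show osd.0 osd_mclock_max_capacity_iops_ssd
#   ceph config set osd osd_mclock_force_run_benchmark_on_init true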
1073 - name: osd_mclock_skip_benchmark
1074 type: bool
1075 level: dev
1076 desc: Skip the OSD benchmark on OSD initialization/boot-up
1077 long_desc: This option specifies whether the OSD benchmark must be skipped during
1078 the OSD boot-up sequence. Only considered for osd_op_queue = mclock_scheduler.
1079 fmt_desc: Skip the OSD benchmark on OSD initialization/boot-up
1080 default: false
1081 see_also:
1082 - osd_mclock_max_capacity_iops_hdd
1083 - osd_mclock_max_capacity_iops_ssd
1084 flags:
1085 - runtime
1086 - name: osd_mclock_profile
1087 type: str
1088 level: advanced
1089 desc: Which mclock profile to use
1090 long_desc: This option specifies the mclock profile to enable - one among the set
1091 of built-in profiles or a custom profile. Only considered for osd_op_queue = mclock_scheduler
1092 fmt_desc: |
1093 This sets the type of mclock profile to use for providing QoS
1094 based on operations belonging to different classes (background
1095 recovery, scrub, snaptrim, client op, osd subop). Once a built-in
1096 profile is enabled, the lower level mclock resource control
1097 parameters [*reservation, weight, limit*] and some Ceph
1098 configuration parameters are set transparently. Note that the
1099 above does not apply for the *custom* profile.
1100 default: high_client_ops
1101 see_also:
1102 - osd_op_queue
1103 enum_values:
1104 - balanced
1105 - high_recovery_ops
1106 - high_client_ops
1107 - custom
1108 flags:
1109 - runtime
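# Illustrative example (not part of the schema): temporarily favoring recovery
# over client traffic, then returning to the default profile; both values are
# built-in profiles from the enum above.
#   ceph config set osd osd_mclock_profile high_recovery_ops
#   ceph config set osd osd_mclock_profile high_client_ops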
1110 # Set to true for testing. Users should NOT set this.
1111 # If set to true even after reading enough shards to
1112 # decode the object, any error will be reported.
1113 - name: osd_read_ec_check_for_errors
1114 type: bool
1115 level: advanced
1116 default: false
1117 with_legacy: true
1118 - name: osd_recovery_delay_start
1119 type: float
1120 level: advanced
1121 default: 0
1122 fmt_desc: After peering completes, Ceph will delay for the specified number
1123 of seconds before starting to recover RADOS objects.
1124 with_legacy: true
1125 - name: osd_recovery_max_active
1126 type: uint
1127 level: advanced
1128 desc: Number of simultaneous active recovery operations per OSD (overrides _ssd
1129 and _hdd if non-zero)
1130 fmt_desc: The number of active recovery requests per OSD at one time. More
1131 requests will accelerate recovery, but the requests place an
1132 increased load on the cluster.
1133 note: This value is only used if it is non-zero. Normally it
1134 is ``0``, which means that the ``hdd`` or ``ssd`` values
1135 (below) are used, depending on the type of the primary
1136 device backing the OSD.
1137 default: 0
1138 see_also:
1139 - osd_recovery_max_active_hdd
1140 - osd_recovery_max_active_ssd
1141 flags:
1142 - runtime
1143 with_legacy: true
1144 - name: osd_recovery_max_active_hdd
1145 type: uint
1146 level: advanced
1147 desc: Number of simultaneous active recovery operations per OSD (for rotational
1148 devices)
1149 fmt_desc: The number of active recovery requests per OSD at one time, if the
1150 primary device is rotational.
1151 default: 3
1152 see_also:
1153 - osd_recovery_max_active
1154 - osd_recovery_max_active_ssd
1155 flags:
1156 - runtime
1157 with_legacy: true
1158 - name: osd_recovery_max_active_ssd
1159 type: uint
1160 level: advanced
1161 desc: Number of simultaneous active recovery operations per OSD (for non-rotational
1162 solid state devices)
1163 fmt_desc: The number of active recovery requests per OSD at one time, if the
1164 primary device is non-rotational (i.e., an SSD).
1165 default: 10
1166 see_also:
1167 - osd_recovery_max_active
1168 - osd_recovery_max_active_hdd
1169 flags:
1170 - runtime
1171 with_legacy: true
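# Illustrative example (not part of the schema): because osd_recovery_max_active
# defaults to 0, the _hdd/_ssd variants apply; raising the HDD value speeds up
# recovery on rotational OSDs at the cost of client I/O. The value 5 is arbitrary.
#   ceph config set osd osd_recovery_max_active_hdd 5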
1172 - name: osd_recovery_max_single_start
1173 type: uint
1174 level: advanced
1175 default: 1
1176 fmt_desc: The maximum number of recovery operations per OSD that will be
1177 newly started when an OSD is recovering.
1178 with_legacy: true
1179 # max size of push chunk
1180 - name: osd_recovery_max_chunk
1181 type: size
1182 level: advanced
1183 default: 8_M
1184 fmt_desc: the maximum total size of data chunks a recovery op can carry.
1185 with_legacy: true
1186 # max number of omap entries per chunk; 0 to disable limit
1187 - name: osd_recovery_max_omap_entries_per_chunk
1188 type: uint
1189 level: advanced
1190 default: 8096
1191 with_legacy: true
1192 # max size of a COPYFROM chunk
1193 - name: osd_copyfrom_max_chunk
1194 type: size
1195 level: advanced
1196 default: 8_M
1197 with_legacy: true
1198 # push cost per object
1199 - name: osd_push_per_object_cost
1200 type: size
1201 level: advanced
1202 default: 1000
1203 fmt_desc: the overhead for serving a push op
1204 with_legacy: true
1205 # max size of push message
1206 - name: osd_max_push_cost
1207 type: size
1208 level: advanced
1209 default: 8_M
1210 with_legacy: true
1211 # max objects in single push op
1212 - name: osd_max_push_objects
1213 type: uint
1214 level: advanced
1215 default: 10
1216 with_legacy: true
1217 # Only use clone_overlap for recovery if there are fewer than
1218 # osd_recover_clone_overlap_limit entries in the overlap set
1219 - name: osd_recover_clone_overlap_limit
1220 type: uint
1221 level: advanced
1222 default: 10
1223 flags:
1224 - runtime
1225 - name: osd_debug_feed_pullee
1226 type: int
1227 level: dev
1228 desc: Feed a pullee, and force primary to pull a currently missing object from it
1229 default: -1
1230 with_legacy: true
1231 - name: osd_backfill_scan_min
1232 type: int
1233 level: advanced
1234 default: 64
1235 fmt_desc: The minimum number of objects per backfill scan.
1236 with_legacy: true
1237 - name: osd_backfill_scan_max
1238 type: int
1239 level: advanced
1240 default: 512
1241 fmt_desc: The maximum number of objects per backfill scan.
1242 with_legacy: true
1243 # minimum number of peers
1244 - name: osd_heartbeat_min_peers
1245 type: int
1246 level: advanced
1247 default: 10
1248 with_legacy: true
1249 - name: osd_delete_sleep
1250 type: float
1251 level: advanced
1252 desc: Time in seconds to sleep before next removal transaction (overrides values
1253 below)
1254 fmt_desc: Time in seconds to sleep before the next removal transaction. This
1255 throttles the PG deletion process.
1256 default: 0
1257 flags:
1258 - runtime
1259 - name: osd_delete_sleep_hdd
1260 type: float
1261 level: advanced
1262 desc: Time in seconds to sleep before next removal transaction for HDDs
1263 default: 5
1264 flags:
1265 - runtime
1266 - name: osd_delete_sleep_ssd
1267 type: float
1268 level: advanced
1269 desc: Time in seconds to sleep before next removal transaction for SSDs
1270 default: 1
1271 flags:
1272 - runtime
1273 - name: osd_delete_sleep_hybrid
1274 type: float
1275 level: advanced
1276 desc: Time in seconds to sleep before next removal transaction when OSD data is on HDD
1277 and OSD journal or WAL+DB is on SSD
1278 default: 1
1279 flags:
1280 - runtime
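# Illustrative example (not part of the schema): throttling PG deletion on HDD
# OSDs when removal traffic competes with client I/O; 10 seconds is an arbitrary
# value.
#   ceph config set osd osd_delete_sleep_hdd 10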