# -*- mode: YAML -*-
---

options:
- name: osd_numa_prefer_iface
  type: bool
  level: advanced
  desc: prefer IP on network interface on same numa node as storage
  default: true
  see_also:
  - osd_numa_auto_affinity
  flags:
  - startup
- name: osd_numa_auto_affinity
  type: bool
  level: advanced
  desc: automatically set affinity to numa node when storage and network match
  default: true
  flags:
  - startup
- name: osd_numa_node
  type: int
  level: advanced
  desc: set affinity to a numa node (-1 for none)
  default: -1
  see_also:
  - osd_numa_auto_affinity
  flags:
  - startup
- name: set_keepcaps
  type: bool
  level: advanced
  desc: set the keepcaps flag before changing UID, preserving the permitted capability set
  long_desc: When ceph switches from root to the ceph uid, all capabilities in all sets are eraseed. If
    a component that is capability aware needs a specific capability, the keepcaps flag maintains
     the permitted capability set, allowing the capabilities in the effective set to be activated as needed.
  default: false
  flags:
  - startup
- name: osd_smart_report_timeout
  type: uint
  level: advanced
  desc: Timeout (in seconds) for smartctl to run, default is set to 5
  default: 5
# verify backend can support configured max object name length
- name: osd_check_max_object_name_len_on_startup
  type: bool
  level: dev
  default: true
  with_legacy: true
- name: osd_max_backfills
  type: uint
  level: advanced
  desc: Maximum number of concurrent local and remote backfills or recoveries per
    OSD
  long_desc: There can be osd_max_backfills local reservations AND the same remote
    reservations per OSD. So a value of 1 lets this OSD participate as 1 PG primary
    in recovery and 1 shard of another recovering PG.
  fmt_desc: The maximum number of backfills allowed to or from a single OSD.
    Note that this is applied separately for read and write operations.
  default: 1
  flags:
  - runtime
  with_legacy: true
# Minimum recovery priority (255 = max, smaller = lower)
- name: osd_min_recovery_priority
  type: int
  level: advanced
  desc: Minimum priority below which recovery is not performed
  long_desc: The purpose here is to prevent the cluster from doing *any* lower priority
    work (e.g., rebalancing) below this threshold and focus solely on higher priority
    work (e.g., replicating degraded objects).
  default: 0
  with_legacy: true
- name: osd_backfill_retry_interval
  type: float
  level: advanced
  desc: how frequently to retry backfill reservations after being denied (e.g., due
    to a full OSD)
  fmt_desc: The number of seconds to wait before retrying backfill requests.
  default: 30
  with_legacy: true
- name: osd_recovery_retry_interval
  type: float
  level: advanced
  desc: how frequently to retry recovery reservations after being denied (e.g., due
    to a full OSD)
  default: 30
  with_legacy: true
- name: osd_recovery_sleep
  type: float
  level: advanced
  desc: Time in seconds to sleep before next recovery or backfill op. This setting
    overrides _ssd, _hdd, and _hybrid if non-zero.
  fmt_desc: Time in seconds to sleep before the next recovery or backfill op.
    Increasing this value will slow down recovery operation while
    client operations will be less impacted.
  default: 0
  flags:
  - runtime
  with_legacy: true
- name: osd_recovery_sleep_hdd
  type: float
  level: advanced
  desc: Time in seconds to sleep before next recovery or backfill op for HDDs
  fmt_desc: Time in seconds to sleep before next recovery or backfill op
    for HDDs.
  default: 0.1
  flags:
  - runtime
  with_legacy: true
- name: osd_recovery_sleep_ssd
  type: float
  level: advanced
  desc: Time in seconds to sleep before next recovery or backfill op for SSDs
  fmt_desc: Time in seconds to sleep before the next recovery or backfill op
    for SSDs.
  default: 0
  see_also:
  - osd_recovery_sleep
  flags:
  - runtime
  with_legacy: true
- name: osd_recovery_sleep_hybrid
  type: float
  level: advanced
  desc: Time in seconds to sleep before next recovery or backfill op when data is
    on HDD and journal is on SSD
  fmt_desc: Time in seconds to sleep before the next recovery or backfill op
    when OSD data is on HDD and OSD journal / WAL+DB is on SSD.
  default: 0.025
  see_also:
  - osd_recovery_sleep
  flags:
  - runtime
- name: osd_snap_trim_sleep
  type: float
  level: advanced
  desc: Time in seconds to sleep before next snap trim. This setting overrides _ssd,
    _hdd, and _hybrid if non-zero.
  fmt_desc: Time in seconds to sleep before next snap trim op.
    Increasing this value will slow down snap trimming.
    This option overrides backend specific variants.
  default: 0
  flags:
  - runtime
  with_legacy: true
- name: osd_snap_trim_sleep_hdd
  type: float
  level: advanced
  desc: Time in seconds to sleep before next snap trim for HDDs
  default: 5
  flags:
  - runtime
- name: osd_snap_trim_sleep_ssd
  type: float
  level: advanced
  desc: Time in seconds to sleep before next snap trim for SSDs
  fmt_desc: Time in seconds to sleep before next snap trim op
    for SSD OSDs (including NVMe).
  default: 0
  flags:
  - runtime
- name: osd_snap_trim_sleep_hybrid
  type: float
  level: advanced
  desc: Time in seconds to sleep before next snap trim when data is on HDD and journal
    is on SSD
  fmt_desc: Time in seconds to sleep before next snap trim op
    when OSD data is on an HDD and the OSD journal or WAL+DB is on an SSD.
  default: 2
  flags:
  - runtime
- name: osd_scrub_invalid_stats
  type: bool
  level: advanced
  default: true
  with_legacy: true
- name: osd_max_scrubs
  type: int
  level: advanced
  desc: Maximum concurrent scrubs on a single OSD
  fmt_desc: The maximum number of simultaneous scrub operations for
    a Ceph OSD Daemon.
  default: 1
  with_legacy: true
- name: osd_scrub_during_recovery
  type: bool
  level: advanced
  desc: Allow scrubbing when PGs on the OSD are undergoing recovery
  fmt_desc: Allow scrub during recovery. Setting this to ``false`` will disable
    scheduling new scrub (and deep--scrub) while there is active recovery.
    Already running scrubs will be continued. This might be useful to reduce
    load on busy clusters.
  default: false
  with_legacy: true
- name: osd_repair_during_recovery
  type: bool
  level: advanced
  desc: Allow requested repairing when PGs on the OSD are undergoing recovery
  default: false
  with_legacy: true
- name: osd_scrub_begin_hour
  type: int
  level: advanced
  desc: Restrict scrubbing to this hour of the day or later
  long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day.
  fmt_desc: This restricts scrubbing to this hour of the day or later.
    Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0``
    to allow scrubbing the entire day.  Along with ``osd_scrub_end_hour``, they define a time
    window, in which the scrubs can happen.
    But a scrub will be performed
    no matter whether the time window allows or not, as long as the placement
    group's scrub interval exceeds ``osd_scrub_max_interval``.
  default: 0
  see_also:
  - osd_scrub_end_hour
  min: 0
  max: 23
  with_legacy: true
- name: osd_scrub_end_hour
  type: int
  level: advanced
  desc: Restrict scrubbing to hours of the day earlier than this
  long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day.
  fmt_desc: This restricts scrubbing to the hour earlier than this.
    Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0`` to allow scrubbing
    for the entire day.  Along with ``osd_scrub_begin_hour``, they define a time
    window, in which the scrubs can happen. But a scrub will be performed
    no matter whether the time window allows or not, as long as the placement
    group's scrub interval exceeds ``osd_scrub_max_interval``.
  default: 0
  see_also:
  - osd_scrub_begin_hour
  min: 0
  max: 23
  with_legacy: true
- name: osd_scrub_begin_week_day
  type: int
  level: advanced
  desc: Restrict scrubbing to this day of the week or later
  long_desc: 0 = Sunday, 1 = Monday, etc. Use osd_scrub_begin_week_day=0 osd_scrub_end_week_day=0
    for the entire week.
  fmt_desc: This restricts scrubbing to this day of the week or later.
    0  = Sunday, 1 = Monday, etc. Use ``osd_scrub_begin_week_day = 0``
    and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week.
    Along with ``osd_scrub_end_week_day``, they define a time window in which
    scrubs can happen. But a scrub will be performed
    no matter whether the time window allows or not, when the PG's
    scrub interval exceeds ``osd_scrub_max_interval``.
  default: 0
  see_also:
  - osd_scrub_end_week_day
  min: 0
  max: 6
  with_legacy: true
- name: osd_scrub_end_week_day
  type: int
  level: advanced
  desc: Restrict scrubbing to days of the week earlier than this
  long_desc: 0 = Sunday, 1 = Monday, etc. Use osd_scrub_begin_week_day=0 osd_scrub_end_week_day=0
    for the entire week.
  fmt_desc: This restricts scrubbing to days of the week earlier than this.
    0 = Sunday, 1 = Monday, etc.  Use ``osd_scrub_begin_week_day = 0``
    and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week.
    Along with ``osd_scrub_begin_week_day``, they define a time
    window, in which the scrubs can happen. But a scrub will be performed
    no matter whether the time window allows or not, as long as the placement
    group's scrub interval exceeds ``osd_scrub_max_interval``.
  default: 0
  see_also:
  - osd_scrub_begin_week_day
  min: 0
  max: 6
  with_legacy: true
- name: osd_scrub_load_threshold
  type: float
  level: advanced
  desc: Allow scrubbing when system load divided by number of CPUs is below this value
  fmt_desc: The normalized maximum load. Ceph will not scrub when the system load
    (as defined by ``getloadavg() / number of online CPUs``) is higher than this number.
    Default is ``0.5``.
  default: 0.5
  with_legacy: true
# if load is low
- name: osd_scrub_min_interval
  type: float
  level: advanced
  desc: Scrub each PG no more often than this interval
  fmt_desc: The minimal interval in seconds for scrubbing the Ceph OSD Daemon
    when the Ceph Storage Cluster load is low.
  default: 1_day
  see_also:
  - osd_scrub_max_interval
  with_legacy: true
# regardless of load
- name: osd_scrub_max_interval
  type: float
  level: advanced
  desc: Scrub each PG no less often than this interval
  fmt_desc: The maximum interval in seconds for scrubbing the Ceph OSD Daemon
    irrespective of cluster load.
  default: 7_day
  see_also:
  - osd_scrub_min_interval
  with_legacy: true
# randomize the scheduled scrub in the span of [min,min*(1+randomize_ratio))
- name: osd_scrub_interval_randomize_ratio
  type: float
  level: advanced
  desc: Ratio of scrub interval to randomly vary
  long_desc: This prevents a scrub 'stampede' by randomly varying the scrub intervals
    so that they are soon uniformly distributed over the week
  fmt_desc: Add a random delay to ``osd_scrub_min_interval`` when scheduling
    the next scrub job for a PG. The delay is a random
    value less than ``osd_scrub_min_interval`` \*
    ``osd_scrub_interval_randomized_ratio``. The default setting
    spreads scrubs throughout the allowed time
    window of ``[1, 1.5]`` \* ``osd_scrub_min_interval``.
  default: 0.5
  see_also:
  - osd_scrub_min_interval
  with_legacy: true
# the probability to back off the scheduled scrub
- name: osd_scrub_backoff_ratio
  type: float
  level: dev
  desc: Backoff ratio for scheduling scrubs
  long_desc: This is the precentage of ticks that do NOT schedule scrubs, 66% means
    that 1 out of 3 ticks will schedule scrubs
  default: 0.66
  with_legacy: true
- name: osd_scrub_chunk_min
  type: int
  level: advanced
  desc: Minimum number of objects to deep-scrub in a single chunk
  fmt_desc: The minimal number of object store chunks to scrub during single operation.
    Ceph blocks writes to single chunk during scrub.
  default: 5
  see_also:
  - osd_scrub_chunk_max
  with_legacy: true
- name: osd_scrub_chunk_max
  type: int
  level: advanced
  desc: Maximum number of objects to deep-scrub in a single chunk
  fmt_desc: The maximum number of object store chunks to scrub during single operation.
  default: 25
  see_also:
  - osd_scrub_chunk_min
  with_legacy: true
- name: osd_shallow_scrub_chunk_min
  type: int
  level: advanced
  desc: Minimum number of objects to scrub in a single chunk
  fmt_desc: The minimum number of object store chunks to scrub during single operation.
    Not applicable to deep scrubs.
    Ceph blocks writes to single chunk during scrub.
  default: 50
  see_also:
  - osd_shallow_scrub_chunk_max
  - osd_scrub_chunk_min
  with_legacy: true
- name: osd_shallow_scrub_chunk_max
  type: int
  level: advanced
  desc: Maximum number of objects to scrub in a single chunk
  fmt_desc: The maximum number of object store chunks to scrub during single operation.
    Not applicable to deep scrubs.
  default: 100
  see_also:
  - osd_shallow_scrub_chunk_min
  - osd_scrub_chunk_max
  with_legacy: true
# sleep between [deep]scrub ops
- name: osd_scrub_sleep
  type: float
  level: advanced
  desc: Duration to inject a delay during scrubbing
  fmt_desc: Time to sleep before scrubbing the next group of chunks. Increasing this value will slow
    down the overall rate of scrubbing so that client operations will be less impacted.
  default: 0
  flags:
  - runtime
  with_legacy: true
# more sleep between [deep]scrub ops
- name: osd_scrub_extended_sleep
  type: float
  level: advanced
  desc: Duration to inject a delay during scrubbing out of scrubbing hours
  default: 0
  see_also:
  - osd_scrub_begin_hour
  - osd_scrub_end_hour
  - osd_scrub_begin_week_day
  - osd_scrub_end_week_day
  with_legacy: true
# whether auto-repair inconsistencies upon deep-scrubbing
- name: osd_scrub_auto_repair
  type: bool
  level: advanced
  desc: Automatically repair damaged objects detected during scrub
  fmt_desc: Setting this to ``true`` will enable automatic PG repair when errors
    are found by scrubs or deep-scrubs.  However, if more than
    ``osd_scrub_auto_repair_num_errors`` errors are found a repair is NOT performed.
  default: false
  with_legacy: true
# only auto-repair when number of errors is below this threshold
- name: osd_scrub_auto_repair_num_errors
  type: uint
  level: advanced
  desc: Maximum number of detected errors to automatically repair
  fmt_desc: Auto repair will not occur if more than this many errors are found.
  default: 5
  see_also:
  - osd_scrub_auto_repair
  with_legacy: true
- name: osd_scrub_max_preemptions
  type: uint
  level: advanced
  desc: Set the maximum number of times we will preempt a deep scrub due to a client
    operation before blocking client IO to complete the scrub
  default: 5
  min: 0
  max: 30
- name: osd_deep_scrub_interval
  type: float
  level: advanced
  desc: Deep scrub each PG (i.e., verify data checksums) at least this often
  fmt_desc: The interval for "deep" scrubbing (fully reading all data). The
    ``osd_scrub_load_threshold`` does not affect this setting.
  default: 7_day
  with_legacy: true
- name: osd_deep_scrub_randomize_ratio
  type: float
  level: advanced
  desc: Scrubs will randomly become deep scrubs at this rate (0.15 -> 15% of scrubs
    are deep)
  long_desc: This prevents a deep scrub 'stampede' by spreading deep scrubs so they
    are uniformly distributed over the week
  default: 0.15
  with_legacy: true
- name: osd_deep_scrub_stride
  type: size
  level: advanced
  desc: Number of bytes to read from an object at a time during deep scrub
  fmt_desc: Read size when doing a deep scrub.
  default: 512_K
  with_legacy: true
- name: osd_deep_scrub_keys
  type: int
  level: advanced
  desc: Number of keys to read from an object at a time during deep scrub
  default: 1024
  with_legacy: true
# objects must be this old (seconds) before we update the whole-object digest on scrub
- name: osd_deep_scrub_update_digest_min_age
  type: int
  level: advanced
  desc: Update overall object digest only if object was last modified longer ago than
    this
  default: 2_hr
  with_legacy: true
- name: osd_deep_scrub_large_omap_object_key_threshold
  type: uint
  level: advanced
  desc: Warn when we encounter an object with more omap keys than this
  default: 200000
  services:
  - osd
  - mds
  see_also:
  - osd_deep_scrub_large_omap_object_value_sum_threshold
  with_legacy: true
- name: osd_deep_scrub_large_omap_object_value_sum_threshold
  type: size
  level: advanced
  desc: Warn when we encounter an object with more omap key bytes than this
  default: 1_G
  services:
  - osd
  see_also:
  - osd_deep_scrub_large_omap_object_key_threshold
  with_legacy: true
# when scrubbing blocks on a locked object
- name: osd_blocked_scrub_grace_period
  type: int
  level: advanced
  desc: Time (seconds) before issuing a cluster-log warning
  long_desc: Waiting too long for an object in the scrubbed chunk to be unlocked.
  default: 120
  with_legacy: true
# timely updates to the 'pg dump' output, esp. re scrub scheduling
- name: osd_stats_update_period_scrubbing
  type: int
  level: advanced
  desc: Stats update period (seconds) when scrubbing
  long_desc: A PG actively scrubbing (or blocked while scrubbing) publishes its
    stats (inc. scrub/block duration) every this many seconds.
  default: 15
  with_legacy: false
- name: osd_stats_update_period_not_scrubbing
  type: int
  level: advanced
  desc: Stats update period (seconds) when not scrubbing
  long_desc: A PG we are a primary of, publishes its
    stats (inc. scrub/block duration) every this many seconds.
  default: 120
  with_legacy: false
# when replicas are slow to respond to scrub resource reservations
# Note: disable by using a very large value
- name: osd_scrub_slow_reservation_response
  type: millisecs
  level: advanced
  desc: Duration before issuing a cluster-log warning
  long_desc: Waiting too long for a replica to respond (after at least half of the
    replicas have responded).
  default: 2200
  min: 500
  see_also:
  - osd_scrub_reservation_timeout
  with_legacy: false
# when a replica does not respond to scrub resource request
# Note: disable by using a very large value
- name: osd_scrub_reservation_timeout
  type: millisecs
  level: advanced
  desc: Duration before aborting the scrub session
  long_desc: Waiting too long for some replicas to respond to
    scrub reservation requests.
  default: 5000
  min: 2000
  see_also:
  - osd_scrub_slow_reservation_response
  with_legacy: false
# where rados plugins are stored
- name: osd_class_dir
  type: str
  level: advanced
  default: @CMAKE_INSTALL_LIBDIR@/rados-classes
  fmt_desc: The class path for RADOS class plug-ins.
  with_legacy: true
- name: osd_open_classes_on_start
  type: bool
  level: advanced
  default: true
  with_legacy: true
# list of object classes allowed to be loaded (allow all: *)
- name: osd_class_load_list
  type: str
  level: advanced
  default: cephfs hello journal lock log numops otp rbd refcount rgw rgw_gc timeindex
    user version cas cmpomap queue 2pc_queue fifo
  with_legacy: true
# list of object classes with default execute perm (allow all: *)
- name: osd_class_default_list
  type: str
  level: advanced
  default: cephfs hello journal lock log numops otp rbd refcount rgw rgw_gc timeindex
    user version cas cmpomap queue 2pc_queue fifo
  with_legacy: true
- name: osd_agent_max_ops
  type: int
  level: advanced
  desc: maximum concurrent tiering operations for tiering agent
  fmt_desc: The maximum number of simultaneous flushing ops per tiering agent
    in the high speed mode.
  default: 4
  with_legacy: true
- name: osd_agent_max_low_ops
  type: int
  level: advanced
  desc: maximum concurrent low-priority tiering operations for tiering agent
  fmt_desc: The maximum number of simultaneous flushing ops per tiering agent
    in the low speed mode.
  default: 2
  with_legacy: true
- name: osd_agent_min_evict_effort
  type: float
  level: advanced
  desc: minimum effort to expend evicting clean objects
  default: 0.1
  min: 0
  max: 0.99
  with_legacy: true
- name: osd_agent_quantize_effort
  type: float
  level: advanced
  desc: size of quantize unit for eviction effort
  default: 0.1
  with_legacy: true
- name: osd_agent_delay_time
  type: float
  level: advanced
  desc: how long agent should sleep if it has no work to do
  default: 5
  with_legacy: true
# decay atime and hist histograms after how many objects go by
- name: osd_agent_hist_halflife
  type: int
  level: advanced
  desc: halflife of agent atime and temp histograms
  default: 1000
  with_legacy: true
# decay atime and hist histograms after how many objects go by
- name: osd_agent_slop
  type: float
  level: advanced
  desc: slop factor to avoid switching tiering flush and eviction mode
  default: 0.02
  with_legacy: true
- name: osd_find_best_info_ignore_history_les
  type: bool
  level: dev
  desc: ignore last_epoch_started value when peering AND PROBABLY LOSE DATA
  long_desc: THIS IS AN EXTREMELY DANGEROUS OPTION THAT SHOULD ONLY BE USED AT THE
    DIRECTION OF A DEVELOPER.  It makes peering ignore the last_epoch_started value
    when peering, which can allow the OSD to believe an OSD has an authoritative view
    of a PG's contents even when it is in fact old and stale, typically leading to
    data loss (by believing a stale PG is up to date).
  default: false
  with_legacy: true
- name: osd_uuid
  type: uuid
  level: advanced
  desc: uuid label for a new OSD
  fmt_desc: The universally unique identifier (UUID) for the Ceph OSD Daemon.
  note: The ``osd_uuid`` applies to a single Ceph OSD Daemon. The ``fsid``
    applies to the entire cluster.
  flags:
  - create
  with_legacy: true
- name: osd_data
  type: str
  level: advanced
  desc: path to OSD data
  fmt_desc: The path to the OSDs data. You must create the directory when
    deploying Ceph. You should mount a drive for OSD data at this
    mount point. We do not recommend changing the default.
  default: /var/lib/ceph/osd/$cluster-$id
  flags:
  - no_mon_update
  with_legacy: true
- name: osd_journal
  type: str
  level: advanced
  desc: path to OSD journal (when FileStore backend is in use)
  fmt_desc: The path to the OSD's journal. This may be a path to a file or a
    block device (such as a partition of an SSD). If it is a file,
    you must create the directory to contain it. We recommend using a
    separate fast device when the ``osd_data`` drive is an HDD.
  default: /var/lib/ceph/osd/$cluster-$id/journal
  flags:
  - no_mon_update
  with_legacy: true
- name: osd_journal_size
  type: size
  level: advanced
  desc: size of FileStore journal (in MiB)
  fmt_desc: The size of the journal in megabytes.
  default: 5_K
  flags:
  - create
  with_legacy: true
- name: osd_journal_flush_on_shutdown
  type: bool
  level: advanced
  desc: flush FileStore journal contents during clean OSD shutdown
  default: true
  with_legacy: true
- name: osd_compact_on_start
  type: bool
  level: advanced
  desc: compact OSD's object store's OMAP on start
  default: false
# flags for specific control purpose during osd mount() process.
# e.g., can be 1 to skip over replaying journal
# or 2 to skip over mounting omap or 3 to skip over both.
# This might be helpful in case the journal is totally corrupted
# and we still want to bring the osd daemon back normally, etc.
- name: osd_os_flags
  type: uint
  level: dev
  desc: flags to skip filestore omap or journal initialization
  default: 0
- name: osd_max_write_size
  type: size
  level: advanced
  desc: Maximum size of a RADOS write operation in megabytes
  long_desc: This setting prevents clients from doing very large writes to RADOS.  If
    you set this to a value below what clients expect, they will receive an error
    when attempting to write to the cluster.
  fmt_desc: The maximum size of a write in megabytes.
  default: 90
  min: 4
  with_legacy: true
- name: osd_max_pgls
  type: uint
  level: advanced
  desc: maximum number of results when listing objects in a pool
  fmt_desc: The maximum number of placement groups to list. A client
    requesting a large number can tie up the Ceph OSD Daemon.
  default: 1_K
  with_legacy: true
- name: osd_client_message_size_cap
  type: size
  level: advanced
  desc: maximum memory to devote to in-flight client requests
  long_desc: If this value is exceeded, the OSD will not read any new client data
    off of the network until memory is freed.
  fmt_desc: The largest client data message allowed in memory.
  default: 500_M
  with_legacy: true
- name: osd_client_message_cap
  type: uint
  level: advanced
  desc: maximum number of in-flight client requests
  default: 256
  with_legacy: true
- name: osd_crush_update_on_start
  type: bool
  level: advanced
  desc: update OSD CRUSH location on startup
  default: true
  with_legacy: true
- name: osd_class_update_on_start
  type: bool
  level: advanced
  desc: set OSD device class on startup
  default: true
  with_legacy: true
- name: osd_crush_initial_weight
  type: float
  level: advanced
  desc: if >= 0, initial CRUSH weight for newly created OSDs
  long_desc: If this value is negative, the size of the OSD in TiB is used.
  fmt_desc: The initial CRUSH weight for newly added OSDs. The default
    value of this option is ``the size of a newly added OSD in TB``. By default,
    the initial CRUSH weight for a newly added OSD is set to its device size in
    TB. See `Weighting Bucket Items`_ for details.
  default: -1
  with_legacy: true
# Allows the "peered" state for recovery and backfill below min_size
- name: osd_allow_recovery_below_min_size
  type: bool
  level: dev
  desc: allow replicated pools to recover with < min_size active members
  default: true
  services:
  - osd
  with_legacy: true
# cap on # of inc maps we send to peers, clients
- name: osd_map_share_max_epochs
  type: int
  level: advanced
  default: 40
  with_legacy: true
- name: osd_map_cache_size
  type: int
  level: advanced
  default: 50
  fmt_desc: The number of OSD maps to keep cached.
  with_legacy: true
- name: osd_pg_epoch_max_lag_factor
  type: float
  level: advanced
  desc: Max multiple of the map cache that PGs can lag before we throttle map injest
  default: 2
  see_also:
  - osd_map_cache_size
- name: osd_inject_bad_map_crc_probability
  type: float
  level: dev
  default: 0
  with_legacy: true
- name: osd_inject_failure_on_pg_removal
  type: bool
  level: dev
  default: false
  with_legacy: true
# shutdown the OSD if stuatus flipping more than max_markdown_count times in recent max_markdown_period seconds
- name: osd_max_markdown_period
  type: int
  level: advanced
  default: 10_min
  with_legacy: true
- name: osd_max_markdown_count
  type: int
  level: advanced
  default: 5
  with_legacy: true
- name: osd_op_thread_timeout
  type: int
  level: advanced
  default: 15
  fmt_desc: The Ceph OSD Daemon operation thread timeout in seconds.
  with_legacy: true
- name: osd_op_thread_suicide_timeout
  type: int
  level: advanced
  default: 150
  with_legacy: true
- name: osd_op_pq_max_tokens_per_priority
  type: uint
  level: advanced
  default: 4_M
  with_legacy: true
- name: osd_op_pq_min_cost
  type: size
  level: advanced
  default: 64_K
  with_legacy: true
# preserve clone_overlap during recovery/migration
- name: osd_recover_clone_overlap
  type: bool
  level: advanced
  default: true
  fmt_desc: Preserves clone overlap during recovery. Should always be set
    to ``true``.
  with_legacy: true
- name: osd_num_cache_shards
  type: size
  level: advanced
  desc: The number of cache shards to use in the object store.
  default: 32
  flags:
  - startup
- name: osd_aggregated_slow_ops_logging
  type: bool
  level: advanced
  desc: Allow OSD daemon to send an aggregated slow ops to the cluster log
  fmt_desc: If set to ``true``, the OSD daemon will send slow ops information in 
    an aggregated format to the cluster log else sends every slow op to the
    cluster log.
  default: true
  with_legacy: true
- name: osd_op_num_threads_per_shard
  type: int
  level: advanced
  default: 0
  flags:
  - startup
  with_legacy: true
- name: osd_op_num_threads_per_shard_hdd
  type: int
  level: advanced
  default: 1
  see_also:
  - osd_op_num_threads_per_shard
  flags:
  - startup
  with_legacy: true
- name: osd_op_num_threads_per_shard_ssd
  type: int
  level: advanced
  default: 2
  see_also:
  - osd_op_num_threads_per_shard
  flags:
  - startup
  with_legacy: true
- name: osd_op_num_shards
  type: int
  level: advanced
  fmt_desc: The number of shards allocated for a given OSD. Each shard has its own processing queue.
    PGs on the OSD are distributed evenly in the shard. This setting overrides _ssd and _hdd if
    non-zero.
  default: 0
  flags:
  - startup
  with_legacy: true
- name: osd_op_num_shards_hdd
  type: int
  level: advanced
  fmt_desc: the number of shards allocated for a given OSD (for rotational media).
  default: 5
  see_also:
  - osd_op_num_shards
  flags:
  - startup
  with_legacy: true
- name: osd_op_num_shards_ssd
  type: int
  level: advanced
  fmt_desc: the number of shards allocated for a given OSD (for solid state media).
  default: 8
  see_also:
  - osd_op_num_shards
  flags:
  - startup
  with_legacy: true
- name: osd_skip_data_digest
  type: bool
  level: dev
  desc: Do not store full-object checksums if the backend (bluestore) does its own
    checksums.  Only usable with all BlueStore OSDs.
  default: false
# PrioritzedQueue (prio), Weighted Priority Queue (wpq ; default),
# mclock_opclass, mclock_client, or debug_random. "mclock_opclass"
# and "mclock_client" are based on the mClock/dmClock algorithm
# (Gulati, et al. 2010). "mclock_opclass" prioritizes based on the
# class the operation belongs to. "mclock_client" does the same but
# also works to ienforce fairness between clients. "debug_random"
# chooses among all four with equal probability.
- name: osd_op_queue
  type: str
  level: advanced
  desc: which operation priority queue algorithm to use
  long_desc: which operation priority queue algorithm to use
  fmt_desc: This sets the type of queue to be used for prioritizing ops
    within each OSD. Both queues feature a strict sub-queue which is
    dequeued before the normal queue. The normal queue is different
    between implementations. The WeightedPriorityQueue (``wpq``)
    dequeues operations in relation to their priorities to prevent
    starvation of any queue. WPQ should help in cases where a few OSDs
    are more overloaded than others. The mClockQueue
    (``mclock_scheduler``) prioritizes operations based on which class
    they belong to (recovery, scrub, snaptrim, client op, osd subop).
    See `QoS Based on mClock`_. Requires a restart.
  default: mclock_scheduler
  see_also:
  - osd_op_queue_cut_off
  enum_values:
  - wpq
  - mclock_scheduler
  - debug_random
  with_legacy: true
# Min priority to go to strict queue. (low, high)
- name: osd_op_queue_cut_off
  type: str
  level: advanced
  desc: the threshold between high priority ops and low priority ops
  long_desc: the threshold between high priority ops that use strict priority ordering
    and low priority ops that use a fairness algorithm that may or may not incorporate
    priority
  fmt_desc: This selects which priority ops will be sent to the strict
    queue verses the normal queue. The ``low`` setting sends all
    replication ops and higher to the strict queue, while the ``high``
    option sends only replication acknowledgment ops and higher to
    the strict queue. Setting this to ``high`` should help when a few
    OSDs in the cluster are very busy especially when combined with
    ``wpq`` in the ``osd_op_queue`` setting. OSDs that are very busy
    handling replication traffic could starve primary client traffic
    on these OSDs without these settings. Requires a restart.
  default: high
  see_also:
  - osd_op_queue
  enum_values:
  - low
  - high
  - debug_random
  with_legacy: true
- name: osd_mclock_scheduler_client_res
  type: float
  level: advanced
  desc: IO proportion reserved for each client (default). The default value
    of 0 specifies the lowest possible reservation. Any value greater than
    0 and up to 1.0 specifies the minimum IO proportion to reserve for each
    client in terms of a fraction of the OSD's maximum IOPS capacity.
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO proportion reserved for each client (default).
  default: 0
  min: 0
  max: 1.0
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_client_wgt
  type: uint
  level: advanced
  desc: IO share for each client (default) over reservation
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO share for each client (default) over reservation.
  default: 1
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_client_lim
  type: float
  level: advanced
  desc: IO limit for each client (default) over reservation. The default
    value of 0 specifies no limit enforcement, which means each client can
    use the maximum possible IOPS capacity of the OSD. Any value greater
    than 0 and up to 1.0 specifies the upper IO limit over reservation
    that each client receives in terms of a fraction of the OSD's
    maximum IOPS capacity.
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO limit for each client (default) over reservation.
  default: 0
  min: 0
  max: 1.0
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_background_recovery_res
  type: float
  level: advanced
  desc: IO proportion reserved for background recovery (default). The
    default value of 0 specifies the lowest possible reservation. Any value
    greater than 0 and up to 1.0 specifies the minimum IO proportion to
    reserve for background recovery operations in terms of a fraction of
    the OSD's maximum IOPS capacity.
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO proportion reserved for background recovery (default).
  default: 0
  min: 0
  max: 1.0
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_background_recovery_wgt
  type: uint
  level: advanced
  desc: IO share for each background recovery over reservation
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO share for each background recovery over reservation.
  default: 1
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_background_recovery_lim
  type: float
  level: advanced
  desc: IO limit for background recovery over reservation. The default
    value of 0 specifies no limit enforcement, which means background
    recovery operation can use the maximum possible IOPS capacity of the
    OSD. Any value greater than 0 and up to 1.0 specifies the upper IO
    limit over reservation that background recovery operation receives in
    terms of a fraction of the OSD's maximum IOPS capacity.
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO limit for background recovery over reservation.
  default: 0
  min: 0
  max: 1.0
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_background_best_effort_res
  type: float
  level: advanced
  desc: IO proportion reserved for background best_effort (default). The
    default value of 0 specifies the lowest possible reservation. Any value
    greater than 0 and up to 1.0 specifies the minimum IO proportion to
    reserve for background best_effort operations in terms of a fraction
    of the OSD's maximum IOPS capacity.
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO proportion reserved for background best_effort (default).
  default: 0
  min: 0
  max: 1.0
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_background_best_effort_wgt
  type: uint
  level: advanced
  desc: IO share for each background best_effort over reservation
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO share for each background best_effort over reservation.
  default: 1
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_background_best_effort_lim
  type: float
  level: advanced
  desc: IO limit for background best_effort over reservation. The default
    value of 0 specifies no limit enforcement, which means background
    best_effort operation can use the maximum possible IOPS capacity of the
    OSD. Any value greater than 0 and up to 1.0 specifies the upper IO
    limit over reservation that background best_effort operation receives
    in terms of a fraction of the OSD's maximum IOPS capacity.
  long_desc: Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: IO limit for background best_effort over reservation.
  default: 0
  min: 0
  max: 1.0
  see_also:
  - osd_op_queue
- name: osd_mclock_scheduler_anticipation_timeout
  type: float
  level: advanced
  desc: mclock anticipation timeout in seconds
  long_desc: the amount of time that mclock waits until the unused resource is forfeited
  default: 0
- name: osd_mclock_max_sequential_bandwidth_hdd
  type: size
  level: basic
  desc: The maximum sequential bandwidth in bytes/second of the OSD (for
    rotational media)
  long_desc: This option specifies the maximum sequential bandwidth to consider
    for an OSD whose underlying device type is rotational media. This is
    considered by the mclock scheduler to derive the cost factor to be used in
    QoS calculations. Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: The maximum sequential bandwidth in bytes/second to consider for the
    OSD (for rotational media)
  default: 150_M
  flags:
  - runtime
- name: osd_mclock_max_sequential_bandwidth_ssd
  type: size
  level: basic
  desc: The maximum sequential bandwidth in bytes/second of the OSD (for
    solid state media)
  long_desc: This option specifies the maximum sequential bandwidth to consider
    for an OSD whose underlying device type is solid state media. This is
    considered by the mclock scheduler to derive the cost factor to be used in
    QoS calculations. Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: The maximum sequential bandwidth in bytes/second to consider for the
    OSD (for solid state media)
  default: 1200_M
  flags:
  - runtime
- name: osd_mclock_max_capacity_iops_hdd
  type: float
  level: basic
  desc: Max random write IOPS capacity (at 4KiB block size) to consider per OSD
    (for rotational media)
  long_desc: This option specifies the max OSD random write IOPS capacity per
    OSD. Contributes in QoS calculations when enabling a dmclock profile. Only
    considered for osd_op_queue = mclock_scheduler
  fmt_desc: Max random write IOPS capacity (at 4 KiB block size) to consider per
    OSD (for rotational media)
  default: 315
  flags:
  - runtime
- name: osd_mclock_max_capacity_iops_ssd
  type: float
  level: basic
  desc: Max random write IOPS capacity (at 4 KiB block size) to consider per OSD
    (for solid state media)
  long_desc: This option specifies the max OSD random write IOPS capacity per
    OSD. Contributes in QoS calculations when enabling a dmclock profile. Only
    considered for osd_op_queue = mclock_scheduler
  fmt_desc: Max random write IOPS capacity (at 4 KiB block size) to consider per
    OSD (for solid state media)
  default: 21500
  flags:
  - runtime
- name: osd_mclock_force_run_benchmark_on_init
  type: bool
  level: advanced
  desc: Force run the OSD benchmark on OSD initialization/boot-up
  long_desc: This option specifies whether the OSD benchmark must be run during
    the OSD boot-up sequence even if historical data about the OSD iops capacity
    is available in the MON config store. Enable this to refresh the OSD iops
    capacity if the underlying device's performance characteristics have changed
    significantly. Only considered for osd_op_queue = mclock_scheduler.
  fmt_desc: Force run the OSD benchmark on OSD initialization/boot-up
  default: false
  see_also:
  - osd_mclock_max_capacity_iops_hdd
  - osd_mclock_max_capacity_iops_ssd
  flags:
  - startup
- name: osd_mclock_skip_benchmark
  type: bool
  level: dev
  desc: Skip the OSD benchmark on OSD initialization/boot-up
  long_desc: This option specifies whether the OSD benchmark must be skipped during
    the OSD boot-up sequence. Only considered for osd_op_queue = mclock_scheduler.
  fmt_desc: Skip the OSD benchmark on OSD initialization/boot-up
  default: false
  see_also:
  - osd_mclock_max_capacity_iops_hdd
  - osd_mclock_max_capacity_iops_ssd
  flags:
  - runtime
- name: osd_mclock_profile
  type: str
  level: advanced
  desc: Which mclock profile to use
  long_desc: This option specifies the mclock profile to enable - one among the set
    of built-in profiles or a custom profile. Only considered for osd_op_queue = mclock_scheduler
  fmt_desc: |
    This sets the type of mclock profile to use for providing QoS
    based on operations belonging to different classes (background
    recovery, scrub, snaptrim, client op, osd subop). Once a built-in
    profile is enabled, the lower level mclock resource control
    parameters [*reservation, weight, limit*] and some Ceph
    configuration parameters are set transparently. Note that the
    above does not apply for the *custom* profile.
  default: balanced
  see_also:
  - osd_op_queue
  enum_values:
  - balanced
  - high_recovery_ops
  - high_client_ops
  - custom
  flags:
  - runtime
- name: osd_mclock_override_recovery_settings
  type: bool
  level: advanced
  desc: Setting this option enables the override of recovery/backfill limits
    for the mClock scheduler.
  long_desc: This option when set enables the override of the max recovery
    active and the max backfills limits with mClock scheduler active. These
    options are not modifiable when mClock scheduler is active. Any attempt
    to modify these values without setting this option will reset the
    recovery or backfill option back to its default value.
  fmt_desc: Setting this option will enable the override of the
    recovery/backfill limits for the mClock scheduler as defined by the
    ``osd_recovery_max_active_hdd``, ``osd_recovery_max_active_ssd`` and
    ``osd_max_backfills`` options.
  default: false
  see_also:
  - osd_recovery_max_active_hdd
  - osd_recovery_max_active_ssd
  - osd_max_backfills
  flags:
  - runtime
- name: osd_mclock_iops_capacity_threshold_hdd
  type: float
  level: basic
  desc: The threshold IOPs capacity (at 4KiB block size) beyond which to ignore
    the OSD bench results for an OSD (for rotational media)
  long_desc: This option specifies the threshold IOPS capacity for an OSD under
    which the OSD bench results can be considered for QoS calculations. Only
    considered for osd_op_queue = mclock_scheduler
  fmt_desc: The threshold IOPS capacity (at 4KiB block size) beyond which to
    ignore OSD bench results for an OSD (for rotational media)
  default: 500
  flags:
  - runtime
- name: osd_mclock_iops_capacity_threshold_ssd
  type: float
  level: basic
  desc: The threshold IOPs capacity (at 4KiB block size) beyond which to ignore
    the OSD bench results for an OSD (for solid state media)
  long_desc: This option specifies the threshold IOPS capacity for an OSD under
    which the OSD bench results can be considered for QoS calculations. Only
    considered for osd_op_queue = mclock_scheduler
  fmt_desc: The threshold IOPS capacity (at 4KiB block size) beyond which to
    ignore OSD bench results for an OSD (for solid state media)
  default: 80000
  flags:
  - runtime
# Set to true for testing.  Users should NOT set this.
# If set to true even after reading enough shards to
# decode the object, any error will be reported.
- name: osd_read_ec_check_for_errors
  type: bool
  level: advanced
  default: false
  with_legacy: true
- name: osd_recovery_delay_start
  type: float
  level: advanced
  default: 0
  fmt_desc: After peering completes, Ceph will delay for the specified number
    of seconds before starting to recover RADOS objects.
  with_legacy: true
- name: osd_recovery_max_active
  type: uint
  level: advanced
  desc: Number of simultaneous active recovery operations per OSD (overrides _ssd
    and _hdd if non-zero)
  fmt_desc: The number of active recovery requests per OSD at one time. More
    requests will accelerate recovery, but the requests places an
    increased load on the cluster.
  note: This value is only used if it is non-zero. Normally it
    is ``0``, which means that the ``hdd`` or ``ssd`` values
    (below) are used, depending on the type of the primary
    device backing the OSD.
  default: 0
  see_also:
  - osd_recovery_max_active_hdd
  - osd_recovery_max_active_ssd
  flags:
  - runtime
  with_legacy: true
- name: osd_recovery_max_active_hdd
  type: uint
  level: advanced
  desc: Number of simultaneous active recovery operations per OSD (for rotational
    devices)
  fmt_desc: The number of active recovery requests per OSD at one time, if the
    primary device is rotational.
  default: 3
  see_also:
  - osd_recovery_max_active
  - osd_recovery_max_active_ssd
  flags:
  - runtime
  with_legacy: true
- name: osd_recovery_max_active_ssd
  type: uint
  level: advanced
  desc: Number of simultaneous active recovery operations per OSD (for non-rotational
    solid state devices)
  fmt_desc: The number of active recovery requests per OSD at one time, if the
    primary device is non-rotational (i.e., an SSD).
  default: 10
  see_also:
  - osd_recovery_max_active
  - osd_recovery_max_active_hdd
  flags:
  - runtime
  with_legacy: true
- name: osd_recovery_max_single_start
  type: uint
  level: advanced
  default: 1
  fmt_desc: The maximum number of recovery operations per OSD that will be
    newly started when an OSD is recovering.
  with_legacy: true
# max size of push chunk
- name: osd_recovery_max_chunk
  type: size
  level: advanced
  default: 8_M
  fmt_desc: the maximum total size of data chunks a recovery op can carry.
  with_legacy: true
# max number of omap entries per chunk; 0 to disable limit
- name: osd_recovery_max_omap_entries_per_chunk
  type: uint
  level: advanced
  default: 8096
  with_legacy: true
# max size of a COPYFROM chunk
- name: osd_copyfrom_max_chunk
  type: size
  level: advanced
  default: 8_M
  with_legacy: true
# push cost per object
- name: osd_push_per_object_cost
  type: size
  level: advanced
  default: 1000
  fmt_desc: the overhead for serving a push op
  with_legacy: true
# max size of push message
- name: osd_max_push_cost
  type: size
  level: advanced
  default: 8_M
  with_legacy: true
# max objects in single push op
- name: osd_max_push_objects
  type: uint
  level: advanced
  default: 10
  with_legacy: true
# Only use clone_overlap for recovery if there are fewer than
# osd_recover_clone_overlap_limit entries in the overlap set
- name: osd_recover_clone_overlap_limit
  type: uint
  level: advanced
  default: 10
  flags:
  - runtime
- name: osd_debug_feed_pullee
  type: int
  level: dev
  desc: Feed a pullee, and force primary to pull a currently missing object from it
  default: -1
  with_legacy: true
- name: osd_backfill_scan_min
  type: int
  level: advanced
  default: 64
  fmt_desc: The minimum number of objects per backfill scan.
  with_legacy: true
- name: osd_backfill_scan_max
  type: int
  level: advanced
  default: 512
  fmt_desc: The maximum number of objects per backfill scan.p
  with_legacy: true
- name: osd_extblkdev_plugins
  type: str
  level: advanced
  desc: extended block device plugins to load, provide compression feedback at runtime
  default: vdo
  flags:
  - startup
# minimum number of peers
- name: osd_heartbeat_min_peers
  type: int
  level: advanced
  default: 10
  with_legacy: true
- name: osd_delete_sleep
  type: float
  level: advanced
  desc: Time in seconds to sleep before next removal transaction. This setting
    overrides _ssd, _hdd, and _hybrid if non-zero.
  fmt_desc: Time in seconds to sleep before the next removal transaction. This
    throttles the PG deletion process.
  default: 0
  flags:
  - runtime
- name: osd_delete_sleep_hdd
  type: float
  level: advanced
  desc: Time in seconds to sleep before next removal transaction for HDDs
  default: 5
  flags:
  - runtime
- name: osd_delete_sleep_ssd
  type: float
  level: advanced
  desc: Time in seconds to sleep before next removal transaction for SSDs
  default: 1
  flags:
  - runtime
- name: osd_delete_sleep_hybrid
  type: float
  level: advanced
  desc: Time in seconds to sleep before next removal transaction when OSD data is on HDD
    and OSD journal or WAL+DB is on SSD
  default: 1
  flags:
  - runtime
- name: osd_rocksdb_iterator_bounds_enabled
  desc: Whether omap iterator bounds are applied to rocksdb iterator ReadOptions
  type: bool
  level: dev
  default: true
  with_legacy: true