# ceph/monitoring/ceph-mixin/prometheus_alerts.yml (quincy 17.2.0)
groups:
  - name: cluster health
    rules:
      - alert: CephHealthError
        expr: ceph_health_status == 2
        for: 5m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.2.1
        annotations:
          summary: Cluster is in an ERROR state
          description: >
            Ceph in HEALTH_ERROR state for more than 5 minutes.
            Please check "ceph health detail" for more information.

      - alert: CephHealthWarning
        expr: ceph_health_status == 1
        for: 15m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          summary: Cluster is in a WARNING state
          description: >
            Ceph has been in HEALTH_WARN for more than 15 minutes.
            Please check "ceph health detail" for more information.

  - name: mon
    rules:
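      # Critical: MON_DOWN is active and the number of mons still in quorum
      # equals the bare majority (floor(n/2) + 1), so losing one more mon
      # would break quorum.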
      - alert: CephMonDownQuorumAtRisk
        expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() (count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1))) == 1
        for: 30s
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.3.1
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
          summary: Monitor quorum is at risk
          description: |
            {{ $min := query "floor(count(ceph_mon_metadata) / 2) +1" | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active.
            Without quorum the cluster will become inoperable, affecting all connected clients and services.

            The following monitors are down:
            {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }}
              - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
            {{- end }}
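      # Warning: count(ceph_mon_quorum_status == 0) returns no result while all
      # mons are in quorum, so this only fires once at least one mon is down.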
      - alert: CephMonDown
        expr: (count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1))
        for: 30s
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
          summary: One or more ceph monitors are down
          description: |
            {{ $down := query "count(ceph_mon_quorum_status == 0)" | first | value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down.
            Quorum is still intact, but the loss of further monitors will make your cluster inoperable.

            The following monitors are down:
            {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }}
              - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
            {{- end }}
      - alert: CephMonDiskspaceCritical
        expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.3.2
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit
          summary: Disk space on at least one monitor is critically low
          description: |
            The free space available to a monitor's store is critically low (<5% by default).
            You should increase the space available to the monitor(s). The
            default location for the store sits under /var/lib/ceph. Your monitor hosts are:
            {{- range query "ceph_mon_metadata"}}
              - {{ .Labels.hostname }}
            {{- end }}

      - alert: CephMonDiskspaceLow
        expr: ceph_health_detail{name="MON_DISK_LOW"} == 1
        for: 5m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low
          summary: Disk space on at least one monitor is approaching full
          description: |
            The space available to a monitor's store is approaching full (>70% is the default).
            You should increase the space available to the monitor store. The
            default location for the store sits under /var/lib/ceph. Your monitor hosts are:
            {{- range query "ceph_mon_metadata"}}
              - {{ .Labels.hostname }}
            {{- end }}

      - alert: CephMonClockSkew
        expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1
        for: 1m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew
          summary: Clock skew across the Monitor hosts detected
          description: |
            The ceph monitors rely on a consistent time reference to maintain
            quorum and cluster consistency. This event indicates that at least
            one of your mons is not synced correctly.

            Review the cluster status with 'ceph -s'. This will show which monitors
            are affected. Check the time sync status on each monitor host.

  - name: osd
    rules:
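      # No 'for' clause here: this fires as soon as 10% or more of the known
      # OSDs are reported down.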
      - alert: CephOSDDownHigh
        expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.4.1
        annotations:
          summary: More than 10% of OSDs are down
          description: |
            {{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%).

            The following OSDs are down:
            {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
              - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
            {{- end }}
      - alert: CephOSDHostDown
        expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1
        for: 5m
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.4.8
        annotations:
          summary: An OSD host is offline
          description: |
            The following OSDs are down:
            {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
              - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }}
            {{- end }}
      - alert: CephOSDDown
        expr: ceph_health_detail{name="OSD_DOWN"} == 1
        for: 5m
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.4.2
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down
          summary: An OSD has been marked down/unavailable
          description: |
            {{ $num := query "count(ceph_osd_up == 0)" | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5 minutes.

            The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down:
            {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}}
              - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
            {{- end }}

      - alert: CephOSDNearFull
        expr: ceph_health_detail{name="OSD_NEARFULL"} == 1
        for: 5m
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.4.3
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull
          summary: OSD(s) running low on free space (NEARFULL)
          description: |
            One or more OSDs have reached their NEARFULL threshold.

            Use 'ceph health detail' to identify which OSDs have reached this threshold.
            To resolve, either add capacity to the cluster, or delete unwanted data.
      - alert: CephOSDFull
        expr: ceph_health_detail{name="OSD_FULL"} > 0
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.4.6
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full
          summary: OSD(s) is full, writes blocked
          description: |
            An OSD has reached its full threshold. Writes from all pools that share the
            affected OSD will be blocked.

            To resolve, either add capacity to the cluster, or delete unwanted data.
      - alert: CephOSDBackfillFull
        expr: ceph_health_detail{name="OSD_BACKFILLFULL"} > 0
        for: 1m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull
          summary: OSD(s) too full for backfill operations
          description: |
            An OSD has reached its BACKFILL FULL threshold. This will prevent rebalance operations
            from completing for some pools. Check the current capacity utilisation with 'ceph df'.

            To resolve, either add capacity to the cluster, or delete unwanted data.
      - alert: CephOSDTooManyRepairs
        expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 1
        for: 30s
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs
          summary: OSD has hit a high number of read errors
          description: |
            Reads from an OSD have used a secondary PG to return data to the client, indicating
            a potential failing disk.
      - alert: CephOSDTimeoutsPublicNetwork
        expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 1
        for: 1m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          summary: Network issues delaying OSD heartbeats (public network)
          description: |
            OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network
            for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs.
      - alert: CephOSDTimeoutsClusterNetwork
        expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 1
        for: 1m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          summary: Network issues delaying OSD heartbeats (cluster network)
          description: |
            OSD heartbeats on the cluster's 'cluster' network (backend) are running slow. Investigate the network
            for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs.
      - alert: CephOSDInternalDiskSizeMismatch
        expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 1
        for: 1m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch
          summary: OSD size inconsistency error
          description: |
            One or more OSDs have an internal inconsistency between the size of the physical device and its metadata.
            This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs.
      - alert: CephDeviceFailurePredicted
        expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1
        for: 1m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2
          summary: Device(s) have been predicted to fail soon
          description: |
            The device health module has determined that one or more devices will fail
            soon. To review the device states use 'ceph device ls'. To show a specific
            device use 'ceph device info <dev id>'.

            Mark the OSD as out (so data may migrate to other OSDs in the cluster). Once
            the OSD is empty, remove and replace it.
      - alert: CephDeviceFailurePredictionTooHigh
        expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.4.7
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany
          summary: Too many devices have been predicted to fail, unable to resolve
          description: |
            The device health module has determined that the number of devices predicted to
            fail cannot be remediated automatically, since it would take too many OSDs out of
            the cluster, impacting performance and potentially availability. You should add new
            OSDs to the cluster to allow data to be relocated and avoid data integrity issues.
      - alert: CephDeviceFailureRelocationIncomplete
        expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1
        for: 1m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use
          summary: A device failure is predicted, but unable to relocate data
          description: |
            The device health module has determined that one or more devices will fail
            soon, but the normal process of relocating the data on the device to other
            OSDs in the cluster is blocked.

            Check that the cluster has available free space. It may be necessary to add
            more disks to the cluster to allow the data from the failing device to
            successfully migrate.

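      # rate(ceph_osd_up[5m]) * 60 approximates the number of up/down state
      # changes per minute over the last 5 minutes; more than one change per
      # minute suggests flapping. The metadata join only adds the hostname label.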
      - alert: CephOSDFlapping
        expr: |
          (
            rate(ceph_osd_up[5m])
            * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
          ) * 60 > 1
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.4.4
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds
          summary: Network issues are causing OSDs to flap (mark each other out)
          description: >
            OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was
            marked down and back up {{ $value | humanize }} times once a
            minute for 5 minutes. This could indicate a network issue (latency,
            packet drop, disruption) on the cluster's "cluster network". Check the
            network environment on the listed host(s).

      - alert: CephOSDReadErrors
        expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1
        for: 30s
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors
          summary: Device read errors detected
          description: >
            An OSD has encountered read errors, but the OSD has recovered by retrying
            the reads. This may indicate an issue with the hardware or kernel.
      # alert on high deviation from average PG count
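      # Each OSD's PG count is compared against the job-wide average; the join
      # to ceph_osd_metadata adds the hostname label. Fires above 30% deviation.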
      - alert: CephPGImbalance
        expr: |
          abs(
            (
              (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)
            ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
          ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
        for: 5m
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.4.5
        annotations:
          summary: PG allocations are not balanced across devices
          description: >
            OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates
            by more than 30% from average PG count.
      # alert on high commit latency...but how high is too high

  - name: mds
    rules:
      - alert: CephFilesystemDamaged
        expr: ceph_health_detail{name="MDS_DAMAGE"} > 0
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.5.1
        annotations:
          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
          summary: Ceph filesystem is damaged.
          description: >
            The filesystem's metadata has been corrupted. Data access
            may be blocked.

            Either analyse the output from the mds daemon admin socket, or
            escalate to support.
      - alert: CephFilesystemOffline
        expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.5.3
        annotations:
          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down
          summary: Ceph filesystem is offline
          description: >
            All MDS ranks are unavailable. The ceph daemons providing the metadata
            for the Ceph filesystem are all down, rendering the filesystem offline.
      - alert: CephFilesystemDegraded
        expr: ceph_health_detail{name="FS_DEGRADED"} > 0
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.5.4
        annotations:
          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded
          summary: Ceph filesystem is degraded
          description: >
            One or more metadata daemons (MDS ranks) are failed or in a
            damaged state. At best the filesystem is partially available,
            at worst the filesystem is completely unusable.
      - alert: CephFilesystemMDSRanksLow
        expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0
        for: 1m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max
          summary: Ceph MDS daemon count is lower than configured
          description: >
            The filesystem's "max_mds" setting defines the number of MDS ranks in
            the filesystem. The current number of active MDS daemons is less than
            this setting.
      - alert: CephFilesystemInsufficientStandby
        expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0
        for: 1m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby
          summary: Ceph filesystem standby daemons too low
          description: >
            The number of standby MDS daemons available is less than the number
            requested by standby_count_wanted. Adjust the standby count
            or increase the number of MDS daemons within the filesystem.
      - alert: CephFilesystemFailureNoStandby
        expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.5.5
        annotations:
          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds
          summary: Ceph MDS daemon failed, no further standby available
          description: >
            An MDS daemon has failed, leaving only one active rank without
            further standby. Investigate the cause of the failure or add a
            standby daemon.
      - alert: CephFilesystemReadOnly
        expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.5.2
        annotations:
          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
          summary: Ceph filesystem in read only mode, due to write error(s)
          description: >
            The filesystem has switched to READ ONLY due to an unexpected
            write error when writing to the metadata pool.

            Either analyse the output from the mds daemon admin socket, or
            escalate to support.

  - name: mgr
    rules:
      - alert: CephMgrModuleCrash
        expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1
        for: 5m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.6.1
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash
          summary: A mgr module has recently crashed
          description: >
            One or more mgr modules have crashed and are yet to be acknowledged by the administrator. A
            crashed module may impact functionality within the cluster. Use the 'ceph crash' commands to
            investigate which module has failed, and archive it to acknowledge the failure.
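      # up{job="ceph"} == 0 means the scrape target exists but is failing; a
      # target that is missing entirely is handled by PrometheusJobMissing below.
      # This assumes the Prometheus scrape job for Ceph is named "ceph".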
      - alert: CephMgrPrometheusModuleInactive
        expr: up{job="ceph"} == 0
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.6.2
        annotations:
          summary: Ceph's mgr/prometheus module is not available
          description: >
            The mgr/prometheus module at {{ $labels.instance }} is unreachable. This
            could mean that the module has been disabled or the mgr itself is down.

            Without the mgr/prometheus module, metrics and alerts will no longer
            function. Open a shell to ceph and use 'ceph -s' to determine whether the
            mgr is active. If the mgr is not active, restart it, otherwise you can check
            that the mgr/prometheus module is loaded with 'ceph mgr module ls' and if it's
            not listed as enabled, enable it with 'ceph mgr module enable prometheus'.

  - name: pgs
    rules:
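      # Joins pool metadata (for the pool name) onto the per-pool PG counts and
      # fires when a pool has PGs that are not active.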
      - alert: CephPGsInactive
        expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0
        for: 5m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.7.1
        annotations:
          summary: One or more Placement Groups are inactive
          description: >
            {{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}.
            Inactive placement groups aren't able to serve read/write
            requests.
      - alert: CephPGsUnclean
        expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0
        for: 15m
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.7.2
        annotations:
          summary: One or more placement groups are marked unclean
          description: >
            {{ $value }} PGs haven't been clean for more than 15 minutes in pool {{ $labels.name }}.
            Unclean PGs haven't been able to completely recover from a previous failure.
      - alert: CephPGsDamaged
        expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1
        for: 5m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.7.4
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged
          summary: Placement group damaged, manual intervention needed
          description: >
            During data consistency checks (scrub), at least one PG has been flagged as being
            damaged or inconsistent.

            Check to see which PG is affected, and attempt a manual repair if necessary. To list
            problematic placement groups, use 'rados list-inconsistent-pg <pool>'. To repair PGs use
            the 'ceph pg repair <pg_num>' command.
      - alert: CephPGRecoveryAtRisk
        expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 1
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.7.5
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full
          summary: OSDs are too full for automatic recovery
          description: >
            Data redundancy may be reduced, or is at risk, since one or more OSDs are at or above their
            'full' threshold. Add more capacity to the cluster, or delete unwanted data.
      - alert: CephPGUnavailableBlockingIO
        # PG_AVAILABILITY, but an OSD is not in a DOWN state
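        # Subtracting the OSD_DOWN scalar means this only fires when PG
        # availability is reduced while no OSD is reported down.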
        expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"})) == 1
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.7.3
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability
          summary: Placement group is unavailable, blocking some I/O
          description: >
            Data availability is reduced, impacting the cluster's ability to service I/O to some data. One or
            more placement groups (PGs) are in a state that blocks I/O.
      - alert: CephPGBackfillAtRisk
        expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 1
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.7.6
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full
          summary: Backfill operations are blocked due to lack of free space
          description: >
            Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs
            have breached their 'backfillfull' threshold. Add more capacity, or delete unwanted data.
      - alert: CephPGNotScrubbed
        expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1
        for: 5m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed
          summary: Placement group(s) have not been scrubbed
          description: |
            One or more PGs have not been scrubbed recently. The scrub process is a data integrity
            feature, protecting against bit-rot. It checks that objects and their metadata (size and
            attributes) match across object replicas. When PGs miss their scrub window, it may
            indicate the scrub window is too small, or PGs were not in a 'clean' state during the
            scrub window.

            You can manually initiate a scrub with: ceph pg scrub <pgid>
      - alert: CephPGsHighPerOSD
        expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1
        for: 1m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs
          summary: Placement groups per OSD is too high
          description: |
            The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).

            Check that the pg_autoscaler hasn't been disabled for any of the pools with 'ceph osd pool autoscale-status',
            and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide
            the autoscaler based on the expected relative size of the pool
            (e.g. 'ceph osd pool set cephfs.cephfs.meta target_size_ratio .1')
      - alert: CephPGNotDeepScrubbed
        expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1
        for: 5m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed
          summary: Placement group(s) have not been deep scrubbed
          description: |
            One or more PGs have not been deep scrubbed recently. Deep scrub is a data integrity
            feature, protecting against bit-rot. It compares the contents of objects and their
            replicas for inconsistency. When PGs miss their deep scrub window, it may indicate
            that the window is too small or PGs were not in a 'clean' state during the deep-scrub
            window.

            You can manually initiate a deep scrub with: ceph pg deep-scrub <pgid>

  - name: nodes
    rules:
      - alert: CephNodeRootFilesystemFull
        expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
        for: 5m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.8.1
        annotations:
          summary: Root filesystem is dangerously full
          description: >
            Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free.

      # alert on nic packet error and drop rates >= 0.01%, or >= 10 packets in one minute
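      # Both alerts below compare dropped (or errored) packets against total
      # packets across receive and transmit over the last minute, and also fire
      # on an absolute count of 10 or more such packets within a minute.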
      - alert: CephNodeNetworkPacketDrops
        expr: |
          (
            increase(node_network_receive_drop_total{device!="lo"}[1m]) +
            increase(node_network_transmit_drop_total{device!="lo"}[1m])
          ) / (
            increase(node_network_receive_packets_total{device!="lo"}[1m]) +
            increase(node_network_transmit_packets_total{device!="lo"}[1m])
          ) >= 0.0001 or (
            increase(node_network_receive_drop_total{device!="lo"}[1m]) +
            increase(node_network_transmit_drop_total{device!="lo"}[1m])
          ) >= 10
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.8.2
        annotations:
          summary: One or more NICs are seeing packet drops
          description: >
            Node {{ $labels.instance }} experiences packet drops > 0.01% or
            > 10 packets dropped per minute on interface {{ $labels.device }}.

      - alert: CephNodeNetworkPacketErrors
        expr: |
          (
            increase(node_network_receive_errs_total{device!="lo"}[1m]) +
            increase(node_network_transmit_errs_total{device!="lo"}[1m])
          ) / (
            increase(node_network_receive_packets_total{device!="lo"}[1m]) +
            increase(node_network_transmit_packets_total{device!="lo"}[1m])
          ) >= 0.0001 or (
            increase(node_network_receive_errs_total{device!="lo"}[1m]) +
            increase(node_network_transmit_errs_total{device!="lo"}[1m])
          ) >= 10
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.8.3
        annotations:
          summary: One or more NICs are seeing packet errors
          description: >
            Node {{ $labels.instance }} experiences packet errors > 0.01% or
            > 10 errored packets per minute on interface {{ $labels.device }}.

      # Restrict to device names beginning with '/' to skip false alarms from
      # tmpfs, overlay type filesystems
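      # predict_linear extrapolates the last 48h of free-space samples 5 days
      # ahead; the join to node_uname_info only adds the nodename label.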
      - alert: CephNodeDiskspaceWarning
        expr: |
          predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *
            on(instance) group_left(nodename) node_uname_info < 0
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.8.4
        annotations:
          summary: Host filesystem free space is getting low
          description: >
            Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }}
            will be full in less than 5 days assuming the average fill-up
            rate of the past 48 hours.

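      # Intended to flag interfaces (excluding 'lo') whose MTU is the outlying
      # maximum or minimum relative to the median MTU of devices with the same
      # name across the cluster.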
      - alert: CephNodeInconsistentMTU
        expr: |
          node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
            scalar(
              max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
                quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
            )
          or
          node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
            scalar(
              min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
                quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
            )
        labels:
          severity: warning
          type: ceph_default
        annotations:
          summary: MTU settings across Ceph hosts are inconsistent
          description: >
            Node {{ $labels.instance }} has a different MTU size ({{ $value }})
            than the median of devices named {{ $labels.device }}.

  - name: pools
    rules:
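      # predict_linear extrapolates each pool's percent-used over the last 48h
      # 5 days ahead; group_right joins ceph_pool_metadata for the pool name.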
      - alert: CephPoolGrowthWarning
        expr: |
          (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id)
            group_right ceph_pool_metadata) >= 95
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.9.2
        annotations:
          summary: Pool growth rate may soon exceed its capacity
          description: >
            Pool '{{ $labels.name }}' will be full in less than 5 days
            assuming the average fill-up rate of the past 48 hours.
      - alert: CephPoolBackfillFull
        expr: ceph_health_detail{name="POOL_BACKFILLFULL"} > 0
        labels:
          severity: warning
          type: ceph_default
        annotations:
          summary: Free space in a pool is too low for recovery/rebalance
          description: >
            A pool is approaching its near-full threshold, which will
            prevent rebalance operations from completing. You should
            consider adding more capacity to the pool.

      - alert: CephPoolFull
        expr: ceph_health_detail{name="POOL_FULL"} > 0
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.9.1
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full
          summary: Pool is full - writes are blocked
          description: |
            A pool has reached its MAX quota, or the OSDs supporting the pool
            have reached their FULL threshold. Until this is resolved, writes to
            the pool will be blocked.
            Pool Breakdown (top 5)
            {{- range query "topk(5, sort_desc(ceph_pool_percent_used * on(pool_id) group_right ceph_pool_metadata))" }}
              - {{ .Labels.name }} at {{ .Value }}%
            {{- end }}
            Either increase the pool's quota, or add capacity to the cluster first
            then increase its quota (e.g. ceph osd pool set-quota <pool_name> max_bytes <bytes>)
      - alert: CephPoolNearFull
        expr: ceph_health_detail{name="POOL_NEAR_FULL"} > 0
        for: 5m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          summary: One or more Ceph pools are getting full
          description: |
            A pool has exceeded its warning (percent full) threshold, or the OSDs
            supporting the pool have reached their NEARFULL thresholds. Writes may
            continue, but you are at risk of the pool going read-only if more capacity
            isn't made available.

            Determine the affected pool with 'ceph df detail', for example looking
            at QUOTA BYTES and STORED. Either increase the pool's quota, or add
            capacity to the cluster first then increase its quota
            (e.g. ceph osd pool set-quota <pool_name> max_bytes <bytes>)
  - name: healthchecks
    rules:
      - alert: CephSlowOps
        expr: ceph_healthcheck_slow_ops > 0
        for: 30s
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
          summary: MON/OSD operations are slow to complete
          description: >
            {{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)
  # cephadm alerts
  - name: cephadm
    rules:
      - alert: CephadmUpgradeFailed
        expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0
        for: 30s
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.11.2
        annotations:
          summary: Ceph version upgrade has failed
          description: >
            The cephadm cluster upgrade process has failed. The cluster remains in
            an undetermined state.

            Please review the cephadm logs to understand the nature of the issue.
      - alert: CephadmDaemonFailed
        expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0
        for: 30s
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.11.1
        annotations:
          summary: A ceph daemon managed by cephadm is down
          description: >
            A daemon managed by cephadm is no longer active. Determine which
            daemon is down with 'ceph health detail'. You may start daemons with
            the 'ceph orch daemon start <daemon_id>' command.
      - alert: CephadmPaused
        expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0
        for: 1m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused
          summary: Orchestration tasks via cephadm are PAUSED
          description: >
            Cluster management has been paused manually. This will prevent the
            orchestrator from performing service management and reconciliation.
            If this is not intentional, resume cephadm operations with 'ceph orch resume'.

  # prometheus alerts
  - name: PrometheusServer
    rules:
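      # absent() only returns a result when no 'up' series with job="ceph" exists
      # at all, i.e. the scrape job has been removed from the Prometheus config
      # (a failing target is covered by CephMgrPrometheusModuleInactive above).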
      - alert: PrometheusJobMissing
        expr: absent(up{job="ceph"})
        for: 30s
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.12.1
        annotations:
          summary: The scrape job for Ceph is missing from Prometheus
          description: |
            The prometheus job that scrapes from Ceph is no longer defined. This
            effectively means you will have no metrics or alerts for the cluster.

            Please review the job definitions in the prometheus.yml file of the prometheus
            instance.
  # Object related events
  - name: rados
    rules:
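      # Only fires when OBJECT_UNFOUND is raised while every known OSD is up, so
      # the missing object cannot simply be waiting on a down OSD to return.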
      - alert: CephObjectMissing
        expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1
        for: 30s
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.10.1
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound
          summary: Object(s) has been marked UNFOUND
          description: |
            A version of a RADOS object cannot be found, even though all OSDs are up. I/O
            requests for this object from clients will block (hang). Resolving this issue may
            require the object to be rolled back to a prior version manually, and then verified.
  # Generic
  - name: generic
    rules:
      - alert: CephDaemonCrash
        expr: ceph_health_detail{name="RECENT_CRASH"} == 1
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.1.2
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash
          summary: One or more Ceph daemons have crashed, and are pending acknowledgement
          description: |
            One or more daemons have crashed recently, and need to be acknowledged. This notification
            ensures that software crashes don't go unseen. To acknowledge a crash, use the
            'ceph crash archive <id>' command.