groups:
  - name: "cluster health"
    rules:
      - alert: "CephHealthError"
        annotations:
          description: "The cluster state has been HEALTH_ERROR for more than 5 minutes. Please check 'ceph health detail' for more information."
          summary: "Ceph is in the ERROR state"
        expr: "ceph_health_status == 2"
        for: "5m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.2.1"
          severity: "critical"
          type: "ceph_default"
      - alert: "CephHealthWarning"
        annotations:
          description: "The cluster state has been HEALTH_WARN for more than 15 minutes. Please check 'ceph health detail' for more information."
          summary: "Ceph is in the WARNING state"
        expr: "ceph_health_status == 1"
        for: "15m"
        labels:
          severity: "warning"
          type: "ceph_default"
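  # Note (an assumption for readers of this mixin, not part of the rules): the
  # mgr prometheus module exports ceph_health_status as a numeric gauge, where
  # 0 is typically HEALTH_OK, 1 is HEALTH_WARN and 2 is HEALTH_ERR, which is
  # why the two rules above compare against 1 and 2. A quick sanity check in
  # the Prometheus expression browser might look like:
  #   ceph_health_status        # current overall state per cluster
  #   ceph_health_detail > 0    # individual health checks currently asserted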
  - name: "mon"
    rules:
      - alert: "CephMonDownQuorumAtRisk"
        annotations:
          description: "{{ $min := query \"floor(count(ceph_mon_metadata) / 2) + 1\" | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: {{- range query \"(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)\" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}"
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down"
          summary: "Monitor quorum is at risk"
        expr: |
          (
            (ceph_health_detail{name="MON_DOWN"} == 1) * on() (
              count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1)
            )
          ) == 1
        for: "30s"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.3.1"
          severity: "critical"
          type: "ceph_default"
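      # A worked example of the quorum arithmetic above (my reading, not from
      # the upstream docs): floor(count(mons) / 2) + 1 is the minimum majority,
      # so with 5 monitors the majority is floor(5 / 2) + 1 = 3. The rule
      # therefore fires only while MON_DOWN is asserted and exactly the bare
      # majority of monitors is still in quorum, i.e. losing one more monitor
      # would make the cluster inoperable.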
      - alert: "CephMonDown"
        annotations:
          description: |
            {{ $down := query "count(ceph_mon_quorum_status == 0)" | first | value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down"
          summary: "One or more monitors down"
        expr: |
          count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1)
        for: "30s"
        labels:
          severity: "warning"
          type: "ceph_default"
      - alert: "CephMonDiskspaceCritical"
        annotations:
          description: "The free space available to a monitor's store is critically low. You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are: {{- range query \"ceph_mon_metadata\"}} - {{ .Labels.hostname }} {{- end }}"
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit"
          summary: "Filesystem space on at least one monitor is critically low"
        expr: "ceph_health_detail{name=\"MON_DISK_CRIT\"} == 1"
        for: "1m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.3.2"
          severity: "critical"
          type: "ceph_default"
      - alert: "CephMonDiskspaceLow"
        annotations:
          description: "The space available to a monitor's store is approaching full (>70% is the default). You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are: {{- range query \"ceph_mon_metadata\"}} - {{ .Labels.hostname }} {{- end }}"
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low"
          summary: "Drive space on at least one monitor is approaching full"
        expr: "ceph_health_detail{name=\"MON_DISK_LOW\"} == 1"
        for: "5m"
        labels:
          severity: "warning"
          type: "ceph_default"
      - alert: "CephMonClockSkew"
        annotations:
          description: "Ceph monitors rely on closely synchronized time to maintain quorum and cluster consistency. This event indicates that the time on at least one mon has drifted too far from the lead mon. Review cluster status with 'ceph -s'; this will show which monitors are affected. Check the time sync status on each monitor host with 'ceph time-sync-status' and the state and peers of your ntpd or chrony daemon."
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew"
          summary: "Clock skew detected among monitors"
        expr: "ceph_health_detail{name=\"MON_CLOCK_SKEW\"} == 1"
        for: "1m"
        labels:
          severity: "warning"
          type: "ceph_default"
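      # The remediation text above refers to the node's time daemon. As an
      # illustrative follow-up (host commands assumed to be available, not part
      # of this rule set), the sync state can usually be inspected with:
      #   ceph time-sync-status     # skew as seen by the monitors
      #   chronyc sources -v        # if chrony is the time daemon
      #   timedatectl status        # generic systemd view of NTP sync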
  - name: "osd"
    rules:
      - alert: "CephOSDDownHigh"
        annotations:
          description: "{{ $value | humanize }}% or {{ with query \"count(ceph_osd_up == 0)\" }}{{ . | first | value }}{{ end }} of {{ with query \"count(ceph_osd_up)\" }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%). The following OSDs are down: {{- range query \"(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}"
          summary: "More than 10% of OSDs are down"
        expr: "count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.4.1"
          severity: "critical"
          type: "ceph_default"
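      # Worked example for the threshold above (illustrative numbers): with 20
      # OSDs in total and 2 reporting ceph_osd_up == 0, the expression is
      # 2 / 20 * 100 = 10, which meets the >= 10 condition and fires
      # immediately (the rule intentionally has no 'for' clause).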
      - alert: "CephOSDHostDown"
        annotations:
          description: "The following OSDs are down: {{- range query \"(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" }} - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} {{- end }}"
          summary: "An OSD host is offline"
        expr: "ceph_health_detail{name=\"OSD_HOST_DOWN\"} == 1"
        for: "5m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.4.8"
          severity: "warning"
          type: "ceph_default"
      - alert: "CephOSDDown"
        annotations:
          description: |
            {{ $num := query "count(ceph_osd_up == 0)" | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5 minutes. The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down: {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down"
          summary: "An OSD has been marked down"
        expr: "ceph_health_detail{name=\"OSD_DOWN\"} == 1"
        for: "5m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.4.2"
          severity: "warning"
          type: "ceph_default"
      - alert: "CephOSDNearFull"
        annotations:
          description: "One or more OSDs have reached the NEARFULL threshold. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data."
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull"
          summary: "OSD(s) running low on free space (NEARFULL)"
        expr: "ceph_health_detail{name=\"OSD_NEARFULL\"} == 1"
        for: "5m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.4.3"
          severity: "warning"
          type: "ceph_default"
      - alert: "CephOSDFull"
        annotations:
          description: "An OSD has reached the FULL threshold. Writes to pools that share the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data."
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full"
          summary: "OSD full, writes blocked"
        expr: "ceph_health_detail{name=\"OSD_FULL\"} > 0"
        for: "1m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.4.6"
          severity: "critical"
          type: "ceph_default"
      - alert: "CephOSDBackfillFull"
        annotations:
          description: "An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations from completing. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data."
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull"
          summary: "OSD(s) too full for backfill operations"
        expr: "ceph_health_detail{name=\"OSD_BACKFILLFULL\"} > 0"
        for: "1m"
        labels:
          severity: "warning"
          type: "ceph_default"
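      # The NEARFULL / BACKFILLFULL / FULL thresholds referenced by the three
      # rules above are cluster-wide ratios. As an illustrative check (not part
      # of these rules), the configured values can usually be seen with:
      #   ceph osd dump | grep ratio
      # which on a default install typically prints full_ratio 0.95,
      # backfillfull_ratio 0.9 and nearfull_ratio 0.85.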
      - alert: "CephOSDTooManyRepairs"
        annotations:
          description: "Reads from an OSD have used a secondary PG to return data to the client, indicating a potential failing drive."
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs"
          summary: "OSD reports a high number of read errors"
        expr: "ceph_health_detail{name=\"OSD_TOO_MANY_REPAIRS\"} == 1"
        for: "30s"
        labels:
          severity: "warning"
          type: "ceph_default"
      - alert: "CephOSDTimeoutsPublicNetwork"
        annotations:
          description: "OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network for latency or loss issues. Use 'ceph health detail' to show the affected OSDs."
          summary: "Network issues delaying OSD heartbeats (public network)"
        expr: "ceph_health_detail{name=\"OSD_SLOW_PING_TIME_FRONT\"} == 1"
        for: "1m"
        labels:
          severity: "warning"
          type: "ceph_default"
      - alert: "CephOSDTimeoutsClusterNetwork"
        annotations:
          description: "OSD heartbeats on the cluster's 'cluster' network (backend) are slow. Investigate the network for latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs."
          summary: "Network issues delaying OSD heartbeats (cluster network)"
        expr: "ceph_health_detail{name=\"OSD_SLOW_PING_TIME_BACK\"} == 1"
        for: "1m"
        labels:
          severity: "warning"
          type: "ceph_default"
      - alert: "CephOSDInternalDiskSizeMismatch"
        annotations:
          description: "One or more OSDs have an internal inconsistency between metadata and the size of the device. This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs."
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch"
          summary: "OSD size inconsistency error"
        expr: "ceph_health_detail{name=\"BLUESTORE_DISK_SIZE_MISMATCH\"} == 1"
        for: "1m"
        labels:
          severity: "warning"
          type: "ceph_default"
      - alert: "CephDeviceFailurePredicted"
        annotations:
          description: "The device health module has determined that one or more devices will fail soon. To review device status use 'ceph device ls'. To show a specific device use 'ceph device info <dev id>'. Mark the OSD out so that data may migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace the device, and redeploy the OSD."
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#id2"
          summary: "Device(s) predicted to fail soon"
        expr: "ceph_health_detail{name=\"DEVICE_HEALTH\"} == 1"
        for: "1m"
        labels:
          severity: "warning"
          type: "ceph_default"
      - alert: "CephDeviceFailurePredictionTooHigh"
        annotations:
          description: "The device health module has determined that devices predicted to fail cannot be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availability. Prevent data integrity issues by adding new OSDs so that data may be relocated."
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany"
          summary: "Too many devices are predicted to fail, unable to resolve"
        expr: "ceph_health_detail{name=\"DEVICE_HEALTH_TOOMANY\"} == 1"
        for: "1m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.4.7"
          severity: "critical"
          type: "ceph_default"
      - alert: "CephDeviceFailureRelocationIncomplete"
        annotations:
          description: "The device health module has determined that one or more devices will fail soon, but the normal process of relocating the data on the device to other OSDs in the cluster is blocked. \nEnsure that the cluster has available free space. It may be necessary to add capacity to the cluster to allow data from the failing device to successfully migrate, or to enable the balancer."
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use"
          summary: "Device failure is predicted, but unable to relocate data"
        expr: "ceph_health_detail{name=\"DEVICE_HEALTH_IN_USE\"} == 1"
        for: "1m"
        labels:
          severity: "warning"
          type: "ceph_default"
      - alert: "CephOSDFlapping"
        annotations:
          description: "OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was marked down and back up {{ $value | humanize }} times per minute over the last 5 minutes. This may indicate a network issue (latency, packet loss, MTU mismatch) on the cluster network, or the public network if no cluster network is deployed. Check the network stats on the listed host(s)."
          documentation: "https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds"
          summary: "Network issues are causing OSDs to flap (mark each other down)"
        expr: "(rate(ceph_osd_up[5m]) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) * 60 > 1"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.4.4"
          severity: "warning"
          type: "ceph_default"
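      # How the expression above detects flapping (my reading of it):
      # rate(ceph_osd_up[5m]) measures how quickly the 0/1 up gauge is changing
      # per second over 5 minutes; multiplying by 60 approximates state changes
      # per minute, and the join onto ceph_osd_metadata only attaches the
      # hostname label used in the description. A value above 1 therefore
      # suggests an OSD is being marked down and up more than once a minute.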
      - alert: "CephOSDReadErrors"
        annotations:
          description: "An OSD has encountered read errors, but the OSD has recovered by retrying the reads. This may indicate an issue with hardware or the kernel."
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors"
          summary: "Device read errors detected"
        expr: "ceph_health_detail{name=\"BLUESTORE_SPURIOUS_READ_ERRORS\"} == 1"
        for: "30s"
        labels:
          severity: "warning"
          type: "ceph_default"
      - alert: "CephPGImbalance"
        annotations:
          description: "OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates by more than 30% from average PG count."
          summary: "PGs are not balanced across OSDs"
        expr: |
          abs(
            ((ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)) /
            on (job) group_left avg(ceph_osd_numpg > 0) by (job)
          ) * on (ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
        for: "5m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.4.5"
          severity: "warning"
          type: "ceph_default"
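      # Worked example for the 0.30 threshold above (illustrative numbers): if
      # the average is 100 PGs per OSD, an OSD carrying fewer than 70 or more
      # than 130 PGs gives abs((pgs - 100) / 100) > 0.30 and triggers the alert
      # after the 5 minute hold period.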
  - name: "mds"
    rules:
      - alert: "CephFilesystemDamaged"
        annotations:
          description: "Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support."
          documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages"
          summary: "CephFS filesystem is damaged"
        expr: "ceph_health_detail{name=\"MDS_DAMAGE\"} > 0"
        for: "1m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.5.1"
          severity: "critical"
          type: "ceph_default"
      - alert: "CephFilesystemOffline"
        annotations:
          description: "All MDS ranks are unavailable. The MDS daemons managing metadata are down, rendering the filesystem offline."
          documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down"
          summary: "CephFS filesystem is offline"
        expr: "ceph_health_detail{name=\"MDS_ALL_DOWN\"} > 0"
        for: "1m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.5.3"
          severity: "critical"
          type: "ceph_default"
      - alert: "CephFilesystemDegraded"
        annotations:
          description: "One or more metadata daemons (MDS ranks) are failed or in a damaged state. At best the filesystem is partially available, at worst the filesystem is completely unusable."
          documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded"
          summary: "CephFS filesystem is degraded"
        expr: "ceph_health_detail{name=\"FS_DEGRADED\"} > 0"
        for: "1m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.5.4"
          severity: "critical"
          type: "ceph_default"
      - alert: "CephFilesystemMDSRanksLow"
        annotations:
          description: "The filesystem's 'max_mds' setting defines the number of MDS ranks in the filesystem. The current number of active MDS daemons is less than this value."
          documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max"
          summary: "Ceph MDS daemon count is lower than configured"
        expr: "ceph_health_detail{name=\"MDS_UP_LESS_THAN_MAX\"} > 0"
        for: "1m"
        labels:
          severity: "warning"
          type: "ceph_default"
      - alert: "CephFilesystemInsufficientStandby"
        annotations:
          description: "The number of available standby MDS daemons is less than the filesystem's standby_count_wanted setting. Adjust standby_count_wanted or add more MDS daemons."
          documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby"
          summary: "Ceph filesystem standby daemons too few"
        expr: "ceph_health_detail{name=\"MDS_INSUFFICIENT_STANDBY\"} > 0"
        for: "1m"
        labels:
          severity: "warning"
          type: "ceph_default"
      - alert: "CephFilesystemFailureNoStandby"
        annotations:
          description: "An MDS daemon has failed, leaving only one active rank and no available standby. Investigate the cause of the failure or add a standby MDS."
          documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds"
          summary: "MDS daemon failed, no further standby available"
        expr: "ceph_health_detail{name=\"FS_WITH_FAILED_MDS\"} > 0"
        for: "1m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.5.5"
          severity: "critical"
          type: "ceph_default"
      - alert: "CephFilesystemReadOnly"
        annotations:
          description: "The filesystem has switched to READ ONLY due to an unexpected error when writing to the metadata pool. Either analyze the output from the MDS daemon admin socket, or escalate to support."
          documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages"
          summary: "CephFS filesystem in read only mode due to write error(s)"
        expr: "ceph_health_detail{name=\"MDS_HEALTH_READ_ONLY\"} > 0"
        for: "1m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.5.2"
          severity: "critical"
          type: "ceph_default"
  - name: "mgr"
    rules:
      - alert: "CephMgrModuleCrash"
        annotations:
          description: "One or more mgr modules have crashed and have yet to be acknowledged by an administrator. A crashed module may impact functionality within the cluster. Use the 'ceph crash' command to determine which module has failed, and archive it to acknowledge the failure."
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash"
          summary: "A manager module has recently crashed"
        expr: "ceph_health_detail{name=\"RECENT_MGR_MODULE_CRASH\"} == 1"
        for: "5m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.6.1"
          severity: "critical"
          type: "ceph_default"
      - alert: "CephMgrPrometheusModuleInactive"
        annotations:
          description: "The mgr/prometheus module at {{ $labels.instance }} is unreachable. This could mean that the module has been disabled or the mgr daemon itself is down. Without the mgr/prometheus module, metrics and alerts will no longer function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to determine whether the mgr is active. If the mgr is not active, restart it, otherwise you can determine module status with 'ceph mgr module ls'. If it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'."
          summary: "The mgr/prometheus module is not available"
        expr: "up{job=\"ceph\"} == 0"
        for: "1m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.6.2"
          severity: "critical"
          type: "ceph_default"
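      # Note on the expression above: up{job="ceph"} assumes the Prometheus
      # scrape job for the mgr exporters is literally named "ceph", the
      # convention used elsewhere in this mixin; adjust the job label if your
      # scrape config uses a different name. To confirm which mgr is serving
      # metrics you can usually run 'ceph mgr services', which lists the active
      # prometheus endpoint.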
  - name: "pgs"
    rules:
      - alert: "CephPGsInactive"
        annotations:
          description: "{{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}. Inactive placement groups are not able to serve read/write requests."
          summary: "One or more placement groups are inactive"
        expr: "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0"
        for: "5m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.7.1"
          severity: "critical"
          type: "ceph_default"
      - alert: "CephPGsUnclean"
        annotations:
          description: "{{ $value }} PGs have been unclean for more than 15 minutes in pool {{ $labels.name }}. Unclean PGs have not recovered from a previous failure."
          summary: "One or more placement groups are marked unclean"
        expr: "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0"
        for: "15m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.7.2"
          severity: "warning"
          type: "ceph_default"
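      # Shape of the two expressions above (my reading): ceph_pg_total minus
      # ceph_pg_active (or ceph_pg_clean) counts the problem PGs per pool, and
      # multiplying by ceph_pool_metadata on(pool_id,instance) is the usual
      # Prometheus trick to carry the pool's name label into the result so
      # that {{ $labels.name }} can be used in the description.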
      - alert: "CephPGsDamaged"
        annotations:
          description: "During data consistency checks (scrub), at least one PG has been flagged as being damaged or inconsistent. Check to see which PG is affected, and attempt a manual repair if necessary. To list problematic placement groups, use 'rados list-inconsistent-pg <pool>'. To repair PGs use the 'ceph pg repair <pg_num>' command."
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged"
          summary: "Placement group damaged, manual intervention needed"
        expr: "ceph_health_detail{name=~\"PG_DAMAGED|OSD_SCRUB_ERRORS\"} == 1"
        for: "5m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.7.4"
          severity: "critical"
          type: "ceph_default"
      - alert: "CephPGRecoveryAtRisk"
        annotations:
          description: "Data redundancy is at risk since one or more OSDs are at or above the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, or delete unwanted data."
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full"
          summary: "OSDs are too full for recovery"
        expr: "ceph_health_detail{name=\"PG_RECOVERY_FULL\"} == 1"
        for: "1m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.7.5"
          severity: "critical"
          type: "ceph_default"
      - alert: "CephPGUnavilableBlockingIO"
        annotations:
          description: "Data availability is reduced, impacting the cluster's ability to service I/O. One or more placement groups (PGs) are in a state that blocks I/O."
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability"
          summary: "PG is unavailable, blocking I/O"
        expr: "((ceph_health_detail{name=\"PG_AVAILABILITY\"} == 1) - scalar(ceph_health_detail{name=\"OSD_DOWN\"})) == 1"
        for: "1m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.7.3"
          severity: "critical"
          type: "ceph_default"
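      # Why the expression above subtracts scalar(OSD_DOWN) (my reading): when
      # the OSD_DOWN health check is also asserted its value is 1, so
      # PG_AVAILABILITY (1) minus 1 equals 0 and this alert stays silent,
      # leaving the more specific CephOSDDown alert to fire instead. Only a PG
      # availability problem that is not explained by a down OSD reaches the
      # == 1 comparison.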
      - alert: "CephPGBackfillAtRisk"
        annotations:
          description: "Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add more capacity, or delete unwanted data."
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full"
          summary: "Backfill operations are blocked due to lack of free space"
        expr: "ceph_health_detail{name=\"PG_BACKFILL_FULL\"} == 1"
        for: "1m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.7.6"
          severity: "critical"
          type: "ceph_default"
      - alert: "CephPGNotScrubbed"
        annotations:
          description: "One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, protecting against bit-rot. They check that metadata is consistent across data replicas. When PGs miss their scrub interval, it may indicate that the scrub window is too small, or PGs were not in a 'clean' state during the scrub window. You can manually initiate a scrub with: ceph pg scrub <pgid>"
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed"
          summary: "Placement group(s) have not been scrubbed"
        expr: "ceph_health_detail{name=\"PG_NOT_SCRUBBED\"} == 1"
        for: "5m"
        labels:
          severity: "warning"
          type: "ceph_default"
      - alert: "CephPGsHighPerOSD"
        annotations:
          description: "The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).\n Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools."
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs"
          summary: "Placement groups per OSD is too high"
        expr: "ceph_health_detail{name=\"TOO_MANY_PGS\"} == 1"
        for: "1m"
        labels:
          severity: "warning"
          type: "ceph_default"
      - alert: "CephPGNotDeepScrubbed"
        annotations:
          description: "One or more PGs have not been deep scrubbed recently. Deep scrubs protect against bit-rot. They compare data replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate that the window is too small or PGs were not in a 'clean' state during the deep-scrub window."
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed"
          summary: "Placement group(s) have not been deep scrubbed"
        expr: "ceph_health_detail{name=\"PG_NOT_DEEP_SCRUBBED\"} == 1"
        for: "5m"
        labels:
          severity: "warning"
          type: "ceph_default"
  - name: "nodes"
    rules:
      - alert: "CephNodeRootFilesystemFull"
        annotations:
          description: "Root volume is dangerously full: {{ $value | humanize }}% free."
          summary: "Root filesystem is dangerously full"
        expr: "node_filesystem_avail_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"} * 100 < 5"
        for: "5m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.8.1"
          severity: "critical"
          type: "ceph_default"
      - alert: "CephNodeNetworkPacketDrops"
        annotations:
          description: "Node {{ $labels.instance }} experiences packet drop > 0.5% and > 10 packets/s on interface {{ $labels.device }}."
          summary: "One or more NICs reports packet drops"
        expr: |
          (
            rate(node_network_receive_drop_total{device!="lo"}[1m]) +
            rate(node_network_transmit_drop_total{device!="lo"}[1m])
          ) / (
            rate(node_network_receive_packets_total{device!="lo"}[1m]) +
            rate(node_network_transmit_packets_total{device!="lo"}[1m])
          ) >= 0.005 and (
            rate(node_network_receive_drop_total{device!="lo"}[1m]) +
            rate(node_network_transmit_drop_total{device!="lo"}[1m])
          ) >= 10
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.8.2"
          severity: "warning"
          type: "ceph_default"
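      # Worked example for the thresholds above (illustrative numbers): a NIC
      # handling 100,000 packets/s with 600 drops/s has a drop ratio of 0.006
      # (0.6%), so both conditions (ratio >= 0.005 and absolute rate >= 10
      # drops/s) are met and the alert fires. The absolute-rate clause keeps
      # nearly idle interfaces from alerting on a handful of dropped packets.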
      - alert: "CephNodeNetworkPacketErrors"
        annotations:
          description: "Node {{ $labels.instance }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ $labels.device }}."
          summary: "One or more NICs reports packet errors"
        expr: |
          (
            rate(node_network_receive_errs_total{device!="lo"}[1m]) +
            rate(node_network_transmit_errs_total{device!="lo"}[1m])
          ) / (
            rate(node_network_receive_packets_total{device!="lo"}[1m]) +
            rate(node_network_transmit_packets_total{device!="lo"}[1m])
          ) >= 0.0001 or (
            rate(node_network_receive_errs_total{device!="lo"}[1m]) +
            rate(node_network_transmit_errs_total{device!="lo"}[1m])
          ) >= 10
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.8.3"
          severity: "warning"
          type: "ceph_default"
      - alert: "CephNodeDiskspaceWarning"
        annotations:
          description: "Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }} will be full in less than 5 days based on the 48 hour trailing fill rate."
          summary: "Host filesystem free space is getting low"
        expr: "predict_linear(node_filesystem_free_bytes{device=~\"/.*\"}[2d], 3600 * 24 * 5) * on(instance) group_left(nodename) node_uname_info < 0"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.8.4"
          severity: "warning"
          type: "ceph_default"
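      # How the prediction above works (my reading): predict_linear fits a
      # linear trend to the last 2 days of node_filesystem_free_bytes and
      # extrapolates it 5 days (3600 * 24 * 5 seconds) into the future; a
      # negative projected value means the filesystem would run out of space
      # within that window. The join onto node_uname_info only adds the
      # nodename label used in the description.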
      - alert: "CephNodeInconsistentMTU"
        annotations:
          description: "Node {{ $labels.instance }} has a different MTU size ({{ $value }}) than the median of devices named {{ $labels.device }}."
          summary: "MTU settings across Ceph hosts are inconsistent"
        expr: "node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( max by (device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) ) or node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( min by (device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )"
        labels:
          severity: "warning"
          type: "ceph_default"
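      # Rough reading of the expression above: for every up, non-loopback
      # interface it compares each host's MTU against the per-device median
      # (the 0.5 quantile across hosts) and reports an interface whose MTU
      # matches the device-wide maximum or minimum when that extreme differs
      # from the median. In practice this flags the outlier hosts, e.g. one
      # host with MTU 1500 while the rest of the cluster uses 9000 on the same
      # device name.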
  - name: "pools"
    rules:
      - alert: "CephPoolGrowthWarning"
        annotations:
          description: "Pool '{{ $labels.name }}' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours."
          summary: "Pool growth rate may soon exceed capacity"
        expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance) group_right() ceph_pool_metadata) >= 95"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.9.2"
          severity: "warning"
          type: "ceph_default"
      - alert: "CephPoolBackfillFull"
        annotations:
          description: "A pool is approaching the near full threshold, which will prevent recovery/backfill operations from completing. Consider adding more capacity."
          summary: "Free space in a pool is too low for recovery/backfill"
        expr: "ceph_health_detail{name=\"POOL_BACKFILLFULL\"} > 0"
        labels:
          severity: "warning"
          type: "ceph_default"
      - alert: "CephPoolFull"
        annotations:
          description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) {{- range query \"topk(5, sort_desc(ceph_pool_percent_used * on(pool_id) group_right ceph_pool_metadata))\" }} - {{ .Labels.name }} at {{ .Value }}% {{- end }} Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set-quota <pool_name> max_bytes <bytes>)"
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full"
          summary: "Pool is full - writes are blocked"
        expr: "ceph_health_detail{name=\"POOL_FULL\"} > 0"
        for: "1m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.9.1"
          severity: "critical"
          type: "ceph_default"
      - alert: "CephPoolNearFull"
        annotations:
          description: "A pool has exceeded the warning (percent full) threshold, or OSDs supporting the pool have reached the NEARFULL threshold. Writes may continue, but you are at risk of the pool going read-only if more capacity isn't made available. Determine the affected pool with 'ceph df detail', looking at QUOTA BYTES and STORED. Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set-quota <pool_name> max_bytes <bytes>). Also ensure that the balancer is active."
          summary: "One or more Ceph pools are nearly full"
        expr: "ceph_health_detail{name=\"POOL_NEAR_FULL\"} > 0"
        for: "5m"
        labels:
          severity: "warning"
          type: "ceph_default"
  - name: "healthchecks"
    rules:
      - alert: "CephSlowOps"
        annotations:
          description: "{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)"
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops"
          summary: "OSD operations are slow to complete"
        expr: "ceph_healthcheck_slow_ops > 0"
        for: "30s"
        labels:
          severity: "warning"
          type: "ceph_default"
      - alert: "CephDaemonSlowOps"
        annotations:
          description: "{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)"
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops"
          summary: "{{ $labels.ceph_daemon }} operations are slow to complete"
        expr: "ceph_daemon_health_metrics{type=\"SLOW_OPS\"} > 0"
        for: "30s"
        labels:
          severity: "warning"
          type: "ceph_default"
  - name: "cephadm"
    rules:
      - alert: "CephadmUpgradeFailed"
        annotations:
          description: "The cephadm cluster upgrade process has failed. The cluster remains in an undetermined state. Please review the cephadm logs to understand the nature of the issue."
          summary: "Ceph version upgrade has failed"
        expr: "ceph_health_detail{name=\"UPGRADE_EXCEPTION\"} > 0"
        for: "30s"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.11.2"
          severity: "critical"
          type: "ceph_default"
      - alert: "CephadmDaemonFailed"
        annotations:
          description: "A daemon managed by cephadm is no longer active. Determine which daemon is down with 'ceph health detail'. You may start daemons with 'ceph orch daemon start <daemon_id>'."
          summary: "A ceph daemon managed by cephadm is down"
        expr: "ceph_health_detail{name=\"CEPHADM_FAILED_DAEMON\"} > 0"
        for: "30s"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.11.1"
          severity: "critical"
          type: "ceph_default"
      - alert: "CephadmPaused"
        annotations:
          description: "Cluster management has been paused manually. This will prevent the orchestrator from performing service management and reconciliation. If this is not intentional, resume cephadm operations with 'ceph orch resume'."
          documentation: "https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused"
          summary: "Orchestration tasks via cephadm are PAUSED"
        expr: "ceph_health_detail{name=\"CEPHADM_PAUSED\"} > 0"
        for: "1m"
        labels:
          severity: "warning"
          type: "ceph_default"
  - name: "PrometheusServer"
    rules:
      - alert: "PrometheusJobMissing"
        annotations:
          description: "The Prometheus job that scrapes Ceph is no longer defined. This effectively means you will have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the Prometheus instance."
          summary: "The scrape job for Ceph is missing from Prometheus"
        expr: "absent(up{job=\"ceph\"})"
        for: "30s"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.12.1"
          severity: "critical"
          type: "ceph_default"
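      # For reference, a minimal sketch of the scrape job this rule expects,
      # assuming the default mgr prometheus module port (9283) and illustrative
      # host names; adapt it to your own prometheus.yml:
      #
      #   scrape_configs:
      #     - job_name: "ceph"
      #       static_configs:
      #         - targets: ["ceph-mgr-1.example.com:9283", "ceph-mgr-2.example.com:9283"]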
  - name: "rados"
    rules:
      - alert: "CephObjectMissing"
        annotations:
          description: "The latest version of a RADOS object can not be found, even though all OSDs are up. I/O requests for this object from clients will block (hang). Resolving this issue may require the object to be rolled back to a prior version manually, and manually verified."
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound"
          summary: "Object(s) marked UNFOUND"
        expr: "(ceph_health_detail{name=\"OBJECT_UNFOUND\"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1"
        for: "30s"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.10.1"
          severity: "critical"
          type: "ceph_default"
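      # The gating factor in the expression above (my reading):
      # count(ceph_osd_up == 1) == bool count(ceph_osd_metadata) evaluates to 1
      # only when every OSD known to the cluster is up, so the alert is limited
      # to the case where an unfound object cannot simply be explained by a
      # down OSD that will bring the data back when it returns.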
  - name: "generic"
    rules:
      - alert: "CephDaemonCrash"
        annotations:
          description: "One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive <id>' command."
          documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash"
          summary: "One or more Ceph daemons have crashed, and are pending acknowledgement"
        expr: "ceph_health_detail{name=\"RECENT_CRASH\"} == 1"
        for: "1m"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.1.2"
          severity: "critical"
          type: "ceph_default"