ceph/monitoring/ceph-mixin/prometheus_alerts.libsonnet

   1 {
   2   _config:: error 'must provide _config',  // hidden field; evaluating it raises this error, so a consumer has to override _config
   3
   4   MultiClusterQuery()::  // 'cluster,' prefix for PromQL label lists, or '' when $._config.showMultiCluster is false
   5     if !$._config.showMultiCluster
   6     then ''
   7     else 'cluster,',
   8
   9   MultiClusterSummary()::  // ' on cluster <label>' template suffix for alert text, or '' when $._config.showMultiCluster is false
  10     if !$._config.showMultiCluster
  11     then ''
  12     else ' on cluster {{ $labels.cluster }}',
  13
  14   groups+: [
  15     {  // alerts driven directly by the ceph_health_status gauge
  16       name: 'cluster health',
  17       rules: [
  18         {  // ceph_health_status has equalled 2 for 5 minutes
  19           alert: 'CephHealthError',
  20           expr: 'ceph_health_status == 2',
  21           'for': '5m',
  22           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.2.1' },
  23           annotations: {
  24             summary: 'Ceph is in the ERROR state%(cluster)s' % $.MultiClusterSummary(),
  25             description: "The cluster state has been HEALTH_ERROR for more than 5 minutes%(cluster)s. Please check 'ceph health detail' for more information." % $.MultiClusterSummary(),
  26           },
  27         },
  28         {  // ceph_health_status has equalled 1 for 15 minutes
  29           alert: 'CephHealthWarning',
  30           expr: 'ceph_health_status == 1',
  31           'for': '15m',
  32           labels: { severity: 'warning', type: 'ceph_default' },
  33           annotations: {
  34             summary: 'Ceph is in the WARNING state%(cluster)s' % $.MultiClusterSummary(),
  35             description: "The cluster state has been HEALTH_WARN for more than 15 minutes%(cluster)s. Please check 'ceph health detail' for more information." % $.MultiClusterSummary(),
  36           },
  37         },
  38       ],
  39     },
  40     {  // monitor alerts: quorum membership plus the MON_* health checks
  41       name: 'mon',
  42       rules: [
  43         {  // MON_DOWN is set and the number of monitors in quorum equals the bare majority, floor(n / 2) + 1
  44           alert: 'CephMonDownQuorumAtRisk',
  45           'for': '30s',
  46           expr: |||
  47             (
  48               (ceph_health_detail{name="MON_DOWN"} == 1) * on() (
  49                 count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1)
  50               )
  51             ) == 1
  52           |||,
  53           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.3.1' },
  54           annotations: {
  55             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down',
  56             summary: 'Monitor quorum is at risk%(cluster)s' % $.MultiClusterSummary(),
  57             description: '{{ $min := query "floor(count(ceph_mon_metadata) / 2) + 1" | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}',
  58           },
  59         },
  60         {  // monitors are out of quorum, but no more than can be lost while a majority remains
  61           alert: 'CephMonDown',
  62           'for': '30s',  // bound below is total - majority; the majority term floor(n / 2) + 1 must stay parenthesised
  63           expr: |||
  64             count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - (floor(count(ceph_mon_metadata) / 2) + 1))
  65           |||,
  66           labels: { severity: 'warning', type: 'ceph_default' },
  67           annotations: {
  68             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down',
  69             summary: 'One or more monitors down%(cluster)s' % $.MultiClusterSummary(),
  70             description: |||
  71               {{ $down := query "count(ceph_mon_quorum_status == 0)" | first | value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable.  The following monitors are down: {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }}   - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}
  72             |||,
  73           },
  74         },
  75         {
  76           alert: 'CephMonDiskspaceCritical',
  77           'for': '1m',
  78           expr: 'ceph_health_detail{name="MON_DISK_CRIT"} == 1',
  79           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.3.2' },
  80           annotations: {
  81             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit',
  82             summary: 'Filesystem space on at least one monitor is critically low%(cluster)s' % $.MultiClusterSummary(),
  83             description: "The free space available to a monitor's store is critically low. You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; {{- range query \"ceph_mon_metadata\"}} - {{ .Labels.hostname }} {{- end }}",
  84           },
  85         },
  86         {
  87           alert: 'CephMonDiskspaceLow',
  88           'for': '5m',
  89           expr: 'ceph_health_detail{name="MON_DISK_LOW"} == 1',
  90           labels: { severity: 'warning', type: 'ceph_default' },
  91           annotations: {
  92             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low',
  93             summary: 'Drive space on at least one monitor is approaching full%(cluster)s' % $.MultiClusterSummary(),
  94             description: "The space available to a monitor's store is approaching full (>70% is the default). You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*.  Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; {{- range query \"ceph_mon_metadata\"}} - {{ .Labels.hostname }} {{- end }}",
  95           },
  96         },
  97         {
  98           alert: 'CephMonClockSkew',
  99           'for': '1m',
 100           expr: 'ceph_health_detail{name="MON_CLOCK_SKEW"} == 1',
 101           labels: { severity: 'warning', type: 'ceph_default' },
 102           annotations: {
 103             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew',
 104             summary: 'Clock skew detected among monitors%(cluster)s' % $.MultiClusterSummary(),
 105             description: "Ceph monitors rely on closely synchronized time to maintain quorum and cluster consistency. This event indicates that the time on at least one mon has drifted too far from the lead mon. Review cluster status with ceph -s. This will show which monitors are affected. Check the time sync status on each monitor host with 'ceph time-sync-status' and the state and peers of your ntpd or chrony daemon.",
 106           },
 107         },
 108       ],
 109     },
 110     {  // OSD and device alerts; most rules mirror a ceph_health_detail series selected by name
 111       name: 'osd',
 112       rules: [
 113         {  // at least 10% of the ceph_osd_up series are 0; no 'for' hold time is set on this rule
 114           alert: 'CephOSDDownHigh',
 115           expr: 'count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10',
 116           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.4.1' },
 117           annotations: {
 118             summary: 'More than 10%% of OSDs are down%(cluster)s' % $.MultiClusterSummary(),
 119             description: '{{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%). The following OSDs are down: {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}',
 120           },
 121         },
 122         {
 123           alert: 'CephOSDHostDown',
 124           'for': '5m',
 125           expr: 'ceph_health_detail{name="OSD_HOST_DOWN"} == 1',
 126           labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.4.8' },
 127           annotations: {
 128             summary: 'An OSD host is offline%(cluster)s' % $.MultiClusterSummary(),
 129             description: 'The following OSDs are down: {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }} - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} {{- end }}',
 130           },
 131         },
 132         {
 133           alert: 'CephOSDDown',
 134           'for': '5m',
 135           expr: 'ceph_health_detail{name="OSD_DOWN"} == 1',
 136           labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.4.2' },
 137           annotations: {
 138             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down',
 139             summary: 'An OSD has been marked down%(cluster)s' % $.MultiClusterSummary(),
 140             description: |||
 141               {{ $num := query "count(ceph_osd_up == 0)" | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5mins. The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down: {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}
 142             |||,
 143           },
 144         },
 145         {
 146           alert: 'CephOSDNearFull',
 147           'for': '5m',
 148           expr: 'ceph_health_detail{name="OSD_NEARFULL"} == 1',
 149           labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.4.3' },
 150           annotations: {
 151             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull',
 152             summary: 'OSD(s) running low on free space (NEARFULL)%(cluster)s' % $.MultiClusterSummary(),
 153             description: "One or more OSDs have reached the NEARFULL threshold. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.",
 154           },
 155         },
 156         {
 157           alert: 'CephOSDFull',
 158           'for': '1m',
 159           expr: 'ceph_health_detail{name="OSD_FULL"} > 0',
 160           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.4.6' },
 161           annotations: {
 162             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full',
 163             summary: 'OSD full, writes blocked%(cluster)s' % $.MultiClusterSummary(),
 164             description: "An OSD has reached the FULL threshold. Writes to pools that share the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.",
 165           },
 166         },
 167         {
 168           alert: 'CephOSDBackfillFull',
 169           'for': '1m',
 170           expr: 'ceph_health_detail{name="OSD_BACKFILLFULL"} > 0',
 171           labels: { severity: 'warning', type: 'ceph_default' },
 172           annotations: {
 173             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull',
 174             summary: 'OSD(s) too full for backfill operations%(cluster)s' % $.MultiClusterSummary(),
 175             description: "An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations from completing. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.",
 176           },
 177         },
 178         {
 179           alert: 'CephOSDTooManyRepairs',
 180           'for': '30s',
 181           expr: 'ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 1',
 182           labels: { severity: 'warning', type: 'ceph_default' },
 183           annotations: {
 184             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs',
 185             summary: 'OSD reports a high number of read errors%(cluster)s' % $.MultiClusterSummary(),
 186             description: 'Reads from an OSD have used a secondary PG to return data to the client, indicating a potential failing drive.',
 187           },
 188         },
 189         {
 190           alert: 'CephOSDTimeoutsPublicNetwork',
 191           'for': '1m',
 192           expr: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 1',
 193           labels: { severity: 'warning', type: 'ceph_default' },
 194           annotations: {
 195             summary: 'Network issues delaying OSD heartbeats (public network)%(cluster)s' % $.MultiClusterSummary(),
 196             description: "OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network for latency or loss issues. Use 'ceph health detail' to show the affected OSDs.",
 197           },
 198         },
 199         {
 200           alert: 'CephOSDTimeoutsClusterNetwork',
 201           'for': '1m',
 202           expr: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 1',
 203           labels: { severity: 'warning', type: 'ceph_default' },
 204           annotations: {
 205             summary: 'Network issues delaying OSD heartbeats (cluster network)%(cluster)s' % $.MultiClusterSummary(),
 206             description: "OSD heartbeats on the cluster's 'cluster' network (backend) are slow. Investigate the network for latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs.",
 207           },
 208         },
 209         {
 210           alert: 'CephOSDInternalDiskSizeMismatch',
 211           'for': '1m',
 212           expr: 'ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 1',
 213           labels: { severity: 'warning', type: 'ceph_default' },
 214           annotations: {
 215             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch',
 216             summary: 'OSD size inconsistency error%(cluster)s' % $.MultiClusterSummary(),
 217             description: 'One or more OSDs have an internal inconsistency between metadata and the size of the device. This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs.',
 218           },
 219         },
 220         {
 221           alert: 'CephDeviceFailurePredicted',
 222           'for': '1m',
 223           expr: 'ceph_health_detail{name="DEVICE_HEALTH"} == 1',
 224           labels: { severity: 'warning', type: 'ceph_default' },
 225           annotations: {
 226             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#id2',
 227             summary: 'Device(s) predicted to fail soon%(cluster)s' % $.MultiClusterSummary(),
 228             description: "The device health module has determined that one or more devices will fail soon. To review device status use 'ceph device ls'. To show a specific device use 'ceph device info <dev id>'. Mark the OSD out so that data may migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace the device, and redeploy the OSD.",
 229           },
 230         },
 231         {
 232           alert: 'CephDeviceFailurePredictionTooHigh',
 233           'for': '1m',
 234           expr: 'ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1',
 235           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.4.7' },
 236           annotations: {
 237             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany',
 238             summary: 'Too many devices are predicted to fail, unable to resolve%(cluster)s' % $.MultiClusterSummary(),
 239             description: 'The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availability. Prevent data integrity issues by adding new OSDs so that data may be relocated.',
 240           },
 241         },
 242         {
 243           alert: 'CephDeviceFailureRelocationIncomplete',
 244           'for': '1m',
 245           expr: 'ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1',
 246           labels: { severity: 'warning', type: 'ceph_default' },
 247           annotations: {
 248             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use',
 249             summary: 'Device failure is predicted, but unable to relocate data%(cluster)s' % $.MultiClusterSummary(),
 250             description: 'The device health module has determined that one or more devices will fail soon, but the normal process of relocating the data on the device to other OSDs in the cluster is blocked. \nEnsure that the cluster has available free space. It may be necessary to add capacity to the cluster to allow data from the failing device to successfully migrate, or to enable the balancer.',
 251           },
 252         },
 253         {  // rate(ceph_osd_up[5m]) scaled by 60 exceeds 1; the join with ceph_osd_metadata adds the hostname label
 254           alert: 'CephOSDFlapping',
 255           expr: '(rate(ceph_osd_up[5m]) * on(%(cluster)sceph_daemon) group_left(hostname) ceph_osd_metadata) * 60 > 1' % $.MultiClusterQuery(),
 256           labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.4.4' },
 257           annotations: {
 258             documentation: 'https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds',
 259             summary: 'Network issues are causing OSDs to flap (mark each other down)%(cluster)s' % $.MultiClusterSummary(),
 260             description: 'OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was marked down and back up {{ $value | humanize }} times once a minute for 5 minutes. This may indicate a network issue (latency, packet loss, MTU mismatch) on the cluster network, or the public network if no cluster network is deployed. Check the network stats on the listed host(s).',
 261           },
 262         },
 263         {
 264           alert: 'CephOSDReadErrors',
 265           'for': '30s',
 266           expr: 'ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1',
 267           labels: { severity: 'warning', type: 'ceph_default' },
 268           annotations: {
 269             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors',
 270             summary: 'Device read errors detected%(cluster)s' % $.MultiClusterSummary(),
 271             description: 'An OSD has encountered read errors, but the OSD has recovered by retrying the reads. This may indicate an issue with hardware or the kernel.',
 272           },
 273         },
 274         {  // relative deviation of an OSD's PG count from the per-(cluster,)job average exceeds 0.30
 275           alert: 'CephPGImbalance',
 276           'for': '5m',
 277           expr: |||
 278             abs(
 279               ((ceph_osd_numpg > 0) - on (%(cluster)sjob) group_left avg(ceph_osd_numpg > 0) by (%(cluster)sjob)) /
 280               on (%(cluster)sjob) group_left avg(ceph_osd_numpg > 0) by (%(cluster)sjob)
 281             ) * on (%(cluster)sceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
 282           ||| % { cluster: $.MultiClusterQuery() },  // named substitution: every %(cluster)s gets the same prefix, so numerator and denominator group alike
 283           labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.4.5' },
 284           annotations: {
 285             summary: 'PGs are not balanced across OSDs%(cluster)s' % $.MultiClusterSummary(),
 286             description: 'OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates by more than 30% from average PG count.',
 287           },
 288         },
 289       ],
 290     },
 291     {  // CephFS / MDS alerts; every rule fires when the named ceph_health_detail series is above 0 for 1 minute
 292       name: 'mds',
 293       rules: [
 294         {
 295           alert: 'CephFilesystemDamaged',
 296           'for': '1m',
 297           expr: 'ceph_health_detail{name="MDS_DAMAGE"} > 0',
 298           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.5.1' },
 299           annotations: {
 300             documentation: 'https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages',
 301             summary: 'CephFS filesystem is damaged%(cluster)s.' % $.MultiClusterSummary(),
 302             description: 'Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support.',
 303           },
 304         },
 305         {
 306           alert: 'CephFilesystemOffline',
 307           'for': '1m',
 308           expr: 'ceph_health_detail{name="MDS_ALL_DOWN"} > 0',
 309           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.5.3' },
 310           annotations: {
 311             documentation: 'https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down',
 312             summary: 'CephFS filesystem is offline%(cluster)s' % $.MultiClusterSummary(),
 313             description: 'All MDS ranks are unavailable. The MDS daemons managing metadata are down, rendering the filesystem offline.',
 314           },
 315         },
 316         {
 317           alert: 'CephFilesystemDegraded',
 318           'for': '1m',
 319           expr: 'ceph_health_detail{name="FS_DEGRADED"} > 0',
 320           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.5.4' },
 321           annotations: {
 322             documentation: 'https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded',
 323             summary: 'CephFS filesystem is degraded%(cluster)s' % $.MultiClusterSummary(),
 324             description: 'One or more metadata daemons (MDS ranks) are failed or in a damaged state. At best the filesystem is partially available, at worst the filesystem is completely unusable.',
 325           },
 326         },
 327         {
 328           alert: 'CephFilesystemMDSRanksLow',
 329           'for': '1m',
 330           expr: 'ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0',
 331           labels: { severity: 'warning', type: 'ceph_default' },
 332           annotations: {
 333             documentation: 'https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max',
 334             summary: 'Ceph MDS daemon count is lower than configured%(cluster)s' % $.MultiClusterSummary(),
 335             description: "The filesystem's 'max_mds' setting defines the number of MDS ranks in the filesystem. The current number of active MDS daemons is less than this value.",
 336           },
 337         },
 338         {  // summary says standbys are "too few"; the description is worded to match that direction
 339           alert: 'CephFilesystemInsufficientStandby',
 340           'for': '1m',
 341           expr: 'ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0',
 342           labels: { severity: 'warning', type: 'ceph_default' },
 343           annotations: {
 344             documentation: 'https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby',
 345             summary: 'Ceph filesystem standby daemons too few%(cluster)s' % $.MultiClusterSummary(),
 346             description: 'The current number of standby daemons is less than the minimum required by standby_count_wanted. Adjust the standby count or increase the number of MDS daemons.',
 347           },
 348         },
 349         {
 350           alert: 'CephFilesystemFailureNoStandby',
 351           'for': '1m',
 352           expr: 'ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0',
 353           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.5.5' },
 354           annotations: {
 355             documentation: 'https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds',
 356             summary: 'MDS daemon failed, no further standby available%(cluster)s' % $.MultiClusterSummary(),
 357             description: 'An MDS daemon has failed, leaving only one active rank and no available standby. Investigate the cause of the failure or add a standby MDS.',
 358           },
 359         },
 360         {
 361           alert: 'CephFilesystemReadOnly',
 362           'for': '1m',
 363           expr: 'ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0',
 364           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.5.2' },
 365           annotations: {
 366             documentation: 'https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages',
 367             summary: 'CephFS filesystem in read only mode due to write error(s)%(cluster)s' % $.MultiClusterSummary(),
 368             description: 'The filesystem has switched to READ ONLY due to an unexpected error when writing to the metadata pool. Either analyze the output from the MDS daemon admin socket, or escalate to support.',
 369           },
 370         },
 371       ],
 372     },
 373     {  // manager alerts: a module-crash health check and scrape reachability of the ceph job
 374       name: 'mgr',
 375       rules: [
 376         {
 377           alert: 'CephMgrModuleCrash',
 378           'for': '5m',
 379           expr: 'ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1',
 380           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.6.1' },
 381           annotations: {
 382             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash',
 383             summary: 'A manager module has recently crashed%(cluster)s' % $.MultiClusterSummary(),
 384             description: "One or more mgr modules have crashed and have yet to be acknowledged by an administrator. A crashed module may impact functionality within the cluster. Use the 'ceph crash' command to determine which module has failed, and archive it to acknowledge the failure.",
 385           },
 386         },
 387         {  // the scrape target with the hard-coded label job="ceph" has reported up == 0 for 1 minute
 388           alert: 'CephMgrPrometheusModuleInactive',
 389           'for': '1m',
 390           expr: 'up{job="ceph"} == 0',
 391           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.6.2' },
 392           annotations: {
 393             summary: 'The mgr/prometheus module is not available%(cluster)s' % $.MultiClusterSummary(),
 394             description: "The mgr/prometheus module at {{ $labels.instance }} is unreachable. This could mean that the module has been disabled or the mgr daemon itself is down. Without the mgr/prometheus module metrics and alerts will no longer function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to determine whether the mgr is active. If the mgr is not active, restart it, otherwise you can determine module status with 'ceph mgr module ls'. If it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'.",
 395           },
 396         },
 397       ],
 398     },
 399     {
 400       name: 'pgs',
 401       rules: [
 402         {  // per pool: ceph_pg_total minus ceph_pg_active is above 0, joined onto ceph_pool_metadata
 403           alert: 'CephPGsInactive',
 404           expr: 'ceph_pool_metadata * on(%(cluster)spool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0' % $.MultiClusterQuery(),
 405           'for': '5m',
 406           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.7.1' },
 407           annotations: {
 408             summary: 'One or more placement groups are inactive%(cluster)s' % $.MultiClusterSummary(),
 409             description: '{{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}. Inactive placement groups are not able to serve read/write requests.',
 410           },
 411         },
 412         {  // per pool: ceph_pg_total minus ceph_pg_clean is above 0, joined onto ceph_pool_metadata
 413           alert: 'CephPGsUnclean',
 414           expr: 'ceph_pool_metadata * on(%(cluster)spool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0' % $.MultiClusterQuery(),
 415           'for': '15m',
 416           labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.7.2' },
 417           annotations: {
 418             summary: 'One or more placement groups are marked unclean%(cluster)s' % $.MultiClusterSummary(),
 419             description: '{{ $value }} PGs have been unclean for more than 15 minutes in pool {{ $labels.name }}. Unclean PGs have not recovered from a previous failure.',
 420           },
 421         },
 422         {  // either of two health checks (PG_DAMAGED or OSD_SCRUB_ERRORS) equals 1
 423           alert: 'CephPGsDamaged',
 424           expr: 'ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1',
 425           'for': '5m',
 426           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.7.4' },
 427           annotations: {
 428             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged',
 429             summary: 'Placement group damaged, manual intervention needed%(cluster)s' % $.MultiClusterSummary(),
 430             description: "During data consistency checks (scrub), at least one PG has been flagged as being damaged or inconsistent. Check to see which PG is affected, and attempt a manual repair if necessary. To list problematic placement groups, use 'rados list-inconsistent-pg <pool>'. To repair PGs use the 'ceph pg repair <pg_num>' command.",
 431           },
 432         },
 433         {  // the PG_RECOVERY_FULL health-check series equals 1
 434           alert: 'CephPGRecoveryAtRisk',
 435           expr: 'ceph_health_detail{name="PG_RECOVERY_FULL"} == 1',
 436           'for': '1m',
 437           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.7.5' },
 438           annotations: {
 439             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full',
 440             summary: 'OSDs are too full for recovery%(cluster)s' % $.MultiClusterSummary(),
 441             description: "Data redundancy is at risk since one or more OSDs are at or above the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, or delete unwanted data.",
 442           },
 443         },
 444         {  // PG_AVAILABILITY is 1 while OSD_DOWN is not; the misspelt alert name is kept because it is an external identifier
 445           alert: 'CephPGUnavilableBlockingIO',
 446           'for': '1m',  // scalar() yields NaN unless its input has exactly one sample, so the expr collapses OSD_DOWN with max() and defaults it to 0
 447           expr: '((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(max(ceph_health_detail{name="OSD_DOWN"}) or vector(0))) == 1',
 448           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.7.3' },
 449           annotations: {
 450             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability',
 451             summary: 'PG is unavailable%(cluster)s, blocking I/O' % $.MultiClusterSummary(),
 452             description: "Data availability is reduced, impacting the cluster's ability to service I/O. One or more placement groups (PGs) are in a state that blocks I/O.",
 453           },
 454         },
 455         {  // the PG_BACKFILL_FULL health-check series equals 1
 456           alert: 'CephPGBackfillAtRisk',
 457           expr: 'ceph_health_detail{name="PG_BACKFILL_FULL"} == 1',
 458           'for': '1m',
 459           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.7.6' },
 460           annotations: {
 461             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full',
 462             summary: 'Backfill operations are blocked due to lack of free space%(cluster)s' % $.MultiClusterSummary(),
 463             description: "Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add more capacity, or delete unwanted data.",
 464           },
 465         },
 466         {
               // Warning alert raised when the PG_NOT_SCRUBBED health-detail series has
               // been equal to 1 for 5m. No SNMP oid label is attached to this rule.
 467           alert: 'CephPGNotScrubbed',
 468           'for': '5m',
 469           expr: 'ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1',
 470           labels: { severity: 'warning', type: 'ceph_default' },
 471           annotations: {
 472             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed',
                 // %(cluster)s receives the optional cluster suffix built by
                 // $.MultiClusterSummary() (empty unless showMultiCluster is set).
 473             summary: 'Placement group(s) have not been scrubbed%(cluster)s' % $.MultiClusterSummary(),
 474             description: "One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, protecting against bit-rot. They check that metadata is consistent across data replicas. When PGs miss their scrub interval, it may indicate that the scrub window is too small, or PGs were not in a 'clean' state during the scrub window. You can manually initiate a scrub with: ceph pg scrub <pgid>",
 475           },
 476         },
 477         {
               // Warning alert raised when the TOO_MANY_PGS health-detail series has
               // been equal to 1 for 1m. No SNMP oid label is attached to this rule.
 478           alert: 'CephPGsHighPerOSD',
 479           'for': '1m',
 480           expr: 'ceph_health_detail{name="TOO_MANY_PGS"} == 1',
 481           labels: { severity: 'warning', type: 'ceph_default' },
 482           annotations: {
                 // NOTE(review): this URL has a '/' before the '#' fragment while the
                 // neighbouring PG rules do not - confirm both forms resolve.
 483             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs',
                 // %(cluster)s receives the optional cluster suffix built by
                 // $.MultiClusterSummary() (empty unless showMultiCluster is set).
 484             summary: 'Placement groups per OSD is too high%(cluster)s' % $.MultiClusterSummary(),
                 // The "\n" escape inside this double-quoted string puts a line break in
                 // the emitted description after the first sentence.
 485             description: "The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).\n Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools.",
 486           },
 487         },
 488         {
               // Warning alert raised when the PG_NOT_DEEP_SCRUBBED health-detail series
               // has been equal to 1 for 5m. No SNMP oid label is attached to this rule.
 489           alert: 'CephPGNotDeepScrubbed',
 490           'for': '5m',
 491           expr: 'ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1',
 492           labels: { severity: 'warning', type: 'ceph_default' },
 493           annotations: {
 494             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed',
                 // %(cluster)s receives the optional cluster suffix built by
                 // $.MultiClusterSummary() (empty unless showMultiCluster is set).
 495             summary: 'Placement group(s) have not been deep scrubbed%(cluster)s' % $.MultiClusterSummary(),
 496             description: "One or more PGs have not been deep scrubbed recently. Deep scrubs protect against bit-rot. They compare data replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate that the window is too small or PGs were not in a 'clean' state during the deep-scrub window.",
 497           },
 498         },
 499       ],
 500     },
 501     {
           // Rule group 'nodes': host-level alerts built from node_* metrics
           // (filesystem, network and uname series) rather than ceph_* metrics.
           // Every summary appends the optional cluster suffix produced by
           // $.MultiClusterSummary().
 502       name: 'nodes',
 503       rules: [
 504         {
               // Critical: the root mountpoint ("/") has had less than 5% of its size
               // available for 5m. The expression value is the free percentage, which
               // the description prints through the humanize template function.
 505           alert: 'CephNodeRootFilesystemFull',
 506           'for': '5m',
 507           expr: 'node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5',
 508           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.1' },
 509           annotations: {
 510             summary: 'Root filesystem is dangerously full%(cluster)s' % $.MultiClusterSummary(),
 511             description: 'Root volume is dangerously full: {{ $value | humanize }}% free.',
 512           },
 513         },
 514         {
               // Warning: packet drops on a non-loopback interface. Two conditions are
               // joined with `and`, so BOTH must hold:
               //   1. (rx + tx drop rate) / (rx + tx packet rate), over 1m windows, is at
               //      least $._config.CephNodeNetworkPacketDropsThreshold, and
               //   2. the absolute rx + tx drop rate is at least
               //      $._config.CephNodeNetworkPacketDropsPerSec.
               // There is no 'for' clause on this rule.
 515           alert: 'CephNodeNetworkPacketDrops',
 516           expr: |||
 517             (
 518               rate(node_network_receive_drop_total{device!="lo"}[1m]) +
 519               rate(node_network_transmit_drop_total{device!="lo"}[1m])
 520             ) / (
 521               rate(node_network_receive_packets_total{device!="lo"}[1m]) +
 522               rate(node_network_transmit_packets_total{device!="lo"}[1m])
 523             ) >= %(CephNodeNetworkPacketDropsThreshold)s and (
 524               rate(node_network_receive_drop_total{device!="lo"}[1m]) +
 525               rate(node_network_transmit_drop_total{device!="lo"}[1m])
 526             ) >= %(CephNodeNetworkPacketDropsPerSec)s
 527           ||| % $._config,
 528           labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.2' },
 529           annotations: {
 530             summary: 'One or more NICs reports packet drops%(cluster)s' % $.MultiClusterSummary(),
                 // The ratio threshold is multiplied by 100 so it reads as a percentage
                 // and '%%' emits a literal percent sign. The text says "and" to match
                 // the expression above, which requires both thresholds to be reached.
 531             description: 'Node {{ $labels.instance }} experiences packet drop > %(CephNodeNetworkPacketDropsThreshold)s%% and > %(CephNodeNetworkPacketDropsPerSec)s packets/s on interface {{ $labels.device }}.' % { CephNodeNetworkPacketDropsThreshold: $._config.CephNodeNetworkPacketDropsThreshold * 100, CephNodeNetworkPacketDropsPerSec: $._config.CephNodeNetworkPacketDropsPerSec },
 532           },
 533         },
 534         {
               // Warning: packet errors on a non-loopback interface. Unlike the drops
               // rule, the thresholds are hard-coded and joined with `or`: either the
               // error/packet ratio is at least 0.0001 (the 0.01% quoted in the
               // description) or the absolute error rate is at least 10 per second.
               // There is no 'for' clause on this rule.
 535           alert: 'CephNodeNetworkPacketErrors',
 536           expr: |||
 537             (
 538               rate(node_network_receive_errs_total{device!="lo"}[1m]) +
 539               rate(node_network_transmit_errs_total{device!="lo"}[1m])
 540             ) / (
 541               rate(node_network_receive_packets_total{device!="lo"}[1m]) +
 542               rate(node_network_transmit_packets_total{device!="lo"}[1m])
 543             ) >= 0.0001 or (
 544               rate(node_network_receive_errs_total{device!="lo"}[1m]) +
 545               rate(node_network_transmit_errs_total{device!="lo"}[1m])
 546             ) >= 10
 547           |||,
 548           labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.3' },
 549           annotations: {
 550             summary: 'One or more NICs reports packet errors%(cluster)s' % $.MultiClusterSummary(),
                 // No '%' formatting is applied to this string, so the single '%' is
                 // emitted literally.
 551             description: 'Node {{ $labels.instance }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ $labels.device }}.',
 552           },
 553         },
 554         {
               // Warning: predict_linear() extrapolates node_filesystem_free_bytes from
               // the last 2d of samples to 3600 * 24 * 5 seconds (5 days) ahead and the
               // rule fires when the prediction is below 0. The join with
               // node_uname_info on `instance` copies the `nodename` label onto the
               // result for use in the description.
 555           alert: 'CephNodeDiskspaceWarning',
 556           expr: 'predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *on(instance) group_left(nodename) node_uname_info < 0',
 557           labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.4' },
 558           annotations: {
 559             summary: 'Host filesystem free space is getting low%(cluster)s' % $.MultiClusterSummary(),
 560             description: 'Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }} will be full in less than 5 days based on the 48 hour trailing fill rate.',
 561           },
 562         },
 563         {
               // Warning: compares the MTU of interfaces that are up (loopback
               // excluded) against the per-device max, and separately the per-device
               // min, whenever that max/min differs from the per-device median
               // (quantile .5). The two halves are joined with `or`.
               // NOTE(review): the max/min comparison is wrapped in scalar(), which per
               // the Prometheus docs yields NaN unless its argument has exactly one
               // element - confirm behaviour when several device names deviate.
 564           alert: 'CephNodeInconsistentMTU',
 565           expr: 'node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==  scalar(    max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=      quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))  )or node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==  scalar(    min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=      quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))  )',
 566           labels: { severity: 'warning', type: 'ceph_default' },
 567           annotations: {
 568             summary: 'MTU settings across Ceph hosts are inconsistent%(cluster)s' % $.MultiClusterSummary(),
 569             description: 'Node {{ $labels.instance }} has a different MTU size ({{ $value }}) than the median of devices named {{ $labels.device }}.',
 570           },
 571         },
 572       ],
 573     },
 574     {
           // Rule group 'pools': pool capacity alerts. One rule predicts growth from
           // ceph_pool_percent_used; the other three react to POOL_* health-detail
           // series being greater than 0. Every summary appends the optional cluster
           // suffix produced by $.MultiClusterSummary().
 575       name: 'pools',
 576       rules: [
 577         {
               // Warning: predict_linear() extrapolates ceph_pool_percent_used from the
               // last 2d of samples to 3600 * 24 * 5 seconds (5 days) ahead; the rule
               // fires when the prediction is >= 95. The result is joined with
               // ceph_pool_metadata (group_right) so the pool's `name` label is
               // available to the description. $.MultiClusterQuery() adds 'cluster,' to
               // the join labels when showMultiCluster is set.
               // There is no 'for' clause on this rule.
 578           alert: 'CephPoolGrowthWarning',
 579           expr: '(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(%(cluster)spool_id)    group_right ceph_pool_metadata) >= 95' % $.MultiClusterQuery(),
 580           labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.9.2' },
 581           annotations: {
 582             summary: 'Pool growth rate may soon exceed capacity%(cluster)s' % $.MultiClusterSummary(),
 583             description: "Pool '{{ $labels.name }}' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.",
 584           },
 585         },
 586         {
               // Warning: the POOL_BACKFILLFULL health-detail series is greater than 0.
               // There is no 'for' clause and no oid label on this rule.
 587           alert: 'CephPoolBackfillFull',
 588           expr: 'ceph_health_detail{name="POOL_BACKFILLFULL"} > 0',
 589           labels: { severity: 'warning', type: 'ceph_default' },
 590           annotations: {
 591             summary: 'Free space in a pool is too low for recovery/backfill%(cluster)s' % $.MultiClusterSummary(),
 592             description: 'A pool is approaching the near full threshold, which will prevent recovery/backfill operations from completing. Consider adding more capacity.',
 593           },
 594         },
 595         {
               // Critical: the POOL_FULL health-detail series has been greater than 0
               // for 1m.
 596           alert: 'CephPoolFull',
 597           'for': '1m',
 598           expr: 'ceph_health_detail{name="POOL_FULL"} > 0',
 599           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.9.1' },
 600           annotations: {
 601             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full',
 602             summary: 'Pool is full - writes are blocked%(cluster)s' % $.MultiClusterSummary(),
                 // The description embeds a Go-template `range query` that lists the
                 // five pools with the highest ceph_pool_percent_used. No '%' formatting
                 // is applied to this string, so the '%' after {{ .Value }} is literal.
                 // NOTE(review): the embedded query joins on pool_id only, whereas the
                 // CephPoolGrowthWarning expression adds the cluster label via
                 // $.MultiClusterQuery() - confirm this is fine with showMultiCluster.
 603             description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) {{- range query \"topk(5, sort_desc(ceph_pool_percent_used * on(pool_id) group_right ceph_pool_metadata))\" }} - {{ .Labels.name }} at {{ .Value }}% {{- end }} Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)",
 604           },
 605         },
 606         {
               // Warning: the POOL_NEAR_FULL health-detail series has been greater than
               // 0 for 5m. No oid label is attached to this rule.
 607           alert: 'CephPoolNearFull',
 608           'for': '5m',
 609           expr: 'ceph_health_detail{name="POOL_NEAR_FULL"} > 0',
 610           labels: { severity: 'warning', type: 'ceph_default' },
 611           annotations: {
 612             summary: 'One or more Ceph pools are nearly full%(cluster)s' % $.MultiClusterSummary(),
 613             description: "A pool has exceeded the warning (percent full) threshold, or OSDs supporting the pool have reached the NEARFULL threshold. Writes may continue, but you are at risk of the pool going read-only if more capacity isn't made available. Determine the affected pool with 'ceph df detail', looking at QUOTA BYTES and STORED. Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>). Also ensure that the balancer is active.",
 614           },
 615         },
 616       ],
 617     },
 618     {
           // Rule group 'healthchecks': a single rule driven by the
           // ceph_healthcheck_slow_ops metric.
 619       name: 'healthchecks',
 620       rules: [
 621         {
               // Warning: ceph_healthcheck_slow_ops has been greater than 0 for 30s.
               // The metric value itself is printed in the description via {{ $value }}.
 622           alert: 'CephSlowOps',
 623           'for': '30s',
 624           expr: 'ceph_healthcheck_slow_ops > 0',
 625           labels: { severity: 'warning', type: 'ceph_default' },
 626           annotations: {
 627             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops',
                 // %(cluster)s receives the optional cluster suffix built by
                 // $.MultiClusterSummary() (empty unless showMultiCluster is set).
 628             summary: 'OSD operations are slow to complete%(cluster)s' % $.MultiClusterSummary(),
 629             description: '{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)',
 630           },
 631         },
 632       ],
 633     },
 634     {
           // Rule group 'cephadm': alerts driven by cephadm-related health-detail
           // series (UPGRADE_EXCEPTION, CEPHADM_FAILED_DAEMON, CEPHADM_PAUSED). Each
           // rule fires when its series is greater than 0 for the stated duration.
           // Every summary appends the optional cluster suffix produced by
           // $.MultiClusterSummary().
 635       name: 'cephadm',
 636       rules: [
 637         {
               // Critical: UPGRADE_EXCEPTION has been greater than 0 for 30s.
 638           alert: 'CephadmUpgradeFailed',
 639           'for': '30s',
 640           expr: 'ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0',
 641           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.11.2' },
 642           annotations: {
 643             summary: 'Ceph version upgrade has failed%(cluster)s' % $.MultiClusterSummary(),
 644             description: 'The cephadm cluster upgrade process has failed. The cluster remains in an undetermined state. Please review the cephadm logs, to understand the nature of the issue',
 645           },
 646         },
 647         {
               // Critical: CEPHADM_FAILED_DAEMON has been greater than 0 for 30s.
 648           alert: 'CephadmDaemonFailed',
 649           'for': '30s',
 650           expr: 'ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0',
 651           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.11.1' },
 652           annotations: {
                 // Summary spelling corrected from "manged" to "managed".
 653             summary: 'A ceph daemon managed by cephadm is down%(cluster)s' % $.MultiClusterSummary(),
 654             description: "A daemon managed by cephadm is no longer active. Determine, which daemon is down with 'ceph health detail'. you may start daemons with the 'ceph orch daemon start <daemon_id>'",
 655           },
 656         },
 657         {
               // Warning: CEPHADM_PAUSED has been greater than 0 for 1m. No oid label
               // is attached to this rule.
 658           alert: 'CephadmPaused',
 659           'for': '1m',
 660           expr: 'ceph_health_detail{name="CEPHADM_PAUSED"} > 0',
 661           labels: { severity: 'warning', type: 'ceph_default' },
 662           annotations: {
 663             documentation: 'https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused',
 664             summary: 'Orchestration tasks via cephadm are PAUSED%(cluster)s' % $.MultiClusterSummary(),
 665             description: "Cluster management has been paused manually. This will prevent the orchestrator from service management and reconciliation. If this is not intentional, resume cephadm operations with 'ceph orch resume'",
 666           },
 667         },
 668       ],
 669     },
 670     {
           // Rule group 'PrometheusServer': a single rule that watches for the Ceph
           // scrape job disappearing from Prometheus.
 671       name: 'PrometheusServer',
 672       rules: [
 673         {
               // Critical: absent() produces a result only when no `up` series with
               // job="ceph" exists; the rule fires once that has been true for 30s.
               // The job name "ceph" is hard-coded in the selector.
               // NOTE(review): per the Prometheus docs the series returned by absent()
               // only carries labels taken from the selector's equality matchers, so
               // {{ $labels.cluster }} in the summary suffix is presumably empty here
               // when showMultiCluster is set - confirm.
 674           alert: 'PrometheusJobMissing',
 675           'for': '30s',
 676           expr: 'absent(up{job="ceph"})',
 677           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.12.1' },
 678           annotations: {
 679             summary: 'The scrape job for Ceph is missing from Prometheus%(cluster)s' % $.MultiClusterSummary(),
 680             description: "The prometheus job that scrapes from Ceph is no longer defined, this will effectively mean you'll have no metrics or alerts for the cluster.  Please review the job definitions in the prometheus.yml file of the prometheus instance.",
 681           },
 682         },
 683       ],
 684     },
 685     {
           // Rule group 'rados': a single rule about unfound RADOS objects.
 686       name: 'rados',
 687       rules: [
 688         {
               // Critical: fires after 30s when OBJECT_UNFOUND equals 1 AND every OSD
               // is up. The second factor compares count(ceph_osd_up == 1) with
               // count(ceph_osd_metadata) using `== bool`, which gives 1 when the two
               // counts match and 0 otherwise; multiplying by it (with `on()`, i.e.
               // matching on no labels) leaves a value of 1 only when both conditions
               // hold, which the trailing `== 1` then selects.
 689           alert: 'CephObjectMissing',
 690           'for': '30s',
 691           expr: '(ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1',
 692           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.10.1' },
 693           annotations: {
 694             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound',
                 // %(cluster)s receives the optional cluster suffix built by
                 // $.MultiClusterSummary() (empty unless showMultiCluster is set).
 695             summary: 'Object(s) marked UNFOUND%(cluster)s' % $.MultiClusterSummary(),
 696             description: 'The latest version of a RADOS object can not be found, even though all OSDs are up. I/O requests for this object from clients will block (hang). Resolving this issue may require the object to be rolled back to a prior version manually, and manually verified.',
 697           },
 698         },
 699       ],
 700     },
 701     {
           // Rule group 'generic': a single rule about recently crashed daemons.
 702       name: 'generic',
 703       rules: [
 704         {
               // Critical: the RECENT_CRASH health-detail series has been equal to 1
               // for 1m.
 705           alert: 'CephDaemonCrash',
 706           'for': '1m',
 707           expr: 'ceph_health_detail{name="RECENT_CRASH"} == 1',
 708           labels: { severity: 'critical', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.1.2' },
 709           annotations: {
 710             documentation: 'https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash',
                 // %(cluster)s receives the optional cluster suffix built by
                 // $.MultiClusterSummary() (empty unless showMultiCluster is set).
 711             summary: 'One or more Ceph daemons have crashed, and are pending acknowledgement%(cluster)s' % $.MultiClusterSummary(),
 712             description: "One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive <id>' command.",
 713           },
 714         },
 715       ],
 716     },
 717   ],
 718 }