ceph/monitoring/prometheus/alerts/ceph_default_alerts.yml

   1 groups:
   2   - name: cluster health
   3     rules:
   4       - alert: health error  # status value 2 is what the description calls HEALTH_ERROR
   5         expr: 'ceph_health_status == 2'
   6         for: 5m
   7         labels:
   8           oid: '1.3.6.1.4.1.50495.15.1.2.2.1'
   9           severity: critical
  10           type: ceph_default
  11         annotations:
  12           description: >
  13             Ceph in HEALTH_ERROR state for more than 5 minutes.
  14             Please check "ceph health detail" for more information.
  15
  16       - alert: health warn  # status value 1 is what the description calls HEALTH_WARN
  17         expr: 'ceph_health_status == 1'
  18         for: 15m
  19         labels:
  20           oid: '1.3.6.1.4.1.50495.15.1.2.2.2'
  21           severity: warning
  22           type: ceph_default
  23         annotations:
  24           description: >
  25             Ceph has been in HEALTH_WARN for more than 15 minutes.
  26             Please check "ceph health detail" for more information.
  27
  28   - name: mon
  29     rules:
  30       - alert: low monitor quorum count  # no `for` clause: no hold period before firing
  31         expr: 'sum(ceph_mon_quorum_status) < 3'
  32         labels:
  33           oid: '1.3.6.1.4.1.50495.15.1.2.3.1'
  34           severity: critical
  35           type: ceph_default
  36         annotations:
  37           description: |
  38             Monitor count in quorum is below three.
  39
  40             Only {{ $value }} of {{ with query "count(ceph_mon_quorum_status)" }}{{ . | first | value }}{{ end }} monitors are active.
  41
  42             The following monitors are down:
  43             {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }}
  44               - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
  45             {{- end }}
  46
  47   - name: osd
  48     rules:
  49       - alert: 10% OSDs down  # percentage of OSDs with ceph_osd_up == 0; no `for` clause
  50         expr: 'count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10'
  51         labels:
  52           oid: '1.3.6.1.4.1.50495.15.1.2.4.1'
  53           severity: critical
  54           type: ceph_default
  55         annotations:
  56           description: |
  57             {{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (≥ 10%).
  58
  59             The following OSDs are down:
  60             {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
  61               - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
  62             {{- end }}
  63
  64       - alert: OSD down  # at least one OSD with ceph_osd_up == 0, held for 15m
  65         expr: 'count(ceph_osd_up == 0) > 0'
  66         for: 15m
  67         labels:
  68           oid: '1.3.6.1.4.1.50495.15.1.2.4.2'
  69           severity: warning
  70           type: ceph_default
  71         annotations:
  72           description: |
  73             {{ $s := "" }}{{ if gt $value 1.0 }}{{ $s = "s" }}{{ end }}
  74             {{ $value }} OSD{{ $s }} down for more than 15 minutes.
  75
  76             {{ $value }} of {{ query "count(ceph_osd_up)" | first | value }} OSDs are down.
  77
  78             The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down:
  79               {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}}
  80                 - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
  81               {{- end }}
  82
  83       - alert: OSDs near full  # used/total bytes above 90%, restricted to OSDs that are up
  84         expr: |
  85           (
  86             ((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon) ceph_osd_up == 1)
  87             * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
  88           ) * 100 > 90
  89         for: 5m
  90         labels:
  91           oid: '1.3.6.1.4.1.50495.15.1.2.4.3'
  92           severity: critical
  93           type: ceph_default
  94         annotations:
  95           description: >
  96             OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} is
  97             dangerously full: {{ $value | humanize }}%
  98
  99       - alert: flapping OSD  # 5m rate of ceph_osd_up, scaled by 60, joined with hostname
 100         expr: |
 101           (
 102             rate(ceph_osd_up[5m])
 103             * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
 104           ) * 60 > 1
 105         labels:
 106           oid: '1.3.6.1.4.1.50495.15.1.2.4.4'
 107           severity: warning
 108           type: ceph_default
 109         annotations:
 110           description: >
 111             OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was
 112             marked down and back up at {{ $value | humanize }} times once a
 113             minute for 5 minutes.
 114
 115       # flag OSDs whose PG count strays more than 30% from the per-job average
 116       - alert: high pg count deviation
 117         expr: |
 118           abs(
 119             (
 120               (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)
 121             ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
 122           ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
 123         for: 5m
 124         labels:
 125           oid: '1.3.6.1.4.1.50495.15.1.2.4.5'
 126           severity: warning
 127           type: ceph_default
 128         annotations:
 129           description: >
 130             OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates
 131             by more than 30% from average PG count.
 132       # TODO: alert on high commit latency once a sensible threshold is known
 133   - name: mds  # placeholder group, intentionally without rules
 134     rules: []  # explicit empty list; a bare `rules:` would parse as null
 135     # no mds metrics are exported yet
 136   - name: mgr  # placeholder group, intentionally without rules
 137     rules: []  # explicit empty list; a bare `rules:` would parse as null
 138     # no mgr metrics are exported yet
 139   - name: pgs
 140     rules:
 141       - alert: pgs inactive  # per pool: ceph_pg_total minus ceph_pg_active is above zero
 142         expr: 'ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0'
 143         for: 5m
 144         labels:
 145           oid: '1.3.6.1.4.1.50495.15.1.2.7.1'
 146           severity: critical
 147           type: ceph_default
 148         annotations:
 149           description: >
 150             {{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}.
 151             Inactive placement groups aren't able to serve read/write
 152             requests.
 153       - alert: pgs unclean  # per pool: ceph_pg_total minus ceph_pg_clean is above zero
 154         expr: 'ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0'
 155         for: 15m
 156         labels:
 157           oid: '1.3.6.1.4.1.50495.15.1.2.7.2'
 158           severity: warning
 159           type: ceph_default
 160         annotations:
 161           description: >
 162             {{ $value }} PGs haven't been clean for more than 15 minutes in pool {{ $labels.name }}.
 163             Unclean PGs haven't been able to completely recover from a
 164             previous failure.
 165   - name: nodes
 166     rules:
 167       - alert: root volume full  # available/size on mountpoint "/" below 5%, held for 5m
 168         expr: 'node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5'
 169         for: 5m
 170         labels:
 171           oid: '1.3.6.1.4.1.50495.15.1.2.8.1'
 172           severity: critical
 173           type: ceph_default
 174         annotations:
 175           description: >
 176             Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free.
 177
 178       # NIC drop and error rates above 1 packet/s (loopback device excluded)
 179       - alert: network packets dropped  # receive + transmit drop irate
 180         expr: 'irate(node_network_receive_drop_total{device!="lo"}[5m]) + irate(node_network_transmit_drop_total{device!="lo"}[5m]) > 1'
 181         labels:
 182           oid: '1.3.6.1.4.1.50495.15.1.2.8.2'
 183           severity: warning
 184           type: ceph_default
 185         annotations:
 186           description: >
 187             Node {{ $labels.instance }} experiences packet drop > 1
 188             packet/s on interface {{ $labels.device }}.
 189
 190       - alert: network packet errors  # receive + transmit error irate
 191         expr: |
 192           irate(node_network_receive_errs_total{device!="lo"}[5m]) +
 193           irate(node_network_transmit_errs_total{device!="lo"}[5m]) > 1
 194         labels:
 195           oid: '1.3.6.1.4.1.50495.15.1.2.8.3'
 196           severity: warning
 197           type: ceph_default
 198         annotations:
 199           description: >
 200             Node {{ $labels.instance }} experiences packet errors > 1
 201             packet/s on interface {{ $labels.device }}.
 202
 203       - alert: storage filling up  # 2d of free-bytes samples extrapolated 5 days ahead
 204         expr: |
 205           predict_linear(node_filesystem_free_bytes[2d], 3600 * 24 * 5) *
 206           on(instance) group_left(nodename) node_uname_info < 0
 207         labels:
 208           oid: '1.3.6.1.4.1.50495.15.1.2.8.4'
 209           severity: warning
 210           type: ceph_default
 211         annotations:
 212           description: >
 213             Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }}
 214             will be full in less than 5 days assuming the average fill-up
 215             rate of the past 48 hours.
 216
 217   - name: pools
 218     rules:
 219       - alert: pool full  # stored / (stored + max_avail) above 90%
 220         expr: |
 221           ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)
 222           * on(pool_id) group_right ceph_pool_metadata * 100 > 90
 223         labels:
 224           oid: '1.3.6.1.4.1.50495.15.1.2.9.1'
 225           severity: critical
 226           type: ceph_default
 227         annotations:
 228           description: 'Pool {{ $labels.name }} at {{ $value | humanize }}% capacity.'
 229
 230       - alert: pool filling up  # stored bytes predicted 5 days ahead reach stored + max_avail
 231         expr: |
 232           (
 233             predict_linear(ceph_pool_stored[2d], 3600 * 24 * 5)
 234             >= ceph_pool_stored + ceph_pool_max_avail
 235           ) * on(pool_id) group_left(name) ceph_pool_metadata
 236         labels:
 237           oid: '1.3.6.1.4.1.50495.15.1.2.9.2'
 238           severity: warning
 239           type: ceph_default
 240         annotations:
 241           description: >
 242             Pool {{ $labels.name }} will be full in less than 5 days
 243             assuming the average fill-up rate of the past 48 hours.
 244
 245   - name: healthchecks
 246     rules:
 247       - alert: Slow OSD Ops  # non-zero ceph_healthcheck_slow_ops held for 30s
 248         expr: 'ceph_healthcheck_slow_ops > 0'
 249         for: 30s
 250         labels:
 251           severity: warning
 252           type: ceph_default
 253         annotations:
 254           description: >
 255             {{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)