ceph/monitoring/prometheus/alerts/ceph_default_alerts.yml

   1 groups:
   2   - name: cluster health
   3     rules:
   4       - alert: health error  # critical alert for a cluster whose health status stays at 2
   5         expr: ceph_health_status == 2  # the description below equates status 2 with HEALTH_ERROR
   6         for: 5m  # the condition must hold for 5 minutes before the alert fires
   7         labels:
   8           severity: critical
   9           type: ceph_default
  10           oid: 1.3.6.1.4.1.50495.15.1.2.2.1  # presumably an SNMP OID for trap forwarding - TODO confirm
  11         annotations:
  12           description: >
  13             Ceph in HEALTH_ERROR state for more than 5 minutes.
  14             Please check "ceph health detail" for more information.
  15
  16       - alert: health warn  # warning alert for a cluster whose health status stays at 1
  17         expr: ceph_health_status == 1  # the description below equates status 1 with HEALTH_WARN
  18         for: 15m  # the condition must hold for 15 minutes before the alert fires
  19         labels:
  20           severity: warning
  21           type: ceph_default
  22           oid: 1.3.6.1.4.1.50495.15.1.2.2.2
  23         annotations:
  24           description: >
  25             Ceph has been in HEALTH_WARN for more than 15 minutes.
  26             Please check "ceph health detail" for more information.
  27
  28   - name: mon
  29     rules:
  30       - alert: low monitor quorum count  # critical alert when fewer than three monitors are in quorum
  31         expr: sum(ceph_mon_quorum_status) < 3  # no "for:" clause, so there is no hold time
  32         labels:
  33           severity: critical
  34           type: ceph_default
  35           oid: 1.3.6.1.4.1.50495.15.1.2.3.1
  36         annotations:
  37           description: |  # literal block: the template lists each monitor whose quorum status is 0 with its hostname
  38             Monitor count in quorum is below three.
  39
  40             Only {{ $value }} of {{ with query "count(ceph_mon_quorum_status)" }}{{ . | first | value }}{{ end }} monitors are active.
  41
  42             The following monitors are down:
  43             {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }}
  44               - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
  45             {{- end }}
  46
  47   - name: osd
  48     rules:
  49       - alert: 10% OSDs down  # critical alert when at least 10% of all OSDs report ceph_osd_up == 0
  50         expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10  # $value is the percentage of OSDs that are down
  51         labels:
  52           severity: critical
  53           type: ceph_default
  54           oid: 1.3.6.1.4.1.50495.15.1.2.4.1
  55         annotations:
  56           description: |  # percentage and count both refer to DOWN OSDs, matching the wording of the message
  57             {{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (>=10%).
  58
  59             The following OSDs are down:
  60             {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
  61               - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
  62             {{- end }}
  63
  64       - alert: OSD down  # warning alert when one or more OSDs report ceph_osd_up == 0
  65         expr: count(ceph_osd_up == 0) > 0  # $value is the number of down OSDs
  66         for: 15m  # the condition must hold for 15 minutes before the alert fires
  67         labels:
  68           severity: warning
  69           type: ceph_default
  70           oid: 1.3.6.1.4.1.50495.15.1.2.4.2
  71         annotations:
  72           description: |  # $s is the plural suffix: set to "s" when more than one OSD is down
  73             {{ $s := "" }}{{ if gt $value 1.0 }}{{ $s = "s" }}{{ end }}
  74             {{ $value }} OSD{{ $s }} down for more than 15 minutes.
  75
  76             {{ $value }} of {{ query "count(ceph_osd_up)" | first | value }} OSDs are down.
  77
  78             The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down:
  79               {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}}
  80                 - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
  81               {{- end }}
  82
  83       - alert: OSDs near full  # critical alert per OSD that is up and more than 90% full
  84         expr: |  # used/total ratio, kept only where ceph_osd_up == 1, joined with OSD metadata for the hostname label
  85           (
  86             ((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon) ceph_osd_up == 1)
  87             * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
  88           ) * 100 > 90
  89         for: 5m  # the condition must hold for 5 minutes before the alert fires
  90         labels:
  91           severity: critical
  92           type: ceph_default
  93           oid: 1.3.6.1.4.1.50495.15.1.2.4.3
  94         annotations:
  95           description: >
  96             OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} is
  97             dangerously full: {{ $value | humanize }}%
  98
  99       - alert: flapping OSD  # warning alert per OSD whose up-state changes more than about once a minute
 100         expr: |  # per-second rate over 5m scaled by 60; NOTE(review): rate() targets counters - confirm it suits ceph_osd_up
 101           (
 102             rate(ceph_osd_up[5m])
 103             * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
 104           ) * 60 > 1
 105         labels:  # no "for:" clause, so the alert fires without a hold time
 106           severity: warning
 107           type: ceph_default
 108           oid: 1.3.6.1.4.1.50495.15.1.2.4.4
 109         annotations:
 110           description: >
 111             OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was
 112             marked down and back up at {{ $value | humanize }} times once a
 113             minute for 5 minutes.
 114
 115       # alert on OSDs whose PG count deviates by more than 30% from the per-job average
 116       - alert: high pg count deviation
 117         expr: |  # |numpg - avg| / avg over OSDs with numpg > 0, joined with OSD metadata for the hostname label
 118           abs(
 119             (
 120               (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)
 121             ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
 122           ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
 123         for: 5m  # the condition must hold for 5 minutes before the alert fires
 124         labels:
 125           severity: warning
 126           type: ceph_default
 127           oid: 1.3.6.1.4.1.50495.15.1.2.4.5
 128         annotations:
 129           description: >
 130             OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates
 131             by more than 30% from average PG count.
 132       # TODO: alert on high commit latency (open question: how high is too high?)
 133   - name: mds
 134     rules: []  # placeholder group: explicit empty list rather than a bare (null) value
 135     # no mds metrics are exported yet
 136   - name: mgr
 137     rules: []  # placeholder group: explicit empty list rather than a bare (null) value
 138     # no mgr metrics are exported yet
 139   - name: pgs
 140     rules:
 141       - alert: pgs inactive  # critical alert per pool that has placement groups not counted as active
 142         expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0  # total minus active PGs, joined to pool metadata for the name label
 143         for: 5m  # the condition must hold for 5 minutes before the alert fires
 144         labels:
 145           severity: critical
 146           type: ceph_default
 147           oid: 1.3.6.1.4.1.50495.15.1.2.7.1
 148         annotations:
 149           description: >
 150             {{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}.
 151             Inactive placement groups aren't able to serve read/write
 152             requests.
 153       - alert: pgs unclean  # warning alert per pool that has placement groups not counted as clean
 154         expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0  # total minus clean PGs, joined to pool metadata for the name label
 155         for: 15m  # the condition must hold for 15 minutes before the alert fires
 156         labels:
 157           severity: warning
 158           type: ceph_default
 159           oid: 1.3.6.1.4.1.50495.15.1.2.7.2
 160         annotations:
 161           description: >
 162             {{ $value }} PGs haven't been clean for more than 15 minutes in pool {{ $labels.name }}.
 163             Unclean PGs haven't been able to completely recover from a
 164             previous failure.
 165   - name: nodes
 166     rules:
 167       - alert: root volume full  # critical alert when less than 5% of the "/" filesystem is available
 168         expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5  # $value is the percentage still free
 169         labels:  # no "for:" clause, so the alert fires without a hold time
 170           severity: critical
 171           type: ceph_default
 172           oid: 1.3.6.1.4.1.50495.15.1.2.8.1
 173         annotations:
 174           description: >
 175             Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free.
 176
 177       # alert when NIC packet drop or error rates exceed 1 packet/s (this rule covers drops, the next one errors)
 178       - alert: network packets dropped
 179         expr: irate(node_network_receive_drop_total{device!="lo"}[5m]) + irate(node_network_transmit_drop_total{device!="lo"}[5m]) > 1  # receive + transmit drop rate per device, "lo" excluded
 180         labels:  # no "for:" clause, so the alert fires without a hold time
 181           severity: warning
 182           type: ceph_default
 183           oid: 1.3.6.1.4.1.50495.15.1.2.8.2
 184         annotations:
 185           description: >
 186             Node {{ $labels.instance }} experiences packet drop > 1
 187             packet/s on interface {{ $labels.device }}.
 188
 189       - alert: network packet errors  # warning alert per device whose packet error rate exceeds 1 packet/s
 190         expr: |  # receive + transmit error rate per device, "lo" excluded
 191           irate(node_network_receive_errs_total{device!="lo"}[5m]) +
 192           irate(node_network_transmit_errs_total{device!="lo"}[5m]) > 1
 193         labels:  # no "for:" clause, so the alert fires without a hold time
 194           severity: warning
 195           type: ceph_default
 196           oid: 1.3.6.1.4.1.50495.15.1.2.8.3
 197         annotations:
 198           description: >
 199             Node {{ $labels.instance }} experiences packet errors > 1
 200             packet/s on interface {{ $labels.device }}.
 201
 202       # predict fs fill-up times
 203       - alert: storage filling  # warning alert per filesystem forecast to run out of free space within 5 days
 204         expr: |
 205           # Extrapolate the 2-day trend of free bytes 5 days ahead; a negative
 206           # forecast means the filesystem is expected to be full before then.
 207           (
 208             predict_linear(node_filesystem_free_bytes[2d], 3600 * 24 * 5)
 209             * on(instance) group_left(nodename) node_uname_info
 210           ) < 0
 211         labels:  # the join with node_uname_info above supplies the nodename label used in the description
 212           severity: warning
 213           type: ceph_default
 214           oid: 1.3.6.1.4.1.50495.15.1.2.8.4
 215         annotations:
 216           description: >
 217             Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }}
 218             will be full in less than 5 days assuming the average fill-up
 219             rate of the past 48 hours.
 220
 221   - name: pools
 222     rules:
 223       - alert: pool full  # critical alert per pool that is more than 90% full
 224         expr: |  # stored / (stored + max_avail) as a percentage, joined with pool metadata for the name label
 225           ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)
 226           * on(pool_id) group_right ceph_pool_metadata * 100 > 90
 227         labels:  # no "for:" clause, so the alert fires without a hold time
 228           severity: critical
 229           type: ceph_default
 230           oid: 1.3.6.1.4.1.50495.15.1.2.9.1
 231         annotations:
 232           description: Pool {{ $labels.name }} at {{ $value | humanize }}% capacity.
 233
 234       - alert: pool filling up  # warning alert per pool forecast to reach its capacity within 5 days
 235         expr: |
 236           # 5-day linear forecast of stored bytes (2-day trend) vs. capacity (stored + max_avail)
 237           (
 238             predict_linear(ceph_pool_stored[2d], 3600 * 24 * 5)
 239             >= ceph_pool_stored + ceph_pool_max_avail
 240           ) * on(pool_id) group_right ceph_pool_metadata
 241         labels:  # group_right above keeps the pool metadata labels, including the name used in the description
 242           severity: warning
 243           type: ceph_default
 244           oid: 1.3.6.1.4.1.50495.15.1.2.9.2
 245         annotations:
 246           description: >
 247             Pool {{ $labels.name }} will be full in less than 5 days
 248             assuming the average fill-up rate of the past 48 hours.