        expr: ceph_health_status == 2
          oid: 1.3.6.1.4.1.50495.15.1.2.2.1
            Ceph has been in HEALTH_ERR state for more than 5 minutes.
            Please check "ceph health detail" for more information.
        expr: ceph_health_status == 1
          oid: 1.3.6.1.4.1.50495.15.1.2.2.2
            Ceph has been in HEALTH_WARN for more than 15 minutes.
            Please check "ceph health detail" for more information.
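      # Note (assumed metric semantics): the mgr prometheus module exports
      # ceph_health_status as a numeric gauge, 0 = HEALTH_OK, 1 = HEALTH_WARN,
      # 2 = HEALTH_ERR, hence the == 1 and == 2 matches in the health alerts above.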
      - alert: low monitor quorum count
        expr: sum(ceph_mon_quorum_status) < 3
          oid: 1.3.6.1.4.1.50495.15.1.2.3.1
            Monitor count in quorum is below three.
            Only {{ $value }} of {{ with query "count(ceph_mon_quorum_status)" }}{{ . | first | value }}{{ end }} monitors are active.
            The following monitors are down:
            {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }}
              - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
      - alert: 10% OSDs down
        expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
          oid: 1.3.6.1.4.1.50495.15.1.2.4.1
            {{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (≥ 10%).
            The following OSDs are down:
            {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
              - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
        expr: count(ceph_osd_up == 0) > 0
          oid: 1.3.6.1.4.1.50495.15.1.2.4.2
            {{ $s := "" }}{{ if gt $value 1.0 }}{{ $s = "s" }}{{ end }}
            {{ $value }} OSD{{ $s }} down for more than 15 minutes.
            {{ $value }} of {{ query "count(ceph_osd_up)" | first | value }} OSDs are down.
            The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down:
            {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
              - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
      - alert: OSDs near full
            ((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon) ceph_osd_up == 1)
            * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
          oid: 1.3.6.1.4.1.50495.15.1.2.4.3
            OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} is
            dangerously full: {{ $value | humanize }}%
          rate(ceph_osd_up[5m])
          * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
          oid: 1.3.6.1.4.1.50495.15.1.2.4.4
            OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was
            marked down and back up {{ $value | humanize }} times a
            minute over the last 5 minutes.
      # alert on high deviation from average PG count
      - alert: high pg count deviation
            (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)
          ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
          ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
          oid: 1.3.6.1.4.1.50495.15.1.2.4.5
            OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates
            by more than 30% from average PG count.
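      # Illustrative example of the deviation math: with a cluster-wide average
      # of 100 PGs per OSD, an OSD holding 135 PGs evaluates to
      # (135 - 100) / 100 = 0.35, which exceeds the 0.30 threshold and fires.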
      # alert on high commit latency... but how high is too high?
      # no mds metrics are exported yet
      # no mgr metrics are exported yet
      - alert: pgs inactive
        expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0
          oid: 1.3.6.1.4.1.50495.15.1.2.7.1
            {{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}.
            Inactive placement groups aren't able to serve read/write requests.
        expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0
          oid: 1.3.6.1.4.1.50495.15.1.2.7.2
            {{ $value }} PGs haven't been clean for more than 15 minutes in pool {{ $labels.name }}.
            Unclean PGs haven't been able to completely recover from a previous failure.
      - alert: root volume full
        expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
          oid: 1.3.6.1.4.1.50495.15.1.2.8.1
            Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free.
      # alert on NIC packet drop and error rates > 0.01% of packets or > 10 packets/s
      - alert: network packets dropped
            increase(node_network_receive_drop_total{device!="lo"}[1m]) +
            increase(node_network_transmit_drop_total{device!="lo"}[1m])
            increase(node_network_receive_packets_total{device!="lo"}[1m]) +
            increase(node_network_transmit_packets_total{device!="lo"}[1m])
            increase(node_network_receive_drop_total{device!="lo"}[1m]) +
            increase(node_network_transmit_drop_total{device!="lo"}[1m])
          oid: 1.3.6.1.4.1.50495.15.1.2.8.2
            Node {{ $labels.instance }} experiences packet drop > 0.01% or >
            10 packets/s on interface {{ $labels.device }}.
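      # The drop counters appear twice in the expression above: once as the
      # numerator of the drop-to-packet ratio (the "> 0.01%" condition) and once
      # on their own, presumably feeding the absolute "> 10 packets/s" guard, so
      # a near-idle link with a few stray drops does not fire.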
      - alert: network packet errors
            increase(node_network_receive_errs_total{device!="lo"}[1m]) +
            increase(node_network_transmit_errs_total{device!="lo"}[1m])
            increase(node_network_receive_packets_total{device!="lo"}[1m]) +
            increase(node_network_transmit_packets_total{device!="lo"}[1m])
            increase(node_network_receive_errs_total{device!="lo"}[1m]) +
            increase(node_network_transmit_errs_total{device!="lo"}[1m])
          oid: 1.3.6.1.4.1.50495.15.1.2.8.3
            Node {{ $labels.instance }} experiences packet errors > 0.01% or
            > 10 packets/s on interface {{ $labels.device }}.
      - alert: storage filling up
          predict_linear(node_filesystem_free_bytes[2d], 3600 * 24 * 5) *
          on(instance) group_left(nodename) node_uname_info < 0
          oid: 1.3.6.1.4.1.50495.15.1.2.8.4
            Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }}
            will be full in less than 5 days assuming the average fill-up
            rate of the past 48 hours.
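      # predict_linear() fits a least-squares line to the last 2 days (2d) of
      # node_filesystem_free_bytes samples and extrapolates it 5 days ahead
      # (3600 * 24 * 5 seconds); a projected value below zero means the
      # filesystem is on track to run out of space within that window.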
      - alert: MTU Mismatch
        expr: node_network_mtu_bytes{device!="lo"} * (node_network_up{device!="lo"} > 0) != on() group_left() (quantile(0.5, node_network_mtu_bytes{device!="lo"}))
          oid: 1.3.6.1.4.1.50495.15.1.2.8.5
            Node {{ $labels.instance }} has a different MTU size ({{ $value }})
            than the median value on device {{ $labels.device }}.
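      # quantile(0.5, ...) in the expression above is the median MTU across all
      # non-loopback interfaces reporting the metric; any interface that is up
      # and whose MTU differs from that median triggers the alert.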
          ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)
          * on(pool_id) group_right ceph_pool_metadata * 100 > 90
          oid: 1.3.6.1.4.1.50495.15.1.2.9.1
          description: Pool {{ $labels.name }} at {{ $value | humanize }}% capacity.
      - alert: pool filling up
            predict_linear(ceph_pool_stored[2d], 3600 * 24 * 5)
            >= ceph_pool_stored + ceph_pool_max_avail
          ) * on(pool_id) group_left(name) ceph_pool_metadata
          oid: 1.3.6.1.4.1.50495.15.1.2.9.2
            Pool {{ $labels.name }} will be full in less than 5 days
            assuming the average fill-up rate of the past 48 hours.
      - alert: Slow OSD Ops
        expr: ceph_healthcheck_slow_ops > 0
            {{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded).
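      # After editing, these rules can be syntax-checked with promtool
      # (the file name below is illustrative, not this file's actual name):
      #   promtool check rules ceph_default_alerts.yml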