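# Default Prometheus alert rules shipped with Ceph. The ceph_* series below are
# exported by the mgr "prometheus" module; ceph_health_status reports 0 for
# HEALTH_OK, 1 for HEALTH_WARN and 2 for HEALTH_ERR, which the first two alerts
# match against.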
groups:
  - name: cluster health
    rules:
      - alert: health error
        expr: ceph_health_status == 2
        for: 5m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.2.1
        annotations:
          description: >
            Ceph has been in HEALTH_ERROR state for more than 5 minutes.
            Please check "ceph health detail" for more information.

      - alert: health warn
        expr: ceph_health_status == 1
        for: 15m
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.2.2
        annotations:
          description: >
            Ceph has been in HEALTH_WARN for more than 15 minutes.
            Please check "ceph health detail" for more information.

  - name: mon
    rules:
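      # ceph_mon_quorum_status is 1 for a monitor that is in quorum and 0
      # otherwise, so the sum below is the number of monitors currently in quorum.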
      - alert: low monitor quorum count
        expr: sum(ceph_mon_quorum_status) < 3
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.3.1
        annotations:
          description: |
            Monitor count in quorum is below three.

            Only {{ $value }} of {{ with query "count(ceph_mon_quorum_status)" }}{{ . | first | value }}{{ end }} monitors are active.

            The following monitors are down:
            {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }}
              - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
            {{- end }}

  - name: osd
    rules:
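      # ceph_osd_up is 1 for an up OSD and 0 for a down one; the
      # "* on(ceph_daemon) group_left(hostname) ceph_osd_metadata" joins in this
      # group only pull the hostname label onto each OSD series.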
      - alert: 10% OSDs down
        expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.4.1
        annotations:
          description: |
            {{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (≥ 10%).

            The following OSDs are down:
            {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
              - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
            {{- end }}

      - alert: OSD down
        expr: count(ceph_osd_up == 0) > 0
        for: 15m
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.4.2
        annotations:
          description: |
            {{ $s := "" }}{{ if gt $value 1.0 }}{{ $s = "s" }}{{ end }}
            {{ $value }} OSD{{ $s }} down for more than 15 minutes.

            {{ $value }} of {{ query "count(ceph_osd_up)" | first | value }} OSDs are down.

            The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down:
            {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}}
              - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
            {{- end }}

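      # per-OSD utilization (used bytes / total bytes), limited to OSDs that are
      # up and joined with the metadata series for the hostname label; fires once
      # an OSD stays above 90% for 5 minutes.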
      - alert: OSDs near full
        expr: |
          (
            ((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon) ceph_osd_up == 1)
            * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
          ) * 100 > 90
        for: 5m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.4.3
        annotations:
          description: >
            OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} is
            dangerously full: {{ $value | humanize }}%

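      # ceph_osd_up only changes when an OSD flips between up and down, so
      # rate() over 5 minutes is non-zero only while it is flapping; multiplied
      # by 60 it approximates the number of state changes per minute.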
      - alert: flapping OSD
        expr: |
          (
            rate(ceph_osd_up[5m])
            * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
          ) * 60 > 1
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.4.4
        annotations:
          description: >
            OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was
            marked down and back up {{ $value | humanize }} times a
            minute over the last 5 minutes.

      # alert on high deviation from average PG count
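      # the deviation is relative: |numpg - avg(numpg)| / avg(numpg), taken
      # against the average PG count of all OSDs scraped by the same job.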
      - alert: high pg count deviation
        expr: |
          abs(
            (
              (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)
            ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
          ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
        for: 5m
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.4.5
        annotations:
          description: >
            OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates
            by more than 30% from average PG count.
      # alert on high commit latency...but how high is too high
  - name: mds
    rules:
    # no mds metrics are exported yet
  - name: mgr
    rules:
    # no mgr metrics are exported yet
  - name: pgs
    rules:
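      # ceph_pool_metadata has a constant value of 1 and carries the pool's
      # "name" label, so multiplying by it attaches the pool name to the PG
      # counts without changing the value.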
      - alert: pgs inactive
        expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0
        for: 5m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.7.1
        annotations:
          description: >
            {{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}.
            Inactive placement groups aren't able to serve read/write
            requests.
      - alert: pgs unclean
        expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0
        for: 15m
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.7.2
        annotations:
          description: >
            {{ $value }} PGs haven't been clean for more than 15 minutes in pool {{ $labels.name }}.
            Unclean PGs haven't been able to completely recover from a
            previous failure.
  - name: nodes
    rules:
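      # the node_* series below come from node_exporter; this fires when the
      # root filesystem has less than 5% of its space available.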
      - alert: root volume full
        expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
        for: 5m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.8.1
        annotations:
          description: >
            Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free.

      # alert on nic packet error and drop rates > 1 packet/s
      - alert: network packets dropped
        expr: irate(node_network_receive_drop_total{device!="lo"}[5m]) + irate(node_network_transmit_drop_total{device!="lo"}[5m]) > 1
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.8.2
        annotations:
          description: >
            Node {{ $labels.instance }} experiences packet drop > 1
            packet/s on interface {{ $labels.device }}.

      - alert: network packet errors
        expr: |
          irate(node_network_receive_errs_total{device!="lo"}[5m]) +
          irate(node_network_transmit_errs_total{device!="lo"}[5m]) > 1
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.8.3
        annotations:
          description: >
            Node {{ $labels.instance }} experiences packet errors > 1
            packet/s on interface {{ $labels.device }}.

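      # predict_linear() fits the last 2 days of free-space samples and
      # extrapolates 5 days (3600 * 24 * 5 seconds) ahead; a negative result
      # means the filesystem is on track to fill up within 5 days. The join
      # with node_uname_info only adds the nodename label.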
      - alert: storage filling up
        expr: |
          predict_linear(node_filesystem_free_bytes[2d], 3600 * 24 * 5) *
          on(instance) group_left(nodename) node_uname_info < 0
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.8.4
        annotations:
          description: >
            Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }}
            will be full in less than 5 days assuming the average fill-up
            rate of the past 48 hours.

  - name: pools
    rules:
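      # stored / (stored + max_avail) is the fraction of the pool's capacity in
      # use; the group_right join against ceph_pool_metadata brings the pool's
      # "name" label into the result.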
      - alert: pool full
        expr: |
          ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)
          * on(pool_id) group_right ceph_pool_metadata * 100 > 90
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.9.1
        annotations:
          description: Pool {{ $labels.name }} at {{ $value | humanize }}% capacity.

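      # same predict_linear() approach as the node alert above: fires when the
      # growth of the last 2 days, projected 5 days ahead, reaches or exceeds
      # the space still available in the pool.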
      - alert: pool filling up
        expr: |
          (
            predict_linear(ceph_pool_stored[2d], 3600 * 24 * 5) >=
            ceph_pool_max_avail
          ) * on(pool_id) group_right(name) ceph_pool_metadata
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.9.2
        annotations:
          description: >
            Pool {{ $labels.name }} will be full in less than 5 days
            assuming the average fill-up rate of the past 48 hours.