rule_files:
  - ../prometheus_alerts.yml
evaluation_interval: 5m
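# These unit tests use the Prometheus rule unit-test format. Assuming a promtool
# binary is available, they can be run from this directory with, for example:
#   promtool test rules test_alerts.yml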
tests:
  # health error
  - interval: 5m
    input_series:
      - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
        values: '2 2 2 2 2 2 2'
    promql_expr_test:
      - expr: ceph_health_status == 2
        eval_time: 5m
        exp_samples:
          - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
            value: 2
    alert_rule_test:
      - eval_time: 1m
        alertname: CephHealthError
      - eval_time: 6m
        alertname: CephHealthError
        exp_alerts:
          - exp_labels:
              instance: ceph:9283
              job: ceph
              oid: 1.3.6.1.4.1.50495.1.2.1.2.1
              type: ceph_default
              severity: critical
            exp_annotations:
              summary: Ceph is in the ERROR state
              description: The cluster state has been HEALTH_ERROR for more than 5 minutes. Please check 'ceph health detail' for more information.

  # health warning
  - interval: 5m
    input_series:
      - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
        values: '1 1 1 1 1 1 1 1 1 1'
    promql_expr_test:
      - expr: ceph_health_status == 1
        eval_time: 15m
        exp_samples:
          - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
            value: 1
    alert_rule_test:
      - eval_time: 10m
        alertname: CephHealthWarning
      - eval_time: 20m
        alertname: CephHealthWarning
        exp_alerts:
          - exp_labels:
              instance: ceph:9283
              job: ceph
              type: ceph_default
              severity: warning
            exp_annotations:
              summary: Ceph is in the WARNING state
              description: The cluster state has been HEALTH_WARN for more than 15 minutes. Please check 'ceph health detail' for more information.

  # 10% OSDs down
  - interval: 1m
    input_series:
      - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
        values: '1 1 1 1 1'
      - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
        values: '0 0 0 0 0'
      - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
        values: '1 1 1 1 1'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
          ceph_version="ceph version 17.0.0-189-g3558fd72
          (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
          cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
          hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
          public_addr="172.20.0.2"}'
        values: '1 1 1 1 1'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
          ceph_version="ceph version 17.0.0-189-g3558fd72
          (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
          cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
          hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
          public_addr="172.20.0.2"}'
        values: '1 1 1 1 1'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
          ceph_version="ceph version 17.0.0-189-g3558fd72
          (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
          cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
          hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
          public_addr="172.20.0.2"}'
        values: '1 1 1 1 1'
    promql_expr_test:
      - expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
        eval_time: 1m
        exp_samples:
          - labels: '{}'
            value: 3.333333333333333E+01
    alert_rule_test:
      - eval_time: 1m
        alertname: CephOSDDownHigh
        exp_alerts:
          - exp_labels:
              oid: 1.3.6.1.4.1.50495.1.2.1.4.1
              type: ceph_default
              severity: critical
            exp_annotations:
              summary: More than 10% of OSDs are down
              description: "33.33% or 1 of 3 OSDs are down (>= 10%). The following OSDs are down: - osd.1 on ceph"

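  # Note on input_series values: in addition to explicit sample lists such as
  # '1 1 1', the expanding notation 'a+bxN' used below (e.g. '1+1x100') starts at
  # a and adds b for each of N further samples, so 'a+0xN' is a constant series.
  # (Explanatory comment only; it does not change any test.)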
  # flapping OSD
  - interval: 1s
    input_series:
      - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
        values: '1+1x100'
      - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
        values: '1+0x100'
      - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
        values: '1+0x100'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
          ceph_version="ceph version 17.0.0-189-g3558fd72
          (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
          cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
          hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
          public_addr="172.20.0.2"}'
        values: '1 1 1 1 1 1'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
          ceph_version="ceph version 17.0.0-189-g3558fd72
          (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
          cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
          hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
          public_addr="172.20.0.2"}'
        values: '1 1 1 1 1 1'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
          ceph_version="ceph version 17.0.0-189-g3558fd72
          (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
          cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
          hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
          public_addr="172.20.0.2"}'
        values: '1 1 1 1 1 1'
    promql_expr_test:
      - expr: |
          (
            rate(ceph_osd_up[5m])
            * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
          ) * 60 > 1
        eval_time: 1m
        exp_samples:
          - labels: '{ceph_daemon="osd.0", hostname="ceph", instance="ceph:9283",
              job="ceph"}'
            value: 1.2200000000000001E+01
    alert_rule_test:
      - eval_time: 5m
        alertname: CephOSDFlapping
        exp_alerts:
          - exp_labels:
              ceph_daemon: osd.0
              hostname: ceph
              instance: ceph:9283
              job: ceph
              oid: 1.3.6.1.4.1.50495.1.2.1.4.4
              severity: warning
              type: ceph_default
            exp_annotations:
              documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds
              summary: Network issues are causing OSDs to flap (mark each other down)
              description: "OSD osd.0 on ceph was marked down and back up 20.1 times once a minute for 5 minutes. This may indicate a network issue (latency, packet loss, MTU mismatch) on the cluster network, or the public network if no cluster network is deployed. Check the network stats on the listed host(s)."

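  # The CephPGImbalance expression below compares each OSD's PG count with the
  # per-job average and alerts when the relative deviation exceeds 0.30 (30%),
  # joining ceph_osd_metadata to pick up the hostname label.
  # (Explanatory comment only.)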
  # high pg count deviation
  - interval: 1m
    input_series:
      - series: 'ceph_osd_numpg{ceph_daemon="osd.0",instance="ceph:9283",
          job="ceph"}'
        values: '100 100 100 100 100 160'
      - series: 'ceph_osd_numpg{ceph_daemon="osd.1",instance="ceph:9283",
          job="ceph"}'
        values: '100 100 100 100 100 320'
      - series: 'ceph_osd_numpg{ceph_daemon="osd.2",instance="ceph:9283",
          job="ceph"}'
        values: '100 100 100 100 100 160'
      - series: 'ceph_osd_numpg{ceph_daemon="osd.3",instance="ceph:9283",
          job="ceph"}'
        values: '100 100 100 100 100 160'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
          ceph_version="ceph version 17.0.0-189-g3558fd72
          (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
          cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
          hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
          public_addr="172.20.0.2"}'
        values: '1 1 1 1 1 1'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
          ceph_version="ceph version 17.0.0-189-g3558fd72
          (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
          cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
          hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
          public_addr="172.20.0.2"}'
        values: '1 1 1 1 1 1'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
          ceph_version="ceph version 17.0.0-189-g3558fd72
          (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
          cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
          hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
          public_addr="172.20.0.2"}'
        values: '1 1 1 1 1 1'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.3",
          ceph_version="ceph version 17.0.0-189-g3558fd72
          (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
          cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
          hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
          public_addr="172.20.0.2"}'
        values: '1 1 1 1 1 1'
    promql_expr_test:
      - expr: |
          abs(
            (
              (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0)
                by (job)
            ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
          ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30

        eval_time: 5m
        exp_samples:
          - labels: '{ceph_daemon="osd.1", hostname="ceph", instance="ceph:9283",
              job="ceph"}'
            value: 6E-01
    alert_rule_test:
      - eval_time: 10m
        alertname: CephPGImbalance
        exp_alerts:
          - exp_labels:
              ceph_daemon: osd.1
              hostname: ceph
              instance: ceph:9283
              job: ceph
              oid: 1.3.6.1.4.1.50495.1.2.1.4.5
              severity: warning
              type: ceph_default
            exp_annotations:
              summary: PGs are not balanced across OSDs
              description: "OSD osd.1 on ceph deviates by more than 30% from average PG count."

  # pgs inactive
  - interval: 1m
    input_series:
      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
          name="device_health_metrics",pool_id="1"}'
        values: '1 1 1 1 1 1 1 1'
      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
          name="device_health_metrics",pool_id="2"}'
        values: '1 1 1 1 1 1 1 1'
      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
          name="device_health_metrics",pool_id="3"}'
        values: '1 1 1 1 1 1 1 1'
      - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
        values: '1 1 1 1 1 1 1 1'
      - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
        values: '32 32 32 32 32 32 32 32'
      - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
        values: '33 32 32 32 32 33 33 32'
      - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="1"}'
        values: '1 1 1 1 1 1 1 1 1'
      - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="2"}'
        values: '32 32 32 32 32 32 32 32'
      - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="3"}'
        values: '32 32 32 32 32 32 32 32'
    promql_expr_test:
      - expr: ceph_pool_metadata * on(pool_id,instance) group_left()
          (ceph_pg_total - ceph_pg_active) > 0
        eval_time: 5m
        exp_samples:
          - labels: '{instance="ceph:9283", job="ceph",
              name="device_health_metrics",
              pool_id="3"}'
            value: 1
    alert_rule_test:
      - eval_time: 5m
        alertname: CephPGsInactive
        exp_alerts:
          - exp_labels:
              instance: ceph:9283
              job: ceph
              name: device_health_metrics
              oid: 1.3.6.1.4.1.50495.1.2.1.7.1
              pool_id: 3
              severity: critical
              type: ceph_default
            exp_annotations:
              summary: One or more placement groups are inactive
              description: "1 PGs have been inactive for more than 5 minutes in pool device_health_metrics. Inactive placement groups are not able to serve read/write requests."

  # pgs unclean
  - interval: 1m
    input_series:
      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
          name="device_health_metrics",pool_id="1"}'
        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
          name="device_health_metrics",pool_id="2"}'
        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
          name="device_health_metrics",pool_id="3"}'
        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
      - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
      - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
        values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
          32 32 32'
      - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
        values: '33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33
          33 33'
      - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="1"}'
        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
      - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="2"}'
        values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
          32 32'
      - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="3"}'
        values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
          32 32'
    promql_expr_test:
      - expr: ceph_pool_metadata * on(pool_id,instance) group_left()
          (ceph_pg_total - ceph_pg_clean) > 0
        eval_time: 15m
        exp_samples:
          - labels: '{instance="ceph:9283", job="ceph",
              name="device_health_metrics", pool_id="3"}'
            value: 1
    alert_rule_test:
      - eval_time: 16m
        alertname: CephPGsUnclean
        exp_alerts:
          - exp_labels:
              instance: ceph:9283
              job: ceph
              name: device_health_metrics
              oid: 1.3.6.1.4.1.50495.1.2.1.7.2
              pool_id: 3
              severity: warning
              type: ceph_default
            exp_annotations:
              summary: One or more placement groups are marked unclean
              description: "1 PGs have been unclean for more than 15 minutes in pool device_health_metrics. Unclean PGs have not recovered from a previous failure."

  # root volume full
  - interval: 1m
    input_series:
      - series: 'node_filesystem_avail_bytes{device="/dev/mapper/fedora_localhost
          --live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
          mountpoint="/"}'
        values: '35336400896 35336400896 35336400896 35336400896 35336400896
          3525385519.104 3533640089'
      - series: 'node_filesystem_size_bytes{device="/dev/mapper/fedora_localhost
          --live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
          mountpoint="/"}'
        values: '73445531648 73445531648 73445531648 73445531648 73445531648
          73445531648 73445531648'
    promql_expr_test:
      - expr: node_filesystem_avail_bytes{mountpoint="/"} /
          node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
        eval_time: 5m
        exp_samples:
          - labels: '{device="/dev/mapper/fedora_localhost --live-home",
              fstype="ext4", instance="node-exporter", job="node-exporter",
              mountpoint="/"}'
            value: 4.8E+00
    alert_rule_test:
      - eval_time: 10m
        alertname: CephNodeRootFilesystemFull
        exp_alerts:
          - exp_labels:
              device: /dev/mapper/fedora_localhost --live-home
              fstype: ext4
              instance: node-exporter
              job: node-exporter
              mountpoint: /
              oid: 1.3.6.1.4.1.50495.1.2.1.8.1
              severity: critical
              type: ceph_default
            exp_annotations:
              summary: Root filesystem is dangerously full
              description: "Root volume is dangerously full: 4.811% free."

  # network packets dropped
  - interval: 1m
    input_series:
      - series: 'node_network_receive_drop_total{device="eth0",
          instance="node-exporter",job="node-exporter"}'
        values: '0+600x10'
      - series: 'node_network_transmit_drop_total{device="eth0",
          instance="node-exporter",job="node-exporter"}'
        values: '0+600x10'
      - series: 'node_network_receive_packets_total{device="eth0",
          instance="node-exporter",job="node-exporter"}'
        values: '0+750x10'
      - series: 'node_network_transmit_packets_total{device="eth0",
          instance="node-exporter",job="node-exporter"}'
        values: '0+750x10'
    promql_expr_test:
      - expr: |
          (
            rate(node_network_receive_drop_total{device!="lo"}[1m]) +
            rate(node_network_transmit_drop_total{device!="lo"}[1m])
          ) / (
            rate(node_network_receive_packets_total{device!="lo"}[1m]) +
            rate(node_network_transmit_packets_total{device!="lo"}[1m])
          ) >= 0.0050000000000000001 and (
            rate(node_network_receive_drop_total{device!="lo"}[1m]) +
            rate(node_network_transmit_drop_total{device!="lo"}[1m])
          ) >= 10

        eval_time: 5m
        exp_samples:
          - labels: '{device="eth0", instance="node-exporter",
              job="node-exporter"}'
            value: 8E-1
    alert_rule_test:
      - eval_time: 5m
        alertname: CephNodeNetworkPacketDrops
        exp_alerts:
          - exp_labels:
              device: eth0
              instance: node-exporter
              job: node-exporter
              oid: 1.3.6.1.4.1.50495.1.2.1.8.2
              severity: warning
              type: ceph_default
            exp_annotations:
              summary: One or more NICs reports packet drops
              description: "Node node-exporter experiences packet drop > 0.5% or > 10 packets/s on interface eth0."

  # network packets errors
  - interval: 1m
    input_series:
      - series: 'node_network_receive_errs_total{device="eth0",
          instance="node-exporter",job="node-exporter"}'
        values: '0+600x10'
      - series: 'node_network_transmit_errs_total{device="eth0",
          instance="node-exporter",job="node-exporter"}'
        values: '0+600x10'
      - series: 'node_network_transmit_packets_total{device="eth0",
          instance="node-exporter",job="node-exporter"}'
        values: '0+750x10'
      - series: 'node_network_receive_packets_total{device="eth0",
          instance="node-exporter",job="node-exporter"}'
        values: '0+750x10'
    promql_expr_test:
      - expr: |
          (
            rate(node_network_receive_errs_total{device!="lo"}[1m]) +
            rate(node_network_transmit_errs_total{device!="lo"}[1m])
          ) / (
            rate(node_network_receive_packets_total{device!="lo"}[1m]) +
            rate(node_network_transmit_packets_total{device!="lo"}[1m])
          ) >= 0.0001 or (
            rate(node_network_receive_errs_total{device!="lo"}[1m]) +
            rate(node_network_transmit_errs_total{device!="lo"}[1m])
          ) >= 10

        eval_time: 5m
        exp_samples:
          - labels: '{device="eth0", instance="node-exporter",
              job="node-exporter"}'
            value: 8E-01
    alert_rule_test:
      - eval_time: 5m
        alertname: CephNodeNetworkPacketErrors
        exp_alerts:
          - exp_labels:
              device: eth0
              instance: node-exporter
              job: node-exporter
              oid: 1.3.6.1.4.1.50495.1.2.1.8.3
              severity: warning
              type: ceph_default
            exp_annotations:
              summary: One or more NICs reports packet errors
              description: "Node node-exporter experiences packet errors > 0.01% or > 10 packets/s on interface eth0."

  # node storage disk space filling up
  - interval: 1m
    # 20GB = 21474836480, 256MB = 268435456
    input_series:
      - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root",
          fstype="xfs",instance="node-1",mountpoint="/rootfs"}'
        values: '21474836480-268435456x48'
      - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root",
          fstype="xfs",instance="node-2",mountpoint="/rootfs"}'
        values: '21474836480+0x48'
      - series: 'node_uname_info{instance="node-1", nodename="node-1.unittests.com"}'
        values: 1+0x48
      - series: 'node_uname_info{instance="node-2", nodename="node-2.unittests.com"}'
        values: 1+0x48
    promql_expr_test:
      - expr: |
          predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *
          on(instance) group_left(nodename) node_uname_info < 0
        eval_time: 5m
        exp_samples:
          - labels: '{device="/dev/mapper/vg-root",instance="node-1",fstype="xfs",
              mountpoint="/rootfs",nodename="node-1.unittests.com"}'
            value: -1.912602624E+12
    alert_rule_test:
      - eval_time: 5m
        alertname: CephNodeDiskspaceWarning
        exp_alerts:
          - exp_labels:
              severity: warning
              type: ceph_default
              oid: 1.3.6.1.4.1.50495.1.2.1.8.4
              device: /dev/mapper/vg-root
              fstype: xfs
              instance: node-1
              mountpoint: /rootfs
              nodename: node-1.unittests.com
            exp_annotations:
              summary: Host filesystem free space is getting low
              description: "Mountpoint /rootfs on node-1.unittests.com will be full in less than 5 days based on the 48 hour trailing fill rate."
  # MTU Mismatch
  - interval: 1m
    input_series:
      - series: 'node_network_mtu_bytes{device="eth0",instance="node-exporter",
          job="node-exporter"}'
        values: '1500 1500 1500 1500 1500'
      - series: 'node_network_mtu_bytes{device="eth1",instance="node-exporter",
          job="node-exporter"}'
        values: '1500 1500 1500 1500 1500'
      - series: 'node_network_mtu_bytes{device="eth2",instance="node-exporter",
          job="node-exporter"}'
        values: '1500 1500 1500 1500 1500'
      - series: 'node_network_mtu_bytes{device="eth3",instance="node-exporter",
          job="node-exporter"}'
        values: '1500 1500 1500 1500 1500'
      - series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter",
          job="node-exporter"}'
        values: '9000 9000 9000 9000 9000'
      - series: 'node_network_mtu_bytes{device="eth4",instance="hostname1",
          job="node-exporter"}'
        values: '2200 2200 2200 2200 2200'
      - series: 'node_network_mtu_bytes{device="eth4",instance="hostname2",
          job="node-exporter"}'
        values: '2400 2400 2400 2400 2400'
      - series: 'node_network_up{device="eth0",instance="node-exporter",
          job="node-exporter"}'
        values: '0 0 0 0 0'
      - series: 'node_network_up{device="eth1",instance="node-exporter",
          job="node-exporter"}'
        values: '0 0 0 0 0'
      - series: 'node_network_up{device="eth2",instance="node-exporter",
          job="node-exporter"}'
        values: '1 1 1 1 1'
      - series: 'node_network_up{device="eth3",instance="node-exporter",
          job="node-exporter"}'
        values: '1 1 1 1 1'
      - series: 'node_network_up{device="eth4",instance="node-exporter",
          job="node-exporter"}'
        values: '1 1 1 1 1'
      - series: 'node_network_up{device="eth4",instance="hostname1",
          job="node-exporter"}'
        values: '1 1 1 1 1'
      - series: 'node_network_up{device="eth4",instance="hostname2",
          job="node-exporter"}'
        values: '0 0 0 0 0'
    promql_expr_test:
      - expr: |
          node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
          scalar(
            max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
            quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
          )
          or
          node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
          scalar(
            min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
            quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
          )
        eval_time: 1m
        exp_samples:
          - labels: '{device="eth4", instance="node-exporter", job="node-exporter"}'
            value: 9000
          - labels: '{device="eth4", instance="hostname1", job="node-exporter"}'
            value: 2200
    alert_rule_test:
      - eval_time: 1m
        alertname: CephNodeInconsistentMTU
        exp_alerts:
          - exp_labels:
              device: eth4
              instance: hostname1
              job: node-exporter
              severity: warning
              type: ceph_default
            exp_annotations:
              summary: MTU settings across Ceph hosts are inconsistent
              description: "Node hostname1 has a different MTU size (2200) than the median of devices named eth4."
          - exp_labels:
              device: eth4
              instance: node-exporter
              job: node-exporter
              severity: warning
              type: ceph_default
            exp_annotations:
              summary: MTU settings across Ceph hosts are inconsistent
              description: "Node node-exporter has a different MTU size (9000) than the median of devices named eth4."

  # pool full: the input data has 6 pool series but the description uses topk(5),
  # so this checks that the top-5 breakdown works as expected
  - interval: 1m
    input_series:
      - series: 'ceph_health_detail{name="POOL_FULL"}'
        values: '0 0 0 1 1 1 1 1 1 1 1'
      - series: 'ceph_pool_percent_used{pool_id="1"}'
        values: '32+0x10'
      - series: 'ceph_pool_percent_used{pool_id="2"}'
        values: '96+0x10'
      - series: 'ceph_pool_percent_used{pool_id="3"}'
        values: '90+0x10'
      - series: 'ceph_pool_percent_used{pool_id="4"}'
        values: '72+0x10'
      - series: 'ceph_pool_percent_used{pool_id="5"}'
        values: '19+0x10'
      - series: 'ceph_pool_percent_used{pool_id="6"}'
        values: '10+0x10'
      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
          name="cephfs_data",pool_id="1"}'
        values: '1 1 1 1 1 1 1 1 1'
      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
          name="rbd",pool_id="2"}'
        values: '1 1 1 1 1 1 1 1 1'
      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
          name="iscsi",pool_id="3"}'
        values: '1 1 1 1 1 1 1 1 1'
      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
          name="default.rgw.index",pool_id="4"}'
        values: '1 1 1 1 1 1 1 1 1'
      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
          name="default.rgw.log",pool_id="5"}'
        values: '1 1 1 1 1 1 1 1 1'
      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
          name="dummy",pool_id="6"}'
        values: '1 1 1 1 1 1 1 1 1'
    promql_expr_test:
      - expr: ceph_health_detail{name="POOL_FULL"} > 0
        eval_time: 5m
        exp_samples:
          - labels: '{__name__="ceph_health_detail", name="POOL_FULL"}'
            value: 1
    alert_rule_test:
      - eval_time: 1m
        alertname: CephPoolFull
      - eval_time: 10m
        alertname: CephPoolFull
        exp_alerts:
          - exp_labels:
              name: POOL_FULL
              severity: critical
              type: ceph_default
              oid: 1.3.6.1.4.1.50495.1.2.1.9.1
            exp_annotations:
              documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full
              summary: Pool is full - writes are blocked
              description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) - rbd at 96% - iscsi at 90% - default.rgw.index at 72% - cephfs_data at 32% - default.rgw.log at 19% Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)"
  # slow OSD ops
  - interval: 1m
    input_series:
      - series: 'ceph_healthcheck_slow_ops{instance="ceph:9283",job="ceph"}'
        values: '1+0x120'
    promql_expr_test:
      - expr: ceph_healthcheck_slow_ops > 0
        eval_time: 1m
        exp_samples:
          - labels: '{__name__="ceph_healthcheck_slow_ops", instance="ceph:9283",
              job="ceph"}'
            value: 1
    alert_rule_test:
      - eval_time: 20m
        alertname: CephSlowOps
        exp_alerts:
          - exp_labels:
              instance: ceph:9283
              job: ceph
              severity: warning
              type: ceph_default
            exp_annotations:
              documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
              summary: OSD operations are slow to complete
              description: "1 OSD requests are taking too long to process (osd_op_complaint_time exceeded)"

  # slow daemon ops
  - interval: 1m
    input_series:
      - series: 'ceph_daemon_health_metrics{ceph_daemon="osd.1", instance="ceph:9283",job="ceph", type="SLOW_OPS"}'
        values: '1+0x120'
    promql_expr_test:
      - expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0'
        eval_time: 1m
        exp_samples:
          - labels: '{__name__="ceph_daemon_health_metrics", ceph_daemon="osd.1",instance="ceph:9283",
              job="ceph", type="SLOW_OPS"}'
            value: 1
    alert_rule_test:
      - eval_time: 20m
        alertname: CephDaemonSlowOps
        exp_alerts:
          - exp_labels:
              instance: ceph:9283
              ceph_daemon: "osd.1"
              job: ceph
              severity: warning
              type: ceph_default
            exp_annotations:
              documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
              summary: osd.1 operations are slow to complete
              description: "osd.1 operations are taking too long to process (complaint time exceeded)"

  # CEPHADM orchestrator alert triggers
  - interval: 30s
    input_series:
      - series: 'ceph_health_detail{name="UPGRADE_EXCEPTION"}'
        values: '1+0x40'
    promql_expr_test:
      - expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0
        eval_time: 2m
        exp_samples:
          - labels: '{__name__="ceph_health_detail", name="UPGRADE_EXCEPTION"}'
            value: 1
    alert_rule_test:
      - eval_time: 1m
        alertname: CephadmUpgradeFailed
      - eval_time: 5m
        alertname: CephadmUpgradeFailed
        exp_alerts:
          - exp_labels:
              name: UPGRADE_EXCEPTION
              severity: critical
              type: ceph_default
              oid: 1.3.6.1.4.1.50495.1.2.1.11.2
            exp_annotations:
              summary: Ceph version upgrade has failed
              description: "The cephadm cluster upgrade process has failed. The cluster remains in an undetermined state. Please review the cephadm logs, to understand the nature of the issue"
  - interval: 30s
    input_series:
      - series: 'ceph_health_detail{name="CEPHADM_FAILED_DAEMON"}'
        values: '1+0x40'
    promql_expr_test:
      - expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0
        eval_time: 2m
        exp_samples:
          - labels: '{__name__="ceph_health_detail", name="CEPHADM_FAILED_DAEMON"}'
            value: 1
    alert_rule_test:
      - eval_time: 1m
        alertname: CephadmDaemonFailed
      - eval_time: 5m
        alertname: CephadmDaemonFailed
        exp_alerts:
          - exp_labels:
              name: CEPHADM_FAILED_DAEMON
              severity: critical
              type: ceph_default
              oid: 1.3.6.1.4.1.50495.1.2.1.11.1
            exp_annotations:
              summary: A ceph daemon manged by cephadm is down
              description: "A daemon managed by cephadm is no longer active. Determine, which daemon is down with 'ceph health detail'. you may start daemons with the 'ceph orch daemon start <daemon_id>'"
  - interval: 1m
    input_series:
      - series: 'ceph_health_detail{name="CEPHADM_PAUSED"}'
        values: '1 1 1 1 1 1 1 1 1'
    promql_expr_test:
      - expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0
        eval_time: 2m
        exp_samples:
          - labels: '{__name__="ceph_health_detail", name="CEPHADM_PAUSED"}'
            value: 1
    alert_rule_test:
      - eval_time: 1m
        alertname: CephadmPaused
      - eval_time: 5m
        alertname: CephadmPaused
        exp_alerts:
          - exp_labels:
              name: CEPHADM_PAUSED
              severity: warning
              type: ceph_default
            exp_annotations:
              documentation: https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused
              summary: Orchestration tasks via cephadm are PAUSED
              description: "Cluster management has been paused manually. This will prevent the orchestrator from service management and reconciliation. If this is not intentional, resume cephadm operations with 'ceph orch resume'"
  # MDS
  - interval: 1m
    input_series:
      - series: 'ceph_health_detail{name="MDS_DAMAGE"}'
        values: '1 1 1 1 1 1 1 1 1'
    promql_expr_test:
      - expr: ceph_health_detail{name="MDS_DAMAGE"} > 0
        eval_time: 2m
        exp_samples:
          - labels: '{__name__="ceph_health_detail", name="MDS_DAMAGE"}'
            value: 1
    alert_rule_test:
      - eval_time: 1m
        alertname: CephFilesystemDamaged
      - eval_time: 5m
        alertname: CephFilesystemDamaged
        exp_alerts:
          - exp_labels:
              name: MDS_DAMAGE
              severity: critical
              type: ceph_default
              oid: 1.3.6.1.4.1.50495.1.2.1.5.1
            exp_annotations:
              documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
              summary: CephFS filesystem is damaged.
              description: "Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support."
  - interval: 1m
    input_series:
      - series: 'ceph_health_detail{name="MDS_HEALTH_READ_ONLY"}'
        values: '1 1 1 1 1 1 1 1 1'
    promql_expr_test:
      - expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0
        eval_time: 2m
        exp_samples:
          - labels: '{__name__="ceph_health_detail", name="MDS_HEALTH_READ_ONLY"}'
            value: 1
    alert_rule_test:
      - eval_time: 1m
        alertname: CephFilesystemReadOnly
      - eval_time: 5m
        alertname: CephFilesystemReadOnly
        exp_alerts:
          - exp_labels:
              name: MDS_HEALTH_READ_ONLY
              severity: critical
              type: ceph_default
              oid: 1.3.6.1.4.1.50495.1.2.1.5.2
            exp_annotations:
              documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
              summary: CephFS filesystem in read only mode due to write error(s)
              description: "The filesystem has switched to READ ONLY due to an unexpected error when writing to the metadata pool. Either analyze the output from the MDS daemon admin socket, or escalate to support."
  - interval: 1m
    input_series:
      - series: 'ceph_health_detail{name="MDS_ALL_DOWN"}'
        values: '0 0 1 1 1 1 1 1 1 1 1'
    promql_expr_test:
      - expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0
        eval_time: 2m
        exp_samples:
          - labels: '{__name__="ceph_health_detail", name="MDS_ALL_DOWN"}'
            value: 1
    alert_rule_test:
      - eval_time: 1m
        alertname: CephFilesystemOffline
      - eval_time: 10m
        alertname: CephFilesystemOffline
        exp_alerts:
          - exp_labels:
              name: MDS_ALL_DOWN
              severity: critical
              type: ceph_default
              oid: 1.3.6.1.4.1.50495.1.2.1.5.3
            exp_annotations:
              documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down
              summary: CephFS filesystem is offline
              description: "All MDS ranks are unavailable. The MDS daemons managing metadata are down, rendering the filesystem offline."
  - interval: 1m
    input_series:
      - series: 'ceph_health_detail{name="FS_DEGRADED"}'
        values: '0 0 1 1 1 1 1 1 1 1 1'
    promql_expr_test:
      - expr: ceph_health_detail{name="FS_DEGRADED"} > 0
        eval_time: 2m
        exp_samples:
          - labels: '{__name__="ceph_health_detail", name="FS_DEGRADED"}'
            value: 1
    alert_rule_test:
      - eval_time: 1m
        alertname: CephFilesystemDegraded
      - eval_time: 10m
        alertname: CephFilesystemDegraded
        exp_alerts:
          - exp_labels:
              name: FS_DEGRADED
              severity: critical
              type: ceph_default
              oid: 1.3.6.1.4.1.50495.1.2.1.5.4
            exp_annotations:
              documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded
              summary: CephFS filesystem is degraded
              description: "One or more metadata daemons (MDS ranks) are failed or in a damaged state. At best the filesystem is partially available, at worst the filesystem is completely unusable."
  - interval: 1m
    input_series:
      - series: 'ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"}'
        values: '0 0 1 1 1 1 1 1 1 1 1'
    promql_expr_test:
      - expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0
        eval_time: 2m
        exp_samples:
          - labels: '{__name__="ceph_health_detail", name="MDS_INSUFFICIENT_STANDBY"}'
            value: 1
    alert_rule_test:
      - eval_time: 1m
        alertname: CephFilesystemInsufficientStandby
      - eval_time: 10m
        alertname: CephFilesystemInsufficientStandby
        exp_alerts:
          - exp_labels:
              name: MDS_INSUFFICIENT_STANDBY
              severity: warning
              type: ceph_default
            exp_annotations:
              documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby
              summary: Ceph filesystem standby daemons too few
              description: "The minimum number of standby daemons required by standby_count_wanted is less than the current number of standby daemons. Adjust the standby count or increase the number of MDS daemons."
  - interval: 1m
    input_series:
      - series: 'ceph_health_detail{name="FS_WITH_FAILED_MDS"}'
        values: '0 0 1 1 1 1 1 1 1 1 1'
    promql_expr_test:
      - expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0
        eval_time: 2m
        exp_samples:
          - labels: '{__name__="ceph_health_detail", name="FS_WITH_FAILED_MDS"}'
            value: 1
    alert_rule_test:
      - eval_time: 1m
        alertname: CephFilesystemFailureNoStandby
      - eval_time: 10m
        alertname: CephFilesystemFailureNoStandby
        exp_alerts:
          - exp_labels:
              name: FS_WITH_FAILED_MDS
              severity: critical
              type: ceph_default
              oid: 1.3.6.1.4.1.50495.1.2.1.5.5
            exp_annotations:
              documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds
              summary: MDS daemon failed, no further standby available
              description: "An MDS daemon has failed, leaving only one active rank and no available standby. Investigate the cause of the failure or add a standby MDS."
  - interval: 1m
    input_series:
      - series: 'ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"}'
        values: '0 0 1 1 1 1 1 1 1 1 1'
    promql_expr_test:
      - expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0
        eval_time: 2m
        exp_samples:
          - labels: '{__name__="ceph_health_detail", name="MDS_UP_LESS_THAN_MAX"}'
            value: 1
    alert_rule_test:
      - eval_time: 1m
        alertname: CephFilesystemMDSRanksLow
      - eval_time: 10m
        alertname: CephFilesystemMDSRanksLow
        exp_alerts:
          - exp_labels:
              name: MDS_UP_LESS_THAN_MAX
              severity: warning
              type: ceph_default
            exp_annotations:
              documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max
              summary: Ceph MDS daemon count is lower than configured
              description: "The filesystem's 'max_mds' setting defines the number of MDS ranks in the filesystem. The current number of active MDS daemons is less than this value."
  # MGR
  - interval: 1m
    input_series:
      - series: 'up{job="ceph", instance="ceph-mgr:9283"}'
        values: '1+0x2 0+0x10'
    promql_expr_test:
      - expr: up{job="ceph"} == 0
        eval_time: 3m
        exp_samples:
          - labels: '{__name__="up", job="ceph", instance="ceph-mgr:9283"}'
            value: 0
    alert_rule_test:
      - eval_time: 1m
        alertname: CephMgrPrometheusModuleInactive
      - eval_time: 10m
        alertname: CephMgrPrometheusModuleInactive
        exp_alerts:
          - exp_labels:
              instance: ceph-mgr:9283
              job: ceph
              severity: critical
              type: ceph_default
              oid: 1.3.6.1.4.1.50495.1.2.1.6.2
            exp_annotations:
              summary: The mgr/prometheus module is not available
              description: "The mgr/prometheus module at ceph-mgr:9283 is unreachable. This could mean that the module has been disabled or the mgr daemon itself is down. Without the mgr/prometheus module metrics and alerts will no longer function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to to determine whether the mgr is active. If the mgr is not active, restart it, otherwise you can determine module status with 'ceph mgr module ls'. If it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'."
  - interval: 1m
    input_series:
      - series: 'ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"}'
        values: '0+0x2 1+0x20'
    promql_expr_test:
      - expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1
        eval_time: 3m
        exp_samples:
          - labels: '{__name__="ceph_health_detail", name="RECENT_MGR_MODULE_CRASH"}'
            value: 1
    alert_rule_test:
      - eval_time: 1m
        alertname: CephMgrModuleCrash
      - eval_time: 15m
        alertname: CephMgrModuleCrash
        exp_alerts:
          - exp_labels:
              name: RECENT_MGR_MODULE_CRASH
              severity: critical
              type: ceph_default
              oid: 1.3.6.1.4.1.50495.1.2.1.6.1
            exp_annotations:
              documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash
              summary: A manager module has recently crashed
              description: "One or more mgr modules have crashed and have yet to be acknowledged by an administrator. A crashed module may impact functionality within the cluster. Use the 'ceph crash' command to determine which module has failed, and archive it to acknowledge the failure."
  # MON
  - interval: 1m
    input_series:
      - series: 'ceph_health_detail{name="MON_DISK_CRIT"}'
        values: '0+0x2 1+0x10'
      - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}'
        values: '1+0x13'
    promql_expr_test:
      - expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1
        eval_time: 3m
        exp_samples:
          - labels: '{__name__="ceph_health_detail", name="MON_DISK_CRIT"}'
            value: 1
    alert_rule_test:
      - eval_time: 1m
        alertname: CephMonDiskspaceCritical
      - eval_time: 10m
        alertname: CephMonDiskspaceCritical
        exp_alerts:
          - exp_labels:
              name: "MON_DISK_CRIT"
              severity: critical
              type: ceph_default
              oid: 1.3.6.1.4.1.50495.1.2.1.3.2
            exp_annotations:
              documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit
              summary: Filesystem space on at least one monitor is critically low
              description: "The free space available to a monitor's store is critically low. You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; - ceph-mon-a"
  - interval: 1m
    input_series:
      - series: 'ceph_health_detail{name="MON_DISK_LOW"}'
        values: '0+0x2 1+0x10'
      - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}'
        values: '1+0x13'
    promql_expr_test:
      - expr: ceph_health_detail{name="MON_DISK_LOW"} == 1
        eval_time: 3m
        exp_samples:
          - labels: '{__name__="ceph_health_detail", name="MON_DISK_LOW"}'
            value: 1
    alert_rule_test:
      - eval_time: 1m
        alertname: CephMonDiskspaceLow
      - eval_time: 10m
        alertname: CephMonDiskspaceLow
        exp_alerts:
          - exp_labels:
              name: "MON_DISK_LOW"
              severity: warning
              type: ceph_default
            exp_annotations:
              documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low
              summary: Drive space on at least one monitor is approaching full
              description: "The space available to a monitor's store is approaching full (>70% is the default). You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; - ceph-mon-a"
  - interval: 1m
    input_series:
      - series: 'ceph_health_detail{name="MON_CLOCK_SKEW"}'
        values: '0+0x2 1+0x10'
    promql_expr_test:
      - expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1
        eval_time: 3m
        exp_samples:
          - labels: '{__name__="ceph_health_detail", name="MON_CLOCK_SKEW"}'
            value: 1
    alert_rule_test:
      - eval_time: 1m
        alertname: CephMonClockSkew
      - eval_time: 10m
        alertname: CephMonClockSkew
        exp_alerts:
          - exp_labels:
              name: "MON_CLOCK_SKEW"
              severity: warning
              type: ceph_default
            exp_annotations:
              documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew
              summary: Clock skew detected among monitors
              description: "Ceph monitors rely on closely synchronized time to maintain quorum and cluster consistency. This event indicates that the time on at least one mon has drifted too far from the lead mon. Review cluster status with ceph -s. This will show which monitors are affected. Check the time sync status on each monitor host with 'ceph time-sync-status' and the state and peers of your ntpd or chrony daemon."

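  # Monitor quorum needs a strict majority, i.e. floor(n/2) + 1 of the n monitors
  # reported by ceph_mon_metadata (2 of 3, 3 of 5). The next two tests cover the
  # critical "quorum at risk" case and the warning-only "1 of 5 down" case.
  # (Explanatory comment only.)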
  # Check 3 mons one down, quorum at risk
  - interval: 1m
    input_series:
      - series: 'ceph_health_detail{name="MON_DOWN"}'
        values: '0+0x2 1+0x12'
      - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}'
        values: '1+0x14'
      - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}'
        values: '1+0x14'
      - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}'
        values: '1+0x2 0+0x12'
      - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}'
        values: '1+0x14'
      - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}'
        values: '1+0x14'
      - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}'
        values: '1+0x14'
    promql_expr_test:
      - expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() (count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1))) == 1
        eval_time: 3m
        exp_samples:
          - labels: '{}'
            value: 1
    alert_rule_test:
      - eval_time: 1m
        alertname: CephMonDownQuorumAtRisk
        # shouldn't fire
      - eval_time: 10m
        alertname: CephMonDownQuorumAtRisk
        exp_alerts:
          - exp_labels:
              severity: critical
              type: ceph_default
              oid: 1.3.6.1.4.1.50495.1.2.1.3.1
            exp_annotations:
              documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
              summary: Monitor quorum is at risk
              description: "Quorum requires a majority of monitors (x 2) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: - mon.c on ceph-mon-3"
  # check 5 mons, 1 down - warning only
  - interval: 1m
    input_series:
      - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}'
        values: '1+0x14'
      - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}'
        values: '1+0x14'
      - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}'
        values: '1+0x14'
      - series: 'ceph_mon_quorum_status{ceph_daemon="mon.d"}'
        values: '1+0x14'
      - series: 'ceph_mon_quorum_status{ceph_daemon="mon.e"}'
        values: '1+0x2 0+0x12'
      - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}'
        values: '1+0x14'
      - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}'
        values: '1+0x14'
      - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}'
        values: '1+0x14'
      - series: 'ceph_mon_metadata{ceph_daemon="mon.d", hostname="ceph-mon-4"}'
        values: '1+0x14'
      - series: 'ceph_mon_metadata{ceph_daemon="mon.e", hostname="ceph-mon-5"}'
        values: '1+0x14'
    promql_expr_test:
      - expr: (count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1))
        eval_time: 3m
        exp_samples:
          - labels: '{}'
            value: 1
    alert_rule_test:
      - eval_time: 1m
        alertname: CephMonDown
      - eval_time: 10m
        alertname: CephMonDown
        exp_alerts:
          - exp_labels:
              severity: warning
              type: ceph_default
            exp_annotations:
              documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
              summary: One or more monitors down
              description: "You have 1 monitor down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: - mon.e on ceph-mon-5\n"
  # Device Health
  - interval: 1m
    input_series:
      - series: 'ceph_health_detail{name="DEVICE_HEALTH"}'
        values: '0+0x2 1+0x10'
    promql_expr_test:
      - expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1
        eval_time: 3m
        exp_samples:
          - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH"}'
            value: 1
    alert_rule_test:
      - eval_time: 1m
        alertname: CephDeviceFailurePredicted
      - eval_time: 10m
        alertname: CephDeviceFailurePredicted
        exp_alerts:
          - exp_labels:
              name: "DEVICE_HEALTH"
              severity: warning
              type: ceph_default
            exp_annotations:
              documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2
              summary: Device(s) predicted to fail soon
              description: "The device health module has determined that one or more devices will fail soon. To review device status use 'ceph device ls'. To show a specific device use 'ceph device info <dev id>'. Mark the OSD out so that data may migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace the device, and redeploy the OSD."
  - interval: 1m
    input_series:
      - series: 'ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"}'
        values: '0+0x2 1+0x10'
    promql_expr_test:
      - expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1
        eval_time: 3m
        exp_samples:
          - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_TOOMANY"}'
            value: 1
    alert_rule_test:
      - eval_time: 1m
        alertname: CephDeviceFailurePredictionTooHigh
      - eval_time: 10m
        alertname: CephDeviceFailurePredictionTooHigh
        exp_alerts:
          - exp_labels:
              name: "DEVICE_HEALTH_TOOMANY"
              severity: critical
              type: ceph_default
              oid: 1.3.6.1.4.1.50495.1.2.1.4.7
            exp_annotations:
              documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany
              summary: Too many devices are predicted to fail, unable to resolve
              description: "The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availabililty. Prevent data integrity issues by adding new OSDs so that data may be relocated."
  - interval: 1m
    input_series:
      - series: 'ceph_health_detail{name="DEVICE_HEALTH_IN_USE"}'
        values: '0+0x2 1+0x10'
    promql_expr_test:
      - expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1
        eval_time: 3m
        exp_samples:
          - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_IN_USE"}'
            value: 1
    alert_rule_test:
      - eval_time: 1m
        alertname: CephDeviceFailureRelocationIncomplete
      - eval_time: 10m
        alertname: CephDeviceFailureRelocationIncomplete
        exp_alerts:
          - exp_labels:
              name: "DEVICE_HEALTH_IN_USE"
              severity: warning
              type: ceph_default
            exp_annotations:
              documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use
              summary: Device failure is predicted, but unable to relocate data
              description: "The device health module has determined that one or more devices will fail soon, but the normal process of relocating the data on the device to other OSDs in the cluster is blocked. \nEnsure that the cluster has available free space. It may be necessary to add capacity to the cluster to allow data from the failing device to successfully migrate, or to enable the balancer."
1240# OSD
1241 - interval: 1m
1242 input_series:
1243 - series: 'ceph_health_detail{name="OSD_HOST_DOWN"}'
1244 values: '0+0x2 1+0x10'
1245 - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
1246 values: '1+0x2 0+0x10'
1247 - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}'
1248 values: '1+0x12'
1249 promql_expr_test:
1250 - expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1
1251 eval_time: 3m
1252 exp_samples:
1253 - labels: '{__name__="ceph_health_detail", name="OSD_HOST_DOWN"}'
1254 value: 1
1255 alert_rule_test:
1256 - eval_time: 1m
1257 alertname: CephOSDHostDown
1258 - eval_time: 10m
1259 alertname: CephOSDHostDown
1260 exp_alerts:
1261 - exp_labels:
1262 name: "OSD_HOST_DOWN"
1263 severity: warning
1264 type: ceph_default
1265 oid: 1.3.6.1.4.1.50495.1.2.1.4.8
1266 exp_annotations:
1267 summary: An OSD host is offline
2a845540 1268 description: "The following OSDs are down: - ceph-osd-1 : osd.0"
20effc67
TL
1269 - interval: 1m
1270 input_series:
1271 - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"}'
1272 values: '0+0x2 1+0x20'
1273 promql_expr_test:
1274 - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 0
1275 eval_time: 1m
1276 exp_samples:
1277 - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_FRONT"}'
1278 value: 0
1279 alert_rule_test:
1280 - eval_time: 1m
1281 alertname: CephOSDTimeoutsPublicNetwork
1282 - eval_time: 10m
1283 alertname: CephOSDTimeoutsPublicNetwork
1284 exp_alerts:
1285 - exp_labels:
1286 name: "OSD_SLOW_PING_TIME_FRONT"
1287 severity: warning
1288 type: ceph_default
1289 exp_annotations:
1290 summary: Network issues delaying OSD heartbeats (public network)
2a845540 1291 description: "OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network for latency or loss issues. Use 'ceph health detail' to show the affected OSDs."
20effc67
TL
1292 - interval: 1m
1293 input_series:
1294 - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"}'
1295 values: '0+0x2 1+0x20'
1296 promql_expr_test:
1297 - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 0
1298 eval_time: 1m
1299 exp_samples:
1300 - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_BACK"}'
1301 value: 0
1302 alert_rule_test:
1303 - eval_time: 1m
1304 alertname: CephOSDTimeoutsClusterNetwork
1305 - eval_time: 10m
1306 alertname: CephOSDTimeoutsClusterNetwork
1307 exp_alerts:
1308 - exp_labels:
1309 name: "OSD_SLOW_PING_TIME_BACK"
1310 severity: warning
1311 type: ceph_default
1312 exp_annotations:
1313 summary: Network issues delaying OSD heartbeats (cluster network)
2a845540 1314 description: "OSD heartbeats on the cluster's 'cluster' network (backend) are slow. Investigate the network for latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs."
20effc67
TL
1315 - interval: 1m
1316 input_series:
1317 - series: 'ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"}'
1318 values: '0+0x2 1+0x20'
1319 promql_expr_test:
1320 - expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 0
1321 eval_time: 1m
1322 exp_samples:
1323 - labels: '{__name__="ceph_health_detail", name="BLUESTORE_DISK_SIZE_MISMATCH"}'
1324 value: 0
1325 alert_rule_test:
1326 - eval_time: 1m
1327 alertname: CephOSDInternalDiskSizeMismatch
1328 - eval_time: 10m
1329 alertname: CephOSDInternalDiskSizeMismatch
1330 exp_alerts:
1331 - exp_labels:
1332 name: "BLUESTORE_DISK_SIZE_MISMATCH"
1333 severity: warning
1334 type: ceph_default
1335 exp_annotations:
1336 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch
1337 summary: OSD size inconsistency error
2a845540 1338 description: "One or more OSDs have an internal inconsistency between metadata and the size of the device. This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs."
20effc67
TL
1339 - interval: 30s
1340 input_series:
1341 - series: 'ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"}'
1342 values: '0+0x2 1+0x20'
1343 promql_expr_test:
1344 - expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1
1345 eval_time: 3m
1346 exp_samples:
1347 - labels: '{__name__="ceph_health_detail", name="BLUESTORE_SPURIOUS_READ_ERRORS"}'
1348 value: 1
1349 alert_rule_test:
1350 - eval_time: 1m
1351 alertname: CephOSDReadErrors
1352 - eval_time: 10m
1353 alertname: CephOSDReadErrors
1354 exp_alerts:
1355 - exp_labels:
1356 name: "BLUESTORE_SPURIOUS_READ_ERRORS"
1357 severity: warning
1358 type: ceph_default
1359 exp_annotations:
1360 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors
1361 summary: Device read errors detected
2a845540 1362 description: "An OSD has encountered read errors, but the OSD has recovered by retrying the reads. This may indicate an issue with hardware or the kernel."
20effc67
TL
1363 - interval: 1m
1364 input_series:
1365 - series: 'ceph_health_detail{name="OSD_DOWN"}'
1366 values: '0+0x2 1+0x10'
1367 - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
1368 values: '1+0x12'
1369 - series: 'ceph_osd_up{ceph_daemon="osd.1"}'
1370 values: '1+0x2 0+0x10'
1371 - series: 'ceph_osd_up{ceph_daemon="osd.2"}'
1372 values: '1+0x12'
1373 - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}'
1374 values: '1+0x12'
1375 - series: 'ceph_osd_metadata{ceph_daemon="osd.1", hostname="ceph-osd-2"}'
1376 values: '1+0x12'
1377 - series: 'ceph_osd_metadata{ceph_daemon="osd.2", hostname="ceph-osd-3"}'
1378 values: '1+0x12'
1379 promql_expr_test:
1380 - expr: ceph_health_detail{name="OSD_DOWN"} == 1
1381 eval_time: 3m
1382 exp_samples:
1383 - labels: '{__name__="ceph_health_detail", name="OSD_DOWN"}'
1384 value: 1
1385 alert_rule_test:
1386 - eval_time: 1m
1387 alertname: CephOSDDown
1388 - eval_time: 10m
1389 alertname: CephOSDDown
1390 exp_alerts:
1391 - exp_labels:
1392 name: "OSD_DOWN"
1393 severity: warning
1394 type: ceph_default
1395 oid: 1.3.6.1.4.1.50495.1.2.1.4.2
1396 exp_annotations:
1397 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down
1398 summary: An OSD has been marked down
1399 description: "1 OSD down for over 5mins. The following OSD is down: - osd.1 on ceph-osd-2\n"
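  # For reference, a rough sketch of the shape of rule this case exercises; the
  # authoritative definition lives in ../prometheus_alerts.yml, and its exact
  # 'for:' duration and description template may differ:
  #
  #   - alert: CephOSDDown
  #     expr: ceph_health_detail{name="OSD_DOWN"} == 1
  #     for: 5m
  #     labels:
  #       severity: warning
  #       type: ceph_default
  #       oid: 1.3.6.1.4.1.50495.1.2.1.4.2
  #     annotations:
  #       summary: An OSD has been marked down
  #       description: built by templating over ceph_osd_up and ceph_osd_metadata,
  #         which is why those series are part of the input above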
1400 - interval: 1m
1401 input_series:
1402 - series: 'ceph_health_detail{name="OSD_NEARFULL"}'
1403 values: '0+0x2 1+0x10'
1404 promql_expr_test:
1405 - expr: ceph_health_detail{name="OSD_NEARFULL"} == 1
1406 eval_time: 3m
1407 exp_samples:
1408 - labels: '{__name__="ceph_health_detail", name="OSD_NEARFULL"}'
1409 value: 1
1410 alert_rule_test:
1411 - eval_time: 1m
1412 alertname: CephOSDNearFull
1413 - eval_time: 10m
1414 alertname: CephOSDNearFull
1415 exp_alerts:
1416 - exp_labels:
1417 name: "OSD_NEARFULL"
1418 severity: warning
1419 type: ceph_default
1420 oid: 1.3.6.1.4.1.50495.1.2.1.4.3
1421 exp_annotations:
1422 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull
1423 summary: OSD(s) running low on free space (NEARFULL)
1424 description: One or more OSDs have reached the NEARFULL threshold. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.
1425 - interval: 1m
1426 input_series:
1427 - series: 'ceph_health_detail{name="OSD_FULL"}'
1428 values: '0+0x2 1+0x10'
1429 promql_expr_test:
1430 - expr: ceph_health_detail{name="OSD_FULL"} == 1
1431 eval_time: 3m
1432 exp_samples:
1433 - labels: '{__name__="ceph_health_detail", name="OSD_FULL"}'
1434 value: 1
1435 alert_rule_test:
1436 - eval_time: 1m
1437 alertname: CephOSDFull
1438 - eval_time: 10m
1439 alertname: CephOSDFull
1440 exp_alerts:
1441 - exp_labels:
1442 name: "OSD_FULL"
1443 severity: critical
1444 type: ceph_default
1445 oid: 1.3.6.1.4.1.50495.1.2.1.4.6
1446 exp_annotations:
1447 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full
1448 summary: OSD full, writes blocked
1449 description: An OSD has reached the FULL threshold. Writes to pools that share the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.
1450 - interval: 1m
1451 input_series:
1452 - series: 'ceph_health_detail{name="OSD_BACKFILLFULL"}'
1453 values: '0+0x2 1+0x10'
1454 promql_expr_test:
1455 - expr: ceph_health_detail{name="OSD_BACKFILLFULL"} == 1
1456 eval_time: 3m
1457 exp_samples:
1458 - labels: '{__name__="ceph_health_detail", name="OSD_BACKFILLFULL"}'
1459 value: 1
1460 alert_rule_test:
1461 - eval_time: 1m
1462 alertname: CephOSDBackfillFull
1463 - eval_time: 10m
1464 alertname: CephOSDBackfillFull
1465 exp_alerts:
1466 - exp_labels:
1467 name: "OSD_BACKFILLFULL"
1468 severity: warning
1469 type: ceph_default
1470 exp_annotations:
1471 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull
1472 summary: OSD(s) too full for backfill operations
1473 description: "An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations from completing. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data."
1474 - interval: 30s
1475 input_series:
1476 - series: 'ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"}'
1477 values: '0+0x2 1+0x20'
1478 promql_expr_test:
1479 - expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 0
1480 eval_time: 1m
1481 exp_samples:
1482 - labels: '{__name__="ceph_health_detail", name="OSD_TOO_MANY_REPAIRS"}'
1483 value: 0
1484 alert_rule_test:
1485 - eval_time: 1m
1486 alertname: CephOSDTooManyRepairs
1487 - eval_time: 10m
1488 alertname: CephOSDTooManyRepairs
1489 exp_alerts:
1490 - exp_labels:
1491 name: "OSD_TOO_MANY_REPAIRS"
1492 severity: warning
1493 type: ceph_default
1494 exp_annotations:
1495 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs
1496 summary: OSD reports a high number of read errors
1497 description: Reads from an OSD have used a secondary PG to return data to the client, indicating a potential failing drive.
1498# Pools
1499 # trigger percent full prediction on pools 1 and 2 only
1500 - interval: 12h
1501 input_series:
1502 - series: 'ceph_pool_percent_used{pool_id="1", instance="9090"}'
1503 values: '1 1 1 1 1'
1504 - series: 'ceph_pool_percent_used{pool_id="1", instance="8090"}'
1505 values: '78 89 79 98 78'
1506 - series: 'ceph_pool_percent_used{pool_id="2", instance="9090"}'
1507 values: '1 1 1 1 1'
1508 - series: 'ceph_pool_percent_used{pool_id="2", instance="8090"}'
1509 values: '22 22 23 23 24'
1510 - series: 'ceph_pool_metadata{pool_id="1" , instance="9090" ,name="rbd",type="replicated"}'
1511 values: '1 1 1 1 1'
1512 - series: 'ceph_pool_metadata{pool_id="1", instance="8090",name="default.rgw.index",type="replicated"}'
1513 values: '1 1 1 1 1'
1514 - series: 'ceph_pool_metadata{pool_id="2" , instance="9090" ,name="rbd",type="replicated"}'
1515 values: '1 1 1 1 1'
1516 - series: 'ceph_pool_metadata{pool_id="2", instance="8090",name="default.rgw.index",type="replicated"}'
1517 values: '1 1 1 1 1'
1518 promql_expr_test:
1519 - expr: |
1520 (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance)
1521 group_right() ceph_pool_metadata) >= 95
1522 eval_time: 36h
1523 exp_samples:
1524 - labels: '{instance="8090",name="default.rgw.index",pool_id="1",type="replicated"}'
1525 value: 1.435E+02 # 143.5%
1526 alert_rule_test:
1527 - eval_time: 48h
1528 alertname: CephPoolGrowthWarning
1529 exp_alerts:
1530 - exp_labels:
1531 instance: 8090
1532 name: default.rgw.index
1533 pool_id: 1
1534 severity: warning
1535 type: ceph_default
1536 oid: 1.3.6.1.4.1.50495.1.2.1.9.2
1537 exp_annotations:
1538 summary: Pool growth rate may soon exceed capacity
1539 description: Pool 'default.rgw.index' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
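  # How this case works: with a 12h interval, pool_id="1" at instance="8090"
  # reports 78 89 79 98 78 percent used. predict_linear() fits a line over the
  # trailing 2d window and projects it 5 days (3600 * 24 * 5 seconds) ahead; at
  # eval_time 36h that projection comes out at about 143.5%, i.e. >= 95, so a
  # sample is produced. Multiplying by ceph_pool_metadata (whose value is 1)
  # with 'on(pool_id, instance) group_right()' keeps the predicted value but
  # takes the metadata series' labels (name, type), which is how the alert can
  # name 'default.rgw.index' in its description.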
1540 - interval: 1m
1541 input_series:
1542 - series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}'
1543 values: '0+0x2 1+0x10'
1544 promql_expr_test:
1545 - expr: ceph_health_detail{name="POOL_BACKFILLFULL"} == 1
1546 eval_time: 3m
1547 exp_samples:
1548 - labels: '{__name__="ceph_health_detail", name="POOL_BACKFILLFULL"}'
1549 value: 1
1550 alert_rule_test:
1551 - eval_time: 1m
1552 alertname: CephPoolBackfillFull
1553 - eval_time: 5m
1554 alertname: CephPoolBackfillFull
1555 exp_alerts:
1556 - exp_labels:
1557 name: "POOL_BACKFILLFULL"
1558 severity: warning
1559 type: ceph_default
1560 exp_annotations:
1561 summary: Free space in a pool is too low for recovery/backfill
1562 description: A pool is approaching the near full threshold, which will prevent recovery/backfill operations from completing. Consider adding more capacity.
1563
1564 - interval: 1m
1565 input_series:
1566 - series: 'ceph_health_detail{name="POOL_NEAR_FULL"}'
1567 values: '0+0x2 1+0x10'
1568 promql_expr_test:
1569 - expr: ceph_health_detail{name="POOL_NEAR_FULL"} == 1
1570 eval_time: 3m
1571 exp_samples:
1572 - labels: '{__name__="ceph_health_detail", name="POOL_NEAR_FULL"}'
1573 value: 1
1574 alert_rule_test:
1575 - eval_time: 1m
1576 alertname: CephPoolNearFull
1577 - eval_time: 10m
1578 alertname: CephPoolNearFull
1579 exp_alerts:
1580 - exp_labels:
1581 name: "POOL_NEAR_FULL"
1582 severity: warning
1583 type: ceph_default
1584 exp_annotations:
1585 summary: One or more Ceph pools are nearly full
1586 description: "A pool has exceeded the warning (percent full) threshold, or OSDs supporting the pool have reached the NEARFULL threshold. Writes may continue, but you are at risk of the pool going read-only if more capacity isn't made available. Determine the affected pool with 'ceph df detail', looking at QUOTA BYTES and STORED. Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>). Also ensure that the balancer is active."
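  # Remediation note: the quota example quoted in the description is typically
  # issued via the 'set-quota' subcommand, e.g.
  #
  #   ceph osd pool set-quota <pool_name> max_bytes <bytes>
  #
  # where <pool_name> and <bytes> are placeholders for the affected pool and
  # the new byte limit.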
1587
1588# PGs
1589 - interval: 1m
1590 input_series:
1591 - series: 'ceph_health_detail{name="PG_NOT_SCRUBBED"}'
1592 values: '0+0x2 1+0x10'
1593 promql_expr_test:
1594 - expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1
1595 eval_time: 3m
1596 exp_samples:
1597 - labels: '{__name__="ceph_health_detail", name="PG_NOT_SCRUBBED"}'
1598 value: 1
1599 alert_rule_test:
1600 - eval_time: 1m
1601 alertname: CephPGNotScrubbed
1602 - eval_time: 10m
1603 alertname: CephPGNotScrubbed
1604 exp_alerts:
1605 - exp_labels:
1606 name: "PG_NOT_SCRUBBED"
1607 severity: warning
1608 type: ceph_default
1609 exp_annotations:
1610 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed
1611 summary: Placement group(s) have not been scrubbed
1612 description: "One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, protecting against bit-rot. They check that metadata is consistent across data replicas. When PGs miss their scrub interval, it may indicate that the scrub window is too small, or PGs were not in a 'clean' state during the scrub window. You can manually initiate a scrub with: ceph pg scrub <pgid>"
1613 - interval: 1m
1614 input_series:
1615 - series: 'ceph_health_detail{name="PG_DAMAGED"}'
1616 values: '0+0x4 1+0x20'
1617 promql_expr_test:
1618 - expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1
1619 eval_time: 5m
1620 exp_samples:
1621 - labels: '{__name__="ceph_health_detail", name="PG_DAMAGED"}'
1622 value: 1
1623 alert_rule_test:
1624 - eval_time: 1m
1625 alertname: CephPGsDamaged
1626 - eval_time: 10m
1627 alertname: CephPGsDamaged
1628 exp_alerts:
1629 - exp_labels:
1630 name: "PG_DAMAGED"
1631 severity: critical
1632 type: ceph_default
1633 oid: 1.3.6.1.4.1.50495.1.2.1.7.4
1634 exp_annotations:
1635 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged
1636 summary: Placement group damaged, manual intervention needed
1637 description: During data consistency checks (scrub), at least one PG has been flagged as being damaged or inconsistent. Check to see which PG is affected, and attempt a manual repair if necessary. To list problematic placement groups, use 'rados list-inconsistent-pg <pool>'. To repair PGs use the 'ceph pg repair <pg_num>' command.
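  # For reference, an illustrative repair flow using the commands named in the
  # description; 'rbd' and '2.1f' below are placeholder pool/PG ids:
  #
  #   rados list-inconsistent-pg rbd   # list PGs flagged inconsistent in pool 'rbd'
  #   ceph pg repair 2.1f              # ask the primary OSD to repair that PG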
1638 - interval: 1m
1639 input_series:
1640 - series: 'ceph_health_detail{name="TOO_MANY_PGS"}'
1641 values: '0+0x4 1+0x20'
1642 promql_expr_test:
1643 - expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1
1644 eval_time: 5m
1645 exp_samples:
1646 - labels: '{__name__="ceph_health_detail", name="TOO_MANY_PGS"}'
1647 value: 1
1648 alert_rule_test:
1649 - eval_time: 1m
1650 alertname: CephPGsHighPerOSD
1651 - eval_time: 10m
1652 alertname: CephPGsHighPerOSD
1653 exp_alerts:
1654 - exp_labels:
1655 name: "TOO_MANY_PGS"
1656 severity: warning
1657 type: ceph_default
1658 exp_annotations:
1659 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs
1660 summary: Placement groups per OSD is too high
1661 description: "The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).\n Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools."
1662 - interval: 1m
1663 input_series:
1664 - series: 'ceph_health_detail{name="PG_RECOVERY_FULL"}'
1665 values: '0+0x2 1+0x20'
1666 promql_expr_test:
1667 - expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 0
1668 eval_time: 1m
1669 exp_samples:
1670 - labels: '{__name__="ceph_health_detail", name="PG_RECOVERY_FULL"}'
1671 value: 0
1672 alert_rule_test:
1673 - eval_time: 1m
1674 alertname: CephPGRecoveryAtRisk
1675 - eval_time: 10m
1676 alertname: CephPGRecoveryAtRisk
1677 exp_alerts:
1678 - exp_labels:
1679 name: "PG_RECOVERY_FULL"
1680 severity: critical
1681 type: ceph_default
1682 oid: 1.3.6.1.4.1.50495.1.2.1.7.5
1683 exp_annotations:
1684 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full
1685 summary: OSDs are too full for recovery
1686 description: Data redundancy is at risk since one or more OSDs are at or above the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, or delete unwanted data.
1687 - interval: 1m
1688 input_series:
1689 - series: 'ceph_health_detail{name="PG_BACKFILL_FULL"}'
1690 values: '0+0x2 1+0x20'
1691 promql_expr_test:
1692 - expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 0
1693 eval_time: 1m
1694 exp_samples:
1695 - labels: '{__name__="ceph_health_detail", name="PG_BACKFILL_FULL"}'
1696 value: 0
1697 alert_rule_test:
1698 - eval_time: 1m
1699 alertname: CephPGBackfillAtRisk
1700 - eval_time: 10m
1701 alertname: CephPGBackfillAtRisk
1702 exp_alerts:
1703 - exp_labels:
1704 name: "PG_BACKFILL_FULL"
1705 severity: critical
1706 type: ceph_default
1707 oid: 1.3.6.1.4.1.50495.1.2.1.7.6
1708 exp_annotations:
1709 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full
1710 summary: Backfill operations are blocked due to lack of free space
1711 description: Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add more capacity, or delete unwanted data.
1712 - interval: 1m
1713 input_series:
1714 - series: 'ceph_health_detail{name="PG_AVAILABILITY"}'
1715 values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1'
1716 - series: 'ceph_health_detail{name="OSD_DOWN"}'
1717 values: '0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0'
1718 promql_expr_test:
1719 - expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"}))
1720 eval_time: 1m
1721 # empty set at 1m
1722 exp_samples:
1723 alert_rule_test:
1724 # Neither PG_AVAILABILITY nor OSD_DOWN is firing, so no alert
1725 - eval_time: 1m
1726 alertname: CephPGUnavilableBlockingIO
1727 exp_alerts:
1728 # PG_AVAILABILITY is firing, but OSD_DOWN is also active, so no alert
1729 - eval_time: 5m
1730 alertname: CephPGUnavilableBlockingIO
1731 exp_alerts:
1732 # PG_AVAILABILITY is firing and OSD_DOWN is no longer active, so raise the alert
1733 - eval_time: 15m
1734 alertname: CephPGUnavilableBlockingIO
1735 exp_alerts:
1736 - exp_labels:
1737 name: "PG_AVAILABILITY"
1738 severity: critical
1739 type: ceph_default
1740 oid: 1.3.6.1.4.1.50495.1.2.1.7.3
1741 exp_annotations:
1742 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability
1743 summary: PG is unavailable, blocking I/O
1744 description: Data availability is reduced, impacting the cluster's ability to service I/O. One or more placement groups (PGs) are in a state that blocks I/O.
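  # How the gating in the expression above works: scalar(ceph_health_detail{name="OSD_DOWN"})
  # is subtracted from the PG_AVAILABILITY flag, so while OSD_DOWN is also set the
  # result drops to 0, and the expression returns nothing at all while PG_AVAILABILITY
  # is unset because of the '== 1' filter. The alert therefore only fires once
  # PG_AVAILABILITY persists on its own, which is what the 15m evaluation asserts.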
1745 - interval: 1m
1746 input_series:
1747 - series: 'ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"}'
1748 values: '0+0x2 1+0x10'
1749 promql_expr_test:
1750 - expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1
1751 eval_time: 3m
1752 exp_samples:
1753 - labels: '{__name__="ceph_health_detail", name="PG_NOT_DEEP_SCRUBBED"}'
1754 value: 1
1755 alert_rule_test:
1756 - eval_time: 1m
1757 alertname: CephPGNotDeepScrubbed
1758 - eval_time: 10m
1759 alertname: CephPGNotDeepScrubbed
1760 exp_alerts:
1761 - exp_labels:
1762 name: "PG_NOT_DEEP_SCRUBBED"
1763 severity: warning
1764 type: ceph_default
1765 exp_annotations:
1766 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed
1767 summary: Placement group(s) have not been deep scrubbed
1768 description: One or more PGs have not been deep scrubbed recently. Deep scrubs protect against bit-rot. They compare data replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate that the window is too small or PGs were not in a 'clean' state during the deep-scrub window.
1769
1770# Prometheus
1771 - interval: 1m
1772 input_series:
1773 - series: 'up{job="myjob"}'
1774 values: '1+0x10'
1775 promql_expr_test:
1776 - expr: absent(up{job="ceph"})
1777 eval_time: 1m
1778 exp_samples:
1779 - labels: '{job="ceph"}'
1780 value: 1
1781 alert_rule_test:
1782 - eval_time: 5m
1783 alertname: PrometheusJobMissing
1784 exp_alerts:
1785 - exp_labels:
1786 job: ceph
1787 severity: critical
1788 type: ceph_default
1789 oid: 1.3.6.1.4.1.50495.1.2.1.12.1
1790 exp_annotations:
1791 summary: The scrape job for Ceph is missing from Prometheus
1792 description: The prometheus job that scrapes from Ceph is no longer defined, this will effectively mean you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance.
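  # absent(up{job="ceph"}) returns a single sample with value 1, carrying the
  # labels taken from the selector's equality matchers ({job="ceph"}), whenever
  # no matching series exists. Since the input only provides up{job="myjob"},
  # the sanity check above sees exactly that synthetic sample, and the alert
  # becomes active once the rule's 'for:' period (presumably about 5 minutes)
  # has elapsed.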
1793# RADOS
1794 - interval: 1m
1795 input_series:
1796 - series: 'ceph_health_detail{name="OBJECT_UNFOUND"}'
1797 values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1798 - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
1799 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1800 - series: 'ceph_osd_up{ceph_daemon="osd.1"}'
1801 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1802 - series: 'ceph_osd_up{ceph_daemon="osd.2"}'
1803 values: '1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1804 - series: 'ceph_osd_metadata{ceph_daemon="osd.0"}'
1805 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1806 - series: 'ceph_osd_metadata{ceph_daemon="osd.1"}'
1807 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1808 - series: 'ceph_osd_metadata{ceph_daemon="osd.2"}'
1809 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1810 promql_expr_test:
1811 - expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1
1812 eval_time: 1m
1813 exp_samples:
1814 alert_rule_test:
1815 # OBJECT_UNFOUND but osd.2 is down, so don't fire
1816 - eval_time: 5m
1817 alertname: CephObjectMissing
1818 exp_alerts:
1819 # OBJECT_UNFOUND and all OSDs are online, so fire
1820 - eval_time: 15m
1821 alertname: CephObjectMissing
1822 exp_alerts:
1823 - exp_labels:
1824 severity: critical
1825 type: ceph_default
1826 oid: 1.3.6.1.4.1.50495.1.2.1.10.1
1827 exp_annotations:
1828 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound
1829 summary: Object(s) marked UNFOUND
1830 description: The latest version of a RADOS object can not be found, even though all OSDs are up. I/O requests for this object from clients will block (hang). Resolving this issue may require the object to be rolled back to a prior version manually, and manually verified.
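  # The second factor in the expression above, '(count(ceph_osd_up == 1) == bool count(ceph_osd_metadata))',
  # uses the 'bool' modifier so the comparison yields 1 when every OSD known to
  # the cluster is up and 0 otherwise. Multiplying the OBJECT_UNFOUND flag by it
  # (joined with 'on()') suppresses the alert while any OSD is down, matching the
  # two evaluations above: no alert at 5m (osd.2 down), alert at 15m (all OSDs up).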
1831# Generic Alerts
1832 - interval: 1m
1833 input_series:
1834 - series: 'ceph_health_detail{name="RECENT_CRASH"}'
1835 values: '0 0 0 1 1 1 1 1 1 1 1'
1836 promql_expr_test:
1837 - expr: ceph_health_detail{name="RECENT_CRASH"} == 1
1838 eval_time: 1m
1839 exp_samples:
1840 alert_rule_test:
1841 # not firing
1842 - eval_time: 1m
1843 alertname: CephDaemonCrash
1844 exp_alerts:
1845 # firing
1846 - eval_time: 10m
1847 alertname: CephDaemonCrash
1848 exp_alerts:
1849 - exp_labels:
1850 name: RECENT_CRASH
1851 severity: critical
1852 type: ceph_default
1853 oid: 1.3.6.1.4.1.50495.1.2.1.1.2
1854 exp_annotations:
1855 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash
1856 summary: One or more Ceph daemons have crashed, and are pending acknowledgement
1857 description: One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive <id>' command.
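  # For reference, an illustrative acknowledgement flow for the command named in
  # the description above (the ids printed by 'ceph crash ls' are used as <id>):
  #
  #   ceph crash ls              # list recent and archived crash reports
  #   ceph crash archive <id>    # acknowledge a single crash
  #   ceph crash archive-all     # acknowledge all outstanding crashes at once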