2 - ../prometheus_alerts.yml
3 evaluation_interval: 5m
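# Note: each test below feeds synthetic input_series into promtool, evaluates the
# expressions defined in prometheus_alerts.yml, and compares the rendered exp_alerts.
# Series values use promtool's expanding notation: 'a+bxN' expands to
# a, a+b, ... a+N*b (N additional samples), and 'a-bxN' decrements likewise.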
8 - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
9 values: '2 2 2 2 2 2 2'
11 - expr: ceph_health_status == 2
14 - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
18 alertname: CephHealthError
20 alertname: CephHealthError
25 oid: 1.3.6.1.4.1.50495.1.2.1.2.1
29 summary: Ceph is in the ERROR state
30 description: The cluster state has been HEALTH_ERROR for more than 5 minutes. Please check 'ceph health detail' for more information.
35 - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
36 values: '1 1 1 1 1 1 1 1 1 1'
38 - expr: ceph_health_status == 1
41 - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
45 alertname: CephHealthWarning
47 alertname: CephHealthWarning
55 summary: Ceph is in the WARNING state
56 description: The cluster state has been HEALTH_WARN for more than 15 minutes. Please check 'ceph health detail' for more information.
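# CephOSDDownHigh: osd.1 is reported down, so 1 of 3 OSDs (33.33%) is down,
# which crosses the >= 10% threshold below and yields the ~3.33E+01 sample.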
61 - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
63 - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
65 - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
67 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
68 ceph_version="ceph version 17.0.0-189-g3558fd72
69 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
70 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
71 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
72 public_addr="172.20.0.2"}'
74 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
75 ceph_version="ceph version 17.0.0-189-g3558fd72
76 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
77 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
78 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
79 public_addr="172.20.0.2"}'
81 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
82 ceph_version="ceph version 17.0.0-189-g3558fd72
83 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
84 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
85 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
86 public_addr="172.20.0.2"}'
89 - expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
93 value: 3.333333333333333E+01
96 alertname: CephOSDDownHigh
99 oid: 1.3.6.1.4.1.50495.1.2.1.4.1
103 summary: More than 10% of OSDs are down
104 description: "33.33% or 1 of 3 OSDs are down (>= 10%). The following OSDs are down: - osd.1 on ceph"
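# CephOSDFlapping: the input toggles osd.0's up metric, so rate(ceph_osd_up[5m])
# is non-zero for osd.0 only; the join onto ceph_osd_metadata attaches the
# hostname label used in the alert text.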
109 - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
111 - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
113 - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
115 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
116 ceph_version="ceph version 17.0.0-189-g3558fd72
117 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
118 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
119 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
120 public_addr="172.20.0.2"}'
121 values: '1 1 1 1 1 1'
122 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
123 ceph_version="ceph version 17.0.0-189-g3558fd72
124 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
125 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
126 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
127 public_addr="172.20.0.2"}'
128 values: '1 1 1 1 1 1'
129 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
130 ceph_version="ceph version 17.0.0-189-g3558fd72
131 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
132 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
133 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
134 public_addr="172.20.0.2"}'
135 values: '1 1 1 1 1 1'
139 rate(ceph_osd_up[5m])
140 * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
144 - labels: '{ceph_daemon="osd.0", hostname="ceph", instance="ceph:9283",
146 value: 1.2200000000000001E+01
149 alertname: CephOSDFlapping
156 oid: 1.3.6.1.4.1.50495.1.2.1.4.4
160 documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds
161 summary: Network issues are causing OSDs to flap (mark each other down)
162 description: "OSD osd.0 on ceph was marked down and back up 20.1 times once a minute for 5 minutes. This may indicate a network issue (latency, packet loss, MTU mismatch) on the cluster network, or the public network if no cluster network is deployed. Check the network stats on the listed host(s)."
164 # high pg count deviation
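# With final samples of 160/320/160/160 the average is 200 PGs per OSD;
# osd.1 deviates by 60% while the others deviate by 20%, so only osd.1
# exceeds the 0.30 threshold and appears in the expected alert.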
167 - series: 'ceph_osd_numpg{ceph_daemon="osd.0",instance="ceph:9283",
169 values: '100 100 100 100 100 160'
170 - series: 'ceph_osd_numpg{ceph_daemon="osd.1",instance="ceph:9283",
172 values: '100 100 100 100 100 320'
173 - series: 'ceph_osd_numpg{ceph_daemon="osd.2",instance="ceph:9283",
175 values: '100 100 100 100 100 160'
176 - series: 'ceph_osd_numpg{ceph_daemon="osd.3",instance="ceph:9283",
178 values: '100 100 100 100 100 160'
179 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
180 ceph_version="ceph version 17.0.0-189-g3558fd72
181 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
182 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
183 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
184 public_addr="172.20.0.2"}'
185 values: '1 1 1 1 1 1'
186 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
187 ceph_version="ceph version 17.0.0-189-g3558fd72
188 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
189 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
190 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
191 public_addr="172.20.0.2"}'
192 values: '1 1 1 1 1 1'
193 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
194 ceph_version="ceph version 17.0.0-189-g3558fd72
195 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
196 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
197 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
198 public_addr="172.20.0.2"}'
199 values: '1 1 1 1 1 1'
200 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.3",
201 ceph_version="ceph version 17.0.0-189-g3558fd72
202 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
203 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
204 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
205 public_addr="172.20.0.2"}'
206 values: '1 1 1 1 1 1'
211 (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0)
213 ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
214 ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
218 - labels: '{ceph_daemon="osd.1", hostname="ceph", instance="ceph:9283",
223 alertname: CephPGImbalance
230 oid: 1.3.6.1.4.1.50495.1.2.1.4.5
234 summary: PGs are not balanced across OSDs
235 description: "OSD osd.1 on ceph deviates by more than 30% from average PG count."
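# CephPGsInactive: pool_id 3 reports more total PGs than active PGs at the
# evaluation point, so (ceph_pg_total - ceph_pg_active) > 0 selects only that
# pool; the metadata join supplies the pool name used in the alert.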
240 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
241 name="device_health_metrics",pool_id="1"}'
242 values: '1 1 1 1 1 1 1 1'
243 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
244 name="device_health_metrics",pool_id="2"}'
245 values: '1 1 1 1 1 1 1 1'
246 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
247 name="device_health_metrics",pool_id="3"}'
248 values: '1 1 1 1 1 1 1 1'
249 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
250 values: '1 1 1 1 1 1 1 1'
251 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
252 values: '32 32 32 32 32 32 32 32'
253 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
254 values: '33 32 32 32 32 33 33 32'
255 - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="1"}'
256 values: '1 1 1 1 1 1 1 1 1'
257 - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="2"}'
258 values: '32 32 32 32 32 32 32 32'
259 - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="3"}'
260 values: '32 32 32 32 32 32 32 32'
262 - expr: ceph_pool_metadata * on(pool_id,instance) group_left()
263 (ceph_pg_total - ceph_pg_active) > 0
266 - labels: '{instance="ceph:9283", job="ceph",
267 name="device_health_metrics",
272 alertname: CephPGsInactive
277 name: device_health_metrics
278 oid: 1.3.6.1.4.1.50495.1.2.1.7.1
283 summary: One or more placement groups are inactive
284 description: "1 PGs have been inactive for more than 5 minutes in pool device_health_metrics. Inactive placement groups are not able to serve read/write requests."
289 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
290 name="device_health_metrics",pool_id="1"}'
291 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
292 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
293 name="device_health_metrics",pool_id="2"}'
294 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
295 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
296 name="device_health_metrics",pool_id="3"}'
297 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
298 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
299 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
300 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
301 values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
303 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
304 values: '33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33
306 - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="1"}'
307 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
308 - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="2"}'
309 values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
311 - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="3"}'
312 values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
315 - expr: ceph_pool_metadata * on(pool_id,instance) group_left()
316 (ceph_pg_total - ceph_pg_clean) > 0
319 - labels: '{instance="ceph:9283", job="ceph",
320 name="device_health_metrics", pool_id="3"}'
324 alertname: CephPGsUnclean
329 name: device_health_metrics
330 oid: 1.3.6.1.4.1.50495.1.2.1.7.2
335 summary: One or more placement groups are marked unclean
336 description: "1 PGs have been unclean for more than 15 minutes in pool device_health_metrics. Unclean PGs have not recovered from a previous failure."
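# CephNodeRootFilesystemFull: the last samples drop available bytes to roughly
# 3.5 GB of the 73.4 GB root filesystem (~4.8% free), crossing the < 5%
# threshold in the expression below.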
341 - series: 'node_filesystem_avail_bytes{device="/dev/mapper/fedora_localhost
342 --live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
344 values: '35336400896 35336400896 35336400896 35336400896 35336400896
345 3525385519.104 3533640089'
346 - series: 'node_filesystem_size_bytes{device="/dev/mapper/fedora_localhost
347 --live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
349 values: '73445531648 73445531648 73445531648 73445531648 73445531648
350 73445531648 73445531648'
352 - expr: node_filesystem_avail_bytes{mountpoint="/"} /
353 node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
356 - labels: '{device="/dev/mapper/fedora_localhost --live-home",
357 fstype="ext4", instance="node-exporter", job="node-exporter",
362 alertname: CephNodeRootFilesystemFull
365 device: /dev/mapper/fedora_localhost --live-home
367 instance: node-exporter
370 oid: 1.3.6.1.4.1.50495.1.2.1.8.1
374 summary: Root filesystem is dangerously full
375 description: "Root volume is dangerously full: 4.811% free."
377 # network packets dropped
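# The expression divides the 1m rate of dropped packets by the 1m rate of all
# packets on non-loopback interfaces; the ratio must reach 0.5% (0.005), with
# an additional clause on the absolute drop rate, before eth0 triggers.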
380 - series: 'node_network_receive_drop_total{device="eth0",
381 instance="node-exporter",job="node-exporter"}'
383 - series: 'node_network_transmit_drop_total{device="eth0",
384 instance="node-exporter",job="node-exporter"}'
386 - series: 'node_network_receive_packets_total{device="eth0",
387 instance="node-exporter",job="node-exporter"}'
389 - series: 'node_network_transmit_packets_total{device="eth0",
390 instance="node-exporter",job="node-exporter"}'
395 rate(node_network_receive_drop_total{device!="lo"}[1m]) +
396 rate(node_network_transmit_drop_total{device!="lo"}[1m])
398 rate(node_network_receive_packets_total{device!="lo"}[1m]) +
399 rate(node_network_transmit_packets_total{device!="lo"}[1m])
400 ) >= 0.0050000000000000001 and (
401 rate(node_network_receive_drop_total{device!="lo"}[1m]) +
402 rate(node_network_transmit_drop_total{device!="lo"}[1m])
407 - labels: '{device="eth0", instance="node-exporter",
408 job="node-exporter"}'
412 alertname: CephNodeNetworkPacketDrops
416 instance: node-exporter
418 oid: 1.3.6.1.4.1.50495.1.2.1.8.2
422 summary: One or more NICs reports packet drops
423 description: "Node node-exporter experiences packet drop > 0.5% or > 10 packets/s on interface eth0."
425 # network packets errors
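# Same shape as the drops test, but on the *_errs_total counters; per the
# description, eth0 reports errors above 0.01% of traffic or 10 packets/s.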
428 - series: 'node_network_receive_errs_total{device="eth0",
429 instance="node-exporter",job="node-exporter"}'
431 - series: 'node_network_transmit_errs_total{device="eth0",
432 instance="node-exporter",job="node-exporter"}'
434 - series: 'node_network_transmit_packets_total{device="eth0",
435 instance="node-exporter",job="node-exporter"}'
437 - series: 'node_network_receive_packets_total{device="eth0",
438 instance="node-exporter",job="node-exporter"}'
443 rate(node_network_receive_errs_total{device!="lo"}[1m]) +
444 rate(node_network_transmit_errs_total{device!="lo"}[1m])
446 rate(node_network_receive_packets_total{device!="lo"}[1m]) +
447 rate(node_network_transmit_packets_total{device!="lo"}[1m])
449 rate(node_network_receive_errs_total{device!="lo"}[1m]) +
450 rate(node_network_transmit_errs_total{device!="lo"}[1m])
455 - labels: '{device="eth0", instance="node-exporter",
456 job="node-exporter"}'
460 alertname: CephNodeNetworkPacketErrors
464 instance: node-exporter
466 oid: 1.3.6.1.4.1.50495.1.2.1.8.3
470 summary: One or more NICs reports packet errors
471 description: "Node node-exporter experiences packet errors > 0.01% or > 10 packets/s on interface eth0."
473 # Node Storage disk space filling up
475 # 20GB = 21474836480, 256MB = 268435456
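# node-1's free space shrinks by 268435456 bytes per sample while node-2 stays
# flat, so predict_linear over the 2d window, projected 5 days ahead, goes
# negative (about -1.9e12) for node-1 only and a single alert is expected.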
477 - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root",
478 fstype="xfs",instance="node-1",mountpoint="/rootfs"}'
479 values: '21474836480-268435456x48'
480 - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root",
481 fstype="xfs",instance="node-2",mountpoint="/rootfs"}'
482 values: '21474836480+0x48'
483 - series: 'node_uname_info{instance="node-1", nodename="node-1.unittests.com"}'
485 - series: 'node_uname_info{instance="node-2", nodename="node-2.unittests.com"}'
489 predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *
490 on(instance) group_left(nodename) node_uname_info < 0
493 - labels: '{device="/dev/mapper/vg-root",instance="node-1",fstype="xfs",
494 mountpoint="/rootfs",nodename="node-1.unittests.com"}'
495 value: -1.912602624E+12
498 alertname: CephNodeDiskspaceWarning
503 oid: 1.3.6.1.4.1.50495.1.2.1.8.4
504 device: /dev/mapper/vg-root
508 nodename: node-1.unittests.com
510 summary: Host filesystem free space is getting low
511 description: "Mountpoint /rootfs on node-1.unittests.com will be full in less than 5 days based on the 48 hour trailing fill rate."
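# CephNodeInconsistentMTU: eth4 is reported with MTU 9000 on node-exporter,
# 2200 on hostname1 and 2400 on hostname2. The per-device median (quantile 0.5)
# is 2400, so the hosts holding the max (9000) and min (2200) both differ from
# the median and two alerts are expected; eth0-eth3 are uniform at 1500.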
515 - series: 'node_network_mtu_bytes{device="eth0",instance="node-exporter",
516 job="node-exporter"}'
517 values: '1500 1500 1500 1500 1500'
518 - series: 'node_network_mtu_bytes{device="eth1",instance="node-exporter",
519 job="node-exporter"}'
520 values: '1500 1500 1500 1500 1500'
521 - series: 'node_network_mtu_bytes{device="eth2",instance="node-exporter",
522 job="node-exporter"}'
523 values: '1500 1500 1500 1500 1500'
524 - series: 'node_network_mtu_bytes{device="eth3",instance="node-exporter",
525 job="node-exporter"}'
526 values: '1500 1500 1500 1500 1500'
527 - series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter",
528 job="node-exporter"}'
529 values: '9000 9000 9000 9000 9000'
530 - series: 'node_network_mtu_bytes{device="eth4",instance="hostname1",
531 job="node-exporter"}'
532 values: '2200 2200 2200 2200 2200'
533 - series: 'node_network_mtu_bytes{device="eth4",instance="hostname2",
534 job="node-exporter"}'
535 values: '2400 2400 2400 2400 2400'
536 - series: 'node_network_up{device="eth0",instance="node-exporter",
537 job="node-exporter"}'
539 - series: 'node_network_up{device="eth1",instance="node-exporter",
540 job="node-exporter"}'
542 - series: 'node_network_up{device="eth2",instance="node-exporter",
543 job="node-exporter"}'
545 - series: 'node_network_up{device="eth3",instance="node-exporter",
546 job="node-exporter"}'
548 - series: 'node_network_up{device="eth4",instance="node-exporter",
549 job="node-exporter"}'
551 - series: 'node_network_up{device="eth4",instance="hostname1",
552 job="node-exporter"}'
554 - series: 'node_network_up{device="eth4",instance="hostname2",
555 job="node-exporter"}'
559 node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
561 max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
562 quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
565 node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
567 min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
568 quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
572 - labels: '{device="eth4", instance="node-exporter", job="node-exporter"}'
574 - labels: '{device="eth4", instance="hostname1", job="node-exporter"}'
578 alertname: CephNodeInconsistentMTU
587 summary: MTU settings across Ceph hosts are inconsistent
588 description: "Node hostname1 has a different MTU size (2200) than the median of devices named eth4."
591 instance: node-exporter
596 summary: MTU settings across Ceph hosts are inconsistent
597 description: "Node node-exporter has a different MTU size (9000) than the median of devices named eth4."
599 # pool full: the data series has 6 pools but the description uses topk(5),
600 # so this verifies that only the top 5 pools are reported
603 - series: 'ceph_health_detail{name="POOL_FULL"}'
604 values: '0 0 0 1 1 1 1 1 1 1 1'
605 - series: 'ceph_pool_percent_used{pool_id="1"}'
607 - series: 'ceph_pool_percent_used{pool_id="2"}'
609 - series: 'ceph_pool_percent_used{pool_id="3"}'
611 - series: 'ceph_pool_percent_used{pool_id="4"}'
613 - series: 'ceph_pool_percent_used{pool_id="5"}'
615 - series: 'ceph_pool_percent_used{pool_id="6"}'
617 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
618 name="cephfs_data",pool_id="1"}'
619 values: '1 1 1 1 1 1 1 1 1'
620 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
621 name="rbd",pool_id="2"}'
622 values: '1 1 1 1 1 1 1 1 1'
623 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
624 name="iscsi",pool_id="3"}'
625 values: '1 1 1 1 1 1 1 1 1'
626 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
627 name="default.rgw.index",pool_id="4"}'
628 values: '1 1 1 1 1 1 1 1 1'
629 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
630 name="default.rgw.log",pool_id="5"}'
631 values: '1 1 1 1 1 1 1 1 1'
632 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
633 name="dummy",pool_id="6"}'
634 values: '1 1 1 1 1 1 1 1 1'
636 - expr: ceph_health_detail{name="POOL_FULL"} > 0
639 - labels: '{__name__="ceph_health_detail", name="POOL_FULL"}'
643 alertname: CephPoolFull
645 alertname: CephPoolFull
651 oid: 1.3.6.1.4.1.50495.1.2.1.9.1
653 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full
654 summary: Pool is full - writes are blocked
655 description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) - rbd at 96% - iscsi at 90% - default.rgw.index at 72% - cephfs_data at 32% - default.rgw.log at 19% Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)"
659 - series: 'ceph_healthcheck_slow_ops{instance="ceph:9283",job="ceph"}'
662 - expr: ceph_healthcheck_slow_ops > 0
665 - labels: '{__name__="ceph_healthcheck_slow_ops", instance="ceph:9283",
670 alertname: CephSlowOps
678 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
679 summary: OSD operations are slow to complete
680 description: "1 OSD requests are taking too long to process (osd_op_complaint_time exceeded)"
682 # CEPHADM orchestrator alert triggers
685 - series: 'ceph_health_detail{name="UPGRADE_EXCEPTION"}'
688 - expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0
691 - labels: '{__name__="ceph_health_detail", name="UPGRADE_EXCEPTION"}'
695 alertname: CephadmUpgradeFailed
697 alertname: CephadmUpgradeFailed
700 name: UPGRADE_EXCEPTION
703 oid: 1.3.6.1.4.1.50495.1.2.1.11.2
705 summary: Ceph version upgrade has failed
706 description: "The cephadm cluster upgrade process has failed. The cluster remains in an undetermined state. Please review the cephadm logs, to understand the nature of the issue"
709 - series: 'ceph_health_detail{name="CEPHADM_FAILED_DAEMON"}'
712 - expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0
715 - labels: '{__name__="ceph_health_detail", name="CEPHADM_FAILED_DAEMON"}'
719 alertname: CephadmDaemonFailed
721 alertname: CephadmDaemonFailed
724 name: CEPHADM_FAILED_DAEMON
727 oid: 1.3.6.1.4.1.50495.1.2.1.11.1
729 summary: A Ceph daemon managed by cephadm is down
730 description: "A daemon managed by cephadm is no longer active. Determine which daemon is down with 'ceph health detail'. You may start daemons with 'ceph orch daemon start <daemon_id>'."
733 - series: 'ceph_health_detail{name="CEPHADM_PAUSED"}'
734 values: '1 1 1 1 1 1 1 1 1'
736 - expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0
739 - labels: '{__name__="ceph_health_detail", name="CEPHADM_PAUSED"}'
743 alertname: CephadmPaused
745 alertname: CephadmPaused
752 documentation: https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused
753 summary: Orchestration tasks via cephadm are PAUSED
754 description: "Cluster management has been paused manually. This will prevent the orchestrator from service management and reconciliation. If this is not intentional, resume cephadm operations with 'ceph orch resume'"
758 - series: 'ceph_health_detail{name="MDS_DAMAGE"}'
759 values: '1 1 1 1 1 1 1 1 1'
761 - expr: ceph_health_detail{name="MDS_DAMAGE"} > 0
764 - labels: '{__name__="ceph_health_detail", name="MDS_DAMAGE"}'
768 alertname: CephFilesystemDamaged
770 alertname: CephFilesystemDamaged
776 oid: 1.3.6.1.4.1.50495.1.2.1.5.1
778 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
779 summary: CephFS filesystem is damaged.
780 description: "Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support."
783 - series: 'ceph_health_detail{name="MDS_HEALTH_READ_ONLY"}'
784 values: '1 1 1 1 1 1 1 1 1'
786 - expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0
789 - labels: '{__name__="ceph_health_detail", name="MDS_HEALTH_READ_ONLY"}'
793 alertname: CephFilesystemReadOnly
795 alertname: CephFilesystemReadOnly
798 name: MDS_HEALTH_READ_ONLY
801 oid: 1.3.6.1.4.1.50495.1.2.1.5.2
803 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
804 summary: CephFS filesystem in read only mode due to write error(s)
805 description: "The filesystem has switched to READ ONLY due to an unexpected error when writing to the metadata pool. Either analyze the output from the MDS daemon admin socket, or escalate to support."
808 - series: 'ceph_health_detail{name="MDS_ALL_DOWN"}'
809 values: '0 0 1 1 1 1 1 1 1 1 1'
811 - expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0
814 - labels: '{__name__="ceph_health_detail", name="MDS_ALL_DOWN"}'
818 alertname: CephFilesystemOffline
820 alertname: CephFilesystemOffline
826 oid: 1.3.6.1.4.1.50495.1.2.1.5.3
828 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down
829 summary: CephFS filesystem is offline
830 description: "All MDS ranks are unavailable. The MDS daemons managing metadata are down, rendering the filesystem offline."
833 - series: 'ceph_health_detail{name="FS_DEGRADED"}'
834 values: '0 0 1 1 1 1 1 1 1 1 1'
836 - expr: ceph_health_detail{name="FS_DEGRADED"} > 0
839 - labels: '{__name__="ceph_health_detail", name="FS_DEGRADED"}'
843 alertname: CephFilesystemDegraded
845 alertname: CephFilesystemDegraded
851 oid: 1.3.6.1.4.1.50495.1.2.1.5.4
853 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded
854 summary: CephFS filesystem is degraded
855 description: "One or more metadata daemons (MDS ranks) are failed or in a damaged state. At best the filesystem is partially available, at worst the filesystem is completely unusable."
858 - series: 'ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"}'
859 values: '0 0 1 1 1 1 1 1 1 1 1'
861 - expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0
864 - labels: '{__name__="ceph_health_detail", name="MDS_INSUFFICIENT_STANDBY"}'
868 alertname: CephFilesystemInsufficientStandby
870 alertname: CephFilesystemInsufficientStandby
873 name: MDS_INSUFFICIENT_STANDBY
877 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby
878 summary: Ceph filesystem standby daemons too few
879 description: "The minimum number of standby daemons required by standby_count_wanted is less than the current number of standby daemons. Adjust the standby count or increase the number of MDS daemons."
882 - series: 'ceph_health_detail{name="FS_WITH_FAILED_MDS"}'
883 values: '0 0 1 1 1 1 1 1 1 1 1'
885 - expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0
888 - labels: '{__name__="ceph_health_detail", name="FS_WITH_FAILED_MDS"}'
892 alertname: CephFilesystemFailureNoStandby
894 alertname: CephFilesystemFailureNoStandby
897 name: FS_WITH_FAILED_MDS
900 oid: 1.3.6.1.4.1.50495.1.2.1.5.5
902 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds
903 summary: MDS daemon failed, no further standby available
904 description: "An MDS daemon has failed, leaving only one active rank and no available standby. Investigate the cause of the failure or add a standby MDS."
907 - series: 'ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"}'
908 values: '0 0 1 1 1 1 1 1 1 1 1'
910 - expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0
913 - labels: '{__name__="ceph_health_detail", name="MDS_UP_LESS_THAN_MAX"}'
917 alertname: CephFilesystemMDSRanksLow
919 alertname: CephFilesystemMDSRanksLow
922 name: MDS_UP_LESS_THAN_MAX
926 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max
927 summary: Ceph MDS daemon count is lower than configured
928 description: "The filesystem's 'max_mds' setting defines the number of MDS ranks in the filesystem. The current number of active MDS daemons is less than this value."
932 - series: 'up{job="ceph", instance="ceph-mgr:9283"}'
933 values: '1+0x2 0+0x10'
935 - expr: up{job="ceph"} == 0
938 - labels: '{__name__="up", job="ceph", instance="ceph-mgr:9283"}'
942 alertname: CephMgrPrometheusModuleInactive
944 alertname: CephMgrPrometheusModuleInactive
947 instance: ceph-mgr:9283
951 oid: 1.3.6.1.4.1.50495.1.2.1.6.2
953 summary: The mgr/prometheus module is not available
954 description: "The mgr/prometheus module at ceph-mgr:9283 is unreachable. This could mean that the module has been disabled or the mgr daemon itself is down. Without the mgr/prometheus module metrics and alerts will no longer function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to to determine whether the mgr is active. If the mgr is not active, restart it, otherwise you can determine module status with 'ceph mgr module ls'. If it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'."
957 - series: 'ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"}'
958 values: '0+0x2 1+0x20'
960 - expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1
963 - labels: '{__name__="ceph_health_detail", name="RECENT_MGR_MODULE_CRASH"}'
967 alertname: CephMgrModuleCrash
969 alertname: CephMgrModuleCrash
972 name: RECENT_MGR_MODULE_CRASH
975 oid: 1.3.6.1.4.1.50495.1.2.1.6.1
977 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash
978 summary: A manager module has recently crashed
979 description: "One or more mgr modules have crashed and have yet to be acknowledged by an administrator. A crashed module may impact functionality within the cluster. Use the 'ceph crash' command to determine which module has failed, and archive it to acknowledge the failure."
983 - series: 'ceph_health_detail{name="MON_DISK_CRIT"}'
984 values: '0+0x2 1+0x10'
985 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}'
988 - expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1
991 - labels: '{__name__="ceph_health_detail", name="MON_DISK_CRIT"}'
995 alertname: CephMonDiskspaceCritical
997 alertname: CephMonDiskspaceCritical
1000 name: "MON_DISK_CRIT"
1003 oid: 1.3.6.1.4.1.50495.1.2.1.3.2
1005 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit
1006 summary: Filesystem space on at least one monitor is critically low
1007 description: "The free space available to a monitor's store is critically low. You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; - ceph-mon-a"
1010 - series: 'ceph_health_detail{name="MON_DISK_LOW"}'
1011 values: '0+0x2 1+0x10'
1012 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}'
1015 - expr: ceph_health_detail{name="MON_DISK_LOW"} == 1
1018 - labels: '{__name__="ceph_health_detail", name="MON_DISK_LOW"}'
1022 alertname: CephMonDiskspaceLow
1024 alertname: CephMonDiskspaceLow
1027 name: "MON_DISK_LOW"
1031 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low
1032 summary: Drive space on at least one monitor is approaching full
1033 description: "The space available to a monitor's store is approaching full (>70% is the default). You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; - ceph-mon-a"
1036 - series: 'ceph_health_detail{name="MON_CLOCK_SKEW"}'
1037 values: '0+0x2 1+0x10'
1039 - expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1
1042 - labels: '{__name__="ceph_health_detail", name="MON_CLOCK_SKEW"}'
1046 alertname: CephMonClockSkew
1048 alertname: CephMonClockSkew
1051 name: "MON_CLOCK_SKEW"
1055 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew
1056 summary: Clock skew detected among monitors
1057 description: "Ceph monitors rely on closely synchronized time to maintain quorum and cluster consistency. This event indicates that the time on at least one mon has drifted too far from the lead mon. Review cluster status with ceph -s. This will show which monitors are affected. Check the time sync status on each monitor host with 'ceph time-sync-status' and the state and peers of your ntpd or chrony daemon."
1059 # Check 3 mons one down, quorum at risk
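# With 3 monitors, floor(3/2)+1 = 2 is the quorum minimum. mon.c drops out,
# leaving exactly 2 in quorum, so the bool comparison is 1 and, combined with
# MON_DOWN being set, the quorum-at-risk alert fires.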
1062 - series: 'ceph_health_detail{name="MON_DOWN"}'
1063 values: '0+0x2 1+0x12'
1064 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}'
1066 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}'
1068 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}'
1069 values: '1+0x2 0+0x12'
1070 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}'
1072 - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}'
1074 - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}'
1077 - expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() (count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1))) == 1
1084 alertname: CephMonDownQuorumAtRisk
1087 alertname: CephMonDownQuorumAtRisk
1092 oid: 1.3.6.1.4.1.50495.1.2.1.3.1
1094 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
1095 summary: Monitor quorum is at risk
1096 description: "Quorum requires a majority of monitors (x 2) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: - mon.c on ceph-mon-3"
1097 # check 5 mons, 1 down - warning only
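# With 5 monitors and only mon.e down, quorum still holds with a margin, so
# only the warning-level CephMonDown alert is expected here.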
1100 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}'
1102 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}'
1104 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}'
1106 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.d"}'
1108 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.e"}'
1109 values: '1+0x2 0+0x12'
1110 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}'
1112 - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}'
1114 - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}'
1116 - series: 'ceph_mon_metadata{ceph_daemon="mon.d", hostname="ceph-mon-4"}'
1118 - series: 'ceph_mon_metadata{ceph_daemon="mon.e", hostname="ceph-mon-5"}'
1121 - expr: (count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1))
1128 alertname: CephMonDown
1130 alertname: CephMonDown
1136 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
1137 summary: One or more monitors down
1138 description: "You have 1 monitor down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: - mon.e on ceph-mon-5\n"
1142 - series: 'ceph_health_detail{name="DEVICE_HEALTH"}'
1143 values: '0+0x2 1+0x10'
1145 - expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1
1148 - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH"}'
1152 alertname: CephDeviceFailurePredicted
1154 alertname: CephDeviceFailurePredicted
1157 name: "DEVICE_HEALTH"
1161 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2
1162 summary: Device(s) predicted to fail soon
1163 description: "The device health module has determined that one or more devices will fail soon. To review device status use 'ceph device ls'. To show a specific device use 'ceph device info <dev id>'. Mark the OSD out so that data may migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace the device, and redeploy the OSD."
1166 - series: 'ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"}'
1167 values: '0+0x2 1+0x10'
1169 - expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1
1172 - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_TOOMANY"}'
1176 alertname: CephDeviceFailurePredictionTooHigh
1178 alertname: CephDeviceFailurePredictionTooHigh
1181 name: "DEVICE_HEALTH_TOOMANY"
1184 oid: 1.3.6.1.4.1.50495.1.2.1.4.7
1186 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany
1187 summary: Too many devices are predicted to fail, unable to resolve
1188 description: "The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availabililty. Prevent data integrity issues by adding new OSDs so that data may be relocated."
1191 - series: 'ceph_health_detail{name="DEVICE_HEALTH_IN_USE"}'
1192 values: '0+0x2 1+0x10'
1194 - expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1
1197 - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_IN_USE"}'
1201 alertname: CephDeviceFailureRelocationIncomplete
1203 alertname: CephDeviceFailureRelocationIncomplete
1206 name: "DEVICE_HEALTH_IN_USE"
1210 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use
1211 summary: Device failure is predicted, but unable to relocate data
1212 description: "The device health module has determined that one or more devices will fail soon, but the normal process of relocating the data on the device to other OSDs in the cluster is blocked. \nEnsure that the cluster has available free space. It may be necessary to add capacity to the cluster to allow data from the failing device to successfully migrate, or to enable the balancer."
1216 - series: 'ceph_health_detail{name="OSD_HOST_DOWN"}'
1217 values: '0+0x2 1+0x10'
1218 - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
1219 values: '1+0x2 0+0x10'
1220 - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}'
1223 - expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1
1226 - labels: '{__name__="ceph_health_detail", name="OSD_HOST_DOWN"}'
1230 alertname: CephOSDHostDown
1232 alertname: CephOSDHostDown
1235 name: "OSD_HOST_DOWN"
1238 oid: 1.3.6.1.4.1.50495.1.2.1.4.8
1240 summary: An OSD host is offline
1241 description: "The following OSDs are down: - ceph-osd-1 : osd.0"
1244 - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"}'
1245 values: '0+0x2 1+0x20'
1247 - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 0
1250 - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_FRONT"}'
1254 alertname: CephOSDTimeoutsPublicNetwork
1256 alertname: CephOSDTimeoutsPublicNetwork
1259 name: "OSD_SLOW_PING_TIME_FRONT"
1263 summary: Network issues delaying OSD heartbeats (public network)
1264 description: "OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network for latency or loss issues. Use 'ceph health detail' to show the affected OSDs."
1267 - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"}'
1268 values: '0+0x2 1+0x20'
1270 - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 0
1273 - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_BACK"}'
1277 alertname: CephOSDTimeoutsClusterNetwork
1279 alertname: CephOSDTimeoutsClusterNetwork
1282 name: "OSD_SLOW_PING_TIME_BACK"
1286 summary: Network issues delaying OSD heartbeats (cluster network)
1287 description: "OSD heartbeats on the cluster's 'cluster' network (backend) are slow. Investigate the network for latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs."
1290 - series: 'ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"}'
1291 values: '0+0x2 1+0x20'
1293 - expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 0
1296 - labels: '{__name__="ceph_health_detail", name="BLUESTORE_DISK_SIZE_MISMATCH"}'
1300 alertname: CephOSDInternalDiskSizeMismatch
1302 alertname: CephOSDInternalDiskSizeMismatch
1305 name: "BLUESTORE_DISK_SIZE_MISMATCH"
1309 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch
1310 summary: OSD size inconsistency error
1311 description: "One or more OSDs have an internal inconsistency between metadata and the size of the device. This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs."
1314 - series: 'ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"}'
1315 values: '0+0x2 1+0x20'
1317 - expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1
1320 - labels: '{__name__="ceph_health_detail", name="BLUESTORE_SPURIOUS_READ_ERRORS"}'
1324 alertname: CephOSDReadErrors
1326 alertname: CephOSDReadErrors
1329 name: "BLUESTORE_SPURIOUS_READ_ERRORS"
1333 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors
1334 summary: Device read errors detected
1335 description: "An OSD has encountered read errors, but the OSD has recovered by retrying the reads. This may indicate an issue with hardware or the kernel."
1338 - series: 'ceph_health_detail{name="OSD_DOWN"}'
1339 values: '0+0x2 1+0x10'
1340 - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
1342 - series: 'ceph_osd_up{ceph_daemon="osd.1"}'
1343 values: '1+0x2 0+0x10'
1344 - series: 'ceph_osd_up{ceph_daemon="osd.2"}'
1346 - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}'
1348 - series: 'ceph_osd_metadata{ceph_daemon="osd.1", hostname="ceph-osd-2"}'
1350 - series: 'ceph_osd_metadata{ceph_daemon="osd.2", hostname="ceph-osd-3"}'
1353 - expr: ceph_health_detail{name="OSD_DOWN"} == 1
1356 - labels: '{__name__="ceph_health_detail", name="OSD_DOWN"}'
1360 alertname: CephOSDDown
1362 alertname: CephOSDDown
1368 oid: 1.3.6.1.4.1.50495.1.2.1.4.2
1370 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down
1371 summary: An OSD has been marked down
1372 description: "1 OSD down for over 5mins. The following OSD is down: - osd.1 on ceph-osd-2\n"
1375 - series: 'ceph_health_detail{name="OSD_NEARFULL"}'
1376 values: '0+0x2 1+0x10'
1378 - expr: ceph_health_detail{name="OSD_NEARFULL"} == 1
1381 - labels: '{__name__="ceph_health_detail", name="OSD_NEARFULL"}'
1385 alertname: CephOSDNearFull
1387 alertname: CephOSDNearFull
1390 name: "OSD_NEARFULL"
1393 oid: 1.3.6.1.4.1.50495.1.2.1.4.3
1395 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull
1396 summary: OSD(s) running low on free space (NEARFULL)
1397 description: One or more OSDs have reached the NEARFULL threshold. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.
1400 - series: 'ceph_health_detail{name="OSD_FULL"}'
1401 values: '0+0x2 1+0x10'
1403 - expr: ceph_health_detail{name="OSD_FULL"} == 1
1406 - labels: '{__name__="ceph_health_detail", name="OSD_FULL"}'
1410 alertname: CephOSDFull
1412 alertname: CephOSDFull
1418 oid: 1.3.6.1.4.1.50495.1.2.1.4.6
1420 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full
1421 summary: OSD full, writes blocked
1422 description: An OSD has reached the FULL threshold. Writes to pools that share the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.
1425 - series: 'ceph_health_detail{name="OSD_BACKFILLFULL"}'
1426 values: '0+0x2 1+0x10'
1428 - expr: ceph_health_detail{name="OSD_BACKFILLFULL"} == 1
1431 - labels: '{__name__="ceph_health_detail", name="OSD_BACKFILLFULL"}'
1435 alertname: CephOSDBackfillFull
1437 alertname: CephOSDBackfillFull
1440 name: "OSD_BACKFILLFULL"
1444 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull
1445 summary: OSD(s) too full for backfill operations
1446 description: "An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations from completing. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data."
1449 - series: 'ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"}'
1450 values: '0+0x2 1+0x20'
1452 - expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 0
1455 - labels: '{__name__="ceph_health_detail", name="OSD_TOO_MANY_REPAIRS"}'
1459 alertname: CephOSDTooManyRepairs
1461 alertname: CephOSDTooManyRepairs
1464 name: "OSD_TOO_MANY_REPAIRS"
1468 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs
1469 summary: OSD reports a high number of read errors
1470 description: Reads from an OSD have used a secondary PG to return data to the client, indicating a potential failing drive.
1472 # trigger percent full prediction on pools 1 and 2 only
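# Pool 1 climbs from 70% to 92% used, so predict_linear over [2d] projected
# 5 days ahead exceeds the 95% threshold (about 142%); pool 2 only moves from
# 22% to 24% and stays below it. The group_right join attaches the pool name.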
1475 - series: 'ceph_pool_percent_used{pool_id="1"}'
1476 values: '70 75 80 87 92'
1477 - series: 'ceph_pool_percent_used{pool_id="2"}'
1478 values: '22 22 23 23 24'
1479 - series: 'ceph_pool_metadata{pool_id="1",name="rbd",type="replicated"}'
1481 - series: 'ceph_pool_metadata{pool_id="2",name="default.rgw.index",type="replicated"}'
1485 (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id)
1486 group_right ceph_pool_metadata) >= 95
1489 - labels: '{name="rbd",pool_id="1",type="replicated"}'
1490 value: 1.424E+02 # 142%
1493 alertname: CephPoolGrowthWarning
1500 oid: 1.3.6.1.4.1.50495.1.2.1.9.2
1502 summary: Pool growth rate may soon exceed capacity
1503 description: Pool 'rbd' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
1506 - series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}'
1507 values: '0+0x2 1+0x10'
1509 - expr: ceph_health_detail{name="POOL_BACKFILLFULL"} == 1
1512 - labels: '{__name__="ceph_health_detail", name="POOL_BACKFILLFULL"}'
1516 alertname: CephPoolBackfillFull
1518 alertname: CephPoolBackfillFull
1521 name: "POOL_BACKFILLFULL"
1525 summary: Free space in a pool is too low for recovery/backfill
1526 description: A pool is approaching the near full threshold, which will prevent recovery/backfill operations from completing. Consider adding more capacity.
1530 - series: 'ceph_health_detail{name="POOL_NEAR_FULL"}'
1531 values: '0+0x2 1+0x10'
1533 - expr: ceph_health_detail{name="POOL_NEAR_FULL"} == 1
1536 - labels: '{__name__="ceph_health_detail", name="POOL_NEAR_FULL"}'
1540 alertname: CephPoolNearFull
1542 alertname: CephPoolNearFull
1545 name: "POOL_NEAR_FULL"
1549 summary: One or more Ceph pools are nearly full
1550 description: "A pool has exceeded the warning (percent full) threshold, or OSDs supporting the pool have reached the NEARFULL threshold. Writes may continue, but you are at risk of the pool going read-only if more capacity isn't made available. Determine the affected pool with 'ceph df detail', looking at QUOTA BYTES and STORED. Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>). Also ensure that the balancer is active."
1555 - series: 'ceph_health_detail{name="PG_NOT_SCRUBBED"}'
1556 values: '0+0x2 1+0x10'
1558 - expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1
1561 - labels: '{__name__="ceph_health_detail", name="PG_NOT_SCRUBBED"}'
1565 alertname: CephPGNotScrubbed
1567 alertname: CephPGNotScrubbed
1570 name: "PG_NOT_SCRUBBED"
1574 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed
1575 summary: Placement group(s) have not been scrubbed
1576 description: "One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, protecting against bit-rot. They check that metadata is consistent across data replicas. When PGs miss their scrub interval, it may indicate that the scrub window is too small, or PGs were not in a 'clean' state during the scrub window. You can manually initiate a scrub with: ceph pg scrub <pgid>"
1579 - series: 'ceph_health_detail{name="PG_DAMAGED"}'
1580 values: '0+0x4 1+0x20'
1582 - expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1
1585 - labels: '{__name__="ceph_health_detail", name="PG_DAMAGED"}'
1589 alertname: CephPGsDamaged
1591 alertname: CephPGsDamaged
1597 oid: 1.3.6.1.4.1.50495.1.2.1.7.4
1599 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged
1600 summary: Placement group damaged, manual intervention needed
1601 description: During data consistency checks (scrub), at least one PG has been flagged as being damaged or inconsistent. Check to see which PG is affected, and attempt a manual repair if necessary. To list problematic placement groups, use 'rados list-inconsistent-pg <pool>'. To repair PGs use the 'ceph pg repair <pg_num>' command.
1604 - series: 'ceph_health_detail{name="TOO_MANY_PGS"}'
1605 values: '0+0x4 1+0x20'
1607 - expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1
1610 - labels: '{__name__="ceph_health_detail", name="TOO_MANY_PGS"}'
1614 alertname: CephPGsHighPerOSD
1616 alertname: CephPGsHighPerOSD
1619 name: "TOO_MANY_PGS"
1623 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs
1624 summary: Placement groups per OSD is too high
1625 description: "The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).\n Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools."
1628 - series: 'ceph_health_detail{name="PG_RECOVERY_FULL"}'
1629 values: '0+0x2 1+0x20'
1631 - expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 0
1634 - labels: '{__name__="ceph_health_detail", name="PG_RECOVERY_FULL"}'
1638 alertname: CephPGRecoveryAtRisk
1640 alertname: CephPGRecoveryAtRisk
1643 name: "PG_RECOVERY_FULL"
1646 oid: 1.3.6.1.4.1.50495.1.2.1.7.5
1648 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full
1649 summary: OSDs are too full for recovery
1650 description: Data redundancy is at risk since one or more OSDs are at or above the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, or delete unwanted data.
1653 - series: 'ceph_health_detail{name="PG_BACKFILL_FULL"}'
1654 values: '0+0x2 1+0x20'
1656 - expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 0
1659 - labels: '{__name__="ceph_health_detail", name="PG_BACKFILL_FULL"}'
1663 alertname: CephPGBackfillAtRisk
1665 alertname: CephPGBackfillAtRisk
1668 name: "PG_BACKFILL_FULL"
1671 oid: 1.3.6.1.4.1.50495.1.2.1.7.6
1673 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full
1674 summary: Backfill operations are blocked due to lack of free space
1675 description: Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add more capacity, or delete unwanted data.
1678 - series: 'ceph_health_detail{name="PG_AVAILABILITY"}'
1679 values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1'
1680 - series: 'ceph_health_detail{name="OSD_DOWN"}'
1681 values: '0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0'
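# Subtracting the OSD_DOWN scalar suppresses the alert while an OSD is down:
# the expression is 0 until OSD_DOWN clears, so the alert should only fire
# once PG_AVAILABILITY is still set with no OSD down, as the cases below show.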
1683 - expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"}))
1688 # PG_AVAILABILITY and OSD_DOWN not firing .. no alert
1690 alertname: CephPGUnavilableBlockingIO
1692 # PG_AVAILABILITY firing, but osd_down is active .. no alert
1694 alertname: CephPGUnavilableBlockingIO
1696 # PG_AVAILABILITY firing, AND OSD_DOWN is not active...raise the alert
1698 alertname: CephPGUnavilableBlockingIO
1701 name: "PG_AVAILABILITY"
1704 oid: 1.3.6.1.4.1.50495.1.2.1.7.3
1706 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability
1707 summary: PG is unavailable, blocking I/O
1708 description: Data availability is reduced, impacting the cluster's ability to service I/O. One or more placement groups (PGs) are in a state that blocks I/O.
1711 - series: 'ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"}'
1712 values: '0+0x2 1+0x10'
1714 - expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1
1717 - labels: '{__name__="ceph_health_detail", name="PG_NOT_DEEP_SCRUBBED"}'
1721 alertname: CephPGNotDeepScrubbed
1723 alertname: CephPGNotDeepScrubbed
1726 name: "PG_NOT_DEEP_SCRUBBED"
1730 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed
1731 summary: Placement group(s) have not been deep scrubbed
1732 description: One or more PGs have not been deep scrubbed recently. Deep scrubs protect against bit-rot. They compare data replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate that the window is too small or PGs were not in a 'clean' state during the deep-scrub window.
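# PrometheusJobMissing: the only 'up' series carries job="myjob", so
# absent(up{job="ceph"}) returns a single 1-valued sample labelled
# {job="ceph"} and the alert fires.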
1737 - series: 'up{job="myjob"}'
1740 - expr: absent(up{job="ceph"})
1743 - labels: '{job="ceph"}'
1747 alertname: PrometheusJobMissing
1753 oid: 1.3.6.1.4.1.50495.1.2.1.12.1
1755 summary: The scrape job for Ceph is missing from Prometheus
1756 description: The Prometheus job that scrapes metrics from Ceph is no longer defined; this effectively means you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the Prometheus instance.
1760 - series: 'ceph_health_detail{name="OBJECT_UNFOUND"}'
1761 values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1762 - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
1763 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1764 - series: 'ceph_osd_up{ceph_daemon="osd.1"}'
1765 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1766 - series: 'ceph_osd_up{ceph_daemon="osd.2"}'
1767 values: '1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1768 - series: 'ceph_osd_metadata{ceph_daemon="osd.0"}'
1769 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1770 - series: 'ceph_osd_metadata{ceph_daemon="osd.1"}'
1771 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1772 - series: 'ceph_osd_metadata{ceph_daemon="osd.2"}'
1773 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
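# The count(ceph_osd_up == 1) == bool count(ceph_osd_metadata) factor gates the
# alert on every known OSD being up: while osd.2 is down the product is 0, so
# CephObjectMissing only fires once all OSDs are back and OBJECT_UNFOUND persists.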
1775 - expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1
1779 # OBJECT_UNFOUND but osd.2 is down, so don't fire
1781 alertname: CephObjectMissing
1783 # OBJECT_UNFOUND and all osd's are online, so fire
1785 alertname: CephObjectMissing
1790 oid: 1.3.6.1.4.1.50495.1.2.1.10.1
1792 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound
1793 summary: Object(s) marked UNFOUND
1794 description: The latest version of a RADOS object can not be found, even though all OSDs are up. I/O requests for this object from clients will block (hang). Resolving this issue may require the object to be rolled back to a prior version manually, and manually verified.
1798 - series: 'ceph_health_detail{name="RECENT_CRASH"}'
1799 values: '0 0 0 1 1 1 1 1 1 1 1'
1801 - expr: ceph_health_detail{name="RECENT_CRASH"} == 1
1807 alertname: CephDaemonCrash
1811 alertname: CephDaemonCrash
1817 oid: 1.3.6.1.4.1.50495.1.2.1.1.2
1819 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash
1820 summary: One or more Ceph daemons have crashed, and are pending acknowledgement
1821 description: One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive <id>' command.