2 - ../prometheus_alerts.yml
3 evaluation_interval: 5m
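# A quick way to replay these unit tests locally (assuming promtool from the
# Prometheus release is installed and the rule_files path above resolves):
#   promtool test rules <path-to-this-file>
# promtool expands each input_series at the test interval and compares the
# alerts that fire against the exp_alerts blocks below.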
8 - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
9 values: '2 2 2 2 2 2 2'
11 - expr: ceph_health_status == 2
14 - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
18 alertname: CephHealthError
20 alertname: CephHealthError
25 oid: 1.3.6.1.4.1.50495.1.2.1.2.1
29 summary: Ceph is in the ERROR state
30 description: The cluster state has been HEALTH_ERROR for more than 5 minutes. Please check 'ceph health detail' for more information.
35 - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
36 values: '1 1 1 1 1 1 1 1 1 1'
38 - expr: ceph_health_status == 1
41 - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
45 alertname: CephHealthWarning
47 alertname: CephHealthWarning
55 summary: Ceph is in the WARNING state
56 description: The cluster state has been HEALTH_WARN for more than 15 minutes. Please check 'ceph health detail' for more information.
61 - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
63 - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
65 - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
67 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
68 ceph_version="ceph version 17.0.0-189-g3558fd72
69 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
70 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
71 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
72 public_addr="172.20.0.2"}'
74 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
75 ceph_version="ceph version 17.0.0-189-g3558fd72
76 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
77 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
78 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
79 public_addr="172.20.0.2"}'
81 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
82 ceph_version="ceph version 17.0.0-189-g3558fd72
83 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
84 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
85 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
86 public_addr="172.20.0.2"}'
89 - expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
93 value: 3.333333333333333E+01
96 alertname: CephOSDDownHigh
99 oid: 1.3.6.1.4.1.50495.1.2.1.4.1
103 summary: More than 10% of OSDs are down
104 description: "33.33% or 1 of 3 OSDs are down (>= 10%). The following OSDs are down: - osd.1 on ceph"
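# Worked check for the threshold above: 1 of the 3 ceph_osd_up series is 0, so
# count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 = 1/3 * 100 = 33.33...,
# which clears the >= 10 cut-off and matches the 3.333...E+01 expected sample.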
109 - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
111 - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
113 - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
115 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
116 ceph_version="ceph version 17.0.0-189-g3558fd72
117 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
118 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
119 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
120 public_addr="172.20.0.2"}'
121 values: '1 1 1 1 1 1'
122 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
123 ceph_version="ceph version 17.0.0-189-g3558fd72
124 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
125 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
126 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
127 public_addr="172.20.0.2"}'
128 values: '1 1 1 1 1 1'
129 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
130 ceph_version="ceph version 17.0.0-189-g3558fd72
131 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
132 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
133 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
134 public_addr="172.20.0.2"}'
135 values: '1 1 1 1 1 1'
139 rate(ceph_osd_up[5m])
140 * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
144 - labels: '{ceph_daemon="osd.0", hostname="ceph", instance="ceph:9283",
146 value: 1.2200000000000001E+01
149 alertname: CephOSDFlapping
156 oid: 1.3.6.1.4.1.50495.1.2.1.4.4
160 documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds
161 summary: Network issues are causing OSDs to flap (mark each other down)
162 description: "OSD osd.0 on ceph was marked down and back up 20.1 times once a minute for 5 minutes. This may indicate a network issue (latency, packet loss, MTU mismatch) on the cluster network, or the public network if no cluster network is deployed. Check the network stats on the listed host(s)."
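# Note on the flapping expression above: ceph_osd_up is a 0/1 series, and
# rate() treats every drop back to 0 as a counter reset, so an OSD that is
# repeatedly marked down and back up yields a high rate while a stable OSD
# stays near 0; the * on(ceph_daemon) group_left(hostname) join against
# ceph_osd_metadata only carries the hostname label into the alert text.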
164 # high pg count deviation
167 - series: 'ceph_osd_numpg{ceph_daemon="osd.0",instance="ceph:9283",
169 values: '100 100 100 100 100 160'
170 - series: 'ceph_osd_numpg{ceph_daemon="osd.1",instance="ceph:9283",
172 values: '100 100 100 100 100 320'
173 - series: 'ceph_osd_numpg{ceph_daemon="osd.2",instance="ceph:9283",
175 values: '100 100 100 100 100 160'
176 - series: 'ceph_osd_numpg{ceph_daemon="osd.3",instance="ceph:9283",
178 values: '100 100 100 100 100 160'
179 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
180 ceph_version="ceph version 17.0.0-189-g3558fd72
181 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
182 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
183 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
184 public_addr="172.20.0.2"}'
185 values: '1 1 1 1 1 1'
186 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
187 ceph_version="ceph version 17.0.0-189-g3558fd72
188 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
189 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
190 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
191 public_addr="172.20.0.2"}'
192 values: '1 1 1 1 1 1'
193 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
194 ceph_version="ceph version 17.0.0-189-g3558fd72
195 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
196 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
197 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
198 public_addr="172.20.0.2"}'
199 values: '1 1 1 1 1 1'
200 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.3",
201 ceph_version="ceph version 17.0.0-189-g3558fd72
202 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
203 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
204 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
205 public_addr="172.20.0.2"}'
206 values: '1 1 1 1 1 1'
211 (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0)
213 ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
214 ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
218 - labels: '{ceph_daemon="osd.1", hostname="ceph", instance="ceph:9283",
223 alertname: CephPGImbalance
230 oid: 1.3.6.1.4.1.50495.1.2.1.4.5
234 summary: PGs are not balanced across OSDs
235 description: "OSD osd.1 on ceph deviates by more than 30% from average PG count."
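# Worked check: the final samples leave the PG counts at 160, 320, 160 and 160,
# an average of 200. osd.1 deviates by (320 - 200) / 200 = 0.60 > 0.30, so it
# is the only daemon in exp_alerts; the rest deviate by only 40 / 200 = 0.20.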
240 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
241 name="device_health_metrics",pool_id="1"}'
242 values: '1 1 1 1 1 1 1 1'
243 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
244 name="device_health_metrics",pool_id="2"}'
245 values: '1 1 1 1 1 1 1 1'
246 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
247 name="device_health_metrics",pool_id="3"}'
248 values: '1 1 1 1 1 1 1 1'
249 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
250 values: '1 1 1 1 1 1 1 1'
251 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
252 values: '32 32 32 32 32 32 32 32'
253 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
254 values: '33 32 32 32 32 33 33 32'
255 - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="1"}'
256 values: '1 1 1 1 1 1 1 1 1'
257 - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="2"}'
258 values: '32 32 32 32 32 32 32 32'
259 - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="3"}'
260 values: '32 32 32 32 32 32 32 32'
262 - expr: ceph_pool_metadata * on(pool_id,instance) group_left()
263 (ceph_pg_total - ceph_pg_active) > 0
266 - labels: '{instance="ceph:9283", job="ceph",
267 name="device_health_metrics",
272 alertname: CephPGsInactive
277 name: device_health_metrics
278 oid: 1.3.6.1.4.1.50495.1.2.1.7.1
283 summary: One or more placement groups are inactive
284 description: "1 PGs have been inactive for more than 5 minutes in pool device_health_metrics. Inactive placement groups are not able to serve read/write requests."
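# Note: ceph_pg_total - ceph_pg_active is only ever non-zero for pool_id="3"
# above (total ticks up to 33 while active stays at 32), so the group_left()
# join with ceph_pool_metadata attaches that pool's metadata to the single
# expected alert.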
289 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
290 name="device_health_metrics",pool_id="1"}'
291 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
292 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
293 name="device_health_metrics",pool_id="2"}'
294 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
295 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
296 name="device_health_metrics",pool_id="3"}'
297 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
298 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
299 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
300 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
301 values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
303 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
304 values: '33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33
306 - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="1"}'
307 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
308 - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="2"}'
309 values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
311 - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="3"}'
312 values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
315 - expr: ceph_pool_metadata * on(pool_id,instance) group_left()
316 (ceph_pg_total - ceph_pg_clean) > 0
319 - labels: '{instance="ceph:9283", job="ceph",
320 name="device_health_metrics", pool_id="3"}'
324 alertname: CephPGsUnclean
329 name: device_health_metrics
330 oid: 1.3.6.1.4.1.50495.1.2.1.7.2
335 summary: One or more placement groups are marked unclean
336 description: "1 PGs have been unclean for more than 15 minutes in pool device_health_metrics. Unclean PGs have not recovered from a previous failure."
341 - series: 'node_filesystem_avail_bytes{device="/dev/mapper/fedora_localhost
342 --live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
344 values: '35336400896 35336400896 35336400896 35336400896 35336400896
345 3525385519.104 3533640089'
346 - series: 'node_filesystem_size_bytes{device="/dev/mapper/fedora_localhost
347 --live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
349 values: '73445531648 73445531648 73445531648 73445531648 73445531648
350 73445531648 73445531648'
352 - expr: node_filesystem_avail_bytes{mountpoint="/"} /
353 node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
356 - labels: '{device="/dev/mapper/fedora_localhost --live-home",
357 fstype="ext4", instance="node-exporter", job="node-exporter",
362 alertname: CephNodeRootFilesystemFull
365 device: /dev/mapper/fedora_localhost --live-home
367 instance: node-exporter
370 oid: 1.3.6.1.4.1.50495.1.2.1.8.1
374 summary: Root filesystem is dangerously full
375 description: "Root volume is dangerously full: 4.811% free."
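# Worked check: the last sample gives 3533640089 / 73445531648 * 100 = 4.811%
# free, which is under the < 5 threshold and matches the "4.811% free" text in
# the rendered description above.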
377 # network packets dropped
380 - series: 'node_network_receive_drop_total{device="eth0",
381 instance="node-exporter",job="node-exporter"}'
383 - series: 'node_network_transmit_drop_total{device="eth0",
384 instance="node-exporter",job="node-exporter"}'
386 - series: 'node_network_receive_packets_total{device="eth0",
387 instance="node-exporter",job="node-exporter"}'
389 - series: 'node_network_transmit_packets_total{device="eth0",
390 instance="node-exporter",job="node-exporter"}'
395 rate(node_network_receive_drop_total{device!="lo"}[1m]) +
396 rate(node_network_transmit_drop_total{device!="lo"}[1m])
398 rate(node_network_receive_packets_total{device!="lo"}[1m]) +
399 rate(node_network_transmit_packets_total{device!="lo"}[1m])
400 ) >= 0.0050000000000000001 and (
401 rate(node_network_receive_drop_total{device!="lo"}[1m]) +
402 rate(node_network_transmit_drop_total{device!="lo"}[1m])
407 - labels: '{device="eth0", instance="node-exporter",
408 job="node-exporter"}'
412 alertname: CephNodeNetworkPacketDrops
416 instance: node-exporter
418 oid: 1.3.6.1.4.1.50495.1.2.1.8.2
422 summary: One or more NICs reports packet drops
423 description: "Node node-exporter experiences packet drop > 0.5% or > 10 packets/s on interface eth0."
425 # network packets errors
428 - series: 'node_network_receive_errs_total{device="eth0",
429 instance="node-exporter",job="node-exporter"}'
431 - series: 'node_network_transmit_errs_total{device="eth0",
432 instance="node-exporter",job="node-exporter"}'
434 - series: 'node_network_transmit_packets_total{device="eth0",
435 instance="node-exporter",job="node-exporter"}'
437 - series: 'node_network_receive_packets_total{device="eth0",
438 instance="node-exporter",job="node-exporter"}'
443 rate(node_network_receive_errs_total{device!="lo"}[1m]) +
444 rate(node_network_transmit_errs_total{device!="lo"}[1m])
446 rate(node_network_receive_packets_total{device!="lo"}[1m]) +
447 rate(node_network_transmit_packets_total{device!="lo"}[1m])
449 rate(node_network_receive_errs_total{device!="lo"}[1m]) +
450 rate(node_network_transmit_errs_total{device!="lo"}[1m])
455 - labels: '{device="eth0", instance="node-exporter",
456 job="node-exporter"}'
460 alertname: CephNodeNetworkPacketErrors
464 instance: node-exporter
466 oid: 1.3.6.1.4.1.50495.1.2.1.8.3
470 summary: One or more NICs reports packet errors
471 description: "Node node-exporter experiences packet errors > 0.01% or > 10 packets/s on interface eth0."
473 # Bond is missing a peer
476 - series: 'node_bonding_active{master="bond0",
477 instance="node-exporter",job="node-exporter"}'
479 - series: 'node_bonding_slaves{master="bond0",
480 instance="node-exporter",job="node-exporter"}'
484 node_bonding_slaves - node_bonding_active != 0
487 - labels: '{master="bond0", instance="node-exporter",
488 job="node-exporter"}'
492 alertname: CephNodeNetworkBondDegraded
496 instance: node-exporter
501 summary: Degraded Bond on Node node-exporter
502 description: "Bond bond0 is degraded on Node node-exporter."
504 # Node Storage disk space filling up
506 # 20GB = 21474836480, 256MB = 268435456
508 - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root",
509 fstype="xfs",instance="node-1",mountpoint="/rootfs"}'
510 values: '21474836480-268435456x48'
511 - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root",
512 fstype="xfs",instance="node-2",mountpoint="/rootfs"}'
513 values: '21474836480+0x48'
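# Series shorthand reminder: 'a+bxN' (or 'a-bxN') expands to N+1 samples that
# start at a and change by b per sample, so node-1 above loses 256MB of free
# space every sample while node-2 stays flat at 20GB for the whole window.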
514 - series: 'node_uname_info{instance="node-1", nodename="node-1.unittests.com"}'
516 - series: 'node_uname_info{instance="node-2", nodename="node-2.unittests.com"}'
520 predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *
521 on(instance) group_left(nodename) node_uname_info < 0
524 - labels: '{device="/dev/mapper/vg-root",instance="node-1",fstype="xfs",
525 mountpoint="/rootfs",nodename="node-1.unittests.com"}'
526 value: -1.912602624E+12
529 alertname: CephNodeDiskspaceWarning
534 oid: 1.3.6.1.4.1.50495.1.2.1.8.4
535 device: /dev/mapper/vg-root
539 nodename: node-1.unittests.com
541 summary: Host filesystem free space is getting low
542 description: "Mountpoint /rootfs on node-1.unittests.com will be full in less than 5 days based on the 48 hour trailing fill rate."
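# Worked check: predict_linear() fits the 2d window of node_filesystem_free_bytes
# and extrapolates 3600 * 24 * 5 seconds (5 days) ahead. node-1's steady decline
# projects well below zero (the -1.912...E+12 expected sample), so only node-1
# appears in exp_alerts; node-2's flat series never goes negative.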
546 - series: 'node_network_mtu_bytes{device="eth0",instance="node-exporter",
547 job="node-exporter"}'
548 values: '1500 1500 1500 1500 1500'
549 - series: 'node_network_mtu_bytes{device="eth1",instance="node-exporter",
550 job="node-exporter"}'
551 values: '1500 1500 1500 1500 1500'
552 - series: 'node_network_mtu_bytes{device="eth2",instance="node-exporter",
553 job="node-exporter"}'
554 values: '1500 1500 1500 1500 1500'
555 - series: 'node_network_mtu_bytes{device="eth3",instance="node-exporter",
556 job="node-exporter"}'
557 values: '1500 1500 1500 1500 1500'
558 - series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter",
559 job="node-exporter"}'
560 values: '9000 9000 9000 9000 9000'
561 - series: 'node_network_mtu_bytes{device="eth4",instance="hostname1",
562 job="node-exporter"}'
563 values: '2200 2200 2200 2200 2200'
564 - series: 'node_network_mtu_bytes{device="eth4",instance="hostname2",
565 job="node-exporter"}'
566 values: '2400 2400 2400 2400 2400'
567 - series: 'node_network_up{device="eth0",instance="node-exporter",
568 job="node-exporter"}'
570 - series: 'node_network_up{device="eth1",instance="node-exporter",
571 job="node-exporter"}'
573 - series: 'node_network_up{device="eth2",instance="node-exporter",
574 job="node-exporter"}'
576 - series: 'node_network_up{device="eth3",instance="node-exporter",
577 job="node-exporter"}'
579 - series: 'node_network_up{device="eth4",instance="node-exporter",
580 job="node-exporter"}'
582 - series: 'node_network_up{device="eth4",instance="hostname1",
583 job="node-exporter"}'
585 - series: 'node_network_up{device="eth4",instance="hostname2",
586 job="node-exporter"}'
590 node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
592 max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
593 quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
596 node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
598 min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
599 quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
603 - labels: '{device="eth4", instance="node-exporter", job="node-exporter"}'
605 - labels: '{device="eth4", instance="hostname1", job="node-exporter"}'
609 alertname: CephNodeInconsistentMTU
618 summary: MTU settings across Ceph hosts are inconsistent
619 description: "Node hostname1 has a different MTU size (2200) than the median of devices named eth4."
622 instance: node-exporter
627 summary: MTU settings across Ceph hosts are inconsistent
628 description: "Node node-exporter has a different MTU size (9000) than the median of devices named eth4."
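# Worked check for eth4: the MTUs in play are 9000 (node-exporter), 2200
# (hostname1) and 2400 (hostname2). quantile(0.5, ...) by (device) gives 2400,
# so the max (9000) and min (2200) branches each differ from that median and
# raise one alert apiece, while hostname2 sits on the median and stays quiet.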
630 # pool full, the data has 6 pool series but the description uses topk(5),
631 # so this checks that the results are limited as expected
634 - series: 'ceph_health_detail{name="POOL_FULL"}'
635 values: '0 0 0 1 1 1 1 1 1 1 1'
636 - series: 'ceph_pool_percent_used{pool_id="1"}'
638 - series: 'ceph_pool_percent_used{pool_id="2"}'
640 - series: 'ceph_pool_percent_used{pool_id="3"}'
642 - series: 'ceph_pool_percent_used{pool_id="4"}'
644 - series: 'ceph_pool_percent_used{pool_id="5"}'
646 - series: 'ceph_pool_percent_used{pool_id="6"}'
648 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
649 name="cephfs_data",pool_id="1"}'
650 values: '1 1 1 1 1 1 1 1 1'
651 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
652 name="rbd",pool_id="2"}'
653 values: '1 1 1 1 1 1 1 1 1'
654 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
655 name="iscsi",pool_id="3"}'
656 values: '1 1 1 1 1 1 1 1 1'
657 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
658 name="default.rgw.index",pool_id="4"}'
659 values: '1 1 1 1 1 1 1 1 1'
660 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
661 name="default.rgw.log",pool_id="5"}'
662 values: '1 1 1 1 1 1 1 1 1'
663 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
664 name="dummy",pool_id="6"}'
665 values: '1 1 1 1 1 1 1 1 1'
667 - expr: ceph_health_detail{name="POOL_FULL"} > 0
670 - labels: '{__name__="ceph_health_detail", name="POOL_FULL"}'
674 alertname: CephPoolFull
676 alertname: CephPoolFull
682 oid: 1.3.6.1.4.1.50495.1.2.1.9.1
684 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full
685 summary: Pool is full - writes are blocked
686 description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) - rbd at 96% - iscsi at 90% - default.rgw.index at 72% - cephfs_data at 32% - default.rgw.log at 19% Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)"
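# Note: six ceph_pool_percent_used series are defined above, but the
# description template uses topk(5), so one pool ("dummy" here) is expected to
# be missing from the rendered breakdown, which is exactly what the string
# above asserts.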
690 - series: 'ceph_healthcheck_slow_ops{instance="ceph:9283",job="ceph"}'
693 - expr: ceph_healthcheck_slow_ops > 0
696 - labels: '{__name__="ceph_healthcheck_slow_ops", instance="ceph:9283",
701 alertname: CephSlowOps
709 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
710 summary: OSD operations are slow to complete
711 description: "1 OSD requests are taking too long to process (osd_op_complaint_time exceeded)"
716 - series: 'ceph_daemon_health_metrics{ceph_daemon="osd.1", instance="ceph:9283",job="ceph", type="SLOW_OPS"}'
719 - expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0'
722 - labels: '{__name__="ceph_daemon_health_metrics", ceph_daemon="osd.1",instance="ceph:9283",
723 job="ceph", type="SLOW_OPS"}'
727 alertname: CephDaemonSlowOps
736 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
737 summary: osd.1 operations are slow to complete
738 description: "osd.1 operations are taking too long to process (complaint time exceeded)"
740 # CEPHADM orchestrator alert triggers
743 - series: 'ceph_health_detail{name="UPGRADE_EXCEPTION"}'
746 - expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0
749 - labels: '{__name__="ceph_health_detail", name="UPGRADE_EXCEPTION"}'
753 alertname: CephadmUpgradeFailed
755 alertname: CephadmUpgradeFailed
758 name: UPGRADE_EXCEPTION
761 oid: 1.3.6.1.4.1.50495.1.2.1.11.2
763 summary: Ceph version upgrade has failed
764 description: "The cephadm cluster upgrade process has failed. The cluster remains in an undetermined state. Please review the cephadm logs to understand the nature of the issue"
767 - series: 'ceph_health_detail{name="CEPHADM_FAILED_DAEMON"}'
770 - expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0
773 - labels: '{__name__="ceph_health_detail", name="CEPHADM_FAILED_DAEMON"}'
777 alertname: CephadmDaemonFailed
779 alertname: CephadmDaemonFailed
782 name: CEPHADM_FAILED_DAEMON
785 oid: 1.3.6.1.4.1.50495.1.2.1.11.1
787 summary: A ceph daemon managed by cephadm is down
788 description: "A daemon managed by cephadm is no longer active. Determine which daemon is down with 'ceph health detail'. You may start daemons with 'ceph orch daemon start <daemon_id>'"
791 - series: 'ceph_health_detail{name="CEPHADM_PAUSED"}'
792 values: '1 1 1 1 1 1 1 1 1'
794 - expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0
797 - labels: '{__name__="ceph_health_detail", name="CEPHADM_PAUSED"}'
801 alertname: CephadmPaused
803 alertname: CephadmPaused
810 documentation: https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused
811 summary: Orchestration tasks via cephadm are PAUSED
812 description: "Cluster management has been paused manually. This will prevent the orchestrator from performing service management and reconciliation. If this is not intentional, resume cephadm operations with 'ceph orch resume'"
816 - series: 'ceph_health_detail{name="MDS_DAMAGE"}'
817 values: '1 1 1 1 1 1 1 1 1'
819 - expr: ceph_health_detail{name="MDS_DAMAGE"} > 0
822 - labels: '{__name__="ceph_health_detail", name="MDS_DAMAGE"}'
826 alertname: CephFilesystemDamaged
828 alertname: CephFilesystemDamaged
834 oid: 1.3.6.1.4.1.50495.1.2.1.5.1
836 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
837 summary: CephFS filesystem is damaged
838 description: "Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support."
841 - series: 'ceph_health_detail{name="MDS_HEALTH_READ_ONLY"}'
842 values: '1 1 1 1 1 1 1 1 1'
844 - expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0
847 - labels: '{__name__="ceph_health_detail", name="MDS_HEALTH_READ_ONLY"}'
851 alertname: CephFilesystemReadOnly
853 alertname: CephFilesystemReadOnly
856 name: MDS_HEALTH_READ_ONLY
859 oid: 1.3.6.1.4.1.50495.1.2.1.5.2
861 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
862 summary: CephFS filesystem in read only mode due to write error(s)
863 description: "The filesystem has switched to READ ONLY due to an unexpected error when writing to the metadata pool. Either analyze the output from the MDS daemon admin socket, or escalate to support."
866 - series: 'ceph_health_detail{name="MDS_ALL_DOWN"}'
867 values: '0 0 1 1 1 1 1 1 1 1 1'
869 - expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0
872 - labels: '{__name__="ceph_health_detail", name="MDS_ALL_DOWN"}'
876 alertname: CephFilesystemOffline
878 alertname: CephFilesystemOffline
884 oid: 1.3.6.1.4.1.50495.1.2.1.5.3
886 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down
887 summary: CephFS filesystem is offline
888 description: "All MDS ranks are unavailable. The MDS daemons managing metadata are down, rendering the filesystem offline."
891 - series: 'ceph_health_detail{name="FS_DEGRADED"}'
892 values: '0 0 1 1 1 1 1 1 1 1 1'
894 - expr: ceph_health_detail{name="FS_DEGRADED"} > 0
897 - labels: '{__name__="ceph_health_detail", name="FS_DEGRADED"}'
901 alertname: CephFilesystemDegraded
903 alertname: CephFilesystemDegraded
909 oid: 1.3.6.1.4.1.50495.1.2.1.5.4
911 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded
912 summary: CephFS filesystem is degraded
913 description: "One or more metadata daemons (MDS ranks) are failed or in a damaged state. At best the filesystem is partially available, at worst the filesystem is completely unusable."
916 - series: 'ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"}'
917 values: '0 0 1 1 1 1 1 1 1 1 1'
919 - expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0
922 - labels: '{__name__="ceph_health_detail", name="MDS_INSUFFICIENT_STANDBY"}'
926 alertname: CephFilesystemInsufficientStandby
928 alertname: CephFilesystemInsufficientStandby
931 name: MDS_INSUFFICIENT_STANDBY
935 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby
936 summary: Ceph filesystem standby daemons too few
937 description: "The minimum number of standby daemons required by standby_count_wanted is greater than the current number of standby daemons. Adjust the standby count or increase the number of MDS daemons."
940 - series: 'ceph_health_detail{name="FS_WITH_FAILED_MDS"}'
941 values: '0 0 1 1 1 1 1 1 1 1 1'
943 - expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0
946 - labels: '{__name__="ceph_health_detail", name="FS_WITH_FAILED_MDS"}'
950 alertname: CephFilesystemFailureNoStandby
952 alertname: CephFilesystemFailureNoStandby
955 name: FS_WITH_FAILED_MDS
958 oid: 1.3.6.1.4.1.50495.1.2.1.5.5
960 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds
961 summary: MDS daemon failed, no further standby available
962 description: "An MDS daemon has failed, leaving only one active rank and no available standby. Investigate the cause of the failure or add a standby MDS."
965 - series: 'ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"}'
966 values: '0 0 1 1 1 1 1 1 1 1 1'
968 - expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0
971 - labels: '{__name__="ceph_health_detail", name="MDS_UP_LESS_THAN_MAX"}'
975 alertname: CephFilesystemMDSRanksLow
977 alertname: CephFilesystemMDSRanksLow
980 name: MDS_UP_LESS_THAN_MAX
984 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max
985 summary: Ceph MDS daemon count is lower than configured
986 description: "The filesystem's 'max_mds' setting defines the number of MDS ranks in the filesystem. The current number of active MDS daemons is less than this value."
990 - series: 'up{job="ceph", instance="ceph-mgr:9283"}'
991 values: '1+0x2 0+0x10'
993 - expr: up{job="ceph"} == 0
996 - labels: '{__name__="up", job="ceph", instance="ceph-mgr:9283"}'
1000 alertname: CephMgrPrometheusModuleInactive
1002 alertname: CephMgrPrometheusModuleInactive
1005 instance: ceph-mgr:9283
1009 oid: 1.3.6.1.4.1.50495.1.2.1.6.2
1011 summary: The mgr/prometheus module is not available
1012 description: "The mgr/prometheus module at ceph-mgr:9283 is unreachable. This could mean that the module has been disabled or the mgr daemon itself is down. Without the mgr/prometheus module, metrics and alerts will no longer function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to determine whether the mgr is active. If the mgr is not active, restart it; otherwise you can determine module status with 'ceph mgr module ls'. If it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'."
1015 - series: 'ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"}'
1016 values: '0+0x2 1+0x20'
1018 - expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1
1021 - labels: '{__name__="ceph_health_detail", name="RECENT_MGR_MODULE_CRASH"}'
1025 alertname: CephMgrModuleCrash
1027 alertname: CephMgrModuleCrash
1030 name: RECENT_MGR_MODULE_CRASH
1033 oid: 1.3.6.1.4.1.50495.1.2.1.6.1
1035 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash
1036 summary: A manager module has recently crashed
1037 description: "One or more mgr modules have crashed and have yet to be acknowledged by an administrator. A crashed module may impact functionality within the cluster. Use the 'ceph crash' command to determine which module has failed, and archive it to acknowledge the failure."
1041 - series: 'ceph_health_detail{name="MON_DISK_CRIT"}'
1042 values: '0+0x2 1+0x10'
1043 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}'
1046 - expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1
1049 - labels: '{__name__="ceph_health_detail", name="MON_DISK_CRIT"}'
1053 alertname: CephMonDiskspaceCritical
1055 alertname: CephMonDiskspaceCritical
1058 name: "MON_DISK_CRIT"
1061 oid: 1.3.6.1.4.1.50495.1.2.1.3.2
1063 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit
1064 summary: Filesystem space on at least one monitor is critically low
1065 description: "The free space available to a monitor's store is critically low. You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem; /var/log and /var/tmp are often culprits. Your monitor hosts are: - ceph-mon-a"
1068 - series: 'ceph_health_detail{name="MON_DISK_LOW"}'
1069 values: '0+0x2 1+0x10'
1070 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}'
1073 - expr: ceph_health_detail{name="MON_DISK_LOW"} == 1
1076 - labels: '{__name__="ceph_health_detail", name="MON_DISK_LOW"}'
1080 alertname: CephMonDiskspaceLow
1082 alertname: CephMonDiskspaceLow
1085 name: "MON_DISK_LOW"
1089 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low
1090 summary: Drive space on at least one monitor is approaching full
1091 description: "The space available to a monitor's store is approaching full (>70% is the default). You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem; /var/log and /var/tmp are often culprits. Your monitor hosts are: - ceph-mon-a"
1094 - series: 'ceph_health_detail{name="MON_CLOCK_SKEW"}'
1095 values: '0+0x2 1+0x10'
1097 - expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1
1100 - labels: '{__name__="ceph_health_detail", name="MON_CLOCK_SKEW"}'
1104 alertname: CephMonClockSkew
1106 alertname: CephMonClockSkew
1109 name: "MON_CLOCK_SKEW"
1113 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew
1114 summary: Clock skew detected among monitors
1115 description: "Ceph monitors rely on closely synchronized time to maintain quorum and cluster consistency. This event indicates that the time on at least one mon has drifted too far from the lead mon. Review cluster status with ceph -s. This will show which monitors are affected. Check the time sync status on each monitor host with 'ceph time-sync-status' and the state and peers of your ntpd or chrony daemon."
1117 # Check 3 mons, one down, quorum at risk
1120 - series: 'ceph_health_detail{name="MON_DOWN"}'
1121 values: '0+0x2 1+0x12'
1122 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}'
1124 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}'
1126 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}'
1127 values: '1+0x2 0+0x12'
1128 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}'
1130 - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}'
1132 - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}'
1135 - expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() (count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1))) == 1
1142 alertname: CephMonDownQuorumAtRisk
1145 alertname: CephMonDownQuorumAtRisk
1150 oid: 1.3.6.1.4.1.50495.1.2.1.3.1
1152 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
1153 summary: Monitor quorum is at risk
1154 description: "Quorum requires a majority of monitors (x 2) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: - mon.c on ceph-mon-3"
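# Worked check: with 3 mons, quorum needs floor(3/2) + 1 = 2 members. Once
# mon.c drops out, count(ceph_mon_quorum_status == 1) == 2, the bool comparison
# in the expression becomes 1, MON_DOWN is 1, and their product satisfies
# the == 1 filter, firing the quorum-at-risk alert for mon.c on ceph-mon-3.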
1155 # check 5 mons, 1 down - warning only
1158 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}'
1160 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}'
1162 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}'
1164 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.d"}'
1166 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.e"}'
1167 values: '1+0x2 0+0x12'
1168 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}'
1170 - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}'
1172 - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}'
1174 - series: 'ceph_mon_metadata{ceph_daemon="mon.d", hostname="ceph-mon-4"}'
1176 - series: 'ceph_mon_metadata{ceph_daemon="mon.e", hostname="ceph-mon-5"}'
1179 - expr: (count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1))
1186 alertname: CephMonDown
1188 alertname: CephMonDown
1194 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
1195 summary: One or more monitors down
1196 description: "You have 1 monitor down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: - mon.e on ceph-mon-5\n"
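# Worked check: with 5 mons and only mon.e out of quorum,
# count(ceph_mon_quorum_status == 0) = 1, which is <= 5 - floor(5/2) + 1 = 4,
# so the expression holds and only the warning-level CephMonDown fires;
# quorum itself is still intact, as the description notes.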
1200 - series: 'ceph_health_detail{name="DEVICE_HEALTH"}'
1201 values: '0+0x2 1+0x10'
1203 - expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1
1206 - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH"}'
1210 alertname: CephDeviceFailurePredicted
1212 alertname: CephDeviceFailurePredicted
1215 name: "DEVICE_HEALTH"
1219 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2
1220 summary: Device(s) predicted to fail soon
1221 description: "The device health module has determined that one or more devices will fail soon. To review device status use 'ceph device ls'. To show a specific device use 'ceph device info <dev id>'. Mark the OSD out so that data may migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace the device, and redeploy the OSD."
1224 - series: 'ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"}'
1225 values: '0+0x2 1+0x10'
1227 - expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1
1230 - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_TOOMANY"}'
1234 alertname: CephDeviceFailurePredictionTooHigh
1236 alertname: CephDeviceFailurePredictionTooHigh
1239 name: "DEVICE_HEALTH_TOOMANY"
1242 oid: 1.3.6.1.4.1.50495.1.2.1.4.7
1244 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany
1245 summary: Too many devices are predicted to fail, unable to resolve
1246 description: "The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availability. Prevent data integrity issues by adding new OSDs so that data may be relocated."
1249 - series: 'ceph_health_detail{name="DEVICE_HEALTH_IN_USE"}'
1250 values: '0+0x2 1+0x10'
1252 - expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1
1255 - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_IN_USE"}'
1259 alertname: CephDeviceFailureRelocationIncomplete
1261 alertname: CephDeviceFailureRelocationIncomplete
1264 name: "DEVICE_HEALTH_IN_USE"
1268 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use
1269 summary: Device failure is predicted, but unable to relocate data
1270 description: "The device health module has determined that one or more devices will fail soon, but the normal process of relocating the data on the device to other OSDs in the cluster is blocked. \nEnsure that the cluster has available free space. It may be necessary to add capacity to the cluster to allow data from the failing device to successfully migrate, or to enable the balancer."
1274 - series: 'ceph_health_detail{name="OSD_HOST_DOWN"}'
1275 values: '0+0x2 1+0x10'
1276 - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
1277 values: '1+0x2 0+0x10'
1278 - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}'
1281 - expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1
1284 - labels: '{__name__="ceph_health_detail", name="OSD_HOST_DOWN"}'
1288 alertname: CephOSDHostDown
1290 alertname: CephOSDHostDown
1293 name: "OSD_HOST_DOWN"
1296 oid: 1.3.6.1.4.1.50495.1.2.1.4.8
1298 summary: An OSD host is offline
1299 description: "The following OSDs are down: - ceph-osd-1 : osd.0"
1302 - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"}'
1303 values: '0+0x2 1+0x20'
1305 - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 0
1308 - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_FRONT"}'
1312 alertname: CephOSDTimeoutsPublicNetwork
1314 alertname: CephOSDTimeoutsPublicNetwork
1317 name: "OSD_SLOW_PING_TIME_FRONT"
1321 summary: Network issues delaying OSD heartbeats (public network)
1322 description: "OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network for latency or loss issues. Use 'ceph health detail' to show the affected OSDs."
1325 - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"}'
1326 values: '0+0x2 1+0x20'
1328 - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 0
1331 - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_BACK"}'
1335 alertname: CephOSDTimeoutsClusterNetwork
1337 alertname: CephOSDTimeoutsClusterNetwork
1340 name: "OSD_SLOW_PING_TIME_BACK"
1344 summary: Network issues delaying OSD heartbeats (cluster network)
1345 description: "OSD heartbeats on the cluster's 'cluster' network (backend) are slow. Investigate the network for latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs."
1348 - series: 'ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"}'
1349 values: '0+0x2 1+0x20'
1351 - expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 0
1354 - labels: '{__name__="ceph_health_detail", name="BLUESTORE_DISK_SIZE_MISMATCH"}'
1358 alertname: CephOSDInternalDiskSizeMismatch
1360 alertname: CephOSDInternalDiskSizeMismatch
1363 name: "BLUESTORE_DISK_SIZE_MISMATCH"
1367 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch
1368 summary: OSD size inconsistency error
1369 description: "One or more OSDs have an internal inconsistency between metadata and the size of the device. This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs."
1372 - series: 'ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"}'
1373 values: '0+0x2 1+0x20'
1375 - expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1
1378 - labels: '{__name__="ceph_health_detail", name="BLUESTORE_SPURIOUS_READ_ERRORS"}'
1382 alertname: CephOSDReadErrors
1384 alertname: CephOSDReadErrors
1387 name: "BLUESTORE_SPURIOUS_READ_ERRORS"
1391 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors
1392 summary: Device read errors detected
1393 description: "An OSD has encountered read errors, but the OSD has recovered by retrying the reads. This may indicate an issue with hardware or the kernel."
1396 - series: 'ceph_health_detail{name="OSD_DOWN"}'
1397 values: '0+0x2 1+0x10'
1398 - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
1400 - series: 'ceph_osd_up{ceph_daemon="osd.1"}'
1401 values: '1+0x2 0+0x10'
1402 - series: 'ceph_osd_up{ceph_daemon="osd.2"}'
1404 - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}'
1406 - series: 'ceph_osd_metadata{ceph_daemon="osd.1", hostname="ceph-osd-2"}'
1408 - series: 'ceph_osd_metadata{ceph_daemon="osd.2", hostname="ceph-osd-3"}'
1411 - expr: ceph_health_detail{name="OSD_DOWN"} == 1
1414 - labels: '{__name__="ceph_health_detail", name="OSD_DOWN"}'
1418 alertname: CephOSDDown
1420 alertname: CephOSDDown
1426 oid: 1.3.6.1.4.1.50495.1.2.1.4.2
1428 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down
1429 summary: An OSD has been marked down
1430 description: "1 OSD down for over 5mins. The following OSD is down: - osd.1 on ceph-osd-2\n"
1433 - series: 'ceph_health_detail{name="OSD_NEARFULL"}'
1434 values: '0+0x2 1+0x10'
1436 - expr: ceph_health_detail{name="OSD_NEARFULL"} == 1
1439 - labels: '{__name__="ceph_health_detail", name="OSD_NEARFULL"}'
1443 alertname: CephOSDNearFull
1445 alertname: CephOSDNearFull
1448 name: "OSD_NEARFULL"
1451 oid: 1.3.6.1.4.1.50495.1.2.1.4.3
1453 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull
1454 summary: OSD(s) running low on free space (NEARFULL)
1455 description: One or more OSDs have reached the NEARFULL threshold. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.
1458 - series: 'ceph_health_detail{name="OSD_FULL"}'
1459 values: '0+0x2 1+0x10'
1461 - expr: ceph_health_detail{name="OSD_FULL"} == 1
1464 - labels: '{__name__="ceph_health_detail", name="OSD_FULL"}'
1468 alertname: CephOSDFull
1470 alertname: CephOSDFull
1476 oid: 1.3.6.1.4.1.50495.1.2.1.4.6
1478 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full
1479 summary: OSD full, writes blocked
1480 description: An OSD has reached the FULL threshold. Writes to pools that share the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.
1483 - series: 'ceph_health_detail{name="OSD_BACKFILLFULL"}'
1484 values: '0+0x2 1+0x10'
1486 - expr: ceph_health_detail{name="OSD_BACKFILLFULL"} == 1
1489 - labels: '{__name__="ceph_health_detail", name="OSD_BACKFILLFULL"}'
1493 alertname: CephOSDBackfillFull
1495 alertname: CephOSDBackfillFull
1498 name: "OSD_BACKFILLFULL"
1502 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull
1503 summary: OSD(s) too full for backfill operations
1504 description: "An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations from completing. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data."
1507 - series: 'ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"}'
1508 values: '0+0x2 1+0x20'
1510 - expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 0
1513 - labels: '{__name__="ceph_health_detail", name="OSD_TOO_MANY_REPAIRS"}'
1517 alertname: CephOSDTooManyRepairs
1519 alertname: CephOSDTooManyRepairs
1522 name: "OSD_TOO_MANY_REPAIRS"
1526 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs
1527 summary: OSD reports a high number of read errors
1528 description: Reads from an OSD have used a secondary PG to return data to the client, indicating a potential failing drive.
1530 # trigger percent full prediction on pools 1 and 2 only
1533 - series: 'ceph_pool_percent_used{pool_id="1", instance="9090"}'
1535 - series: 'ceph_pool_percent_used{pool_id="1", instance="8090"}'
1536 values: '78 89 79 98 78'
1537 - series: 'ceph_pool_percent_used{pool_id="2", instance="9090"}'
1539 - series: 'ceph_pool_percent_used{pool_id="2", instance="8090"}'
1540 values: '22 22 23 23 24'
1541 - series: 'ceph_pool_metadata{pool_id="1" , instance="9090" ,name="rbd",type="replicated"}'
1543 - series: 'ceph_pool_metadata{pool_id="1", instance="8090",name="default.rgw.index",type="replicated"}'
1545 - series: 'ceph_pool_metadata{pool_id="2" , instance="9090" ,name="rbd",type="replicated"}'
1547 - series: 'ceph_pool_metadata{pool_id="2", instance="8090",name="default.rgw.index",type="replicated"}'
1551 (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance)
1552 group_right() ceph_pool_metadata) >= 95
1555 - labels: '{instance="8090",name="default.rgw.index",pool_id="1",type="replicated"}'
1556 value: 1.435E+02 # ~143.5%
1559 alertname: CephPoolGrowthWarning
1563 name: default.rgw.index
1567 oid: 1.3.6.1.4.1.50495.1.2.1.9.2
1569 summary: Pool growth rate may soon exceed capacity
1570 description: Pool 'default.rgw.index' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
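# Worked check: predict_linear() projects each pool's percent-used 5 days
# ahead; the test expects only the instance="8090", pool_id="1" series
# (default.rgw.index) to cross the >= 95 threshold, with a projected value of
# ~143.5%.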
1573 - series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}'
1574 values: '0+0x2 1+0x10'
1576 - expr: ceph_health_detail{name="POOL_BACKFILLFULL"} == 1
1579 - labels: '{__name__="ceph_health_detail", name="POOL_BACKFILLFULL"}'
1583 alertname: CephPoolBackfillFull
1585 alertname: CephPoolBackfillFull
1588 name: "POOL_BACKFILLFULL"
1592 summary: Free space in a pool is too low for recovery/backfill
1593 description: A pool is approaching the near full threshold, which will prevent recovery/backfill operations from completing. Consider adding more capacity.
1597 - series: 'ceph_health_detail{name="POOL_NEAR_FULL"}'
1598 values: '0+0x2 1+0x10'
1600 - expr: ceph_health_detail{name="POOL_NEAR_FULL"} == 1
1603 - labels: '{__name__="ceph_health_detail", name="POOL_NEAR_FULL"}'
1607 alertname: CephPoolNearFull
1609 alertname: CephPoolNearFull
1612 name: "POOL_NEAR_FULL"
1616 summary: One or more Ceph pools are nearly full
1617 description: "A pool has exceeded the warning (percent full) threshold, or OSDs supporting the pool have reached the NEARFULL threshold. Writes may continue, but you are at risk of the pool going read-only if more capacity isn't made available. Determine the affected pool with 'ceph df detail', looking at QUOTA BYTES and STORED. Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>). Also ensure that the balancer is active."
1622 - series: 'ceph_health_detail{name="PG_NOT_SCRUBBED"}'
1623 values: '0+0x2 1+0x10'
1625 - expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1
1628 - labels: '{__name__="ceph_health_detail", name="PG_NOT_SCRUBBED"}'
1632 alertname: CephPGNotScrubbed
1634 alertname: CephPGNotScrubbed
1637 name: "PG_NOT_SCRUBBED"
1641 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed
1642 summary: Placement group(s) have not been scrubbed
1643 description: "One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, protecting against bit-rot. They check that metadata is consistent across data replicas. When PGs miss their scrub interval, it may indicate that the scrub window is too small, or PGs were not in a 'clean' state during the scrub window. You can manually initiate a scrub with: ceph pg scrub <pgid>"
1646 - series: 'ceph_health_detail{name="PG_DAMAGED"}'
1647 values: '0+0x4 1+0x20'
1649 - expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1
1652 - labels: '{__name__="ceph_health_detail", name="PG_DAMAGED"}'
1656 alertname: CephPGsDamaged
1658 alertname: CephPGsDamaged
1664 oid: 1.3.6.1.4.1.50495.1.2.1.7.4
1666 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged
1667 summary: Placement group damaged, manual intervention needed
1668 description: During data consistency checks (scrub), at least one PG has been flagged as being damaged or inconsistent. Check to see which PG is affected, and attempt a manual repair if necessary. To list problematic placement groups, use 'rados list-inconsistent-pg <pool>'. To repair PGs use the 'ceph pg repair <pg_num>' command.
1671 - series: 'ceph_health_detail{name="TOO_MANY_PGS"}'
1672 values: '0+0x4 1+0x20'
1674 - expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1
1677 - labels: '{__name__="ceph_health_detail", name="TOO_MANY_PGS"}'
1681 alertname: CephPGsHighPerOSD
1683 alertname: CephPGsHighPerOSD
1686 name: "TOO_MANY_PGS"
1690 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs
1691 summary: Placement groups per OSD is too high
1692 description: "The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).\n Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools."
1695 - series: 'ceph_health_detail{name="PG_RECOVERY_FULL"}'
1696 values: '0+0x2 1+0x20'
1698 - expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 0
1701 - labels: '{__name__="ceph_health_detail", name="PG_RECOVERY_FULL"}'
1705 alertname: CephPGRecoveryAtRisk
1707 alertname: CephPGRecoveryAtRisk
1710 name: "PG_RECOVERY_FULL"
1713 oid: 1.3.6.1.4.1.50495.1.2.1.7.5
1715 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full
1716 summary: OSDs are too full for recovery
1717 description: Data redundancy is at risk since one or more OSDs are at or above the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, or delete unwanted data.
1720 - series: 'ceph_health_detail{name="PG_BACKFILL_FULL"}'
1721 values: '0+0x2 1+0x20'
1723 - expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 0
1726 - labels: '{__name__="ceph_health_detail", name="PG_BACKFILL_FULL"}'
1730 alertname: CephPGBackfillAtRisk
1732 alertname: CephPGBackfillAtRisk
1735 name: "PG_BACKFILL_FULL"
1738 oid: 1.3.6.1.4.1.50495.1.2.1.7.6
1740 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full
1741 summary: Backfill operations are blocked due to lack of free space
1742 description: Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add more capacity, or delete unwanted data.
1745 - series: 'ceph_health_detail{name="PG_AVAILABILITY"}'
1746 values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1'
1747 - series: 'ceph_health_detail{name="OSD_DOWN"}'
1748 values: '0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0'
1750 - expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"}))
1755 # PG_AVAILABILITY and OSD_DOWN not firing .. no alert
1757 alertname: CephPGUnavilableBlockingIO
1759 # PG_AVAILABILITY firing, but osd_down is active .. no alert
1761 alertname: CephPGUnavilableBlockingIO
1763 # PG_AVAILABILITY firing, AND OSD_DOWN is not active...raise the alert
1765 alertname: CephPGUnavilableBlockingIO
1768 name: "PG_AVAILABILITY"
1771 oid: 1.3.6.1.4.1.50495.1.2.1.7.3
1773 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability
1774 summary: PG is unavailable, blocking I/O
1775 description: Data availability is reduced, impacting the cluster's ability to service I/O. One or more placement groups (PGs) are in a state that blocks I/O.
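# Note on the expression: subtracting scalar(OSD_DOWN) means the result is 1
# only while PG_AVAILABILITY is raised and OSD_DOWN is clear; while an OSD is
# known to be down the subtraction yields 0 and the alert stays suppressed,
# which is exactly what the three exp_alerts checks above assert.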
1778 - series: 'ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"}'
1779 values: '0+0x2 1+0x10'
1781 - expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1
1784 - labels: '{__name__="ceph_health_detail", name="PG_NOT_DEEP_SCRUBBED"}'
1788 alertname: CephPGNotDeepScrubbed
1790 alertname: CephPGNotDeepScrubbed
1793 name: "PG_NOT_DEEP_SCRUBBED"
1797 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed
1798 summary: Placement group(s) have not been deep scrubbed
1799 description: One or more PGs have not been deep scrubbed recently. Deep scrubs protect against bit-rot. They compare data replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate that the window is too small or PGs were not in a 'clean' state during the deep-scrub window.
1804 - series: 'up{job="myjob"}'
1807 - expr: absent(up{job="ceph"})
1810 - labels: '{job="ceph"}'
1814 alertname: PrometheusJobMissing
1820 oid: 1.3.6.1.4.1.50495.1.2.1.12.1
1822 summary: The scrape job for Ceph is missing from Prometheus
1823 description: The prometheus job that scrapes from Ceph is no longer defined; this effectively means you will have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance.
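# Note: the only input series carries job="myjob", so absent(up{job="ceph"})
# returns a single 1-valued sample labelled {job="ceph"} (the exp_samples
# entry above) and PrometheusJobMissing fires.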
1827 - series: 'ceph_health_detail{name="OBJECT_UNFOUND"}'
1828 values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1829 - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
1830 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1831 - series: 'ceph_osd_up{ceph_daemon="osd.1"}'
1832 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1833 - series: 'ceph_osd_up{ceph_daemon="osd.2"}'
1834 values: '1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1835 - series: 'ceph_osd_metadata{ceph_daemon="osd.0"}'
1836 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1837 - series: 'ceph_osd_metadata{ceph_daemon="osd.1"}'
1838 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1839 - series: 'ceph_osd_metadata{ceph_daemon="osd.2"}'
1840 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1842 - expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1
1846 # OBJECT_UNFOUND but osd.2 is down, so don't fire
1848 alertname: CephObjectMissing
1850 # OBJECT_UNFOUND and all OSDs are online, so fire
1852 alertname: CephObjectMissing
1857 oid: 1.3.6.1.4.1.50495.1.2.1.10.1
1859 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound
1860 summary: Object(s) marked UNFOUND
1861 description: The latest version of a RADOS object can not be found, even though all OSDs are up. I/O requests for this object from clients will block (hang). Resolving this issue may require the object to be rolled back to a prior version manually, and manually verified.
1865 - series: 'ceph_health_detail{name="RECENT_CRASH"}'
1866 values: '0 0 0 1 1 1 1 1 1 1 1'
1868 - expr: ceph_health_detail{name="RECENT_CRASH"} == 1
1874 alertname: CephDaemonCrash
1878 alertname: CephDaemonCrash
1884 oid: 1.3.6.1.4.1.50495.1.2.1.1.2
1886 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash
1887 summary: One or more Ceph daemons have crashed, and are pending acknowledgement
1888 description: One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive <id>' command.