1 rule_files:
2 - ../prometheus_alerts.yml
3 evaluation_interval: 5m
4 tests:
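# How these tests work (promtool 'test rules' format): each entry under 'tests'
# feeds synthetic samples via 'input_series' (one sample per 'interval', starting
# at t=0), 'promql_expr_test' evaluates an expression at 'eval_time' and compares
# it with 'exp_samples', and 'alert_rule_test' lists the alerts expected to be
# firing at 'eval_time' (an entry with no 'exp_alerts' asserts the alert is NOT
# firing yet). Values may use promtool's expanding notation 'a+bxN', which
# unrolls to N+1 samples, for example:
#   values: '1+0x12'    # 1 1 1 ... (13 samples, one per interval)
#   values: '0+600x10'  # 0 600 1200 ... 6000 (a counter growing 600 per interval)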
5 # health error
6 - interval: 5m
7 input_series:
8 - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
9 values: '2 2 2 2 2 2 2'
10 promql_expr_test:
11 - expr: ceph_health_status == 2
12 eval_time: 5m
13 exp_samples:
14 - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
15 value: 2
16 alert_rule_test:
17 - eval_time: 1m
18 alertname: CephHealthError
19 - eval_time: 6m
20 alertname: CephHealthError
21 exp_alerts:
22 - exp_labels:
23 instance: ceph:9283
24 job: ceph
25 oid: 1.3.6.1.4.1.50495.1.2.1.2.1
26 type: ceph_default
27 severity: critical
28 exp_annotations:
29 summary: Ceph is in the ERROR state
30 description: The cluster state has been HEALTH_ERROR for more than 5 minutes. Please check 'ceph health detail' for more information.
31
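# The two eval_time checks above follow a pattern used throughout this file: the
# alert has a hold ('for') period, so at 1m the rule has not fired yet (no
# exp_alerts), while at 6m, with ceph_health_status still 2, CephHealthError is
# expected to be active with the labels and annotations listed. The 5m hold is
# implied by the description text ("for more than 5 minutes"); the authoritative
# value lives in ../prometheus_alerts.yml.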
32 # health warning
33 - interval: 5m
34 input_series:
35 - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
36 values: '1 1 1 1 1 1 1 1 1 1'
37 promql_expr_test:
38 - expr: ceph_health_status == 1
39 eval_time: 15m
40 exp_samples:
41 - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
42 value: 1
43 alert_rule_test:
44 - eval_time: 10m
45 alertname: CephHealthWarning
46 - eval_time: 20m
47 alertname: CephHealthWarning
48 exp_alerts:
49 - exp_labels:
50 instance: ceph:9283
51 job: ceph
52 type: ceph_default
53 severity: warning
54 exp_annotations:
55 summary: Ceph is in the WARNING state
56 description: The cluster state has been HEALTH_WARN for more than 15 minutes. Please check 'ceph health detail' for more information.
57
58 # 10% OSDs down
59 - interval: 1m
60 input_series:
61 - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
62 values: '1 1 1 1 1'
63 - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
64 values: '0 0 0 0 0'
65 - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
66 values: '1 1 1 1 1'
67 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
68 ceph_version="ceph version 17.0.0-189-g3558fd72
69 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
70 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
71 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
72 public_addr="172.20.0.2"}'
73 values: '1 1 1 1 1'
74 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
75 ceph_version="ceph version 17.0.0-189-g3558fd72
76 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
77 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
78 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
79 public_addr="172.20.0.2"}'
80 values: '1 1 1 1 1'
81 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
82 ceph_version="ceph version 17.0.0-189-g3558fd72
83 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
84 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
85 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
86 public_addr="172.20.0.2"}'
87 values: '1 1 1 1 1'
88 promql_expr_test:
89 - expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
90 eval_time: 1m
91 exp_samples:
92 - labels: '{}'
93 value: 3.333333333333333E+01
94 alert_rule_test:
95 - eval_time: 1m
96 alertname: CephOSDDownHigh
97 exp_alerts:
98 - exp_labels:
99 oid: 1.3.6.1.4.1.50495.1.2.1.4.1
100 type: ceph_default
101 severity: critical
102 exp_annotations:
103 summary: More than 10% of OSDs are down
104 description: "33.33% or 1 of 3 OSDs are down (>= 10%). The following OSDs are down: - osd.1 on ceph"
105
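# Worked example for the CephOSDDownHigh expression above: with osd.1 reporting
# ceph_osd_up == 0 and osd.0/osd.2 up, the ratio is
#   count(ceph_osd_up == 0) / count(ceph_osd_up) * 100  =  1 / 3 * 100  ≈  33.33
# which is why exp_samples expects 3.333...E+01 and the alert already fires at
# the 1m check (33.33 >= 10). The count() aggregations drop all labels, hence the
# empty '{}' label set on the sample.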
106 # flapping OSD
107 - interval: 1s
108 input_series:
109 - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
110 values: '1+1x100'
111 - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
112 values: '1+0x100'
113 - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
114 values: '1+0x100'
115 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
116 ceph_version="ceph version 17.0.0-189-g3558fd72
117 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
118 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
119 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
120 public_addr="172.20.0.2"}'
121 values: '1 1 1 1 1 1'
122 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
123 ceph_version="ceph version 17.0.0-189-g3558fd72
124 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
125 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
126 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
127 public_addr="172.20.0.2"}'
128 values: '1 1 1 1 1 1'
129 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
130 ceph_version="ceph version 17.0.0-189-g3558fd72
131 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
132 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
133 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
134 public_addr="172.20.0.2"}'
135 values: '1 1 1 1 1 1'
136 promql_expr_test:
137 - expr: |
138 (
139 rate(ceph_osd_up[5m])
140 * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
141 ) * 60 > 1
142 eval_time: 1m
143 exp_samples:
144 - labels: '{ceph_daemon="osd.0", hostname="ceph", instance="ceph:9283",
145 job="ceph"}'
146 value: 1.2200000000000001E+01
147 alert_rule_test:
148 - eval_time: 5m
149 alertname: CephOSDFlapping
150 exp_alerts:
151 - exp_labels:
152 ceph_daemon: osd.0
153 hostname: ceph
154 instance: ceph:9283
155 job: ceph
156 oid: 1.3.6.1.4.1.50495.1.2.1.4.4
157 severity: warning
158 type: ceph_default
159 exp_annotations:
160 documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds
161 summary: Network issues are causing OSDs to flap (mark each other down)
162 description: "OSD osd.0 on ceph was marked down and back up 20.1 times once a minute for 5 minutes. This may indicate a network issue (latency, packet loss, MTU mismatch) on the cluster network, or the public network if no cluster network is deployed. Check the network stats on the listed host(s)."
163
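# The flapping case drives osd.0 with 'values: 1+1x100' at a 1s interval, so the
# series changes every second and rate(ceph_osd_up[5m]) * 60 comes out well above
# the "more than one status change per minute" threshold, while osd.1 and osd.2
# stay constant ('1+0x100') and produce a zero rate. The join
#   * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
# is only there to pull the 'hostname' label onto the result, which is why the
# fired alert carries hostname: ceph.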
164 # high pg count deviation
165 - interval: 1m
166 input_series:
167 - series: 'ceph_osd_numpg{ceph_daemon="osd.0",instance="ceph:9283",
168 job="ceph"}'
169 values: '100 100 100 100 100 160'
170 - series: 'ceph_osd_numpg{ceph_daemon="osd.1",instance="ceph:9283",
171 job="ceph"}'
172 values: '100 100 100 100 100 320'
173 - series: 'ceph_osd_numpg{ceph_daemon="osd.2",instance="ceph:9283",
174 job="ceph"}'
175 values: '100 100 100 100 100 160'
176 - series: 'ceph_osd_numpg{ceph_daemon="osd.3",instance="ceph:9283",
177 job="ceph"}'
178 values: '100 100 100 100 100 160'
179 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
180 ceph_version="ceph version 17.0.0-189-g3558fd72
181 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
182 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
183 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
184 public_addr="172.20.0.2"}'
185 values: '1 1 1 1 1 1'
186 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
187 ceph_version="ceph version 17.0.0-189-g3558fd72
188 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
189 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
190 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
191 public_addr="172.20.0.2"}'
192 values: '1 1 1 1 1 1'
193 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
194 ceph_version="ceph version 17.0.0-189-g3558fd72
195 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
196 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
197 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
198 public_addr="172.20.0.2"}'
199 values: '1 1 1 1 1 1'
200 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.3",
201 ceph_version="ceph version 17.0.0-189-g3558fd72
202 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
203 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
204 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
205 public_addr="172.20.0.2"}'
206 values: '1 1 1 1 1 1'
207 promql_expr_test:
208 - expr: |
209 abs(
210 (
211 (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0)
212 by (job)
213 ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
214 ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
215
216 eval_time: 5m
217 exp_samples:
218 - labels: '{ceph_daemon="osd.1", hostname="ceph", instance="ceph:9283",
219 job="ceph"}'
220 value: 6E-01
221 alert_rule_test:
222 - eval_time: 10m
223 alertname: CephPGImbalance
224 exp_alerts:
225 - exp_labels:
226 ceph_daemon: osd.1
227 hostname: ceph
228 instance: ceph:9283
229 job: ceph
230 oid: 1.3.6.1.4.1.50495.1.2.1.4.5
231 severity: warning
232 type: ceph_default
233 exp_annotations:
234 summary: PGs are not balanced across OSDs
235 description: "OSD osd.1 on ceph deviates by more than 30% from average PG count."
236
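# Worked example for CephPGImbalance: at eval_time 5m the per-OSD PG counts are
# 160, 320, 160, 160, so the average is 200 and osd.1 deviates by
#   abs((320 - 200) / 200) = 0.6
# which matches the expected sample value 6E-01 and exceeds the 0.30 threshold.
# As above, the ceph_osd_metadata join only contributes the hostname label.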
237 # pgs inactive
238 - interval: 1m
239 input_series:
240 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
241 name="device_health_metrics",pool_id="1"}'
242 values: '1 1 1 1 1 1 1 1'
243 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
244 name="device_health_metrics",pool_id="2"}'
245 values: '1 1 1 1 1 1 1 1'
246 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
247 name="device_health_metrics",pool_id="3"}'
248 values: '1 1 1 1 1 1 1 1'
249 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
250 values: '1 1 1 1 1 1 1 1'
251 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
252 values: '32 32 32 32 32 32 32 32'
253 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
254 values: '33 32 32 32 32 33 33 32'
255 - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="1"}'
256 values: '1 1 1 1 1 1 1 1 1'
257 - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="2"}'
258 values: '32 32 32 32 32 32 32 32'
259 - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="3"}'
260 values: '32 32 32 32 32 32 32 32'
261 promql_expr_test:
262 - expr: ceph_pool_metadata * on(pool_id,instance) group_left()
263 (ceph_pg_total - ceph_pg_active) > 0
264 eval_time: 5m
265 exp_samples:
266 - labels: '{instance="ceph:9283", job="ceph",
267 name="device_health_metrics",
268 pool_id="3"}'
269 value: 1
270 alert_rule_test:
271 - eval_time: 5m
272 alertname: CephPGsInactive
273 exp_alerts:
274 - exp_labels:
275 instance: ceph:9283
276 job: ceph
277 name: device_health_metrics
278 oid: 1.3.6.1.4.1.50495.1.2.1.7.1
279 pool_id: 3
280 severity: critical
281 type: ceph_default
282 exp_annotations:
283 summary: One or more placement groups are inactive
284 description: "1 PGs have been inactive for more than 5 minutes in pool device_health_metrics. Inactive placement groups are not able to serve read/write requests."
285
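# CephPGsInactive works by joining pool metadata onto the PG gap: the expression
#   ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active)
# yields one series per pool whose value is the number of non-active PGs, carrying
# the pool's 'name' label. Only pool_id="3" has ceph_pg_total (33) above
# ceph_pg_active (32) at 5m, giving the single expected sample with value 1.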
286 # pgs unclean
287 - interval: 1m
288 input_series:
289 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
290 name="device_health_metrics",pool_id="1"}'
291 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
292 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
293 name="device_health_metrics",pool_id="2"}'
294 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
295 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
296 name="device_health_metrics",pool_id="3"}'
297 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
298 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
299 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
300 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
301 values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
302 32 32 32'
303 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
304 values: '33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33
305 33 33'
306 - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="1"}'
307 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
308 - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="2"}'
309 values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
310 32 32'
311 - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="3"}'
312 values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
313 32 32'
314 promql_expr_test:
315 - expr: ceph_pool_metadata * on(pool_id,instance) group_left()
316 (ceph_pg_total - ceph_pg_clean) > 0
317 eval_time: 15m
318 exp_samples:
319 - labels: '{instance="ceph:9283", job="ceph",
320 name="device_health_metrics", pool_id="3"}'
321 value: 1
322 alert_rule_test:
323 - eval_time: 16m
324 alertname: CephPGsUnclean
325 exp_alerts:
326 - exp_labels:
327 instance: ceph:9283
328 job: ceph
329 name: device_health_metrics
330 oid: 1.3.6.1.4.1.50495.1.2.1.7.2
331 pool_id: 3
332 severity: warning
333 type: ceph_default
334 exp_annotations:
335 summary: One or more placement groups are marked unclean
336 description: "1 PGs have been unclean for more than 15 minutes in pool device_health_metrics. Unclean PGs have not recovered from a previous failure."
337
338 # root volume full
339 - interval: 1m
340 input_series:
341 - series: 'node_filesystem_avail_bytes{device="/dev/mapper/fedora_localhost
342 --live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
343 mountpoint="/"}'
344 values: '35336400896 35336400896 35336400896 35336400896 35336400896
345 3525385519.104 3533640089'
346 - series: 'node_filesystem_size_bytes{device="/dev/mapper/fedora_localhost
347 --live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
348 mountpoint="/"}'
349 values: '73445531648 73445531648 73445531648 73445531648 73445531648
350 73445531648 73445531648'
351 promql_expr_test:
352 - expr: node_filesystem_avail_bytes{mountpoint="/"} /
353 node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
354 eval_time: 5m
355 exp_samples:
356 - labels: '{device="/dev/mapper/fedora_localhost --live-home",
357 fstype="ext4", instance="node-exporter", job="node-exporter",
358 mountpoint="/"}'
359 value: 4.8E+00
360 alert_rule_test:
361 - eval_time: 10m
362 alertname: CephNodeRootFilesystemFull
363 exp_alerts:
364 - exp_labels:
365 device: /dev/mapper/fedora_localhost --live-home
366 fstype: ext4
367 instance: node-exporter
368 job: node-exporter
369 mountpoint: /
370 oid: 1.3.6.1.4.1.50495.1.2.1.8.1
371 severity: critical
372 type: ceph_default
373 exp_annotations:
374 summary: Root filesystem is dangerously full
375 description: "Root volume is dangerously full: 4.811% free."
376
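# Worked example for CephNodeRootFilesystemFull: at 5m the available bytes are
# exactly 4.8% of the filesystem size
#   3525385519.104 / 73445531648 * 100 = 4.8
# matching the expected sample 4.8E+00 and falling below the 5% threshold; by the
# 10m alert check the last sample (3533640089 bytes) works out to roughly 4.811%
# free, which is the figure rendered into the description.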
377 # network packets dropped
378 - interval: 1m
379 input_series:
380 - series: 'node_network_receive_drop_total{device="eth0",
381 instance="node-exporter",job="node-exporter"}'
382 values: '0+600x10'
383 - series: 'node_network_transmit_drop_total{device="eth0",
384 instance="node-exporter",job="node-exporter"}'
385 values: '0+600x10'
386 - series: 'node_network_receive_packets_total{device="eth0",
387 instance="node-exporter",job="node-exporter"}'
388 values: '0+750x10'
389 - series: 'node_network_transmit_packets_total{device="eth0",
390 instance="node-exporter",job="node-exporter"}'
391 values: '0+750x10'
392 promql_expr_test:
393 - expr: |
394 (
395 rate(node_network_receive_drop_total{device!="lo"}[1m]) +
396 rate(node_network_transmit_drop_total{device!="lo"}[1m])
397 ) / (
398 rate(node_network_receive_packets_total{device!="lo"}[1m]) +
399 rate(node_network_transmit_packets_total{device!="lo"}[1m])
400 ) >= 0.0050000000000000001 and (
401 rate(node_network_receive_drop_total{device!="lo"}[1m]) +
402 rate(node_network_transmit_drop_total{device!="lo"}[1m])
403 ) >= 10
404
405 eval_time: 5m
406 exp_samples:
407 - labels: '{device="eth0", instance="node-exporter",
408 job="node-exporter"}'
409 value: 8E-1
410 alert_rule_test:
411 - eval_time: 5m
412 alertname: CephNodeNetworkPacketDrops
413 exp_alerts:
414 - exp_labels:
415 device: eth0
416 instance: node-exporter
417 job: node-exporter
418 oid: 1.3.6.1.4.1.50495.1.2.1.8.2
419 severity: warning
420 type: ceph_default
421 exp_annotations:
422 summary: One or more NICs reports packet drops
423 description: "Node node-exporter experiences packet drop > 0.5% or > 10 packets/s on interface eth0."
424
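# Worked example for CephNodeNetworkPacketDrops: with a 1m interval, '0+600x10'
# models a drop counter growing by 600/min (10 drops/s) and '0+750x10' a packet
# counter growing by 750/min, so the rx+tx ratio is
#   (600 + 600) / (750 + 750) = 0.8
# (hence the expected sample 8E-1), and the combined rx+tx drop rate of 20/s
# satisfies the second condition (>= 10), so both halves of the 'and' hold and
# the alert fires at 5m.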
425 # network packets errors
426 - interval: 1m
427 input_series:
428 - series: 'node_network_receive_errs_total{device="eth0",
429 instance="node-exporter",job="node-exporter"}'
430 values: '0+600x10'
431 - series: 'node_network_transmit_errs_total{device="eth0",
432 instance="node-exporter",job="node-exporter"}'
433 values: '0+600x10'
434 - series: 'node_network_transmit_packets_total{device="eth0",
435 instance="node-exporter",job="node-exporter"}'
436 values: '0+750x10'
437 - series: 'node_network_receive_packets_total{device="eth0",
438 instance="node-exporter",job="node-exporter"}'
439 values: '0+750x10'
440 promql_expr_test:
441 - expr: |
442 (
443 rate(node_network_receive_errs_total{device!="lo"}[1m]) +
444 rate(node_network_transmit_errs_total{device!="lo"}[1m])
445 ) / (
446 rate(node_network_receive_packets_total{device!="lo"}[1m]) +
447 rate(node_network_transmit_packets_total{device!="lo"}[1m])
448 ) >= 0.0001 or (
449 rate(node_network_receive_errs_total{device!="lo"}[1m]) +
450 rate(node_network_transmit_errs_total{device!="lo"}[1m])
451 ) >= 10
452
453 eval_time: 5m
454 exp_samples:
455 - labels: '{device="eth0", instance="node-exporter",
456 job="node-exporter"}'
457 value: 8E-01
458 alert_rule_test:
459 - eval_time: 5m
460 alertname: CephNodeNetworkPacketErrors
461 exp_alerts:
462 - exp_labels:
463 device: eth0
464 instance: node-exporter
465 job: node-exporter
466 oid: 1.3.6.1.4.1.50495.1.2.1.8.3
467 severity: warning
468 type: ceph_default
469 exp_annotations:
470 summary: One or more NICs reports packet errors
471 description: "Node node-exporter experiences packet errors > 0.01% or > 10 packets/s on interface eth0."
472
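# The packet-errors case mirrors the drops case above with the same synthetic
# rates (ratio 0.8, 20 errors/s), but note that this rule combines its two
# conditions with 'or' and uses a much lower ratio threshold (0.0001, i.e. 0.01%),
# so either a high error ratio or a high absolute error rate is enough to trigger
# CephNodeNetworkPacketErrors.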
473 # Bond is missing a peer
474 - interval: 1m
475 input_series:
476 - series: 'node_bonding_active{master="bond0",
477 instance="node-exporter",job="node-exporter"}'
478 values: '3'
479 - series: 'node_bonding_slaves{master="bond0",
480 instance="node-exporter",job="node-exporter"}'
481 values: '4'
482 promql_expr_test:
483 - expr: |
484 node_bonding_slaves - node_bonding_active != 0
485 eval_time: 5m
486 exp_samples:
487 - labels: '{master="bond0", instance="node-exporter",
488 job="node-exporter"}'
489 value: 1
490 alert_rule_test:
491 - eval_time: 5m
492 alertname: CephNodeNetworkBondDegraded
493 exp_alerts:
494 - exp_labels:
495 master: bond0
496 instance: node-exporter
497 job: node-exporter
498 severity: warning
499 type: ceph_default
500 exp_annotations:
501 summary: Degraded Bond on Node node-exporter
502 description: "Bond bond0 is degraded on Node node-exporter."
503
504 # Node Storage disk space filling up
505 - interval: 1m
506 # 20 GiB = 21474836480 bytes, 256 MiB = 268435456 bytes
507 input_series:
508 - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root",
509 fstype="xfs",instance="node-1",mountpoint="/rootfs"}'
510 values: '21474836480-268435456x48'
511 - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root",
512 fstype="xfs",instance="node-2",mountpoint="/rootfs"}'
513 values: '21474836480+0x48'
514 - series: 'node_uname_info{instance="node-1", nodename="node-1.unittests.com"}'
515 values: 1+0x48
516 - series: 'node_uname_info{instance="node-2", nodename="node-2.unittests.com"}'
517 values: 1+0x48
518 promql_expr_test:
519 - expr: |
520 predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *
521 on(instance) group_left(nodename) node_uname_info < 0
522 eval_time: 5m
523 exp_samples:
524 - labels: '{device="/dev/mapper/vg-root",instance="node-1",fstype="xfs",
525 mountpoint="/rootfs",nodename="node-1.unittests.com"}'
526 value: -1.912602624E+12
527 alert_rule_test:
528 - eval_time: 5m
529 alertname: CephNodeDiskspaceWarning
530 exp_alerts:
531 - exp_labels:
532 severity: warning
533 type: ceph_default
534 oid: 1.3.6.1.4.1.50495.1.2.1.8.4
535 device: /dev/mapper/vg-root
536 fstype: xfs
537 instance: node-1
538 mountpoint: /rootfs
539 nodename: node-1.unittests.com
540 exp_annotations:
541 summary: Host filesystem free space is getting low
542 description: "Mountpoint /rootfs on node-1.unittests.com will be full in less than 5 days based on the 48 hour trailing fill rate."
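# The diskspace prediction case feeds node-1 a filesystem that starts with
# 20 GiB free and loses 256 MiB every minute ('21474836480-268435456x48'), while
# node-2 stays flat. predict_linear() extrapolates that trend 5 days ahead
# (3600 * 24 * 5 seconds), which is deeply negative for node-1 (the expected
# sample is about -1.91e12 bytes), so only node-1 raises CephNodeDiskspaceWarning;
# the node_uname_info join simply attaches the nodename used in the description.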
543 # MTU Mismatch
544 - interval: 1m
545 input_series:
546 - series: 'node_network_mtu_bytes{device="eth0",instance="node-exporter",
547 job="node-exporter"}'
548 values: '1500 1500 1500 1500 1500'
549 - series: 'node_network_mtu_bytes{device="eth1",instance="node-exporter",
550 job="node-exporter"}'
551 values: '1500 1500 1500 1500 1500'
552 - series: 'node_network_mtu_bytes{device="eth2",instance="node-exporter",
553 job="node-exporter"}'
554 values: '1500 1500 1500 1500 1500'
555 - series: 'node_network_mtu_bytes{device="eth3",instance="node-exporter",
556 job="node-exporter"}'
557 values: '1500 1500 1500 1500 1500'
558 - series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter",
559 job="node-exporter"}'
560 values: '9000 9000 9000 9000 9000'
561 - series: 'node_network_mtu_bytes{device="eth4",instance="hostname1",
562 job="node-exporter"}'
563 values: '2200 2200 2200 2200 2200'
564 - series: 'node_network_mtu_bytes{device="eth4",instance="hostname2",
565 job="node-exporter"}'
566 values: '2400 2400 2400 2400 2400'
567 - series: 'node_network_up{device="eth0",instance="node-exporter",
568 job="node-exporter"}'
569 values: '0 0 0 0 0'
570 - series: 'node_network_up{device="eth1",instance="node-exporter",
571 job="node-exporter"}'
572 values: '0 0 0 0 0'
573 - series: 'node_network_up{device="eth2",instance="node-exporter",
574 job="node-exporter"}'
575 values: '1 1 1 1 1'
576 - series: 'node_network_up{device="eth3",instance="node-exporter",
577 job="node-exporter"}'
578 values: '1 1 1 1 1'
579 - series: 'node_network_up{device="eth4",instance="node-exporter",
580 job="node-exporter"}'
581 values: '1 1 1 1 1'
582 - series: 'node_network_up{device="eth4",instance="hostname1",
583 job="node-exporter"}'
584 values: '1 1 1 1 1'
585 - series: 'node_network_up{device="eth4",instance="hostname2",
586 job="node-exporter"}'
587 values: '0 0 0 0 0'
588 promql_expr_test:
589 - expr: |
590 node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
591 scalar(
592 max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
593 quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
594 )
595 or
596 node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
597 scalar(
598 min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
599 quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
600 )
601 eval_time: 1m
602 exp_samples:
603 - labels: '{device="eth4", instance="node-exporter", job="node-exporter"}'
604 value: 9000
605 - labels: '{device="eth4", instance="hostname1", job="node-exporter"}'
606 value: 2200
607 alert_rule_test:
608 - eval_time: 1m
609 alertname: CephNodeInconsistentMTU
610 exp_alerts:
611 - exp_labels:
612 device: eth4
613 instance: hostname1
614 job: node-exporter
615 severity: warning
616 type: ceph_default
617 exp_annotations:
618 summary: MTU settings across Ceph hosts are inconsistent
619 description: "Node hostname1 has a different MTU size (2200) than the median of devices named eth4."
620 - exp_labels:
621 device: eth4
622 instance: node-exporter
623 job: node-exporter
624 severity: warning
625 type: ceph_default
626 exp_annotations:
627 summary: MTU settings across Ceph hosts are inconsistent
628 description: "Node node-exporter has a different MTU size (9000) than the median of devices named eth4."
629
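# The MTU case models four NICs (eth0-eth3) on node-exporter at 1500 plus eth4 at
# 9000, and eth4 on two other hosts at 2200 (up) and 2400 (down). The expression
# only considers interfaces whose node_network_up is > 0 and flags those sitting
# at a per-device max or min that differs from the per-device median, so the two
# live eth4 outliers (9000 and 2200) are expected both as samples and as alerts,
# while the down eth4 at 2400 and the down eth0/eth1 are ignored.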
630 # pool full: the input has 6 pool series but the description uses topk(5),
631 # so this checks that the top-5 breakdown works as expected
632 - interval: 1m
633 input_series:
634 - series: 'ceph_health_detail{name="POOL_FULL"}'
635 values: '0 0 0 1 1 1 1 1 1 1 1'
636 - series: 'ceph_pool_percent_used{pool_id="1"}'
637 values: '32+0x10'
638 - series: 'ceph_pool_percent_used{pool_id="2"}'
639 values: '96+0x10'
640 - series: 'ceph_pool_percent_used{pool_id="3"}'
641 values: '90+0x10'
642 - series: 'ceph_pool_percent_used{pool_id="4"}'
643 values: '72+0x10'
644 - series: 'ceph_pool_percent_used{pool_id="5"}'
645 values: '19+0x10'
646 - series: 'ceph_pool_percent_used{pool_id="6"}'
647 values: '10+0x10'
648 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
649 name="cephfs_data",pool_id="1"}'
650 values: '1 1 1 1 1 1 1 1 1'
651 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
652 name="rbd",pool_id="2"}'
653 values: '1 1 1 1 1 1 1 1 1'
654 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
655 name="iscsi",pool_id="3"}'
656 values: '1 1 1 1 1 1 1 1 1'
657 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
658 name="default.rgw.index",pool_id="4"}'
659 values: '1 1 1 1 1 1 1 1 1'
660 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
661 name="default.rgw.log",pool_id="5"}'
662 values: '1 1 1 1 1 1 1 1 1'
663 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
664 name="dummy",pool_id="6"}'
665 values: '1 1 1 1 1 1 1 1 1'
666 promql_expr_test:
667 - expr: ceph_health_detail{name="POOL_FULL"} > 0
668 eval_time: 5m
669 exp_samples:
670 - labels: '{__name__="ceph_health_detail", name="POOL_FULL"}'
671 value: 1
672 alert_rule_test:
673 - eval_time: 1m
674 alertname: CephPoolFull
675 - eval_time: 10m
676 alertname: CephPoolFull
677 exp_alerts:
678 - exp_labels:
679 name: POOL_FULL
680 severity: critical
681 type: ceph_default
682 oid: 1.3.6.1.4.1.50495.1.2.1.9.1
683 exp_annotations:
684 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full
685 summary: Pool is full - writes are blocked
686 description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) - rbd at 96% - iscsi at 90% - default.rgw.index at 72% - cephfs_data at 32% - default.rgw.log at 19% Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)"
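# The pool breakdown in the description above comes from ceph_pool_percent_used:
# the six pools are ranked by usage and only the top five appear (rbd 96%,
# iscsi 90%, default.rgw.index 72%, cephfs_data 32%, default.rgw.log 19%), which
# is exactly what the topk(5) comment at the start of this case is checking; the
# sixth pool ("dummy" at 10%) must be absent.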
687 # slow OSD ops
688 - interval: 1m
689 input_series:
690 - series: 'ceph_healthcheck_slow_ops{instance="ceph:9283",job="ceph"}'
691 values: '1+0x120'
692 promql_expr_test:
693 - expr: ceph_healthcheck_slow_ops > 0
694 eval_time: 1m
695 exp_samples:
696 - labels: '{__name__="ceph_healthcheck_slow_ops", instance="ceph:9283",
697 job="ceph"}'
698 value: 1
699 alert_rule_test:
700 - eval_time: 20m
701 alertname: CephSlowOps
702 exp_alerts:
703 - exp_labels:
704 instance: ceph:9283
705 job: ceph
706 severity: warning
707 type: ceph_default
708 exp_annotations:
709 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
710 summary: OSD operations are slow to complete
711 description: "1 OSD requests are taking too long to process (osd_op_complaint_time exceeded)"
712
713 # slow daemon ops
714 - interval: 1m
715 input_series:
716 - series: 'ceph_daemon_health_metrics{ceph_daemon="osd.1", instance="ceph:9283",job="ceph", type="SLOW_OPS"}'
717 values: '1+0x120'
718 promql_expr_test:
719 - expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0'
720 eval_time: 1m
721 exp_samples:
722 - labels: '{__name__="ceph_daemon_health_metrics", ceph_daemon="osd.1",instance="ceph:9283",
723 job="ceph", type="SLOW_OPS"}'
724 value: 1
725 alert_rule_test:
726 - eval_time: 20m
727 alertname: CephDaemonSlowOps
728 exp_alerts:
729 - exp_labels:
730 instance: ceph:9283
731 ceph_daemon: "osd.1"
732 job: ceph
733 severity: warning
734 type: ceph_default
735 exp_annotations:
736 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
737 summary: osd.1 operations are slow to complete
738 description: "osd.1 operations are taking too long to process (complaint time exceeded)"
739
740 # CEPHADM orchestrator alert triggers
741 - interval: 30s
742 input_series:
743 - series: 'ceph_health_detail{name="UPGRADE_EXCEPTION"}'
744 values: '1+0x40'
745 promql_expr_test:
746 - expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0
747 eval_time: 2m
748 exp_samples:
749 - labels: '{__name__="ceph_health_detail", name="UPGRADE_EXCEPTION"}'
750 value: 1
751 alert_rule_test:
752 - eval_time: 1m
753 alertname: CephadmUpgradeFailed
754 - eval_time: 5m
755 alertname: CephadmUpgradeFailed
756 exp_alerts:
757 - exp_labels:
758 name: UPGRADE_EXCEPTION
759 severity: critical
760 type: ceph_default
761 oid: 1.3.6.1.4.1.50495.1.2.1.11.2
762 exp_annotations:
763 summary: Ceph version upgrade has failed
764 description: "The cephadm cluster upgrade process has failed. The cluster remains in an undetermined state. Please review the cephadm logs, to understand the nature of the issue"
765 - interval: 30s
766 input_series:
767 - series: 'ceph_health_detail{name="CEPHADM_FAILED_DAEMON"}'
768 values: '1+0x40'
769 promql_expr_test:
770 - expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0
771 eval_time: 2m
772 exp_samples:
773 - labels: '{__name__="ceph_health_detail", name="CEPHADM_FAILED_DAEMON"}'
774 value: 1
775 alert_rule_test:
776 - eval_time: 1m
777 alertname: CephadmDaemonFailed
778 - eval_time: 5m
779 alertname: CephadmDaemonFailed
780 exp_alerts:
781 - exp_labels:
782 name: CEPHADM_FAILED_DAEMON
783 severity: critical
784 type: ceph_default
785 oid: 1.3.6.1.4.1.50495.1.2.1.11.1
786 exp_annotations:
787 summary: A ceph daemon manged by cephadm is down
788 description: "A daemon managed by cephadm is no longer active. Determine, which daemon is down with 'ceph health detail'. you may start daemons with the 'ceph orch daemon start <daemon_id>'"
789 - interval: 1m
790 input_series:
791 - series: 'ceph_health_detail{name="CEPHADM_PAUSED"}'
792 values: '1 1 1 1 1 1 1 1 1'
793 promql_expr_test:
794 - expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0
795 eval_time: 2m
796 exp_samples:
797 - labels: '{__name__="ceph_health_detail", name="CEPHADM_PAUSED"}'
798 value: 1
799 alert_rule_test:
800 - eval_time: 1m
801 alertname: CephadmPaused
802 - eval_time: 5m
803 alertname: CephadmPaused
804 exp_alerts:
805 - exp_labels:
806 name: CEPHADM_PAUSED
807 severity: warning
808 type: ceph_default
809 exp_annotations:
810 documentation: https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused
811 summary: Orchestration tasks via cephadm are PAUSED
812 description: "Cluster management has been paused manually. This will prevent the orchestrator from service management and reconciliation. If this is not intentional, resume cephadm operations with 'ceph orch resume'"
813 # MDS
814 - interval: 1m
815 input_series:
816 - series: 'ceph_health_detail{name="MDS_DAMAGE"}'
817 values: '1 1 1 1 1 1 1 1 1'
818 promql_expr_test:
819 - expr: ceph_health_detail{name="MDS_DAMAGE"} > 0
820 eval_time: 2m
821 exp_samples:
822 - labels: '{__name__="ceph_health_detail", name="MDS_DAMAGE"}'
823 value: 1
824 alert_rule_test:
825 - eval_time: 1m
826 alertname: CephFilesystemDamaged
827 - eval_time: 5m
828 alertname: CephFilesystemDamaged
829 exp_alerts:
830 - exp_labels:
831 name: MDS_DAMAGE
832 severity: critical
833 type: ceph_default
834 oid: 1.3.6.1.4.1.50495.1.2.1.5.1
835 exp_annotations:
836 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
837 summary: CephFS filesystem is damaged.
838 description: "Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support."
839 - interval: 1m
840 input_series:
841 - series: 'ceph_health_detail{name="MDS_HEALTH_READ_ONLY"}'
842 values: '1 1 1 1 1 1 1 1 1'
843 promql_expr_test:
844 - expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0
845 eval_time: 2m
846 exp_samples:
847 - labels: '{__name__="ceph_health_detail", name="MDS_HEALTH_READ_ONLY"}'
848 value: 1
849 alert_rule_test:
850 - eval_time: 1m
851 alertname: CephFilesystemReadOnly
852 - eval_time: 5m
853 alertname: CephFilesystemReadOnly
854 exp_alerts:
855 - exp_labels:
856 name: MDS_HEALTH_READ_ONLY
857 severity: critical
858 type: ceph_default
859 oid: 1.3.6.1.4.1.50495.1.2.1.5.2
860 exp_annotations:
861 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
862 summary: CephFS filesystem in read only mode due to write error(s)
863 description: "The filesystem has switched to READ ONLY due to an unexpected error when writing to the metadata pool. Either analyze the output from the MDS daemon admin socket, or escalate to support."
864 - interval: 1m
865 input_series:
866 - series: 'ceph_health_detail{name="MDS_ALL_DOWN"}'
867 values: '0 0 1 1 1 1 1 1 1 1 1'
868 promql_expr_test:
869 - expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0
870 eval_time: 2m
871 exp_samples:
872 - labels: '{__name__="ceph_health_detail", name="MDS_ALL_DOWN"}'
873 value: 1
874 alert_rule_test:
875 - eval_time: 1m
876 alertname: CephFilesystemOffline
877 - eval_time: 10m
878 alertname: CephFilesystemOffline
879 exp_alerts:
880 - exp_labels:
881 name: MDS_ALL_DOWN
882 severity: critical
883 type: ceph_default
884 oid: 1.3.6.1.4.1.50495.1.2.1.5.3
885 exp_annotations:
886 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down
887 summary: CephFS filesystem is offline
888 description: "All MDS ranks are unavailable. The MDS daemons managing metadata are down, rendering the filesystem offline."
889 - interval: 1m
890 input_series:
891 - series: 'ceph_health_detail{name="FS_DEGRADED"}'
892 values: '0 0 1 1 1 1 1 1 1 1 1'
893 promql_expr_test:
894 - expr: ceph_health_detail{name="FS_DEGRADED"} > 0
895 eval_time: 2m
896 exp_samples:
897 - labels: '{__name__="ceph_health_detail", name="FS_DEGRADED"}'
898 value: 1
899 alert_rule_test:
900 - eval_time: 1m
901 alertname: CephFilesystemDegraded
902 - eval_time: 10m
903 alertname: CephFilesystemDegraded
904 exp_alerts:
905 - exp_labels:
906 name: FS_DEGRADED
907 severity: critical
908 type: ceph_default
909 oid: 1.3.6.1.4.1.50495.1.2.1.5.4
910 exp_annotations:
911 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded
912 summary: CephFS filesystem is degraded
913 description: "One or more metadata daemons (MDS ranks) are failed or in a damaged state. At best the filesystem is partially available, at worst the filesystem is completely unusable."
914 - interval: 1m
915 input_series:
916 - series: 'ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"}'
917 values: '0 0 1 1 1 1 1 1 1 1 1'
918 promql_expr_test:
919 - expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0
920 eval_time: 2m
921 exp_samples:
922 - labels: '{__name__="ceph_health_detail", name="MDS_INSUFFICIENT_STANDBY"}'
923 value: 1
924 alert_rule_test:
925 - eval_time: 1m
926 alertname: CephFilesystemInsufficientStandby
927 - eval_time: 10m
928 alertname: CephFilesystemInsufficientStandby
929 exp_alerts:
930 - exp_labels:
931 name: MDS_INSUFFICIENT_STANDBY
932 severity: warning
933 type: ceph_default
934 exp_annotations:
935 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby
936 summary: Ceph filesystem standby daemons too few
937 description: "The minimum number of standby daemons required by standby_count_wanted is less than the current number of standby daemons. Adjust the standby count or increase the number of MDS daemons."
938 - interval: 1m
939 input_series:
940 - series: 'ceph_health_detail{name="FS_WITH_FAILED_MDS"}'
941 values: '0 0 1 1 1 1 1 1 1 1 1'
942 promql_expr_test:
943 - expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0
944 eval_time: 2m
945 exp_samples:
946 - labels: '{__name__="ceph_health_detail", name="FS_WITH_FAILED_MDS"}'
947 value: 1
948 alert_rule_test:
949 - eval_time: 1m
950 alertname: CephFilesystemFailureNoStandby
951 - eval_time: 10m
952 alertname: CephFilesystemFailureNoStandby
953 exp_alerts:
954 - exp_labels:
955 name: FS_WITH_FAILED_MDS
956 severity: critical
957 type: ceph_default
958 oid: 1.3.6.1.4.1.50495.1.2.1.5.5
959 exp_annotations:
960 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds
961 summary: MDS daemon failed, no further standby available
962 description: "An MDS daemon has failed, leaving only one active rank and no available standby. Investigate the cause of the failure or add a standby MDS."
963 - interval: 1m
964 input_series:
965 - series: 'ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"}'
966 values: '0 0 1 1 1 1 1 1 1 1 1'
967 promql_expr_test:
968 - expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0
969 eval_time: 2m
970 exp_samples:
971 - labels: '{__name__="ceph_health_detail", name="MDS_UP_LESS_THAN_MAX"}'
972 value: 1
973 alert_rule_test:
974 - eval_time: 1m
975 alertname: CephFilesystemMDSRanksLow
976 - eval_time: 10m
977 alertname: CephFilesystemMDSRanksLow
978 exp_alerts:
979 - exp_labels:
980 name: MDS_UP_LESS_THAN_MAX
981 severity: warning
982 type: ceph_default
983 exp_annotations:
984 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max
985 summary: Ceph MDS daemon count is lower than configured
986 description: "The filesystem's 'max_mds' setting defines the number of MDS ranks in the filesystem. The current number of active MDS daemons is less than this value."
987 # MGR
988 - interval: 1m
989 input_series:
990 - series: 'up{job="ceph", instance="ceph-mgr:9283"}'
991 values: '1+0x2 0+0x10'
992 promql_expr_test:
993 - expr: up{job="ceph"} == 0
994 eval_time: 3m
995 exp_samples:
996 - labels: '{__name__="up", job="ceph", instance="ceph-mgr:9283"}'
997 value: 0
998 alert_rule_test:
999 - eval_time: 1m
1000 alertname: CephMgrPrometheusModuleInactive
1001 - eval_time: 10m
1002 alertname: CephMgrPrometheusModuleInactive
1003 exp_alerts:
1004 - exp_labels:
1005 instance: ceph-mgr:9283
1006 job: ceph
1007 severity: critical
1008 type: ceph_default
1009 oid: 1.3.6.1.4.1.50495.1.2.1.6.2
1010 exp_annotations:
1011 summary: The mgr/prometheus module is not available
1012 description: "The mgr/prometheus module at ceph-mgr:9283 is unreachable. This could mean that the module has been disabled or the mgr daemon itself is down. Without the mgr/prometheus module metrics and alerts will no longer function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to to determine whether the mgr is active. If the mgr is not active, restart it, otherwise you can determine module status with 'ceph mgr module ls'. If it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'."
1013 - interval: 1m
1014 input_series:
1015 - series: 'ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"}'
1016 values: '0+0x2 1+0x20'
1017 promql_expr_test:
1018 - expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1
1019 eval_time: 3m
1020 exp_samples:
1021 - labels: '{__name__="ceph_health_detail", name="RECENT_MGR_MODULE_CRASH"}'
1022 value: 1
1023 alert_rule_test:
1024 - eval_time: 1m
1025 alertname: CephMgrModuleCrash
1026 - eval_time: 15m
1027 alertname: CephMgrModuleCrash
1028 exp_alerts:
1029 - exp_labels:
1030 name: RECENT_MGR_MODULE_CRASH
1031 severity: critical
1032 type: ceph_default
1033 oid: 1.3.6.1.4.1.50495.1.2.1.6.1
1034 exp_annotations:
1035 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash
1036 summary: A manager module has recently crashed
1037 description: "One or more mgr modules have crashed and have yet to be acknowledged by an administrator. A crashed module may impact functionality within the cluster. Use the 'ceph crash' command to determine which module has failed, and archive it to acknowledge the failure."
1038 # MON
1039 - interval: 1m
1040 input_series:
1041 - series: 'ceph_health_detail{name="MON_DISK_CRIT"}'
1042 values: '0+0x2 1+0x10'
1043 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}'
1044 values: '1+0x13'
1045 promql_expr_test:
1046 - expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1
1047 eval_time: 3m
1048 exp_samples:
1049 - labels: '{__name__="ceph_health_detail", name="MON_DISK_CRIT"}'
1050 value: 1
1051 alert_rule_test:
1052 - eval_time: 1m
1053 alertname: CephMonDiskspaceCritical
1054 - eval_time: 10m
1055 alertname: CephMonDiskspaceCritical
1056 exp_alerts:
1057 - exp_labels:
1058 name: "MON_DISK_CRIT"
1059 severity: critical
1060 type: ceph_default
1061 oid: 1.3.6.1.4.1.50495.1.2.1.3.2
1062 exp_annotations:
1063 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit
1064 summary: Filesystem space on at least one monitor is critically low
1065 description: "The free space available to a monitor's store is critically low. You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; - ceph-mon-a"
1066 - interval: 1m
1067 input_series:
1068 - series: 'ceph_health_detail{name="MON_DISK_LOW"}'
1069 values: '0+0x2 1+0x10'
1070 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}'
1071 values: '1+0x13'
1072 promql_expr_test:
1073 - expr: ceph_health_detail{name="MON_DISK_LOW"} == 1
1074 eval_time: 3m
1075 exp_samples:
1076 - labels: '{__name__="ceph_health_detail", name="MON_DISK_LOW"}'
1077 value: 1
1078 alert_rule_test:
1079 - eval_time: 1m
1080 alertname: CephMonDiskspaceLow
1081 - eval_time: 10m
1082 alertname: CephMonDiskspaceLow
1083 exp_alerts:
1084 - exp_labels:
1085 name: "MON_DISK_LOW"
1086 severity: warning
1087 type: ceph_default
1088 exp_annotations:
1089 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low
1090 summary: Drive space on at least one monitor is approaching full
1091 description: "The space available to a monitor's store is approaching full (>70% is the default). You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; - ceph-mon-a"
1092 - interval: 1m
1093 input_series:
1094 - series: 'ceph_health_detail{name="MON_CLOCK_SKEW"}'
1095 values: '0+0x2 1+0x10'
1096 promql_expr_test:
1097 - expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1
1098 eval_time: 3m
1099 exp_samples:
1100 - labels: '{__name__="ceph_health_detail", name="MON_CLOCK_SKEW"}'
1101 value: 1
1102 alert_rule_test:
1103 - eval_time: 1m
1104 alertname: CephMonClockSkew
1105 - eval_time: 10m
1106 alertname: CephMonClockSkew
1107 exp_alerts:
1108 - exp_labels:
1109 name: "MON_CLOCK_SKEW"
1110 severity: warning
1111 type: ceph_default
1112 exp_annotations:
1113 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew
1114 summary: Clock skew detected among monitors
1115 description: "Ceph monitors rely on closely synchronized time to maintain quorum and cluster consistency. This event indicates that the time on at least one mon has drifted too far from the lead mon. Review cluster status with ceph -s. This will show which monitors are affected. Check the time sync status on each monitor host with 'ceph time-sync-status' and the state and peers of your ntpd or chrony daemon."
1116
1117 # Check 3 mons one down, quorum at risk
1118 - interval: 1m
1119 input_series:
1120 - series: 'ceph_health_detail{name="MON_DOWN"}'
1121 values: '0+0x2 1+0x12'
1122 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}'
1123 values: '1+0x14'
1124 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}'
1125 values: '1+0x14'
1126 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}'
1127 values: '1+0x2 0+0x12'
1128 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}'
1129 values: '1+0x14'
1130 - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}'
1131 values: '1+0x14'
1132 - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}'
1133 values: '1+0x14'
1134 promql_expr_test:
1135 - expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() (count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1))) == 1
1136 eval_time: 3m
1137 exp_samples:
1138 - labels: '{}'
1139 value: 1
1140 alert_rule_test:
1141 - eval_time: 1m
1142 alertname: CephMonDownQuorumAtRisk
1143 # shouldn't fire at 1m (no exp_alerts); expected to fire at 10m below
1144 - eval_time: 10m
1145 alertname: CephMonDownQuorumAtRisk
1146 exp_alerts:
1147 - exp_labels:
1148 severity: critical
1149 type: ceph_default
1150 oid: 1.3.6.1.4.1.50495.1.2.1.3.1
1151 exp_annotations:
1152 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
1153 summary: Monitor quorum is at risk
1154 description: "Quorum requires a majority of monitors (x 2) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: - mon.c on ceph-mon-3"
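# The quorum-at-risk expression above combines two checks: MON_DOWN must be set,
# and the number of mons still in quorum must equal the bare minimum majority,
# floor(count(ceph_mon_metadata) / 2) + 1. With 3 mons and mon.c out of quorum,
# 2 == floor(3/2) + 1, so the '* on()' product is 1 and CephMonDownQuorumAtRisk
# fires; with more mons still in quorum the bool comparison would be 0 and this
# critical alert would stay silent, leaving only the CephMonDown warning below.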
1155 # check 5 mons, 1 down - warning only
1156 - interval: 1m
1157 input_series:
1158 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}'
1159 values: '1+0x14'
1160 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}'
1161 values: '1+0x14'
1162 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}'
1163 values: '1+0x14'
1164 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.d"}'
1165 values: '1+0x14'
1166 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.e"}'
1167 values: '1+0x2 0+0x12'
1168 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}'
1169 values: '1+0x14'
1170 - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}'
1171 values: '1+0x14'
1172 - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}'
1173 values: '1+0x14'
1174 - series: 'ceph_mon_metadata{ceph_daemon="mon.d", hostname="ceph-mon-4"}'
1175 values: '1+0x14'
1176 - series: 'ceph_mon_metadata{ceph_daemon="mon.e", hostname="ceph-mon-5"}'
1177 values: '1+0x14'
1178 promql_expr_test:
1179 - expr: (count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1))
1180 eval_time: 3m
1181 exp_samples:
1182 - labels: '{}'
1183 value: 1
1184 alert_rule_test:
1185 - eval_time: 1m
1186 alertname: CephMonDown
1187 - eval_time: 10m
1188 alertname: CephMonDown
1189 exp_alerts:
1190 - exp_labels:
1191 severity: warning
1192 type: ceph_default
1193 exp_annotations:
1194 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
1195 summary: One or more monitors down
1196 description: "You have 1 monitor down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: - mon.e on ceph-mon-5\n"
1197 # Device Health
1198 - interval: 1m
1199 input_series:
1200 - series: 'ceph_health_detail{name="DEVICE_HEALTH"}'
1201 values: '0+0x2 1+0x10'
1202 promql_expr_test:
1203 - expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1
1204 eval_time: 3m
1205 exp_samples:
1206 - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH"}'
1207 value: 1
1208 alert_rule_test:
1209 - eval_time: 1m
1210 alertname: CephDeviceFailurePredicted
1211 - eval_time: 10m
1212 alertname: CephDeviceFailurePredicted
1213 exp_alerts:
1214 - exp_labels:
1215 name: "DEVICE_HEALTH"
1216 severity: warning
1217 type: ceph_default
1218 exp_annotations:
1219 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2
1220 summary: Device(s) predicted to fail soon
1221 description: "The device health module has determined that one or more devices will fail soon. To review device status use 'ceph device ls'. To show a specific device use 'ceph device info <dev id>'. Mark the OSD out so that data may migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace the device, and redeploy the OSD."
1222 - interval: 1m
1223 input_series:
1224 - series: 'ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"}'
1225 values: '0+0x2 1+0x10'
1226 promql_expr_test:
1227 - expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1
1228 eval_time: 3m
1229 exp_samples:
1230 - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_TOOMANY"}'
1231 value: 1
1232 alert_rule_test:
1233 - eval_time: 1m
1234 alertname: CephDeviceFailurePredictionTooHigh
1235 - eval_time: 10m
1236 alertname: CephDeviceFailurePredictionTooHigh
1237 exp_alerts:
1238 - exp_labels:
1239 name: "DEVICE_HEALTH_TOOMANY"
1240 severity: critical
1241 type: ceph_default
1242 oid: 1.3.6.1.4.1.50495.1.2.1.4.7
1243 exp_annotations:
1244 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany
1245 summary: Too many devices are predicted to fail, unable to resolve
1246 description: "The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availabililty. Prevent data integrity issues by adding new OSDs so that data may be relocated."
1247 - interval: 1m
1248 input_series:
1249 - series: 'ceph_health_detail{name="DEVICE_HEALTH_IN_USE"}'
1250 values: '0+0x2 1+0x10'
1251 promql_expr_test:
1252 - expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1
1253 eval_time: 3m
1254 exp_samples:
1255 - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_IN_USE"}'
1256 value: 1
1257 alert_rule_test:
1258 - eval_time: 1m
1259 alertname: CephDeviceFailureRelocationIncomplete
1260 - eval_time: 10m
1261 alertname: CephDeviceFailureRelocationIncomplete
1262 exp_alerts:
1263 - exp_labels:
1264 name: "DEVICE_HEALTH_IN_USE"
1265 severity: warning
1266 type: ceph_default
1267 exp_annotations:
1268 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use
1269 summary: Device failure is predicted, but unable to relocate data
1270 description: "The device health module has determined that one or more devices will fail soon, but the normal process of relocating the data on the device to other OSDs in the cluster is blocked. \nEnsure that the cluster has available free space. It may be necessary to add capacity to the cluster to allow data from the failing device to successfully migrate, or to enable the balancer."
1271 # OSD
1272 - interval: 1m
1273 input_series:
1274 - series: 'ceph_health_detail{name="OSD_HOST_DOWN"}'
1275 values: '0+0x2 1+0x10'
1276 - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
1277 values: '1+0x2 0+0x10'
1278 - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}'
1279 values: '1+0x12'
1280 promql_expr_test:
1281 - expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1
1282 eval_time: 3m
1283 exp_samples:
1284 - labels: '{__name__="ceph_health_detail", name="OSD_HOST_DOWN"}'
1285 value: 1
1286 alert_rule_test:
1287 - eval_time: 1m
1288 alertname: CephOSDHostDown
1289 - eval_time: 10m
1290 alertname: CephOSDHostDown
1291 exp_alerts:
1292 - exp_labels:
1293 name: "OSD_HOST_DOWN"
1294 severity: warning
1295 type: ceph_default
1296 oid: 1.3.6.1.4.1.50495.1.2.1.4.8
1297 exp_annotations:
1298 summary: An OSD host is offline
1299 description: "The following OSDs are down: - ceph-osd-1 : osd.0"
1300 - interval: 1m
1301 input_series:
1302 - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"}'
1303 values: '0+0x2 1+0x20'
1304 promql_expr_test:
1305 - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 0
1306 eval_time: 1m
1307 exp_samples:
1308 - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_FRONT"}'
1309 value: 0
1310 alert_rule_test:
1311 - eval_time: 1m
1312 alertname: CephOSDTimeoutsPublicNetwork
1313 - eval_time: 10m
1314 alertname: CephOSDTimeoutsPublicNetwork
1315 exp_alerts:
1316 - exp_labels:
1317 name: "OSD_SLOW_PING_TIME_FRONT"
1318 severity: warning
1319 type: ceph_default
1320 exp_annotations:
1321 summary: Network issues delaying OSD heartbeats (public network)
1322 description: "OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network for latency or loss issues. Use 'ceph health detail' to show the affected OSDs."
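# Note that this case (and the two that follow) deliberately evaluates its PromQL
# expression as '== 0' at 1m, i.e. before the health flag flips: it checks the
# quiet state of the series, while the alert_rule_test at 10m still expects the
# warning once the value has been 1 for long enough.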
1323 - interval: 1m
1324 input_series:
1325 - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"}'
1326 values: '0+0x2 1+0x20'
1327 promql_expr_test:
1328 - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 0
1329 eval_time: 1m
1330 exp_samples:
1331 - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_BACK"}'
1332 value: 0
1333 alert_rule_test:
1334 - eval_time: 1m
1335 alertname: CephOSDTimeoutsClusterNetwork
1336 - eval_time: 10m
1337 alertname: CephOSDTimeoutsClusterNetwork
1338 exp_alerts:
1339 - exp_labels:
1340 name: "OSD_SLOW_PING_TIME_BACK"
1341 severity: warning
1342 type: ceph_default
1343 exp_annotations:
1344 summary: Network issues delaying OSD heartbeats (cluster network)
1345 description: "OSD heartbeats on the cluster's 'cluster' network (backend) are slow. Investigate the network for latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs."
1346 - interval: 1m
1347 input_series:
1348 - series: 'ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"}'
1349 values: '0+0x2 1+0x20'
1350 promql_expr_test:
1351 - expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 0
1352 eval_time: 1m
1353 exp_samples:
1354 - labels: '{__name__="ceph_health_detail", name="BLUESTORE_DISK_SIZE_MISMATCH"}'
1355 value: 0
1356 alert_rule_test:
1357 - eval_time: 1m
1358 alertname: CephOSDInternalDiskSizeMismatch
1359 - eval_time: 10m
1360 alertname: CephOSDInternalDiskSizeMismatch
1361 exp_alerts:
1362 - exp_labels:
1363 name: "BLUESTORE_DISK_SIZE_MISMATCH"
1364 severity: warning
1365 type: ceph_default
1366 exp_annotations:
1367 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch
1368 summary: OSD size inconsistency error
1369 description: "One or more OSDs have an internal inconsistency between metadata and the size of the device. This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs."
1370 - interval: 30s
1371 input_series:
1372 - series: 'ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"}'
1373 values: '0+0x2 1+0x20'
1374 promql_expr_test:
1375 - expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1
1376 eval_time: 3m
1377 exp_samples:
1378 - labels: '{__name__="ceph_health_detail", name="BLUESTORE_SPURIOUS_READ_ERRORS"}'
1379 value: 1
1380 alert_rule_test:
1381 - eval_time: 1m
1382 alertname: CephOSDReadErrors
1383 - eval_time: 10m
1384 alertname: CephOSDReadErrors
1385 exp_alerts:
1386 - exp_labels:
1387 name: "BLUESTORE_SPURIOUS_READ_ERRORS"
1388 severity: warning
1389 type: ceph_default
1390 exp_annotations:
1391 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors
1392 summary: Device read errors detected
1393 description: "An OSD has encountered read errors, but the OSD has recovered by retrying the reads. This may indicate an issue with hardware or the kernel."
1394 - interval: 1m
1395 input_series:
1396 - series: 'ceph_health_detail{name="OSD_DOWN"}'
1397 values: '0+0x2 1+0x10'
1398 - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
1399 values: '1+0x12'
1400 - series: 'ceph_osd_up{ceph_daemon="osd.1"}'
1401 values: '1+0x2 0+0x10'
1402 - series: 'ceph_osd_up{ceph_daemon="osd.2"}'
1403 values: '1+0x12'
1404 - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}'
1405 values: '1+0x12'
1406 - series: 'ceph_osd_metadata{ceph_daemon="osd.1", hostname="ceph-osd-2"}'
1407 values: '1+0x12'
1408 - series: 'ceph_osd_metadata{ceph_daemon="osd.2", hostname="ceph-osd-3"}'
1409 values: '1+0x12'
1410 promql_expr_test:
1411 - expr: ceph_health_detail{name="OSD_DOWN"} == 1
1412 eval_time: 3m
1413 exp_samples:
1414 - labels: '{__name__="ceph_health_detail", name="OSD_DOWN"}'
1415 value: 1
1416 alert_rule_test:
1417 - eval_time: 1m
1418 alertname: CephOSDDown
1419 - eval_time: 10m
1420 alertname: CephOSDDown
1421 exp_alerts:
1422 - exp_labels:
1423 name: "OSD_DOWN"
1424 severity: warning
1425 type: ceph_default
1426 oid: 1.3.6.1.4.1.50495.1.2.1.4.2
1427 exp_annotations:
1428 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down
1429 summary: An OSD has been marked down
1430 description: "1 OSD down for over 5mins. The following OSD is down: - osd.1 on ceph-osd-2\n"
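# The description above names the host (ceph-osd-2) as well as the daemon,
# which is why this test also feeds ceph_osd_up and ceph_osd_metadata. A
# typical way to resolve a down OSD to its host is a PromQL join on
# ceph_daemon -- an illustrative expression, not necessarily the one used by
# the rule:
#   (ceph_osd_up == 0) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
# which copies the hostname label from the metadata series onto the result.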
1431 - interval: 1m
1432 input_series:
1433 - series: 'ceph_health_detail{name="OSD_NEARFULL"}'
1434 values: '0+0x2 1+0x10'
1435 promql_expr_test:
1436 - expr: ceph_health_detail{name="OSD_NEARFULL"} == 1
1437 eval_time: 3m
1438 exp_samples:
1439 - labels: '{__name__="ceph_health_detail", name="OSD_NEARFULL"}'
1440 value: 1
1441 alert_rule_test:
1442 - eval_time: 1m
1443 alertname: CephOSDNearFull
1444 - eval_time: 10m
1445 alertname: CephOSDNearFull
1446 exp_alerts:
1447 - exp_labels:
1448 name: "OSD_NEARFULL"
1449 severity: warning
1450 type: ceph_default
1451 oid: 1.3.6.1.4.1.50495.1.2.1.4.3
1452 exp_annotations:
1453 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull
1454 summary: OSD(s) running low on free space (NEARFULL)
1455 description: One or more OSDs have reached the NEARFULL threshold. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.
1456 - interval: 1m
1457 input_series:
1458 - series: 'ceph_health_detail{name="OSD_FULL"}'
1459 values: '0+0x2 1+0x10'
1460 promql_expr_test:
1461 - expr: ceph_health_detail{name="OSD_FULL"} == 1
1462 eval_time: 3m
1463 exp_samples:
1464 - labels: '{__name__="ceph_health_detail", name="OSD_FULL"}'
1465 value: 1
1466 alert_rule_test:
1467 - eval_time: 1m
1468 alertname: CephOSDFull
1469 - eval_time: 10m
1470 alertname: CephOSDFull
1471 exp_alerts:
1472 - exp_labels:
1473 name: "OSD_FULL"
1474 severity: critical
1475 type: ceph_default
1476 oid: 1.3.6.1.4.1.50495.1.2.1.4.6
1477 exp_annotations:
1478 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full
1479 summary: OSD full, writes blocked
1480 description: An OSD has reached the FULL threshold. Writes to pools that share the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.
1481 - interval: 1m
1482 input_series:
1483 - series: 'ceph_health_detail{name="OSD_BACKFILLFULL"}'
1484 values: '0+0x2 1+0x10'
1485 promql_expr_test:
1486 - expr: ceph_health_detail{name="OSD_BACKFILLFULL"} == 1
1487 eval_time: 3m
1488 exp_samples:
1489 - labels: '{__name__="ceph_health_detail", name="OSD_BACKFILLFULL"}'
1490 value: 1
1491 alert_rule_test:
1492 - eval_time: 1m
1493 alertname: CephOSDBackfillFull
1494 - eval_time: 10m
1495 alertname: CephOSDBackfillFull
1496 exp_alerts:
1497 - exp_labels:
1498 name: "OSD_BACKFILLFULL"
1499 severity: warning
1500 type: ceph_default
1501 exp_annotations:
1502 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull
1503 summary: OSD(s) too full for backfill operations
1504 description: "An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations from completing. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data."
1505 - interval: 30s
1506 input_series:
1507 - series: 'ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"}'
1508 values: '0+0x2 1+0x20'
1509 promql_expr_test:
1510 - expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 0
1511 eval_time: 1m
1512 exp_samples:
1513 - labels: '{__name__="ceph_health_detail", name="OSD_TOO_MANY_REPAIRS"}'
1514 value: 0
1515 alert_rule_test:
1516 - eval_time: 1m
1517 alertname: CephOSDTooManyRepairs
1518 - eval_time: 10m
1519 alertname: CephOSDTooManyRepairs
1520 exp_alerts:
1521 - exp_labels:
1522 name: "OSD_TOO_MANY_REPAIRS"
1523 severity: warning
1524 type: ceph_default
1525 exp_annotations:
1526 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs
1527 summary: OSD reports a high number of read errors
1528 description: Reads from an OSD have used a secondary PG to return data to the client, indicating a potential failing drive.
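# The alert rules exercised by these tests are defined in
# ../prometheus_alerts.yml. For the simple health-check alerts they have
# roughly the following shape -- an illustrative sketch only, with the 'for'
# duration assumed rather than copied from the real rule:
#   - alert: CephOSDTooManyRepairs
#     expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 1
#     for: 30s
#     labels:
#       severity: warning
#       type: ceph_default
#     annotations:
#       summary: OSD reports a high number of read errors
#       description: ...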
1529 # Pools
1530 # percent-full growth prediction: pools 1 and 2 are fed on two instances, but only pool 1 on instance 8090 grows fast enough to trigger the alert (the arithmetic behind the expected value is sketched after this test)
1531 - interval: 12h
1532 input_series:
1533 - series: 'ceph_pool_percent_used{pool_id="1", instance="9090"}'
1534 values: '1 1 1 1 1'
1535 - series: 'ceph_pool_percent_used{pool_id="1", instance="8090"}'
1536 values: '78 89 79 98 78'
1537 - series: 'ceph_pool_percent_used{pool_id="2", instance="9090"}'
1538 values: '1 1 1 1 1'
1539 - series: 'ceph_pool_percent_used{pool_id="2", instance="8090"}'
1540 values: '22 22 23 23 24'
1541 - series: 'ceph_pool_metadata{pool_id="1", instance="9090", name="rbd", type="replicated"}'
1542 values: '1 1 1 1 1'
1543 - series: 'ceph_pool_metadata{pool_id="1", instance="8090",name="default.rgw.index",type="replicated"}'
1544 values: '1 1 1 1 1'
1545 - series: 'ceph_pool_metadata{pool_id="2", instance="9090", name="rbd", type="replicated"}'
1546 values: '1 1 1 1 1'
1547 - series: 'ceph_pool_metadata{pool_id="2", instance="8090",name="default.rgw.index",type="replicated"}'
1548 values: '1 1 1 1 1'
1549 promql_expr_test:
1550 - expr: |
1551 (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance)
1552 group_right() ceph_pool_metadata) >= 95
1553 eval_time: 36h
1554 exp_samples:
1555 - labels: '{instance="8090",name="default.rgw.index",pool_id="1",type="replicated"}'
1556 value: 1.435E+02 # ~143.5%
1557 alert_rule_test:
1558 - eval_time: 48h
1559 alertname: CephPoolGrowthWarning
1560 exp_alerts:
1561 - exp_labels:
1562 instance: 8090
1563 name: default.rgw.index
1564 pool_id: 1
1565 severity: warning
1566 type: ceph_default
1567 oid: 1.3.6.1.4.1.50495.1.2.1.9.2
1568 exp_annotations:
1569 summary: Pool growth rate may soon exceed capacity
1570 description: Pool 'default.rgw.index' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
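# How predict_linear arrives at the expected 1.435E+02 above: at eval_time 36h
# the [2d] window holds the pool_id="1",instance="8090" samples 78 89 79 98
# (12h apart). A least-squares fit over those points gives a slope of 10% per
# day and an estimated current value of 93.5%, so extrapolating 3600*24*5
# seconds (5 days) ahead yields 93.5 + 5*10 = 143.5, clearing the >= 95
# threshold. The join with ceph_pool_metadata only attaches the pool's
# name/type labels, which is how 'default.rgw.index' ends up in the alert.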
1571 - interval: 1m
1572 input_series:
1573 - series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}'
1574 values: '0+0x2 1+0x10'
1575 promql_expr_test:
1576 - expr: ceph_health_detail{name="POOL_BACKFILLFULL"} == 1
1577 eval_time: 3m
1578 exp_samples:
1579 - labels: '{__name__="ceph_health_detail", name="POOL_BACKFILLFULL"}'
1580 value: 1
1581 alert_rule_test:
1582 - eval_time: 1m
1583 alertname: CephPoolBackfillFull
1584 - eval_time: 5m
1585 alertname: CephPoolBackfillFull
1586 exp_alerts:
1587 - exp_labels:
1588 name: "POOL_BACKFILLFULL"
1589 severity: warning
1590 type: ceph_default
1591 exp_annotations:
1592 summary: Free space in a pool is too low for recovery/backfill
1593 description: A pool is approaching the near full threshold, which will prevent recovery/backfill operations from completing. Consider adding more capacity.
1594
1595 - interval: 1m
1596 input_series:
1597 - series: 'ceph_health_detail{name="POOL_NEAR_FULL"}'
1598 values: '0+0x2 1+0x10'
1599 promql_expr_test:
1600 - expr: ceph_health_detail{name="POOL_NEAR_FULL"} == 1
1601 eval_time: 3m
1602 exp_samples:
1603 - labels: '{__name__="ceph_health_detail", name="POOL_NEAR_FULL"}'
1604 value: 1
1605 alert_rule_test:
1606 - eval_time: 1m
1607 alertname: CephPoolNearFull
1608 - eval_time: 10m
1609 alertname: CephPoolNearFull
1610 exp_alerts:
1611 - exp_labels:
1612 name: "POOL_NEAR_FULL"
1613 severity: warning
1614 type: ceph_default
1615 exp_annotations:
1616 summary: One or more Ceph pools are nearly full
1617 description: "A pool has exceeded the warning (percent full) threshold, or OSDs supporting the pool have reached the NEARFULL threshold. Writes may continue, but you are at risk of the pool going read-only if more capacity isn't made available. Determine the affected pool with 'ceph df detail', looking at QUOTA BYTES and STORED. Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>). Also ensure that the balancer is active."
1618
1619 # PGs
1620 - interval: 1m
1621 input_series:
1622 - series: 'ceph_health_detail{name="PG_NOT_SCRUBBED"}'
1623 values: '0+0x2 1+0x10'
1624 promql_expr_test:
1625 - expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1
1626 eval_time: 3m
1627 exp_samples:
1628 - labels: '{__name__="ceph_health_detail", name="PG_NOT_SCRUBBED"}'
1629 value: 1
1630 alert_rule_test:
1631 - eval_time: 1m
1632 alertname: CephPGNotScrubbed
1633 - eval_time: 10m
1634 alertname: CephPGNotScrubbed
1635 exp_alerts:
1636 - exp_labels:
1637 name: "PG_NOT_SCRUBBED"
1638 severity: warning
1639 type: ceph_default
1640 exp_annotations:
1641 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed
1642 summary: Placement group(s) have not been scrubbed
1643 description: "One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, protecting against bit-rot. They check that metadata is consistent across data replicas. When PGs miss their scrub interval, it may indicate that the scrub window is too small, or PGs were not in a 'clean' state during the scrub window. You can manually initiate a scrub with: ceph pg scrub <pgid>"
1644 - interval: 1m
1645 input_series:
1646 - series: 'ceph_health_detail{name="PG_DAMAGED"}'
1647 values: '0+0x4 1+0x20'
1648 promql_expr_test:
1649 - expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1
1650 eval_time: 5m
1651 exp_samples:
1652 - labels: '{__name__="ceph_health_detail", name="PG_DAMAGED"}'
1653 value: 1
1654 alert_rule_test:
1655 - eval_time: 1m
1656 alertname: CephPGsDamaged
1657 - eval_time: 10m
1658 alertname: CephPGsDamaged
1659 exp_alerts:
1660 - exp_labels:
1661 name: "PG_DAMAGED"
1662 severity: critical
1663 type: ceph_default
1664 oid: 1.3.6.1.4.1.50495.1.2.1.7.4
1665 exp_annotations:
1666 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged
1667 summary: Placement group damaged, manual intervention needed
1668 description: During data consistency checks (scrub), at least one PG has been flagged as being damaged or inconsistent. Check to see which PG is affected, and attempt a manual repair if necessary. To list problematic placement groups, use 'rados list-inconsistent-pg <pool>'. To repair PGs use the 'ceph pg repair <pg_num>' command.
1669 - interval: 1m
1670 input_series:
1671 - series: 'ceph_health_detail{name="TOO_MANY_PGS"}'
1672 values: '0+0x4 1+0x20'
1673 promql_expr_test:
1674 - expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1
1675 eval_time: 5m
1676 exp_samples:
1677 - labels: '{__name__="ceph_health_detail", name="TOO_MANY_PGS"}'
1678 value: 1
1679 alert_rule_test:
1680 - eval_time: 1m
1681 alertname: CephPGsHighPerOSD
1682 - eval_time: 10m
1683 alertname: CephPGsHighPerOSD
1684 exp_alerts:
1685 - exp_labels:
1686 name: "TOO_MANY_PGS"
1687 severity: warning
1688 type: ceph_default
1689 exp_annotations:
1690 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs
1691 summary: Placement groups per OSD is too high
1692 description: "The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).\n Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools."
1693 - interval: 1m
1694 input_series:
1695 - series: 'ceph_health_detail{name="PG_RECOVERY_FULL"}'
1696 values: '0+0x2 1+0x20'
1697 promql_expr_test:
1698 - expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 0
1699 eval_time: 1m
1700 exp_samples:
1701 - labels: '{__name__="ceph_health_detail", name="PG_RECOVERY_FULL"}'
1702 value: 0
1703 alert_rule_test:
1704 - eval_time: 1m
1705 alertname: CephPGRecoveryAtRisk
1706 - eval_time: 10m
1707 alertname: CephPGRecoveryAtRisk
1708 exp_alerts:
1709 - exp_labels:
1710 name: "PG_RECOVERY_FULL"
1711 severity: critical
1712 type: ceph_default
1713 oid: 1.3.6.1.4.1.50495.1.2.1.7.5
1714 exp_annotations:
1715 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full
1716 summary: OSDs are too full for recovery
1717 description: Data redundancy is at risk since one or more OSDs are at or above the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, or delete unwanted data.
1718 - interval: 1m
1719 input_series:
1720 - series: 'ceph_health_detail{name="PG_BACKFILL_FULL"}'
1721 values: '0+0x2 1+0x20'
1722 promql_expr_test:
1723 - expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 0
1724 eval_time: 1m
1725 exp_samples:
1726 - labels: '{__name__="ceph_health_detail", name="PG_BACKFILL_FULL"}'
1727 value: 0
1728 alert_rule_test:
1729 - eval_time: 1m
1730 alertname: CephPGBackfillAtRisk
1731 - eval_time: 10m
1732 alertname: CephPGBackfillAtRisk
1733 exp_alerts:
1734 - exp_labels:
1735 name: "PG_BACKFILL_FULL"
1736 severity: critical
1737 type: ceph_default
1738 oid: 1.3.6.1.4.1.50495.1.2.1.7.6
1739 exp_annotations:
1740 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full
1741 summary: Backfill operations are blocked due to lack of free space
1742 description: Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add more capacity, or delete unwanted data.
1743 - interval: 1m
1744 input_series:
1745 - series: 'ceph_health_detail{name="PG_AVAILABILITY"}'
1746 values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1'
1747 - series: 'ceph_health_detail{name="OSD_DOWN"}'
1748 values: '0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0'
1749 promql_expr_test:
1750 - expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"}))
1751 eval_time: 1m
1752 # empty set at 1m
1753 exp_samples:
1754 alert_rule_test:
1755 # neither PG_AVAILABILITY nor OSD_DOWN is firing, so no alert
1756 - eval_time: 1m
1757 alertname: CephPGUnavilableBlockingIO
1758 exp_alerts:
1759 # PG_AVAILABILITY is firing, but OSD_DOWN is also active, so no alert
1760 - eval_time: 5m
1761 alertname: CephPGUnavilableBlockingIO
1762 exp_alerts:
1763 # PG_AVAILABILITY is firing and OSD_DOWN is no longer active, so raise the alert
1764 - eval_time: 15m
1765 alertname: CephPGUnavilableBlockingIO
1766 exp_alerts:
1767 - exp_labels:
1768 name: "PG_AVAILABILITY"
1769 severity: critical
1770 type: ceph_default
1771 oid: 1.3.6.1.4.1.50495.1.2.1.7.3
1772 exp_annotations:
1773 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability
1774 summary: PG is unavailable, blocking I/O
1775 description: Data availability is reduced, impacting the cluster's ability to service I/O. One or more placement groups (PGs) are in a state that blocks I/O.
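# The expression above gates PG_AVAILABILITY on OSD_DOWN:
# (PG_AVAILABILITY == 1) - scalar(OSD_DOWN) is 1 only when PG_AVAILABILITY is
# set while OSD_DOWN is 0 (it is 1 - 1 = 0 while OSDs are down), so the alert
# stays quiet while CephOSDDown already explains the reduced availability and
# fires only once PG_AVAILABILITY persists on its own, as at the 15m check.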
1776 - interval: 1m
1777 input_series:
1778 - series: 'ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"}'
1779 values: '0+0x2 1+0x10'
1780 promql_expr_test:
1781 - expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1
1782 eval_time: 3m
1783 exp_samples:
1784 - labels: '{__name__="ceph_health_detail", name="PG_NOT_DEEP_SCRUBBED"}'
1785 value: 1
1786 alert_rule_test:
1787 - eval_time: 1m
1788 alertname: CephPGNotDeepScrubbed
1789 - eval_time: 10m
1790 alertname: CephPGNotDeepScrubbed
1791 exp_alerts:
1792 - exp_labels:
1793 name: "PG_NOT_DEEP_SCRUBBED"
1794 severity: warning
1795 type: ceph_default
1796 exp_annotations:
1797 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed
1798 summary: Placement group(s) have not been deep scrubbed
1799 description: One or more PGs have not been deep scrubbed recently. Deep scrubs protect against bit-rot. They compare data replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate that the window is too small or PGs were not in a 'clean' state during the deep-scrub window.
1800
1801 # Prometheus
1802 - interval: 1m
1803 input_series:
1804 - series: 'up{job="myjob"}'
1805 values: '1+0x10'
1806 promql_expr_test:
1807 - expr: absent(up{job="ceph"})
1808 eval_time: 1m
1809 exp_samples:
1810 - labels: '{job="ceph"}'
1811 value: 1
1812 alert_rule_test:
1813 - eval_time: 5m
1814 alertname: PrometheusJobMissing
1815 exp_alerts:
1816 - exp_labels:
1817 job: ceph
1818 severity: critical
1819 type: ceph_default
1820 oid: 1.3.6.1.4.1.50495.1.2.1.12.1
1821 exp_annotations:
1822 summary: The scrape job for Ceph is missing from Prometheus
1823 description: The prometheus job that scrapes from Ceph is no longer defined, this will effectively mean you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance.
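# absent(up{job="ceph"}) returns a single sample with value 1, carrying the
# labels that can be inferred from the selector ({job="ceph"}), whenever no
# matching series exists. The input deliberately provides only up{job="myjob"},
# so the Ceph scrape job is missing for the whole test window.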
1824 # RADOS
1825 - interval: 1m
1826 input_series:
1827 - series: 'ceph_health_detail{name="OBJECT_UNFOUND"}'
1828 values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1829 - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
1830 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1831 - series: 'ceph_osd_up{ceph_daemon="osd.1"}'
1832 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1833 - series: 'ceph_osd_up{ceph_daemon="osd.2"}'
1834 values: '1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1835 - series: 'ceph_osd_metadata{ceph_daemon="osd.0"}'
1836 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1837 - series: 'ceph_osd_metadata{ceph_daemon="osd.1"}'
1838 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1839 - series: 'ceph_osd_metadata{ceph_daemon="osd.2"}'
1840 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1841 promql_expr_test:
1842 - expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1
1843 eval_time: 1m
1844 exp_samples:
1845 alert_rule_test:
1846 # OBJECT_UNFOUND but osd.2 is down, so don't fire
1847 - eval_time: 5m
1848 alertname: CephObjectMissing
1849 exp_alerts:
1850 # OBJECT_UNFOUND and all OSDs are online, so fire
1851 - eval_time: 15m
1852 alertname: CephObjectMissing
1853 exp_alerts:
1854 - exp_labels:
1855 severity: critical
1856 type: ceph_default
1857 oid: 1.3.6.1.4.1.50495.1.2.1.10.1
1858 exp_annotations:
1859 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound
1860 summary: Object(s) marked UNFOUND
1861 description: The latest version of a RADOS object can not be found, even though all OSDs are up. I/O requests for this object from clients will block (hang). Resolving this issue may require the object to be rolled back to a prior version manually, and manually verified.
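# The OBJECT_UNFOUND flag is multiplied (via '* on()') by the boolean
# comparison count(ceph_osd_up == 1) == bool count(ceph_osd_metadata), which is
# 0 while osd.2 is down and 1 once every known OSD is up again. The trailing
# '== 1' therefore keeps the alert silent at the 5m check and lets it fire at
# 15m, when an object is still unfound even though no OSD is down.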
1862 # Generic Alerts
1863 - interval: 1m
1864 input_series:
1865 - series: 'ceph_health_detail{name="RECENT_CRASH"}'
1866 values: '0 0 0 1 1 1 1 1 1 1 1'
1867 promql_expr_test:
1868 - expr: ceph_health_detail{name="RECENT_CRASH"} == 1
1869 eval_time: 1m
1870 exp_samples:
1871 alert_rule_test:
1872 # not firing
1873 - eval_time: 1m
1874 alertname: CephDaemonCrash
1875 exp_alerts:
1876 # firing
1877 - eval_time: 10m
1878 alertname: CephDaemonCrash
1879 exp_alerts:
1880 - exp_labels:
1881 name: RECENT_CRASH
1882 severity: critical
1883 type: ceph_default
1884 oid: 1.3.6.1.4.1.50495.1.2.1.1.2
1885 exp_annotations:
1886 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash
1887 summary: One or more Ceph daemons have crashed, and are pending acknowledgement
1888 description: One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive <id>' command.
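# These unit tests are evaluated with promtool against the rules listed under
# rule_files at the top of this file; assuming promtool is on PATH, they can
# be run from this directory with:
#   promtool test rules test_alerts.yml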