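# ceph/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
# Unit tests for the alert rules in ../prometheus_alerts.yml (typically run with 'promtool test rules').
# An alert_rule_test entry without exp_alerts asserts that the named alert is NOT firing at that eval_time.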
1 rule_files:
2 - ../prometheus_alerts.yml
3 evaluation_interval: 5m
4 tests:
5 # health error
6 - interval: 5m
7 input_series:
8 - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
9 values: '2 2 2 2 2 2 2'
10 promql_expr_test:
11 - expr: ceph_health_status == 2
12 eval_time: 5m
13 exp_samples:
14 - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
15 value: 2
16 alert_rule_test:
17 - eval_time: 1m
18 alertname: CephHealthError
19 - eval_time: 6m
20 alertname: CephHealthError
21 exp_alerts:
22 - exp_labels:
23 instance: ceph:9283
24 job: ceph
25 oid: 1.3.6.1.4.1.50495.1.2.1.2.1
26 type: ceph_default
27 severity: critical
28 exp_annotations:
29 summary: Ceph is in the ERROR state
30 description: The cluster state has been HEALTH_ERROR for more than 5 minutes. Please check 'ceph health detail' for more information.
31
32 # health warning
33 - interval: 5m
34 input_series:
35 - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
36 values: '1 1 1 1 1 1 1 1 1 1'
37 promql_expr_test:
38 - expr: ceph_health_status == 1
39 eval_time: 15m
40 exp_samples:
41 - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
42 value: 1
43 alert_rule_test:
44 - eval_time: 10m
45 alertname: CephHealthWarning
46 - eval_time: 20m
47 alertname: CephHealthWarning
48 exp_alerts:
49 - exp_labels:
50 instance: ceph:9283
51 job: ceph
52 type: ceph_default
53 severity: warning
54 exp_annotations:
55 summary: Ceph is in the WARNING state
56 description: The cluster state has been HEALTH_WARN for more than 15 minutes. Please check 'ceph health detail' for more information.
57
58 # 10% OSDs down
59 - interval: 1m
60 input_series:
61 - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
62 values: '1 1 1 1 1'
63 - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
64 values: '0 0 0 0 0'
65 - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
66 values: '1 1 1 1 1'
67 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
68 ceph_version="ceph version 17.0.0-189-g3558fd72
69 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
70 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
71 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
72 public_addr="172.20.0.2"}'
73 values: '1 1 1 1 1'
74 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
75 ceph_version="ceph version 17.0.0-189-g3558fd72
76 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
77 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
78 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
79 public_addr="172.20.0.2"}'
80 values: '1 1 1 1 1'
81 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
82 ceph_version="ceph version 17.0.0-189-g3558fd72
83 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
84 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
85 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
86 public_addr="172.20.0.2"}'
87 values: '1 1 1 1 1'
88 promql_expr_test:
89 - expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
90 eval_time: 1m
91 exp_samples:
92 - labels: '{}'
93 value: 3.333333333333333E+01
94 alert_rule_test:
95 - eval_time: 1m
96 alertname: CephOSDDownHigh
97 exp_alerts:
98 - exp_labels:
99 oid: 1.3.6.1.4.1.50495.1.2.1.4.1
100 type: ceph_default
101 severity: critical
102 exp_annotations:
103 summary: More than 10% of OSDs are down
104 description: "33.33% or 1 of 3 OSDs are down (>= 10%). The following OSDs are down: - osd.1 on ceph"
105
106 # flapping OSD
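# Note: series values use promtool expanding notation, 'a+bxN' = a, a+b, ..., a+N*b.
# Here '1+1x100' makes ceph_osd_up for osd.0 increase at every 1s sample, so
# rate(ceph_osd_up[5m]) * 60 stays above 1, which is how this test models an OSD
# flapping more than once a minute; osd.1 and osd.2 ('1+0x100') stay constant.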
107 - interval: 1s
108 input_series:
109 - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
110 values: '1+1x100'
111 - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
112 values: '1+0x100'
113 - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
114 values: '1+0x100'
115 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
116 ceph_version="ceph version 17.0.0-189-g3558fd72
117 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
118 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
119 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
120 public_addr="172.20.0.2"}'
121 values: '1 1 1 1 1 1'
122 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
123 ceph_version="ceph version 17.0.0-189-g3558fd72
124 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
125 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
126 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
127 public_addr="172.20.0.2"}'
128 values: '1 1 1 1 1 1'
129 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
130 ceph_version="ceph version 17.0.0-189-g3558fd72
131 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
132 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
133 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
134 public_addr="172.20.0.2"}'
135 values: '1 1 1 1 1 1'
136 promql_expr_test:
137 - expr: |
138 (
139 rate(ceph_osd_up[5m])
140 * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
141 ) * 60 > 1
142 eval_time: 1m
143 exp_samples:
144 - labels: '{ceph_daemon="osd.0", hostname="ceph", instance="ceph:9283",
145 job="ceph"}'
146 value: 1.2200000000000001E+01
147 alert_rule_test:
148 - eval_time: 5m
149 alertname: CephOSDFlapping
150 exp_alerts:
151 - exp_labels:
152 ceph_daemon: osd.0
153 hostname: ceph
154 instance: ceph:9283
155 job: ceph
156 oid: 1.3.6.1.4.1.50495.1.2.1.4.4
157 severity: warning
158 type: ceph_default
159 exp_annotations:
160 documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds
161 summary: Network issues are causing OSDs to flap (mark each other down)
162 description: "OSD osd.0 on ceph was marked down and back up 20.1 times once a minute for 5 minutes. This may indicate a network issue (latency, packet loss, MTU mismatch) on the cluster network, or the public network if no cluster network is deployed. Check the network stats on the listed host(s)."
163
164 # high pg count deviation
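# At the 5m sample osd.1 holds 320 PGs against an average of 200 across the four OSDs,
# a 60% deviation (> 30%), so only osd.1 is expected to raise CephPGImbalance.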
165 - interval: 1m
166 input_series:
167 - series: 'ceph_osd_numpg{ceph_daemon="osd.0",instance="ceph:9283",
168 job="ceph"}'
169 values: '100 100 100 100 100 160'
170 - series: 'ceph_osd_numpg{ceph_daemon="osd.1",instance="ceph:9283",
171 job="ceph"}'
172 values: '100 100 100 100 100 320'
173 - series: 'ceph_osd_numpg{ceph_daemon="osd.2",instance="ceph:9283",
174 job="ceph"}'
175 values: '100 100 100 100 100 160'
176 - series: 'ceph_osd_numpg{ceph_daemon="osd.3",instance="ceph:9283",
177 job="ceph"}'
178 values: '100 100 100 100 100 160'
179 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
180 ceph_version="ceph version 17.0.0-189-g3558fd72
181 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
182 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
183 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
184 public_addr="172.20.0.2"}'
185 values: '1 1 1 1 1 1'
186 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
187 ceph_version="ceph version 17.0.0-189-g3558fd72
188 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
189 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
190 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
191 public_addr="172.20.0.2"}'
192 values: '1 1 1 1 1 1'
193 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
194 ceph_version="ceph version 17.0.0-189-g3558fd72
195 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
196 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
197 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
198 public_addr="172.20.0.2"}'
199 values: '1 1 1 1 1 1'
200 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.3",
201 ceph_version="ceph version 17.0.0-189-g3558fd72
202 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
203 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
204 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
205 public_addr="172.20.0.2"}'
206 values: '1 1 1 1 1 1'
207 promql_expr_test:
208 - expr: |
209 abs(
210 (
211 (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0)
212 by (job)
213 ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
214 ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
215
216 eval_time: 5m
217 exp_samples:
218 - labels: '{ceph_daemon="osd.1", hostname="ceph", instance="ceph:9283",
219 job="ceph"}'
220 value: 6E-01
221 alert_rule_test:
222 - eval_time: 10m
223 alertname: CephPGImbalance
224 exp_alerts:
225 - exp_labels:
226 ceph_daemon: osd.1
227 hostname: ceph
228 instance: ceph:9283
229 job: ceph
230 oid: 1.3.6.1.4.1.50495.1.2.1.4.5
231 severity: warning
232 type: ceph_default
233 exp_annotations:
234 summary: PGs are not balanced across OSDs
235 description: "OSD osd.1 on ceph deviates by more than 30% from average PG count."
236
237 # pgs inactive
238 - interval: 1m
239 input_series:
240 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
241 name="device_health_metrics",pool_id="1"}'
242 values: '1 1 1 1 1 1 1 1'
243 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
244 name="device_health_metrics",pool_id="2"}'
245 values: '1 1 1 1 1 1 1 1'
246 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
247 name="device_health_metrics",pool_id="3"}'
248 values: '1 1 1 1 1 1 1 1'
249 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
250 values: '1 1 1 1 1 1 1 1'
251 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
252 values: '32 32 32 32 32 32 32 32'
253 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
254 values: '33 32 32 32 32 33 33 32'
255 - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="1"}'
256 values: '1 1 1 1 1 1 1 1 1'
257 - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="2"}'
258 values: '32 32 32 32 32 32 32 32'
259 - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="3"}'
260 values: '32 32 32 32 32 32 32 32'
261 promql_expr_test:
262 - expr: ceph_pool_metadata * on(pool_id,instance) group_left()
263 (ceph_pg_total - ceph_pg_active) > 0
264 eval_time: 5m
265 exp_samples:
266 - labels: '{instance="ceph:9283", job="ceph",
267 name="device_health_metrics",
268 pool_id="3"}'
269 value: 1
270 alert_rule_test:
271 - eval_time: 5m
272 alertname: CephPGsInactive
273 exp_alerts:
274 - exp_labels:
275 instance: ceph:9283
276 job: ceph
277 name: device_health_metrics
278 oid: 1.3.6.1.4.1.50495.1.2.1.7.1
279 pool_id: 3
280 severity: critical
281 type: ceph_default
282 exp_annotations:
283 summary: One or more placement groups are inactive
284 description: "1 PGs have been inactive for more than 5 minutes in pool device_health_metrics. Inactive placement groups are not able to serve read/write requests."
285
286 # pgs unclean
287 - interval: 1m
288 input_series:
289 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
290 name="device_health_metrics",pool_id="1"}'
291 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
292 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
293 name="device_health_metrics",pool_id="2"}'
294 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
295 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
296 name="device_health_metrics",pool_id="3"}'
297 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
298 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
299 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
300 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
301 values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
302 32 32 32'
303 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
304 values: '33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33
305 33 33'
306 - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="1"}'
307 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
308 - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="2"}'
309 values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
310 32 32'
311 - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="3"}'
312 values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
313 32 32'
314 promql_expr_test:
315 - expr: ceph_pool_metadata * on(pool_id,instance) group_left()
316 (ceph_pg_total - ceph_pg_clean) > 0
317 eval_time: 15m
318 exp_samples:
319 - labels: '{instance="ceph:9283", job="ceph",
320 name="device_health_metrics", pool_id="3"}'
321 value: 1
322 alert_rule_test:
323 - eval_time: 16m
324 alertname: CephPGsUnclean
325 exp_alerts:
326 - exp_labels:
327 instance: ceph:9283
328 job: ceph
329 name: device_health_metrics
330 oid: 1.3.6.1.4.1.50495.1.2.1.7.2
331 pool_id: 3
332 severity: warning
333 type: ceph_default
334 exp_annotations:
335 summary: One or more placement groups are marked unclean
336 description: "1 PGs have been unclean for more than 15 minutes in pool device_health_metrics. Unclean PGs have not recovered from a previous failure."
337
338 # root volume full
339 - interval: 1m
340 input_series:
341 - series: 'node_filesystem_avail_bytes{device="/dev/mapper/fedora_localhost
342 --live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
343 mountpoint="/"}'
344 values: '35336400896 35336400896 35336400896 35336400896 35336400896
345 3525385519.104 3533640089'
346 - series: 'node_filesystem_size_bytes{device="/dev/mapper/fedora_localhost
347 --live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
348 mountpoint="/"}'
349 values: '73445531648 73445531648 73445531648 73445531648 73445531648
350 73445531648 73445531648'
351 promql_expr_test:
352 - expr: node_filesystem_avail_bytes{mountpoint="/"} /
353 node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
354 eval_time: 5m
355 exp_samples:
356 - labels: '{device="/dev/mapper/fedora_localhost --live-home",
357 fstype="ext4", instance="node-exporter", job="node-exporter",
358 mountpoint="/"}'
359 value: 4.8E+00
360 alert_rule_test:
361 - eval_time: 10m
362 alertname: CephNodeRootFilesystemFull
363 exp_alerts:
364 - exp_labels:
365 device: /dev/mapper/fedora_localhost --live-home
366 fstype: ext4
367 instance: node-exporter
368 job: node-exporter
369 mountpoint: /
370 oid: 1.3.6.1.4.1.50495.1.2.1.8.1
371 severity: critical
372 type: ceph_default
373 exp_annotations:
374 summary: Root filesystem is dangerously full
375 description: "Root volume is dangerously full: 4.811% free."
376
377 # network packets dropped
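# Drops grow by 600/min and packets by 750/min in each direction, so the drop ratio is
# (10+10)/(12.5+12.5) = 0.8 and the absolute drop rate is 20 pkt/s, satisfying both
# halves of the CephNodeNetworkPacketDrops expression.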
378 - interval: 1m
379 input_series:
380 - series: 'node_network_receive_drop_total{device="eth0",
381 instance="node-exporter",job="node-exporter"}'
382 values: '0+600x10'
383 - series: 'node_network_transmit_drop_total{device="eth0",
384 instance="node-exporter",job="node-exporter"}'
385 values: '0+600x10'
386 - series: 'node_network_receive_packets_total{device="eth0",
387 instance="node-exporter",job="node-exporter"}'
388 values: '0+750x10'
389 - series: 'node_network_transmit_packets_total{device="eth0",
390 instance="node-exporter",job="node-exporter"}'
391 values: '0+750x10'
392 promql_expr_test:
393 - expr: |
394 (
395 rate(node_network_receive_drop_total{device!="lo"}[1m]) +
396 rate(node_network_transmit_drop_total{device!="lo"}[1m])
397 ) / (
398 rate(node_network_receive_packets_total{device!="lo"}[1m]) +
399 rate(node_network_transmit_packets_total{device!="lo"}[1m])
400 ) >= 0.0050000000000000001 and (
401 rate(node_network_receive_drop_total{device!="lo"}[1m]) +
402 rate(node_network_transmit_drop_total{device!="lo"}[1m])
403 ) >= 10
404
405 eval_time: 5m
406 exp_samples:
407 - labels: '{device="eth0", instance="node-exporter",
408 job="node-exporter"}'
409 value: 8E-1
410 alert_rule_test:
411 - eval_time: 5m
412 alertname: CephNodeNetworkPacketDrops
413 exp_alerts:
414 - exp_labels:
415 device: eth0
416 instance: node-exporter
417 job: node-exporter
418 oid: 1.3.6.1.4.1.50495.1.2.1.8.2
419 severity: warning
420 type: ceph_default
421 exp_annotations:
422 summary: One or more NICs reports packet drops
423 description: "Node node-exporter experiences packet drop > 0.5% or > 10 packets/s on interface eth0."
424
425 # network packets errors
426 - interval: 1m
427 input_series:
428 - series: 'node_network_receive_errs_total{device="eth0",
429 instance="node-exporter",job="node-exporter"}'
430 values: '0+600x10'
431 - series: 'node_network_transmit_errs_total{device="eth0",
432 instance="node-exporter",job="node-exporter"}'
433 values: '0+600x10'
434 - series: 'node_network_transmit_packets_total{device="eth0",
435 instance="node-exporter",job="node-exporter"}'
436 values: '0+750x10'
437 - series: 'node_network_receive_packets_total{device="eth0",
438 instance="node-exporter",job="node-exporter"}'
439 values: '0+750x10'
440 promql_expr_test:
441 - expr: |
442 (
443 rate(node_network_receive_errs_total{device!="lo"}[1m]) +
444 rate(node_network_transmit_errs_total{device!="lo"}[1m])
445 ) / (
446 rate(node_network_receive_packets_total{device!="lo"}[1m]) +
447 rate(node_network_transmit_packets_total{device!="lo"}[1m])
448 ) >= 0.0001 or (
449 rate(node_network_receive_errs_total{device!="lo"}[1m]) +
450 rate(node_network_transmit_errs_total{device!="lo"}[1m])
451 ) >= 10
452
453 eval_time: 5m
454 exp_samples:
455 - labels: '{device="eth0", instance="node-exporter",
456 job="node-exporter"}'
457 value: 8E-01
458 alert_rule_test:
459 - eval_time: 5m
460 alertname: CephNodeNetworkPacketErrors
461 exp_alerts:
462 - exp_labels:
463 device: eth0
464 instance: node-exporter
465 job: node-exporter
466 oid: 1.3.6.1.4.1.50495.1.2.1.8.3
467 severity: warning
468 type: ceph_default
469 exp_annotations:
470 summary: One or more NICs reports packet errors
471 description: "Node node-exporter experiences packet errors > 0.01% or > 10 packets/s on interface eth0."
472
473 # Node Storage disk space filling up
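# node-1's free space falls by 256MB every minute, so predict_linear extrapolated
# 5 days ahead (3600 * 24 * 5 s) is deeply negative and should alert; node-2 stays flat
# and should not.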
474 - interval: 1m
475 # 20GB = 21474836480, 256MB = 268435456
476 input_series:
477 - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root",
478 fstype="xfs",instance="node-1",mountpoint="/rootfs"}'
479 values: '21474836480-268435456x48'
480 - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root",
481 fstype="xfs",instance="node-2",mountpoint="/rootfs"}'
482 values: '21474836480+0x48'
483 - series: 'node_uname_info{instance="node-1", nodename="node-1.unittests.com"}'
484 values: 1+0x48
485 - series: 'node_uname_info{instance="node-2", nodename="node-2.unittests.com"}'
486 values: 1+0x48
487 promql_expr_test:
488 - expr: |
489 predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *
490 on(instance) group_left(nodename) node_uname_info < 0
491 eval_time: 5m
492 exp_samples:
493 - labels: '{device="/dev/mapper/vg-root",instance="node-1",fstype="xfs",
494 mountpoint="/rootfs",nodename="node-1.unittests.com"}'
495 value: -1.912602624E+12
496 alert_rule_test:
497 - eval_time: 5m
498 alertname: CephNodeDiskspaceWarning
499 exp_alerts:
500 - exp_labels:
501 severity: warning
502 type: ceph_default
503 oid: 1.3.6.1.4.1.50495.1.2.1.8.4
504 device: /dev/mapper/vg-root
505 fstype: xfs
506 instance: node-1
507 mountpoint: /rootfs
508 nodename: node-1.unittests.com
509 exp_annotations:
510 summary: Host filesystem free space is getting low
511 description: "Mountpoint /rootfs on node-1.unittests.com will be full in less than 5 days based on the 48 hour trailing fill rate."
512 # MTU Mismatch
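# eth4 is reported with different MTUs across hosts (9000 on node-exporter, 2200 on hostname1,
# 2400 on hostname2). hostname2's eth4 is down (node_network_up == 0) and is excluded, so the
# two remaining eth4 values both differ from the eth4 median and each raises an alert.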
513 - interval: 1m
514 input_series:
515 - series: 'node_network_mtu_bytes{device="eth0",instance="node-exporter",
516 job="node-exporter"}'
517 values: '1500 1500 1500 1500 1500'
518 - series: 'node_network_mtu_bytes{device="eth1",instance="node-exporter",
519 job="node-exporter"}'
520 values: '1500 1500 1500 1500 1500'
521 - series: 'node_network_mtu_bytes{device="eth2",instance="node-exporter",
522 job="node-exporter"}'
523 values: '1500 1500 1500 1500 1500'
524 - series: 'node_network_mtu_bytes{device="eth3",instance="node-exporter",
525 job="node-exporter"}'
526 values: '1500 1500 1500 1500 1500'
527 - series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter",
528 job="node-exporter"}'
529 values: '9000 9000 9000 9000 9000'
530 - series: 'node_network_mtu_bytes{device="eth4",instance="hostname1",
531 job="node-exporter"}'
532 values: '2200 2200 2200 2200 2200'
533 - series: 'node_network_mtu_bytes{device="eth4",instance="hostname2",
534 job="node-exporter"}'
535 values: '2400 2400 2400 2400 2400'
536 - series: 'node_network_up{device="eth0",instance="node-exporter",
537 job="node-exporter"}'
538 values: '0 0 0 0 0'
539 - series: 'node_network_up{device="eth1",instance="node-exporter",
540 job="node-exporter"}'
541 values: '0 0 0 0 0'
542 - series: 'node_network_up{device="eth2",instance="node-exporter",
543 job="node-exporter"}'
544 values: '1 1 1 1 1'
545 - series: 'node_network_up{device="eth3",instance="node-exporter",
546 job="node-exporter"}'
547 values: '1 1 1 1 1'
548 - series: 'node_network_up{device="eth4",instance="node-exporter",
549 job="node-exporter"}'
550 values: '1 1 1 1 1'
551 - series: 'node_network_up{device="eth4",instance="hostname1",
552 job="node-exporter"}'
553 values: '1 1 1 1 1'
554 - series: 'node_network_up{device="eth4",instance="hostname2",
555 job="node-exporter"}'
556 values: '0 0 0 0 0'
557 promql_expr_test:
558 - expr: |
559 node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
560 scalar(
561 max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
562 quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
563 )
564 or
565 node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
566 scalar(
567 min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
568 quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
569 )
570 eval_time: 1m
571 exp_samples:
572 - labels: '{device="eth4", instance="node-exporter", job="node-exporter"}'
573 value: 9000
574 - labels: '{device="eth4", instance="hostname1", job="node-exporter"}'
575 value: 2200
576 alert_rule_test:
577 - eval_time: 1m
578 alertname: CephNodeInconsistentMTU
579 exp_alerts:
580 - exp_labels:
581 device: eth4
582 instance: hostname1
583 job: node-exporter
584 severity: warning
585 type: ceph_default
586 exp_annotations:
587 summary: MTU settings across Ceph hosts are inconsistent
588 description: "Node hostname1 has a different MTU size (2200) than the median of devices named eth4."
589 - exp_labels:
590 device: eth4
591 instance: node-exporter
592 job: node-exporter
593 severity: warning
594 type: ceph_default
595 exp_annotations:
596 summary: MTU settings across Ceph hosts are inconsistent
597 description: "Node node-exporter has a different MTU size (9000) than the median of devices named eth4."
598
599 # pool full: the input data has 6 pool series but the alert description reports topk(5),
600 # to ensure the top-5 breakdown works as expected
601 - interval: 1m
602 input_series:
603 - series: 'ceph_health_detail{name="POOL_FULL"}'
604 values: '0 0 0 1 1 1 1 1 1 1 1'
605 - series: 'ceph_pool_percent_used{pool_id="1"}'
606 values: '32+0x10'
607 - series: 'ceph_pool_percent_used{pool_id="2"}'
608 values: '96+0x10'
609 - series: 'ceph_pool_percent_used{pool_id="3"}'
610 values: '90+0x10'
611 - series: 'ceph_pool_percent_used{pool_id="4"}'
612 values: '72+0x10'
613 - series: 'ceph_pool_percent_used{pool_id="5"}'
614 values: '19+0x10'
615 - series: 'ceph_pool_percent_used{pool_id="6"}'
616 values: '10+0x10'
617 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
618 name="cephfs_data",pool_id="1"}'
619 values: '1 1 1 1 1 1 1 1 1'
620 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
621 name="rbd",pool_id="2"}'
622 values: '1 1 1 1 1 1 1 1 1'
623 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
624 name="iscsi",pool_id="3"}'
625 values: '1 1 1 1 1 1 1 1 1'
626 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
627 name="default.rgw.index",pool_id="4"}'
628 values: '1 1 1 1 1 1 1 1 1'
629 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
630 name="default.rgw.log",pool_id="5"}'
631 values: '1 1 1 1 1 1 1 1 1'
632 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
633 name="dummy",pool_id="6"}'
634 values: '1 1 1 1 1 1 1 1 1'
635 promql_expr_test:
636 - expr: ceph_health_detail{name="POOL_FULL"} > 0
637 eval_time: 5m
638 exp_samples:
639 - labels: '{__name__="ceph_health_detail", name="POOL_FULL"}'
640 value: 1
641 alert_rule_test:
642 - eval_time: 1m
643 alertname: CephPoolFull
644 - eval_time: 10m
645 alertname: CephPoolFull
646 exp_alerts:
647 - exp_labels:
648 name: POOL_FULL
649 severity: critical
650 type: ceph_default
651 oid: 1.3.6.1.4.1.50495.1.2.1.9.1
652 exp_annotations:
653 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full
654 summary: Pool is full - writes are blocked
655 description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) - rbd at 96% - iscsi at 90% - default.rgw.index at 72% - cephfs_data at 32% - default.rgw.log at 19% Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)"
656 # slow OSD ops
657 - interval: 1m
658 input_series:
659 - series: 'ceph_healthcheck_slow_ops{instance="ceph:9283",job="ceph"}'
660 values: '1+0x120'
661 promql_expr_test:
662 - expr: ceph_healthcheck_slow_ops > 0
663 eval_time: 1m
664 exp_samples:
665 - labels: '{__name__="ceph_healthcheck_slow_ops", instance="ceph:9283",
666 job="ceph"}'
667 value: 1
668 alert_rule_test:
669 - eval_time: 20m
670 alertname: CephSlowOps
671 exp_alerts:
672 - exp_labels:
673 instance: ceph:9283
674 job: ceph
675 severity: warning
676 type: ceph_default
677 exp_annotations:
678 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
679 summary: OSD operations are slow to complete
680 description: "1 OSD requests are taking too long to process (osd_op_complaint_time exceeded)"
681
682 # CEPHADM orchestrator alert triggers
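# These tests exercise ceph_health_detail, with the orchestrator health-check name
# (UPGRADE_EXCEPTION, CEPHADM_FAILED_DAEMON, CEPHADM_PAUSED) carried in the 'name' label.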
683 - interval: 30s
684 input_series:
685 - series: 'ceph_health_detail{name="UPGRADE_EXCEPTION"}'
686 values: '1+0x40'
687 promql_expr_test:
688 - expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0
689 eval_time: 2m
690 exp_samples:
691 - labels: '{__name__="ceph_health_detail", name="UPGRADE_EXCEPTION"}'
692 value: 1
693 alert_rule_test:
694 - eval_time: 1m
695 alertname: CephadmUpgradeFailed
696 - eval_time: 5m
697 alertname: CephadmUpgradeFailed
698 exp_alerts:
699 - exp_labels:
700 name: UPGRADE_EXCEPTION
701 severity: critical
702 type: ceph_default
703 oid: 1.3.6.1.4.1.50495.1.2.1.11.2
704 exp_annotations:
705 summary: Ceph version upgrade has failed
706 description: "The cephadm cluster upgrade process has failed. The cluster remains in an undetermined state. Please review the cephadm logs to understand the nature of the issue"
707 - interval: 30s
708 input_series:
709 - series: 'ceph_health_detail{name="CEPHADM_FAILED_DAEMON"}'
710 values: '1+0x40'
711 promql_expr_test:
712 - expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0
713 eval_time: 2m
714 exp_samples:
715 - labels: '{__name__="ceph_health_detail", name="CEPHADM_FAILED_DAEMON"}'
716 value: 1
717 alert_rule_test:
718 - eval_time: 1m
719 alertname: CephadmDaemonFailed
720 - eval_time: 5m
721 alertname: CephadmDaemonFailed
722 exp_alerts:
723 - exp_labels:
724 name: CEPHADM_FAILED_DAEMON
725 severity: critical
726 type: ceph_default
727 oid: 1.3.6.1.4.1.50495.1.2.1.11.1
728 exp_annotations:
729 summary: A ceph daemon managed by cephadm is down
730 description: "A daemon managed by cephadm is no longer active. Determine which daemon is down with 'ceph health detail'. You may start daemons with 'ceph orch daemon start <daemon_id>'"
731 - interval: 1m
732 input_series:
733 - series: 'ceph_health_detail{name="CEPHADM_PAUSED"}'
734 values: '1 1 1 1 1 1 1 1 1'
735 promql_expr_test:
736 - expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0
737 eval_time: 2m
738 exp_samples:
739 - labels: '{__name__="ceph_health_detail", name="CEPHADM_PAUSED"}'
740 value: 1
741 alert_rule_test:
742 - eval_time: 1m
743 alertname: CephadmPaused
744 - eval_time: 5m
745 alertname: CephadmPaused
746 exp_alerts:
747 - exp_labels:
748 name: CEPHADM_PAUSED
749 severity: warning
750 type: ceph_default
751 exp_annotations:
752 documentation: https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused
753 summary: Orchestration tasks via cephadm are PAUSED
754 description: "Cluster management has been paused manually. This prevents the orchestrator from performing service management and reconciliation. If this is not intentional, resume cephadm operations with 'ceph orch resume'"
755 # MDS
756 - interval: 1m
757 input_series:
758 - series: 'ceph_health_detail{name="MDS_DAMAGE"}'
759 values: '1 1 1 1 1 1 1 1 1'
760 promql_expr_test:
761 - expr: ceph_health_detail{name="MDS_DAMAGE"} > 0
762 eval_time: 2m
763 exp_samples:
764 - labels: '{__name__="ceph_health_detail", name="MDS_DAMAGE"}'
765 value: 1
766 alert_rule_test:
767 - eval_time: 1m
768 alertname: CephFilesystemDamaged
769 - eval_time: 5m
770 alertname: CephFilesystemDamaged
771 exp_alerts:
772 - exp_labels:
773 name: MDS_DAMAGE
774 severity: critical
775 type: ceph_default
776 oid: 1.3.6.1.4.1.50495.1.2.1.5.1
777 exp_annotations:
778 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
779 summary: CephFS filesystem is damaged
780 description: "Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support."
781 - interval: 1m
782 input_series:
783 - series: 'ceph_health_detail{name="MDS_HEALTH_READ_ONLY"}'
784 values: '1 1 1 1 1 1 1 1 1'
785 promql_expr_test:
786 - expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0
787 eval_time: 2m
788 exp_samples:
789 - labels: '{__name__="ceph_health_detail", name="MDS_HEALTH_READ_ONLY"}'
790 value: 1
791 alert_rule_test:
792 - eval_time: 1m
793 alertname: CephFilesystemReadOnly
794 - eval_time: 5m
795 alertname: CephFilesystemReadOnly
796 exp_alerts:
797 - exp_labels:
798 name: MDS_HEALTH_READ_ONLY
799 severity: critical
800 type: ceph_default
801 oid: 1.3.6.1.4.1.50495.1.2.1.5.2
802 exp_annotations:
803 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
804 summary: CephFS filesystem in read only mode due to write error(s)
805 description: "The filesystem has switched to READ ONLY due to an unexpected error when writing to the metadata pool. Either analyze the output from the MDS daemon admin socket, or escalate to support."
806 - interval: 1m
807 input_series:
808 - series: 'ceph_health_detail{name="MDS_ALL_DOWN"}'
809 values: '0 0 1 1 1 1 1 1 1 1 1'
810 promql_expr_test:
811 - expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0
812 eval_time: 2m
813 exp_samples:
814 - labels: '{__name__="ceph_health_detail", name="MDS_ALL_DOWN"}'
815 value: 1
816 alert_rule_test:
817 - eval_time: 1m
818 alertname: CephFilesystemOffline
819 - eval_time: 10m
820 alertname: CephFilesystemOffline
821 exp_alerts:
822 - exp_labels:
823 name: MDS_ALL_DOWN
824 severity: critical
825 type: ceph_default
826 oid: 1.3.6.1.4.1.50495.1.2.1.5.3
827 exp_annotations:
828 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down
829 summary: CephFS filesystem is offline
830 description: "All MDS ranks are unavailable. The MDS daemons managing metadata are down, rendering the filesystem offline."
831 - interval: 1m
832 input_series:
833 - series: 'ceph_health_detail{name="FS_DEGRADED"}'
834 values: '0 0 1 1 1 1 1 1 1 1 1'
835 promql_expr_test:
836 - expr: ceph_health_detail{name="FS_DEGRADED"} > 0
837 eval_time: 2m
838 exp_samples:
839 - labels: '{__name__="ceph_health_detail", name="FS_DEGRADED"}'
840 value: 1
841 alert_rule_test:
842 - eval_time: 1m
843 alertname: CephFilesystemDegraded
844 - eval_time: 10m
845 alertname: CephFilesystemDegraded
846 exp_alerts:
847 - exp_labels:
848 name: FS_DEGRADED
849 severity: critical
850 type: ceph_default
851 oid: 1.3.6.1.4.1.50495.1.2.1.5.4
852 exp_annotations:
853 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded
854 summary: CephFS filesystem is degraded
855 description: "One or more metadata daemons (MDS ranks) are failed or in a damaged state. At best the filesystem is partially available, at worst the filesystem is completely unusable."
856 - interval: 1m
857 input_series:
858 - series: 'ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"}'
859 values: '0 0 1 1 1 1 1 1 1 1 1'
860 promql_expr_test:
861 - expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0
862 eval_time: 2m
863 exp_samples:
864 - labels: '{__name__="ceph_health_detail", name="MDS_INSUFFICIENT_STANDBY"}'
865 value: 1
866 alert_rule_test:
867 - eval_time: 1m
868 alertname: CephFilesystemInsufficientStandby
869 - eval_time: 10m
870 alertname: CephFilesystemInsufficientStandby
871 exp_alerts:
872 - exp_labels:
873 name: MDS_INSUFFICIENT_STANDBY
874 severity: warning
875 type: ceph_default
876 exp_annotations:
877 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby
878 summary: Ceph filesystem standby daemons too few
879 description: "The minimum number of standby daemons required by standby_count_wanted is less than the current number of standby daemons. Adjust the standby count or increase the number of MDS daemons."
880 - interval: 1m
881 input_series:
882 - series: 'ceph_health_detail{name="FS_WITH_FAILED_MDS"}'
883 values: '0 0 1 1 1 1 1 1 1 1 1'
884 promql_expr_test:
885 - expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0
886 eval_time: 2m
887 exp_samples:
888 - labels: '{__name__="ceph_health_detail", name="FS_WITH_FAILED_MDS"}'
889 value: 1
890 alert_rule_test:
891 - eval_time: 1m
892 alertname: CephFilesystemFailureNoStandby
893 - eval_time: 10m
894 alertname: CephFilesystemFailureNoStandby
895 exp_alerts:
896 - exp_labels:
897 name: FS_WITH_FAILED_MDS
898 severity: critical
899 type: ceph_default
900 oid: 1.3.6.1.4.1.50495.1.2.1.5.5
901 exp_annotations:
902 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds
903 summary: MDS daemon failed, no further standby available
904 description: "An MDS daemon has failed, leaving only one active rank and no available standby. Investigate the cause of the failure or add a standby MDS."
905 - interval: 1m
906 input_series:
907 - series: 'ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"}'
908 values: '0 0 1 1 1 1 1 1 1 1 1'
909 promql_expr_test:
910 - expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0
911 eval_time: 2m
912 exp_samples:
913 - labels: '{__name__="ceph_health_detail", name="MDS_UP_LESS_THAN_MAX"}'
914 value: 1
915 alert_rule_test:
916 - eval_time: 1m
917 alertname: CephFilesystemMDSRanksLow
918 - eval_time: 10m
919 alertname: CephFilesystemMDSRanksLow
920 exp_alerts:
921 - exp_labels:
922 name: MDS_UP_LESS_THAN_MAX
923 severity: warning
924 type: ceph_default
925 exp_annotations:
926 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max
927 summary: Ceph MDS daemon count is lower than configured
928 description: "The filesystem's 'max_mds' setting defines the number of MDS ranks in the filesystem. The current number of active MDS daemons is less than this value."
929 # MGR
930 - interval: 1m
931 input_series:
932 - series: 'up{job="ceph", instance="ceph-mgr:9283"}'
933 values: '1+0x2 0+0x10'
934 promql_expr_test:
935 - expr: up{job="ceph"} == 0
936 eval_time: 3m
937 exp_samples:
938 - labels: '{__name__="up", job="ceph", instance="ceph-mgr:9283"}'
939 value: 0
940 alert_rule_test:
941 - eval_time: 1m
942 alertname: CephMgrPrometheusModuleInactive
943 - eval_time: 10m
944 alertname: CephMgrPrometheusModuleInactive
945 exp_alerts:
946 - exp_labels:
947 instance: ceph-mgr:9283
948 job: ceph
949 severity: critical
950 type: ceph_default
951 oid: 1.3.6.1.4.1.50495.1.2.1.6.2
952 exp_annotations:
953 summary: The mgr/prometheus module is not available
954 description: "The mgr/prometheus module at ceph-mgr:9283 is unreachable. This could mean that the module has been disabled or the mgr daemon itself is down. Without the mgr/prometheus module, metrics and alerts will no longer function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to determine whether the mgr is active. If the mgr is not active, restart it; otherwise check module status with 'ceph mgr module ls'. If it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'."
955 - interval: 1m
956 input_series:
957 - series: 'ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"}'
958 values: '0+0x2 1+0x20'
959 promql_expr_test:
960 - expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1
961 eval_time: 3m
962 exp_samples:
963 - labels: '{__name__="ceph_health_detail", name="RECENT_MGR_MODULE_CRASH"}'
964 value: 1
965 alert_rule_test:
966 - eval_time: 1m
967 alertname: CephMgrModuleCrash
968 - eval_time: 15m
969 alertname: CephMgrModuleCrash
970 exp_alerts:
971 - exp_labels:
972 name: RECENT_MGR_MODULE_CRASH
973 severity: critical
974 type: ceph_default
975 oid: 1.3.6.1.4.1.50495.1.2.1.6.1
976 exp_annotations:
977 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash
978 summary: A manager module has recently crashed
979 description: "One or more mgr modules have crashed and have yet to be acknowledged by an administrator. A crashed module may impact functionality within the cluster. Use the 'ceph crash' command to determine which module has failed, and archive it to acknowledge the failure."
980 # MON
981 - interval: 1m
982 input_series:
983 - series: 'ceph_health_detail{name="MON_DISK_CRIT"}'
984 values: '0+0x2 1+0x10'
985 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}'
986 values: '1+0x13'
987 promql_expr_test:
988 - expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1
989 eval_time: 3m
990 exp_samples:
991 - labels: '{__name__="ceph_health_detail", name="MON_DISK_CRIT"}'
992 value: 1
993 alert_rule_test:
994 - eval_time: 1m
995 alertname: CephMonDiskspaceCritical
996 - eval_time: 10m
997 alertname: CephMonDiskspaceCritical
998 exp_alerts:
999 - exp_labels:
1000 name: "MON_DISK_CRIT"
1001 severity: critical
1002 type: ceph_default
1003 oid: 1.3.6.1.4.1.50495.1.2.1.3.2
1004 exp_annotations:
1005 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit
1006 summary: Filesystem space on at least one monitor is critically low
1007 description: "The free space available to a monitor's store is critically low. You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem; /var/log and /var/tmp are often culprits. Your monitor hosts are: - ceph-mon-a"
1008 - interval: 1m
1009 input_series:
1010 - series: 'ceph_health_detail{name="MON_DISK_LOW"}'
1011 values: '0+0x2 1+0x10'
1012 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}'
1013 values: '1+0x13'
1014 promql_expr_test:
1015 - expr: ceph_health_detail{name="MON_DISK_LOW"} == 1
1016 eval_time: 3m
1017 exp_samples:
1018 - labels: '{__name__="ceph_health_detail", name="MON_DISK_LOW"}'
1019 value: 1
1020 alert_rule_test:
1021 - eval_time: 1m
1022 alertname: CephMonDiskspaceLow
1023 - eval_time: 10m
1024 alertname: CephMonDiskspaceLow
1025 exp_alerts:
1026 - exp_labels:
1027 name: "MON_DISK_LOW"
1028 severity: warning
1029 type: ceph_default
1030 exp_annotations:
1031 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low
1032 summary: Drive space on at least one monitor is approaching full
1033 description: "The space available to a monitor's store is approaching full (>70% is the default). You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem; /var/log and /var/tmp are often culprits. Your monitor hosts are: - ceph-mon-a"
1034 - interval: 1m
1035 input_series:
1036 - series: 'ceph_health_detail{name="MON_CLOCK_SKEW"}'
1037 values: '0+0x2 1+0x10'
1038 promql_expr_test:
1039 - expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1
1040 eval_time: 3m
1041 exp_samples:
1042 - labels: '{__name__="ceph_health_detail", name="MON_CLOCK_SKEW"}'
1043 value: 1
1044 alert_rule_test:
1045 - eval_time: 1m
1046 alertname: CephMonClockSkew
1047 - eval_time: 10m
1048 alertname: CephMonClockSkew
1049 exp_alerts:
1050 - exp_labels:
1051 name: "MON_CLOCK_SKEW"
1052 severity: warning
1053 type: ceph_default
1054 exp_annotations:
1055 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew
1056 summary: Clock skew detected among monitors
1057 description: "Ceph monitors rely on closely synchronized time to maintain quorum and cluster consistency. This event indicates that the time on at least one mon has drifted too far from the lead mon. Review cluster status with ceph -s. This will show which monitors are affected. Check the time sync status on each monitor host with 'ceph time-sync-status' and the state and peers of your ntpd or chrony daemon."
1058
1059 # Check 3 mons one down, quorum at risk
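# With mon.c out of quorum from t=3m, the two remaining monitors exactly match the quorum
# minimum (floor(3/2) + 1 = 2), so MON_DOWN combined with the quorum-size check makes the
# critical quorum-at-risk alert fire at the 10m evaluation.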
1060 - interval: 1m
1061 input_series:
1062 - series: 'ceph_health_detail{name="MON_DOWN"}'
1063 values: '0+0x2 1+0x12'
1064 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}'
1065 values: '1+0x14'
1066 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}'
1067 values: '1+0x14'
1068 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}'
1069 values: '1+0x2 0+0x12'
1070 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}'
1071 values: '1+0x14'
1072 - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}'
1073 values: '1+0x14'
1074 - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}'
1075 values: '1+0x14'
1076 promql_expr_test:
1077 - expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() (count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1))) == 1
1078 eval_time: 3m
1079 exp_samples:
1080 - labels: '{}'
1081 value: 1
1082 alert_rule_test:
1083 - eval_time: 1m
1084 alertname: CephMonDownQuorumAtRisk
1085 # shouldn't fire
1086 - eval_time: 10m
1087 alertname: CephMonDownQuorumAtRisk
1088 exp_alerts:
1089 - exp_labels:
1090 severity: critical
1091 type: ceph_default
1092 oid: 1.3.6.1.4.1.50495.1.2.1.3.1
1093 exp_annotations:
1094 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
1095 summary: Monitor quorum is at risk
1096 description: "Quorum requires a majority of monitors (x 2) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: - mon.c on ceph-mon-3"
1097 # check 5 mons, 1 down - warning only
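# With 5 monitors and only mon.e down, quorum (3 of 5) is still comfortably intact, so only
# the warning-level CephMonDown is expected here, not the quorum-at-risk alert.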
1098 - interval: 1m
1099 input_series:
1100 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}'
1101 values: '1+0x14'
1102 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}'
1103 values: '1+0x14'
1104 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}'
1105 values: '1+0x14'
1106 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.d"}'
1107 values: '1+0x14'
1108 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.e"}'
1109 values: '1+0x2 0+0x12'
1110 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}'
1111 values: '1+0x14'
1112 - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}'
1113 values: '1+0x14'
1114 - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}'
1115 values: '1+0x14'
1116 - series: 'ceph_mon_metadata{ceph_daemon="mon.d", hostname="ceph-mon-4"}'
1117 values: '1+0x14'
1118 - series: 'ceph_mon_metadata{ceph_daemon="mon.e", hostname="ceph-mon-5"}'
1119 values: '1+0x14'
1120 promql_expr_test:
1121 - expr: (count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1))
1122 eval_time: 3m
1123 exp_samples:
1124 - labels: '{}'
1125 value: 1
1126 alert_rule_test:
1127 - eval_time: 1m
1128 alertname: CephMonDown
1129 - eval_time: 10m
1130 alertname: CephMonDown
1131 exp_alerts:
1132 - exp_labels:
1133 severity: warning
1134 type: ceph_default
1135 exp_annotations:
1136 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
1137 summary: One or more monitors down
1138 description: "You have 1 monitor down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: - mon.e on ceph-mon-5\n"
1139 # Device Health
1140 - interval: 1m
1141 input_series:
1142 - series: 'ceph_health_detail{name="DEVICE_HEALTH"}'
1143 values: '0+0x2 1+0x10'
1144 promql_expr_test:
1145 - expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1
1146 eval_time: 3m
1147 exp_samples:
1148 - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH"}'
1149 value: 1
1150 alert_rule_test:
1151 - eval_time: 1m
1152 alertname: CephDeviceFailurePredicted
1153 - eval_time: 10m
1154 alertname: CephDeviceFailurePredicted
1155 exp_alerts:
1156 - exp_labels:
1157 name: "DEVICE_HEALTH"
1158 severity: warning
1159 type: ceph_default
1160 exp_annotations:
1161 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2
1162 summary: Device(s) predicted to fail soon
1163 description: "The device health module has determined that one or more devices will fail soon. To review device status use 'ceph device ls'. To show a specific device use 'ceph device info <dev id>'. Mark the OSD out so that data may migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace the device, and redeploy the OSD."
1164 - interval: 1m
1165 input_series:
1166 - series: 'ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"}'
1167 values: '0+0x2 1+0x10'
1168 promql_expr_test:
1169 - expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1
1170 eval_time: 3m
1171 exp_samples:
1172 - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_TOOMANY"}'
1173 value: 1
1174 alert_rule_test:
1175 - eval_time: 1m
1176 alertname: CephDeviceFailurePredictionTooHigh
1177 - eval_time: 10m
1178 alertname: CephDeviceFailurePredictionTooHigh
1179 exp_alerts:
1180 - exp_labels:
1181 name: "DEVICE_HEALTH_TOOMANY"
1182 severity: critical
1183 type: ceph_default
1184 oid: 1.3.6.1.4.1.50495.1.2.1.4.7
1185 exp_annotations:
1186 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany
1187 summary: Too many devices are predicted to fail, unable to resolve
1188 description: "The device health module has determined that the devices predicted to fail cannot be remediated automatically, because doing so would remove too many OSDs from the cluster to maintain performance and availability. Prevent data integrity issues by adding new OSDs so that data may be relocated."
1189 - interval: 1m
1190 input_series:
1191 - series: 'ceph_health_detail{name="DEVICE_HEALTH_IN_USE"}'
1192 values: '0+0x2 1+0x10'
1193 promql_expr_test:
1194 - expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1
1195 eval_time: 3m
1196 exp_samples:
1197 - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_IN_USE"}'
1198 value: 1
1199 alert_rule_test:
1200 - eval_time: 1m
1201 alertname: CephDeviceFailureRelocationIncomplete
1202 - eval_time: 10m
1203 alertname: CephDeviceFailureRelocationIncomplete
1204 exp_alerts:
1205 - exp_labels:
1206 name: "DEVICE_HEALTH_IN_USE"
1207 severity: warning
1208 type: ceph_default
1209 exp_annotations:
1210 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use
1211 summary: Device failure is predicted, but unable to relocate data
1212 description: "The device health module has determined that one or more devices will fail soon, but the normal process of relocating the data on the device to other OSDs in the cluster is blocked. \nEnsure that the cluster has available free space. It may be necessary to add capacity to the cluster to allow data from the failing device to successfully migrate, or to enable the balancer."
1213 # OSD
1214 - interval: 1m
1215 input_series:
1216 - series: 'ceph_health_detail{name="OSD_HOST_DOWN"}'
1217 values: '0+0x2 1+0x10'
1218 - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
1219 values: '1+0x2 0+0x10'
1220 - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}'
1221 values: '1+0x12'
1222 promql_expr_test:
1223 - expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1
1224 eval_time: 3m
1225 exp_samples:
1226 - labels: '{__name__="ceph_health_detail", name="OSD_HOST_DOWN"}'
1227 value: 1
1228 alert_rule_test:
1229 - eval_time: 1m
1230 alertname: CephOSDHostDown
1231 - eval_time: 10m
1232 alertname: CephOSDHostDown
1233 exp_alerts:
1234 - exp_labels:
1235 name: "OSD_HOST_DOWN"
1236 severity: warning
1237 type: ceph_default
1238 oid: 1.3.6.1.4.1.50495.1.2.1.4.8
1239 exp_annotations:
1240 summary: An OSD host is offline
1241 description: "The following OSDs are down: - ceph-osd-1 : osd.0"
1242 - interval: 1m
1243 input_series:
1244 - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"}'
1245 values: '0+0x2 1+0x20'
1246 promql_expr_test:
1247 - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 0
1248 eval_time: 1m
1249 exp_samples:
1250 - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_FRONT"}'
1251 value: 0
1252 alert_rule_test:
1253 - eval_time: 1m
1254 alertname: CephOSDTimeoutsPublicNetwork
1255 - eval_time: 10m
1256 alertname: CephOSDTimeoutsPublicNetwork
1257 exp_alerts:
1258 - exp_labels:
1259 name: "OSD_SLOW_PING_TIME_FRONT"
1260 severity: warning
1261 type: ceph_default
1262 exp_annotations:
1263 summary: Network issues delaying OSD heartbeats (public network)
1264 description: "OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network for latency or loss issues. Use 'ceph health detail' to show the affected OSDs."
1265 - interval: 1m
1266 input_series:
1267 - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"}'
1268 values: '0+0x2 1+0x20'
1269 promql_expr_test:
1270 - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 0
1271 eval_time: 1m
1272 exp_samples:
1273 - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_BACK"}'
1274 value: 0
1275 alert_rule_test:
1276 - eval_time: 1m
1277 alertname: CephOSDTimeoutsClusterNetwork
1278 - eval_time: 10m
1279 alertname: CephOSDTimeoutsClusterNetwork
1280 exp_alerts:
1281 - exp_labels:
1282 name: "OSD_SLOW_PING_TIME_BACK"
1283 severity: warning
1284 type: ceph_default
1285 exp_annotations:
1286 summary: Network issues delaying OSD heartbeats (cluster network)
1287 description: "OSD heartbeats on the cluster's 'cluster' network (backend) are slow. Investigate the network for latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs."
1288 - interval: 1m
1289 input_series:
1290 - series: 'ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"}'
1291 values: '0+0x2 1+0x20'
1292 promql_expr_test:
1293 - expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 0
1294 eval_time: 1m
1295 exp_samples:
1296 - labels: '{__name__="ceph_health_detail", name="BLUESTORE_DISK_SIZE_MISMATCH"}'
1297 value: 0
1298 alert_rule_test:
1299 - eval_time: 1m
1300 alertname: CephOSDInternalDiskSizeMismatch
1301 - eval_time: 10m
1302 alertname: CephOSDInternalDiskSizeMismatch
1303 exp_alerts:
1304 - exp_labels:
1305 name: "BLUESTORE_DISK_SIZE_MISMATCH"
1306 severity: warning
1307 type: ceph_default
1308 exp_annotations:
1309 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch
1310 summary: OSD size inconsistency error
1311 description: "One or more OSDs have an internal inconsistency between metadata and the size of the device. This could lead to the OSD(s) crashing in the future. You should redeploy the affected OSDs."
1312 - interval: 30s
1313 input_series:
1314 - series: 'ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"}'
1315 values: '0+0x2 1+0x20'
1316 promql_expr_test:
1317 - expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1
1318 eval_time: 3m
1319 exp_samples:
1320 - labels: '{__name__="ceph_health_detail", name="BLUESTORE_SPURIOUS_READ_ERRORS"}'
1321 value: 1
1322 alert_rule_test:
1323 - eval_time: 1m
1324 alertname: CephOSDReadErrors
1325 - eval_time: 10m
1326 alertname: CephOSDReadErrors
1327 exp_alerts:
1328 - exp_labels:
1329 name: "BLUESTORE_SPURIOUS_READ_ERRORS"
1330 severity: warning
1331 type: ceph_default
1332 exp_annotations:
1333 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors
1334 summary: Device read errors detected
1335 description: "An OSD has encountered read errors, but the OSD has recovered by retrying the reads. This may indicate an issue with hardware or the kernel."
1336 - interval: 1m
1337 input_series:
1338 - series: 'ceph_health_detail{name="OSD_DOWN"}'
1339 values: '0+0x2 1+0x10'
1340 - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
1341 values: '1+0x12'
1342 - series: 'ceph_osd_up{ceph_daemon="osd.1"}'
1343 values: '1+0x2 0+0x10'
1344 - series: 'ceph_osd_up{ceph_daemon="osd.2"}'
1345 values: '1+0x12'
1346 - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}'
1347 values: '1+0x12'
1348 - series: 'ceph_osd_metadata{ceph_daemon="osd.1", hostname="ceph-osd-2"}'
1349 values: '1+0x12'
1350 - series: 'ceph_osd_metadata{ceph_daemon="osd.2", hostname="ceph-osd-3"}'
1351 values: '1+0x12'
1352 promql_expr_test:
1353 - expr: ceph_health_detail{name="OSD_DOWN"} == 1
1354 eval_time: 3m
1355 exp_samples:
1356 - labels: '{__name__="ceph_health_detail", name="OSD_DOWN"}'
1357 value: 1
1358 alert_rule_test:
1359 - eval_time: 1m
1360 alertname: CephOSDDown
1361 - eval_time: 10m
1362 alertname: CephOSDDown
1363 exp_alerts:
1364 - exp_labels:
1365 name: "OSD_DOWN"
1366 severity: warning
1367 type: ceph_default
1368 oid: 1.3.6.1.4.1.50495.1.2.1.4.2
1369 exp_annotations:
1370 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down
1371 summary: An OSD has been marked down
1372 description: "1 OSD down for over 5mins. The following OSD is down: - osd.1 on ceph-osd-2\n"
1373 - interval: 1m
1374 input_series:
1375 - series: 'ceph_health_detail{name="OSD_NEARFULL"}'
1376 values: '0+0x2 1+0x10'
1377 promql_expr_test:
1378 - expr: ceph_health_detail{name="OSD_NEARFULL"} == 1
1379 eval_time: 3m
1380 exp_samples:
1381 - labels: '{__name__="ceph_health_detail", name="OSD_NEARFULL"}'
1382 value: 1
1383 alert_rule_test:
1384 - eval_time: 1m
1385 alertname: CephOSDNearFull
1386 - eval_time: 10m
1387 alertname: CephOSDNearFull
1388 exp_alerts:
1389 - exp_labels:
1390 name: "OSD_NEARFULL"
1391 severity: warning
1392 type: ceph_default
1393 oid: 1.3.6.1.4.1.50495.1.2.1.4.3
1394 exp_annotations:
1395 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull
1396 summary: OSD(s) running low on free space (NEARFULL)
1397 description: One or more OSDs have reached the NEARFULL threshold. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.
1398 - interval: 1m
1399 input_series:
1400 - series: 'ceph_health_detail{name="OSD_FULL"}'
1401 values: '0+0x2 1+0x10'
1402 promql_expr_test:
1403 - expr: ceph_health_detail{name="OSD_FULL"} == 1
1404 eval_time: 3m
1405 exp_samples:
1406 - labels: '{__name__="ceph_health_detail", name="OSD_FULL"}'
1407 value: 1
1408 alert_rule_test:
1409 - eval_time: 1m
1410 alertname: CephOSDFull
1411 - eval_time: 10m
1412 alertname: CephOSDFull
1413 exp_alerts:
1414 - exp_labels:
1415 name: "OSD_FULL"
1416 severity: critical
1417 type: ceph_default
1418 oid: 1.3.6.1.4.1.50495.1.2.1.4.6
1419 exp_annotations:
1420 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full
1421 summary: OSD full, writes blocked
1422 description: An OSD has reached the FULL threshold. Writes to pools that share the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.
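# OSD backfillfull: OSD_BACKFILLFULL raises CephOSDBackfillFull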
1423 - interval: 1m
1424 input_series:
1425 - series: 'ceph_health_detail{name="OSD_BACKFILLFULL"}'
1426 values: '0+0x2 1+0x10'
1427 promql_expr_test:
1428 - expr: ceph_health_detail{name="OSD_BACKFILLFULL"} == 1
1429 eval_time: 3m
1430 exp_samples:
1431 - labels: '{__name__="ceph_health_detail", name="OSD_BACKFILLFULL"}'
1432 value: 1
1433 alert_rule_test:
1434 - eval_time: 1m
1435 alertname: CephOSDBackfillFull
1436 - eval_time: 10m
1437 alertname: CephOSDBackfillFull
1438 exp_alerts:
1439 - exp_labels:
1440 name: "OSD_BACKFILLFULL"
1441 severity: warning
1442 type: ceph_default
1443 exp_annotations:
1444 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull
1445 summary: OSD(s) too full for backfill operations
1446 description: "An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations from completing. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data."
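# OSD too many repairs: OSD_TOO_MANY_REPAIRS raises CephOSDTooManyRepairs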
1447 - interval: 30s
1448 input_series:
1449 - series: 'ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"}'
1450 values: '0+0x2 1+0x20'
1451 promql_expr_test:
1452 - expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 0
1453 eval_time: 1m
1454 exp_samples:
1455 - labels: '{__name__="ceph_health_detail", name="OSD_TOO_MANY_REPAIRS"}'
1456 value: 0
1457 alert_rule_test:
1458 - eval_time: 1m
1459 alertname: CephOSDTooManyRepairs
1460 - eval_time: 10m
1461 alertname: CephOSDTooManyRepairs
1462 exp_alerts:
1463 - exp_labels:
1464 name: "OSD_TOO_MANY_REPAIRS"
1465 severity: warning
1466 type: ceph_default
1467 exp_annotations:
1468 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs
1469 summary: OSD reports a high number of read errors
1470 description: Reads from an OSD have used a secondary PG to return data to the client, indicating a potential failing drive.
1471 # Pools
1472 # trigger percent full prediction on pools 1 and 2 only
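# (the 12h sample interval gives predict_linear a 2d history; pool 1's trend projects past 95% within 5 days
# (142% in the expected sample), while pool 2's does not)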
1473 - interval: 12h
1474 input_series:
1475 - series: 'ceph_pool_percent_used{pool_id="1"}'
1476 values: '70 75 80 87 92'
1477 - series: 'ceph_pool_percent_used{pool_id="2"}'
1478 values: '22 22 23 23 24'
1479 - series: 'ceph_pool_metadata{pool_id="1",name="rbd",type="replicated"}'
1480 values: '1 1 1 1 1'
1481 - series: 'ceph_pool_metadata{pool_id="2",name="default.rgw.index",type="replicated"}'
1482 values: '1 1 1 1 1'
1483 promql_expr_test:
1484 - expr: |
1485 (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id)
1486 group_right ceph_pool_metadata) >= 95
1487 eval_time: 36h
1488 exp_samples:
1489 - labels: '{name="rbd",pool_id="1",type="replicated"}'
1490 value: 1.424E+02 # 142%
1491 alert_rule_test:
1492 - eval_time: 48h
1493 alertname: CephPoolGrowthWarning
1494 exp_alerts:
1495 - exp_labels:
1496 name: rbd
1497 pool_id: 1
1498 severity: warning
1499 type: ceph_default
1500 oid: 1.3.6.1.4.1.50495.1.2.1.9.2
1501 exp_annotations:
1502 summary: Pool growth rate may soon exceed capacity
1503 description: Pool 'rbd' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
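# pool backfillfull: POOL_BACKFILLFULL raises CephPoolBackfillFull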
1504 - interval: 1m
1505 input_series:
1506 - series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}'
1507 values: '0+0x2 1+0x10'
1508 promql_expr_test:
1509 - expr: ceph_health_detail{name="POOL_BACKFILLFULL"} == 1
1510 eval_time: 3m
1511 exp_samples:
1512 - labels: '{__name__="ceph_health_detail", name="POOL_BACKFILLFULL"}'
1513 value: 1
1514 alert_rule_test:
1515 - eval_time: 1m
1516 alertname: CephPoolBackfillFull
1517 - eval_time: 5m
1518 alertname: CephPoolBackfillFull
1519 exp_alerts:
1520 - exp_labels:
1521 name: "POOL_BACKFILLFULL"
1522 severity: warning
1523 type: ceph_default
1524 exp_annotations:
1525 summary: Free space in a pool is too low for recovery/backfill
1526 description: A pool is approaching the near full threshold, which will prevent recovery/backfill operations from completing. Consider adding more capacity.
1527
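# pool near full: POOL_NEAR_FULL raises CephPoolNearFull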
1528 - interval: 1m
1529 input_series:
1530 - series: 'ceph_health_detail{name="POOL_NEAR_FULL"}'
1531 values: '0+0x2 1+0x10'
1532 promql_expr_test:
1533 - expr: ceph_health_detail{name="POOL_NEAR_FULL"} == 1
1534 eval_time: 3m
1535 exp_samples:
1536 - labels: '{__name__="ceph_health_detail", name="POOL_NEAR_FULL"}'
1537 value: 1
1538 alert_rule_test:
1539 - eval_time: 1m
1540 alertname: CephPoolNearFull
1541 - eval_time: 10m
1542 alertname: CephPoolNearFull
1543 exp_alerts:
1544 - exp_labels:
1545 name: "POOL_NEAR_FULL"
1546 severity: warning
1547 type: ceph_default
1548 exp_annotations:
1549 summary: One or more Ceph pools are nearly full
1550 description: "A pool has exceeded the warning (percent full) threshold, or OSDs supporting the pool have reached the NEARFULL threshold. Writes may continue, but you are at risk of the pool going read-only if more capacity isn't made available. Determine the affected pool with 'ceph df detail', looking at QUOTA BYTES and STORED. Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>). Also ensure that the balancer is active."
1551
1552 # PGs
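# PG not scrubbed: PG_NOT_SCRUBBED raises CephPGNotScrubbed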
1553 - interval: 1m
1554 input_series:
1555 - series: 'ceph_health_detail{name="PG_NOT_SCRUBBED"}'
1556 values: '0+0x2 1+0x10'
1557 promql_expr_test:
1558 - expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1
1559 eval_time: 3m
1560 exp_samples:
1561 - labels: '{__name__="ceph_health_detail", name="PG_NOT_SCRUBBED"}'
1562 value: 1
1563 alert_rule_test:
1564 - eval_time: 1m
1565 alertname: CephPGNotScrubbed
1566 - eval_time: 10m
1567 alertname: CephPGNotScrubbed
1568 exp_alerts:
1569 - exp_labels:
1570 name: "PG_NOT_SCRUBBED"
1571 severity: warning
1572 type: ceph_default
1573 exp_annotations:
1574 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed
1575 summary: Placement group(s) have not been scrubbed
1576 description: "One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, protecting against bit-rot. They check that metadata is consistent across data replicas. When PGs miss their scrub interval, it may indicate that the scrub window is too small, or PGs were not in a 'clean' state during the scrub window. You can manually initiate a scrub with: ceph pg scrub <pgid>"
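# PG damaged: PG_DAMAGED (or OSD_SCRUB_ERRORS) raises CephPGsDamaged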
1577 - interval: 1m
1578 input_series:
1579 - series: 'ceph_health_detail{name="PG_DAMAGED"}'
1580 values: '0+0x4 1+0x20'
1581 promql_expr_test:
1582 - expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1
1583 eval_time: 5m
1584 exp_samples:
1585 - labels: '{__name__="ceph_health_detail", name="PG_DAMAGED"}'
1586 value: 1
1587 alert_rule_test:
1588 - eval_time: 1m
1589 alertname: CephPGsDamaged
1590 - eval_time: 10m
1591 alertname: CephPGsDamaged
1592 exp_alerts:
1593 - exp_labels:
1594 name: "PG_DAMAGED"
1595 severity: critical
1596 type: ceph_default
1597 oid: 1.3.6.1.4.1.50495.1.2.1.7.4
1598 exp_annotations:
1599 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged
1600 summary: Placement group damaged, manual intervention needed
1601 description: During data consistency checks (scrub), at least one PG has been flagged as being damaged or inconsistent. Check to see which PG is affected, and attempt a manual repair if necessary. To list problematic placement groups, use 'rados list-inconsistent-pg <pool>'. To repair PGs use the 'ceph pg repair <pg_num>' command.
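# too many PGs per OSD: TOO_MANY_PGS raises CephPGsHighPerOSD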
1602 - interval: 1m
1603 input_series:
1604 - series: 'ceph_health_detail{name="TOO_MANY_PGS"}'
1605 values: '0+0x4 1+0x20'
1606 promql_expr_test:
1607 - expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1
1608 eval_time: 5m
1609 exp_samples:
1610 - labels: '{__name__="ceph_health_detail", name="TOO_MANY_PGS"}'
1611 value: 1
1612 alert_rule_test:
1613 - eval_time: 1m
1614 alertname: CephPGsHighPerOSD
1615 - eval_time: 10m
1616 alertname: CephPGsHighPerOSD
1617 exp_alerts:
1618 - exp_labels:
1619 name: "TOO_MANY_PGS"
1620 severity: warning
1621 type: ceph_default
1622 exp_annotations:
1623 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs
1624 summary: Placement groups per OSD is too high
1625 description: "The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).\n Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools."
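# PG recovery blocked by full OSDs: PG_RECOVERY_FULL raises CephPGRecoveryAtRisk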
1626 - interval: 1m
1627 input_series:
1628 - series: 'ceph_health_detail{name="PG_RECOVERY_FULL"}'
1629 values: '0+0x2 1+0x20'
1630 promql_expr_test:
1631 - expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 0
1632 eval_time: 1m
1633 exp_samples:
1634 - labels: '{__name__="ceph_health_detail", name="PG_RECOVERY_FULL"}'
1635 value: 0
1636 alert_rule_test:
1637 - eval_time: 1m
1638 alertname: CephPGRecoveryAtRisk
1639 - eval_time: 10m
1640 alertname: CephPGRecoveryAtRisk
1641 exp_alerts:
1642 - exp_labels:
1643 name: "PG_RECOVERY_FULL"
1644 severity: critical
1645 type: ceph_default
1646 oid: 1.3.6.1.4.1.50495.1.2.1.7.5
1647 exp_annotations:
1648 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full
1649 summary: OSDs are too full for recovery
1650 description: Data redundancy is at risk since one or more OSDs are at or above the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, or delete unwanted data.
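# PG backfill blocked by full OSDs: PG_BACKFILL_FULL raises CephPGBackfillAtRisk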
1651 - interval: 1m
1652 input_series:
1653 - series: 'ceph_health_detail{name="PG_BACKFILL_FULL"}'
1654 values: '0+0x2 1+0x20'
1655 promql_expr_test:
1656 - expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 0
1657 eval_time: 1m
1658 exp_samples:
1659 - labels: '{__name__="ceph_health_detail", name="PG_BACKFILL_FULL"}'
1660 value: 0
1661 alert_rule_test:
1662 - eval_time: 1m
1663 alertname: CephPGBackfillAtRisk
1664 - eval_time: 10m
1665 alertname: CephPGBackfillAtRisk
1666 exp_alerts:
1667 - exp_labels:
1668 name: "PG_BACKFILL_FULL"
1669 severity: critical
1670 type: ceph_default
1671 oid: 1.3.6.1.4.1.50495.1.2.1.7.6
1672 exp_annotations:
1673 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full
1674 summary: Backfill operations are blocked due to lack of free space
1675 description: Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add more capacity, or delete unwanted data.
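# PG availability blocking I/O: the tested expression subtracts scalar(OSD_DOWN), so no alert is expected while OSD_DOWN is also active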
1676 - interval: 1m
1677 input_series:
1678 - series: 'ceph_health_detail{name="PG_AVAILABILITY"}'
1679 values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1'
1680 - series: 'ceph_health_detail{name="OSD_DOWN"}'
1681 values: '0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0'
1682 promql_expr_test:
1683 - expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"}))
1684 eval_time: 1m
1685 # empty set at 1m
1686 exp_samples:
1687 alert_rule_test:
1688 # PG_AVAILABILITY and OSD_DOWN not firing: no alert
1689 - eval_time: 1m
1690 alertname: CephPGUnavilableBlockingIO
1691 exp_alerts:
1692 # PG_AVAILABILITY firing, but OSD_DOWN is active: no alert
1693 - eval_time: 5m
1694 alertname: CephPGUnavilableBlockingIO
1695 exp_alerts:
1696 # PG_AVAILABILITY firing and OSD_DOWN not active: raise the alert
1697 - eval_time: 15m
1698 alertname: CephPGUnavilableBlockingIO
1699 exp_alerts:
1700 - exp_labels:
1701 name: "PG_AVAILABILITY"
1702 severity: critical
1703 type: ceph_default
1704 oid: 1.3.6.1.4.1.50495.1.2.1.7.3
1705 exp_annotations:
1706 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability
1707 summary: PG is unavailable, blocking I/O
1708 description: Data availability is reduced, impacting the cluster's ability to service I/O. One or more placement groups (PGs) are in a state that blocks I/O.
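# PG not deep-scrubbed: PG_NOT_DEEP_SCRUBBED raises CephPGNotDeepScrubbed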
1709 - interval: 1m
1710 input_series:
1711 - series: 'ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"}'
1712 values: '0+0x2 1+0x10'
1713 promql_expr_test:
1714 - expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1
1715 eval_time: 3m
1716 exp_samples:
1717 - labels: '{__name__="ceph_health_detail", name="PG_NOT_DEEP_SCRUBBED"}'
1718 value: 1
1719 alert_rule_test:
1720 - eval_time: 1m
1721 alertname: CephPGNotDeepScrubbed
1722 - eval_time: 10m
1723 alertname: CephPGNotDeepScrubbed
1724 exp_alerts:
1725 - exp_labels:
1726 name: "PG_NOT_DEEP_SCRUBBED"
1727 severity: warning
1728 type: ceph_default
1729 exp_annotations:
1730 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed
1731 summary: Placement group(s) have not been deep scrubbed
1732 description: One or more PGs have not been deep scrubbed recently. Deep scrubs protect against bit-rot. They compare data replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate that the window is too small or PGs were not in a 'clean' state during the deep-scrub window.
1733
1734 # Prometheus
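# missing scrape job: absent(up{job="ceph"}) raises PrometheusJobMissing even though another job ('myjob') is still present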
1735 - interval: 1m
1736 input_series:
1737 - series: 'up{job="myjob"}'
1738 values: '1+0x10'
1739 promql_expr_test:
1740 - expr: absent(up{job="ceph"})
1741 eval_time: 1m
1742 exp_samples:
1743 - labels: '{job="ceph"}'
1744 value: 1
1745 alert_rule_test:
1746 - eval_time: 5m
1747 alertname: PrometheusJobMissing
1748 exp_alerts:
1749 - exp_labels:
1750 job: ceph
1751 severity: critical
1752 type: ceph_default
1753 oid: 1.3.6.1.4.1.50495.1.2.1.12.1
1754 exp_annotations:
1755 summary: The scrape job for Ceph is missing from Prometheus
1756 description: The prometheus job that scrapes from Ceph is no longer defined. This will effectively mean you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance.
1757 # RADOS
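# object unfound: CephObjectMissing fires only when OBJECT_UNFOUND is set and every OSD with metadata reports up
# (count(ceph_osd_up == 1) == count(ceph_osd_metadata))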
1758 - interval: 1m
1759 input_series:
1760 - series: 'ceph_health_detail{name="OBJECT_UNFOUND"}'
1761 values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1762 - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
1763 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1764 - series: 'ceph_osd_up{ceph_daemon="osd.1"}'
1765 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1766 - series: 'ceph_osd_up{ceph_daemon="osd.2"}'
1767 values: '1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1768 - series: 'ceph_osd_metadata{ceph_daemon="osd.0"}'
1769 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1770 - series: 'ceph_osd_metadata{ceph_daemon="osd.1"}'
1771 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1772 - series: 'ceph_osd_metadata{ceph_daemon="osd.2"}'
1773 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1774 promql_expr_test:
1775 - expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1
1776 eval_time: 1m
1777 exp_samples:
1778 alert_rule_test:
1779 # OBJECT_UNFOUND but osd.2 is down, so don't fire
1780 - eval_time: 5m
1781 alertname: CephObjectMissing
1782 exp_alerts:
1783 # OBJECT_UNFOUND and all OSDs are online, so fire
1784 - eval_time: 15m
1785 alertname: CephObjectMissing
1786 exp_alerts:
1787 - exp_labels:
1788 severity: critical
1789 type: ceph_default
1790 oid: 1.3.6.1.4.1.50495.1.2.1.10.1
1791 exp_annotations:
1792 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound
1793 summary: Object(s) marked UNFOUND
1794 description: The latest version of a RADOS object can not be found, even though all OSDs are up. I/O requests for this object from clients will block (hang). Resolving this issue may require the object to be rolled back to a prior version manually, and manually verified.
1795 # Generic Alerts
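# daemon crash: RECENT_CRASH raises CephDaemonCrash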
1796 - interval: 1m
1797 input_series:
1798 - series: 'ceph_health_detail{name="RECENT_CRASH"}'
1799 values: '0 0 0 1 1 1 1 1 1 1 1'
1800 promql_expr_test:
1801 - expr: ceph_health_detail{name="RECENT_CRASH"} == 1
1802 eval_time: 1m
1803 exp_samples:
1804 alert_rule_test:
1805 # not firing
1806 - eval_time: 1m
1807 alertname: CephDaemonCrash
1808 exp_alerts:
1809 # firing
1810 - eval_time: 10m
1811 alertname: CephDaemonCrash
1812 exp_alerts:
1813 - exp_labels:
1814 name: RECENT_CRASH
1815 severity: critical
1816 type: ceph_default
1817 oid: 1.3.6.1.4.1.50495.1.2.1.1.2
1818 exp_annotations:
1819 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash
1820 summary: One or more Ceph daemons have crashed, and are pending acknowledgement
1821 description: One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive <id>' command.