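# Unit tests for the Ceph Prometheus alerting rules defined in
# ceph_default_alerts.yml. They follow the promtool rule-test format and are
# normally run with something like `promtool test rules test_alerts.yml`
# (the exact invocation depends on your packaging).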
rule_files:
  - ceph_default_alerts.yml
evaluation_interval: 5m
tests:
  # health error
  - interval: 5m
    input_series:
      - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
        values: '2 2 2 2 2 2 2'
    promql_expr_test:
      - expr: ceph_health_status == 2
        eval_time: 5m
        exp_samples:
          - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
            value: 2
    alert_rule_test:
      - eval_time: 1m
        alertname: health error
      - eval_time: 6m
        alertname: health error
        exp_alerts:
          - exp_labels:
              instance: ceph:9283
              job: ceph
              oid: 1.3.6.1.4.1.50495.15.1.2.2.1
              type: ceph_default
              severity: critical
            exp_annotations:
              description: >
                Ceph in HEALTH_ERROR state for more than 5 minutes.
                Please check "ceph health detail" for more information.

  # health warning
  - interval: 5m
    input_series:
      - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
        values: '1 1 1 1 1 1 1 1 1 1'
    promql_expr_test:
      - expr: ceph_health_status == 1
        eval_time: 15m
        exp_samples:
          - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
            value: 1
    alert_rule_test:
      - eval_time: 10m
        alertname: health warn
      - eval_time: 20m
        alertname: health warn
        exp_alerts:
          - exp_labels:
              instance: ceph:9283
              job: ceph
              oid: 1.3.6.1.4.1.50495.15.1.2.2.2
              type: ceph_default
              severity: warning
            exp_annotations:
              description: >
                Ceph has been in HEALTH_WARN for more than 15 minutes.
                Please check "ceph health detail" for more information.

  # low monitor quorum count
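  # sum(ceph_mon_quorum_status) counts monitors in quorum: mon.a and mon.b
  # report 1 and mon.c reports 0, so the sum is 2, which is below the
  # threshold of three monitors.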
  - interval: 1m
    input_series:
      - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a",instance="ceph:9283",
                 job="ceph"}'
        values: '1 1 1 1 1'
      - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b",instance="ceph:9283",
                 job="ceph"}'
        values: '1 1 1 1 1'
      - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c",instance="ceph:9283",
                 job="ceph"}'
        values: '0 0 0 0 0'
      - series: 'ceph_mon_metadata{ceph_daemon="mon.a",ceph_version="ceph version
                 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific
                 (dev)",hostname="ceph",instance="ceph:9283",job="ceph",
                 public_addr="172.20.0.2",rank="0"}'
        values: '1 1 1 1 1'
      - series: 'ceph_mon_metadata{ceph_daemon="mon.b",ceph_version="ceph version
                 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific
                 (dev)",hostname="ceph",instance="ceph:9283",job="ceph",
                 public_addr="172.20.0.2",rank="1"}'
        values: '1 1 1 1 1'
      - series: 'ceph_mon_metadata{ceph_daemon="mon.c",ceph_version="ceph version
                 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific
                 (dev)",hostname="ceph",instance="ceph:9283",job="ceph",
                 public_addr="172.20.0.2",rank="2"}'
        values: '1 1 1 1 1'
    promql_expr_test:
      - expr: sum(ceph_mon_quorum_status) < 3
        eval_time: 1m
        exp_samples:
          - labels: '{}'
            value: 2
    alert_rule_test:
      - eval_time: 1m
        alertname: low monitor quorum count
        exp_alerts:
          - exp_labels:
              oid: 1.3.6.1.4.1.50495.15.1.2.3.1
              type: ceph_default
              severity: critical
            exp_annotations:
              description: |
                Monitor count in quorum is below three.

                Only 2 of 3 monitors are active.

                The following monitors are down:
                  - mon.c on ceph

  # 10% OSDs down
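  # count(ceph_osd_up == 0) / count(ceph_osd_up) is the fraction of OSDs that
  # are down; with osd.1 down that is 1 of 3, i.e. the 33.33% expected below,
  # which is at or above the 10% threshold.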
  - interval: 1m
    input_series:
      - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
        values: '1 1 1 1 1'
      - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
        values: '0 0 0 0 0'
      - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
        values: '1 1 1 1 1'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
                 ceph_version="ceph version 17.0.0-189-g3558fd72
                 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
                 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
                 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
                 public_addr="172.20.0.2"}'
        values: '1 1 1 1 1'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
                 ceph_version="ceph version 17.0.0-189-g3558fd72
                 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
                 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
                 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
                 public_addr="172.20.0.2"}'
        values: '1 1 1 1 1'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
                 ceph_version="ceph version 17.0.0-189-g3558fd72
                 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
                 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
                 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
                 public_addr="172.20.0.2"}'
        values: '1 1 1 1 1'
    promql_expr_test:
      - expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
        eval_time: 1m
        exp_samples:
          - labels: '{}'
            value: 3.333333333333333E+01
    alert_rule_test:
      - eval_time: 1m
        alertname: 10% OSDs down
        exp_alerts:
          - exp_labels:
              oid: 1.3.6.1.4.1.50495.15.1.2.4.1
              type: ceph_default
              severity: critical
            exp_annotations:
              description: |
                33.33% or 1 of 3 OSDs are down (≥ 10%).

                The following OSDs are down:
                  - osd.1 on ceph

  # OSD down
  - interval: 1m
    input_series:
      - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
      - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
        values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
      - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
                 ceph_version="ceph version 17.0.0-189-g3558fd72
                 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
                 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
                 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
                 public_addr="172.20.0.2"}'
        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
                 ceph_version="ceph version 17.0.0-189-g3558fd72
                 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
                 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
                 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
                 public_addr="172.20.0.2"}'
        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
                 ceph_version="ceph version 17.0.0-189-g3558fd72
                 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
                 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
                 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
                 public_addr="172.20.0.2"}'
        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
    promql_expr_test:
      - expr: count(ceph_osd_up == 0) > 0
        eval_time: 1m
        exp_samples:
          - labels: '{}'
            value: 1
    alert_rule_test:
      - eval_time: 15m
        alertname: OSD down
        exp_alerts:
          - exp_labels:
              oid: 1.3.6.1.4.1.50495.15.1.2.4.2
              type: ceph_default
              severity: warning
            exp_annotations:
              description: |

                1 OSD down for more than 15 minutes.

                1 of 3 OSDs are down.

                The following OSD is down:
                  - osd.1 on ceph

  # OSDs near full
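  # The expression below computes per-OSD utilisation, keeps only OSDs that
  # are up, and joins ceph_osd_metadata via group_left to attach the hostname
  # label. osd.2's last sample is 100856561909.76 of 108447916032 bytes,
  # roughly 93%, which is above the 90% threshold.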
  - interval: 1m
    input_series:
      - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.0",instance="ceph:9283"
                 ,job="ceph"}'
        values: '1076310016 1076310016 1076310016 1076310016 1076310016
                 1076310016'
      - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.1",instance="ceph:9283"
                 ,job="ceph"}'
        values: '1076310016 1076310016 1076310016 1076310016 1076310016
                 1076310016'
      - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.2",instance="ceph:9283"
                 ,job="ceph"}'
        values: '1076310016 1076310016 1076310016 1076310016 1076310016
                 100856561909.76'
      - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.0",instance="ceph:9283"
                 ,job="ceph"}'
        values: '108447916032 108447916032 108447916032 108447916032 108447916032
                 108447916032'
      - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.1",instance="ceph:9283"
                 ,job="ceph"}'
        values: '108447916032 108447916032 108447916032 108447916032 108447916032
                 108447916032'
      - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.2",instance="ceph:9283"
                 ,job="ceph"}'
        values: '108447916032 108447916032 108447916032 108447916032 108447916032
                 108447916032'
      - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
        values: '1 1 1 1 1 1'
      - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
        values: '1 1 1 1 1 1'
      - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
        values: '1 1 1 1 1 1'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
                 ceph_version="ceph version 17.0.0-189-g3558fd72
                 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
                 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
                 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
                 public_addr="172.20.0.2"}'
        values: '1 1 1 1 1 1'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
                 ceph_version="ceph version 17.0.0-189-g3558fd72
                 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
                 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
                 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
                 public_addr="172.20.0.2"}'
        values: '1 1 1 1 1 1'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
                 ceph_version="ceph version 17.0.0-189-g3558fd72
                 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
                 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
                 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
                 public_addr="172.20.0.2"}'
        values: '1 1 1 1 1 1'
    promql_expr_test:
      - expr: |
          (
            ((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon)
            ceph_osd_up == 1) * on(ceph_daemon) group_left(hostname)
            ceph_osd_metadata
          ) * 100 > 90

        eval_time: 5m
        exp_samples:
          - labels: '{ceph_daemon="osd.2",hostname="ceph",instance="ceph:9283",
                     job="ceph"}'
            value: 9.3E+01
    alert_rule_test:
      - eval_time: 10m
        alertname: OSDs near full
        exp_alerts:
          - exp_labels:
              ceph_daemon: osd.2
              hostname: ceph
              instance: ceph:9283
              job: ceph
              oid: 1.3.6.1.4.1.50495.15.1.2.4.3
              type: ceph_default
              severity: critical
            exp_annotations:
              description: >
                OSD osd.2 on ceph is dangerously full: 93%

  # flapping OSD
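  # The input below uses promtool's expanding series notation: 'a+bxN' starts
  # at a and adds b for each of the following N samples, so '1+1x100' is a
  # value climbing by 1/s while '1+0x100' stays flat. rate() over the climbing
  # ceph_osd_up series is what models the flapping OSD.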
  - interval: 1s
    input_series:
      - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
        values: '1+1x100'
      - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
        values: '1+0x100'
      - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
        values: '1+0x100'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
                 ceph_version="ceph version 17.0.0-189-g3558fd72
                 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
                 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
                 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
                 public_addr="172.20.0.2"}'
        values: '1 1 1 1 1 1'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
                 ceph_version="ceph version 17.0.0-189-g3558fd72
                 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
                 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
                 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
                 public_addr="172.20.0.2"}'
        values: '1 1 1 1 1 1'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
                 ceph_version="ceph version 17.0.0-189-g3558fd72
                 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
                 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
                 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
                 public_addr="172.20.0.2"}'
        values: '1 1 1 1 1 1'
    promql_expr_test:
      - expr: |
          (
            rate(ceph_osd_up[5m])
            * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
          ) * 60 > 1
        eval_time: 1m
        exp_samples:
          - labels: '{ceph_daemon="osd.0", hostname="ceph", instance="ceph:9283",
                     job="ceph"}'
            value: 1.2200000000000001E+01
    alert_rule_test:
      - eval_time: 5m
        alertname: flapping OSD
        exp_alerts:
          - exp_labels:
              ceph_daemon: osd.0
              hostname: ceph
              instance: ceph:9283
              job: ceph
              oid: 1.3.6.1.4.1.50495.15.1.2.4.4
              severity: warning
              type: ceph_default
            exp_annotations:
              description: >
                OSD osd.0 on ceph was
                marked down and back up at 20.1 times once a
                minute for 5 minutes.

  # high pg count deviation
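  # abs((numpg - avg(numpg)) / avg(numpg)) is each OSD's relative deviation
  # from the mean PG count. At the last sample the counts are 160/320/160/160
  # (mean 200), so osd.1 deviates by 0.6, above the 0.30 threshold.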
  - interval: 1m
    input_series:
      - series: 'ceph_osd_numpg{ceph_daemon="osd.0",instance="ceph:9283",
                 job="ceph"}'
        values: '100 100 100 100 100 160'
      - series: 'ceph_osd_numpg{ceph_daemon="osd.1",instance="ceph:9283",
                 job="ceph"}'
        values: '100 100 100 100 100 320'
      - series: 'ceph_osd_numpg{ceph_daemon="osd.2",instance="ceph:9283",
                 job="ceph"}'
        values: '100 100 100 100 100 160'
      - series: 'ceph_osd_numpg{ceph_daemon="osd.3",instance="ceph:9283",
                 job="ceph"}'
        values: '100 100 100 100 100 160'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
                 ceph_version="ceph version 17.0.0-189-g3558fd72
                 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
                 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
                 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
                 public_addr="172.20.0.2"}'
        values: '1 1 1 1 1 1'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
                 ceph_version="ceph version 17.0.0-189-g3558fd72
                 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
                 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
                 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
                 public_addr="172.20.0.2"}'
        values: '1 1 1 1 1 1'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
                 ceph_version="ceph version 17.0.0-189-g3558fd72
                 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
                 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
                 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
                 public_addr="172.20.0.2"}'
        values: '1 1 1 1 1 1'
      - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.3",
                 ceph_version="ceph version 17.0.0-189-g3558fd72
                 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
                 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
                 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
                 public_addr="172.20.0.2"}'
        values: '1 1 1 1 1 1'
    promql_expr_test:
      - expr: |
          abs(
            (
              (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0)
              by (job)
            ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
          ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30

        eval_time: 5m
        exp_samples:
          - labels: '{ceph_daemon="osd.1", hostname="ceph", instance="ceph:9283",
                     job="ceph"}'
            value: 6E-01
    alert_rule_test:
      - eval_time: 10m
        alertname: high pg count deviation
        exp_alerts:
          - exp_labels:
              ceph_daemon: osd.1
              hostname: ceph
              instance: ceph:9283
              job: ceph
              oid: 1.3.6.1.4.1.50495.15.1.2.4.5
              severity: warning
              type: ceph_default
            exp_annotations:
              description: >
                OSD osd.1 on ceph deviates
                by more than 30% from average PG count.

  # pgs inactive
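  # ceph_pg_total - ceph_pg_active is joined onto ceph_pool_metadata by
  # pool_id and instance. At the 5m mark pool 3 reports 33 total but only 32
  # active PGs, so the difference of 1 produces the expected sample.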
  - interval: 1m
    input_series:
      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
                 name="device_health_metrics",pool_id="1"}'
        values: '1 1 1 1 1 1 1 1'
      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
                 name="device_health_metrics",pool_id="2"}'
        values: '1 1 1 1 1 1 1 1'
      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
                 name="device_health_metrics",pool_id="3"}'
        values: '1 1 1 1 1 1 1 1'
      - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
        values: '1 1 1 1 1 1 1 1'
      - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
        values: '32 32 32 32 32 32 32 32'
      - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
        values: '33 32 32 32 32 33 33 32'
      - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="1"}'
        values: '1 1 1 1 1 1 1 1 1'
      - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="2"}'
        values: '32 32 32 32 32 32 32 32'
      - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="3"}'
        values: '32 32 32 32 32 32 32 32'
    promql_expr_test:
      - expr: ceph_pool_metadata * on(pool_id,instance) group_left()
              (ceph_pg_total - ceph_pg_active) > 0
        eval_time: 5m
        exp_samples:
          - labels: '{instance="ceph:9283", job="ceph",
                     name="device_health_metrics",
                     pool_id="3"}'
            value: 1
    alert_rule_test:
      - eval_time: 5m
        alertname: pgs inactive
        exp_alerts:
          - exp_labels:
              instance: ceph:9283
              job: ceph
              name: device_health_metrics
              oid: 1.3.6.1.4.1.50495.15.1.2.7.1
              pool_id: 3
              severity: critical
              type: ceph_default
            exp_annotations:
              description: >
                1 PGs have been inactive for more than 5 minutes in pool
                device_health_metrics.
                Inactive placement groups aren't able to serve read/write
                requests.

  # pgs unclean
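  # Same join as the pgs inactive test, but against ceph_pg_clean and over a
  # 15 minute window: pool 3 stays at 33 total vs 32 clean PGs throughout.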
  - interval: 1m
    input_series:
      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
                 name="device_health_metrics",pool_id="1"}'
        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
                 name="device_health_metrics",pool_id="2"}'
        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
                 name="device_health_metrics",pool_id="3"}'
        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
      - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
      - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
        values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
                 32 32 32'
      - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
        values: '33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33
                 33 33'
      - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="1"}'
        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
      - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="2"}'
        values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
                 32 32'
      - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="3"}'
        values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
                 32 32'
    promql_expr_test:
      - expr: ceph_pool_metadata * on(pool_id,instance) group_left()
              (ceph_pg_total - ceph_pg_clean) > 0
        eval_time: 15m
        exp_samples:
          - labels: '{instance="ceph:9283", job="ceph",
                     name="device_health_metrics", pool_id="3"}'
            value: 1
    alert_rule_test:
      - eval_time: 16m
        alertname: pgs unclean
        exp_alerts:
          - exp_labels:
              instance: ceph:9283
              job: ceph
              name: device_health_metrics
              oid: 1.3.6.1.4.1.50495.15.1.2.7.2
              pool_id: 3
              severity: warning
              type: ceph_default
            exp_annotations:
              description: >
                1 PGs haven't been clean for more than 15 minutes in pool
                device_health_metrics.
                Unclean PGs haven't been able to completely recover from a
                previous failure.

  # root volume full
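  # node_filesystem_avail_bytes / node_filesystem_size_bytes for "/" drops to
  # about 3.5 GB of 73.4 GB in the later samples, roughly 4.8% free, which is
  # under the 5% threshold.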
  - interval: 1m
    input_series:
      - series: 'node_filesystem_avail_bytes{device="/dev/mapper/fedora_localhost
                 --live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
                 mountpoint="/"}'
        values: '35336400896 35336400896 35336400896 35336400896 35336400896
                 3525385519.104 3533640089'
      - series: 'node_filesystem_size_bytes{device="/dev/mapper/fedora_localhost
                 --live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
                 mountpoint="/"}'
        values: '73445531648 73445531648 73445531648 73445531648 73445531648
                 73445531648 73445531648'
    promql_expr_test:
      - expr: node_filesystem_avail_bytes{mountpoint="/"} /
              node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
        eval_time: 5m
        exp_samples:
          - labels: '{device="/dev/mapper/fedora_localhost --live-home",
                     fstype="ext4", instance="node-exporter", job="node-exporter",
                     mountpoint="/"}'
            value: 4.8E+00
    alert_rule_test:
      - eval_time: 10m
        alertname: root volume full
        exp_alerts:
          - exp_labels:
              device: /dev/mapper/fedora_localhost --live-home
              fstype: ext4
              instance: node-exporter
              job: node-exporter
              mountpoint: /
              oid: 1.3.6.1.4.1.50495.15.1.2.8.1
              severity: critical
              type: ceph_default
            exp_annotations:
              description: >
                Root volume (OSD and MON store) is dangerously full: 4.811% free.

  # network packets dropped
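  # Both drop counters rise by 1/s ('1+1x500'), so the summed increase() over
  # one minute is about 120. The packets_total series are not defined here, so
  # the ratio branch is empty and the "or"-ed absolute branch (>= 10 drops)
  # is what produces the expected sample.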
  - interval: 1s
    input_series:
      - series: 'node_network_receive_drop_total{device="eth0",
                 instance="node-exporter",job="node-exporter"}'
        values: '1+1x500'
      - series: 'node_network_transmit_drop_total{device="eth0",
                 instance="node-exporter",job="node-exporter"}'
        values: '1+1x500'
    promql_expr_test:
      - expr: |
          (
            increase(node_network_receive_drop_total{device!="lo"}[1m]) +
            increase(node_network_transmit_drop_total{device!="lo"}[1m])
          ) / (
            increase(node_network_receive_packets_total{device!="lo"}[1m]) +
            increase(node_network_transmit_packets_total{device!="lo"}[1m])
          ) >= 0.0001 or (
            increase(node_network_receive_drop_total{device!="lo"}[1m]) +
            increase(node_network_transmit_drop_total{device!="lo"}[1m])
          ) >= 10

        eval_time: 5m
        exp_samples:
          - labels: '{device="eth0", instance="node-exporter",
                     job="node-exporter"}'
            value: 1.2E+02
    alert_rule_test:
      - eval_time: 5m
        alertname: network packets dropped
        exp_alerts:
          - exp_labels:
              device: eth0
              instance: node-exporter
              job: node-exporter
              oid: 1.3.6.1.4.1.50495.15.1.2.8.2
              severity: warning
              type: ceph_default
            exp_annotations:
              description: >
                Node node-exporter experiences packet drop > 0.01% or >
                10 packets/s on interface eth0.

  # network packets errors
  - interval: 1s
    input_series:
      - series: 'node_network_receive_errs_total{device="eth0",
                 instance="node-exporter",job="node-exporter"}'
        values: '1+1x500'
      - series: 'node_network_transmit_errs_total{device="eth0",
                 instance="node-exporter",job="node-exporter"}'
        values: '1+1x500'
    promql_expr_test:
      - expr: |
          (
            increase(node_network_receive_errs_total{device!="lo"}[1m]) +
            increase(node_network_transmit_errs_total{device!="lo"}[1m])
          ) / (
            increase(node_network_receive_packets_total{device!="lo"}[1m]) +
            increase(node_network_transmit_packets_total{device!="lo"}[1m])
          ) >= 0.0001 or (
            increase(node_network_receive_errs_total{device!="lo"}[1m]) +
            increase(node_network_transmit_errs_total{device!="lo"}[1m])
          ) >= 10

        eval_time: 5m
        exp_samples:
          - labels: '{device="eth0", instance="node-exporter",
                     job="node-exporter"}'
            value: 1.2E+02
    alert_rule_test:
      - eval_time: 5m
        alertname: network packet errors
        exp_alerts:
          - exp_labels:
              device: eth0
              instance: node-exporter
              job: node-exporter
              oid: 1.3.6.1.4.1.50495.15.1.2.8.3
              severity: warning
              type: ceph_default
            exp_annotations:
              description: >
                Node node-exporter experiences packet errors > 0.01% or > 10
                packets/s on interface eth0.

  # MTU Mismatch
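  # Each up, non-loopback interface's MTU is compared against the median
  # (quantile 0.5) MTU across all non-loopback interfaces: eth4 reports 9000
  # while the median is 1500, so it is the one flagged.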
  - interval: 1m
    input_series:
      - series: 'node_network_mtu_bytes{device="eth0",instance="node-exporter",
                 job="node-exporter"}'
        values: '1500 1500 1500 1500 1500'
      - series: 'node_network_mtu_bytes{device="eth1",instance="node-exporter",
                 job="node-exporter"}'
        values: '1500 1500 1500 1500 1500'
      - series: 'node_network_mtu_bytes{device="eth2",instance="node-exporter",
                 job="node-exporter"}'
        values: '1500 1500 1500 1500 1500'
      - series: 'node_network_mtu_bytes{device="eth3",instance="node-exporter",
                 job="node-exporter"}'
        values: '1500 1500 1500 1500 1500'
      - series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter",
                 job="node-exporter"}'
        values: '9000 9000 9000 9000 9000'
      - series: 'node_network_up{device="eth0",instance="node-exporter",
                 job="node-exporter"}'
        values: '0 0 0 0 0'
      - series: 'node_network_up{device="eth1",instance="node-exporter",
                 job="node-exporter"}'
        values: '0 0 0 0 0'
      - series: 'node_network_up{device="eth2",instance="node-exporter",
                 job="node-exporter"}'
        values: '1 1 1 1 1'
      - series: 'node_network_up{device="eth3",instance="node-exporter",
                 job="node-exporter"}'
        values: '0 0 0 0 0'
      - series: 'node_network_up{device="eth4",instance="node-exporter",
                 job="node-exporter"}'
        values: '1 1 1 1 1'
    promql_expr_test:
      - expr: node_network_mtu_bytes{device!="lo"} * (node_network_up{device!="lo"} > 0) != on() group_left()
              (quantile(0.5, node_network_mtu_bytes{device!="lo"}))
        eval_time: 1m
        exp_samples:
          - labels: '{device="eth4", instance="node-exporter", job="node-exporter"}'
            value: 9000
    alert_rule_test:
      - eval_time: 1m
        alertname: MTU Mismatch
        exp_alerts:
          - exp_labels:
              device: eth4
              instance: node-exporter
              job: node-exporter
              oid: 1.3.6.1.4.1.50495.15.1.2.8.5
              severity: warning
              type: ceph_default
            exp_annotations:
              description: >
                Node node-exporter has a different MTU size (9000)
                than the median value on device eth4.

  # pool full
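  # ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail) is the pool's
  # utilisation, joined with ceph_pool_metadata to pick up the pool name. At
  # the 1m evaluation pool 3 has stored 900 against max_avail 37.5, i.e.
  # 900 / 937.5 = 96%, which is over the 90% threshold.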
  - interval: 1m
    input_series:
      - series: 'ceph_pool_stored{instance="ceph:9283",job="ceph",pool_id="1"}'
        values: '0 0 0 0 0 0 0 0 0'
      - series: 'ceph_pool_stored{instance="ceph:9283",job="ceph",pool_id="2"}'
        values: '1850 1850 1850 1850 1850 1850 1850'
      - series: 'ceph_pool_stored{instance="ceph:9283",job="ceph",pool_id="3"}'
        values: '900 900 23524 23524 23524 23524 23524 23524
                 23524'
      - series: 'ceph_pool_max_avail{instance="ceph:9283",job="ceph",pool_id="1"}'
        values: '106287063040 106287063040 106287063040 106287063040 106287063040
                 106287063040 106287063040'
      - series: 'ceph_pool_max_avail{instance="ceph:9283",job="ceph",pool_id="2"}'
        values: '106287063040 106287063040 106287063040 106287063040 106287063040
                 106287063040 106287063040'
      - series: 'ceph_pool_max_avail{instance="ceph:9283",job="ceph",pool_id="3"}'
        values: '37.5 37.5 37.5 37.5 37.5 37.5 37.5'
      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
                 name="device_health_metrics",pool_id="1"}'
        values: '1 1 1 1 1 1 1 1 1'
      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
                 name=".rgw.root",pool_id="2"}'
        values: '1 1 1 1 1 1 1 1 1'
      - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
                 name="default.rgw.log",pool_id="3"}'
        values: '1 1 1 1 1 1 1 1 1'
    promql_expr_test:
      - expr: |
          ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)
          * on(pool_id) group_right ceph_pool_metadata * 100 > 90

        eval_time: 1m
        exp_samples:
          - labels: '{instance="ceph:9283", job="ceph", name="default.rgw.log",
                     pool_id="3"}'
            value: 9.6E+01
    alert_rule_test:
      - eval_time: 2m
        alertname: pool full
        exp_alerts:
          - exp_labels:
              instance: ceph:9283
              job: ceph
              name: default.rgw.log
              oid: 1.3.6.1.4.1.50495.15.1.2.9.1
              pool_id: 3
              severity: critical
              type: ceph_default
            exp_annotations:
              description: Pool default.rgw.log at 96% capacity.

  # slow OSD ops
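  # ceph_healthcheck_slow_ops is held at a constant 1 ('1+0x120'), so the
  # expression stays true throughout and the alert has fired by the 20m check.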
  - interval: 1m
    input_series:
      - series: 'ceph_healthcheck_slow_ops{instance="ceph:9283",job="ceph"}'
        values: '1+0x120'
    promql_expr_test:
      - expr: ceph_healthcheck_slow_ops > 0
        eval_time: 1m
        exp_samples:
          - labels: '{__name__="ceph_healthcheck_slow_ops", instance="ceph:9283",
                     job="ceph"}'
            value: 1
    alert_rule_test:
      - eval_time: 20m
        alertname: Slow OSD Ops
        exp_alerts:
          - exp_labels:
              instance: ceph:9283
              job: ceph
              severity: warning
              type: ceph_default
            exp_annotations:
              description: >
                1 OSD requests are taking too long to process
                (osd_op_complaint_time exceeded)