]>
Commit | Line | Data |
---|---|---|
f67539c2 TL |
1 | rule_files: |
2 | - ceph_default_alerts.yml | |
3 | evaluation_interval: 5m | |
4 | tests: | |
5 | # health error | |
6 | - interval: 5m | |
7 | input_series: | |
8 | - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}' | |
9 | values: '2 2 2 2 2 2 2' | |
10 | promql_expr_test: | |
11 | - expr: ceph_health_status == 2 | |
12 | eval_time: 5m | |
13 | exp_samples: | |
14 | - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}' | |
15 | value: 2 | |
16 | alert_rule_test: | |
17 | - eval_time: 1m | |
18 | alertname: health error | |
19 | - eval_time: 6m | |
20 | alertname: health error | |
21 | exp_alerts: | |
22 | - exp_labels: | |
23 | instance: ceph:9283 | |
24 | job: ceph | |
25 | oid: 1.3.6.1.4.1.50495.15.1.2.2.1 | |
26 | type: ceph_default | |
27 | severity: critical | |
28 | exp_annotations: | |
29 | description: > | |
30 | Ceph in HEALTH_ERROR state for more than 5 minutes. | |
31 | Please check "ceph health detail" for more information. | |
32 | ||
33 | # health warning | |
34 | - interval: 5m | |
35 | input_series: | |
36 | - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}' | |
37 | values: '1 1 1 1 1 1 1 1 1 1' | |
38 | promql_expr_test: | |
39 | - expr: ceph_health_status == 1 | |
40 | eval_time: 15m | |
41 | exp_samples: | |
42 | - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}' | |
43 | value: 1 | |
44 | alert_rule_test: | |
45 | - eval_time: 10m | |
46 | alertname: health warn | |
47 | - eval_time: 20m | |
48 | alertname: health warn | |
49 | exp_alerts: | |
50 | - exp_labels: | |
51 | instance: ceph:9283 | |
52 | job: ceph | |
53 | oid: 1.3.6.1.4.1.50495.15.1.2.2.2 | |
54 | type: ceph_default | |
55 | severity: warning | |
56 | exp_annotations: | |
57 | description: > | |
58 | Ceph has been in HEALTH_WARN for more than 15 minutes. | |
59 | Please check "ceph health detail" for more information. | |
60 | ||
61 | # low monitor quorum count | |
62 | - interval: 1m | |
63 | input_series: | |
64 | - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a",instance="ceph:9283", | |
65 | job="ceph"}' | |
66 | values: '1 1 1 1 1' | |
67 | - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b",instance="ceph:9283", | |
68 | job="ceph"}' | |
69 | values: '1 1 1 1 1' | |
70 | - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c",instance="ceph:9283", | |
71 | job="ceph"}' | |
72 | values: '0 0 0 0 0' | |
73 | - series: 'ceph_mon_metadata{ceph_daemon="mon.a",ceph_version="ceph version | |
74 | 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific | |
75 | (dev)",hostname="ceph",instance="ceph:9283",job="ceph", | |
76 | public_addr="172.20.0.2",rank="0"}' | |
77 | values: '1 1 1 1 1' | |
78 | - series: 'ceph_mon_metadata{ceph_daemon="mon.b",ceph_version="ceph version | |
79 | 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific | |
80 | (dev)",hostname="ceph",instance="ceph:9283",job="ceph", | |
81 | public_addr="172.20.0.2",rank="1"}' | |
82 | values: '1 1 1 1 1' | |
83 | - series: 'ceph_mon_metadata{ceph_daemon="mon.c",ceph_version="ceph version | |
84 | 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific | |
85 | (dev)",hostname="ceph",instance="ceph:9283",job="ceph", | |
86 | public_addr="172.20.0.2",rank="2"}' | |
87 | values: '1 1 1 1 1' | |
88 | promql_expr_test: | |
89 | - expr: sum(ceph_mon_quorum_status) < 3 | |
90 | eval_time: 1m | |
91 | exp_samples: | |
92 | - labels: '{}' | |
93 | value: 2 | |
94 | alert_rule_test: | |
95 | - eval_time: 1m | |
96 | alertname: low monitor quorum count | |
97 | exp_alerts: | |
98 | - exp_labels: | |
99 | oid: 1.3.6.1.4.1.50495.15.1.2.3.1 | |
100 | type: ceph_default | |
101 | severity: critical | |
102 | exp_annotations: | |
103 | description: | | |
104 | Monitor count in quorum is below three. | |
105 | ||
106 | Only 2 of 3 monitors are active. | |
107 | ||
108 | The following monitors are down: | |
109 | - mon.c on ceph | |
110 | ||
111 | ||
112 | # 10% OSDs down | |
113 | - interval: 1m | |
114 | input_series: | |
115 | - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}' | |
116 | values: '1 1 1 1 1' | |
117 | - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}' | |
118 | values: '0 0 0 0 0' | |
119 | - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}' | |
120 | values: '1 1 1 1 1' | |
121 | - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0", | |
122 | ceph_version="ceph version 17.0.0-189-g3558fd72 | |
123 | (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", | |
124 | cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", | |
125 | hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", | |
126 | public_addr="172.20.0.2"}' | |
127 | values: '1 1 1 1 1' | |
128 | - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1", | |
129 | ceph_version="ceph version 17.0.0-189-g3558fd72 | |
130 | (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", | |
131 | cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", | |
132 | hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", | |
133 | public_addr="172.20.0.2"}' | |
134 | values: '1 1 1 1 1' | |
135 | - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2", | |
136 | ceph_version="ceph version 17.0.0-189-g3558fd72 | |
137 | (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", | |
138 | cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", | |
139 | hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", | |
140 | public_addr="172.20.0.2"}' | |
141 | values: '1 1 1 1 1' | |
142 | promql_expr_test: | |
143 | - expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10 | |
144 | eval_time: 1m | |
145 | exp_samples: | |
146 | - labels: '{}' | |
147 | value: 3.333333333333333E+01 | |
148 | alert_rule_test: | |
149 | - eval_time: 1m | |
150 | alertname: 10% OSDs down | |
151 | exp_alerts: | |
152 | - exp_labels: | |
153 | oid: 1.3.6.1.4.1.50495.15.1.2.4.1 | |
154 | type: ceph_default | |
155 | severity: critical | |
156 | exp_annotations: | |
157 | description: | | |
158 | 33.33% or 1 of 3 OSDs are down (≥ 10%). | |
159 | ||
160 | The following OSDs are down: | |
161 | - osd.1 on ceph | |
162 | ||
163 | # OSD down | |
164 | - interval: 1m | |
165 | input_series: | |
166 | - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}' | |
167 | values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' | |
168 | - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}' | |
169 | values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' | |
170 | - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}' | |
171 | values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' | |
172 | - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0", | |
173 | ceph_version="ceph version 17.0.0-189-g3558fd72 | |
174 | (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", | |
175 | cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", | |
176 | hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", | |
177 | public_addr="172.20.0.2"}' | |
178 | values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' | |
179 | - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1", | |
180 | ceph_version="ceph version 17.0.0-189-g3558fd72 | |
181 | (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", | |
182 | cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", | |
183 | hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", | |
184 | public_addr="172.20.0.2"}' | |
185 | values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' | |
186 | - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2", | |
187 | ceph_version="ceph version 17.0.0-189-g3558fd72 | |
188 | (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", | |
189 | cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", | |
190 | hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", | |
191 | public_addr="172.20.0.2"}' | |
192 | values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' | |
193 | promql_expr_test: | |
194 | - expr: count(ceph_osd_up == 0) > 0 | |
195 | eval_time: 1m | |
196 | exp_samples: | |
197 | - labels: '{}' | |
198 | value: 1 | |
199 | alert_rule_test: | |
200 | - eval_time: 15m | |
201 | alertname: OSD down | |
202 | exp_alerts: | |
203 | - exp_labels: | |
204 | oid: 1.3.6.1.4.1.50495.15.1.2.4.2 | |
205 | type: ceph_default | |
206 | severity: warning | |
207 | exp_annotations: | |
208 | description: | | |
209 | ||
210 | 1 OSD down for more than 15 minutes. | |
211 | ||
212 | 1 of 3 OSDs are down. | |
213 | ||
214 | The following OSD is down: | |
215 | - osd.1 on ceph | |
216 | ||
217 | # OSDs near full | |
218 | - interval: 1m | |
219 | input_series: | |
220 | - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.0",instance="ceph:9283" | |
221 | ,job="ceph"}' | |
222 | values: '1076310016 1076310016 1076310016 1076310016 1076310016 | |
223 | 1076310016' | |
224 | - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.1",instance="ceph:9283" | |
225 | ,job="ceph"}' | |
226 | values: '1076310016 1076310016 1076310016 1076310016 1076310016 | |
227 | 1076310016' | |
228 | - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.2",instance="ceph:9283" | |
229 | ,job="ceph"}' | |
230 | values: '1076310016 1076310016 1076310016 1076310016 1076310016 | |
231 | 100856561909.76' | |
232 | - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.0",instance="ceph:9283" | |
233 | ,job="ceph"}' | |
234 | values: '108447916032 108447916032 108447916032 108447916032 108447916032 | |
235 | 108447916032' | |
236 | - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.1",instance="ceph:9283" | |
237 | ,job="ceph"}' | |
238 | values: '108447916032 108447916032 108447916032 108447916032 108447916032 | |
239 | 108447916032' | |
240 | - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.2",instance="ceph:9283" | |
241 | ,job="ceph"}' | |
242 | values: '108447916032 108447916032 108447916032 108447916032 108447916032 | |
243 | 108447916032' | |
244 | - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}' | |
245 | values: '1 1 1 1 1 1' | |
246 | - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}' | |
247 | values: '1 1 1 1 1 1' | |
248 | - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}' | |
249 | values: '1 1 1 1 1 1' | |
250 | - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0", | |
251 | ceph_version="ceph version 17.0.0-189-g3558fd72 | |
252 | (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", | |
253 | cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", | |
254 | hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", | |
255 | public_addr="172.20.0.2"}' | |
256 | values: '1 1 1 1 1 1' | |
257 | - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1", | |
258 | ceph_version="ceph version 17.0.0-189-g3558fd72 | |
259 | (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", | |
260 | cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", | |
261 | hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", | |
262 | public_addr="172.20.0.2"}' | |
263 | values: '1 1 1 1 1 1' | |
264 | - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2", | |
265 | ceph_version="ceph version 17.0.0-189-g3558fd72 | |
266 | (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", | |
267 | cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", | |
268 | hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", | |
269 | public_addr="172.20.0.2"}' | |
270 | values: '1 1 1 1 1 1' | |
271 | promql_expr_test: | |
272 | - expr: | | |
273 | ( | |
274 | ((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon) | |
275 | ceph_osd_up == 1) * on(ceph_daemon) group_left(hostname) | |
276 | ceph_osd_metadata | |
277 | ) * 100 > 90 | |
278 | ||
279 | eval_time: 5m | |
280 | exp_samples: | |
281 | - labels: '{ceph_daemon="osd.2",hostname="ceph",instance="ceph:9283", | |
282 | job="ceph"}' | |
283 | value: 9.3E+01 | |
284 | alert_rule_test: | |
285 | - eval_time: 10m | |
286 | alertname: OSDs near full | |
287 | exp_alerts: | |
288 | - exp_labels: | |
289 | ceph_daemon: osd.2 | |
290 | hostname: ceph | |
291 | instance: ceph:9283 | |
292 | job: ceph | |
293 | oid: 1.3.6.1.4.1.50495.15.1.2.4.3 | |
294 | type: ceph_default | |
295 | severity: critical | |
296 | exp_annotations: | |
297 | description: > | |
298 | OSD osd.2 on ceph is dangerously full: 93% | |
299 | ||
300 | # flapping OSD | |
301 | - interval: 1s | |
302 | input_series: | |
303 | - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}' | |
304 | values: '1+1x100' | |
305 | - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}' | |
306 | values: '1+0x100' | |
307 | - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}' | |
308 | values: '1+0x100' | |
309 | - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0", | |
310 | ceph_version="ceph version 17.0.0-189-g3558fd72 | |
311 | (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", | |
312 | cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", | |
313 | hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", | |
314 | public_addr="172.20.0.2"}' | |
315 | values: '1 1 1 1 1 1' | |
316 | - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1", | |
317 | ceph_version="ceph version 17.0.0-189-g3558fd72 | |
318 | (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", | |
319 | cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", | |
320 | hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", | |
321 | public_addr="172.20.0.2"}' | |
322 | values: '1 1 1 1 1 1' | |
323 | - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2", | |
324 | ceph_version="ceph version 17.0.0-189-g3558fd72 | |
325 | (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", | |
326 | cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", | |
327 | hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", | |
328 | public_addr="172.20.0.2"}' | |
329 | values: '1 1 1 1 1 1' | |
330 | promql_expr_test: | |
331 | - expr: | | |
332 | ( | |
333 | rate(ceph_osd_up[5m]) | |
334 | * on(ceph_daemon) group_left(hostname) ceph_osd_metadata | |
335 | ) * 60 > 1 | |
336 | eval_time: 1m | |
337 | exp_samples: | |
338 | - labels: '{ceph_daemon="osd.0", hostname="ceph", instance="ceph:9283", | |
339 | job="ceph"}' | |
340 | value: 1.2200000000000001E+01 | |
341 | alert_rule_test: | |
342 | - eval_time: 5m | |
343 | alertname: flapping OSD | |
344 | exp_alerts: | |
345 | - exp_labels: | |
346 | ceph_daemon: osd.0 | |
347 | hostname: ceph | |
348 | instance: ceph:9283 | |
349 | job: ceph | |
350 | oid: 1.3.6.1.4.1.50495.15.1.2.4.4 | |
351 | severity: warning | |
352 | type: ceph_default | |
353 | exp_annotations: | |
354 | description: > | |
355 | OSD osd.0 on ceph was | |
356 | marked down and back up at 20.1 times once a | |
357 | minute for 5 minutes. | |
358 | ||
359 | # high pg count deviation | |
360 | - interval: 1m | |
361 | input_series: | |
362 | - series: 'ceph_osd_numpg{ceph_daemon="osd.0",instance="ceph:9283", | |
363 | job="ceph"}' | |
364 | values: '100 100 100 100 100 160' | |
365 | - series: 'ceph_osd_numpg{ceph_daemon="osd.1",instance="ceph:9283", | |
366 | job="ceph"}' | |
367 | values: '100 100 100 100 100 320' | |
368 | - series: 'ceph_osd_numpg{ceph_daemon="osd.2",instance="ceph:9283", | |
369 | job="ceph"}' | |
370 | values: '100 100 100 100 100 160' | |
371 | - series: 'ceph_osd_numpg{ceph_daemon="osd.3",instance="ceph:9283", | |
372 | job="ceph"}' | |
373 | values: '100 100 100 100 100 160' | |
374 | - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0", | |
375 | ceph_version="ceph version 17.0.0-189-g3558fd72 | |
376 | (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", | |
377 | cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", | |
378 | hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", | |
379 | public_addr="172.20.0.2"}' | |
380 | values: '1 1 1 1 1 1' | |
381 | - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1", | |
382 | ceph_version="ceph version 17.0.0-189-g3558fd72 | |
383 | (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", | |
384 | cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", | |
385 | hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", | |
386 | public_addr="172.20.0.2"}' | |
387 | values: '1 1 1 1 1 1' | |
388 | - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2", | |
389 | ceph_version="ceph version 17.0.0-189-g3558fd72 | |
390 | (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", | |
391 | cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", | |
392 | hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", | |
393 | public_addr="172.20.0.2"}' | |
394 | values: '1 1 1 1 1 1' | |
395 | - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.3", | |
396 | ceph_version="ceph version 17.0.0-189-g3558fd72 | |
397 | (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", | |
398 | cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", | |
399 | hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", | |
400 | public_addr="172.20.0.2"}' | |
401 | values: '1 1 1 1 1 1' | |
402 | promql_expr_test: | |
403 | - expr: | | |
404 | abs( | |
405 | ( | |
406 | (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) | |
407 | by (job) | |
408 | ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job) | |
409 | ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30 | |
410 | ||
411 | eval_time: 5m | |
412 | exp_samples: | |
413 | - labels: '{ceph_daemon="osd.1", hostname="ceph", instance="ceph:9283", | |
414 | job="ceph"}' | |
415 | value: 6E-01 | |
416 | alert_rule_test: | |
417 | - eval_time: 10m | |
418 | alertname: high pg count deviation | |
419 | exp_alerts: | |
420 | - exp_labels: | |
421 | ceph_daemon: osd.1 | |
422 | hostname: ceph | |
423 | instance: ceph:9283 | |
424 | job: ceph | |
425 | oid: 1.3.6.1.4.1.50495.15.1.2.4.5 | |
426 | severity: warning | |
427 | type: ceph_default | |
428 | exp_annotations: | |
429 | description: > | |
430 | OSD osd.1 on ceph deviates | |
431 | by more than 30% from average PG count. | |
432 | ||
433 | # pgs inactive | |
434 | - interval: 1m | |
435 | input_series: | |
436 | - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", | |
437 | name="device_health_metrics",pool_id="1"}' | |
438 | values: '1 1 1 1 1 1 1 1' | |
439 | - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", | |
440 | name="device_health_metrics",pool_id="2"}' | |
441 | values: '1 1 1 1 1 1 1 1' | |
442 | - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", | |
443 | name="device_health_metrics",pool_id="3"}' | |
444 | values: '1 1 1 1 1 1 1 1' | |
445 | - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}' | |
446 | values: '1 1 1 1 1 1 1 1' | |
447 | - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}' | |
448 | values: '32 32 32 32 32 32 32 32' | |
449 | - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}' | |
450 | values: '33 32 32 32 32 33 33 32' | |
451 | - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="1"}' | |
452 | values: '1 1 1 1 1 1 1 1 1' | |
453 | - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="2"}' | |
454 | values: '32 32 32 32 32 32 32 32' | |
455 | - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="3"}' | |
456 | values: '32 32 32 32 32 32 32 32' | |
457 | promql_expr_test: | |
458 | - expr: ceph_pool_metadata * on(pool_id,instance) group_left() | |
459 | (ceph_pg_total - ceph_pg_active) > 0 | |
460 | eval_time: 5m | |
461 | exp_samples: | |
462 | - labels: '{instance="ceph:9283", job="ceph", | |
463 | name="device_health_metrics", | |
464 | pool_id="3"}' | |
465 | value: 1 | |
466 | alert_rule_test: | |
467 | - eval_time: 5m | |
468 | alertname: pgs inactive | |
469 | exp_alerts: | |
470 | - exp_labels: | |
471 | instance: ceph:9283 | |
472 | job: ceph | |
473 | name: device_health_metrics | |
474 | oid: 1.3.6.1.4.1.50495.15.1.2.7.1 | |
475 | pool_id: 3 | |
476 | severity: critical | |
477 | type: ceph_default | |
478 | exp_annotations: | |
479 | description: > | |
480 | 1 PGs have been inactive for more than 5 minutes in pool | |
481 | device_health_metrics. | |
482 | Inactive placement groups aren't able to serve read/write | |
483 | requests. | |
484 | ||
485 | # pgs unclean | |
486 | - interval: 1m | |
487 | input_series: | |
488 | - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", | |
489 | name="device_health_metrics",pool_id="1"}' | |
490 | values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' | |
491 | - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", | |
492 | name="device_health_metrics",pool_id="2"}' | |
493 | values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' | |
494 | - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", | |
495 | name="device_health_metrics",pool_id="3"}' | |
496 | values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' | |
497 | - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}' | |
498 | values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' | |
499 | - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}' | |
500 | values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 | |
501 | 32 32 32' | |
502 | - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}' | |
503 | values: '33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 | |
504 | 33 33' | |
505 | - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="1"}' | |
506 | values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' | |
507 | - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="2"}' | |
508 | values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 | |
509 | 32 32' | |
510 | - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="3"}' | |
511 | values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 | |
512 | 32 32' | |
513 | promql_expr_test: | |
514 | - expr: ceph_pool_metadata * on(pool_id,instance) group_left() | |
515 | (ceph_pg_total - ceph_pg_clean) > 0 | |
516 | eval_time: 15m | |
517 | exp_samples: | |
518 | - labels: '{instance="ceph:9283", job="ceph", | |
519 | name="device_health_metrics", pool_id="3"}' | |
520 | value: 1 | |
521 | alert_rule_test: | |
522 | - eval_time: 16m | |
523 | alertname: pgs unclean | |
524 | exp_alerts: | |
525 | - exp_labels: | |
526 | instance: ceph:9283 | |
527 | job: ceph | |
528 | name: device_health_metrics | |
529 | oid: 1.3.6.1.4.1.50495.15.1.2.7.2 | |
530 | pool_id: 3 | |
531 | severity: warning | |
532 | type: ceph_default | |
533 | exp_annotations: | |
534 | description: > | |
535 | 1 PGs haven't been clean for more than 15 minutes in pool | |
536 | device_health_metrics. | |
537 | Unclean PGs haven't been able to completely recover from a | |
538 | previous failure. | |
539 | ||
540 | # root volume full | |
541 | - interval: 1m | |
542 | input_series: | |
543 | - series: 'node_filesystem_avail_bytes{device="/dev/mapper/fedora_localhost | |
544 | --live-home",fstype="ext4",instance="node-exporter",job="node-exporter", | |
545 | mountpoint="/"}' | |
546 | values: '35336400896 35336400896 35336400896 35336400896 35336400896 | |
547 | 3525385519.104 3533640089' | |
548 | - series: 'node_filesystem_size_bytes{device="/dev/mapper/fedora_localhost | |
549 | --live-home",fstype="ext4",instance="node-exporter",job="node-exporter", | |
550 | mountpoint="/"}' | |
551 | values: '73445531648 73445531648 73445531648 73445531648 73445531648 | |
552 | 73445531648 73445531648' | |
553 | promql_expr_test: | |
554 | - expr: node_filesystem_avail_bytes{mountpoint="/"} / | |
555 | node_filesystem_size_bytes{mountpoint="/"} * 100 < 5 | |
556 | eval_time: 5m | |
557 | exp_samples: | |
558 | - labels: '{device="/dev/mapper/fedora_localhost --live-home", | |
559 | fstype="ext4", instance="node-exporter", job="node-exporter", | |
560 | mountpoint="/"}' | |
561 | value: 4.8E+00 | |
562 | alert_rule_test: | |
563 | - eval_time: 10m | |
564 | alertname: root volume full | |
565 | exp_alerts: | |
566 | - exp_labels: | |
567 | device: /dev/mapper/fedora_localhost --live-home | |
568 | fstype: ext4 | |
569 | instance: node-exporter | |
570 | job: node-exporter | |
571 | mountpoint: / | |
572 | oid: 1.3.6.1.4.1.50495.15.1.2.8.1 | |
573 | severity: critical | |
574 | type: ceph_default | |
575 | exp_annotations: | |
576 | description: > | |
577 | Root volume (OSD and MON store) is dangerously full: 4.811% free. | |
578 | ||
579 | # network packets dropped | |
580 | - interval: 1s | |
581 | input_series: | |
582 | - series: 'node_network_receive_drop_total{device="eth0", | |
583 | instance="node-exporter",job="node-exporter"}' | |
584 | values: '1+1x500' | |
585 | - series: 'node_network_transmit_drop_total{device="eth0", | |
586 | instance="node-exporter",job="node-exporter"}' | |
587 | values: '1+1x500' | |
588 | promql_expr_test: | |
589 | - expr: | | |
590 | ( | |
591 | increase(node_network_receive_drop_total{device!="lo"}[1m]) + | |
592 | increase(node_network_transmit_drop_total{device!="lo"}[1m]) | |
593 | ) / ( | |
594 | increase(node_network_receive_packets_total{device!="lo"}[1m]) + | |
595 | increase(node_network_transmit_packets_total{device!="lo"}[1m]) | |
596 | ) >= 0.0001 or ( | |
597 | increase(node_network_receive_drop_total{device!="lo"}[1m]) + | |
598 | increase(node_network_transmit_drop_total{device!="lo"}[1m]) | |
599 | ) >= 10 | |
600 | ||
601 | eval_time: 5m | |
602 | exp_samples: | |
603 | - labels: '{device="eth0", instance="node-exporter", | |
604 | job="node-exporter"}' | |
605 | value: 1.2E+02 | |
606 | alert_rule_test: | |
607 | - eval_time: 5m | |
608 | alertname: network packets dropped | |
609 | exp_alerts: | |
610 | - exp_labels: | |
611 | device: eth0 | |
612 | instance: node-exporter | |
613 | job: node-exporter | |
614 | oid: 1.3.6.1.4.1.50495.15.1.2.8.2 | |
615 | severity: warning | |
616 | type: ceph_default | |
617 | exp_annotations: | |
618 | description: > | |
619 | Node node-exporter experiences packet drop > 0.01% or > | |
620 | 10 packets/s on interface eth0. | |
621 | ||
622 | # network packet errors | |
623 | - interval: 1s | |
624 | input_series: | |
625 | - series: 'node_network_receive_errs_total{device="eth0", | |
626 | instance="node-exporter",job="node-exporter"}' | |
627 | values: '1+1x500' | |
628 | - series: 'node_network_transmit_errs_total{device="eth0", | |
629 | instance="node-exporter",job="node-exporter"}' | |
630 | values: '1+1x500' | |
631 | promql_expr_test: | |
632 | - expr: | | |
633 | ( | |
634 | increase(node_network_receive_errs_total{device!="lo"}[1m]) + | |
635 | increase(node_network_transmit_errs_total{device!="lo"}[1m]) | |
636 | ) / ( | |
637 | increase(node_network_receive_packets_total{device!="lo"}[1m]) + | |
638 | increase(node_network_transmit_packets_total{device!="lo"}[1m]) | |
639 | ) >= 0.0001 or ( | |
640 | increase(node_network_receive_errs_total{device!="lo"}[1m]) + | |
641 | increase(node_network_transmit_errs_total{device!="lo"}[1m]) | |
642 | ) >= 10 | |
643 | ||
644 | eval_time: 5m | |
645 | exp_samples: | |
646 | - labels: '{device="eth0", instance="node-exporter", | |
647 | job="node-exporter"}' | |
648 | value: 1.2E+02 | |
649 | alert_rule_test: | |
650 | - eval_time: 5m | |
651 | alertname: network packet errors | |
652 | exp_alerts: | |
653 | - exp_labels: | |
654 | device: eth0 | |
655 | instance: node-exporter | |
656 | job: node-exporter | |
657 | oid: 1.3.6.1.4.1.50495.15.1.2.8.3 | |
658 | severity: warning | |
659 | type: ceph_default | |
660 | exp_annotations: | |
661 | description: > | |
662 | Node node-exporter experiences packet errors > 0.01% or > 10 | |
663 | packets/s on interface eth0. | |
664 | ||
665 | # MTU Mismatch | |
666 | - interval: 1m | |
667 | input_series: | |
668 | - series: 'node_network_mtu_bytes{device="eth0",instance="node-exporter", | |
669 | job="node-exporter"}' | |
670 | values: '1500 1500 1500 1500 1500' | |
671 | - series: 'node_network_mtu_bytes{device="eth1",instance="node-exporter", | |
672 | job="node-exporter"}' | |
673 | values: '1500 1500 1500 1500 1500' | |
674 | - series: 'node_network_mtu_bytes{device="eth2",instance="node-exporter", | |
675 | job="node-exporter"}' | |
676 | values: '1500 1500 1500 1500 1500' | |
677 | - series: 'node_network_mtu_bytes{device="eth3",instance="node-exporter", | |
678 | job="node-exporter"}' | |
679 | values: '1500 1500 1500 1500 1500' | |
680 | - series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter", | |
681 | job="node-exporter"}' | |
682 | values: '9000 9000 9000 9000 9000' | |
a4b75251 TL |
683 | - series: 'node_network_up{device="eth0",instance="node-exporter", |
684 | job="node-exporter"}' | |
685 | values: '0 0 0 0 0' | |
686 | - series: 'node_network_up{device="eth1",instance="node-exporter", | |
687 | job="node-exporter"}' | |
688 | values: '0 0 0 0 0' | |
689 | - series: 'node_network_up{device="eth2",instance="node-exporter", | |
690 | job="node-exporter"}' | |
691 | values: '1 1 1 1 1' | |
692 | - series: 'node_network_up{device="eth3",instance="node-exporter", | |
693 | job="node-exporter"}' | |
694 | values: '0 0 0 0 0' | |
695 | - series: 'node_network_up{device="eth4",instance="node-exporter", | |
696 | job="node-exporter"}' | |
697 | values: '1 1 1 1 1' | |
f67539c2 | 698 | promql_expr_test: |
a4b75251 | 699 | - expr: node_network_mtu_bytes{device!="lo"} * (node_network_up{device!="lo"} > 0) != on() group_left() |
f67539c2 TL |
700 | (quantile(0.5, node_network_mtu_bytes{device!="lo"})) |
701 | eval_time: 1m | |
702 | exp_samples: | |
a4b75251 | 703 | - labels: '{device="eth4", instance="node-exporter", job="node-exporter"}' |
f67539c2 TL |
704 | value: 9000 |
705 | alert_rule_test: | |
706 | - eval_time: 1m | |
707 | alertname: MTU Mismatch | |
708 | exp_alerts: | |
709 | - exp_labels: | |
710 | device: eth4 | |
711 | instance: node-exporter | |
712 | job: node-exporter | |
713 | oid: 1.3.6.1.4.1.50495.15.1.2.8.5 | |
714 | severity: warning | |
715 | type: ceph_default | |
716 | exp_annotations: | |
717 | description: > | |
718 | Node node-exporter has a different MTU size (9000) | |
719 | than the median value on device eth4. | |
720 | ||
721 | # pool full | |
722 | - interval: 1m | |
723 | input_series: | |
724 | - series: 'ceph_pool_stored{instance="ceph:9283",job="ceph",pool_id="1"}' | |
725 | values: '0 0 0 0 0 0 0 0 0' | |
726 | - series: 'ceph_pool_stored{instance="ceph:9283",job="ceph",pool_id="2"}' | |
727 | values: '1850 1850 1850 1850 1850 1850 1850' | |
728 | - series: 'ceph_pool_stored{instance="ceph:9283",job="ceph",pool_id="3"}' | |
729 | values: '900 900 23524 23524 23524 23524 23524 23524 | |
730 | 23524' | |
731 | - series: 'ceph_pool_max_avail{instance="ceph:9283",job="ceph",pool_id="1"}' | |
732 | values: '106287063040 106287063040 106287063040 106287063040 106287063040 | |
733 | 106287063040 106287063040' | |
734 | - series: 'ceph_pool_max_avail{instance="ceph:9283",job="ceph",pool_id="2"}' | |
735 | values: '106287063040 106287063040 106287063040 106287063040 106287063040 | |
736 | 106287063040 106287063040' | |
737 | - series: 'ceph_pool_max_avail{instance="ceph:9283",job="ceph",pool_id="3"}' | |
738 | values: '37.5 37.5 37.5 37.5 37.5 37.5 37.5' | |
739 | - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", | |
740 | name="device_health_metrics",pool_id="1"}' | |
741 | values: '1 1 1 1 1 1 1 1 1' | |
742 | - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", | |
743 | name=".rgw.root",pool_id="2"}' | |
744 | values: '1 1 1 1 1 1 1 1 1' | |
745 | - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph", | |
746 | name="default.rgw.log",pool_id="3"}' | |
747 | values: '1 1 1 1 1 1 1 1 1' | |
748 | promql_expr_test: | |
749 | - expr: | | |
750 | ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail) | |
751 | * on(pool_id) group_right ceph_pool_metadata * 100 > 90 | |
752 | ||
753 | eval_time: 1m | |
754 | exp_samples: | |
755 | - labels: '{instance="ceph:9283", job="ceph", name="default.rgw.log", | |
756 | pool_id="3"}' | |
757 | value: 9.6E+01 | |
758 | alert_rule_test: | |
759 | - eval_time: 2m | |
760 | alertname: pool full | |
761 | exp_alerts: | |
762 | - exp_labels: | |
763 | instance: ceph:9283 | |
764 | job: ceph | |
765 | name: default.rgw.log | |
766 | oid: 1.3.6.1.4.1.50495.15.1.2.9.1 | |
767 | pool_id: 3 | |
768 | severity: critical | |
769 | type: ceph_default | |
770 | exp_annotations: | |
771 | description: Pool default.rgw.log at 96% capacity. | |
772 | ||
773 | # slow OSD ops | |
774 | - interval: 1m | |
775 | input_series: | |
776 | - series: 'ceph_healthcheck_slow_ops{instance="ceph:9283",job="ceph"}' | |
777 | values: '1+0x120' | |
778 | promql_expr_test: | |
779 | - expr: ceph_healthcheck_slow_ops > 0 | |
780 | eval_time: 1m | |
781 | exp_samples: | |
782 | - labels: '{__name__="ceph_healthcheck_slow_ops", instance="ceph:9283", | |
783 | job="ceph"}' | |
784 | value: 1 | |
785 | alert_rule_test: | |
786 | - eval_time: 20m | |
787 | alertname: Slow OSD Ops | |
788 | exp_alerts: | |
789 | - exp_labels: | |
790 | instance: ceph:9283 | |
791 | job: ceph | |
792 | severity: warning | |
793 | type: ceph_default | |
794 | exp_annotations: | |
795 | description: > | |
796 | 1 OSD requests are taking too long to process | |
797 | (osd_op_complaint_time exceeded) |