# ceph/monitoring/ceph-mixin/prometheus_alerts.yml (quincy 17.2.0)
groups:
  - name: cluster health
    rules:
      - alert: CephHealthError
        expr: ceph_health_status == 2
        for: 5m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.2.1
        annotations:
          summary: Cluster is in an ERROR state
          description: >
            Ceph in HEALTH_ERROR state for more than 5 minutes.
            Please check "ceph health detail" for more information.

      - alert: CephHealthWarning
        expr: ceph_health_status == 1
        for: 15m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          summary: Cluster is in a WARNING state
          description: >
            Ceph has been in HEALTH_WARN for more than 15 minutes.
            Please check "ceph health detail" for more information.

  - name: mon
    rules:
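      # Critical: MON_DOWN is active and the number of mons still in quorum
      # equals the bare majority (floor(n/2) + 1), so losing one more mon
      # would break quorum.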
      - alert: CephMonDownQuorumAtRisk
        expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() (count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1))) == 1
        for: 30s
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.3.1
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
          summary: Monitor quorum is at risk
          description: |
            {{ $min := query "floor(count(ceph_mon_metadata) / 2) +1" | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active.
            Without quorum the cluster will become inoperable, affecting all connected clients and services.

            The following monitors are down:
            {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }}
              - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
            {{- end }}
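      # Warning: count(ceph_mon_quorum_status == 0) returns no result while all
      # mons are in quorum, so this only fires once at least one mon is down.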
      - alert: CephMonDown
        expr: (count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1))
        for: 30s
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
          summary: One or more ceph monitors are down
          description: |
            {{ $down := query "count(ceph_mon_quorum_status == 0)" | first | value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down.
            Quorum is still intact, but the loss of further monitors will make your cluster inoperable.

            The following monitors are down:
            {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }}
              - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
            {{- end }}
      - alert: CephMonDiskspaceCritical
        expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.3.2
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit
          summary: Disk space on at least one monitor is critically low
          description: |
            The free space available to a monitor's store is critically low (<5% by default).
            You should increase the space available to the monitor(s). The
            default location for the store sits under /var/lib/ceph. Your monitor hosts are:
            {{- range query "ceph_mon_metadata"}}
              - {{ .Labels.hostname }}
            {{- end }}

      - alert: CephMonDiskspaceLow
        expr: ceph_health_detail{name="MON_DISK_LOW"} == 1
        for: 5m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low
          summary: Disk space on at least one monitor is approaching full
          description: |
            The space available to a monitor's store is approaching full (>70% is the default).
            You should increase the space available to the monitor store. The
            default location for the store sits under /var/lib/ceph. Your monitor hosts are:
            {{- range query "ceph_mon_metadata"}}
              - {{ .Labels.hostname }}
            {{- end }}

      - alert: CephMonClockSkew
        expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1
        for: 1m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew
          summary: Clock skew across the Monitor hosts detected
          description: |
            The ceph monitors rely on a consistent time reference to maintain
            quorum and cluster consistency. This event indicates that at least
            one of your mons is not synced correctly.

            Review the cluster status with 'ceph -s'. This will show which monitors
            are affected. Check the time sync status on each monitor host.

  - name: osd
    rules:
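      # No 'for' clause here: this fires as soon as 10% or more of the known
      # OSDs are reported down.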
      - alert: CephOSDDownHigh
        expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.4.1
        annotations:
          summary: More than 10% of OSDs are down
          description: |
            {{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%).

            The following OSDs are down:
            {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
              - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
            {{- end }}
      - alert: CephOSDHostDown
        expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1
        for: 5m
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.4.8
        annotations:
          summary: An OSD host is offline
          description: |
            The following OSDs are down:
            {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
              - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }}
            {{- end }}
      - alert: CephOSDDown
        expr: ceph_health_detail{name="OSD_DOWN"} == 1
        for: 5m
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.4.2
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down
          summary: An OSD has been marked down/unavailable
          description: |
            {{ $num := query "count(ceph_osd_up == 0)" | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5 minutes.

            The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down:
            {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}}
              - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
            {{- end }}

      - alert: CephOSDNearFull
        expr: ceph_health_detail{name="OSD_NEARFULL"} == 1
        for: 5m
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.4.3
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull
          summary: OSD(s) running low on free space (NEARFULL)
          description: |
            One or more OSDs have reached their NEARFULL threshold.

            Use 'ceph health detail' to identify which OSDs have reached this threshold.
            To resolve, either add capacity to the cluster, or delete unwanted data.
      - alert: CephOSDFull
        expr: ceph_health_detail{name="OSD_FULL"} > 0
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.4.6
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full
          summary: OSD(s) is full, writes blocked
          description: |
            An OSD has reached its full threshold. Writes from all pools that share the
            affected OSD will be blocked.

            To resolve, either add capacity to the cluster, or delete unwanted data.
      - alert: CephOSDBackfillFull
        expr: ceph_health_detail{name="OSD_BACKFILLFULL"} > 0
        for: 1m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull
          summary: OSD(s) too full for backfill operations
          description: |
            An OSD has reached its BACKFILL FULL threshold. This will prevent rebalance operations
            from completing for some pools. Check the current capacity utilisation with 'ceph df'.

            To resolve, either add capacity to the cluster, or delete unwanted data.
      - alert: CephOSDTooManyRepairs
        expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 1
        for: 30s
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs
          summary: OSD has hit a high number of read errors
          description: |
            Reads from an OSD have used a secondary PG to return data to the client, indicating
            a potential failing disk.
      - alert: CephOSDTimeoutsPublicNetwork
        expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 1
        for: 1m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          summary: Network issues delaying OSD heartbeats (public network)
          description: |
            OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network
            for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs.
      - alert: CephOSDTimeoutsClusterNetwork
        expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 1
        for: 1m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          summary: Network issues delaying OSD heartbeats (cluster network)
          description: |
            OSD heartbeats on the cluster's 'cluster' network (backend) are running slow. Investigate the network
            for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs.
      - alert: CephOSDInternalDiskSizeMismatch
        expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 1
        for: 1m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch
          summary: OSD size inconsistency error
          description: |
            One or more OSDs have an internal inconsistency between the size of the physical device and its metadata.
            This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs.
      - alert: CephDeviceFailurePredicted
        expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1
        for: 1m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2
          summary: Device(s) have been predicted to fail soon
          description: |
            The device health module has determined that one or more devices will fail
            soon. To review the device states use 'ceph device ls'. To show a specific
            device use 'ceph device info <dev id>'.

            Mark the OSD as out (so data may migrate to other OSDs in the cluster). Once
            the OSD is empty, remove and replace it.
      - alert: CephDeviceFailurePredictionTooHigh
        expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.4.7
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany
          summary: Too many devices have been predicted to fail, unable to resolve
          description: |
            The device health module has determined that the number of devices predicted to
            fail cannot be remediated automatically, since it would take too many OSDs out of
            the cluster, impacting performance and potentially availability. You should add new
            OSDs to the cluster to allow data to be relocated and avoid data integrity issues.
      - alert: CephDeviceFailureRelocationIncomplete
        expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1
        for: 1m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use
          summary: A device failure is predicted, but unable to relocate data
          description: |
            The device health module has determined that one or more devices will fail
            soon, but the normal process of relocating the data on the device to other
            OSDs in the cluster is blocked.

            Check that the cluster has available free space. It may be necessary to add
            more disks to the cluster to allow the data from the failing device to
            successfully migrate.

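      # rate(ceph_osd_up[5m]) * 60 approximates the number of up/down state
      # changes per minute over the last 5 minutes; more than one change per
      # minute suggests flapping. The metadata join only adds the hostname label.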
      - alert: CephOSDFlapping
        expr: |
          (
            rate(ceph_osd_up[5m])
            * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
          ) * 60 > 1
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.4.4
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds
          summary: Network issues are causing OSDs to flap (mark each other out)
          description: >
            OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was
            marked down and back up {{ $value | humanize }} times once a
            minute for 5 minutes. This could indicate a network issue (latency,
            packet drop, disruption) on the cluster's "cluster network". Check the
            network environment on the listed host(s).

      - alert: CephOSDReadErrors
        expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1
        for: 30s
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors
          summary: Device read errors detected
          description: >
            An OSD has encountered read errors, but the OSD has recovered by retrying
            the reads. This may indicate an issue with the hardware or kernel.
      # alert on high deviation from average PG count
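      # Each OSD's PG count is compared against the job-wide average; the join
      # to ceph_osd_metadata adds the hostname label. Fires above 30% deviation.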
      - alert: CephPGImbalance
        expr: |
          abs(
            (
              (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)
            ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
          ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
        for: 5m
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.4.5
        annotations:
          summary: PG allocations are not balanced across devices
          description: >
            OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates
            by more than 30% from average PG count.
      # alert on high commit latency...but how high is too high

  - name: mds
    rules:
      - alert: CephFilesystemDamaged
        expr: ceph_health_detail{name="MDS_DAMAGE"} > 0
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.5.1
        annotations:
          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
          summary: Ceph filesystem is damaged.
          description: >
            The filesystem's metadata has been corrupted. Data access
            may be blocked.

            Either analyse the output from the mds daemon admin socket, or
            escalate to support.
      - alert: CephFilesystemOffline
        expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.5.3
        annotations:
          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down
          summary: Ceph filesystem is offline
          description: >
            All MDS ranks are unavailable. The ceph daemons providing the metadata
            for the Ceph filesystem are all down, rendering the filesystem offline.
      - alert: CephFilesystemDegraded
        expr: ceph_health_detail{name="FS_DEGRADED"} > 0
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.5.4
        annotations:
          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded
          summary: Ceph filesystem is degraded
          description: >
            One or more metadata daemons (MDS ranks) are failed or in a
            damaged state. At best the filesystem is partially available,
            at worst the filesystem is completely unusable.
      - alert: CephFilesystemMDSRanksLow
        expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0
        for: 1m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max
          summary: Ceph MDS daemon count is lower than configured
          description: >
            The filesystem's "max_mds" setting defines the number of MDS ranks in
            the filesystem. The current number of active MDS daemons is less than
            this setting.
      - alert: CephFilesystemInsufficientStandby
        expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0
        for: 1m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby
          summary: Ceph filesystem standby daemons too low
          description: >
            The number of standby MDS daemons available is less than the number
            requested by standby_count_wanted. Adjust the standby count
            or increase the number of MDS daemons within the filesystem.
      - alert: CephFilesystemFailureNoStandby
        expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.5.5
        annotations:
          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds
          summary: Ceph MDS daemon failed, no further standby available
          description: >
            An MDS daemon has failed, leaving only one active rank without
            further standby. Investigate the cause of the failure or add a
            standby daemon.
      - alert: CephFilesystemReadOnly
        expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.5.2
        annotations:
          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
          summary: Ceph filesystem in read only mode, due to write error(s)
          description: >
            The filesystem has switched to READ ONLY due to an unexpected
            write error when writing to the metadata pool.

            Either analyse the output from the mds daemon admin socket, or
            escalate to support.

  - name: mgr
    rules:
      - alert: CephMgrModuleCrash
        expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1
        for: 5m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.6.1
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash
          summary: A mgr module has recently crashed
          description: >
            One or more mgr modules have crashed and are yet to be acknowledged by the administrator. A
            crashed module may impact functionality within the cluster. Use the 'ceph crash' commands to
            investigate which module has failed, and archive it to acknowledge the failure.
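      # up{job="ceph"} == 0 means the scrape target exists but is failing; a
      # target that is missing entirely is handled by PrometheusJobMissing below.
      # This assumes the Prometheus scrape job for Ceph is named "ceph".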
      - alert: CephMgrPrometheusModuleInactive
        expr: up{job="ceph"} == 0
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.6.2
        annotations:
          summary: Ceph's mgr/prometheus module is not available
          description: >
            The mgr/prometheus module at {{ $labels.instance }} is unreachable. This
            could mean that the module has been disabled or the mgr itself is down.

            Without the mgr/prometheus module, metrics and alerts will no longer
            function. Open a shell to ceph and use 'ceph -s' to determine whether the
            mgr is active. If the mgr is not active, restart it, otherwise you can check
            that the mgr/prometheus module is loaded with 'ceph mgr module ls' and if it's
            not listed as enabled, enable it with 'ceph mgr module enable prometheus'.

  - name: pgs
    rules:
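      # Joins pool metadata (for the pool name) onto the per-pool PG counts and
      # fires when a pool has PGs that are not active.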
      - alert: CephPGsInactive
        expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0
        for: 5m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.7.1
        annotations:
          summary: One or more Placement Groups are inactive
          description: >
            {{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}.
            Inactive placement groups aren't able to serve read/write
            requests.
      - alert: CephPGsUnclean
        expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0
        for: 15m
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.7.2
        annotations:
          summary: One or more placement groups are marked unclean
          description: >
            {{ $value }} PGs haven't been clean for more than 15 minutes in pool {{ $labels.name }}.
            Unclean PGs haven't been able to completely recover from a previous failure.
      - alert: CephPGsDamaged
        expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1
        for: 5m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.7.4
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged
          summary: Placement group damaged, manual intervention needed
          description: >
            During data consistency checks (scrub), at least one PG has been flagged as being
            damaged or inconsistent.

            Check to see which PG is affected, and attempt a manual repair if necessary. To list
            problematic placement groups, use 'rados list-inconsistent-pg <pool>'. To repair PGs use
            the 'ceph pg repair <pg_num>' command.
      - alert: CephPGRecoveryAtRisk
        expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 1
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.7.5
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full
          summary: OSDs are too full for automatic recovery
          description: >
            Data redundancy may be reduced, or is at risk, since one or more OSDs are at or above their
            'full' threshold. Add more capacity to the cluster, or delete unwanted data.
      - alert: CephPGUnavailableBlockingIO
        # PG_AVAILABILITY, but an OSD is not in a DOWN state
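        # Subtracting the OSD_DOWN scalar means this only fires when PG
        # availability is reduced while no OSD is reported down.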
        expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"})) == 1
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.7.3
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability
          summary: Placement group is unavailable, blocking some I/O
          description: >
            Data availability is reduced, impacting the cluster's ability to service I/O to some data. One or
            more placement groups (PGs) are in a state that blocks I/O.
      - alert: CephPGBackfillAtRisk
        expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 1
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.7.6
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full
          summary: Backfill operations are blocked due to lack of free space
          description: >
            Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs
            have breached their 'backfillfull' threshold. Add more capacity, or delete unwanted data.
      - alert: CephPGNotScrubbed
        expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1
        for: 5m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed
          summary: Placement group(s) have not been scrubbed
          description: |
            One or more PGs have not been scrubbed recently. The scrub process is a data integrity
            feature, protecting against bit-rot. It checks that objects and their metadata (size and
            attributes) match across object replicas. When PGs miss their scrub window, it may
            indicate the scrub window is too small, or PGs were not in a 'clean' state during the
            scrub window.

            You can manually initiate a scrub with: ceph pg scrub <pgid>
      - alert: CephPGsHighPerOSD
        expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1
        for: 1m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs
          summary: Placement groups per OSD is too high
          description: |
            The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).

            Check that the pg_autoscaler hasn't been disabled for any of the pools with 'ceph osd pool autoscale-status',
            and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide
            the autoscaler based on the expected relative size of the pool
            (e.g. 'ceph osd pool set cephfs.cephfs.meta target_size_ratio .1')
      - alert: CephPGNotDeepScrubbed
        expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1
        for: 5m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed
          summary: Placement group(s) have not been deep scrubbed
          description: |
            One or more PGs have not been deep scrubbed recently. Deep scrub is a data integrity
            feature, protecting against bit-rot. It compares the contents of objects and their
            replicas for inconsistency. When PGs miss their deep scrub window, it may indicate
            that the window is too small or PGs were not in a 'clean' state during the deep-scrub
            window.

            You can manually initiate a deep scrub with: ceph pg deep-scrub <pgid>

  - name: nodes
    rules:
      - alert: CephNodeRootFilesystemFull
        expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
        for: 5m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.8.1
        annotations:
          summary: Root filesystem is dangerously full
          description: >
            Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free.

      # alert on nic packet error and drop rates >= 0.01%, or >= 10 packets in one minute
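      # Both alerts below compare dropped (or errored) packets against total
      # packets across receive and transmit over the last minute, and also fire
      # on an absolute count of 10 or more such packets within a minute.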
      - alert: CephNodeNetworkPacketDrops
        expr: |
          (
            increase(node_network_receive_drop_total{device!="lo"}[1m]) +
            increase(node_network_transmit_drop_total{device!="lo"}[1m])
          ) / (
            increase(node_network_receive_packets_total{device!="lo"}[1m]) +
            increase(node_network_transmit_packets_total{device!="lo"}[1m])
          ) >= 0.0001 or (
            increase(node_network_receive_drop_total{device!="lo"}[1m]) +
            increase(node_network_transmit_drop_total{device!="lo"}[1m])
          ) >= 10
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.8.2
        annotations:
          summary: One or more NICs are seeing packet drops
          description: >
            Node {{ $labels.instance }} experiences packet drops > 0.01% or
            > 10 packets dropped per minute on interface {{ $labels.device }}.

      - alert: CephNodeNetworkPacketErrors
        expr: |
          (
            increase(node_network_receive_errs_total{device!="lo"}[1m]) +
            increase(node_network_transmit_errs_total{device!="lo"}[1m])
          ) / (
            increase(node_network_receive_packets_total{device!="lo"}[1m]) +
            increase(node_network_transmit_packets_total{device!="lo"}[1m])
          ) >= 0.0001 or (
            increase(node_network_receive_errs_total{device!="lo"}[1m]) +
            increase(node_network_transmit_errs_total{device!="lo"}[1m])
          ) >= 10
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.8.3
        annotations:
          summary: One or more NICs are seeing packet errors
          description: >
            Node {{ $labels.instance }} experiences packet errors > 0.01% or
            > 10 errored packets per minute on interface {{ $labels.device }}.

      # Restrict to device names beginning with '/' to skip false alarms from
      # tmpfs, overlay type filesystems
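      # predict_linear extrapolates the last 48h of free-space samples 5 days
      # ahead; the join to node_uname_info only adds the nodename label.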
      - alert: CephNodeDiskspaceWarning
        expr: |
          predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *
            on(instance) group_left(nodename) node_uname_info < 0
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.8.4
        annotations:
          summary: Host filesystem free space is getting low
          description: >
            Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }}
            will be full in less than 5 days assuming the average fill-up
            rate of the past 48 hours.

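      # Intended to flag interfaces (excluding 'lo') whose MTU is the outlying
      # maximum or minimum relative to the median MTU of devices with the same
      # name across the cluster.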
      - alert: CephNodeInconsistentMTU
        expr: |
          node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
            scalar(
              max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
                quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
            )
          or
          node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
            scalar(
              min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
                quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
            )
        labels:
          severity: warning
          type: ceph_default
        annotations:
          summary: MTU settings across Ceph hosts are inconsistent
          description: >
            Node {{ $labels.instance }} has a different MTU size ({{ $value }})
            than the median of devices named {{ $labels.device }}.

  - name: pools
    rules:
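      # predict_linear extrapolates each pool's percent-used over the last 48h
      # 5 days ahead; group_right joins ceph_pool_metadata for the pool name.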
      - alert: CephPoolGrowthWarning
        expr: |
          (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id)
            group_right ceph_pool_metadata) >= 95
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.9.2
        annotations:
          summary: Pool growth rate may soon exceed its capacity
          description: >
            Pool '{{ $labels.name }}' will be full in less than 5 days
            assuming the average fill-up rate of the past 48 hours.
      - alert: CephPoolBackfillFull
        expr: ceph_health_detail{name="POOL_BACKFILLFULL"} > 0
        labels:
          severity: warning
          type: ceph_default
        annotations:
          summary: Free space in a pool is too low for recovery/rebalance
          description: >
            A pool is approaching its near-full threshold, which will
            prevent rebalance operations from completing. You should
            consider adding more capacity to the pool.

      - alert: CephPoolFull
        expr: ceph_health_detail{name="POOL_FULL"} > 0
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.9.1
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full
          summary: Pool is full - writes are blocked
          description: |
            A pool has reached its MAX quota, or the OSDs supporting the pool
            have reached their FULL threshold. Until this is resolved, writes to
            the pool will be blocked.
            Pool Breakdown (top 5)
            {{- range query "topk(5, sort_desc(ceph_pool_percent_used * on(pool_id) group_right ceph_pool_metadata))" }}
              - {{ .Labels.name }} at {{ .Value }}%
            {{- end }}
            Either increase the pool's quota, or add capacity to the cluster first
            then increase its quota (e.g. ceph osd pool set-quota <pool_name> max_bytes <bytes>)
      - alert: CephPoolNearFull
        expr: ceph_health_detail{name="POOL_NEAR_FULL"} > 0
        for: 5m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          summary: One or more Ceph pools are getting full
          description: |
            A pool has exceeded its warning (percent full) threshold, or the OSDs
            supporting the pool have reached their NEARFULL thresholds. Writes may
            continue, but you are at risk of the pool going read-only if more capacity
            isn't made available.

            Determine the affected pool with 'ceph df detail', for example looking
            at QUOTA BYTES and STORED. Either increase the pool's quota, or add
            capacity to the cluster first then increase its quota
            (e.g. ceph osd pool set-quota <pool_name> max_bytes <bytes>)
  - name: healthchecks
    rules:
      - alert: CephSlowOps
        expr: ceph_healthcheck_slow_ops > 0
        for: 30s
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
          summary: MON/OSD operations are slow to complete
          description: >
            {{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)
  # cephadm alerts
  - name: cephadm
    rules:
      - alert: CephadmUpgradeFailed
        expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0
        for: 30s
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.11.2
        annotations:
          summary: Ceph version upgrade has failed
          description: >
            The cephadm cluster upgrade process has failed. The cluster remains in
            an undetermined state.

            Please review the cephadm logs to understand the nature of the issue.
      - alert: CephadmDaemonFailed
        expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0
        for: 30s
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.11.1
        annotations:
          summary: A ceph daemon managed by cephadm is down
          description: >
            A daemon managed by cephadm is no longer active. Determine which
            daemon is down with 'ceph health detail'. You may start daemons with
            the 'ceph orch daemon start <daemon_id>' command.
      - alert: CephadmPaused
        expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0
        for: 1m
        labels:
          severity: warning
          type: ceph_default
        annotations:
          documentation: https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused
          summary: Orchestration tasks via cephadm are PAUSED
          description: >
            Cluster management has been paused manually. This will prevent the
            orchestrator from performing service management and reconciliation.
            If this is not intentional, resume cephadm operations with 'ceph orch resume'.

  # prometheus alerts
  - name: PrometheusServer
    rules:
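      # absent() only returns a result when no 'up' series with job="ceph" exists
      # at all, i.e. the scrape job has been removed from the Prometheus config
      # (a failing target is covered by CephMgrPrometheusModuleInactive above).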
      - alert: PrometheusJobMissing
        expr: absent(up{job="ceph"})
        for: 30s
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.12.1
        annotations:
          summary: The scrape job for Ceph is missing from Prometheus
          description: |
            The prometheus job that scrapes from Ceph is no longer defined. This
            effectively means you will have no metrics or alerts for the cluster.

            Please review the job definitions in the prometheus.yml file of the prometheus
            instance.
  # Object related events
  - name: rados
    rules:
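      # Only fires when OBJECT_UNFOUND is raised while every known OSD is up, so
      # the missing object cannot simply be waiting on a down OSD to return.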
      - alert: CephObjectMissing
        expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1
        for: 30s
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.10.1
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound
          summary: Object(s) has been marked UNFOUND
          description: |
            A version of a RADOS object cannot be found, even though all OSDs are up. I/O
            requests for this object from clients will block (hang). Resolving this issue may
            require the object to be rolled back to a prior version manually, and then verified.
  # Generic
  - name: generic
    rules:
      - alert: CephDaemonCrash
        expr: ceph_health_detail{name="RECENT_CRASH"} == 1
        for: 1m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.1.2.1.1.2
        annotations:
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash
          summary: One or more Ceph daemons have crashed, and are pending acknowledgement
          description: |
            One or more daemons have crashed recently, and need to be acknowledged. This notification
            ensures that software crashes don't go unseen. To acknowledge a crash, use the
            'ceph crash archive <id>' command.