groups:
  - name: cluster health
    rules:
      - alert: health error
        expr: ceph_health_status == 2
        for: 5m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.2.1
        annotations:
          description: >
            Ceph has been in HEALTH_ERROR state for more than 5 minutes.
            Please check "ceph health detail" for more information.

      - alert: health warn
        expr: ceph_health_status == 1
        for: 15m
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.2.2
        annotations:
          description: >
            Ceph has been in HEALTH_WARN for more than 15 minutes.
            Please check "ceph health detail" for more information.

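  # Note: ceph_health_status is the numeric cluster health exported by the
  # mgr/prometheus module (0 = HEALTH_OK, 1 = HEALTH_WARN, 2 = HEALTH_ERR),
  # so the two rules above fire on HEALTH_ERR and HEALTH_WARN respectively.
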
  - name: mon
    rules:
      - alert: low monitor quorum count
        expr: sum(ceph_mon_quorum_status) < 3
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.3.1
        annotations:
          description: |
            Monitor count in quorum is below three.

            Only {{ $value }} of {{ with query "count(ceph_mon_quorum_status)" }}{{ . | first | value }}{{ end }} monitors are active.

            The following monitors are down:
            {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }}
              - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
            {{- end }}

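      # The "monitors are down" list in the description above joins
      # ceph_mon_quorum_status with ceph_mon_metadata by adding (metadata * 0),
      # which keeps the quorum metric's value while attaching the hostname
      # label via group_left.
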
  - name: osd
    rules:
      - alert: 10% OSDs down
        expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.4.1
        annotations:
          description: |
            {{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (≥ 10%).

            The following OSDs are down:
            {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }}
              - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
            {{- end }}

      - alert: OSD down
        expr: count(ceph_osd_up == 0) > 0
        for: 15m
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.4.2
        annotations:
          description: |
            {{ $s := "" }}{{ if gt $value 1.0 }}{{ $s = "s" }}{{ end }}
            {{ $value }} OSD{{ $s }} down for more than 15 minutes.

            {{ $value }} of {{ query "count(ceph_osd_up)" | first | value }} OSDs are down.

            The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down:
            {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}}
              - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }}
            {{- end }}

      - alert: OSDs near full
        expr: |
          (
            ((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon) ceph_osd_up == 1)
            * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
          ) * 100 > 90
        for: 5m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.4.3
        annotations:
          description: >
            OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} is
            dangerously full: {{ $value | humanize }}%

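      # In the "OSDs near full" expression, "and on(ceph_daemon) ceph_osd_up == 1"
      # restricts the utilisation ratio to OSDs that are currently up, and the
      # "* on(ceph_daemon) group_left(hostname)" join against ceph_osd_metadata
      # only attaches the hostname label used in the description.
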
      - alert: flapping OSD
        expr: |
          (
            rate(ceph_osd_up[5m])
            * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
          ) * 60 > 1
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.4.4
        annotations:
          description: >
            OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was
            marked down and back up {{ $value | humanize }} times per
            minute over the last 5 minutes.

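      # "flapping OSD": rate() applied to the 0/1 ceph_osd_up gauge
      # approximates how often the OSD comes back up per second (decreases are
      # treated as counter resets); multiplied by 60, a value above 1 means
      # more than one down/up cycle per minute on average over the 5m window.
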
      # alert on high deviation from average PG count
      - alert: high pg count deviation
        expr: |
          abs(
            (
              (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)
            ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
          ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
        for: 5m
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.4.5
        annotations:
          description: >
            OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates
            by more than 30% from average PG count.
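      # "high pg count deviation" computes the relative deviation
      # |(pg_count - avg) / avg| of each OSD's PG count against the per-job
      # average and fires when an OSD carries more than 30% more or fewer PGs
      # than its peers.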
      # alert on high commit latency...but how high is too high
  - name: mds
    rules:
    # no mds metrics are exported yet
  - name: mgr
    rules:
    # no mgr metrics are exported yet
  - name: pgs
    rules:
      - alert: pgs inactive
        expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0
        for: 5m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.7.1
        annotations:
          description: >
            {{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}.
            Inactive placement groups aren't able to serve read/write
            requests.
      - alert: pgs unclean
        expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0
        for: 15m
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.7.2
        annotations:
          description: >
            {{ $value }} PGs haven't been clean for more than 15 minutes in pool {{ $labels.name }}.
            Unclean PGs haven't been able to completely recover from a
            previous failure.
  - name: nodes
    rules:
      - alert: root volume full
        expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
        for: 5m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.8.1
        annotations:
          description: >
            Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free.

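      # The rules in this group rely on node_exporter metrics
      # (node_filesystem_*, node_network_*, node_uname_info), so the
      # corresponding node_exporter targets must be scraped by the same
      # Prometheus instance that evaluates these alerts.
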
      # alert on NIC packet error and drop rates > 0.01% or > 10 packets per minute
      - alert: network packets dropped
        expr: |
          (
            increase(node_network_receive_drop_total{device!="lo"}[1m]) +
            increase(node_network_transmit_drop_total{device!="lo"}[1m])
          ) / (
            increase(node_network_receive_packets_total{device!="lo"}[1m]) +
            increase(node_network_transmit_packets_total{device!="lo"}[1m])
          ) >= 0.0001 or (
            increase(node_network_receive_drop_total{device!="lo"}[1m]) +
            increase(node_network_transmit_drop_total{device!="lo"}[1m])
          ) >= 10
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.8.2
        annotations:
          description: >
            Node {{ $labels.instance }} experiences packet drop > 0.01% or
            > 10 dropped packets per minute on interface {{ $labels.device }}.

      - alert: network packet errors
        expr: |
          (
            increase(node_network_receive_errs_total{device!="lo"}[1m]) +
            increase(node_network_transmit_errs_total{device!="lo"}[1m])
          ) / (
            increase(node_network_receive_packets_total{device!="lo"}[1m]) +
            increase(node_network_transmit_packets_total{device!="lo"}[1m])
          ) >= 0.0001 or (
            increase(node_network_receive_errs_total{device!="lo"}[1m]) +
            increase(node_network_transmit_errs_total{device!="lo"}[1m])
          ) >= 10
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.8.3
        annotations:
          description: >
            Node {{ $labels.instance }} experiences packet errors > 0.01%
            or > 10 errored packets per minute on interface {{ $labels.device }}.
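
      # Both network rules above use increase(...[1m]), so they trigger either
      # on an error/drop ratio of at least 0.0001 (0.01% of all packets) or on
      # at least 10 errored/dropped packets within a one-minute window; the
      # loopback interface is excluded.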

      - alert: storage filling up
        expr: |
          predict_linear(node_filesystem_free_bytes[2d], 3600 * 24 * 5) *
          on(instance) group_left(nodename) node_uname_info < 0
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.8.4
        annotations:
          description: >
            Mountpoint {{ $labels.mountpoint }} on {{ $labels.nodename }}
            will be full in less than 5 days assuming the average fill-up
            rate of the past 48 hours.

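      # "storage filling up" uses predict_linear over the last 2 days of
      # node_filesystem_free_bytes to extrapolate 5 days (3600 * 24 * 5 seconds)
      # ahead; a predicted value below zero means the filesystem is expected to
      # run out of space within that horizon. The join with node_uname_info
      # only adds the nodename label.
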
      - alert: MTU Mismatch
        expr: node_network_mtu_bytes{device!="lo"} * (node_network_up{device!="lo"} > 0) != on() group_left() (quantile(0.5, node_network_mtu_bytes{device!="lo"}))
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.8.5
        annotations:
          description: >
            Node {{ $labels.instance }} has a different MTU size ({{ $value }})
            than the median value on device {{ $labels.device }}.

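  # "MTU Mismatch" compares the MTU of each active (up) non-loopback interface
  # against the median MTU across all scraped interfaces
  # (quantile(0.5, node_network_mtu_bytes)) and fires for any interface that
  # differs from that median.
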
  - name: pools
    rules:
      - alert: pool full
        expr: |
          ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)
          * on(pool_id) group_right ceph_pool_metadata * 100 > 90
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.9.1
        annotations:
          description: Pool {{ $labels.name }} at {{ $value | humanize }}% capacity.

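      # "pool full" estimates pool utilisation as stored / (stored + max_avail);
      # the group_right join against ceph_pool_metadata attaches the pool name
      # used in the description.
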
      - alert: pool filling up
        expr: |
          (
            predict_linear(ceph_pool_stored[2d], 3600 * 24 * 5)
            >= ceph_pool_stored + ceph_pool_max_avail
          ) * on(pool_id) group_left(name) ceph_pool_metadata
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.9.2
        annotations:
          description: >
            Pool {{ $labels.name }} will be full in less than 5 days
            assuming the average fill-up rate of the past 48 hours.

  - name: healthchecks
    rules:
      - alert: Slow OSD Ops
        expr: ceph_healthcheck_slow_ops > 0
        for: 30s
        labels:
          severity: warning
          type: ceph_default
        annotations:
          description: >
            {{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)
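
# To load these alerts, reference this file from the Prometheus configuration;
# the path below is illustrative only (use wherever this file is installed):
#
#   rule_files:
#     - /etc/prometheus/ceph_default_alerts.yml
#
# The rules can be syntax-checked with: promtool check rules <file>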