summary: OSD operations are slow to complete
description: "1 OSD requests are taking too long to process (osd_op_complaint_time exceeded)"
+ # slow daemon ops
+ - interval: 1m
+ input_series:
+ - series: 'ceph_daemon_health_metrics{ceph_daemon="osd.1", instance="ceph:9283",job="ceph", type="SLOW_OPS"}'
+ values: '1+0x120'
+ promql_expr_test:
+ - expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0'
+ eval_time: 1m
+ exp_samples:
+ - labels: '{__name__="ceph_daemon_health_metrics", ceph_daemon="osd.1",instance="ceph:9283",
+ job="ceph", type="SLOW_OPS"}'
+ value: 1
+ alert_rule_test:
+ - eval_time: 20m
+ alertname: CephDaemonSlowOps
+ exp_alerts:
+ - exp_labels:
+ instance: ceph:9283
+ ceph_daemon: "osd.1"
+ job: ceph
+ severity: warning
+ type: ceph_default
+ exp_annotations:
+ documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
+ summary: osd.1 operations are slow to complete
+ description: "osd.1 operations are taking too long to process (complaint time exceeded)"
+
# CEPHADM orchestrator alert triggers
- interval: 30s
input_series:
# trigger percent full prediction on pools 1 and 2 only
- interval: 12h
input_series:
- - series: 'ceph_pool_percent_used{pool_id="1"}'
- values: '70 75 80 87 92'
- - series: 'ceph_pool_percent_used{pool_id="2"}'
+ - series: 'ceph_pool_percent_used{pool_id="1", instance="9090"}'
+ values: '1 1 1 1 1'
+ - series: 'ceph_pool_percent_used{pool_id="1", instance="8090"}'
+ values: '78 89 79 98 78'
+ - series: 'ceph_pool_percent_used{pool_id="2", instance="9090"}'
+ values: '1 1 1 1 1'
+ - series: 'ceph_pool_percent_used{pool_id="2", instance="8090"}'
values: '22 22 23 23 24'
- - series: 'ceph_pool_metadata{pool_id="1",name="rbd",type="replicated"}'
+ - series: 'ceph_pool_metadata{pool_id="1", instance="9090",name="rbd",type="replicated"}'
+ values: '1 1 1 1 1'
+ - series: 'ceph_pool_metadata{pool_id="1", instance="8090",name="default.rgw.index",type="replicated"}'
+ values: '1 1 1 1 1'
+ - series: 'ceph_pool_metadata{pool_id="2", instance="9090",name="rbd",type="replicated"}'
values: '1 1 1 1 1'
- - series: 'ceph_pool_metadata{pool_id="2",name="default.rgw.index",type="replicated"}'
+ - series: 'ceph_pool_metadata{pool_id="2", instance="8090",name="default.rgw.index",type="replicated"}'
values: '1 1 1 1 1'
promql_expr_test:
- expr: |
- (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id)
- group_right ceph_pool_metadata) >= 95
+ (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance)
+ group_right() ceph_pool_metadata) >= 95
eval_time: 36h
exp_samples:
- - labels: '{name="rbd",pool_id="1",type="replicated"}'
- value: 1.424E+02 # 142%
+ - labels: '{instance="8090",name="default.rgw.index",pool_id="1",type="replicated"}'
+ value: 1.435E+02 # 143.5%
alert_rule_test:
- eval_time: 48h
alertname: CephPoolGrowthWarning
exp_alerts:
- exp_labels:
- name: rbd
+ instance: 8090
+ name: default.rgw.index
pool_id: 1
severity: warning
type: ceph_default
oid: 1.3.6.1.4.1.50495.1.2.1.9.2
exp_annotations:
summary: Pool growth rate may soon exceed capacity
- description: Pool 'rbd' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
+ description: Pool 'default.rgw.index' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
- interval: 1m
input_series:
- series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}'