]> git.proxmox.com Git - ceph.git/blobdiff - ceph/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
import ceph quincy 17.2.6
[ceph.git] / ceph / monitoring / ceph-mixin / tests_alerts / test_alerts.yml
index 7b7e7db7301bddc9f0d55e2ca2b56198fb89652c..d68b43badf2b42784096e825a965bd2440267700 100644 (file)
@@ -679,6 +679,33 @@ tests:
            summary: OSD operations are slow to complete
            description: "1 OSD requests are taking too long to process (osd_op_complaint_time exceeded)"
 
+ # slow daemon ops
+ - interval : 1m
+   input_series:
+    - series: 'ceph_daemon_health_metrics{ceph_daemon="osd.1", instance="ceph:9283",job="ceph", type="SLOW_OPS"}'
+      values: '1+0x120'
+   promql_expr_test:
+     - expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0'
+       eval_time: 1m
+       exp_samples:
+         - labels: '{__name__="ceph_daemon_health_metrics", ceph_daemon="osd.1",instance="ceph:9283",
+           job="ceph", type="SLOW_OPS"}'
+           value: 1
+   alert_rule_test:
+     - eval_time: 20m
+       alertname: CephDaemonSlowOps
+       exp_alerts:
+       - exp_labels:
+           instance: ceph:9283
+           ceph_daemon: "osd.1"
+           job: ceph
+           severity: warning
+           type: ceph_default
+         exp_annotations:
+           documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
+           summary: osd.1 operations are slow to complete
+           description: "osd.1 operations are taking too long to process (complaint time exceeded)"
+
 # CEPHADM orchestrator alert triggers
  - interval: 30s
    input_series:
@@ -1472,35 +1499,44 @@ tests:
    # trigger percent full prediction on pools 1 and 2 only
  - interval: 12h
    input_series:
-    - series: 'ceph_pool_percent_used{pool_id="1"}'
-      values: '70 75 80 87 92'
-    - series: 'ceph_pool_percent_used{pool_id="2"}'
+    - series: 'ceph_pool_percent_used{pool_id="1", instance="9090"}'
+      values: '1 1 1 1 1'
+    - series: 'ceph_pool_percent_used{pool_id="1", instance="8090"}'
+      values: '78 89 79 98 78'
+    - series: 'ceph_pool_percent_used{pool_id="2", instance="9090"}'
+      values: '1 1 1 1 1'
+    - series: 'ceph_pool_percent_used{pool_id="2", instance="8090"}'
       values: '22 22 23 23 24'
-    - series: 'ceph_pool_metadata{pool_id="1",name="rbd",type="replicated"}'
+    - series: 'ceph_pool_metadata{pool_id="1" , instance="9090" ,name="rbd",type="replicated"}'
+      values: '1 1 1 1 1'
+    - series: 'ceph_pool_metadata{pool_id="1", instance="8090",name="default.rgw.index",type="replicated"}'
+      values: '1 1 1 1 1'
+    - series: 'ceph_pool_metadata{pool_id="2" , instance="9090" ,name="rbd",type="replicated"}'
       values: '1 1 1 1 1'
-    - series: 'ceph_pool_metadata{pool_id="2",name="default.rgw.index",type="replicated"}'
+    - series: 'ceph_pool_metadata{pool_id="2", instance="8090",name="default.rgw.index",type="replicated"}'
       values: '1 1 1 1 1'
    promql_expr_test:
      - expr: |
-         (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id)
-              group_right ceph_pool_metadata) >= 95
+         (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance)
+              group_right() ceph_pool_metadata) >= 95
        eval_time: 36h
        exp_samples:
-         - labels: '{name="rbd",pool_id="1",type="replicated"}'
-           value: 1.424E+02 # 142%
+         - labels: '{instance="8090",name="default.rgw.index",pool_id="1",type="replicated"}'
+           value: 1.435E+02 # 142%
    alert_rule_test:
     - eval_time: 48h
       alertname: CephPoolGrowthWarning
       exp_alerts:
       - exp_labels:
-          name: rbd
+          instance: 8090
+          name: default.rgw.index
           pool_id: 1
           severity: warning
           type: ceph_default
           oid: 1.3.6.1.4.1.50495.1.2.1.9.2
         exp_annotations:
           summary: Pool growth rate may soon exceed capacity
-          description: Pool 'rbd' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
+          description: Pool 'default.rgw.index' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
  - interval: 1m
    input_series:
     - series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}'