1 rule_files:
2 - ../prometheus_alerts.yml
3 evaluation_interval: 5m
4 tests:
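# How these tests work (promtool 'test rules' format): each entry under 'tests'
# feeds synthetic samples via 'input_series' (one sample per 'interval', starting
# at t=0), 'promql_expr_test' evaluates an expression at 'eval_time' and compares
# it with 'exp_samples', and 'alert_rule_test' lists the alerts expected to be
# firing at 'eval_time' (an entry with no 'exp_alerts' asserts the alert is NOT
# firing yet). Values may use promtool's expanding notation 'a+bxN', which
# unrolls to N+1 samples, for example:
#   values: '1+0x12'    # 1 1 1 ... (13 samples, one per interval)
#   values: '0+600x10'  # 0 600 1200 ... 6000 (a counter growing 600 per interval)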
5 # health error
6 - interval: 5m
7 input_series:
8 - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
9 values: '2 2 2 2 2 2 2'
10 promql_expr_test:
11 - expr: ceph_health_status == 2
12 eval_time: 5m
13 exp_samples:
14 - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
15 value: 2
16 alert_rule_test:
17 - eval_time: 1m
18 alertname: CephHealthError
19 - eval_time: 6m
20 alertname: CephHealthError
21 exp_alerts:
22 - exp_labels:
23 instance: ceph:9283
24 job: ceph
25 oid: 1.3.6.1.4.1.50495.1.2.1.2.1
26 type: ceph_default
27 severity: critical
28 exp_annotations:
29 summary: Ceph is in the ERROR state
30 description: The cluster state has been HEALTH_ERROR for more than 5 minutes. Please check 'ceph health detail' for more information.
31
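# The two eval_time checks above follow a pattern used throughout this file: the
# alert has a hold ('for') period, so at 1m the rule has not fired yet (no
# exp_alerts), while at 6m, with ceph_health_status still 2, CephHealthError is
# expected to be active with the labels and annotations listed. The 5m hold is
# implied by the description text ("for more than 5 minutes"); the authoritative
# value lives in ../prometheus_alerts.yml.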
32 # health warning
33 - interval: 5m
34 input_series:
35 - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
36 values: '1 1 1 1 1 1 1 1 1 1'
37 promql_expr_test:
38 - expr: ceph_health_status == 1
39 eval_time: 15m
40 exp_samples:
41 - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
42 value: 1
43 alert_rule_test:
44 - eval_time: 10m
45 alertname: CephHealthWarning
46 - eval_time: 20m
47 alertname: CephHealthWarning
48 exp_alerts:
49 - exp_labels:
50 instance: ceph:9283
51 job: ceph
52 type: ceph_default
53 severity: warning
54 exp_annotations:
55 summary: Ceph is in the WARNING state
56 description: The cluster state has been HEALTH_WARN for more than 15 minutes. Please check 'ceph health detail' for more information.
57
58 # 10% OSDs down
59 - interval: 1m
60 input_series:
61 - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
62 values: '1 1 1 1 1'
63 - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
64 values: '0 0 0 0 0'
65 - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
66 values: '1 1 1 1 1'
67 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
68 ceph_version="ceph version 17.0.0-189-g3558fd72
69 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
70 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
71 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
72 public_addr="172.20.0.2"}'
73 values: '1 1 1 1 1'
74 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
75 ceph_version="ceph version 17.0.0-189-g3558fd72
76 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
77 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
78 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
79 public_addr="172.20.0.2"}'
80 values: '1 1 1 1 1'
81 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
82 ceph_version="ceph version 17.0.0-189-g3558fd72
83 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
84 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
85 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
86 public_addr="172.20.0.2"}'
87 values: '1 1 1 1 1'
88 promql_expr_test:
89 - expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
90 eval_time: 1m
91 exp_samples:
92 - labels: '{}'
93 value: 3.333333333333333E+01
94 alert_rule_test:
95 - eval_time: 1m
96 alertname: CephOSDDownHigh
97 exp_alerts:
98 - exp_labels:
99 oid: 1.3.6.1.4.1.50495.1.2.1.4.1
100 type: ceph_default
101 severity: critical
102 exp_annotations:
103 summary: More than 10% of OSDs are down
104 description: "33.33% or 1 of 3 OSDs are down (>= 10%). The following OSDs are down: - osd.1 on ceph"
105
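# Worked example for the CephOSDDownHigh expression above: with osd.1 reporting
# ceph_osd_up == 0 and osd.0/osd.2 up, the ratio is
#   count(ceph_osd_up == 0) / count(ceph_osd_up) * 100  =  1 / 3 * 100  ≈  33.33
# which is why exp_samples expects 3.333...E+01 and the alert already fires at
# the 1m check (33.33 >= 10). The count() aggregations drop all labels, hence the
# empty '{}' label set on the sample.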
106 # flapping OSD
107 - interval: 1s
108 input_series:
109 - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
110 values: '1+1x100'
111 - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
112 values: '1+0x100'
113 - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
114 values: '1+0x100'
115 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
116 ceph_version="ceph version 17.0.0-189-g3558fd72
117 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
118 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
119 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
120 public_addr="172.20.0.2"}'
121 values: '1 1 1 1 1 1'
122 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
123 ceph_version="ceph version 17.0.0-189-g3558fd72
124 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
125 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
126 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
127 public_addr="172.20.0.2"}'
128 values: '1 1 1 1 1 1'
129 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
130 ceph_version="ceph version 17.0.0-189-g3558fd72
131 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
132 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
133 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
134 public_addr="172.20.0.2"}'
135 values: '1 1 1 1 1 1'
136 promql_expr_test:
137 - expr: |
138 (
139 rate(ceph_osd_up[5m])
140 * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
141 ) * 60 > 1
142 eval_time: 1m
143 exp_samples:
144 - labels: '{ceph_daemon="osd.0", hostname="ceph", instance="ceph:9283",
145 job="ceph"}'
146 value: 1.2200000000000001E+01
147 alert_rule_test:
148 - eval_time: 5m
149 alertname: CephOSDFlapping
150 exp_alerts:
151 - exp_labels:
152 ceph_daemon: osd.0
153 hostname: ceph
154 instance: ceph:9283
155 job: ceph
156 oid: 1.3.6.1.4.1.50495.1.2.1.4.4
157 severity: warning
158 type: ceph_default
159 exp_annotations:
160 documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds
161 summary: Network issues are causing OSDs to flap (mark each other down)
162 description: "OSD osd.0 on ceph was marked down and back up 20.1 times once a minute for 5 minutes. This may indicate a network issue (latency, packet loss, MTU mismatch) on the cluster network, or the public network if no cluster network is deployed. Check the network stats on the listed host(s)."
163
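# The flapping case drives osd.0 with 'values: 1+1x100' at a 1s interval, so the
# series changes every second and rate(ceph_osd_up[5m]) * 60 comes out well above
# the "more than one status change per minute" threshold, while osd.1 and osd.2
# stay constant ('1+0x100') and produce a zero rate. The join
#   * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
# is only there to pull the 'hostname' label onto the result, which is why the
# fired alert carries hostname: ceph.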
164 # high pg count deviation
165 - interval: 1m
166 input_series:
167 - series: 'ceph_osd_numpg{ceph_daemon="osd.0",instance="ceph:9283",
168 job="ceph"}'
169 values: '100 100 100 100 100 160'
170 - series: 'ceph_osd_numpg{ceph_daemon="osd.1",instance="ceph:9283",
171 job="ceph"}'
172 values: '100 100 100 100 100 320'
173 - series: 'ceph_osd_numpg{ceph_daemon="osd.2",instance="ceph:9283",
174 job="ceph"}'
175 values: '100 100 100 100 100 160'
176 - series: 'ceph_osd_numpg{ceph_daemon="osd.3",instance="ceph:9283",
177 job="ceph"}'
178 values: '100 100 100 100 100 160'
179 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
180 ceph_version="ceph version 17.0.0-189-g3558fd72
181 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
182 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
183 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
184 public_addr="172.20.0.2"}'
185 values: '1 1 1 1 1 1'
186 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
187 ceph_version="ceph version 17.0.0-189-g3558fd72
188 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
189 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
190 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
191 public_addr="172.20.0.2"}'
192 values: '1 1 1 1 1 1'
193 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
194 ceph_version="ceph version 17.0.0-189-g3558fd72
195 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
196 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
197 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
198 public_addr="172.20.0.2"}'
199 values: '1 1 1 1 1 1'
200 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.3",
201 ceph_version="ceph version 17.0.0-189-g3558fd72
202 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
203 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
204 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
205 public_addr="172.20.0.2"}'
206 values: '1 1 1 1 1 1'
207 promql_expr_test:
208 - expr: |
209 abs(
210 (
211 (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0)
212 by (job)
213 ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
214 ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
215
216 eval_time: 5m
217 exp_samples:
218 - labels: '{ceph_daemon="osd.1", hostname="ceph", instance="ceph:9283",
219 job="ceph"}'
220 value: 6E-01
221 alert_rule_test:
222 - eval_time: 10m
223 alertname: CephPGImbalance
224 exp_alerts:
225 - exp_labels:
226 ceph_daemon: osd.1
227 hostname: ceph
228 instance: ceph:9283
229 job: ceph
230 oid: 1.3.6.1.4.1.50495.1.2.1.4.5
231 severity: warning
232 type: ceph_default
233 exp_annotations:
234 summary: PGs are not balanced across OSDs
235 description: "OSD osd.1 on ceph deviates by more than 30% from average PG count."
236
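# Worked example for CephPGImbalance: at eval_time 5m the per-OSD PG counts are
# 160, 320, 160, 160, so the average is 200 and osd.1 deviates by
#   abs((320 - 200) / 200) = 0.6
# which matches the expected sample value 6E-01 and exceeds the 0.30 threshold.
# As above, the ceph_osd_metadata join only contributes the hostname label.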
237 # pgs inactive
238 - interval: 1m
239 input_series:
240 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
241 name="device_health_metrics",pool_id="1"}'
242 values: '1 1 1 1 1 1 1 1'
243 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
244 name="device_health_metrics",pool_id="2"}'
245 values: '1 1 1 1 1 1 1 1'
246 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
247 name="device_health_metrics",pool_id="3"}'
248 values: '1 1 1 1 1 1 1 1'
249 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
250 values: '1 1 1 1 1 1 1 1'
251 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
252 values: '32 32 32 32 32 32 32 32'
253 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
254 values: '33 32 32 32 32 33 33 32'
255 - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="1"}'
256 values: '1 1 1 1 1 1 1 1 1'
257 - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="2"}'
258 values: '32 32 32 32 32 32 32 32'
259 - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="3"}'
260 values: '32 32 32 32 32 32 32 32'
261 promql_expr_test:
262 - expr: ceph_pool_metadata * on(pool_id,instance) group_left()
263 (ceph_pg_total - ceph_pg_active) > 0
264 eval_time: 5m
265 exp_samples:
266 - labels: '{instance="ceph:9283", job="ceph",
267 name="device_health_metrics",
268 pool_id="3"}'
269 value: 1
270 alert_rule_test:
271 - eval_time: 5m
272 alertname: CephPGsInactive
273 exp_alerts:
274 - exp_labels:
275 instance: ceph:9283
276 job: ceph
277 name: device_health_metrics
278 oid: 1.3.6.1.4.1.50495.1.2.1.7.1
279 pool_id: 3
280 severity: critical
281 type: ceph_default
282 exp_annotations:
283 summary: One or more placement groups are inactive
284 description: "1 PGs have been inactive for more than 5 minutes in pool device_health_metrics. Inactive placement groups are not able to serve read/write requests."
285
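# CephPGsInactive works by joining pool metadata onto the PG gap: the expression
#   ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active)
# yields one series per pool whose value is the number of non-active PGs, carrying
# the pool's 'name' label. Only pool_id="3" has ceph_pg_total (33) above
# ceph_pg_active (32) at 5m, giving the single expected sample with value 1.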
286 # pgs unclean
287 - interval: 1m
288 input_series:
289 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
290 name="device_health_metrics",pool_id="1"}'
291 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
292 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
293 name="device_health_metrics",pool_id="2"}'
294 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
295 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
296 name="device_health_metrics",pool_id="3"}'
297 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
298 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
299 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
300 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
301 values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
302 32 32 32'
303 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
304 values: '33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33
305 33 33'
306 - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="1"}'
307 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
308 - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="2"}'
309 values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
310 32 32'
311 - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="3"}'
312 values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
313 32 32'
314 promql_expr_test:
315 - expr: ceph_pool_metadata * on(pool_id,instance) group_left()
316 (ceph_pg_total - ceph_pg_clean) > 0
317 eval_time: 15m
318 exp_samples:
319 - labels: '{instance="ceph:9283", job="ceph",
320 name="device_health_metrics", pool_id="3"}'
321 value: 1
322 alert_rule_test:
323 - eval_time: 16m
324 alertname: CephPGsUnclean
325 exp_alerts:
326 - exp_labels:
327 instance: ceph:9283
328 job: ceph
329 name: device_health_metrics
330 oid: 1.3.6.1.4.1.50495.1.2.1.7.2
331 pool_id: 3
332 severity: warning
333 type: ceph_default
334 exp_annotations:
335 summary: One or more placement groups are marked unclean
336 description: "1 PGs have been unclean for more than 15 minutes in pool device_health_metrics. Unclean PGs have not recovered from a previous failure."
337
338 # root volume full
339 - interval: 1m
340 input_series:
341 - series: 'node_filesystem_avail_bytes{device="/dev/mapper/fedora_localhost
342 --live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
343 mountpoint="/"}'
344 values: '35336400896 35336400896 35336400896 35336400896 35336400896
345 3525385519.104 3533640089'
346 - series: 'node_filesystem_size_bytes{device="/dev/mapper/fedora_localhost
347 --live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
348 mountpoint="/"}'
349 values: '73445531648 73445531648 73445531648 73445531648 73445531648
350 73445531648 73445531648'
351 promql_expr_test:
352 - expr: node_filesystem_avail_bytes{mountpoint="/"} /
353 node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
354 eval_time: 5m
355 exp_samples:
356 - labels: '{device="/dev/mapper/fedora_localhost --live-home",
357 fstype="ext4", instance="node-exporter", job="node-exporter",
358 mountpoint="/"}'
359 value: 4.8E+00
360 alert_rule_test:
361 - eval_time: 10m
362 alertname: CephNodeRootFilesystemFull
363 exp_alerts:
364 - exp_labels:
365 device: /dev/mapper/fedora_localhost --live-home
366 fstype: ext4
367 instance: node-exporter
368 job: node-exporter
369 mountpoint: /
370 oid: 1.3.6.1.4.1.50495.1.2.1.8.1
371 severity: critical
372 type: ceph_default
373 exp_annotations:
374 summary: Root filesystem is dangerously full
375 description: "Root volume is dangerously full: 4.811% free."
376
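# Worked example for CephNodeRootFilesystemFull: at 5m the available bytes are
# exactly 4.8% of the filesystem size
#   3525385519.104 / 73445531648 * 100 = 4.8
# matching the expected sample 4.8E+00 and falling below the 5% threshold; by the
# 10m alert check the last sample (3533640089 bytes) works out to roughly 4.811%
# free, which is the figure rendered into the description.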
377 # network packets dropped
378 - interval: 1m
379 input_series:
380 - series: 'node_network_receive_drop_total{device="eth0",
381 instance="node-exporter",job="node-exporter"}'
382 values: '0+600x10'
383 - series: 'node_network_transmit_drop_total{device="eth0",
384 instance="node-exporter",job="node-exporter"}'
385 values: '0+600x10'
386 - series: 'node_network_receive_packets_total{device="eth0",
387 instance="node-exporter",job="node-exporter"}'
388 values: '0+750x10'
389 - series: 'node_network_transmit_packets_total{device="eth0",
390 instance="node-exporter",job="node-exporter"}'
391 values: '0+750x10'
392 promql_expr_test:
393 - expr: |
394 (
395 rate(node_network_receive_drop_total{device!="lo"}[1m]) +
396 rate(node_network_transmit_drop_total{device!="lo"}[1m])
397 ) / (
398 rate(node_network_receive_packets_total{device!="lo"}[1m]) +
399 rate(node_network_transmit_packets_total{device!="lo"}[1m])
400 ) >= 0.0050000000000000001 and (
401 rate(node_network_receive_drop_total{device!="lo"}[1m]) +
402 rate(node_network_transmit_drop_total{device!="lo"}[1m])
403 ) >= 10
404
405 eval_time: 5m
406 exp_samples:
407 - labels: '{device="eth0", instance="node-exporter",
408 job="node-exporter"}'
409 value: 8E-1
410 alert_rule_test:
411 - eval_time: 5m
412 alertname: CephNodeNetworkPacketDrops
413 exp_alerts:
414 - exp_labels:
415 device: eth0
416 instance: node-exporter
417 job: node-exporter
418 oid: 1.3.6.1.4.1.50495.1.2.1.8.2
419 severity: warning
420 type: ceph_default
421 exp_annotations:
422 summary: One or more NICs reports packet drops
423 description: "Node node-exporter experiences packet drop > 0.5% or > 10 packets/s on interface eth0."
424
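# Worked example for CephNodeNetworkPacketDrops: with a 1m interval, '0+600x10'
# models a drop counter growing by 600/min (10 drops/s) and '0+750x10' a packet
# counter growing by 750/min, so the rx+tx ratio is
#   (600 + 600) / (750 + 750) = 0.8
# (hence the expected sample 8E-1), and the combined rx+tx drop rate of 20/s
# satisfies the second condition (>= 10), so both halves of the 'and' hold and
# the alert fires at 5m.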
425 # network packets errors
426 - interval: 1m
427 input_series:
428 - series: 'node_network_receive_errs_total{device="eth0",
429 instance="node-exporter",job="node-exporter"}'
430 values: '0+600x10'
431 - series: 'node_network_transmit_errs_total{device="eth0",
432 instance="node-exporter",job="node-exporter"}'
433 values: '0+600x10'
434 - series: 'node_network_transmit_packets_total{device="eth0",
435 instance="node-exporter",job="node-exporter"}'
436 values: '0+750x10'
437 - series: 'node_network_receive_packets_total{device="eth0",
438 instance="node-exporter",job="node-exporter"}'
439 values: '0+750x10'
440 promql_expr_test:
441 - expr: |
442 (
443 rate(node_network_receive_errs_total{device!="lo"}[1m]) +
444 rate(node_network_transmit_errs_total{device!="lo"}[1m])
445 ) / (
446 rate(node_network_receive_packets_total{device!="lo"}[1m]) +
447 rate(node_network_transmit_packets_total{device!="lo"}[1m])
448 ) >= 0.0001 or (
449 rate(node_network_receive_errs_total{device!="lo"}[1m]) +
450 rate(node_network_transmit_errs_total{device!="lo"}[1m])
451 ) >= 10
452
453 eval_time: 5m
454 exp_samples:
455 - labels: '{device="eth0", instance="node-exporter",
456 job="node-exporter"}'
457 value: 8E-01
458 alert_rule_test:
459 - eval_time: 5m
460 alertname: CephNodeNetworkPacketErrors
461 exp_alerts:
462 - exp_labels:
463 device: eth0
464 instance: node-exporter
465 job: node-exporter
466 oid: 1.3.6.1.4.1.50495.1.2.1.8.3
467 severity: warning
468 type: ceph_default
469 exp_annotations:
470 summary: One or more NICs reports packet errors
471 description: "Node node-exporter experiences packet errors > 0.01% or > 10 packets/s on interface eth0."
472
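# The packet-errors case mirrors the drops case above with the same synthetic
# rates (ratio 0.8, 20 errors/s), but note that this rule combines its two
# conditions with 'or' and uses a much lower ratio threshold (0.0001, i.e. 0.01%),
# so either a high error ratio or a high absolute error rate is enough to trigger
# CephNodeNetworkPacketErrors.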
473 # Bond is missing a peer
474 - interval: 1m
475 input_series:
476 - series: 'node_bonding_active{master="bond0",
477 instance="node-exporter",job="node-exporter"}'
478 values: '3'
479 - series: 'node_bonding_slaves{master="bond0",
480 instance="node-exporter",job="node-exporter"}'
481 values: '4'
482 promql_expr_test:
483 - expr: |
484 node_bonding_slaves - node_bonding_active != 0
485 eval_time: 5m
486 exp_samples:
487 - labels: '{master="bond0", instance="node-exporter",
488 job="node-exporter"}'
489 value: 1
490 alert_rule_test:
491 - eval_time: 5m
492 alertname: CephNodeNetworkBondDegraded
493 exp_alerts:
494 - exp_labels:
495 master: bond0
496 instance: node-exporter
497 job: node-exporter
498 severity: warning
499 type: ceph_default
500 exp_annotations:
501 summary: Degraded Bond on Node node-exporter
502 description: "Bond bond0 is degraded on Node node-exporter."
503
504 # Node Storage disk space filling up
505 - interval: 1m
506 # 20 GiB = 21474836480 bytes, 256 MiB = 268435456 bytes
507 input_series:
508 - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root",
509 fstype="xfs",instance="node-1",mountpoint="/rootfs"}'
510 values: '21474836480-268435456x48'
511 - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root",
512 fstype="xfs",instance="node-2",mountpoint="/rootfs"}'
513 values: '21474836480+0x48'
514 - series: 'node_uname_info{instance="node-1", nodename="node-1.unittests.com"}'
515 values: 1+0x48
516 - series: 'node_uname_info{instance="node-2", nodename="node-2.unittests.com"}'
517 values: 1+0x48
518 promql_expr_test:
519 - expr: |
520 predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *
521 on(instance) group_left(nodename) node_uname_info < 0
522 eval_time: 5m
523 exp_samples:
524 - labels: '{device="/dev/mapper/vg-root",instance="node-1",fstype="xfs",
525 mountpoint="/rootfs",nodename="node-1.unittests.com"}'
526 value: -1.912602624E+12
527 alert_rule_test:
528 - eval_time: 5m
529 alertname: CephNodeDiskspaceWarning
530 exp_alerts:
531 - exp_labels:
532 severity: warning
533 type: ceph_default
534 oid: 1.3.6.1.4.1.50495.1.2.1.8.4
535 device: /dev/mapper/vg-root
536 fstype: xfs
537 instance: node-1
538 mountpoint: /rootfs
539 nodename: node-1.unittests.com
540 exp_annotations:
541 summary: Host filesystem free space is getting low
542 description: "Mountpoint /rootfs on node-1.unittests.com will be full in less than 5 days based on the 48 hour trailing fill rate."
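# The diskspace prediction case feeds node-1 a filesystem that starts with
# 20 GiB free and loses 256 MiB every minute ('21474836480-268435456x48'), while
# node-2 stays flat. predict_linear() extrapolates that trend 5 days ahead
# (3600 * 24 * 5 seconds), which is deeply negative for node-1 (the expected
# sample is about -1.91e12 bytes), so only node-1 raises CephNodeDiskspaceWarning;
# the node_uname_info join simply attaches the nodename used in the description.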
543 # MTU Mismatch
544 - interval: 1m
545 input_series:
546 - series: 'node_network_mtu_bytes{device="eth0",instance="node-exporter",
547 job="node-exporter"}'
548 values: '1500 1500 1500 1500 1500'
549 - series: 'node_network_mtu_bytes{device="eth1",instance="node-exporter",
550 job="node-exporter"}'
551 values: '1500 1500 1500 1500 1500'
552 - series: 'node_network_mtu_bytes{device="eth2",instance="node-exporter",
553 job="node-exporter"}'
554 values: '1500 1500 1500 1500 1500'
555 - series: 'node_network_mtu_bytes{device="eth3",instance="node-exporter",
556 job="node-exporter"}'
557 values: '1500 1500 1500 1500 1500'
558 - series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter",
559 job="node-exporter"}'
560 values: '9000 9000 9000 9000 9000'
561 - series: 'node_network_mtu_bytes{device="eth4",instance="hostname1",
562 job="node-exporter"}'
563 values: '2200 2200 2200 2200 2200'
564 - series: 'node_network_mtu_bytes{device="eth4",instance="hostname2",
565 job="node-exporter"}'
566 values: '2400 2400 2400 2400 2400'
567 - series: 'node_network_up{device="eth0",instance="node-exporter",
568 job="node-exporter"}'
569 values: '0 0 0 0 0'
570 - series: 'node_network_up{device="eth1",instance="node-exporter",
571 job="node-exporter"}'
572 values: '0 0 0 0 0'
573 - series: 'node_network_up{device="eth2",instance="node-exporter",
574 job="node-exporter"}'
575 values: '1 1 1 1 1'
576 - series: 'node_network_up{device="eth3",instance="node-exporter",
577 job="node-exporter"}'
578 values: '1 1 1 1 1'
579 - series: 'node_network_up{device="eth4",instance="node-exporter",
580 job="node-exporter"}'
581 values: '1 1 1 1 1'
582 - series: 'node_network_up{device="eth4",instance="hostname1",
583 job="node-exporter"}'
584 values: '1 1 1 1 1'
585 - series: 'node_network_up{device="eth4",instance="hostname2",
586 job="node-exporter"}'
587 values: '0 0 0 0 0'
588 promql_expr_test:
589 - expr: |
590 node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
591 scalar(
592 max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
593 quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
594 )
595 or
596 node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
597 scalar(
598 min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
599 quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
600 )
601 eval_time: 1m
602 exp_samples:
603 - labels: '{device="eth4", instance="node-exporter", job="node-exporter"}'
604 value: 9000
605 - labels: '{device="eth4", instance="hostname1", job="node-exporter"}'
606 value: 2200
607 alert_rule_test:
608 - eval_time: 1m
609 alertname: CephNodeInconsistentMTU
610 exp_alerts:
611 - exp_labels:
612 device: eth4
613 instance: hostname1
614 job: node-exporter
615 severity: warning
616 type: ceph_default
617 exp_annotations:
618 summary: MTU settings across Ceph hosts are inconsistent
619 description: "Node hostname1 has a different MTU size (2200) than the median of devices named eth4."
620 - exp_labels:
621 device: eth4
622 instance: node-exporter
623 job: node-exporter
624 severity: warning
625 type: ceph_default
626 exp_annotations:
627 summary: MTU settings across Ceph hosts are inconsistent
628 description: "Node node-exporter has a different MTU size (9000) than the median of devices named eth4."
629
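# The MTU case models four NICs (eth0-eth3) on node-exporter at 1500 plus eth4 at
# 9000, and eth4 on two other hosts at 2200 (up) and 2400 (down). The expression
# only considers interfaces whose node_network_up is > 0 and flags those sitting
# at a per-device max or min that differs from the per-device median, so the two
# live eth4 outliers (9000 and 2200) are expected both as samples and as alerts,
# while the down eth4 at 2400 and the down eth0/eth1 are ignored.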
630 # pool full: the input has 6 pool series but the description uses topk(5),
631 # so this checks that the top-5 breakdown works as expected
632 - interval: 1m
633 input_series:
634 - series: 'ceph_health_detail{name="POOL_FULL"}'
635 values: '0 0 0 1 1 1 1 1 1 1 1'
636 - series: 'ceph_pool_percent_used{pool_id="1"}'
637 values: '32+0x10'
638 - series: 'ceph_pool_percent_used{pool_id="2"}'
639 values: '96+0x10'
640 - series: 'ceph_pool_percent_used{pool_id="3"}'
641 values: '90+0x10'
642 - series: 'ceph_pool_percent_used{pool_id="4"}'
643 values: '72+0x10'
644 - series: 'ceph_pool_percent_used{pool_id="5"}'
645 values: '19+0x10'
646 - series: 'ceph_pool_percent_used{pool_id="6"}'
647 values: '10+0x10'
648 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
649 name="cephfs_data",pool_id="1"}'
650 values: '1 1 1 1 1 1 1 1 1'
651 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
652 name="rbd",pool_id="2"}'
653 values: '1 1 1 1 1 1 1 1 1'
654 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
655 name="iscsi",pool_id="3"}'
656 values: '1 1 1 1 1 1 1 1 1'
657 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
658 name="default.rgw.index",pool_id="4"}'
659 values: '1 1 1 1 1 1 1 1 1'
660 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
661 name="default.rgw.log",pool_id="5"}'
662 values: '1 1 1 1 1 1 1 1 1'
663 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
664 name="dummy",pool_id="6"}'
665 values: '1 1 1 1 1 1 1 1 1'
666 promql_expr_test:
667 - expr: ceph_health_detail{name="POOL_FULL"} > 0
668 eval_time: 5m
669 exp_samples:
670 - labels: '{__name__="ceph_health_detail", name="POOL_FULL"}'
671 value: 1
672 alert_rule_test:
673 - eval_time: 1m
674 alertname: CephPoolFull
675 - eval_time: 10m
676 alertname: CephPoolFull
677 exp_alerts:
678 - exp_labels:
679 name: POOL_FULL
680 severity: critical
681 type: ceph_default
682 oid: 1.3.6.1.4.1.50495.1.2.1.9.1
683 exp_annotations:
684 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full
685 summary: Pool is full - writes are blocked
686 description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) - rbd at 96% - iscsi at 90% - default.rgw.index at 72% - cephfs_data at 32% - default.rgw.log at 19% Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)"
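# The pool breakdown in the description above comes from ceph_pool_percent_used:
# the six pools are ranked by usage and only the top five appear (rbd 96%,
# iscsi 90%, default.rgw.index 72%, cephfs_data 32%, default.rgw.log 19%), which
# is exactly what the topk(5) comment at the start of this case is checking; the
# sixth pool ("dummy" at 10%) must be absent.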
687 # slow OSD ops
688 - interval: 1m
689 input_series:
690 - series: 'ceph_healthcheck_slow_ops{instance="ceph:9283",job="ceph"}'
691 values: '1+0x120'
692 promql_expr_test:
693 - expr: ceph_healthcheck_slow_ops > 0
694 eval_time: 1m
695 exp_samples:
696 - labels: '{__name__="ceph_healthcheck_slow_ops", instance="ceph:9283",
697 job="ceph"}'
698 value: 1
699 alert_rule_test:
700 - eval_time: 20m
701 alertname: CephSlowOps
702 exp_alerts:
703 - exp_labels:
704 instance: ceph:9283
705 job: ceph
706 severity: warning
707 type: ceph_default
708 exp_annotations:
709 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
710 summary: OSD operations are slow to complete
711 description: "1 OSD requests are taking too long to process (osd_op_complaint_time exceeded)"
712
713 # slow daemon ops
714 - interval: 1m
715 input_series:
716 - series: 'ceph_daemon_health_metrics{ceph_daemon="osd.1", instance="ceph:9283",job="ceph", type="SLOW_OPS"}'
717 values: '1+0x120'
718 promql_expr_test:
719 - expr: 'ceph_daemon_health_metrics{type="SLOW_OPS"} > 0'
720 eval_time: 1m
721 exp_samples:
722 - labels: '{__name__="ceph_daemon_health_metrics", ceph_daemon="osd.1",instance="ceph:9283",
723 job="ceph", type="SLOW_OPS"}'
724 value: 1
725 alert_rule_test:
726 - eval_time: 20m
727 alertname: CephDaemonSlowOps
728 exp_alerts:
729 - exp_labels:
730 instance: ceph:9283
731 ceph_daemon: "osd.1"
732 job: ceph
733 severity: warning
734 type: ceph_default
735 exp_annotations:
736 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
737 summary: osd.1 operations are slow to complete
738 description: "osd.1 operations are taking too long to process (complaint time exceeded)"
739
740 # CEPHADM orchestrator alert triggers
741 - interval: 30s
742 input_series:
743 - series: 'ceph_health_detail{name="UPGRADE_EXCEPTION"}'
744 values: '1+0x40'
745 promql_expr_test:
746 - expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0
747 eval_time: 2m
748 exp_samples:
749 - labels: '{__name__="ceph_health_detail", name="UPGRADE_EXCEPTION"}'
750 value: 1
751 alert_rule_test:
752 - eval_time: 1m
753 alertname: CephadmUpgradeFailed
754 - eval_time: 5m
755 alertname: CephadmUpgradeFailed
756 exp_alerts:
757 - exp_labels:
758 name: UPGRADE_EXCEPTION
759 severity: critical
760 type: ceph_default
761 oid: 1.3.6.1.4.1.50495.1.2.1.11.2
762 exp_annotations:
763 summary: Ceph version upgrade has failed
764 description: "The cephadm cluster upgrade process has failed. The cluster remains in an undetermined state. Please review the cephadm logs, to understand the nature of the issue"
765 - interval: 30s
766 input_series:
767 - series: 'ceph_health_detail{name="CEPHADM_FAILED_DAEMON"}'
768 values: '1+0x40'
769 promql_expr_test:
770 - expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0
771 eval_time: 2m
772 exp_samples:
773 - labels: '{__name__="ceph_health_detail", name="CEPHADM_FAILED_DAEMON"}'
774 value: 1
775 alert_rule_test:
776 - eval_time: 1m
777 alertname: CephadmDaemonFailed
778 - eval_time: 5m
779 alertname: CephadmDaemonFailed
780 exp_alerts:
781 - exp_labels:
782 name: CEPHADM_FAILED_DAEMON
783 severity: critical
784 type: ceph_default
785 oid: 1.3.6.1.4.1.50495.1.2.1.11.1
786 exp_annotations:
787 summary: A ceph daemon manged by cephadm is down
788 description: "A daemon managed by cephadm is no longer active. Determine, which daemon is down with 'ceph health detail'. you may start daemons with the 'ceph orch daemon start <daemon_id>'"
789 - interval: 1m
790 input_series:
791 - series: 'ceph_health_detail{name="CEPHADM_PAUSED"}'
792 values: '1 1 1 1 1 1 1 1 1'
793 promql_expr_test:
794 - expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0
795 eval_time: 2m
796 exp_samples:
797 - labels: '{__name__="ceph_health_detail", name="CEPHADM_PAUSED"}'
798 value: 1
799 alert_rule_test:
800 - eval_time: 1m
801 alertname: CephadmPaused
802 - eval_time: 5m
803 alertname: CephadmPaused
804 exp_alerts:
805 - exp_labels:
806 name: CEPHADM_PAUSED
807 severity: warning
808 type: ceph_default
809 exp_annotations:
810 documentation: https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused
811 summary: Orchestration tasks via cephadm are PAUSED
812 description: "Cluster management has been paused manually. This will prevent the orchestrator from service management and reconciliation. If this is not intentional, resume cephadm operations with 'ceph orch resume'"
813 # MDS
814 - interval: 1m
815 input_series:
816 - series: 'ceph_health_detail{name="MDS_DAMAGE"}'
817 values: '1 1 1 1 1 1 1 1 1'
818 promql_expr_test:
819 - expr: ceph_health_detail{name="MDS_DAMAGE"} > 0
820 eval_time: 2m
821 exp_samples:
822 - labels: '{__name__="ceph_health_detail", name="MDS_DAMAGE"}'
823 value: 1
824 alert_rule_test:
825 - eval_time: 1m
826 alertname: CephFilesystemDamaged
827 - eval_time: 5m
828 alertname: CephFilesystemDamaged
829 exp_alerts:
830 - exp_labels:
831 name: MDS_DAMAGE
832 severity: critical
833 type: ceph_default
834 oid: 1.3.6.1.4.1.50495.1.2.1.5.1
835 exp_annotations:
836 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
837 summary: CephFS filesystem is damaged.
838 description: "Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support."
839 - interval: 1m
840 input_series:
841 - series: 'ceph_health_detail{name="MDS_HEALTH_READ_ONLY"}'
842 values: '1 1 1 1 1 1 1 1 1'
843 promql_expr_test:
844 - expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0
845 eval_time: 2m
846 exp_samples:
847 - labels: '{__name__="ceph_health_detail", name="MDS_HEALTH_READ_ONLY"}'
848 value: 1
849 alert_rule_test:
850 - eval_time: 1m
851 alertname: CephFilesystemReadOnly
852 - eval_time: 5m
853 alertname: CephFilesystemReadOnly
854 exp_alerts:
855 - exp_labels:
856 name: MDS_HEALTH_READ_ONLY
857 severity: critical
858 type: ceph_default
859 oid: 1.3.6.1.4.1.50495.1.2.1.5.2
860 exp_annotations:
861 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
862 summary: CephFS filesystem in read only mode due to write error(s)
863 description: "The filesystem has switched to READ ONLY due to an unexpected error when writing to the metadata pool. Either analyze the output from the MDS daemon admin socket, or escalate to support."
864 - interval: 1m
865 input_series:
866 - series: 'ceph_health_detail{name="MDS_ALL_DOWN"}'
867 values: '0 0 1 1 1 1 1 1 1 1 1'
868 promql_expr_test:
869 - expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0
870 eval_time: 2m
871 exp_samples:
872 - labels: '{__name__="ceph_health_detail", name="MDS_ALL_DOWN"}'
873 value: 1
874 alert_rule_test:
875 - eval_time: 1m
876 alertname: CephFilesystemOffline
877 - eval_time: 10m
878 alertname: CephFilesystemOffline
879 exp_alerts:
880 - exp_labels:
881 name: MDS_ALL_DOWN
882 severity: critical
883 type: ceph_default
884 oid: 1.3.6.1.4.1.50495.1.2.1.5.3
885 exp_annotations:
886 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down
887 summary: CephFS filesystem is offline
888 description: "All MDS ranks are unavailable. The MDS daemons managing metadata are down, rendering the filesystem offline."
889 - interval: 1m
890 input_series:
891 - series: 'ceph_health_detail{name="FS_DEGRADED"}'
892 values: '0 0 1 1 1 1 1 1 1 1 1'
893 promql_expr_test:
894 - expr: ceph_health_detail{name="FS_DEGRADED"} > 0
895 eval_time: 2m
896 exp_samples:
897 - labels: '{__name__="ceph_health_detail", name="FS_DEGRADED"}'
898 value: 1
899 alert_rule_test:
900 - eval_time: 1m
901 alertname: CephFilesystemDegraded
902 - eval_time: 10m
903 alertname: CephFilesystemDegraded
904 exp_alerts:
905 - exp_labels:
906 name: FS_DEGRADED
907 severity: critical
908 type: ceph_default
909 oid: 1.3.6.1.4.1.50495.1.2.1.5.4
910 exp_annotations:
911 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded
912 summary: CephFS filesystem is degraded
913 description: "One or more metadata daemons (MDS ranks) are failed or in a damaged state. At best the filesystem is partially available, at worst the filesystem is completely unusable."
914 - interval: 1m
915 input_series:
916 - series: 'ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"}'
917 values: '0 0 1 1 1 1 1 1 1 1 1'
918 promql_expr_test:
919 - expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0
920 eval_time: 2m
921 exp_samples:
922 - labels: '{__name__="ceph_health_detail", name="MDS_INSUFFICIENT_STANDBY"}'
923 value: 1
924 alert_rule_test:
925 - eval_time: 1m
926 alertname: CephFilesystemInsufficientStandby
927 - eval_time: 10m
928 alertname: CephFilesystemInsufficientStandby
929 exp_alerts:
930 - exp_labels:
931 name: MDS_INSUFFICIENT_STANDBY
932 severity: warning
933 type: ceph_default
934 exp_annotations:
935 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby
936 summary: Ceph filesystem standby daemons too few
937 description: "The minimum number of standby daemons required by standby_count_wanted is less than the current number of standby daemons. Adjust the standby count or increase the number of MDS daemons."
938 - interval: 1m
939 input_series:
940 - series: 'ceph_health_detail{name="FS_WITH_FAILED_MDS"}'
941 values: '0 0 1 1 1 1 1 1 1 1 1'
942 promql_expr_test:
943 - expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0
944 eval_time: 2m
945 exp_samples:
946 - labels: '{__name__="ceph_health_detail", name="FS_WITH_FAILED_MDS"}'
947 value: 1
948 alert_rule_test:
949 - eval_time: 1m
950 alertname: CephFilesystemFailureNoStandby
951 - eval_time: 10m
952 alertname: CephFilesystemFailureNoStandby
953 exp_alerts:
954 - exp_labels:
955 name: FS_WITH_FAILED_MDS
956 severity: critical
957 type: ceph_default
958 oid: 1.3.6.1.4.1.50495.1.2.1.5.5
959 exp_annotations:
960 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds
961 summary: MDS daemon failed, no further standby available
962 description: "An MDS daemon has failed, leaving only one active rank and no available standby. Investigate the cause of the failure or add a standby MDS."
963 - interval: 1m
964 input_series:
965 - series: 'ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"}'
966 values: '0 0 1 1 1 1 1 1 1 1 1'
967 promql_expr_test:
968 - expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0
969 eval_time: 2m
970 exp_samples:
971 - labels: '{__name__="ceph_health_detail", name="MDS_UP_LESS_THAN_MAX"}'
972 value: 1
973 alert_rule_test:
974 - eval_time: 1m
975 alertname: CephFilesystemMDSRanksLow
976 - eval_time: 10m
977 alertname: CephFilesystemMDSRanksLow
978 exp_alerts:
979 - exp_labels:
980 name: MDS_UP_LESS_THAN_MAX
981 severity: warning
982 type: ceph_default
983 exp_annotations:
984 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max
985 summary: Ceph MDS daemon count is lower than configured
986 description: "The filesystem's 'max_mds' setting defines the number of MDS ranks in the filesystem. The current number of active MDS daemons is less than this value."
987 # MGR
988 - interval: 1m
989 input_series:
990 - series: 'up{job="ceph", instance="ceph-mgr:9283"}'
991 values: '1+0x2 0+0x10'
992 promql_expr_test:
993 - expr: up{job="ceph"} == 0
994 eval_time: 3m
995 exp_samples:
996 - labels: '{__name__="up", job="ceph", instance="ceph-mgr:9283"}'
997 value: 0
998 alert_rule_test:
999 - eval_time: 1m
1000 alertname: CephMgrPrometheusModuleInactive
1001 - eval_time: 10m
1002 alertname: CephMgrPrometheusModuleInactive
1003 exp_alerts:
1004 - exp_labels:
1005 instance: ceph-mgr:9283
1006 job: ceph
1007 severity: critical
1008 type: ceph_default
1009 oid: 1.3.6.1.4.1.50495.1.2.1.6.2
1010 exp_annotations:
1011 summary: The mgr/prometheus module is not available
1012 description: "The mgr/prometheus module at ceph-mgr:9283 is unreachable. This could mean that the module has been disabled or the mgr daemon itself is down. Without the mgr/prometheus module metrics and alerts will no longer function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to to determine whether the mgr is active. If the mgr is not active, restart it, otherwise you can determine module status with 'ceph mgr module ls'. If it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'."
1013 - interval: 1m
1014 input_series:
1015 - series: 'ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"}'
1016 values: '0+0x2 1+0x20'
1017 promql_expr_test:
1018 - expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1
1019 eval_time: 3m
1020 exp_samples:
1021 - labels: '{__name__="ceph_health_detail", name="RECENT_MGR_MODULE_CRASH"}'
1022 value: 1
1023 alert_rule_test:
1024 - eval_time: 1m
1025 alertname: CephMgrModuleCrash
1026 - eval_time: 15m
1027 alertname: CephMgrModuleCrash
1028 exp_alerts:
1029 - exp_labels:
1030 name: RECENT_MGR_MODULE_CRASH
1031 severity: critical
1032 type: ceph_default
1033 oid: 1.3.6.1.4.1.50495.1.2.1.6.1
1034 exp_annotations:
1035 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash
1036 summary: A manager module has recently crashed
1037 description: "One or more mgr modules have crashed and have yet to be acknowledged by an administrator. A crashed module may impact functionality within the cluster. Use the 'ceph crash' command to determine which module has failed, and archive it to acknowledge the failure."
1038 # MON
1039 - interval: 1m
1040 input_series:
1041 - series: 'ceph_health_detail{name="MON_DISK_CRIT"}'
1042 values: '0+0x2 1+0x10'
1043 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}'
1044 values: '1+0x13'
1045 promql_expr_test:
1046 - expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1
1047 eval_time: 3m
1048 exp_samples:
1049 - labels: '{__name__="ceph_health_detail", name="MON_DISK_CRIT"}'
1050 value: 1
1051 alert_rule_test:
1052 - eval_time: 1m
1053 alertname: CephMonDiskspaceCritical
1054 - eval_time: 10m
1055 alertname: CephMonDiskspaceCritical
1056 exp_alerts:
1057 - exp_labels:
1058 name: "MON_DISK_CRIT"
1059 severity: critical
1060 type: ceph_default
1061 oid: 1.3.6.1.4.1.50495.1.2.1.3.2
1062 exp_annotations:
1063 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit
1064 summary: Filesystem space on at least one monitor is critically low
1065 description: "The free space available to a monitor's store is critically low. You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; - ceph-mon-a"
1066 - interval: 1m
1067 input_series:
1068 - series: 'ceph_health_detail{name="MON_DISK_LOW"}'
1069 values: '0+0x2 1+0x10'
1070 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}'
1071 values: '1+0x13'
1072 promql_expr_test:
1073 - expr: ceph_health_detail{name="MON_DISK_LOW"} == 1
1074 eval_time: 3m
1075 exp_samples:
1076 - labels: '{__name__="ceph_health_detail", name="MON_DISK_LOW"}'
1077 value: 1
1078 alert_rule_test:
1079 - eval_time: 1m
1080 alertname: CephMonDiskspaceLow
1081 - eval_time: 10m
1082 alertname: CephMonDiskspaceLow
1083 exp_alerts:
1084 - exp_labels:
1085 name: "MON_DISK_LOW"
1086 severity: warning
1087 type: ceph_default
1088 exp_annotations:
1089 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low
1090 summary: Drive space on at least one monitor is approaching full
1091 description: "The space available to a monitor's store is approaching full (>70% is the default). You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; - ceph-mon-a"
1092 - interval: 1m
1093 input_series:
1094 - series: 'ceph_health_detail{name="MON_CLOCK_SKEW"}'
1095 values: '0+0x2 1+0x10'
1096 promql_expr_test:
1097 - expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1
1098 eval_time: 3m
1099 exp_samples:
1100 - labels: '{__name__="ceph_health_detail", name="MON_CLOCK_SKEW"}'
1101 value: 1
1102 alert_rule_test:
1103 - eval_time: 1m
1104 alertname: CephMonClockSkew
1105 - eval_time: 10m
1106 alertname: CephMonClockSkew
1107 exp_alerts:
1108 - exp_labels:
1109 name: "MON_CLOCK_SKEW"
1110 severity: warning
1111 type: ceph_default
1112 exp_annotations:
1113 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew
1114 summary: Clock skew detected among monitors
1115 description: "Ceph monitors rely on closely synchronized time to maintain quorum and cluster consistency. This event indicates that the time on at least one mon has drifted too far from the lead mon. Review cluster status with ceph -s. This will show which monitors are affected. Check the time sync status on each monitor host with 'ceph time-sync-status' and the state and peers of your ntpd or chrony daemon."
1116
1117 # Check 3 mons one down, quorum at risk
1118 - interval: 1m
1119 input_series:
1120 - series: 'ceph_health_detail{name="MON_DOWN"}'
1121 values: '0+0x2 1+0x12'
1122 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}'
1123 values: '1+0x14'
1124 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}'
1125 values: '1+0x14'
1126 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}'
1127 values: '1+0x2 0+0x12'
1128 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}'
1129 values: '1+0x14'
1130 - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}'
1131 values: '1+0x14'
1132 - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}'
1133 values: '1+0x14'
1134 promql_expr_test:
1135 - expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() (count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1))) == 1
1136 eval_time: 3m
1137 exp_samples:
1138 - labels: '{}'
1139 value: 1
1140 alert_rule_test:
1141 - eval_time: 1m
1142 alertname: CephMonDownQuorumAtRisk
1143 # shouldn't fire at 1m (no exp_alerts); expected to fire at 10m below
1144 - eval_time: 10m
1145 alertname: CephMonDownQuorumAtRisk
1146 exp_alerts:
1147 - exp_labels:
1148 severity: critical
1149 type: ceph_default
1150 oid: 1.3.6.1.4.1.50495.1.2.1.3.1
1151 exp_annotations:
1152 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
1153 summary: Monitor quorum is at risk
1154 description: "Quorum requires a majority of monitors (x 2) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: - mon.c on ceph-mon-3"
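# The quorum-at-risk expression above combines two checks: MON_DOWN must be set,
# and the number of mons still in quorum must equal the bare minimum majority,
# floor(count(ceph_mon_metadata) / 2) + 1. With 3 mons and mon.c out of quorum,
# 2 == floor(3/2) + 1, so the '* on()' product is 1 and CephMonDownQuorumAtRisk
# fires; with more mons still in quorum the bool comparison would be 0 and this
# critical alert would stay silent, leaving only the CephMonDown warning below.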
1155 # check 5 mons, 1 down - warning only
1156 - interval: 1m
1157 input_series:
1158 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}'
1159 values: '1+0x14'
1160 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}'
1161 values: '1+0x14'
1162 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}'
1163 values: '1+0x14'
1164 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.d"}'
1165 values: '1+0x14'
1166 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.e"}'
1167 values: '1+0x2 0+0x12'
1168 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}'
1169 values: '1+0x14'
1170 - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}'
1171 values: '1+0x14'
1172 - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}'
1173 values: '1+0x14'
1174 - series: 'ceph_mon_metadata{ceph_daemon="mon.d", hostname="ceph-mon-4"}'
1175 values: '1+0x14'
1176 - series: 'ceph_mon_metadata{ceph_daemon="mon.e", hostname="ceph-mon-5"}'
1177 values: '1+0x14'
1178 promql_expr_test:
1179 - expr: (count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1))
1180 eval_time: 3m
1181 exp_samples:
1182 - labels: '{}'
1183 value: 1
1184 alert_rule_test:
1185 - eval_time: 1m
1186 alertname: CephMonDown
1187 - eval_time: 10m
1188 alertname: CephMonDown
1189 exp_alerts:
1190 - exp_labels:
1191 severity: warning
1192 type: ceph_default
1193 exp_annotations:
1194 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
1195 summary: One or more monitors down
1196 description: "You have 1 monitor down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: - mon.e on ceph-mon-5\n"
1197 # Device Health
1198 - interval: 1m
1199 input_series:
1200 - series: 'ceph_health_detail{name="DEVICE_HEALTH"}'
1201 values: '0+0x2 1+0x10'
1202 promql_expr_test:
1203 - expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1
1204 eval_time: 3m
1205 exp_samples:
1206 - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH"}'
1207 value: 1
1208 alert_rule_test:
1209 - eval_time: 1m
1210 alertname: CephDeviceFailurePredicted
1211 - eval_time: 10m
1212 alertname: CephDeviceFailurePredicted
1213 exp_alerts:
1214 - exp_labels:
1215 name: "DEVICE_HEALTH"
1216 severity: warning
1217 type: ceph_default
1218 exp_annotations:
1219 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2
1220 summary: Device(s) predicted to fail soon
1221 description: "The device health module has determined that one or more devices will fail soon. To review device status use 'ceph device ls'. To show a specific device use 'ceph device info <dev id>'. Mark the OSD out so that data may migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace the device, and redeploy the OSD."
1222 - interval: 1m
1223 input_series:
1224 - series: 'ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"}'
1225 values: '0+0x2 1+0x10'
1226 promql_expr_test:
1227 - expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1
1228 eval_time: 3m
1229 exp_samples:
1230 - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_TOOMANY"}'
1231 value: 1
1232 alert_rule_test:
1233 - eval_time: 1m
1234 alertname: CephDeviceFailurePredictionTooHigh
1235 - eval_time: 10m
1236 alertname: CephDeviceFailurePredictionTooHigh
1237 exp_alerts:
1238 - exp_labels:
1239 name: "DEVICE_HEALTH_TOOMANY"
1240 severity: critical
1241 type: ceph_default
1242 oid: 1.3.6.1.4.1.50495.1.2.1.4.7
1243 exp_annotations:
1244 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany
1245 summary: Too many devices are predicted to fail, unable to resolve
1246 description: "The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availabililty. Prevent data integrity issues by adding new OSDs so that data may be relocated."
1247 - interval: 1m
1248 input_series:
1249 - series: 'ceph_health_detail{name="DEVICE_HEALTH_IN_USE"}'
1250 values: '0+0x2 1+0x10'
1251 promql_expr_test:
1252 - expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1
1253 eval_time: 3m
1254 exp_samples:
1255 - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_IN_USE"}'
1256 value: 1
1257 alert_rule_test:
1258 - eval_time: 1m
1259 alertname: CephDeviceFailureRelocationIncomplete
1260 - eval_time: 10m
1261 alertname: CephDeviceFailureRelocationIncomplete
1262 exp_alerts:
1263 - exp_labels:
1264 name: "DEVICE_HEALTH_IN_USE"
1265 severity: warning
1266 type: ceph_default
1267 exp_annotations:
1268 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use
1269 summary: Device failure is predicted, but unable to relocate data
1270 description: "The device health module has determined that one or more devices will fail soon, but the normal process of relocating the data on the device to other OSDs in the cluster is blocked. \nEnsure that the cluster has available free space. It may be necessary to add capacity to the cluster to allow data from the failing device to successfully migrate, or to enable the balancer."
1271 # OSD
1272 - interval: 1m
1273 input_series:
1274 - series: 'ceph_health_detail{name="OSD_HOST_DOWN"}'
1275 values: '0+0x2 1+0x10'
1276 - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
1277 values: '1+0x2 0+0x10'
1278 - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}'
1279 values: '1+0x12'
1280 promql_expr_test:
1281 - expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1
1282 eval_time: 3m
1283 exp_samples:
1284 - labels: '{__name__="ceph_health_detail", name="OSD_HOST_DOWN"}'
1285 value: 1
1286 alert_rule_test:
1287 - eval_time: 1m
1288 alertname: CephOSDHostDown
1289 - eval_time: 10m
1290 alertname: CephOSDHostDown
1291 exp_alerts:
1292 - exp_labels:
1293 name: "OSD_HOST_DOWN"
1294 severity: warning
1295 type: ceph_default
1296 oid: 1.3.6.1.4.1.50495.1.2.1.4.8
1297 exp_annotations:
1298 summary: An OSD host is offline
1299 description: "The following OSDs are down: - ceph-osd-1 : osd.0"
1300 - interval: 1m
1301 input_series:
1302 - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"}'
1303 values: '0+0x2 1+0x20'
1304 promql_expr_test:
1305 - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 0
1306 eval_time: 1m
1307 exp_samples:
1308 - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_FRONT"}'
1309 value: 0
1310 alert_rule_test:
1311 - eval_time: 1m
1312 alertname: CephOSDTimeoutsPublicNetwork
1313 - eval_time: 10m
1314 alertname: CephOSDTimeoutsPublicNetwork
1315 exp_alerts:
1316 - exp_labels:
1317 name: "OSD_SLOW_PING_TIME_FRONT"
1318 severity: warning
1319 type: ceph_default
1320 exp_annotations:
1321 summary: Network issues delaying OSD heartbeats (public network)
1322 description: "OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network for latency or loss issues. Use 'ceph health detail' to show the affected OSDs."
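# Note that this case (and the two that follow) deliberately evaluates its PromQL
# expression as '== 0' at 1m, i.e. before the health flag flips: it checks the
# quiet state of the series, while the alert_rule_test at 10m still expects the
# warning once the value has been 1 for long enough.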
1323 - interval: 1m
1324 input_series:
1325 - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"}'
1326 values: '0+0x2 1+0x20'
1327 promql_expr_test:
1328 - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 0
1329 eval_time: 1m
1330 exp_samples:
1331 - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_BACK"}'
1332 value: 0
1333 alert_rule_test:
1334 - eval_time: 1m
1335 alertname: CephOSDTimeoutsClusterNetwork
1336 - eval_time: 10m
1337 alertname: CephOSDTimeoutsClusterNetwork
1338 exp_alerts:
1339 - exp_labels:
1340 name: "OSD_SLOW_PING_TIME_BACK"
1341 severity: warning
1342 type: ceph_default
1343 exp_annotations:
1344 summary: Network issues delaying OSD heartbeats (cluster network)
1345 description: "OSD heartbeats on the cluster's 'cluster' network (backend) are slow. Investigate the network for latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs."
1346 - interval: 1m
1347 input_series:
1348 - series: 'ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"}'
1349 values: '0+0x2 1+0x20'
1350 promql_expr_test:
1351 - expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 0
1352 eval_time: 1m
1353 exp_samples:
1354 - labels: '{__name__="ceph_health_detail", name="BLUESTORE_DISK_SIZE_MISMATCH"}'
1355 value: 0
1356 alert_rule_test:
1357 - eval_time: 1m
1358 alertname: CephOSDInternalDiskSizeMismatch
1359 - eval_time: 10m
1360 alertname: CephOSDInternalDiskSizeMismatch
1361 exp_alerts:
1362 - exp_labels:
1363 name: "BLUESTORE_DISK_SIZE_MISMATCH"
1364 severity: warning
1365 type: ceph_default
1366 exp_annotations:
1367 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch
1368 summary: OSD size inconsistency error
1369 description: "One or more OSDs have an internal inconsistency between metadata and the size of the device. This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs."
1370 - interval: 30s
1371 input_series:
1372 - series: 'ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"}'
1373 values: '0+0x2 1+0x20'
1374 promql_expr_test:
1375 - expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1
1376 eval_time: 3m
1377 exp_samples:
1378 - labels: '{__name__="ceph_health_detail", name="BLUESTORE_SPURIOUS_READ_ERRORS"}'
1379 value: 1
1380 alert_rule_test:
1381 - eval_time: 1m
1382 alertname: CephOSDReadErrors
1383 - eval_time: 10m
1384 alertname: CephOSDReadErrors
1385 exp_alerts:
1386 - exp_labels:
1387 name: "BLUESTORE_SPURIOUS_READ_ERRORS"
1388 severity: warning
1389 type: ceph_default
1390 exp_annotations:
1391 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors
1392 summary: Device read errors detected
1393 description: "An OSD has encountered read errors, but the OSD has recovered by retrying the reads. This may indicate an issue with hardware or the kernel."
1394 - interval: 1m
1395 input_series:
1396 - series: 'ceph_health_detail{name="OSD_DOWN"}'
1397 values: '0+0x2 1+0x10'
1398 - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
1399 values: '1+0x12'
1400 - series: 'ceph_osd_up{ceph_daemon="osd.1"}'
1401 values: '1+0x2 0+0x10'
1402 - series: 'ceph_osd_up{ceph_daemon="osd.2"}'
1403 values: '1+0x12'
1404 - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}'
1405 values: '1+0x12'
1406 - series: 'ceph_osd_metadata{ceph_daemon="osd.1", hostname="ceph-osd-2"}'
1407 values: '1+0x12'
1408 - series: 'ceph_osd_metadata{ceph_daemon="osd.2", hostname="ceph-osd-3"}'
1409 values: '1+0x12'
1410 promql_expr_test:
1411 - expr: ceph_health_detail{name="OSD_DOWN"} == 1
1412 eval_time: 3m
1413 exp_samples:
1414 - labels: '{__name__="ceph_health_detail", name="OSD_DOWN"}'
1415 value: 1
1416 alert_rule_test:
1417 - eval_time: 1m
1418 alertname: CephOSDDown
1419 - eval_time: 10m
1420 alertname: CephOSDDown
1421 exp_alerts:
1422 - exp_labels:
1423 name: "OSD_DOWN"
1424 severity: warning
1425 type: ceph_default
1426 oid: 1.3.6.1.4.1.50495.1.2.1.4.2
1427 exp_annotations:
1428 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down
1429 summary: An OSD has been marked down
1430 description: "1 OSD down for over 5mins. The following OSD is down: - osd.1 on ceph-osd-2\n"
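# The description above names the host (ceph-osd-2) as well as the daemon,
# which is why this test also feeds ceph_osd_up and ceph_osd_metadata. A
# typical way to resolve a down OSD to its host is a PromQL join on
# ceph_daemon -- an illustrative expression, not necessarily the one used by
# the rule:
#   (ceph_osd_up == 0) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
# which copies the hostname label from the metadata series onto the result.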
1431 - interval: 1m
1432 input_series:
1433 - series: 'ceph_health_detail{name="OSD_NEARFULL"}'
1434 values: '0+0x2 1+0x10'
1435 promql_expr_test:
1436 - expr: ceph_health_detail{name="OSD_NEARFULL"} == 1
1437 eval_time: 3m
1438 exp_samples:
1439 - labels: '{__name__="ceph_health_detail", name="OSD_NEARFULL"}'
1440 value: 1
1441 alert_rule_test:
1442 - eval_time: 1m
1443 alertname: CephOSDNearFull
1444 - eval_time: 10m
1445 alertname: CephOSDNearFull
1446 exp_alerts:
1447 - exp_labels:
1448 name: "OSD_NEARFULL"
1449 severity: warning
1450 type: ceph_default
1451 oid: 1.3.6.1.4.1.50495.1.2.1.4.3
1452 exp_annotations:
1453 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull
1454 summary: OSD(s) running low on free space (NEARFULL)
1455 description: One or more OSDs have reached the NEARFULL threshold. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.
1456 - interval: 1m
1457 input_series:
1458 - series: 'ceph_health_detail{name="OSD_FULL"}'
1459 values: '0+0x2 1+0x10'
1460 promql_expr_test:
1461 - expr: ceph_health_detail{name="OSD_FULL"} == 1
1462 eval_time: 3m
1463 exp_samples:
1464 - labels: '{__name__="ceph_health_detail", name="OSD_FULL"}'
1465 value: 1
1466 alert_rule_test:
1467 - eval_time: 1m
1468 alertname: CephOSDFull
1469 - eval_time: 10m
1470 alertname: CephOSDFull
1471 exp_alerts:
1472 - exp_labels:
1473 name: "OSD_FULL"
1474 severity: critical
1475 type: ceph_default
1476 oid: 1.3.6.1.4.1.50495.1.2.1.4.6
1477 exp_annotations:
1478 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full
1479 summary: OSD full, writes blocked
1480 description: An OSD has reached the FULL threshold. Writes to pools that share the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.
1481 - interval: 1m
1482 input_series:
1483 - series: 'ceph_health_detail{name="OSD_BACKFILLFULL"}'
1484 values: '0+0x2 1+0x10'
1485 promql_expr_test:
1486 - expr: ceph_health_detail{name="OSD_BACKFILLFULL"} == 1
1487 eval_time: 3m
1488 exp_samples:
1489 - labels: '{__name__="ceph_health_detail", name="OSD_BACKFILLFULL"}'
1490 value: 1
1491 alert_rule_test:
1492 - eval_time: 1m
1493 alertname: CephOSDBackfillFull
1494 - eval_time: 10m
1495 alertname: CephOSDBackfillFull
1496 exp_alerts:
1497 - exp_labels:
1498 name: "OSD_BACKFILLFULL"
1499 severity: warning
1500 type: ceph_default
1501 exp_annotations:
1502 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull
1503 summary: OSD(s) too full for backfill operations
1504 description: "An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations from completing. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data."
1505 - interval: 30s
1506 input_series:
1507 - series: 'ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"}'
1508 values: '0+0x2 1+0x20'
1509 promql_expr_test:
1510 - expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 0
1511 eval_time: 1m
1512 exp_samples:
1513 - labels: '{__name__="ceph_health_detail", name="OSD_TOO_MANY_REPAIRS"}'
1514 value: 0
1515 alert_rule_test:
1516 - eval_time: 1m
1517 alertname: CephOSDTooManyRepairs
1518 - eval_time: 10m
1519 alertname: CephOSDTooManyRepairs
1520 exp_alerts:
1521 - exp_labels:
1522 name: "OSD_TOO_MANY_REPAIRS"
1523 severity: warning
1524 type: ceph_default
1525 exp_annotations:
1526 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs
1527 summary: OSD reports a high number of read errors
1528 description: Reads from an OSD have used a secondary PG to return data to the client, indicating a potential failing drive.
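# The alert rules exercised by these tests are defined in
# ../prometheus_alerts.yml. For the simple health-check alerts they have
# roughly the following shape -- an illustrative sketch only, with the 'for'
# duration assumed rather than copied from the real rule:
#   - alert: CephOSDTooManyRepairs
#     expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 1
#     for: 30s
#     labels:
#       severity: warning
#       type: ceph_default
#     annotations:
#       summary: OSD reports a high number of read errors
#       description: ...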
1529 # Pools
1530 # percent-full growth prediction: pools 1 and 2 are fed on two instances, but only pool 1 on instance 8090 grows fast enough to trigger the alert (the arithmetic behind the expected value is sketched after this test)
1531 - interval: 12h
1532 input_series:
1533 - series: 'ceph_pool_percent_used{pool_id="1", instance="9090"}'
1534 values: '1 1 1 1 1'
1535 - series: 'ceph_pool_percent_used{pool_id="1", instance="8090"}'
1536 values: '78 89 79 98 78'
1537 - series: 'ceph_pool_percent_used{pool_id="2", instance="9090"}'
1538 values: '1 1 1 1 1'
1539 - series: 'ceph_pool_percent_used{pool_id="2", instance="8090"}'
1540 values: '22 22 23 23 24'
1541 - series: 'ceph_pool_metadata{pool_id="1", instance="9090", name="rbd", type="replicated"}'
1542 values: '1 1 1 1 1'
1543 - series: 'ceph_pool_metadata{pool_id="1", instance="8090",name="default.rgw.index",type="replicated"}'
1544 values: '1 1 1 1 1'
1545 - series: 'ceph_pool_metadata{pool_id="2", instance="9090", name="rbd", type="replicated"}'
1546 values: '1 1 1 1 1'
1547 - series: 'ceph_pool_metadata{pool_id="2", instance="8090",name="default.rgw.index",type="replicated"}'
1548 values: '1 1 1 1 1'
1549 promql_expr_test:
1550 - expr: |
1551 (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance)
1552 group_right() ceph_pool_metadata) >= 95
1553 eval_time: 36h
1554 exp_samples:
1555 - labels: '{instance="8090",name="default.rgw.index",pool_id="1",type="replicated"}'
1556 value: 1.435E+02 # ~143.5%
1557 alert_rule_test:
1558 - eval_time: 48h
1559 alertname: CephPoolGrowthWarning
1560 exp_alerts:
1561 - exp_labels:
1562 instance: 8090
1563 name: default.rgw.index
1564 pool_id: 1
1565 severity: warning
1566 type: ceph_default
1567 oid: 1.3.6.1.4.1.50495.1.2.1.9.2
1568 exp_annotations:
1569 summary: Pool growth rate may soon exceed capacity
1570 description: Pool 'default.rgw.index' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
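# How predict_linear arrives at the expected 1.435E+02 above: at eval_time 36h
# the [2d] window holds the pool_id="1",instance="8090" samples 78 89 79 98
# (12h apart). A least-squares fit over those points gives a slope of 10% per
# day and an estimated current value of 93.5%, so extrapolating 3600*24*5
# seconds (5 days) ahead yields 93.5 + 5*10 = 143.5, clearing the >= 95
# threshold. The join with ceph_pool_metadata only attaches the pool's
# name/type labels, which is how 'default.rgw.index' ends up in the alert.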
1571 - interval: 1m
1572 input_series:
1573 - series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}'
1574 values: '0+0x2 1+0x10'
1575 promql_expr_test:
1576 - expr: ceph_health_detail{name="POOL_BACKFILLFULL"} == 1
1577 eval_time: 3m
1578 exp_samples:
1579 - labels: '{__name__="ceph_health_detail", name="POOL_BACKFILLFULL"}'
1580 value: 1
1581 alert_rule_test:
1582 - eval_time: 1m
1583 alertname: CephPoolBackfillFull
1584 - eval_time: 5m
1585 alertname: CephPoolBackfillFull
1586 exp_alerts:
1587 - exp_labels:
1588 name: "POOL_BACKFILLFULL"
1589 severity: warning
1590 type: ceph_default
1591 exp_annotations:
1592 summary: Free space in a pool is too low for recovery/backfill
1593 description: A pool is approaching the near full threshold, which will prevent recovery/backfill operations from completing. Consider adding more capacity.
1594
1595 - interval: 1m
1596 input_series:
1597 - series: 'ceph_health_detail{name="POOL_NEAR_FULL"}'
1598 values: '0+0x2 1+0x10'
1599 promql_expr_test:
1600 - expr: ceph_health_detail{name="POOL_NEAR_FULL"} == 1
1601 eval_time: 3m
1602 exp_samples:
1603 - labels: '{__name__="ceph_health_detail", name="POOL_NEAR_FULL"}'
1604 value: 1
1605 alert_rule_test:
1606 - eval_time: 1m
1607 alertname: CephPoolNearFull
1608 - eval_time: 10m
1609 alertname: CephPoolNearFull
1610 exp_alerts:
1611 - exp_labels:
1612 name: "POOL_NEAR_FULL"
1613 severity: warning
1614 type: ceph_default
1615 exp_annotations:
1616 summary: One or more Ceph pools are nearly full
1617 description: "A pool has exceeded the warning (percent full) threshold, or OSDs supporting the pool have reached the NEARFULL threshold. Writes may continue, but you are at risk of the pool going read-only if more capacity isn't made available. Determine the affected pool with 'ceph df detail', looking at QUOTA BYTES and STORED. Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>). Also ensure that the balancer is active."
1618
1619 # PGs
1620 - interval: 1m
1621 input_series:
1622 - series: 'ceph_health_detail{name="PG_NOT_SCRUBBED"}'
1623 values: '0+0x2 1+0x10'
1624 promql_expr_test:
1625 - expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1
1626 eval_time: 3m
1627 exp_samples:
1628 - labels: '{__name__="ceph_health_detail", name="PG_NOT_SCRUBBED"}'
1629 value: 1
1630 alert_rule_test:
1631 - eval_time: 1m
1632 alertname: CephPGNotScrubbed
1633 - eval_time: 10m
1634 alertname: CephPGNotScrubbed
1635 exp_alerts:
1636 - exp_labels:
1637 name: "PG_NOT_SCRUBBED"
1638 severity: warning
1639 type: ceph_default
1640 exp_annotations:
1641 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed
1642 summary: Placement group(s) have not been scrubbed
1643 description: "One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, protecting against bit-rot. They check that metadata is consistent across data replicas. When PGs miss their scrub interval, it may indicate that the scrub window is too small, or PGs were not in a 'clean' state during the scrub window. You can manually initiate a scrub with: ceph pg scrub <pgid>"
1644 - interval: 1m
1645 input_series:
1646 - series: 'ceph_health_detail{name="PG_DAMAGED"}'
1647 values: '0+0x4 1+0x20'
1648 promql_expr_test:
1649 - expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1
1650 eval_time: 5m
1651 exp_samples:
1652 - labels: '{__name__="ceph_health_detail", name="PG_DAMAGED"}'
1653 value: 1
1654 alert_rule_test:
1655 - eval_time: 1m
1656 alertname: CephPGsDamaged
1657 - eval_time: 10m
1658 alertname: CephPGsDamaged
1659 exp_alerts:
1660 - exp_labels:
1661 name: "PG_DAMAGED"
1662 severity: critical
1663 type: ceph_default
1664 oid: 1.3.6.1.4.1.50495.1.2.1.7.4
1665 exp_annotations:
1666 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged
1667 summary: Placement group damaged, manual intervention needed
1668 description: During data consistency checks (scrub), at least one PG has been flagged as being damaged or inconsistent. Check to see which PG is affected, and attempt a manual repair if necessary. To list problematic placement groups, use 'rados list-inconsistent-pg <pool>'. To repair PGs use the 'ceph pg repair <pg_num>' command.
1669 - interval: 1m
1670 input_series:
1671 - series: 'ceph_health_detail{name="TOO_MANY_PGS"}'
1672 values: '0+0x4 1+0x20'
1673 promql_expr_test:
1674 - expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1
1675 eval_time: 5m
1676 exp_samples:
1677 - labels: '{__name__="ceph_health_detail", name="TOO_MANY_PGS"}'
1678 value: 1
1679 alert_rule_test:
1680 - eval_time: 1m
1681 alertname: CephPGsHighPerOSD
1682 - eval_time: 10m
1683 alertname: CephPGsHighPerOSD
1684 exp_alerts:
1685 - exp_labels:
1686 name: "TOO_MANY_PGS"
1687 severity: warning
1688 type: ceph_default
1689 exp_annotations:
1690 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs
1691 summary: Placement groups per OSD is too high
1692 description: "The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).\n Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools."
1693 - interval: 1m
1694 input_series:
1695 - series: 'ceph_health_detail{name="PG_RECOVERY_FULL"}'
1696 values: '0+0x2 1+0x20'
1697 promql_expr_test:
1698 - expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 0
1699 eval_time: 1m
1700 exp_samples:
1701 - labels: '{__name__="ceph_health_detail", name="PG_RECOVERY_FULL"}'
1702 value: 0
1703 alert_rule_test:
1704 - eval_time: 1m
1705 alertname: CephPGRecoveryAtRisk
1706 - eval_time: 10m
1707 alertname: CephPGRecoveryAtRisk
1708 exp_alerts:
1709 - exp_labels:
1710 name: "PG_RECOVERY_FULL"
1711 severity: critical
1712 type: ceph_default
1713 oid: 1.3.6.1.4.1.50495.1.2.1.7.5
1714 exp_annotations:
1715 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full
1716 summary: OSDs are too full for recovery
1717 description: Data redundancy is at risk since one or more OSDs are at or above the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, or delete unwanted data.
1718 - interval: 1m
1719 input_series:
1720 - series: 'ceph_health_detail{name="PG_BACKFILL_FULL"}'
1721 values: '0+0x2 1+0x20'
1722 promql_expr_test:
1723 - expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 0
1724 eval_time: 1m
1725 exp_samples:
1726 - labels: '{__name__="ceph_health_detail", name="PG_BACKFILL_FULL"}'
1727 value: 0
1728 alert_rule_test:
1729 - eval_time: 1m
1730 alertname: CephPGBackfillAtRisk
1731 - eval_time: 10m
1732 alertname: CephPGBackfillAtRisk
1733 exp_alerts:
1734 - exp_labels:
1735 name: "PG_BACKFILL_FULL"
1736 severity: critical
1737 type: ceph_default
1738 oid: 1.3.6.1.4.1.50495.1.2.1.7.6
1739 exp_annotations:
1740 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full
1741 summary: Backfill operations are blocked due to lack of free space
1742 description: Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add more capacity, or delete unwanted data.
1743 - interval: 1m
1744 input_series:
1745 - series: 'ceph_health_detail{name="PG_AVAILABILITY"}'
1746 values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1'
1747 - series: 'ceph_health_detail{name="OSD_DOWN"}'
1748 values: '0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0'
1749 promql_expr_test:
1750 - expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"}))
1751 eval_time: 1m
1752 # empty set at 1m
1753 exp_samples:
1754 alert_rule_test:
1755 # neither PG_AVAILABILITY nor OSD_DOWN is firing, so no alert
1756 - eval_time: 1m
1757 alertname: CephPGUnavilableBlockingIO
1758 exp_alerts:
1759 # PG_AVAILABILITY is firing, but OSD_DOWN is also active, so no alert
1760 - eval_time: 5m
1761 alertname: CephPGUnavilableBlockingIO
1762 exp_alerts:
1763 # PG_AVAILABILITY is firing and OSD_DOWN is no longer active, so raise the alert
1764 - eval_time: 15m
1765 alertname: CephPGUnavilableBlockingIO
1766 exp_alerts:
1767 - exp_labels:
1768 name: "PG_AVAILABILITY"
1769 severity: critical
1770 type: ceph_default
1771 oid: 1.3.6.1.4.1.50495.1.2.1.7.3
1772 exp_annotations:
1773 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability
1774 summary: PG is unavailable, blocking I/O
1775 description: Data availability is reduced, impacting the cluster's ability to service I/O. One or more placement groups (PGs) are in a state that blocks I/O.
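# The expression above gates PG_AVAILABILITY on OSD_DOWN:
# (PG_AVAILABILITY == 1) - scalar(OSD_DOWN) is 1 only when PG_AVAILABILITY is
# set while OSD_DOWN is 0 (it is 1 - 1 = 0 while OSDs are down), so the alert
# stays quiet while CephOSDDown already explains the reduced availability and
# fires only once PG_AVAILABILITY persists on its own, as at the 15m check.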
1776 - interval: 1m
1777 input_series:
1778 - series: 'ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"}'
1779 values: '0+0x2 1+0x10'
1780 promql_expr_test:
1781 - expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1
1782 eval_time: 3m
1783 exp_samples:
1784 - labels: '{__name__="ceph_health_detail", name="PG_NOT_DEEP_SCRUBBED"}'
1785 value: 1
1786 alert_rule_test:
1787 - eval_time: 1m
1788 alertname: CephPGNotDeepScrubbed
1789 - eval_time: 10m
1790 alertname: CephPGNotDeepScrubbed
1791 exp_alerts:
1792 - exp_labels:
1793 name: "PG_NOT_DEEP_SCRUBBED"
1794 severity: warning
1795 type: ceph_default
1796 exp_annotations:
1797 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed
1798 summary: Placement group(s) have not been deep scrubbed
1799 description: One or more PGs have not been deep scrubbed recently. Deep scrubs protect against bit-rot. They compare data replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate that the window is too small or PGs were not in a 'clean' state during the deep-scrub window.
1800
1801 # Prometheus
1802 - interval: 1m
1803 input_series:
1804 - series: 'up{job="myjob"}'
1805 values: '1+0x10'
1806 promql_expr_test:
1807 - expr: absent(up{job="ceph"})
1808 eval_time: 1m
1809 exp_samples:
1810 - labels: '{job="ceph"}'
1811 value: 1
1812 alert_rule_test:
1813 - eval_time: 5m
1814 alertname: PrometheusJobMissing
1815 exp_alerts:
1816 - exp_labels:
1817 job: ceph
1818 severity: critical
1819 type: ceph_default
1820 oid: 1.3.6.1.4.1.50495.1.2.1.12.1
1821 exp_annotations:
1822 summary: The scrape job for Ceph is missing from Prometheus
1823 description: The prometheus job that scrapes from Ceph is no longer defined, this will effectively mean you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance.
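# absent(up{job="ceph"}) returns a single sample with value 1, carrying the
# labels that can be inferred from the selector ({job="ceph"}), whenever no
# matching series exists. The input deliberately provides only up{job="myjob"},
# so the Ceph scrape job is missing for the whole test window.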
1824 # RADOS
1825 - interval: 1m
1826 input_series:
1827 - series: 'ceph_health_detail{name="OBJECT_UNFOUND"}'
1828 values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1829 - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
1830 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1831 - series: 'ceph_osd_up{ceph_daemon="osd.1"}'
1832 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1833 - series: 'ceph_osd_up{ceph_daemon="osd.2"}'
1834 values: '1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1835 - series: 'ceph_osd_metadata{ceph_daemon="osd.0"}'
1836 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1837 - series: 'ceph_osd_metadata{ceph_daemon="osd.1"}'
1838 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1839 - series: 'ceph_osd_metadata{ceph_daemon="osd.2"}'
1840 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1841 promql_expr_test:
1842 - expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1
1843 eval_time: 1m
1844 exp_samples:
1845 alert_rule_test:
1846 # OBJECT_UNFOUND but osd.2 is down, so don't fire
1847 - eval_time: 5m
1848 alertname: CephObjectMissing
1849 exp_alerts:
1850 # OBJECT_UNFOUND and all OSDs are online, so fire
1851 - eval_time: 15m
1852 alertname: CephObjectMissing
1853 exp_alerts:
1854 - exp_labels:
1855 severity: critical
1856 type: ceph_default
1857 oid: 1.3.6.1.4.1.50495.1.2.1.10.1
1858 exp_annotations:
1859 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound
1860 summary: Object(s) marked UNFOUND
1861 description: The latest version of a RADOS object can not be found, even though all OSDs are up. I/O requests for this object from clients will block (hang). Resolving this issue may require the object to be rolled back to a prior version manually, and manually verified.
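# The OBJECT_UNFOUND flag is multiplied (via '* on()') by the boolean
# comparison count(ceph_osd_up == 1) == bool count(ceph_osd_metadata), which is
# 0 while osd.2 is down and 1 once every known OSD is up again. The trailing
# '== 1' therefore keeps the alert silent at the 5m check and lets it fire at
# 15m, when an object is still unfound even though no OSD is down.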
1862 # Generic Alerts
1863 - interval: 1m
1864 input_series:
1865 - series: 'ceph_health_detail{name="RECENT_CRASH"}'
1866 values: '0 0 0 1 1 1 1 1 1 1 1'
1867 promql_expr_test:
1868 - expr: ceph_health_detail{name="RECENT_CRASH"} == 1
1869 eval_time: 1m
1870 exp_samples:
1871 alert_rule_test:
1872 # not firing
1873 - eval_time: 1m
1874 alertname: CephDaemonCrash
1875 exp_alerts:
1876 # firing
1877 - eval_time: 10m
1878 alertname: CephDaemonCrash
1879 exp_alerts:
1880 - exp_labels:
1881 name: RECENT_CRASH
1882 severity: critical
1883 type: ceph_default
1884 oid: 1.3.6.1.4.1.50495.1.2.1.1.2
1885 exp_annotations:
1886 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash
1887 summary: One or more Ceph daemons have crashed, and are pending acknowledgement
1888 description: One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive <id>' command.
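# These unit tests are evaluated with promtool against the rules listed under
# rule_files at the top of this file; assuming promtool is on PATH, they can
# be run from this directory with:
#   promtool test rules test_alerts.yml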