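# ceph/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
# Unit tests for the alert rules in ../prometheus_alerts.yml (typically run with 'promtool test rules').
# An alert_rule_test entry without exp_alerts asserts that the named alert is NOT firing at that eval_time.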
1 rule_files:
2 - ../prometheus_alerts.yml
3 evaluation_interval: 5m
4 tests:
5 # health error
6 - interval: 5m
7 input_series:
8 - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
9 values: '2 2 2 2 2 2 2'
10 promql_expr_test:
11 - expr: ceph_health_status == 2
12 eval_time: 5m
13 exp_samples:
14 - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
15 value: 2
16 alert_rule_test:
17 - eval_time: 1m
18 alertname: CephHealthError
19 - eval_time: 6m
20 alertname: CephHealthError
21 exp_alerts:
22 - exp_labels:
23 instance: ceph:9283
24 job: ceph
25 oid: 1.3.6.1.4.1.50495.1.2.1.2.1
26 type: ceph_default
27 severity: critical
28 exp_annotations:
29 summary: Ceph is in the ERROR state
30 description: The cluster state has been HEALTH_ERROR for more than 5 minutes. Please check 'ceph health detail' for more information.
31
32 # health warning
33 - interval: 5m
34 input_series:
35 - series: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
36 values: '1 1 1 1 1 1 1 1 1 1'
37 promql_expr_test:
38 - expr: ceph_health_status == 1
39 eval_time: 15m
40 exp_samples:
41 - labels: 'ceph_health_status{instance="ceph:9283",job="ceph"}'
42 value: 1
43 alert_rule_test:
44 - eval_time: 10m
45 alertname: CephHealthWarning
46 - eval_time: 20m
47 alertname: CephHealthWarning
48 exp_alerts:
49 - exp_labels:
50 instance: ceph:9283
51 job: ceph
52 type: ceph_default
53 severity: warning
54 exp_annotations:
55 summary: Ceph is in the WARNING state
56 description: The cluster state has been HEALTH_WARN for more than 15 minutes. Please check 'ceph health detail' for more information.
57
58 # 10% OSDs down
59 - interval: 1m
60 input_series:
61 - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
62 values: '1 1 1 1 1'
63 - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
64 values: '0 0 0 0 0'
65 - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
66 values: '1 1 1 1 1'
67 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
68 ceph_version="ceph version 17.0.0-189-g3558fd72
69 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
70 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
71 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
72 public_addr="172.20.0.2"}'
73 values: '1 1 1 1 1'
74 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
75 ceph_version="ceph version 17.0.0-189-g3558fd72
76 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
77 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
78 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
79 public_addr="172.20.0.2"}'
80 values: '1 1 1 1 1'
81 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
82 ceph_version="ceph version 17.0.0-189-g3558fd72
83 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
84 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
85 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
86 public_addr="172.20.0.2"}'
87 values: '1 1 1 1 1'
88 promql_expr_test:
89 - expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10
90 eval_time: 1m
91 exp_samples:
92 - labels: '{}'
93 value: 3.333333333333333E+01
94 alert_rule_test:
95 - eval_time: 1m
96 alertname: CephOSDDownHigh
97 exp_alerts:
98 - exp_labels:
99 oid: 1.3.6.1.4.1.50495.1.2.1.4.1
100 type: ceph_default
101 severity: critical
102 exp_annotations:
103 summary: More than 10% of OSDs are down
104 description: "33.33% or 1 of 3 OSDs are down (>= 10%). The following OSDs are down: - osd.1 on ceph"
105
106 # flapping OSD
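# Note: series values use promtool expanding notation, 'a+bxN' = a, a+b, ..., a+N*b.
# Here '1+1x100' makes ceph_osd_up for osd.0 increase at every 1s sample, so
# rate(ceph_osd_up[5m]) * 60 stays above 1, which is how this test models an OSD
# flapping more than once a minute; osd.1 and osd.2 ('1+0x100') stay constant.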
107 - interval: 1s
108 input_series:
109 - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}'
110 values: '1+1x100'
111 - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}'
112 values: '1+0x100'
113 - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}'
114 values: '1+0x100'
115 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
116 ceph_version="ceph version 17.0.0-189-g3558fd72
117 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
118 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
119 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
120 public_addr="172.20.0.2"}'
121 values: '1 1 1 1 1 1'
122 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
123 ceph_version="ceph version 17.0.0-189-g3558fd72
124 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
125 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
126 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
127 public_addr="172.20.0.2"}'
128 values: '1 1 1 1 1 1'
129 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
130 ceph_version="ceph version 17.0.0-189-g3558fd72
131 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
132 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
133 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
134 public_addr="172.20.0.2"}'
135 values: '1 1 1 1 1 1'
136 promql_expr_test:
137 - expr: |
138 (
139 rate(ceph_osd_up[5m])
140 * on(ceph_daemon) group_left(hostname) ceph_osd_metadata
141 ) * 60 > 1
142 eval_time: 1m
143 exp_samples:
144 - labels: '{ceph_daemon="osd.0", hostname="ceph", instance="ceph:9283",
145 job="ceph"}'
146 value: 1.2200000000000001E+01
147 alert_rule_test:
148 - eval_time: 5m
149 alertname: CephOSDFlapping
150 exp_alerts:
151 - exp_labels:
152 ceph_daemon: osd.0
153 hostname: ceph
154 instance: ceph:9283
155 job: ceph
156 oid: 1.3.6.1.4.1.50495.1.2.1.4.4
157 severity: warning
158 type: ceph_default
159 exp_annotations:
160 documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds
161 summary: Network issues are causing OSDs to flap (mark each other down)
162 description: "OSD osd.0 on ceph was marked down and back up 20.1 times once a minute for 5 minutes. This may indicate a network issue (latency, packet loss, MTU mismatch) on the cluster network, or the public network if no cluster network is deployed. Check the network stats on the listed host(s)."
163
164 # high pg count deviation
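# At the 5m sample osd.1 holds 320 PGs against an average of 200 across the four OSDs,
# a 60% deviation (> 30%), so only osd.1 is expected to raise CephPGImbalance.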
165 - interval: 1m
166 input_series:
167 - series: 'ceph_osd_numpg{ceph_daemon="osd.0",instance="ceph:9283",
168 job="ceph"}'
169 values: '100 100 100 100 100 160'
170 - series: 'ceph_osd_numpg{ceph_daemon="osd.1",instance="ceph:9283",
171 job="ceph"}'
172 values: '100 100 100 100 100 320'
173 - series: 'ceph_osd_numpg{ceph_daemon="osd.2",instance="ceph:9283",
174 job="ceph"}'
175 values: '100 100 100 100 100 160'
176 - series: 'ceph_osd_numpg{ceph_daemon="osd.3",instance="ceph:9283",
177 job="ceph"}'
178 values: '100 100 100 100 100 160'
179 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0",
180 ceph_version="ceph version 17.0.0-189-g3558fd72
181 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
182 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
183 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
184 public_addr="172.20.0.2"}'
185 values: '1 1 1 1 1 1'
186 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1",
187 ceph_version="ceph version 17.0.0-189-g3558fd72
188 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
189 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
190 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
191 public_addr="172.20.0.2"}'
192 values: '1 1 1 1 1 1'
193 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2",
194 ceph_version="ceph version 17.0.0-189-g3558fd72
195 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
196 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
197 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
198 public_addr="172.20.0.2"}'
199 values: '1 1 1 1 1 1'
200 - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.3",
201 ceph_version="ceph version 17.0.0-189-g3558fd72
202 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)",
203 cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0",
204 hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore",
205 public_addr="172.20.0.2"}'
206 values: '1 1 1 1 1 1'
207 promql_expr_test:
208 - expr: |
209 abs(
210 (
211 (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0)
212 by (job)
213 ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
214 ) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
215
216 eval_time: 5m
217 exp_samples:
218 - labels: '{ceph_daemon="osd.1", hostname="ceph", instance="ceph:9283",
219 job="ceph"}'
220 value: 6E-01
221 alert_rule_test:
222 - eval_time: 10m
223 alertname: CephPGImbalance
224 exp_alerts:
225 - exp_labels:
226 ceph_daemon: osd.1
227 hostname: ceph
228 instance: ceph:9283
229 job: ceph
230 oid: 1.3.6.1.4.1.50495.1.2.1.4.5
231 severity: warning
232 type: ceph_default
233 exp_annotations:
234 summary: PGs are not balanced across OSDs
235 description: "OSD osd.1 on ceph deviates by more than 30% from average PG count."
236
237 # pgs inactive
238 - interval: 1m
239 input_series:
240 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
241 name="device_health_metrics",pool_id="1"}'
242 values: '1 1 1 1 1 1 1 1'
243 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
244 name="device_health_metrics",pool_id="2"}'
245 values: '1 1 1 1 1 1 1 1'
246 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
247 name="device_health_metrics",pool_id="3"}'
248 values: '1 1 1 1 1 1 1 1'
249 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
250 values: '1 1 1 1 1 1 1 1'
251 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
252 values: '32 32 32 32 32 32 32 32'
253 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
254 values: '33 32 32 32 32 33 33 32'
255 - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="1"}'
256 values: '1 1 1 1 1 1 1 1 1'
257 - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="2"}'
258 values: '32 32 32 32 32 32 32 32'
259 - series: 'ceph_pg_active{instance="ceph:9283",job="ceph",pool_id="3"}'
260 values: '32 32 32 32 32 32 32 32'
261 promql_expr_test:
262 - expr: ceph_pool_metadata * on(pool_id,instance) group_left()
263 (ceph_pg_total - ceph_pg_active) > 0
264 eval_time: 5m
265 exp_samples:
266 - labels: '{instance="ceph:9283", job="ceph",
267 name="device_health_metrics",
268 pool_id="3"}'
269 value: 1
270 alert_rule_test:
271 - eval_time: 5m
272 alertname: CephPGsInactive
273 exp_alerts:
274 - exp_labels:
275 instance: ceph:9283
276 job: ceph
277 name: device_health_metrics
278 oid: 1.3.6.1.4.1.50495.1.2.1.7.1
279 pool_id: 3
280 severity: critical
281 type: ceph_default
282 exp_annotations:
283 summary: One or more placement groups are inactive
284 description: "1 PGs have been inactive for more than 5 minutes in pool device_health_metrics. Inactive placement groups are not able to serve read/write requests."
285
286 # pgs unclean
287 - interval: 1m
288 input_series:
289 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
290 name="device_health_metrics",pool_id="1"}'
291 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
292 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
293 name="device_health_metrics",pool_id="2"}'
294 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
295 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
296 name="device_health_metrics",pool_id="3"}'
297 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
298 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="1"}'
299 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
300 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="2"}'
301 values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
302 32 32 32'
303 - series: 'ceph_pg_total{instance="ceph:9283",job="ceph",pool_id="3"}'
304 values: '33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33
305 33 33'
306 - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="1"}'
307 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
308 - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="2"}'
309 values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
310 32 32'
311 - series: 'ceph_pg_clean{instance="ceph:9283",job="ceph",pool_id="3"}'
312 values: '32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
313 32 32'
314 promql_expr_test:
315 - expr: ceph_pool_metadata * on(pool_id,instance) group_left()
316 (ceph_pg_total - ceph_pg_clean) > 0
317 eval_time: 15m
318 exp_samples:
319 - labels: '{instance="ceph:9283", job="ceph",
320 name="device_health_metrics", pool_id="3"}'
321 value: 1
322 alert_rule_test:
323 - eval_time: 16m
324 alertname: CephPGsUnclean
325 exp_alerts:
326 - exp_labels:
327 instance: ceph:9283
328 job: ceph
329 name: device_health_metrics
330 oid: 1.3.6.1.4.1.50495.1.2.1.7.2
331 pool_id: 3
332 severity: warning
333 type: ceph_default
334 exp_annotations:
335 summary: One or more placement groups are marked unclean
336 description: "1 PGs have been unclean for more than 15 minutes in pool device_health_metrics. Unclean PGs have not recovered from a previous failure."
337
338 # root volume full
339 - interval: 1m
340 input_series:
341 - series: 'node_filesystem_avail_bytes{device="/dev/mapper/fedora_localhost
342 --live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
343 mountpoint="/"}'
344 values: '35336400896 35336400896 35336400896 35336400896 35336400896
345 3525385519.104 3533640089'
346 - series: 'node_filesystem_size_bytes{device="/dev/mapper/fedora_localhost
347 --live-home",fstype="ext4",instance="node-exporter",job="node-exporter",
348 mountpoint="/"}'
349 values: '73445531648 73445531648 73445531648 73445531648 73445531648
350 73445531648 73445531648'
351 promql_expr_test:
352 - expr: node_filesystem_avail_bytes{mountpoint="/"} /
353 node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
354 eval_time: 5m
355 exp_samples:
356 - labels: '{device="/dev/mapper/fedora_localhost --live-home",
357 fstype="ext4", instance="node-exporter", job="node-exporter",
358 mountpoint="/"}'
359 value: 4.8E+00
360 alert_rule_test:
361 - eval_time: 10m
362 alertname: CephNodeRootFilesystemFull
363 exp_alerts:
364 - exp_labels:
365 device: /dev/mapper/fedora_localhost --live-home
366 fstype: ext4
367 instance: node-exporter
368 job: node-exporter
369 mountpoint: /
370 oid: 1.3.6.1.4.1.50495.1.2.1.8.1
371 severity: critical
372 type: ceph_default
373 exp_annotations:
374 summary: Root filesystem is dangerously full
375 description: "Root volume is dangerously full: 4.811% free."
376
377 # network packets dropped
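# Drops grow by 600/min and packets by 750/min in each direction, so the drop ratio is
# (10+10)/(12.5+12.5) = 0.8 and the absolute drop rate is 20 pkt/s, satisfying both
# halves of the CephNodeNetworkPacketDrops expression.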
378 - interval: 1m
379 input_series:
380 - series: 'node_network_receive_drop_total{device="eth0",
381 instance="node-exporter",job="node-exporter"}'
382 values: '0+600x10'
383 - series: 'node_network_transmit_drop_total{device="eth0",
384 instance="node-exporter",job="node-exporter"}'
385 values: '0+600x10'
386 - series: 'node_network_receive_packets_total{device="eth0",
387 instance="node-exporter",job="node-exporter"}'
388 values: '0+750x10'
389 - series: 'node_network_transmit_packets_total{device="eth0",
390 instance="node-exporter",job="node-exporter"}'
391 values: '0+750x10'
392 promql_expr_test:
393 - expr: |
394 (
395 rate(node_network_receive_drop_total{device!="lo"}[1m]) +
396 rate(node_network_transmit_drop_total{device!="lo"}[1m])
397 ) / (
398 rate(node_network_receive_packets_total{device!="lo"}[1m]) +
399 rate(node_network_transmit_packets_total{device!="lo"}[1m])
400 ) >= 0.0050000000000000001 and (
401 rate(node_network_receive_drop_total{device!="lo"}[1m]) +
402 rate(node_network_transmit_drop_total{device!="lo"}[1m])
403 ) >= 10
404
405 eval_time: 5m
406 exp_samples:
407 - labels: '{device="eth0", instance="node-exporter",
408 job="node-exporter"}'
409 value: 8E-1
410 alert_rule_test:
411 - eval_time: 5m
412 alertname: CephNodeNetworkPacketDrops
413 exp_alerts:
414 - exp_labels:
415 device: eth0
416 instance: node-exporter
417 job: node-exporter
418 oid: 1.3.6.1.4.1.50495.1.2.1.8.2
419 severity: warning
420 type: ceph_default
421 exp_annotations:
422 summary: One or more NICs reports packet drops
423 description: "Node node-exporter experiences packet drop > 0.5% or > 10 packets/s on interface eth0."
424
425 # network packets errors
426 - interval: 1m
427 input_series:
428 - series: 'node_network_receive_errs_total{device="eth0",
429 instance="node-exporter",job="node-exporter"}'
430 values: '0+600x10'
431 - series: 'node_network_transmit_errs_total{device="eth0",
432 instance="node-exporter",job="node-exporter"}'
433 values: '0+600x10'
434 - series: 'node_network_transmit_packets_total{device="eth0",
435 instance="node-exporter",job="node-exporter"}'
436 values: '0+750x10'
437 - series: 'node_network_receive_packets_total{device="eth0",
438 instance="node-exporter",job="node-exporter"}'
439 values: '0+750x10'
440 promql_expr_test:
441 - expr: |
442 (
443 rate(node_network_receive_errs_total{device!="lo"}[1m]) +
444 rate(node_network_transmit_errs_total{device!="lo"}[1m])
445 ) / (
446 rate(node_network_receive_packets_total{device!="lo"}[1m]) +
447 rate(node_network_transmit_packets_total{device!="lo"}[1m])
448 ) >= 0.0001 or (
449 rate(node_network_receive_errs_total{device!="lo"}[1m]) +
450 rate(node_network_transmit_errs_total{device!="lo"}[1m])
451 ) >= 10
452
453 eval_time: 5m
454 exp_samples:
455 - labels: '{device="eth0", instance="node-exporter",
456 job="node-exporter"}'
457 value: 8E-01
458 alert_rule_test:
459 - eval_time: 5m
460 alertname: CephNodeNetworkPacketErrors
461 exp_alerts:
462 - exp_labels:
463 device: eth0
464 instance: node-exporter
465 job: node-exporter
466 oid: 1.3.6.1.4.1.50495.1.2.1.8.3
467 severity: warning
468 type: ceph_default
469 exp_annotations:
470 summary: One or more NICs reports packet errors
471 description: "Node node-exporter experiences packet errors > 0.01% or > 10 packets/s on interface eth0."
472
473 # Node Storage disk space filling up
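# node-1's free space falls by 256MB every minute, so predict_linear extrapolated
# 5 days ahead (3600 * 24 * 5 s) is deeply negative and should alert; node-2 stays flat
# and should not.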
474 - interval: 1m
475 # 20GB = 21474836480, 256MB = 268435456
476 input_series:
477 - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root",
478 fstype="xfs",instance="node-1",mountpoint="/rootfs"}'
479 values: '21474836480-268435456x48'
480 - series: 'node_filesystem_free_bytes{device="/dev/mapper/vg-root",
481 fstype="xfs",instance="node-2",mountpoint="/rootfs"}'
482 values: '21474836480+0x48'
483 - series: 'node_uname_info{instance="node-1", nodename="node-1.unittests.com"}'
484 values: 1+0x48
485 - series: 'node_uname_info{instance="node-2", nodename="node-2.unittests.com"}'
486 values: 1+0x48
487 promql_expr_test:
488 - expr: |
489 predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) *
490 on(instance) group_left(nodename) node_uname_info < 0
491 eval_time: 5m
492 exp_samples:
493 - labels: '{device="/dev/mapper/vg-root",instance="node-1",fstype="xfs",
494 mountpoint="/rootfs",nodename="node-1.unittests.com"}'
495 value: -1.912602624E+12
496 alert_rule_test:
497 - eval_time: 5m
498 alertname: CephNodeDiskspaceWarning
499 exp_alerts:
500 - exp_labels:
501 severity: warning
502 type: ceph_default
503 oid: 1.3.6.1.4.1.50495.1.2.1.8.4
504 device: /dev/mapper/vg-root
505 fstype: xfs
506 instance: node-1
507 mountpoint: /rootfs
508 nodename: node-1.unittests.com
509 exp_annotations:
510 summary: Host filesystem free space is getting low
511 description: "Mountpoint /rootfs on node-1.unittests.com will be full in less than 5 days based on the 48 hour trailing fill rate."
512 # MTU Mismatch
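# eth4 is reported with different MTUs across hosts (9000 on node-exporter, 2200 on hostname1,
# 2400 on hostname2). hostname2's eth4 is down (node_network_up == 0) and is excluded, so the
# two remaining eth4 values both differ from the eth4 median and each raises an alert.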
513 - interval: 1m
514 input_series:
515 - series: 'node_network_mtu_bytes{device="eth0",instance="node-exporter",
516 job="node-exporter"}'
517 values: '1500 1500 1500 1500 1500'
518 - series: 'node_network_mtu_bytes{device="eth1",instance="node-exporter",
519 job="node-exporter"}'
520 values: '1500 1500 1500 1500 1500'
521 - series: 'node_network_mtu_bytes{device="eth2",instance="node-exporter",
522 job="node-exporter"}'
523 values: '1500 1500 1500 1500 1500'
524 - series: 'node_network_mtu_bytes{device="eth3",instance="node-exporter",
525 job="node-exporter"}'
526 values: '1500 1500 1500 1500 1500'
527 - series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter",
528 job="node-exporter"}'
529 values: '9000 9000 9000 9000 9000'
530 - series: 'node_network_mtu_bytes{device="eth4",instance="hostname1",
531 job="node-exporter"}'
532 values: '2200 2200 2200 2200 2200'
533 - series: 'node_network_mtu_bytes{device="eth4",instance="hostname2",
534 job="node-exporter"}'
535 values: '2400 2400 2400 2400 2400'
536 - series: 'node_network_up{device="eth0",instance="node-exporter",
537 job="node-exporter"}'
538 values: '0 0 0 0 0'
539 - series: 'node_network_up{device="eth1",instance="node-exporter",
540 job="node-exporter"}'
541 values: '0 0 0 0 0'
542 - series: 'node_network_up{device="eth2",instance="node-exporter",
543 job="node-exporter"}'
544 values: '1 1 1 1 1'
545 - series: 'node_network_up{device="eth3",instance="node-exporter",
546 job="node-exporter"}'
547 values: '1 1 1 1 1'
548 - series: 'node_network_up{device="eth4",instance="node-exporter",
549 job="node-exporter"}'
550 values: '1 1 1 1 1'
551 - series: 'node_network_up{device="eth4",instance="hostname1",
552 job="node-exporter"}'
553 values: '1 1 1 1 1'
554 - series: 'node_network_up{device="eth4",instance="hostname2",
555 job="node-exporter"}'
556 values: '0 0 0 0 0'
557 promql_expr_test:
558 - expr: |
559 node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
560 scalar(
561 max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
562 quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
563 )
564 or
565 node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
566 scalar(
567 min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
568 quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
569 )
570 eval_time: 1m
571 exp_samples:
572 - labels: '{device="eth4", instance="node-exporter", job="node-exporter"}'
573 value: 9000
574 - labels: '{device="eth4", instance="hostname1", job="node-exporter"}'
575 value: 2200
576 alert_rule_test:
577 - eval_time: 1m
578 alertname: CephNodeInconsistentMTU
579 exp_alerts:
580 - exp_labels:
581 device: eth4
582 instance: hostname1
583 job: node-exporter
584 severity: warning
585 type: ceph_default
586 exp_annotations:
587 summary: MTU settings across Ceph hosts are inconsistent
588 description: "Node hostname1 has a different MTU size (2200) than the median of devices named eth4."
589 - exp_labels:
590 device: eth4
591 instance: node-exporter
592 job: node-exporter
593 severity: warning
594 type: ceph_default
595 exp_annotations:
596 summary: MTU settings across Ceph hosts are inconsistent
597 description: "Node node-exporter has a different MTU size (9000) than the median of devices named eth4."
598
599 # pool full: the input data has 6 pool series but the alert description reports topk(5),
600 # to ensure the top-5 breakdown works as expected
601 - interval: 1m
602 input_series:
603 - series: 'ceph_health_detail{name="POOL_FULL"}'
604 values: '0 0 0 1 1 1 1 1 1 1 1'
605 - series: 'ceph_pool_percent_used{pool_id="1"}'
606 values: '32+0x10'
607 - series: 'ceph_pool_percent_used{pool_id="2"}'
608 values: '96+0x10'
609 - series: 'ceph_pool_percent_used{pool_id="3"}'
610 values: '90+0x10'
611 - series: 'ceph_pool_percent_used{pool_id="4"}'
612 values: '72+0x10'
613 - series: 'ceph_pool_percent_used{pool_id="5"}'
614 values: '19+0x10'
615 - series: 'ceph_pool_percent_used{pool_id="6"}'
616 values: '10+0x10'
617 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
618 name="cephfs_data",pool_id="1"}'
619 values: '1 1 1 1 1 1 1 1 1'
620 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
621 name="rbd",pool_id="2"}'
622 values: '1 1 1 1 1 1 1 1 1'
623 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
624 name="iscsi",pool_id="3"}'
625 values: '1 1 1 1 1 1 1 1 1'
626 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
627 name="default.rgw.index",pool_id="4"}'
628 values: '1 1 1 1 1 1 1 1 1'
629 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
630 name="default.rgw.log",pool_id="5"}'
631 values: '1 1 1 1 1 1 1 1 1'
632 - series: 'ceph_pool_metadata{instance="ceph:9283",job="ceph",
633 name="dummy",pool_id="6"}'
634 values: '1 1 1 1 1 1 1 1 1'
635 promql_expr_test:
636 - expr: ceph_health_detail{name="POOL_FULL"} > 0
637 eval_time: 5m
638 exp_samples:
639 - labels: '{__name__="ceph_health_detail", name="POOL_FULL"}'
640 value: 1
641 alert_rule_test:
642 - eval_time: 1m
643 alertname: CephPoolFull
644 - eval_time: 10m
645 alertname: CephPoolFull
646 exp_alerts:
647 - exp_labels:
648 name: POOL_FULL
649 severity: critical
650 type: ceph_default
651 oid: 1.3.6.1.4.1.50495.1.2.1.9.1
652 exp_annotations:
653 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full
654 summary: Pool is full - writes are blocked
655 description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) - rbd at 96% - iscsi at 90% - default.rgw.index at 72% - cephfs_data at 32% - default.rgw.log at 19% Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)"
656 # slow OSD ops
657 - interval: 1m
658 input_series:
659 - series: 'ceph_healthcheck_slow_ops{instance="ceph:9283",job="ceph"}'
660 values: '1+0x120'
661 promql_expr_test:
662 - expr: ceph_healthcheck_slow_ops > 0
663 eval_time: 1m
664 exp_samples:
665 - labels: '{__name__="ceph_healthcheck_slow_ops", instance="ceph:9283",
666 job="ceph"}'
667 value: 1
668 alert_rule_test:
669 - eval_time: 20m
670 alertname: CephSlowOps
671 exp_alerts:
672 - exp_labels:
673 instance: ceph:9283
674 job: ceph
675 severity: warning
676 type: ceph_default
677 exp_annotations:
678 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
679 summary: OSD operations are slow to complete
680 description: "1 OSD requests are taking too long to process (osd_op_complaint_time exceeded)"
681
682 # CEPHADM orchestrator alert triggers
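# These tests exercise ceph_health_detail, with the orchestrator health-check name
# (UPGRADE_EXCEPTION, CEPHADM_FAILED_DAEMON, CEPHADM_PAUSED) carried in the 'name' label.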
683 - interval: 30s
684 input_series:
685 - series: 'ceph_health_detail{name="UPGRADE_EXCEPTION"}'
686 values: '1+0x40'
687 promql_expr_test:
688 - expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0
689 eval_time: 2m
690 exp_samples:
691 - labels: '{__name__="ceph_health_detail", name="UPGRADE_EXCEPTION"}'
692 value: 1
693 alert_rule_test:
694 - eval_time: 1m
695 alertname: CephadmUpgradeFailed
696 - eval_time: 5m
697 alertname: CephadmUpgradeFailed
698 exp_alerts:
699 - exp_labels:
700 name: UPGRADE_EXCEPTION
701 severity: critical
702 type: ceph_default
703 oid: 1.3.6.1.4.1.50495.1.2.1.11.2
704 exp_annotations:
705 summary: Ceph version upgrade has failed
706 description: "The cephadm cluster upgrade process has failed. The cluster remains in an undetermined state. Please review the cephadm logs to understand the nature of the issue"
707 - interval: 30s
708 input_series:
709 - series: 'ceph_health_detail{name="CEPHADM_FAILED_DAEMON"}'
710 values: '1+0x40'
711 promql_expr_test:
712 - expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0
713 eval_time: 2m
714 exp_samples:
715 - labels: '{__name__="ceph_health_detail", name="CEPHADM_FAILED_DAEMON"}'
716 value: 1
717 alert_rule_test:
718 - eval_time: 1m
719 alertname: CephadmDaemonFailed
720 - eval_time: 5m
721 alertname: CephadmDaemonFailed
722 exp_alerts:
723 - exp_labels:
724 name: CEPHADM_FAILED_DAEMON
725 severity: critical
726 type: ceph_default
727 oid: 1.3.6.1.4.1.50495.1.2.1.11.1
728 exp_annotations:
729 summary: A ceph daemon managed by cephadm is down
730 description: "A daemon managed by cephadm is no longer active. Determine which daemon is down with 'ceph health detail'. You may start daemons with 'ceph orch daemon start <daemon_id>'"
731 - interval: 1m
732 input_series:
733 - series: 'ceph_health_detail{name="CEPHADM_PAUSED"}'
734 values: '1 1 1 1 1 1 1 1 1'
735 promql_expr_test:
736 - expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0
737 eval_time: 2m
738 exp_samples:
739 - labels: '{__name__="ceph_health_detail", name="CEPHADM_PAUSED"}'
740 value: 1
741 alert_rule_test:
742 - eval_time: 1m
743 alertname: CephadmPaused
744 - eval_time: 5m
745 alertname: CephadmPaused
746 exp_alerts:
747 - exp_labels:
748 name: CEPHADM_PAUSED
749 severity: warning
750 type: ceph_default
751 exp_annotations:
752 documentation: https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused
753 summary: Orchestration tasks via cephadm are PAUSED
754 description: "Cluster management has been paused manually. This prevents the orchestrator from performing service management and reconciliation. If this is not intentional, resume cephadm operations with 'ceph orch resume'"
755 # MDS
756 - interval: 1m
757 input_series:
758 - series: 'ceph_health_detail{name="MDS_DAMAGE"}'
759 values: '1 1 1 1 1 1 1 1 1'
760 promql_expr_test:
761 - expr: ceph_health_detail{name="MDS_DAMAGE"} > 0
762 eval_time: 2m
763 exp_samples:
764 - labels: '{__name__="ceph_health_detail", name="MDS_DAMAGE"}'
765 value: 1
766 alert_rule_test:
767 - eval_time: 1m
768 alertname: CephFilesystemDamaged
769 - eval_time: 5m
770 alertname: CephFilesystemDamaged
771 exp_alerts:
772 - exp_labels:
773 name: MDS_DAMAGE
774 severity: critical
775 type: ceph_default
776 oid: 1.3.6.1.4.1.50495.1.2.1.5.1
777 exp_annotations:
778 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
779 summary: CephFS filesystem is damaged
780 description: "Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support."
781 - interval: 1m
782 input_series:
783 - series: 'ceph_health_detail{name="MDS_HEALTH_READ_ONLY"}'
784 values: '1 1 1 1 1 1 1 1 1'
785 promql_expr_test:
786 - expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0
787 eval_time: 2m
788 exp_samples:
789 - labels: '{__name__="ceph_health_detail", name="MDS_HEALTH_READ_ONLY"}'
790 value: 1
791 alert_rule_test:
792 - eval_time: 1m
793 alertname: CephFilesystemReadOnly
794 - eval_time: 5m
795 alertname: CephFilesystemReadOnly
796 exp_alerts:
797 - exp_labels:
798 name: MDS_HEALTH_READ_ONLY
799 severity: critical
800 type: ceph_default
801 oid: 1.3.6.1.4.1.50495.1.2.1.5.2
802 exp_annotations:
803 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
804 summary: CephFS filesystem in read only mode due to write error(s)
805 description: "The filesystem has switched to READ ONLY due to an unexpected error when writing to the metadata pool. Either analyze the output from the MDS daemon admin socket, or escalate to support."
806 - interval: 1m
807 input_series:
808 - series: 'ceph_health_detail{name="MDS_ALL_DOWN"}'
809 values: '0 0 1 1 1 1 1 1 1 1 1'
810 promql_expr_test:
811 - expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0
812 eval_time: 2m
813 exp_samples:
814 - labels: '{__name__="ceph_health_detail", name="MDS_ALL_DOWN"}'
815 value: 1
816 alert_rule_test:
817 - eval_time: 1m
818 alertname: CephFilesystemOffline
819 - eval_time: 10m
820 alertname: CephFilesystemOffline
821 exp_alerts:
822 - exp_labels:
823 name: MDS_ALL_DOWN
824 severity: critical
825 type: ceph_default
826 oid: 1.3.6.1.4.1.50495.1.2.1.5.3
827 exp_annotations:
828 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down
829 summary: CephFS filesystem is offline
830 description: "All MDS ranks are unavailable. The MDS daemons managing metadata are down, rendering the filesystem offline."
831 - interval: 1m
832 input_series:
833 - series: 'ceph_health_detail{name="FS_DEGRADED"}'
834 values: '0 0 1 1 1 1 1 1 1 1 1'
835 promql_expr_test:
836 - expr: ceph_health_detail{name="FS_DEGRADED"} > 0
837 eval_time: 2m
838 exp_samples:
839 - labels: '{__name__="ceph_health_detail", name="FS_DEGRADED"}'
840 value: 1
841 alert_rule_test:
842 - eval_time: 1m
843 alertname: CephFilesystemDegraded
844 - eval_time: 10m
845 alertname: CephFilesystemDegraded
846 exp_alerts:
847 - exp_labels:
848 name: FS_DEGRADED
849 severity: critical
850 type: ceph_default
851 oid: 1.3.6.1.4.1.50495.1.2.1.5.4
852 exp_annotations:
853 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded
854 summary: CephFS filesystem is degraded
855 description: "One or more metadata daemons (MDS ranks) are failed or in a damaged state. At best the filesystem is partially available, at worst the filesystem is completely unusable."
856 - interval: 1m
857 input_series:
858 - series: 'ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"}'
859 values: '0 0 1 1 1 1 1 1 1 1 1'
860 promql_expr_test:
861 - expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0
862 eval_time: 2m
863 exp_samples:
864 - labels: '{__name__="ceph_health_detail", name="MDS_INSUFFICIENT_STANDBY"}'
865 value: 1
866 alert_rule_test:
867 - eval_time: 1m
868 alertname: CephFilesystemInsufficientStandby
869 - eval_time: 10m
870 alertname: CephFilesystemInsufficientStandby
871 exp_alerts:
872 - exp_labels:
873 name: MDS_INSUFFICIENT_STANDBY
874 severity: warning
875 type: ceph_default
876 exp_annotations:
877 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby
878 summary: Ceph filesystem standby daemons too few
879 description: "The minimum number of standby daemons required by standby_count_wanted is less than the current number of standby daemons. Adjust the standby count or increase the number of MDS daemons."
880 - interval: 1m
881 input_series:
882 - series: 'ceph_health_detail{name="FS_WITH_FAILED_MDS"}'
883 values: '0 0 1 1 1 1 1 1 1 1 1'
884 promql_expr_test:
885 - expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0
886 eval_time: 2m
887 exp_samples:
888 - labels: '{__name__="ceph_health_detail", name="FS_WITH_FAILED_MDS"}'
889 value: 1
890 alert_rule_test:
891 - eval_time: 1m
892 alertname: CephFilesystemFailureNoStandby
893 - eval_time: 10m
894 alertname: CephFilesystemFailureNoStandby
895 exp_alerts:
896 - exp_labels:
897 name: FS_WITH_FAILED_MDS
898 severity: critical
899 type: ceph_default
900 oid: 1.3.6.1.4.1.50495.1.2.1.5.5
901 exp_annotations:
902 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds
903 summary: MDS daemon failed, no further standby available
904 description: "An MDS daemon has failed, leaving only one active rank and no available standby. Investigate the cause of the failure or add a standby MDS."
905 - interval: 1m
906 input_series:
907 - series: 'ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"}'
908 values: '0 0 1 1 1 1 1 1 1 1 1'
909 promql_expr_test:
910 - expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0
911 eval_time: 2m
912 exp_samples:
913 - labels: '{__name__="ceph_health_detail", name="MDS_UP_LESS_THAN_MAX"}'
914 value: 1
915 alert_rule_test:
916 - eval_time: 1m
917 alertname: CephFilesystemMDSRanksLow
918 - eval_time: 10m
919 alertname: CephFilesystemMDSRanksLow
920 exp_alerts:
921 - exp_labels:
922 name: MDS_UP_LESS_THAN_MAX
923 severity: warning
924 type: ceph_default
925 exp_annotations:
926 documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max
927 summary: Ceph MDS daemon count is lower than configured
928 description: "The filesystem's 'max_mds' setting defines the number of MDS ranks in the filesystem. The current number of active MDS daemons is less than this value."
929 # MGR
930 - interval: 1m
931 input_series:
932 - series: 'up{job="ceph", instance="ceph-mgr:9283"}'
933 values: '1+0x2 0+0x10'
934 promql_expr_test:
935 - expr: up{job="ceph"} == 0
936 eval_time: 3m
937 exp_samples:
938 - labels: '{__name__="up", job="ceph", instance="ceph-mgr:9283"}'
939 value: 0
940 alert_rule_test:
941 - eval_time: 1m
942 alertname: CephMgrPrometheusModuleInactive
943 - eval_time: 10m
944 alertname: CephMgrPrometheusModuleInactive
945 exp_alerts:
946 - exp_labels:
947 instance: ceph-mgr:9283
948 job: ceph
949 severity: critical
950 type: ceph_default
951 oid: 1.3.6.1.4.1.50495.1.2.1.6.2
952 exp_annotations:
953 summary: The mgr/prometheus module is not available
954 description: "The mgr/prometheus module at ceph-mgr:9283 is unreachable. This could mean that the module has been disabled or the mgr daemon itself is down. Without the mgr/prometheus module, metrics and alerts will no longer function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to determine whether the mgr is active. If the mgr is not active, restart it; otherwise check module status with 'ceph mgr module ls'. If it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'."
955 - interval: 1m
956 input_series:
957 - series: 'ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"}'
958 values: '0+0x2 1+0x20'
959 promql_expr_test:
960 - expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1
961 eval_time: 3m
962 exp_samples:
963 - labels: '{__name__="ceph_health_detail", name="RECENT_MGR_MODULE_CRASH"}'
964 value: 1
965 alert_rule_test:
966 - eval_time: 1m
967 alertname: CephMgrModuleCrash
968 - eval_time: 15m
969 alertname: CephMgrModuleCrash
970 exp_alerts:
971 - exp_labels:
972 name: RECENT_MGR_MODULE_CRASH
973 severity: critical
974 type: ceph_default
975 oid: 1.3.6.1.4.1.50495.1.2.1.6.1
976 exp_annotations:
977 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash
978 summary: A manager module has recently crashed
979 description: "One or more mgr modules have crashed and have yet to be acknowledged by an administrator. A crashed module may impact functionality within the cluster. Use the 'ceph crash' command to determine which module has failed, and archive it to acknowledge the failure."
980 # MON
981 - interval: 1m
982 input_series:
983 - series: 'ceph_health_detail{name="MON_DISK_CRIT"}'
984 values: '0+0x2 1+0x10'
985 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}'
986 values: '1+0x13'
987 promql_expr_test:
988 - expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1
989 eval_time: 3m
990 exp_samples:
991 - labels: '{__name__="ceph_health_detail", name="MON_DISK_CRIT"}'
992 value: 1
993 alert_rule_test:
994 - eval_time: 1m
995 alertname: CephMonDiskspaceCritical
996 - eval_time: 10m
997 alertname: CephMonDiskspaceCritical
998 exp_alerts:
999 - exp_labels:
1000 name: "MON_DISK_CRIT"
1001 severity: critical
1002 type: ceph_default
1003 oid: 1.3.6.1.4.1.50495.1.2.1.3.2
1004 exp_annotations:
1005 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit
1006 summary: Filesystem space on at least one monitor is critically low
1007 description: "The free space available to a monitor's store is critically low. You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem; /var/log and /var/tmp are often culprits. Your monitor hosts are: - ceph-mon-a"
1008 - interval: 1m
1009 input_series:
1010 - series: 'ceph_health_detail{name="MON_DISK_LOW"}'
1011 values: '0+0x2 1+0x10'
1012 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}'
1013 values: '1+0x13'
1014 promql_expr_test:
1015 - expr: ceph_health_detail{name="MON_DISK_LOW"} == 1
1016 eval_time: 3m
1017 exp_samples:
1018 - labels: '{__name__="ceph_health_detail", name="MON_DISK_LOW"}'
1019 value: 1
1020 alert_rule_test:
1021 - eval_time: 1m
1022 alertname: CephMonDiskspaceLow
1023 - eval_time: 10m
1024 alertname: CephMonDiskspaceLow
1025 exp_alerts:
1026 - exp_labels:
1027 name: "MON_DISK_LOW"
1028 severity: warning
1029 type: ceph_default
1030 exp_annotations:
1031 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low
1032 summary: Drive space on at least one monitor is approaching full
1033 description: "The space available to a monitor's store is approaching full (>70% is the default). You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem; /var/log and /var/tmp are often culprits. Your monitor hosts are: - ceph-mon-a"
1034 - interval: 1m
1035 input_series:
1036 - series: 'ceph_health_detail{name="MON_CLOCK_SKEW"}'
1037 values: '0+0x2 1+0x10'
1038 promql_expr_test:
1039 - expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1
1040 eval_time: 3m
1041 exp_samples:
1042 - labels: '{__name__="ceph_health_detail", name="MON_CLOCK_SKEW"}'
1043 value: 1
1044 alert_rule_test:
1045 - eval_time: 1m
1046 alertname: CephMonClockSkew
1047 - eval_time: 10m
1048 alertname: CephMonClockSkew
1049 exp_alerts:
1050 - exp_labels:
1051 name: "MON_CLOCK_SKEW"
1052 severity: warning
1053 type: ceph_default
1054 exp_annotations:
1055 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew
1056 summary: Clock skew detected among monitors
1057 description: "Ceph monitors rely on closely synchronized time to maintain quorum and cluster consistency. This event indicates that the time on at least one mon has drifted too far from the lead mon. Review cluster status with ceph -s. This will show which monitors are affected. Check the time sync status on each monitor host with 'ceph time-sync-status' and the state and peers of your ntpd or chrony daemon."
1058
1059 # Check 3 mons one down, quorum at risk
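# With mon.c out of quorum from t=3m, the two remaining monitors exactly match the quorum
# minimum (floor(3/2) + 1 = 2), so MON_DOWN combined with the quorum-size check makes the
# critical quorum-at-risk alert fire at the 10m evaluation.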
1060 - interval: 1m
1061 input_series:
1062 - series: 'ceph_health_detail{name="MON_DOWN"}'
1063 values: '0+0x2 1+0x12'
1064 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}'
1065 values: '1+0x14'
1066 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}'
1067 values: '1+0x14'
1068 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}'
1069 values: '1+0x2 0+0x12'
1070 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}'
1071 values: '1+0x14'
1072 - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}'
1073 values: '1+0x14'
1074 - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}'
1075 values: '1+0x14'
1076 promql_expr_test:
1077 - expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() (count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1))) == 1
1078 eval_time: 3m
1079 exp_samples:
1080 - labels: '{}'
1081 value: 1
1082 alert_rule_test:
1083 - eval_time: 1m
1084 alertname: CephMonDownQuorumAtRisk
1085 # shouldn't fire
1086 - eval_time: 10m
1087 alertname: CephMonDownQuorumAtRisk
1088 exp_alerts:
1089 - exp_labels:
1090 severity: critical
1091 type: ceph_default
1092 oid: 1.3.6.1.4.1.50495.1.2.1.3.1
1093 exp_annotations:
1094 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
1095 summary: Monitor quorum is at risk
1096 description: "Quorum requires a majority of monitors (x 2) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: - mon.c on ceph-mon-3"
1097 # check 5 mons, 1 down - warning only
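# With 5 monitors and only mon.e down, quorum (3 of 5) is still comfortably intact, so only
# the warning-level CephMonDown is expected here, not the quorum-at-risk alert.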
1098 - interval: 1m
1099 input_series:
1100 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}'
1101 values: '1+0x14'
1102 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}'
1103 values: '1+0x14'
1104 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}'
1105 values: '1+0x14'
1106 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.d"}'
1107 values: '1+0x14'
1108 - series: 'ceph_mon_quorum_status{ceph_daemon="mon.e"}'
1109 values: '1+0x2 0+0x12'
1110 - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}'
1111 values: '1+0x14'
1112 - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}'
1113 values: '1+0x14'
1114 - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}'
1115 values: '1+0x14'
1116 - series: 'ceph_mon_metadata{ceph_daemon="mon.d", hostname="ceph-mon-4"}'
1117 values: '1+0x14'
1118 - series: 'ceph_mon_metadata{ceph_daemon="mon.e", hostname="ceph-mon-5"}'
1119 values: '1+0x14'
1120 promql_expr_test:
1121 - expr: (count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1))
1122 eval_time: 3m
1123 exp_samples:
1124 - labels: '{}'
1125 value: 1
1126 alert_rule_test:
1127 - eval_time: 1m
1128 alertname: CephMonDown
1129 - eval_time: 10m
1130 alertname: CephMonDown
1131 exp_alerts:
1132 - exp_labels:
1133 severity: warning
1134 type: ceph_default
1135 exp_annotations:
1136 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
1137 summary: One or more monitors down
1138 description: "You have 1 monitor down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: - mon.e on ceph-mon-5\n"
1139 # Device Health
1140 - interval: 1m
1141 input_series:
1142 - series: 'ceph_health_detail{name="DEVICE_HEALTH"}'
1143 values: '0+0x2 1+0x10'
1144 promql_expr_test:
1145 - expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1
1146 eval_time: 3m
1147 exp_samples:
1148 - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH"}'
1149 value: 1
1150 alert_rule_test:
1151 - eval_time: 1m
1152 alertname: CephDeviceFailurePredicted
1153 - eval_time: 10m
1154 alertname: CephDeviceFailurePredicted
1155 exp_alerts:
1156 - exp_labels:
1157 name: "DEVICE_HEALTH"
1158 severity: warning
1159 type: ceph_default
1160 exp_annotations:
1161 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2
1162 summary: Device(s) predicted to fail soon
1163 description: "The device health module has determined that one or more devices will fail soon. To review device status use 'ceph device ls'. To show a specific device use 'ceph device info <dev id>'. Mark the OSD out so that data may migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace the device, and redeploy the OSD."
1164 - interval: 1m
1165 input_series:
1166 - series: 'ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"}'
1167 values: '0+0x2 1+0x10'
1168 promql_expr_test:
1169 - expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1
1170 eval_time: 3m
1171 exp_samples:
1172 - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_TOOMANY"}'
1173 value: 1
1174 alert_rule_test:
1175 - eval_time: 1m
1176 alertname: CephDeviceFailurePredictionTooHigh
1177 - eval_time: 10m
1178 alertname: CephDeviceFailurePredictionTooHigh
1179 exp_alerts:
1180 - exp_labels:
1181 name: "DEVICE_HEALTH_TOOMANY"
1182 severity: critical
1183 type: ceph_default
1184 oid: 1.3.6.1.4.1.50495.1.2.1.4.7
1185 exp_annotations:
1186 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany
1187 summary: Too many devices are predicted to fail, unable to resolve
1188 description: "The device health module has determined that the devices predicted to fail cannot be remediated automatically, because doing so would remove too many OSDs from the cluster to maintain performance and availability. Prevent data integrity issues by adding new OSDs so that data may be relocated."
1189 - interval: 1m
1190 input_series:
1191 - series: 'ceph_health_detail{name="DEVICE_HEALTH_IN_USE"}'
1192 values: '0+0x2 1+0x10'
1193 promql_expr_test:
1194 - expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1
1195 eval_time: 3m
1196 exp_samples:
1197 - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_IN_USE"}'
1198 value: 1
1199 alert_rule_test:
1200 - eval_time: 1m
1201 alertname: CephDeviceFailureRelocationIncomplete
1202 - eval_time: 10m
1203 alertname: CephDeviceFailureRelocationIncomplete
1204 exp_alerts:
1205 - exp_labels:
1206 name: "DEVICE_HEALTH_IN_USE"
1207 severity: warning
1208 type: ceph_default
1209 exp_annotations:
1210 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use
1211 summary: Device failure is predicted, but unable to relocate data
1212 description: "The device health module has determined that one or more devices will fail soon, but the normal process of relocating the data on the device to other OSDs in the cluster is blocked. \nEnsure that the cluster has available free space. It may be necessary to add capacity to the cluster to allow data from the failing device to successfully migrate, or to enable the balancer."
1213 # OSD
1214 - interval: 1m
1215 input_series:
1216 - series: 'ceph_health_detail{name="OSD_HOST_DOWN"}'
1217 values: '0+0x2 1+0x10'
1218 - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
1219 values: '1+0x2 0+0x10'
1220 - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}'
1221 values: '1+0x12'
1222 promql_expr_test:
1223 - expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1
1224 eval_time: 3m
1225 exp_samples:
1226 - labels: '{__name__="ceph_health_detail", name="OSD_HOST_DOWN"}'
1227 value: 1
1228 alert_rule_test:
1229 - eval_time: 1m
1230 alertname: CephOSDHostDown
1231 - eval_time: 10m
1232 alertname: CephOSDHostDown
1233 exp_alerts:
1234 - exp_labels:
1235 name: "OSD_HOST_DOWN"
1236 severity: warning
1237 type: ceph_default
1238 oid: 1.3.6.1.4.1.50495.1.2.1.4.8
1239 exp_annotations:
1240 summary: An OSD host is offline
1241 description: "The following OSDs are down: - ceph-osd-1 : osd.0"
1242 - interval: 1m
1243 input_series:
1244 - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"}'
1245 values: '0+0x2 1+0x20'
1246 promql_expr_test:
1247 - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 0
1248 eval_time: 1m
1249 exp_samples:
1250 - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_FRONT"}'
1251 value: 0
1252 alert_rule_test:
1253 - eval_time: 1m
1254 alertname: CephOSDTimeoutsPublicNetwork
1255 - eval_time: 10m
1256 alertname: CephOSDTimeoutsPublicNetwork
1257 exp_alerts:
1258 - exp_labels:
1259 name: "OSD_SLOW_PING_TIME_FRONT"
1260 severity: warning
1261 type: ceph_default
1262 exp_annotations:
1263 summary: Network issues delaying OSD heartbeats (public network)
1264 description: "OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network for latency or loss issues. Use 'ceph health detail' to show the affected OSDs."
1265 - interval: 1m
1266 input_series:
1267 - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"}'
1268 values: '0+0x2 1+0x20'
1269 promql_expr_test:
1270 - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 0
1271 eval_time: 1m
1272 exp_samples:
1273 - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_BACK"}'
1274 value: 0
1275 alert_rule_test:
1276 - eval_time: 1m
1277 alertname: CephOSDTimeoutsClusterNetwork
1278 - eval_time: 10m
1279 alertname: CephOSDTimeoutsClusterNetwork
1280 exp_alerts:
1281 - exp_labels:
1282 name: "OSD_SLOW_PING_TIME_BACK"
1283 severity: warning
1284 type: ceph_default
1285 exp_annotations:
1286 summary: Network issues delaying OSD heartbeats (cluster network)
1287 description: "OSD heartbeats on the cluster's 'cluster' network (backend) are slow. Investigate the network for latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs."
1288 - interval: 1m
1289 input_series:
1290 - series: 'ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"}'
1291 values: '0+0x2 1+0x20'
1292 promql_expr_test:
1293 - expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 0
1294 eval_time: 1m
1295 exp_samples:
1296 - labels: '{__name__="ceph_health_detail", name="BLUESTORE_DISK_SIZE_MISMATCH"}'
1297 value: 0
1298 alert_rule_test:
1299 - eval_time: 1m
1300 alertname: CephOSDInternalDiskSizeMismatch
1301 - eval_time: 10m
1302 alertname: CephOSDInternalDiskSizeMismatch
1303 exp_alerts:
1304 - exp_labels:
1305 name: "BLUESTORE_DISK_SIZE_MISMATCH"
1306 severity: warning
1307 type: ceph_default
1308 exp_annotations:
1309 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch
1310 summary: OSD size inconsistency error
1311 description: "One or more OSDs have an internal inconsistency between metadata and the size of the device. This could lead to the OSD(s) crashing in the future. You should redeploy the affected OSDs."
1312 - interval: 30s
1313 input_series:
1314 - series: 'ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"}'
1315 values: '0+0x2 1+0x20'
1316 promql_expr_test:
1317 - expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1
1318 eval_time: 3m
1319 exp_samples:
1320 - labels: '{__name__="ceph_health_detail", name="BLUESTORE_SPURIOUS_READ_ERRORS"}'
1321 value: 1
1322 alert_rule_test:
1323 - eval_time: 1m
1324 alertname: CephOSDReadErrors
1325 - eval_time: 10m
1326 alertname: CephOSDReadErrors
1327 exp_alerts:
1328 - exp_labels:
1329 name: "BLUESTORE_SPURIOUS_READ_ERRORS"
1330 severity: warning
1331 type: ceph_default
1332 exp_annotations:
1333 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors
1334 summary: Device read errors detected
1335 description: "An OSD has encountered read errors, but the OSD has recovered by retrying the reads. This may indicate an issue with hardware or the kernel."
1336 - interval: 1m
1337 input_series:
1338 - series: 'ceph_health_detail{name="OSD_DOWN"}'
1339 values: '0+0x2 1+0x10'
1340 - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
1341 values: '1+0x12'
1342 - series: 'ceph_osd_up{ceph_daemon="osd.1"}'
1343 values: '1+0x2 0+0x10'
1344 - series: 'ceph_osd_up{ceph_daemon="osd.2"}'
1345 values: '1+0x12'
1346 - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}'
1347 values: '1+0x12'
1348 - series: 'ceph_osd_metadata{ceph_daemon="osd.1", hostname="ceph-osd-2"}'
1349 values: '1+0x12'
1350 - series: 'ceph_osd_metadata{ceph_daemon="osd.2", hostname="ceph-osd-3"}'
1351 values: '1+0x12'
1352 promql_expr_test:
1353 - expr: ceph_health_detail{name="OSD_DOWN"} == 1
1354 eval_time: 3m
1355 exp_samples:
1356 - labels: '{__name__="ceph_health_detail", name="OSD_DOWN"}'
1357 value: 1
1358 alert_rule_test:
1359 - eval_time: 1m
1360 alertname: CephOSDDown
1361 - eval_time: 10m
1362 alertname: CephOSDDown
1363 exp_alerts:
1364 - exp_labels:
1365 name: "OSD_DOWN"
1366 severity: warning
1367 type: ceph_default
1368 oid: 1.3.6.1.4.1.50495.1.2.1.4.2
1369 exp_annotations:
1370 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down
1371 summary: An OSD has been marked down
1372 description: "1 OSD down for over 5mins. The following OSD is down: - osd.1 on ceph-osd-2\n"
1373 - interval: 1m
1374 input_series:
1375 - series: 'ceph_health_detail{name="OSD_NEARFULL"}'
1376 values: '0+0x2 1+0x10'
1377 promql_expr_test:
1378 - expr: ceph_health_detail{name="OSD_NEARFULL"} == 1
1379 eval_time: 3m
1380 exp_samples:
1381 - labels: '{__name__="ceph_health_detail", name="OSD_NEARFULL"}'
1382 value: 1
1383 alert_rule_test:
1384 - eval_time: 1m
1385 alertname: CephOSDNearFull
1386 - eval_time: 10m
1387 alertname: CephOSDNearFull
1388 exp_alerts:
1389 - exp_labels:
1390 name: "OSD_NEARFULL"
1391 severity: warning
1392 type: ceph_default
1393 oid: 1.3.6.1.4.1.50495.1.2.1.4.3
1394 exp_annotations:
1395 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull
1396 summary: OSD(s) running low on free space (NEARFULL)
1397 description: One or more OSDs have reached the NEARFULL threshold. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.
1398 - interval: 1m
1399 input_series:
1400 - series: 'ceph_health_detail{name="OSD_FULL"}'
1401 values: '0+0x2 1+0x10'
1402 promql_expr_test:
1403 - expr: ceph_health_detail{name="OSD_FULL"} == 1
1404 eval_time: 3m
1405 exp_samples:
1406 - labels: '{__name__="ceph_health_detail", name="OSD_FULL"}'
1407 value: 1
1408 alert_rule_test:
1409 - eval_time: 1m
1410 alertname: CephOSDFull
1411 - eval_time: 10m
1412 alertname: CephOSDFull
1413 exp_alerts:
1414 - exp_labels:
1415 name: "OSD_FULL"
1416 severity: critical
1417 type: ceph_default
1418 oid: 1.3.6.1.4.1.50495.1.2.1.4.6
1419 exp_annotations:
1420 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full
1421 summary: OSD full, writes blocked
1422 description: An OSD has reached the FULL threshold. Writes to pools that share the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data.
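# OSD backfillfull: OSD_BACKFILLFULL raises CephOSDBackfillFull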
1423 - interval: 1m
1424 input_series:
1425 - series: 'ceph_health_detail{name="OSD_BACKFILLFULL"}'
1426 values: '0+0x2 1+0x10'
1427 promql_expr_test:
1428 - expr: ceph_health_detail{name="OSD_BACKFILLFULL"} == 1
1429 eval_time: 3m
1430 exp_samples:
1431 - labels: '{__name__="ceph_health_detail", name="OSD_BACKFILLFULL"}'
1432 value: 1
1433 alert_rule_test:
1434 - eval_time: 1m
1435 alertname: CephOSDBackfillFull
1436 - eval_time: 10m
1437 alertname: CephOSDBackfillFull
1438 exp_alerts:
1439 - exp_labels:
1440 name: "OSD_BACKFILLFULL"
1441 severity: warning
1442 type: ceph_default
1443 exp_annotations:
1444 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull
1445 summary: OSD(s) too full for backfill operations
1446 description: "An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations from completing. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data."
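# OSD too many repairs: OSD_TOO_MANY_REPAIRS raises CephOSDTooManyRepairs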
1447 - interval: 30s
1448 input_series:
1449 - series: 'ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"}'
1450 values: '0+0x2 1+0x20'
1451 promql_expr_test:
1452 - expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 0
1453 eval_time: 1m
1454 exp_samples:
1455 - labels: '{__name__="ceph_health_detail", name="OSD_TOO_MANY_REPAIRS"}'
1456 value: 0
1457 alert_rule_test:
1458 - eval_time: 1m
1459 alertname: CephOSDTooManyRepairs
1460 - eval_time: 10m
1461 alertname: CephOSDTooManyRepairs
1462 exp_alerts:
1463 - exp_labels:
1464 name: "OSD_TOO_MANY_REPAIRS"
1465 severity: warning
1466 type: ceph_default
1467 exp_annotations:
1468 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs
1469 summary: OSD reports a high number of read errors
1470 description: Reads from an OSD have used a secondary PG to return data to the client, indicating a potential failing drive.
1471 # Pools
1472 # trigger percent full prediction on pools 1 and 2 only
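# (the 12h sample interval gives predict_linear a 2d history; pool 1's trend projects past 95% within 5 days
# (142% in the expected sample), while pool 2's does not)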
1473 - interval: 12h
1474 input_series:
1475 - series: 'ceph_pool_percent_used{pool_id="1"}'
1476 values: '70 75 80 87 92'
1477 - series: 'ceph_pool_percent_used{pool_id="2"}'
1478 values: '22 22 23 23 24'
1479 - series: 'ceph_pool_metadata{pool_id="1",name="rbd",type="replicated"}'
1480 values: '1 1 1 1 1'
1481 - series: 'ceph_pool_metadata{pool_id="2",name="default.rgw.index",type="replicated"}'
1482 values: '1 1 1 1 1'
1483 promql_expr_test:
1484 - expr: |
1485 (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id)
1486 group_right ceph_pool_metadata) >= 95
1487 eval_time: 36h
1488 exp_samples:
1489 - labels: '{name="rbd",pool_id="1",type="replicated"}'
1490 value: 1.424E+02 # 142%
1491 alert_rule_test:
1492 - eval_time: 48h
1493 alertname: CephPoolGrowthWarning
1494 exp_alerts:
1495 - exp_labels:
1496 name: rbd
1497 pool_id: 1
1498 severity: warning
1499 type: ceph_default
1500 oid: 1.3.6.1.4.1.50495.1.2.1.9.2
1501 exp_annotations:
1502 summary: Pool growth rate may soon exceed capacity
1503 description: Pool 'rbd' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
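# pool backfillfull: POOL_BACKFILLFULL raises CephPoolBackfillFull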
1504 - interval: 1m
1505 input_series:
1506 - series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}'
1507 values: '0+0x2 1+0x10'
1508 promql_expr_test:
1509 - expr: ceph_health_detail{name="POOL_BACKFILLFULL"} == 1
1510 eval_time: 3m
1511 exp_samples:
1512 - labels: '{__name__="ceph_health_detail", name="POOL_BACKFILLFULL"}'
1513 value: 1
1514 alert_rule_test:
1515 - eval_time: 1m
1516 alertname: CephPoolBackfillFull
1517 - eval_time: 5m
1518 alertname: CephPoolBackfillFull
1519 exp_alerts:
1520 - exp_labels:
1521 name: "POOL_BACKFILLFULL"
1522 severity: warning
1523 type: ceph_default
1524 exp_annotations:
1525 summary: Free space in a pool is too low for recovery/backfill
1526 description: A pool is approaching the near full threshold, which will prevent recovery/backfill operations from completing. Consider adding more capacity.
1527
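# pool near full: POOL_NEAR_FULL raises CephPoolNearFull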
1528 - interval: 1m
1529 input_series:
1530 - series: 'ceph_health_detail{name="POOL_NEAR_FULL"}'
1531 values: '0+0x2 1+0x10'
1532 promql_expr_test:
1533 - expr: ceph_health_detail{name="POOL_NEAR_FULL"} == 1
1534 eval_time: 3m
1535 exp_samples:
1536 - labels: '{__name__="ceph_health_detail", name="POOL_NEAR_FULL"}'
1537 value: 1
1538 alert_rule_test:
1539 - eval_time: 1m
1540 alertname: CephPoolNearFull
1541 - eval_time: 10m
1542 alertname: CephPoolNearFull
1543 exp_alerts:
1544 - exp_labels:
1545 name: "POOL_NEAR_FULL"
1546 severity: warning
1547 type: ceph_default
1548 exp_annotations:
1549 summary: One or more Ceph pools are nearly full
1550 description: "A pool has exceeded the warning (percent full) threshold, or OSDs supporting the pool have reached the NEARFULL threshold. Writes may continue, but you are at risk of the pool going read-only if more capacity isn't made available. Determine the affected pool with 'ceph df detail', looking at QUOTA BYTES and STORED. Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>). Also ensure that the balancer is active."
1551
1552 # PGs
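# PG not scrubbed: PG_NOT_SCRUBBED raises CephPGNotScrubbed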
1553 - interval: 1m
1554 input_series:
1555 - series: 'ceph_health_detail{name="PG_NOT_SCRUBBED"}'
1556 values: '0+0x2 1+0x10'
1557 promql_expr_test:
1558 - expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1
1559 eval_time: 3m
1560 exp_samples:
1561 - labels: '{__name__="ceph_health_detail", name="PG_NOT_SCRUBBED"}'
1562 value: 1
1563 alert_rule_test:
1564 - eval_time: 1m
1565 alertname: CephPGNotScrubbed
1566 - eval_time: 10m
1567 alertname: CephPGNotScrubbed
1568 exp_alerts:
1569 - exp_labels:
1570 name: "PG_NOT_SCRUBBED"
1571 severity: warning
1572 type: ceph_default
1573 exp_annotations:
1574 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed
1575 summary: Placement group(s) have not been scrubbed
1576 description: "One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, protecting against bit-rot. They check that metadata is consistent across data replicas. When PGs miss their scrub interval, it may indicate that the scrub window is too small, or PGs were not in a 'clean' state during the scrub window. You can manually initiate a scrub with: ceph pg scrub <pgid>"
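# PG damaged: PG_DAMAGED (or OSD_SCRUB_ERRORS) raises CephPGsDamaged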
1577 - interval: 1m
1578 input_series:
1579 - series: 'ceph_health_detail{name="PG_DAMAGED"}'
1580 values: '0+0x4 1+0x20'
1581 promql_expr_test:
1582 - expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1
1583 eval_time: 5m
1584 exp_samples:
1585 - labels: '{__name__="ceph_health_detail", name="PG_DAMAGED"}'
1586 value: 1
1587 alert_rule_test:
1588 - eval_time: 1m
1589 alertname: CephPGsDamaged
1590 - eval_time: 10m
1591 alertname: CephPGsDamaged
1592 exp_alerts:
1593 - exp_labels:
1594 name: "PG_DAMAGED"
1595 severity: critical
1596 type: ceph_default
1597 oid: 1.3.6.1.4.1.50495.1.2.1.7.4
1598 exp_annotations:
1599 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged
1600 summary: Placement group damaged, manual intervention needed
1601 description: During data consistency checks (scrub), at least one PG has been flagged as being damaged or inconsistent. Check to see which PG is affected, and attempt a manual repair if necessary. To list problematic placement groups, use 'rados list-inconsistent-pg <pool>'. To repair PGs use the 'ceph pg repair <pg_num>' command.
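# too many PGs per OSD: TOO_MANY_PGS raises CephPGsHighPerOSD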
1602 - interval: 1m
1603 input_series:
1604 - series: 'ceph_health_detail{name="TOO_MANY_PGS"}'
1605 values: '0+0x4 1+0x20'
1606 promql_expr_test:
1607 - expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1
1608 eval_time: 5m
1609 exp_samples:
1610 - labels: '{__name__="ceph_health_detail", name="TOO_MANY_PGS"}'
1611 value: 1
1612 alert_rule_test:
1613 - eval_time: 1m
1614 alertname: CephPGsHighPerOSD
1615 - eval_time: 10m
1616 alertname: CephPGsHighPerOSD
1617 exp_alerts:
1618 - exp_labels:
1619 name: "TOO_MANY_PGS"
1620 severity: warning
1621 type: ceph_default
1622 exp_annotations:
1623 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs
1624 summary: Placement groups per OSD is too high
1625 description: "The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).\n Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools."
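# PG recovery blocked by full OSDs: PG_RECOVERY_FULL raises CephPGRecoveryAtRisk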
1626 - interval: 1m
1627 input_series:
1628 - series: 'ceph_health_detail{name="PG_RECOVERY_FULL"}'
1629 values: '0+0x2 1+0x20'
1630 promql_expr_test:
1631 - expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 0
1632 eval_time: 1m
1633 exp_samples:
1634 - labels: '{__name__="ceph_health_detail", name="PG_RECOVERY_FULL"}'
1635 value: 0
1636 alert_rule_test:
1637 - eval_time: 1m
1638 alertname: CephPGRecoveryAtRisk
1639 - eval_time: 10m
1640 alertname: CephPGRecoveryAtRisk
1641 exp_alerts:
1642 - exp_labels:
1643 name: "PG_RECOVERY_FULL"
1644 severity: critical
1645 type: ceph_default
1646 oid: 1.3.6.1.4.1.50495.1.2.1.7.5
1647 exp_annotations:
1648 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full
1649 summary: OSDs are too full for recovery
1650 description: Data redundancy is at risk since one or more OSDs are at or above the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, or delete unwanted data.
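# PG backfill blocked by full OSDs: PG_BACKFILL_FULL raises CephPGBackfillAtRisk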
1651 - interval: 1m
1652 input_series:
1653 - series: 'ceph_health_detail{name="PG_BACKFILL_FULL"}'
1654 values: '0+0x2 1+0x20'
1655 promql_expr_test:
1656 - expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 0
1657 eval_time: 1m
1658 exp_samples:
1659 - labels: '{__name__="ceph_health_detail", name="PG_BACKFILL_FULL"}'
1660 value: 0
1661 alert_rule_test:
1662 - eval_time: 1m
1663 alertname: CephPGBackfillAtRisk
1664 - eval_time: 10m
1665 alertname: CephPGBackfillAtRisk
1666 exp_alerts:
1667 - exp_labels:
1668 name: "PG_BACKFILL_FULL"
1669 severity: critical
1670 type: ceph_default
1671 oid: 1.3.6.1.4.1.50495.1.2.1.7.6
1672 exp_annotations:
1673 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full
1674 summary: Backfill operations are blocked due to lack of free space
1675 description: Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add more capacity, or delete unwanted data.
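# PG availability blocking I/O: the tested expression subtracts scalar(OSD_DOWN), so no alert is expected while OSD_DOWN is also active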
1676 - interval: 1m
1677 input_series:
1678 - series: 'ceph_health_detail{name="PG_AVAILABILITY"}'
1679 values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1'
1680 - series: 'ceph_health_detail{name="OSD_DOWN"}'
1681 values: '0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0'
1682 promql_expr_test:
1683 - expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"}))
1684 eval_time: 1m
1685 # empty set at 1m
1686 exp_samples:
1687 alert_rule_test:
1688 # PG_AVAILABILITY and OSD_DOWN not firing: no alert
1689 - eval_time: 1m
1690 alertname: CephPGUnavilableBlockingIO
1691 exp_alerts:
1692 # PG_AVAILABILITY firing, but OSD_DOWN is active: no alert
1693 - eval_time: 5m
1694 alertname: CephPGUnavilableBlockingIO
1695 exp_alerts:
1696 # PG_AVAILABILITY firing and OSD_DOWN not active: raise the alert
1697 - eval_time: 15m
1698 alertname: CephPGUnavilableBlockingIO
1699 exp_alerts:
1700 - exp_labels:
1701 name: "PG_AVAILABILITY"
1702 severity: critical
1703 type: ceph_default
1704 oid: 1.3.6.1.4.1.50495.1.2.1.7.3
1705 exp_annotations:
1706 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability
1707 summary: PG is unavailable, blocking I/O
1708 description: Data availability is reduced, impacting the cluster's ability to service I/O. One or more placement groups (PGs) are in a state that blocks I/O.
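# PG not deep-scrubbed: PG_NOT_DEEP_SCRUBBED raises CephPGNotDeepScrubbed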
1709 - interval: 1m
1710 input_series:
1711 - series: 'ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"}'
1712 values: '0+0x2 1+0x10'
1713 promql_expr_test:
1714 - expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1
1715 eval_time: 3m
1716 exp_samples:
1717 - labels: '{__name__="ceph_health_detail", name="PG_NOT_DEEP_SCRUBBED"}'
1718 value: 1
1719 alert_rule_test:
1720 - eval_time: 1m
1721 alertname: CephPGNotDeepScrubbed
1722 - eval_time: 10m
1723 alertname: CephPGNotDeepScrubbed
1724 exp_alerts:
1725 - exp_labels:
1726 name: "PG_NOT_DEEP_SCRUBBED"
1727 severity: warning
1728 type: ceph_default
1729 exp_annotations:
1730 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed
1731 summary: Placement group(s) have not been deep scrubbed
1732 description: One or more PGs have not been deep scrubbed recently. Deep scrubs protect against bit-rot. They compare data replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate that the window is too small or PGs were not in a 'clean' state during the deep-scrub window.
1733
1734 # Prometheus
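# missing scrape job: absent(up{job="ceph"}) raises PrometheusJobMissing even though another job ('myjob') is still present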
1735 - interval: 1m
1736 input_series:
1737 - series: 'up{job="myjob"}'
1738 values: '1+0x10'
1739 promql_expr_test:
1740 - expr: absent(up{job="ceph"})
1741 eval_time: 1m
1742 exp_samples:
1743 - labels: '{job="ceph"}'
1744 value: 1
1745 alert_rule_test:
1746 - eval_time: 5m
1747 alertname: PrometheusJobMissing
1748 exp_alerts:
1749 - exp_labels:
1750 job: ceph
1751 severity: critical
1752 type: ceph_default
1753 oid: 1.3.6.1.4.1.50495.1.2.1.12.1
1754 exp_annotations:
1755 summary: The scrape job for Ceph is missing from Prometheus
1756 description: The prometheus job that scrapes from Ceph is no longer defined. This will effectively mean you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance.
1757 # RADOS
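# object unfound: CephObjectMissing fires only when OBJECT_UNFOUND is set and every OSD with metadata reports up
# (count(ceph_osd_up == 1) == count(ceph_osd_metadata))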
1758 - interval: 1m
1759 input_series:
1760 - series: 'ceph_health_detail{name="OBJECT_UNFOUND"}'
1761 values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1762 - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
1763 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1764 - series: 'ceph_osd_up{ceph_daemon="osd.1"}'
1765 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1766 - series: 'ceph_osd_up{ceph_daemon="osd.2"}'
1767 values: '1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1768 - series: 'ceph_osd_metadata{ceph_daemon="osd.0"}'
1769 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1770 - series: 'ceph_osd_metadata{ceph_daemon="osd.1"}'
1771 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1772 - series: 'ceph_osd_metadata{ceph_daemon="osd.2"}'
1773 values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
1774 promql_expr_test:
1775 - expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1
1776 eval_time: 1m
1777 exp_samples:
1778 alert_rule_test:
1779 # OBJECT_UNFOUND but osd.2 is down, so don't fire
1780 - eval_time: 5m
1781 alertname: CephObjectMissing
1782 exp_alerts:
1783 # OBJECT_UNFOUND and all OSDs are online, so fire
1784 - eval_time: 15m
1785 alertname: CephObjectMissing
1786 exp_alerts:
1787 - exp_labels:
1788 severity: critical
1789 type: ceph_default
1790 oid: 1.3.6.1.4.1.50495.1.2.1.10.1
1791 exp_annotations:
1792 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound
1793 summary: Object(s) marked UNFOUND
1794 description: The latest version of a RADOS object can not be found, even though all OSDs are up. I/O requests for this object from clients will block (hang). Resolving this issue may require the object to be rolled back to a prior version manually, and manually verified.
1795 # Generic Alerts
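# daemon crash: RECENT_CRASH raises CephDaemonCrash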
1796 - interval: 1m
1797 input_series:
1798 - series: 'ceph_health_detail{name="RECENT_CRASH"}'
1799 values: '0 0 0 1 1 1 1 1 1 1 1'
1800 promql_expr_test:
1801 - expr: ceph_health_detail{name="RECENT_CRASH"} == 1
1802 eval_time: 1m
1803 exp_samples:
1804 alert_rule_test:
1805 # not firing
1806 - eval_time: 1m
1807 alertname: CephDaemonCrash
1808 exp_alerts:
1809 # firing
1810 - eval_time: 10m
1811 alertname: CephDaemonCrash
1812 exp_alerts:
1813 - exp_labels:
1814 name: RECENT_CRASH
1815 severity: critical
1816 type: ceph_default
1817 oid: 1.3.6.1.4.1.50495.1.2.1.1.2
1818 exp_annotations:
1819 documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash
1820 summary: One or more Ceph daemons have crashed, and are pending acknowledgement
1821 description: One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive <id>' command.