ceph/monitoring/ceph-mixin/tests_alerts/validate_rules.py

   1 #!/usr/bin/env python3
   2 #
   3 # Check the Prometheus rules for format, and integration
   4 # with the unit tests. This script has the following exit
   5 # codes:
   6 #  0 .. Everything worked
   7 #  4 .. rule problems or missing unit tests
   8 #  8 .. Missing fields in YAML
   9 # 12 .. Invalid YAML - unable to load
  10 # 16 .. Missing input files
  11 #
  12 # Externals
# snmptranslate .. used to list the OIDs defined in the MIB, to verify that each rule's OID exists in the MIB
  14 #
  15
  16 import re
  17 import os
  18 import sys
  19 import yaml
  20 import shutil
  21 import string
  22 from bs4 import BeautifulSoup
  23 from typing import List, Any, Dict, Set, Optional, Tuple
  24 import subprocess
  25
  26 import urllib.request
  27 import urllib.error
  28 from urllib.parse import urlparse
  29
  30 from settings import ALERTS_FILE, MIB_FILE, UNIT_TESTS_FILE
  31
# Name of the rule annotation whose value is the documentation URL that
# PrometheusRule._check_doclink fetches and validates.
DOCLINK_NAME = 'documentation'
  33
  34
  35 def isascii(s: str) -> bool:
  36     try:
  37         s.encode('ascii')
  38     except UnicodeEncodeError:
  39         return False
  40     return True
  41
  42
  43 def read_file(file_name: str) -> Tuple[str, str]:
  44     try:
  45         with open(file_name, 'r') as input_file:
  46             raw_data = input_file.read()
  47     except OSError:
  48         return '', f"Unable to open {file_name}"
  49
  50     return raw_data, ''
  51
  52
  53 def load_yaml(file_name: str) -> Tuple[Dict[str, Any], str]:
  54     data = {}
  55     errs = ''
  56
  57     raw_data, err = read_file(file_name)
  58     if not err:
  59
  60         try:
  61             data = yaml.safe_load(raw_data)
  62         except yaml.YAMLError as e:
  63             errs = f"filename '{file_name} is not a valid YAML file"
  64
  65     return data, errs
  66
  67
  68 def run_command(command: str):
  69     c = command.split()
  70     completion = subprocess.run(c, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  71     return (completion.returncode,
  72             completion.stdout.decode('utf-8').split('\n'),
  73             completion.stderr.decode('utf-8').split('\n'))
  74
  75
  76 class HTMLCache:
  77     def __init__(self) -> None:
  78         self.cache: Dict[str, Tuple[int, str]] = {}
  79
  80     def fetch(self, url_str: str) -> None:
  81         parsed = urlparse(url_str)
  82         url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
  83
  84         if url in self.cache:
  85             return self.cache[url]
  86
  87         req = urllib.request.Request(url)
  88         try:
  89             r = urllib.request.urlopen(req)
  90         except urllib.error.HTTPError as e:
  91             self.cache[url] = e.code, e.reason
  92             return self.cache[url]
  93         except urllib.error.URLError as e:
  94             self.cache[url] = 400, e.reason
  95             return self.cache[url]
  96
  97         if r.status == 200:
  98             html = r.read().decode('utf-8')
  99             self.cache[url] = 200, html
 100             return self.cache[url]
 101
 102         self.cache[url] = r.status, r.reason
 103         return r.status, r.reason
 104
 105     @property
 106     def cached_pages(self) -> List[str]:
 107         return self.cache.keys()
 108
 109     @property
 110     def cached_pages_total(self) -> int:
 111         return len(self.cache.keys())
 112
 113 class PrometheusRule:
 114     expected_attrs = [
 115         'alert',
 116         'expr',
 117         'labels',
 118         'annotations'
 119     ]
 120
 121     def __init__(self, rule_group, rule_data: Dict[str, Any]):
 122
 123         assert 'alert' in rule_data
 124         self.group: RuleGroup = rule_group
 125         self.name = rule_data.get('alert')
 126         self.rule = rule_data
 127         self.errors: List[str] = []
 128         self.warnings: List[str] = []
 129         self.validate()
 130
 131     @property
 132     def has_oid(self):
 133         return True if self.rule.get('labels', {}).get('oid', '') else False
 134
 135     @property
 136     def labels(self) -> Dict[str, str]:
 137         return self.rule.get('labels', {})
 138
 139     @property
 140     def annotations(self) -> Dict[str, str]:
 141         return self.rule.get('annotations', {})
 142
 143     def _check_alert_name(self):
 144         # this is simplistic, but works in the context of the alert name
 145         if self.name[0] in string.ascii_uppercase and \
 146           self.name != self.name.lower() and \
 147           self.name != self.name.upper() and \
 148           " " not in self.name and \
 149           "_" not in self.name:
 150             return
 151
 152         self.warnings.append("Alert name is not in CamelCase format")
 153
 154     def _check_structure(self):
 155         rule_attrs = self.rule.keys()
 156         missing_attrs = [a for a in PrometheusRule.expected_attrs if a not in rule_attrs]
 157
 158         if missing_attrs:
 159             self.errors.append(
 160                 f"invalid alert structure. Missing field{'s' if len(missing_attrs) > 1 else ''}"
 161                 f": {','.join(missing_attrs)}")
 162
 163     def _check_labels(self):
 164         for rqd in ['severity', 'type']:
 165             if rqd not in self.labels.keys():
 166                 self.errors.append(f"rule is missing {rqd} label definition")
 167
 168     def _check_annotations(self):
 169         for rqd in ['summary', 'description']:
 170                 if rqd not in self.annotations:
 171                     self.errors.append(f"rule is missing {rqd} annotation definition")
 172
 173     def _check_doclink(self):
 174         doclink = self.annotations.get(DOCLINK_NAME, '')
 175
 176         if doclink:
 177             url = urlparse(doclink)
 178             status, content = self.group.fetch_html_page(doclink)
 179             if status == 200:
 180                 if url.fragment:
 181                     soup = BeautifulSoup(content, 'html.parser')
 182                     if not soup.find(id=url.fragment):
 183                         self.errors.append(f"documentation link error: {url.fragment} anchor not found on the page")
 184             else:
 185                 # catch all
 186                 self.errors.append(f"documentation link error: {status} {content}")
 187
 188     def _check_snmp(self):
 189         oid = self.labels.get('oid', '')
 190
 191         if self.labels.get('severity', '') == 'critical' and not oid:
 192             self.warnings.append("critical level alert is missing an SNMP oid entry")
 193         if oid and not re.search('^1.3.6.1.4.1.50495.1.2.\\d+.\\d+.\\d+$', oid):
 194             self.errors.append("invalid OID format provided")
 195         if self.group.get_oids():
 196             if oid and oid not in self.group.get_oids():
 197                 self.errors.append(f"rule defines an OID {oid} that is missing from the MIB file({os.path.basename(MIB_FILE)})")
 198
 199     def _check_ascii(self):
 200         if 'oid' not in self.labels:
 201             return
 202
 203         desc = self.annotations.get('description', '')
 204         summary = self.annotations.get('summary', '')
 205         if not isascii(desc):
 206             self.errors.append(f"non-ascii characters found in 'description' field will cause issues in associated snmp trap.")
 207         if not isascii(summary):
 208             self.errors.append(f"non-ascii characters found in 'summary' field will cause issues in associated snmp trap.")
 209
 210     def validate(self):
 211
 212         self._check_alert_name()
 213         self._check_structure()
 214         self._check_labels()
 215         self._check_annotations()
 216         self._check_doclink()
 217         self._check_snmp()
 218         self._check_ascii()
 219         char = '.'
 220
 221         if self.errors:
 222             char = 'E'
 223             self.group.update('error', self.name)
 224         elif self.warnings:
 225             char = 'W'
 226             self.group.update('warning', self.name)
 227
 228         sys.stdout.write(char)
 229
 230
 231 class RuleGroup:
 232
 233     def __init__(self, rule_file, group_name: str, group_name_width: int):
 234         self.rule_file: RuleFile = rule_file
 235         self.group_name = group_name
 236         self.rules: Dict[str, PrometheusRule] = {}
 237         self.problems = {
 238             "error": [],
 239             "warning": [],
 240         }
 241
 242         sys.stdout.write(f"\n\t{group_name:<{group_name_width}} : ")
 243
 244     def add_rule(self, rule_data:Dict[str, Any]):
 245         alert_name = rule_data.get('alert')
 246         self.rules[alert_name] = PrometheusRule(self, rule_data)
 247
 248     def update(self, problem_type:str, alert_name:str):
 249         assert problem_type in ['error', 'warning']
 250
 251         self.problems[problem_type].append(alert_name)
 252         self.rule_file.update(self.group_name)
 253
 254     def fetch_html_page(self, url):
 255         return self.rule_file.fetch_html_page(url)
 256
 257     def get_oids(self):
 258         return self.rule_file.oid_list
 259
 260     @property
 261     def error_count(self):
 262         return len(self.problems['error'])
 263
 264     def warning_count(self):
 265         return len(self.problems['warning'])
 266
 267     @property
 268     def count(self):
 269         return len(self.rules)
 270
 271
 272 class RuleFile:
 273
 274     def __init__(self, parent, file_name, rules, oid_list):
 275         self.parent = parent
 276         self.file_name = file_name
 277         self.rules: Dict[str, Any] = rules
 278         self.oid_list = oid_list
 279         self.problems: Set[str] = set()
 280         self.group: Dict[str, RuleGroup] = {}
 281         self.alert_names_seen: Set[str] = set()
 282         self.duplicate_alert_names:List[str] = []
 283         self.html_cache = HTMLCache()
 284
 285         assert 'groups' in self.rules
 286         self.max_group_name_width = self.get_max_group_name()
 287         self.load_groups()
 288
 289     def update(self, group_name):
 290         self.problems.add(group_name)
 291         self.parent.mark_invalid()
 292
 293     def fetch_html_page(self, url):
 294         return self.html_cache.fetch(url)
 295
 296     @property
 297     def group_count(self):
 298         return len(self.rules['groups'])
 299
 300     @property
 301     def rule_count(self):
 302         rule_count = 0
 303         for _group_name, rule_group in self.group.items():
 304             rule_count += rule_group.count
 305         return rule_count
 306
 307     @property
 308     def oid_count(self):
 309         oid_count = 0
 310         for _group_name, rule_group in self.group.items():
 311             for _rule_name, rule in rule_group.rules.items():
 312                 if rule.has_oid:
 313                     oid_count += 1
 314         return oid_count
 315
 316     @property
 317     def group_names(self):
 318         return self.group.keys()
 319
 320     @property
 321     def problem_count(self):
 322         return len(self.problems)
 323
 324     def get_max_group_name(self):
 325         group_name_list = []
 326         for group in self.rules.get('groups'):
 327             group_name_list.append(group['name'])
 328         return max([len(g) for g in group_name_list])
 329
 330     def load_groups(self):
 331         sys.stdout.write("\nChecking rule groups")
 332         for group in self.rules.get('groups'):
 333             group_name = group['name']
 334             rules = group['rules']
 335             self.group[group_name] = RuleGroup(self, group_name, self.max_group_name_width)
 336             for rule_data in rules:
 337                 if 'alert' in rule_data:
 338                     alert_name = rule_data.get('alert')
 339                     if alert_name in self.alert_names_seen:
 340                         self.duplicate_alert_names.append(alert_name)
 341                     else:
 342                         self.alert_names_seen.add(alert_name)
 343                     self.group[group_name].add_rule(rule_data)
 344                 else:
 345                     # skipped recording rule
 346                     pass
 347
 348     def report(self):
 349         def max_width(item_list: Set[str], min_width: int = 0) -> int:
 350             return max([len(i) for i in item_list] + [min_width])
 351
 352         if not self.problems and not self.duplicate_alert_names:
 353             print("\nNo problems detected in the rule file")
 354             return
 355
 356         print("\nProblem Report\n")
 357
 358         group_width = max_width(self.problems, 5)
 359         alert_names = set()
 360         for g in self.problems:
 361             group = self.group[g]
 362             alert_names.update(group.problems.get('error', []))
 363             alert_names.update(group.problems.get('warning', []))
 364         alert_width = max_width(alert_names, 10)
 365
 366         template = "  {group:<{group_width}}  {severity:<8}  {alert_name:<{alert_width}}  {description}"
 367
 368         print(template.format(
 369             group="Group",
 370             group_width=group_width,
 371             severity="Severity",
 372             alert_name="Alert Name",
 373             alert_width=alert_width,
 374             description="Problem Description"))
 375
 376         print(template.format(
 377             group="-----",
 378             group_width=group_width,
 379             severity="--------",
 380             alert_name="----------",
 381             alert_width=alert_width,
 382             description="-------------------"))
 383
 384         for group_name in sorted(self.problems):
 385             group = self.group[group_name]
 386             rules = group.rules
 387             for alert_name in group.problems.get('error', []):
 388                 for desc in rules[alert_name].errors:
 389                     print(template.format(
 390                             group=group_name,
 391                             group_width=group_width,
 392                             severity="Error",
 393                             alert_name=alert_name,
 394                             alert_width=alert_width,
 395                             description=desc))
 396             for alert_name in group.problems.get('warning', []):
 397                 for desc in rules[alert_name].warnings:
 398                     print(template.format(
 399                             group=group_name,
 400                             group_width=group_width,
 401                             severity="Warning",
 402                             alert_name=alert_name,
 403                             alert_width=alert_width,
 404                             description=desc))
 405         if self.duplicate_alert_names:
 406             print("Duplicate alert names detected:")
 407             for a in self.duplicate_alert_names:
 408                 print(f"  - {a}")
 409
 410
 411 class UnitTests:
 412     expected_attrs = [
 413         'rule_files',
 414         'tests',
 415         'evaluation_interval'
 416     ]
 417     def __init__(self, filename):
 418         self.filename = filename
 419         self.unit_test_data: Dict[str, Any] = {}
 420         self.alert_names_seen: Set[str] = set()
 421         self.problems: List[str] = []
 422         self.load()
 423
 424     def load(self):
 425         self.unit_test_data, errs = load_yaml(self.filename)
 426         if errs:
 427             print(f"\n\nError in unit tests file: {errs}")
 428             sys.exit(12)
 429
 430         missing_attr = [a for a in UnitTests.expected_attrs if a not in self.unit_test_data.keys()]
 431         if missing_attr:
 432             print(f"\nMissing attributes in unit tests: {','.join(missing_attr)}")
 433             sys.exit(8)
 434
 435     def _check_alert_names(self, alert_names: List[str]):
 436         alerts_tested: Set[str] = set()
 437         for t in self.unit_test_data.get('tests'):
 438             test_cases = t.get('alert_rule_test', [])
 439             if not test_cases:
 440                 continue
 441             for case in test_cases:
 442                 alertname = case.get('alertname', '')
 443                 if alertname:
 444                     alerts_tested.add(alertname)
 445
 446         alerts_defined = set(alert_names)
 447         self.problems = list(alerts_defined.difference(alerts_tested))
 448
 449     def process(self, defined_alert_names: List[str]):
 450         self._check_alert_names(defined_alert_names)
 451
 452     def report(self) -> None:
 453
 454         if not self.problems:
 455             print("\nNo problems detected in unit tests file")
 456             return
 457
 458         print("\nUnit tests are incomplete. Tests missing for the following alerts;")
 459         for p in self.problems:
 460             print(f"  - {p}")
 461
 462 class RuleChecker:
 463
 464     def __init__(self, rules_filename: str = None, test_filename: str = None):
 465         self.rules_filename = rules_filename or ALERTS_FILE
 466         self.test_filename = test_filename or UNIT_TESTS_FILE
 467         self.rule_file: Optional[RuleFile] = None
 468         self.unit_tests: Optional[UnitTests] = None
 469         self.rule_file_problems: bool = False
 470         self.errors = {}
 471         self.warnings = {}
 472         self.error_count = 0
 473         self.warning_count = 0
 474         self.oid_count = 0
 475
 476         self.oid_list = self.build_oid_list()
 477
 478     def build_oid_list(self) -> List[str]:
 479
 480         cmd = shutil.which('snmptranslate')
 481         if not cmd:
 482             return []
 483
 484         rc, stdout, stderr = run_command(f"{cmd} -Pu -Tz -M ../../snmp:/usr/share/snmp/mibs -m CEPH-MIB")
 485         if rc != 0:
 486             return []
 487
 488         oid_list: List[str] = []
 489         for line in stdout[:-1]:
 490             _label, oid = line.replace('"', '').replace('\t', ' ').split()
 491             oid_list.append(oid)
 492
 493         return oid_list
 494
 495     @property
 496     def status(self):
 497         if self.rule_file_problems or self.unit_tests.problems:
 498             return 4
 499
 500         return 0
 501
 502     def mark_invalid(self):
 503         self.rule_file_problems = True
 504
 505     def summarise_rule_file(self):
 506         for group_name in self.rule_file.problems:
 507             group = self.rule_file.group[group_name]
 508             self.error_count += len(group.problems['error'])
 509             self.warning_count += len(group.problems['warning'])
 510
 511     def ready(self):
 512         errs: List[str] = []
 513         ready_state = True
 514         if not os.path.exists(self.rules_filename):
 515             errs.append(f"rule file '{self.rules_filename}' not found")
 516             ready_state = False
 517
 518         if not os.path.exists(self.test_filename):
 519             errs.append(f"test file '{self.test_filename}' not found")
 520             ready_state = False
 521
 522         return ready_state, errs
 523
 524     def run(self):
 525
 526         ready, errs = self.ready()
 527         if not ready:
 528             print("Unable to start:")
 529             for e in errs:
 530                 print(f"- {e}")
 531             sys.exit(16)
 532
 533         rules, errs = load_yaml(self.rules_filename)
 534         if errs:
 535             print(errs)
 536             sys.exit(12)
 537
 538         self.rule_file = RuleFile(self, self.rules_filename, rules, self.oid_list)
 539         self.summarise_rule_file()
 540
 541         self.unit_tests = UnitTests(self.test_filename)
 542         self.unit_tests.process(self.rule_file.alert_names_seen)
 543
 544     def report(self):
 545         print("\n\nSummary\n")
 546         print(f"Rule file             : {self.rules_filename}")
 547         print(f"Unit Test file        : {self.test_filename}")
 548         print(f"\nRule groups processed : {self.rule_file.group_count:>3}")
 549         print(f"Rules processed       : {self.rule_file.rule_count:>3}")
 550         print(f"SNMP OIDs declared    : {self.rule_file.oid_count:>3} {'(snmptranslate missing, unable to cross check)' if not self.oid_list else ''}")
 551         print(f"Rule errors           : {self.error_count:>3}")
 552         print(f"Rule warnings         : {self.warning_count:>3}")
 553         print(f"Rule name duplicates  : {len(self.rule_file.duplicate_alert_names):>3}")
 554         print(f"Unit tests missing    : {len(self.unit_tests.problems):>3}")
 555
 556         self.rule_file.report()
 557         self.unit_tests.report()
 558
 559
 560 def main():
 561     checker = RuleChecker()
 562
 563     checker.run()
 564     checker.report()
 565     print()
 566
 567     sys.exit(checker.status)
 568
 569
# Run the checker only when this file is executed as a script, not when
# it is imported.
if __name__ == '__main__':
    main()