]> git.proxmox.com Git - ceph.git/blob - ceph/monitoring/ceph-mixin/tests_alerts/validate_rules.py
import quincy beta 17.1.0
[ceph.git] / ceph / monitoring / ceph-mixin / tests_alerts / validate_rules.py
1 #!/usr/bin/env python3
2 #
3 # Check the Prometheus rules for format, and integration
4 # with the unit tests. This script has the following exit
5 # codes:
6 # 0 .. Everything worked
7 # 4 .. rule problems or missing unit tests
8 # 8 .. Missing fields in YAML
9 # 12 .. Invalid YAML - unable to load
10 # 16 .. Missing input files
11 #
12 # Externals
# snmptranslate .. used to list the OIDs defined in the MIB, so that the OID declared by each rule can be checked against the MIB
14 #
15
16 import re
17 import os
18 import sys
19 import yaml
20 import shutil
21 import string
22 from bs4 import BeautifulSoup
23 from typing import List, Any, Dict, Set, Optional, Tuple
24 import subprocess
25
26 import urllib.request
27 import urllib.error
28 from urllib.parse import urlparse
29
30 from settings import ALERTS_FILE, MIB_FILE, UNIT_TESTS_FILE
31
32 DOCLINK_NAME = 'documentation'
33
34
def isascii(s: str) -> bool:
    """Report whether every character of *s* is a 7-bit ASCII character.

    The empty string is considered ASCII.
    """
    return all(ord(ch) < 128 for ch in s)
41
42
def read_file(file_name: str) -> Tuple[str, str]:
    """Read *file_name* as text and return a ``(contents, error)`` pair.

    On success the error string is empty. If opening or reading the file
    raises ``OSError`` the contents are empty and the error string names
    the file.
    """
    try:
        with open(file_name, 'r') as handle:
            return handle.read(), ''
    except OSError:
        return '', f"Unable to open {file_name}"
51
52
def load_yaml(file_name: str) -> Tuple[Dict[str, Any], str]:
    """Load *file_name* as YAML and return a ``(data, error)`` pair.

    The error string is empty on success. It is set, and ``data`` is an
    empty dict, when the file cannot be read or does not parse as YAML.
    An empty document is returned as an empty dict rather than ``None`` so
    callers can always treat the result as a mapping.
    """
    raw_data, err = read_file(file_name)
    if err:
        # Report the read failure instead of pretending the file was empty.
        return {}, err

    try:
        data = yaml.safe_load(raw_data)
    except yaml.YAMLError:
        return {}, f"filename '{file_name}' is not a valid YAML file"

    # safe_load() yields None for an empty document.
    if data is None:
        data = {}
    return data, ''
66
67
def run_command(command: str):
    """Run *command* without a shell and capture what it prints.

    The command string is split on whitespace to build the argument list.
    Returns ``(returncode, stdout_lines, stderr_lines)``, where both streams
    are decoded as UTF-8 and split on newline characters.
    """
    argv = command.split()
    proc = subprocess.run(argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out_lines, err_lines = (
        stream.decode('utf-8').split('\n')
        for stream in (proc.stdout, proc.stderr)
    )
    return proc.returncode, out_lines, err_lines
74
75
class HTMLCache:
    """Fetch pages by URL and remember the outcome of every fetch.

    Entries are keyed by ``scheme://netloc/path``; the query string and the
    fragment of the requested URL are not part of the key, so links that
    differ only in their anchor share one download.
    """

    def __init__(self) -> None:
        # normalised url -> (status, page text when status is 200, else a reason)
        self.cache: Dict[str, Tuple[int, str]] = {}

    def fetch(self, url_str: str) -> Tuple[int, str]:
        """Return ``(status, content)`` for *url_str*, downloading at most once.

        ``content`` is the decoded page when ``status`` is 200, otherwise a
        textual reason. Failures are cached as well, so a broken link is
        only tried once.
        """
        parsed = urlparse(url_str)
        url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"

        if url not in self.cache:
            self.cache[url] = self._download(url)
        return self.cache[url]

    @staticmethod
    def _download(url: str) -> Tuple[int, str]:
        """Retrieve *url* and translate the outcome into ``(status, text)``."""
        try:
            request = urllib.request.Request(url)
        except ValueError as e:
            # Raised for links without a usable scheme; report it as a bad
            # link rather than aborting the whole run.
            return 400, str(e)

        try:
            # The context manager makes sure the connection is closed.
            with urllib.request.urlopen(request) as response:
                status, reason = response.status, response.reason
                body = response.read() if status == 200 else b''
        except urllib.error.HTTPError as e:
            return e.code, str(e.reason)
        except urllib.error.URLError as e:
            # e.reason may be an exception instance; keep the cache textual.
            return 400, str(e.reason)

        if status == 200:
            return 200, body.decode('utf-8')
        return status, str(reason)

    @property
    def cached_pages(self) -> List[str]:
        """URLs that have been fetched (successfully or not)."""
        return list(self.cache)

    @property
    def cached_pages_total(self) -> int:
        """Number of distinct URLs that have been fetched."""
        return len(self.cache)
112
class PrometheusRule:
    """One alerting rule from the rules file, validated as soon as it is created.

    Findings of the ``_check_*`` methods are collected in ``errors`` and
    ``warnings``. After validation the owning group is notified through
    ``group.update()`` (errors take precedence over warnings) and a single
    progress character (``.``, ``W`` or ``E``) is written to stdout.
    """

    expected_attrs = [
        'alert',
        'expr',
        'labels',
        'annotations'
    ]

    # The dots are escaped so that only literal '.' separators are accepted.
    _oid_pattern = re.compile(r'1\.3\.6\.1\.4\.1\.50495\.1\.2\.\d+\.\d+\.\d+')

    def __init__(self, rule_group, rule_data: Dict[str, Any]):
        """Store the rule definition and validate it immediately.

        rule_group: the owning group; must provide ``update()``,
            ``fetch_html_page()`` and ``get_oids()``.
        rule_data: the rule mapping as loaded from YAML; must contain 'alert'.
        """
        assert 'alert' in rule_data
        self.group: 'RuleGroup' = rule_group
        self.name = rule_data.get('alert')
        self.rule = rule_data
        self.errors: List[str] = []
        self.warnings: List[str] = []
        self.validate()

    @property
    def has_oid(self):
        """True when the rule carries a non-empty 'oid' label."""
        return bool(self.labels.get('oid', ''))

    @property
    def labels(self) -> Dict[str, str]:
        """The rule's labels; empty when the key is missing or null."""
        return self.rule.get('labels') or {}

    @property
    def annotations(self) -> Dict[str, str]:
        """The rule's annotations; empty when the key is missing or null."""
        return self.rule.get('annotations') or {}

    def _check_alert_name(self):
        """Warn unless the alert name looks like CamelCase."""
        name = self.name
        # this is simplistic, but works in the context of the alert name
        if isinstance(name, str) and name and \
           name[0] in string.ascii_uppercase and \
           name != name.lower() and \
           name != name.upper() and \
           " " not in name and \
           "_" not in name:
            return

        self.warnings.append("Alert name is not in CamelCase format")

    def _check_structure(self):
        """Record an error naming every expected top-level field that is absent."""
        missing_attrs = [a for a in PrometheusRule.expected_attrs if a not in self.rule]

        if missing_attrs:
            self.errors.append(
                f"invalid alert structure. Missing field{'s' if len(missing_attrs) > 1 else ''}"
                f": {','.join(missing_attrs)}")

    def _check_labels(self):
        """Record an error for each required label that is absent."""
        for rqd in ['severity', 'type']:
            if rqd not in self.labels:
                self.errors.append(f"rule is missing {rqd} label definition")

    def _check_annotations(self):
        """Record an error for each required annotation that is absent."""
        for rqd in ['summary', 'description']:
            if rqd not in self.annotations:
                self.errors.append(f"rule is missing {rqd} annotation definition")

    def _check_doclink(self):
        """Check that the documentation link resolves and that its anchor exists."""
        doclink = self.annotations.get(DOCLINK_NAME, '')
        if not doclink:
            return

        url = urlparse(doclink)
        status, content = self.group.fetch_html_page(doclink)
        if status == 200:
            if url.fragment:
                soup = BeautifulSoup(content, 'html.parser')
                if not soup.find(id=url.fragment):
                    self.errors.append(f"documentation link error: {url.fragment} anchor not found on the page")
        else:
            # catch all
            self.errors.append(f"documentation link error: {status} {content}")

    def _check_snmp(self):
        """Validate the 'oid' label's format and its presence in the MIB."""
        oid = self.labels.get('oid', '')

        if self.labels.get('severity', '') == 'critical' and not oid:
            self.warnings.append("critical level alert is missing an SNMP oid entry")
        if not oid:
            return

        if not self._oid_pattern.fullmatch(str(oid)):
            self.errors.append("invalid OID format provided")

        # An empty list means no MIB information is available to check against.
        known_oids = self.group.get_oids()
        if known_oids and oid not in known_oids:
            self.errors.append(f"rule defines an OID {oid} that is missing from the MIB file({os.path.basename(MIB_FILE)})")

    def _check_ascii(self):
        """For rules with an 'oid' label, require ASCII-only summary and description."""
        if 'oid' not in self.labels:
            return

        desc = self.annotations.get('description', '')
        summary = self.annotations.get('summary', '')
        if not isascii(desc):
            self.errors.append("non-ascii characters found in 'description' field will cause issues in associated snmp trap.")
        if not isascii(summary):
            self.errors.append("non-ascii characters found in 'summary' field will cause issues in associated snmp trap.")

    def validate(self):
        """Run every check, notify the group of problems and print a progress mark."""
        self._check_alert_name()
        self._check_structure()
        self._check_labels()
        self._check_annotations()
        self._check_doclink()
        self._check_snmp()
        self._check_ascii()
        char = '.'

        if self.errors:
            char = 'E'
            self.group.update('error', self.name)
        elif self.warnings:
            char = 'W'
            self.group.update('warning', self.name)

        sys.stdout.write(char)
229
230
class RuleGroup:
    """A named group of alert rules and the names of its problematic alerts."""

    def __init__(self, rule_file, group_name: str, group_name_width: int):
        """Create an empty group and print its left-aligned name as a progress heading."""
        self.rule_file: RuleFile = rule_file
        self.group_name = group_name
        self.rules: Dict[str, PrometheusRule] = {}
        # alert names, bucketed by the severity of the problem found
        self.problems = {"error": [], "warning": []}

        heading = f"\n\t{group_name:<{group_name_width}} : "
        sys.stdout.write(heading)

    def add_rule(self, rule_data: Dict[str, Any]):
        """Build (and thereby validate) a rule and file it under its alert name."""
        key = rule_data.get('alert')
        self.rules[key] = PrometheusRule(self, rule_data)

    def update(self, problem_type: str, alert_name: str):
        """Note that *alert_name* has a problem and flag this group on the rule file."""
        assert problem_type in ('error', 'warning')

        bucket = self.problems[problem_type]
        bucket.append(alert_name)
        self.rule_file.update(self.group_name)

    def fetch_html_page(self, url):
        """Delegate the page fetch to the owning rule file."""
        return self.rule_file.fetch_html_page(url)

    def get_oids(self):
        """Return the OID list held by the owning rule file."""
        return self.rule_file.oid_list

    @property
    def error_count(self):
        """Number of alerts recorded with errors."""
        return len(self.problems['error'])

    # NOTE(review): unlike error_count and count this is a plain method, not
    # a property; kept that way so that existing callers keep working.
    def warning_count(self):
        """Number of alerts recorded with warnings."""
        return len(self.problems['warning'])

    @property
    def count(self):
        """Number of rules held by the group."""
        return len(self.rules)
270
271
class RuleFile:
    """The loaded alert rules file: its groups, its alerts and their problems.

    Constructing the object walks every group and builds (and therefore
    validates) every alert rule; progress is written to stdout as it goes.
    """

    def __init__(self, parent, file_name, rules, oid_list):
        """
        parent: object providing ``mark_invalid()``, called when a rule has problems.
        file_name: name of the rules file.
        rules: the parsed YAML content; must contain a 'groups' key.
        oid_list: OIDs that rules are cross-checked against.
        """
        self.parent = parent
        self.file_name = file_name
        self.rules: Dict[str, Any] = rules
        self.oid_list = oid_list
        self.problems: Set[str] = set()  # names of groups with problem rules
        self.group: Dict[str, RuleGroup] = {}
        self.alert_names_seen: Set[str] = set()
        self.duplicate_alert_names: List[str] = []
        self.html_cache = HTMLCache()

        assert 'groups' in self.rules
        self.max_group_name_width = self.get_max_group_name()
        self.load_groups()

    @property
    def _group_defs(self) -> List[Dict[str, Any]]:
        """Raw group definitions; an empty list when 'groups' is missing or null."""
        return self.rules.get('groups') or []

    def update(self, group_name):
        """Record that *group_name* contains a problem rule and flag the parent."""
        self.problems.add(group_name)
        self.parent.mark_invalid()

    def fetch_html_page(self, url):
        """Fetch *url* through the per-file page cache."""
        return self.html_cache.fetch(url)

    @property
    def group_count(self):
        """Number of groups defined in the file."""
        return len(self._group_defs)

    @property
    def rule_count(self):
        """Number of alert rules loaded across all groups."""
        return sum(rule_group.count for rule_group in self.group.values())

    @property
    def oid_count(self):
        """Number of loaded alert rules that declare an OID."""
        return sum(
            1
            for rule_group in self.group.values()
            for rule in rule_group.rules.values()
            if rule.has_oid
        )

    @property
    def group_names(self):
        """Names of the loaded groups."""
        return self.group.keys()

    @property
    def problem_count(self):
        """Number of groups that contain at least one problem rule."""
        return len(self.problems)

    def get_max_group_name(self):
        """Length of the longest group name, or 0 when there are no groups."""
        return max((len(group['name']) for group in self._group_defs), default=0)

    def load_groups(self):
        """Create a RuleGroup per group and add every alert rule to it.

        Entries without an 'alert' key (recording rules) are skipped. An
        alert name that was already seen is recorded as a duplicate and the
        repeated rule is not added.
        """
        sys.stdout.write("\nChecking rule groups")
        for group in self._group_defs:
            group_name = group['name']
            rule_group = RuleGroup(self, group_name, self.max_group_name_width)
            self.group[group_name] = rule_group
            for rule_data in group.get('rules') or []:
                if 'alert' not in rule_data:
                    # skipped recording rule
                    continue
                alert_name = rule_data.get('alert')
                if alert_name in self.alert_names_seen:
                    self.duplicate_alert_names.append(alert_name)
                else:
                    self.alert_names_seen.add(alert_name)
                    rule_group.add_rule(rule_data)

    def report(self):
        """Print a table of rule problems followed by any duplicate alert names."""
        def max_width(item_list: Set[str], min_width: int = 0) -> int:
            return max([len(i) for i in item_list] + [min_width])

        if not self.problems and not self.duplicate_alert_names:
            print("\nNo problems detected in the rule file")
            return

        print("\nProblem Report\n")

        group_width = max_width(self.problems, 5)
        alert_names = set()
        for g in self.problems:
            group = self.group[g]
            alert_names.update(group.problems.get('error', []))
            alert_names.update(group.problems.get('warning', []))
        alert_width = max_width(alert_names, 10)

        template = " {group:<{group_width}} {severity:<8} {alert_name:<{alert_width}} {description}"

        def emit(group, severity, alert_name, description):
            # One table row; the column widths are shared by every row.
            print(template.format(
                group=group,
                group_width=group_width,
                severity=severity,
                alert_name=alert_name,
                alert_width=alert_width,
                description=description))

        emit("Group", "Severity", "Alert Name", "Problem Description")
        emit("-----", "--------", "----------", "-------------------")

        for group_name in sorted(self.problems):
            group = self.group[group_name]
            rules = group.rules
            for alert_name in group.problems.get('error', []):
                for desc in rules[alert_name].errors:
                    emit(group_name, "Error", alert_name, desc)
            for alert_name in group.problems.get('warning', []):
                for desc in rules[alert_name].warnings:
                    emit(group_name, "Warning", alert_name, desc)

        if self.duplicate_alert_names:
            print("Duplicate alert names detected:")
            for a in self.duplicate_alert_names:
                print(f" - {a}")
409
410
class UnitTests:
    """The Prometheus unit-test file, used to find alerts that have no test."""

    expected_attrs = [
        'rule_files',
        'tests',
        'evaluation_interval'
    ]

    def __init__(self, filename):
        """Load *filename*; exits the process when it is unusable (see ``load``)."""
        self.filename = filename
        self.unit_test_data: Dict[str, Any] = {}
        self.alert_names_seen: Set[str] = set()
        self.problems: List[str] = []  # alert names without a unit test
        self.load()

    def load(self):
        """Read the unit-test YAML.

        Exits with status 12 when the file cannot be loaded and with status 8
        when one of the expected top-level keys is absent.
        """
        self.unit_test_data, errs = load_yaml(self.filename)
        if errs:
            print(f"\n\nError in unit tests file: {errs}")
            sys.exit(12)

        missing_attr = [a for a in UnitTests.expected_attrs if a not in self.unit_test_data]
        if missing_attr:
            print(f"\nMissing attributes in unit tests: {','.join(missing_attr)}")
            sys.exit(8)

    def _check_alert_names(self, alert_names: List[str]):
        """Store in ``problems`` the alerts that no 'alert_rule_test' case names."""
        alerts_tested: Set[str] = set()
        # 'tests' and 'alert_rule_test' may be absent or null in the YAML.
        for test in self.unit_test_data.get('tests') or []:
            for case in test.get('alert_rule_test') or []:
                alertname = case.get('alertname', '')
                if alertname:
                    alerts_tested.add(alertname)

        # Sorted so that the report is the same from one run to the next.
        self.problems = sorted(set(alert_names) - alerts_tested, key=str)

    def process(self, defined_alert_names: List[str]):
        """Compare the alerts defined in the rules file with those under test."""
        self._check_alert_names(defined_alert_names)

    def report(self) -> None:
        """Print the alerts lacking a unit test, or a note that none are missing."""
        if not self.problems:
            print("\nNo problems detected in unit tests file")
            return

        print("\nUnit tests are incomplete. Tests missing for the following alerts;")
        for p in self.problems:
            print(f" - {p}")
461
class RuleChecker:
    """Top-level driver: validates the rules file and checks unit-test coverage."""

    def __init__(self, rules_filename: Optional[str] = None, test_filename: Optional[str] = None):
        """
        rules_filename: rules file to check; defaults to ALERTS_FILE.
        test_filename: unit-test file to check; defaults to UNIT_TESTS_FILE.
        """
        self.rules_filename = rules_filename or ALERTS_FILE
        self.test_filename = test_filename or UNIT_TESTS_FILE
        self.rule_file: Optional[RuleFile] = None
        self.unit_tests: Optional[UnitTests] = None
        self.rule_file_problems: bool = False
        self.errors = {}
        self.warnings = {}
        self.error_count = 0
        self.warning_count = 0
        self.oid_count = 0

        self.oid_list = self.build_oid_list()

    @staticmethod
    def _parse_oid_lines(lines: List[str]) -> List[str]:
        """Pick the OID out of every '"label" "oid"' line; other lines are ignored."""
        oid_list: List[str] = []
        for line in lines:
            fields = line.replace('"', '').split()
            if len(fields) != 2:
                # blank line or unexpected output, not a label/oid pair
                continue
            oid_list.append(fields[1])
        return oid_list

    def build_oid_list(self) -> List[str]:
        """Return the OIDs that snmptranslate reports for CEPH-MIB.

        An empty list is returned when snmptranslate is not installed or
        exits with a non-zero status.
        """
        cmd = shutil.which('snmptranslate')
        if not cmd:
            return []

        rc, stdout, _stderr = run_command(f"{cmd} -Pu -Tz -M ../../snmp:/usr/share/snmp/mibs -m CEPH-MIB")
        if rc != 0:
            return []

        return self._parse_oid_lines(stdout)

    @property
    def status(self):
        """Exit status: 4 when rule problems or untested alerts exist, else 0."""
        if self.rule_file_problems or self.unit_tests.problems:
            return 4

        return 0

    def mark_invalid(self):
        """Called by the rule file when a rule has an error or a warning."""
        self.rule_file_problems = True

    def summarise_rule_file(self):
        """Add the per-group error and warning totals to the checker's counters."""
        for group_name in self.rule_file.problems:
            group = self.rule_file.group[group_name]
            self.error_count += len(group.problems['error'])
            self.warning_count += len(group.problems['warning'])

    def ready(self):
        """Return ``(ok, errors)``; ``ok`` is False when an input file is missing."""
        errs: List[str] = []
        if not os.path.exists(self.rules_filename):
            errs.append(f"rule file '{self.rules_filename}' not found")

        if not os.path.exists(self.test_filename):
            errs.append(f"test file '{self.test_filename}' not found")

        return not errs, errs

    def run(self):
        """Load and validate the rules, then check them against the unit tests.

        Exits with status 16 when an input file is missing and with status
        12 when the rules file cannot be loaded.
        """
        ready, errs = self.ready()
        if not ready:
            print("Unable to start:")
            for e in errs:
                print(f"- {e}")
            sys.exit(16)

        rules, errs = load_yaml(self.rules_filename)
        if errs:
            print(errs)
            sys.exit(12)

        self.rule_file = RuleFile(self, self.rules_filename, rules, self.oid_list)
        self.summarise_rule_file()

        self.unit_tests = UnitTests(self.test_filename)
        self.unit_tests.process(self.rule_file.alert_names_seen)

    def report(self):
        """Print the summary followed by the rule and unit-test reports."""
        print("\n\nSummary\n")
        print(f"Rule file : {self.rules_filename}")
        print(f"Unit Test file : {self.test_filename}")
        print(f"\nRule groups processed : {self.rule_file.group_count:>3}")
        print(f"Rules processed : {self.rule_file.rule_count:>3}")
        print(f"SNMP OIDs declared : {self.rule_file.oid_count:>3} {'(snmptranslate missing, unable to cross check)' if not self.oid_list else ''}")
        print(f"Rule errors : {self.error_count:>3}")
        print(f"Rule warnings : {self.warning_count:>3}")
        print(f"Rule name duplicates : {len(self.rule_file.duplicate_alert_names):>3}")
        print(f"Unit tests missing : {len(self.unit_tests.problems):>3}")

        self.rule_file.report()
        self.unit_tests.report()
558
559
def main():
    """Check the default rule and unit-test files, report, and exit.

    The process exit status is the checker's ``status`` value.
    """
    rule_checker = RuleChecker()
    rule_checker.run()
    rule_checker.report()
    print()

    exit_code = rule_checker.status
    sys.exit(exit_code)


if __name__ == '__main__':
    main()