# Source: ceph/monitoring/ceph-mixin/tests_alerts/validate_rules.py
# Check the Prometheus rules for format, and integration
# with the unit tests. This script has the following exit codes:
#   0 .. Everything worked
#   4 .. rule problems or missing unit tests
#   8 .. Missing fields in YAML
#  12 .. Invalid YAML - unable to load
#  16 .. Missing input files
#
# snmptranslate .. used to determine the oid's in the MIB to verify the rule -> MIB is correct
22 from bs4
import BeautifulSoup
23 from typing
import List
, Any
, Dict
, Set
, Optional
, Tuple
28 from urllib
.parse
import urlparse
30 from settings
import ALERTS_FILE
, MIB_FILE
, UNIT_TESTS_FILE
32 DOCLINK_NAME
= 'documentation'
35 def isascii(s
: str) -> bool:
38 except UnicodeEncodeError:
43 def read_file(file_name
: str) -> Tuple
[str, str]:
45 with
open(file_name
, 'r') as input_file
:
46 raw_data
= input_file
.read()
48 return '', f
"Unable to open {file_name}"
53 def load_yaml(file_name
: str) -> Tuple
[Dict
[str, Any
], str]:
57 raw_data
, err
= read_file(file_name
)
61 data
= yaml
.safe_load(raw_data
)
62 except yaml
.YAMLError
as e
:
63 errs
= f
"filename '{file_name} is not a valid YAML file"
68 def run_command(command
: str):
70 completion
= subprocess
.run(c
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.PIPE
)
71 return (completion
.returncode
,
72 completion
.stdout
.decode('utf-8').split('\n'),
73 completion
.stderr
.decode('utf-8').split('\n'))
    def __init__(self) -> None:
        """Create an empty per-URL response cache."""
        # url -> (status_code, page_body_or_reason); populated lazily by fetch()
        self.cache: Dict[str, Tuple[int, str]] = {}
80 def fetch(self
, url_str
: str) -> None:
81 parsed
= urlparse(url_str
)
82 url
= f
"{parsed.scheme}://{parsed.netloc}{parsed.path}"
85 return self
.cache
[url
]
87 req
= urllib
.request
.Request(url
)
89 r
= urllib
.request
.urlopen(req
)
90 except urllib
.error
.HTTPError
as e
:
91 self
.cache
[url
] = e
.code
, e
.reason
92 return self
.cache
[url
]
93 except urllib
.error
.URLError
as e
:
94 self
.cache
[url
] = 400, e
.reason
95 return self
.cache
[url
]
98 html
= r
.read().decode('utf-8')
99 self
.cache
[url
] = 200, html
100 return self
.cache
[url
]
102 self
.cache
[url
] = r
.status
, r
.reason
103 return r
.status
, r
.reason
106 def cached_pages(self
) -> List
[str]:
107 return self
.cache
.keys()
110 def cached_pages_total(self
) -> int:
111 return len(self
.cache
.keys())
113 class PrometheusRule
:
121 def __init__(self
, rule_group
, rule_data
: Dict
[str, Any
]):
123 assert 'alert' in rule_data
124 self
.group
: RuleGroup
= rule_group
125 self
.name
= rule_data
.get('alert')
126 self
.rule
= rule_data
127 self
.errors
: List
[str] = []
128 self
.warnings
: List
[str] = []
133 return True if self
.rule
.get('labels', {}).get('oid', '') else False
136 def labels(self
) -> Dict
[str, str]:
137 return self
.rule
.get('labels', {})
140 def annotations(self
) -> Dict
[str, str]:
141 return self
.rule
.get('annotations', {})
143 def _check_alert_name(self
):
144 # this is simplistic, but works in the context of the alert name
145 if self
.name
[0] in string
.ascii_uppercase
and \
146 self
.name
!= self
.name
.lower() and \
147 self
.name
!= self
.name
.upper() and \
148 " " not in self
.name
and \
149 "_" not in self
.name
:
152 self
.warnings
.append("Alert name is not in CamelCase format")
154 def _check_structure(self
):
155 rule_attrs
= self
.rule
.keys()
156 missing_attrs
= [a
for a
in PrometheusRule
.expected_attrs
if a
not in rule_attrs
]
160 f
"invalid alert structure. Missing field{'s' if len(missing_attrs) > 1 else ''}"
161 f
": {','.join(missing_attrs)}")
163 def _check_labels(self
):
164 for rqd
in ['severity', 'type']:
165 if rqd
not in self
.labels
.keys():
166 self
.errors
.append(f
"rule is missing {rqd} label definition")
168 def _check_annotations(self
):
169 for rqd
in ['summary', 'description']:
170 if rqd
not in self
.annotations
:
171 self
.errors
.append(f
"rule is missing {rqd} annotation definition")
173 def _check_doclink(self
):
174 doclink
= self
.annotations
.get(DOCLINK_NAME
, '')
177 url
= urlparse(doclink
)
178 status
, content
= self
.group
.fetch_html_page(doclink
)
181 soup
= BeautifulSoup(content
, 'html.parser')
182 if not soup
.find(id=url
.fragment
):
183 self
.errors
.append(f
"documentation link error: {url.fragment} anchor not found on the page")
186 self
.errors
.append(f
"documentation link error: {status} {content}")
188 def _check_snmp(self
):
189 oid
= self
.labels
.get('oid', '')
191 if self
.labels
.get('severity', '') == 'critical' and not oid
:
192 self
.warnings
.append("critical level alert is missing an SNMP oid entry")
193 if oid
and not re
.search('^1.3.6.1.4.1.50495.1.2.\\d+.\\d+.\\d+$', oid
):
194 self
.errors
.append("invalid OID format provided")
195 if self
.group
.get_oids():
196 if oid
and oid
not in self
.group
.get_oids():
197 self
.errors
.append(f
"rule defines an OID {oid} that is missing from the MIB file({os.path.basename(MIB_FILE)})")
199 def _check_ascii(self
):
200 if 'oid' not in self
.labels
:
203 desc
= self
.annotations
.get('description', '')
204 summary
= self
.annotations
.get('summary', '')
205 if not isascii(desc
):
206 self
.errors
.append(f
"non-ascii characters found in 'description' field will cause issues in associated snmp trap.")
207 if not isascii(summary
):
208 self
.errors
.append(f
"non-ascii characters found in 'summary' field will cause issues in associated snmp trap.")
212 self
._check
_alert
_name
()
213 self
._check
_structure
()
215 self
._check
_annotations
()
216 self
._check
_doclink
()
223 self
.group
.update('error', self
.name
)
226 self
.group
.update('warning', self
.name
)
228 sys
.stdout
.write(char
)
233 def __init__(self
, rule_file
, group_name
: str, group_name_width
: int):
234 self
.rule_file
: RuleFile
= rule_file
235 self
.group_name
= group_name
236 self
.rules
: Dict
[str, PrometheusRule
] = {}
242 sys
.stdout
.write(f
"\n\t{group_name:<{group_name_width}} : ")
244 def add_rule(self
, rule_data
:Dict
[str, Any
]):
245 alert_name
= rule_data
.get('alert')
246 self
.rules
[alert_name
] = PrometheusRule(self
, rule_data
)
248 def update(self
, problem_type
:str, alert_name
:str):
249 assert problem_type
in ['error', 'warning']
251 self
.problems
[problem_type
].append(alert_name
)
252 self
.rule_file
.update(self
.group_name
)
    def fetch_html_page(self, url):
        """Fetch a documentation page via the owning rule file's HTML cache."""
        return self.rule_file.fetch_html_page(url)
258 return self
.rule_file
.oid_list
261 def error_count(self
):
262 return len(self
.problems
['error'])
264 def warning_count(self
):
265 return len(self
.problems
['warning'])
269 return len(self
.rules
)
274 def __init__(self
, parent
, file_name
, rules
, oid_list
):
276 self
.file_name
= file_name
277 self
.rules
: Dict
[str, Any
] = rules
278 self
.oid_list
= oid_list
279 self
.problems
: Set
[str] = set()
280 self
.group
: Dict
[str, RuleGroup
] = {}
281 self
.alert_names_seen
: Set
[str] = set()
282 self
.duplicate_alert_names
:List
[str] = []
283 self
.html_cache
= HTMLCache()
285 assert 'groups' in self
.rules
286 self
.max_group_name_width
= self
.get_max_group_name()
289 def update(self
, group_name
):
290 self
.problems
.add(group_name
)
291 self
.parent
.mark_invalid()
    def fetch_html_page(self, url):
        """Fetch url through the shared HTMLCache, reusing any cached response."""
        return self.html_cache.fetch(url)
297 def group_count(self
):
298 return len(self
.rules
['groups'])
301 def rule_count(self
):
303 for _group_name
, rule_group
in self
.group
.items():
304 rule_count
+= rule_group
.count
310 for _group_name
, rule_group
in self
.group
.items():
311 for _rule_name
, rule
in rule_group
.rules
.items():
    def group_names(self):
        """Names of the rule groups loaded so far (dict-keys view)."""
        return self.group.keys()
    def problem_count(self):
        """Number of rule groups that reported at least one problem."""
        return len(self.problems)
324 def get_max_group_name(self
):
326 for group
in self
.rules
.get('groups'):
327 group_name_list
.append(group
['name'])
328 return max([len(g
) for g
in group_name_list
])
330 def load_groups(self
):
331 sys
.stdout
.write("\nChecking rule groups")
332 for group
in self
.rules
.get('groups'):
333 group_name
= group
['name']
334 rules
= group
['rules']
335 self
.group
[group_name
] = RuleGroup(self
, group_name
, self
.max_group_name_width
)
336 for rule_data
in rules
:
337 if 'alert' in rule_data
:
338 alert_name
= rule_data
.get('alert')
339 if alert_name
in self
.alert_names_seen
:
340 self
.duplicate_alert_names
.append(alert_name
)
342 self
.alert_names_seen
.add(alert_name
)
343 self
.group
[group_name
].add_rule(rule_data
)
345 # skipped recording rule
349 def max_width(item_list
: Set
[str], min_width
: int = 0) -> int:
350 return max([len(i
) for i
in item_list
] + [min_width
])
352 if not self
.problems
and not self
.duplicate_alert_names
:
353 print("\nNo problems detected in the rule file")
356 print("\nProblem Report\n")
358 group_width
= max_width(self
.problems
, 5)
360 for g
in self
.problems
:
361 group
= self
.group
[g
]
362 alert_names
.update(group
.problems
.get('error', []))
363 alert_names
.update(group
.problems
.get('warning', []))
364 alert_width
= max_width(alert_names
, 10)
366 template
= " {group:<{group_width}} {severity:<8} {alert_name:<{alert_width}} {description}"
368 print(template
.format(
370 group_width
=group_width
,
372 alert_name
="Alert Name",
373 alert_width
=alert_width
,
374 description
="Problem Description"))
376 print(template
.format(
378 group_width
=group_width
,
380 alert_name
="----------",
381 alert_width
=alert_width
,
382 description
="-------------------"))
384 for group_name
in sorted(self
.problems
):
385 group
= self
.group
[group_name
]
387 for alert_name
in group
.problems
.get('error', []):
388 for desc
in rules
[alert_name
].errors
:
389 print(template
.format(
391 group_width
=group_width
,
393 alert_name
=alert_name
,
394 alert_width
=alert_width
,
396 for alert_name
in group
.problems
.get('warning', []):
397 for desc
in rules
[alert_name
].warnings
:
398 print(template
.format(
400 group_width
=group_width
,
402 alert_name
=alert_name
,
403 alert_width
=alert_width
,
405 if self
.duplicate_alert_names
:
406 print("Duplicate alert names detected:")
407 for a
in self
.duplicate_alert_names
:
415 'evaluation_interval'
417 def __init__(self
, filename
):
418 self
.filename
= filename
419 self
.unit_test_data
: Dict
[str, Any
] = {}
420 self
.alert_names_seen
: Set
[str] = set()
421 self
.problems
: List
[str] = []
425 self
.unit_test_data
, errs
= load_yaml(self
.filename
)
427 print(f
"\n\nError in unit tests file: {errs}")
430 missing_attr
= [a
for a
in UnitTests
.expected_attrs
if a
not in self
.unit_test_data
.keys()]
432 print(f
"\nMissing attributes in unit tests: {','.join(missing_attr)}")
435 def _check_alert_names(self
, alert_names
: List
[str]):
436 alerts_tested
: Set
[str] = set()
437 for t
in self
.unit_test_data
.get('tests'):
438 test_cases
= t
.get('alert_rule_test', [])
441 for case
in test_cases
:
442 alertname
= case
.get('alertname', '')
444 alerts_tested
.add(alertname
)
446 alerts_defined
= set(alert_names
)
447 self
.problems
= list(alerts_defined
.difference(alerts_tested
))
    def process(self, defined_alert_names: List[str]):
        """Cross-check the unit tests against the alert names defined in the rules."""
        self._check_alert_names(defined_alert_names)
452 def report(self
) -> None:
454 if not self
.problems
:
455 print("\nNo problems detected in unit tests file")
458 print("\nUnit tests are incomplete. Tests missing for the following alerts;")
459 for p
in self
.problems
:
464 def __init__(self
, rules_filename
: str = None, test_filename
: str = None):
465 self
.rules_filename
= rules_filename
or ALERTS_FILE
466 self
.test_filename
= test_filename
or UNIT_TESTS_FILE
467 self
.rule_file
: Optional
[RuleFile
] = None
468 self
.unit_tests
: Optional
[UnitTests
] = None
469 self
.rule_file_problems
: bool = False
473 self
.warning_count
= 0
476 self
.oid_list
= self
.build_oid_list()
478 def build_oid_list(self
) -> List
[str]:
480 cmd
= shutil
.which('snmptranslate')
484 rc
, stdout
, stderr
= run_command(f
"{cmd} -Pu -Tz -M ../../snmp:/usr/share/snmp/mibs -m CEPH-MIB")
488 oid_list
: List
[str] = []
489 for line
in stdout
[:-1]:
490 _label
, oid
= line
.replace('"', '').replace('\t', ' ').split()
497 if self
.rule_file_problems
or self
.unit_tests
.problems
:
    def mark_invalid(self):
        """Record that at least one rule-file problem was found."""
        self.rule_file_problems = True
505 def summarise_rule_file(self
):
506 for group_name
in self
.rule_file
.problems
:
507 group
= self
.rule_file
.group
[group_name
]
508 self
.error_count
+= len(group
.problems
['error'])
509 self
.warning_count
+= len(group
.problems
['warning'])
514 if not os
.path
.exists(self
.rules_filename
):
515 errs
.append(f
"rule file '{self.rules_filename}' not found")
518 if not os
.path
.exists(self
.test_filename
):
519 errs
.append(f
"test file '{self.test_filename}' not found")
522 return ready_state
, errs
526 ready
, errs
= self
.ready()
528 print("Unable to start:")
533 rules
, errs
= load_yaml(self
.rules_filename
)
538 self
.rule_file
= RuleFile(self
, self
.rules_filename
, rules
, self
.oid_list
)
539 self
.summarise_rule_file()
541 self
.unit_tests
= UnitTests(self
.test_filename
)
542 self
.unit_tests
.process(self
.rule_file
.alert_names_seen
)
545 print("\n\nSummary\n")
546 print(f
"Rule file : {self.rules_filename}")
547 print(f
"Unit Test file : {self.test_filename}")
548 print(f
"\nRule groups processed : {self.rule_file.group_count:>3}")
549 print(f
"Rules processed : {self.rule_file.rule_count:>3}")
550 print(f
"SNMP OIDs declared : {self.rule_file.oid_count:>3} {'(snmptranslate missing, unable to cross check)' if not self.oid_list else ''}")
551 print(f
"Rule errors : {self.error_count:>3}")
552 print(f
"Rule warnings : {self.warning_count:>3}")
553 print(f
"Rule name duplicates : {len(self.rule_file.duplicate_alert_names):>3}")
554 print(f
"Unit tests missing : {len(self.unit_tests.problems):>3}")
556 self
.rule_file
.report()
557 self
.unit_tests
.report()
561 checker
= RuleChecker()
567 sys
.exit(checker
.status
)
570 if __name__
== '__main__':