]>
Commit | Line | Data |
---|---|---|
9f95a23c | 1 | import hashlib |
20effc67 | 2 | from mgr_module import CLICommand, CLIReadCommand, CLIWriteCommand, MgrModule, Option |
11fdf7f2 TL |
3 | import datetime |
4 | import errno | |
20effc67 TL |
5 | import functools |
6 | import inspect | |
11fdf7f2 | 7 | import json |
11fdf7f2 | 8 | from collections import defaultdict |
eafe8130 | 9 | from prettytable import PrettyTable |
9f95a23c | 10 | import re |
adb31ebb | 11 | from threading import Event, Lock |
05a536ef | 12 | from typing import cast, Any, Callable, DefaultDict, Dict, Iterable, List, Optional, Tuple, TypeVar, \ |
20effc67 | 13 | Union, TYPE_CHECKING |
11fdf7f2 TL |
14 | |
15 | ||
# strptime() layouts tried, in this order, when parsing crash timestamps:
# the current one uses a 'T' between date and time, the older one a space.
DATEFMT = '%Y-%m-%dT%H:%M:%S.%f'
OLD_DATEFMT = '%Y-%m-%d %H:%M:%S.%f'

# Upper and lower bounds (seconds) applied to the serve() loop sleep.
MAX_WAIT = 10 * 60
MIN_WAIT = 60
11fdf7f2 | 21 | |
20effc67 TL |
22 | |
FuncT = TypeVar('FuncT', bound=Callable)


def with_crashes(func: FuncT) -> FuncT:
    """Decorate a method so it runs with the crash cache locked and loaded.

    The returned callable acquires ``self.crashes_lock``, calls
    ``self._load_crashes()`` when ``self.crashes`` is falsy (``None`` or
    empty), and then invokes ``func`` with the original arguments.

    :param func: the method to wrap
    :returns: the wrapping callable, carrying ``func``'s metadata and
              signature
    """
    def locked_call(self: 'Module', *args: Any, **kwargs: Any) -> Tuple[int, str, str]:
        with self.crashes_lock:
            if not self.crashes:
                self._load_crashes()
            outcome = func(self, *args, **kwargs)
        return outcome

    functools.update_wrapper(locked_call, func)
    # Expose the wrapped function's real signature instead of
    # (*args, **kwargs) to anything that introspects the wrapper.
    locked_call.__signature__ = inspect.signature(func)  # type: ignore[attr-defined]
    return cast(FuncT, locked_call)
35 | ||
36 | ||
37 | CrashT = Dict[str, Union[str, List[str]]] | |
38 | ||
39 | ||
11fdf7f2 | 40 | class Module(MgrModule): |
eafe8130 | 41 | MODULE_OPTIONS = [ |
20effc67 TL |
42 | Option( |
43 | name='warn_recent_interval', | |
44 | type='secs', | |
45 | default=60 * 60 * 24 * 14, | |
46 | desc='time interval in which to warn about recent crashes', | |
47 | runtime=True), | |
48 | Option( | |
49 | name='retain_interval', | |
50 | type='secs', | |
51 | default=60 * 60 * 24 * 365, | |
52 | desc='how long to retain crashes before pruning them', | |
53 | runtime=True), | |
eafe8130 | 54 | ] |
11fdf7f2 | 55 | |
def __init__(self, *args: Any, **kwargs: Any) -> None:
    """Initialise module state; all arguments go to the base class."""
    super(Module, self).__init__(*args, **kwargs)

    # serve() loop control: `run` is the keep-going flag, `event` is
    # what the loop sleeps on.
    self.event = Event()
    self.run = True

    # In-memory crash cache (None until _load_crashes() has run) and the
    # lock used by serve() and the with_crashes decorator around it.
    self.crashes_lock = Lock()
    self.crashes: Optional[Dict[str, CrashT]] = None

    if TYPE_CHECKING:
        # Declared for the type checker only; config_notify() assigns
        # the real values at runtime.
        self.warn_recent_interval = 0.0
        self.retain_interval = 0.0
eafe8130 | 65 | |
def shutdown(self) -> None:
    """Ask the serve() loop to stop and wake it if it is sleeping."""
    # Clear the flag first so the loop sees it as soon as it wakes up.
    self.run = False
    self.event.set()
69 | ||
def serve(self) -> None:
    """Periodically refresh health checks and prune old crashes.

    Runs until ``self.run`` becomes false.
    """
    self.config_notify()
    while self.run:
        with self.crashes_lock:
            self._refresh_health_checks()
            self._prune(self.retain_interval)
        # Sleep 1/100 of the warning interval, but never less than
        # MIN_WAIT nor more than MAX_WAIT.
        lower_bounded = max(self.warn_recent_interval / 100, MIN_WAIT)
        self.event.wait(min(MAX_WAIT, lower_bounded))
        self.event.clear()
79 | ||
def config_notify(self) -> None:
    """Copy every MODULE_OPTIONS value onto an attribute of the same name."""
    for option in self.MODULE_OPTIONS:
        name = option['name']
        setattr(self, name, self.get_module_option(name))
        self.log.debug(' mgr option %s = %s', name, getattr(self, name))
87 | ||
def _load_crashes(self) -> None:
    """Rebuild ``self.crashes`` from the entries stored under 'crash/'."""
    prefix = 'crash/'
    stored = self.get_store_prefix(prefix)
    loaded = {}
    for key, blob in stored.items():
        # Cache keys are the store keys with the 'crash/' prefix cut off.
        loaded[key[len(prefix):]] = json.loads(blob)
    self.crashes = loaded
91 | ||
def _refresh_health_checks(self) -> None:
    """Recompute and publish the crash-related health checks.

    A crash is "recent" when its timestamp is newer than
    ``warn_recent_interval`` seconds ago and it has no 'archived' key.
    Recent crashes carrying an 'mgr_module' key are reported under
    RECENT_MGR_MODULE_CRASH, all others under RECENT_CRASH.  Each
    check's detail list is capped at 30 entries plus an
    'and N more' line; 'count' always holds the untruncated number.
    """
    if not self.crashes:
        self._load_crashes()
    assert self.crashes is not None
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(
        seconds=self.warn_recent_interval)
    recent = {
        crashid: crash for crashid, crash in self.crashes.items()
        if (self.time_from_string(cast(str, crash['timestamp'])) > cutoff
            and 'archived' not in crash)
    }

    def prune_detail(ls: List[str]) -> int:
        """Cap ``ls`` at 30 entries in place; return its original length."""
        num = len(ls)
        if num > 30:
            # Truncate the caller's list itself.  Rebinding the local
            # name to a slice would leave the caller's list untouched.
            del ls[30:]
            ls.append('and %d more' % (num - 30))
        return num

    daemon_crashes = []
    module_crashes = []
    for c in recent.values():
        if 'mgr_module' in c:
            module_crashes.append(c)
        else:
            daemon_crashes.append(c)
    daemon_detail = [
        '%s crashed on host %s at %s' % (
            crash.get('entity_name', 'unidentified daemon'),
            crash.get('utsname_hostname', '(unknown)'),
            crash.get('timestamp', 'unknown time'))
        for crash in daemon_crashes]
    module_detail = [
        'mgr module %s crashed in daemon %s on host %s at %s' % (
            crash.get('mgr_module', 'unidentified module'),
            crash.get('entity_name', 'unidentified daemon'),
            crash.get('utsname_hostname', '(unknown)'),
            crash.get('timestamp', 'unknown time'))
        for crash in module_crashes]
    daemon_num = prune_detail(daemon_detail)
    module_num = prune_detail(module_detail)

    health_checks: Dict[str, Dict[str, Union[int, str, List[str]]]] = {}
    if daemon_detail:
        self.log.debug('daemon detail %s', daemon_detail)
        health_checks['RECENT_CRASH'] = {
            'severity': 'warning',
            'summary': '%d daemons have recently crashed' % (daemon_num),
            'count': daemon_num,
            'detail': daemon_detail,
        }
    if module_detail:
        self.log.debug('module detail %s', module_detail)
        health_checks['RECENT_MGR_MODULE_CRASH'] = {
            'severity': 'warning',
            'summary': '%d mgr modules have recently crashed' % (module_num),
            'count': module_num,
            'detail': module_detail,
        }

    # An empty dict is passed when nothing is recent.
    self.set_health_checks(health_checks)
11fdf7f2 | 153 | |
def time_from_string(self, timestr: str) -> datetime.datetime:
    """Parse a crash timestamp string into a naive datetime.

    :param timestr: timestamp in DATEFMT or OLD_DATEFMT layout, with an
                    optional trailing 'Z'
    :returns: the parsed datetime (no tzinfo attached)
    :raises ValueError: if neither layout matches
    """
    # The 'Z' suffix only marks UTC, which is always the case; drop it.
    stripped = timestr.rstrip('Z')
    try:
        parsed = datetime.datetime.strptime(stripped, DATEFMT)
    except ValueError:
        # Not the current layout; fall back to the older one.
        parsed = datetime.datetime.strptime(stripped, OLD_DATEFMT)
    return parsed
11fdf7f2 | 161 | |
def validate_crash_metadata(self, inbuf: str) -> Dict[str, Union[str, List[str]]]:
    """Decode crash metadata JSON and check the mandatory fields.

    No exception is handled here; everything propagates to the caller.

    :param inbuf: JSON text describing one crash
    :returns: the decoded metadata
    :raises AttributeError: if 'crash_id' or 'timestamp' is missing
    :raises ValueError: if the JSON or the timestamp cannot be parsed
    """
    metadata = json.loads(inbuf)
    absent = [field for field in ('crash_id', 'timestamp')
              if field not in metadata]
    if absent:
        # Report the first missing field, in declaration order.
        raise AttributeError("missing '%s' field" % absent[0])
    # Parsed only to prove the timestamp is well-formed.
    self.time_from_string(metadata['timestamp'])
    return metadata
170 | ||
def timestamp_filter(self, f: Callable[[datetime.datetime], bool]) -> Iterable[Tuple[str, CrashT]]:
    """
    Filter crash reports by timestamp.

    :param f: f(time) return true to keep crash report
    :returns: lazy iterable of (crashid, crash) pairs for which f(time)
              returns true
    """
    assert self.crashes is not None
    # Evaluated lazily: f is called as the result is consumed.
    return (
        pair for pair in self.crashes.items()
        if f(self.time_from_string(cast(str, pair[1]["timestamp"])))
    )
11fdf7f2 | 184 | |
9f95a23c TL |
185 | # stack signature helpers |
186 | ||
def sanitize_backtrace(self, bt: List[str]) -> List[str]:
    """Reduce backtrace records to the leading part of each frame.

    For every record: keep the text before the last space, strip that
    text's first and last character (the enclosing parentheses), and
    cut at the first '+' so the offset is dropped.

    :param bt: backtrace records
    :returns: one reduced string per record, in order
    """
    return [
        record.rsplit(' ', 1)[0][1:-1].partition('+')[0]
        for record in bt
    ]
196 | ||
ASSERT_MATCHEXPR = re.compile(r'(?s)(.*) thread .* time .*(: .*)\n')

def sanitize_assert_msg(self, msg: str) -> str:
    """Strip the run-specific parts out of an assert message.

    :param msg: raw assert message
    :returns: the two captured groups joined together, or ``msg``
              unchanged when it does not match ASSERT_MATCHEXPR
    """
    # (?s) allows matching newline.  get everything up to "thread" and
    # then after-and-including the last colon-space.  This skips the
    # thread id, timestamp, and file:lineno, because file is already in
    # the beginning, and lineno may vary.
    matched = self.ASSERT_MATCHEXPR.match(msg)
    if matched is None:
        # The message comes from posted crash metadata, so an unexpected
        # layout is an input problem, not a programming error.  Fall
        # back to the raw text rather than aborting the whole post.
        return msg
    return ''.join(matched.groups())
9f95a23c | 207 | |
def calc_sig(self, bt: List[str], assert_msg: Optional[str]) -> str:
    """Compute the stack signature of a crash.

    :param bt: backtrace records
    :param assert_msg: assert message, if any; falsy values are skipped
    :returns: hex SHA-256 digest over the sanitized backtrace entries
              followed by the sanitized assert message
    """
    digest = hashlib.sha256()
    for frame in self.sanitize_backtrace(bt):
        digest.update(frame.encode())
    if assert_msg:
        digest.update(self.sanitize_assert_msg(assert_msg).encode())
    return digest.hexdigest()
215 | ||
11fdf7f2 TL |
216 | # command handlers |
217 | ||
@CLIReadCommand('crash info')
@with_crashes
def do_info(self, id: str) -> Tuple[int, str, str]:
    """
    show crash dump metadata
    """
    # Returns (0, pretty-printed JSON, '') on success and
    # (EINVAL, '', message) when the id is unknown or maps to an
    # empty record.
    assert self.crashes is not None
    crash = self.crashes.get(id)
    if crash:
        return 0, json.dumps(crash, indent=4, sort_keys=True), ''
    return errno.EINVAL, '', 'crash info: %s not found' % id
231 | ||
@CLICommand('crash post')
@with_crashes
def do_post(self, inbuf: str) -> Tuple[int, str, str]:
    """
    Add a crash dump (use -i <jsonfile>)
    """
    # Wrapped in with_crashes like the other handlers that touch
    # self.crashes: the cache is loaded before it is consulted and the
    # update happens under crashes_lock.
    try:
        metadata = self.validate_crash_metadata(inbuf)
    except Exception as e:
        # Any decode/validation failure is reported as EINVAL.
        return errno.EINVAL, '', 'malformed crash metadata: %s' % e
    if 'backtrace' in metadata:
        backtrace = cast(List[str], metadata.get('backtrace'))
        assert_msg = cast(Optional[str], metadata.get('assert_msg'))
        metadata['stack_sig'] = self.calc_sig(backtrace, assert_msg)
    crashid = cast(str, metadata['crash_id'])
    assert self.crashes is not None
    # Re-posting an already known crash id is a successful no-op.
    if crashid not in self.crashes:
        self.crashes[crashid] = metadata
        key = 'crash/%s' % crashid
        self.set_store(key, json.dumps(metadata))
        self._refresh_health_checks()
    return 0, '', ''
253 | ||
def ls(self) -> Tuple[int, str, str]:
    """List all crashes by delegating to do_ls_all with format ''."""
    crashes_missing = not self.crashes
    if crashes_missing:
        self._load_crashes()
    return self.do_ls_all('')
eafe8130 | 258 | |
def _do_ls(self, t: Iterable[CrashT], format: Optional[str]) -> Tuple[int, str, str]:
    """Render crash records, sorted by crash id, as JSON or a table.

    :param t: crash records to list
    :param format: 'json' or 'json-pretty' selects JSON output; any
                   other value produces the plain-text table
    :returns: (0, rendered listing, '')
    """
    ordered = sorted(t, key=lambda entry: entry['crash_id'])
    if format in ('json', 'json-pretty'):
        return 0, json.dumps(ordered, indent=4, sort_keys=True), ''

    table = PrettyTable(['ID', 'ENTITY', 'NEW'], border=False)
    table.left_padding_width = 0
    table.right_padding_width = 2
    for column in ('ID', 'ENTITY'):
        table.align[column] = 'l'
    for entry in ordered:
        # '*' flags crashes that have not been archived yet.
        new_marker = '*' if 'archived' not in entry else ''
        table.add_row([entry.get('crash_id'),
                       entry.get('entity_name', 'unknown'),
                       new_marker])
    return 0, table.get_string(), ''
11fdf7f2 | 275 | |
@CLIReadCommand('crash ls')
@with_crashes
def do_ls_all(self, format: Optional[str] = None) -> Tuple[int, str, str]:
    """
    Show new and archived crash dumps
    """
    assert self.crashes is not None
    # No filtering: every cached record goes to the renderer.
    every_crash = self.crashes.values()
    return self._do_ls(every_crash, format)
284 | ||
@CLIReadCommand('crash ls-new')
@with_crashes
def do_ls_new(self, format: Optional[str] = None) -> Tuple[int, str, str]:
    """
    Show new crash dumps
    """
    assert self.crashes is not None
    # "New" means the record carries no 'archived' key.
    unarchived = [crash for crash in self.crashes.values()
                  if 'archived' not in crash]
    return self._do_ls(unarchived, format)
295 | ||
@CLICommand('crash rm')
@with_crashes
def do_rm(self, id: str) -> Tuple[int, str, str]:
    """
    Remove a saved crash <id>
    """
    assert self.crashes is not None
    # Removing an unknown id is not an error; it simply succeeds.
    if id not in self.crashes:
        return 0, '', ''
    del self.crashes[id]
    self.set_store('crash/%s' % id, None)  # removes key
    self._refresh_health_checks()
    return 0, '', ''
310 | ||
@CLICommand('crash prune')
@with_crashes
def do_prune(self, keep: int) -> Tuple[int, str, str]:
    """
    Remove crashes older than <keep> days
    """
    # _prune() takes seconds, so convert the day count first.
    seconds_per_day = datetime.timedelta(days=1).total_seconds()
    self._prune(keep * seconds_per_day)
    return 0, '', ''
11fdf7f2 | 319 | |
def _prune(self, seconds: float) -> None:
    """Drop every crash whose timestamp is at least ``seconds`` old.

    Each pruned crash is removed from the in-memory cache and from the
    store; health checks are refreshed only if something was removed.

    :param seconds: maximum age of a crash that is kept
    """
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(seconds=seconds)
    # Collect the ids up front: self.crashes is modified in the loop below.
    expired = [crashid
               for crashid, _ in self.timestamp_filter(lambda ts: ts <= cutoff)]
    assert self.crashes is not None
    for crashid in expired:
        del self.crashes[crashid]
        self.set_store('crash/%s' % crashid, None)
    if expired:
        self._refresh_health_checks()
334 | ||
@CLIWriteCommand('crash archive')
@with_crashes
def do_archive(self, id: str) -> Tuple[int, str, str]:
    """
    Acknowledge a crash and silence health warning(s)
    """
    # Returns (EINVAL, '', message) for an unknown id and (0, '', '')
    # otherwise.  Archiving an already archived crash changes nothing.
    crashid = id
    assert self.crashes is not None
    crash = self.crashes.get(crashid)
    if not crash:
        # Name this command in the error, not 'crash info'.
        return errno.EINVAL, '', 'crash archive: %s not found' % crashid
    if not crash.get('archived'):
        # `crash` is the cached dict itself, so this updates the cache
        # in place; only the store needs an explicit write.
        crash['archived'] = str(datetime.datetime.utcnow())
        key = 'crash/%s' % crashid
        self.set_store(key, json.dumps(crash))
        self._refresh_health_checks()
    return 0, '', ''
11fdf7f2 | 353 | |
@CLIWriteCommand('crash archive-all')
@with_crashes
def do_archive_all(self) -> Tuple[int, str, str]:
    """
    Acknowledge all new crashes and silence health warning(s)
    """
    assert self.crashes is not None
    for crashid, crash in self.crashes.items():
        if crash.get('archived'):
            continue
        # Mutating the cached dict in place updates the cache; the store
        # copy is rewritten explicitly.
        crash['archived'] = str(datetime.datetime.utcnow())
        self.set_store('crash/%s' % crashid, json.dumps(crash))
    # Refreshed unconditionally, even when nothing was newly archived.
    self._refresh_health_checks()
    return 0, '', ''
369 | ||
@CLIReadCommand('crash stat')
@with_crashes
def do_stat(self) -> Tuple[int, str, str]:
    """
    Summarize recorded crashes
    """
    # age in days for reporting, ordered smallest first
    AGE_IN_DAYS = [1, 3, 7]

    BinnedStatsT = Dict[str, Union[int, datetime.datetime, List[str]]]

    def binstr(bindict: BinnedStatsT) -> str:
        """Format one bin: a header line then its crash ids ('' if empty)."""
        id_list = cast(List[str], bindict['idlist'])
        if not id_list:
            return ''
        header = '%d older than %s days old:' % (len(id_list), bindict['age'])
        return '\n'.join([header] + id_list)

    now = datetime.datetime.utcnow()
    bins: List[BinnedStatsT] = [
        {
            'age': age,
            'agelimit': now - datetime.timedelta(days=age),
            'idlist': list(),
        }
        for age in AGE_IN_DAYS
    ]

    assert self.crashes is not None
    for crashid, crash in self.crashes.items():
        stamp = self.time_from_string(cast(str, crash['timestamp']))
        # Bins are cumulative: a crash is listed in every bin whose age
        # limit it is older than.
        for bindict in bins:
            if stamp <= cast(datetime.datetime, bindict['agelimit']):
                cast(List[str], bindict['idlist']).append(crashid)

    retlines = ['%d crashes recorded' % len(self.crashes)]
    retlines.extend(binstr(bindict) for bindict in bins)
    return 0, '\n'.join(retlines), ''
420 | ||
@CLIReadCommand('crash json_report')
@with_crashes
def do_json_report(self, hours: int) -> Tuple[int, str, str]:
    """
    Crashes in the last <hours> hours
    """
    # Return a machine readable summary of recent crashes: a JSON
    # object mapping process name to the number of crashes whose
    # timestamp falls within the last <hours> hours.
    assert self.crashes is not None
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
    report: DefaultDict[str, int] = defaultdict(int)
    for crash in self.crashes.values():
        # Honour the <hours> window; older crashes are left out.
        if self.time_from_string(cast(str, crash['timestamp'])) < cutoff:
            continue
        # A missing or empty process name is counted as "unknown".
        pname = cast(str, crash.get("process_name", "unknown")) or "unknown"
        report[pname] += 1

    # NOTE(review): the JSON is returned in the third tuple slot, not the
    # second like the other read commands; kept as-is for existing callers.
    return 0, '', json.dumps(report, sort_keys=True)
11fdf7f2 | 437 | |
def self_test(self) -> None:
    """Check that time_from_string() parses both timestamp layouts.

    :raises RuntimeError: if either layout parses to an unexpected value
    """
    # test time conversion
    expected = datetime.datetime(2018, 6, 22, 20, 35, 38, 58818)
    cases = (
        ('2018-06-22T20:35:38.058818Z', 'time_from_string() failed'),
        ('2018-06-22 20:35:38.058818Z', 'time_from_string() (old) failed'),
    )
    for timestr, failure in cases:
        if self.time_from_string(timestr) != expected:
            raise RuntimeError(failure)