]> git.proxmox.com Git - ceph.git/blame - ceph/src/pybind/mgr/crash/module.py
bump version to 18.2.2-pve1
[ceph.git] / ceph / src / pybind / mgr / crash / module.py
CommitLineData
9f95a23c 1import hashlib
20effc67 2from mgr_module import CLICommand, CLIReadCommand, CLIWriteCommand, MgrModule, Option
11fdf7f2
TL
3import datetime
4import errno
20effc67
TL
5import functools
6import inspect
11fdf7f2 7import json
11fdf7f2 8from collections import defaultdict
eafe8130 9from prettytable import PrettyTable
9f95a23c 10import re
adb31ebb 11from threading import Event, Lock
05a536ef 12from typing import cast, Any, Callable, DefaultDict, Dict, Iterable, List, Optional, Tuple, TypeVar, \
20effc67 13 Union, TYPE_CHECKING
11fdf7f2
TL
14
15
9f95a23c
TL
# Timestamp layouts accepted by Module.time_from_string(): the 'T'-separated
# form is tried first, the space-separated ("old") form is the fallback.
DATEFMT = '%Y-%m-%dT%H:%M:%S.%f'
OLD_DATEFMT = '%Y-%m-%d %H:%M:%S.%f'

# Upper and lower bounds (in seconds) applied to the sleep between two passes
# of Module.serve(); the unclamped value is warn_recent_interval / 100.
MAX_WAIT = 600
MIN_WAIT = 60
11fdf7f2 21
20effc67
TL
22
FuncT = TypeVar('FuncT', bound=Callable)


def with_crashes(func: FuncT) -> FuncT:
    """Decorate a Module method so it runs with the crash cache ready.

    The returned wrapper takes ``self.crashes_lock`` for the whole call,
    invokes ``self._load_crashes()`` first when ``self.crashes`` is falsy
    (``None`` or empty), and then forwards every argument to *func*.

    The wrapped function's signature is copied onto the wrapper so that
    code introspecting the handler sees the real parameters rather than
    ``(*args, **kwargs)``.
    """
    original_signature = inspect.signature(func)

    @functools.wraps(func)
    def locked_call(self: 'Module', *args: Any, **kwargs: Any) -> Tuple[int, str, str]:
        with self.crashes_lock:
            cache_missing = not self.crashes
            if cache_missing:
                self._load_crashes()
            return func(self, *args, **kwargs)

    locked_call.__signature__ = original_signature  # type: ignore[attr-defined]
    return cast(FuncT, locked_call)
35
36
# Shape of one decoded crash report: string keys whose values are either a
# string or a list of strings.
CrashT = Dict[str, Union[str, List[str]]]
38
39
class Module(MgrModule):
    """Store crash reports and surface recent ones as health warnings.

    Crash metadata is kept in the mgr key/value store under ``crash/<id>``
    keys and mirrored in ``self.crashes``.  Unarchived crashes newer than
    ``warn_recent_interval`` raise the ``RECENT_CRASH`` and
    ``RECENT_MGR_MODULE_CRASH`` health checks; crashes older than
    ``retain_interval`` are pruned by the ``serve`` loop.

    ``self.crashes`` is shared between the ``serve`` thread and the command
    handlers, so every access must happen with ``self.crashes_lock`` held
    (command handlers get this from the ``with_crashes`` decorator).
    """

    # NOTE(review): the docstrings of the @CLI*Command handlers below are
    # presumably consumed by the command framework as user-visible help text,
    # so they are kept short and unchanged; extra notes live in # comments.

    MODULE_OPTIONS = [
        Option(
            name='warn_recent_interval',
            type='secs',
            default=60 * 60 * 24 * 14,
            desc='time interval in which to warn about recent crashes',
            runtime=True),
        Option(
            name='retain_interval',
            type='secs',
            default=60 * 60 * 24 * 365,
            desc='how long to retain crashes before pruning them',
            runtime=True),
    ]

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Set up the (initially unloaded) crash cache and thread controls."""
        super(Module, self).__init__(*args, **kwargs)
        # None until _load_crashes() has populated it from the store
        self.crashes: Optional[Dict[str, CrashT]] = None
        self.crashes_lock = Lock()
        self.run = True
        self.event = Event()
        if TYPE_CHECKING:
            # Declared for the type checker only; the real values are
            # assigned by config_notify() via setattr().
            self.warn_recent_interval = 0.0
            self.retain_interval = 0.0

    def shutdown(self) -> None:
        """Ask the serve() loop to exit and wake it up if it is sleeping."""
        self.run = False
        self.event.set()

    def serve(self) -> None:
        """Periodically refresh health checks and prune expired crashes."""
        self.config_notify()
        while self.run:
            with self.crashes_lock:
                self._refresh_health_checks()
                self._prune(self.retain_interval)
            # Sleep outside the lock so command handlers are not blocked.
            wait = min(MAX_WAIT, max(self.warn_recent_interval / 100, MIN_WAIT))
            self.event.wait(wait)
            self.event.clear()

    def config_notify(self) -> None:
        """Copy the current value of every module option onto ``self``."""
        for opt in self.MODULE_OPTIONS:
            setattr(self,
                    opt['name'],
                    self.get_module_option(opt['name']))
            self.log.debug(' mgr option %s = %s',
                           opt['name'], getattr(self, opt['name']))

    def _load_crashes(self) -> None:
        """(Re)build ``self.crashes`` from the ``crash/`` keys in the store."""
        raw = self.get_store_prefix('crash/')
        # k[6:] drops the 6-character 'crash/' prefix, leaving the crash id
        self.crashes = {k[6:]: json.loads(m) for (k, m) in raw.items()}

    def _refresh_health_checks(self) -> None:
        """Recompute and publish the recent-crash health checks.

        A crash counts as recent when its timestamp is newer than
        ``warn_recent_interval`` seconds ago and it has no ``archived`` key.
        Crashes carrying an ``mgr_module`` key are reported separately from
        daemon crashes.  Callers are expected to hold ``crashes_lock``.
        """
        if not self.crashes:
            self._load_crashes()
        assert self.crashes is not None
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(
            seconds=self.warn_recent_interval)
        recent = {
            crashid: crash for crashid, crash in self.crashes.items()
            if (self.time_from_string(cast(str, crash['timestamp'])) > cutoff
                and 'archived' not in crash)
        }

        def prune_detail(ls: List[str]) -> int:
            """Cap *ls* at 30 entries in place; return the original length."""
            num = len(ls)
            if num > 30:
                # Mutate the caller's list: rebinding the local name (as in
                # ``ls = ls[0:30]``) would leave the caller's list untouched.
                del ls[30:]
                ls.append('and %d more' % (num - 30))
            return num

        daemon_crashes = []
        module_crashes = []
        for c in recent.values():
            if 'mgr_module' in c:
                module_crashes.append(c)
            else:
                daemon_crashes.append(c)
        daemon_detail = [
            '%s crashed on host %s at %s' % (
                crash.get('entity_name', 'unidentified daemon'),
                crash.get('utsname_hostname', '(unknown)'),
                crash.get('timestamp', 'unknown time'))
            for crash in daemon_crashes]
        module_detail = [
            'mgr module %s crashed in daemon %s on host %s at %s' % (
                crash.get('mgr_module', 'unidentified module'),
                crash.get('entity_name', 'unidentified daemon'),
                crash.get('utsname_hostname', '(unknown)'),
                crash.get('timestamp', 'unknown time'))
            for crash in module_crashes]
        daemon_num = prune_detail(daemon_detail)
        module_num = prune_detail(module_detail)

        health_checks: Dict[str, Dict[str, Union[int, str, List[str]]]] = {}
        if daemon_detail:
            self.log.debug('daemon detail %s', daemon_detail)
            health_checks['RECENT_CRASH'] = {
                'severity': 'warning',
                'summary': '%d daemons have recently crashed' % (daemon_num),
                'count': daemon_num,
                'detail': daemon_detail,
            }
        if module_detail:
            self.log.debug('module detail %s', module_detail)
            health_checks['RECENT_MGR_MODULE_CRASH'] = {
                'severity': 'warning',
                'summary': '%d mgr modules have recently crashed' % (module_num),
                'count': module_num,
                'detail': module_detail,
            }

        self.set_health_checks(health_checks)

    def time_from_string(self, timestr: str) -> datetime.datetime:
        """Parse a crash timestamp into a naive datetime.

        :param timestr: timestamp in DATEFMT or OLD_DATEFMT layout, with an
                        optional trailing 'Z'
        :raises ValueError: if neither layout matches
        """
        # drop the 'Z' timezone indication, it's always UTC
        timestr = timestr.rstrip('Z')
        try:
            return datetime.datetime.strptime(timestr, DATEFMT)
        except ValueError:
            return datetime.datetime.strptime(timestr, OLD_DATEFMT)

    def validate_crash_metadata(self, inbuf: str) -> Dict[str, Union[str, List[str]]]:
        """Decode a posted crash report and check its mandatory fields.

        :param inbuf: JSON text of the crash report
        :returns: the decoded metadata
        :raises AttributeError: if 'crash_id' or 'timestamp' is missing
        :raises ValueError: if the JSON or the timestamp cannot be parsed
        """
        # raise any exceptions to caller
        metadata = json.loads(inbuf)
        for f in ['crash_id', 'timestamp']:
            if f not in metadata:
                raise AttributeError("missing '%s' field" % f)
        # parsed only to validate the format; the value itself is discarded
        _ = self.time_from_string(metadata['timestamp'])
        return metadata

    def timestamp_filter(self, f: Callable[[datetime.datetime], bool]) -> Iterable[Tuple[str, CrashT]]:
        """
        Filter crash reports by timestamp.

        The result is a lazy iterator over ``self.crashes``; materialise it
        (e.g. with ``list``) before modifying ``self.crashes``.

        :param f: f(time) return true to keep crash report
        :returns: (crash id, crash report) pairs for which f(time) returns true
        """
        def inner(pair: Tuple[str, CrashT]) -> bool:
            _, crash = pair
            time = self.time_from_string(cast(str, crash["timestamp"]))
            return f(time)
        assert self.crashes is not None
        return filter(inner, self.crashes.items())

    # stack signature helpers

    def sanitize_backtrace(self, bt: List[str]) -> List[str]:
        """Reduce backtrace frames to bare function names.

        :param bt: frames; each is split on its last space and the first
                   part is expected to look like ``(function+offset)``
        :returns: one function name per frame, offsets removed
        """
        ret = list()
        for func_record in bt:
            # split into two fields on last space, take the first one,
            # strip off leading ( and trailing )
            func_plus_offset = func_record.rsplit(' ', 1)[0][1:-1]
            ret.append(func_plus_offset.split('+')[0])

        return ret

    ASSERT_MATCHEXPR = re.compile(r'(?s)(.*) thread .* time .*(: .*)\n')

    def sanitize_assert_msg(self, msg: str) -> str:
        """Strip the volatile parts out of an assertion message.

        :param msg: raw assertion message
        :returns: the text before " thread " joined with the text from the
                  last ": " onwards
        :raises ValueError: if *msg* does not have the expected layout
        """
        # (?s) allows matching newline.  get everything up to "thread" and
        # then after-and-including the last colon-space.  This skips the
        # thread id, timestamp, and file:lineno, because file is already in
        # the beginning, and lineno may vary.
        matched = self.ASSERT_MATCHEXPR.match(msg)
        if matched is None:
            # explicit check: a bare ``assert`` disappears under ``python -O``
            raise ValueError('unrecognized assert_msg format')
        return ''.join(matched.groups())

    def calc_sig(self, bt: List[str], assert_msg: Optional[str]) -> str:
        """Compute the stack signature of a crash.

        :param bt: backtrace frames (see sanitize_backtrace)
        :param assert_msg: assertion message, or None/empty if there is none
        :returns: hex SHA-256 over the sanitized function names followed by
                  the sanitized assertion message, if any
        """
        sig = hashlib.sha256()
        for func in self.sanitize_backtrace(bt):
            sig.update(func.encode())
        if assert_msg:
            sig.update(self.sanitize_assert_msg(assert_msg).encode())
        return sig.hexdigest()

    def _mark_archived(self, crashid: str, crash: CrashT) -> None:
        """Stamp *crash* as archived now and persist it under its store key."""
        crash['archived'] = str(datetime.datetime.utcnow())
        self.set_store('crash/%s' % crashid, json.dumps(crash))

    # command handlers

    @CLIReadCommand('crash info')
    @with_crashes
    def do_info(self, id: str) -> Tuple[int, str, str]:
        """
        show crash dump metadata
        """
        # Returns the report as indented JSON, or EINVAL if the id is unknown.
        crashid = id
        assert self.crashes is not None
        crash = self.crashes.get(crashid)
        if not crash:
            return errno.EINVAL, '', 'crash info: %s not found' % crashid
        val = json.dumps(crash, indent=4, sort_keys=True)
        return 0, val, ''

    @CLICommand('crash post')
    @with_crashes
    def do_post(self, inbuf: str) -> Tuple[int, str, str]:
        """
        Add a crash dump (use -i <jsonfile>)
        """
        # with_crashes guarantees the cache is loaded and the lock is held
        # while self.crashes is modified.  A crash id that is already known
        # is silently ignored.
        try:
            metadata = self.validate_crash_metadata(inbuf)
            if 'backtrace' in metadata:
                backtrace = cast(List[str], metadata.get('backtrace'))
                assert_msg = cast(Optional[str], metadata.get('assert_msg'))
                # kept inside the try: a malformed backtrace or assert_msg
                # is bad input and should yield EINVAL, not a traceback
                metadata['stack_sig'] = self.calc_sig(backtrace, assert_msg)
        except Exception as e:
            return errno.EINVAL, '', 'malformed crash metadata: %s' % e
        crashid = cast(str, metadata['crash_id'])
        assert self.crashes is not None
        if crashid not in self.crashes:
            self.crashes[crashid] = metadata
            key = 'crash/%s' % crashid
            self.set_store(key, json.dumps(metadata))
            self._refresh_health_checks()
        return 0, '', ''

    def ls(self) -> Tuple[int, str, str]:
        """List all crashes in table form (non-command entry point)."""
        # do_ls_all is wrapped by with_crashes, which loads the cache under
        # the lock; loading here as well would touch it unlocked.
        return self.do_ls_all('')

    def _do_ls(self, t: Iterable[CrashT], format: Optional[str]) -> Tuple[int, str, str]:
        """Render crash reports sorted by crash id.

        :param t: crash reports to list
        :param format: 'json' or 'json-pretty' for JSON output; anything
                       else produces a plain-text table
        """
        r = sorted(t, key=lambda i: i['crash_id'])
        if format in ('json', 'json-pretty'):
            return 0, json.dumps(r, indent=4, sort_keys=True), ''

        table = PrettyTable(['ID', 'ENTITY', 'NEW'],
                            border=False)
        table.left_padding_width = 0
        table.right_padding_width = 2
        table.align['ID'] = 'l'
        table.align['ENTITY'] = 'l'
        for c in r:
            table.add_row([c.get('crash_id'),
                           c.get('entity_name', 'unknown'),
                           # '*' marks crashes that have not been archived
                           '' if 'archived' in c else '*'])
        return 0, table.get_string(), ''

    @CLIReadCommand('crash ls')
    @with_crashes
    def do_ls_all(self, format: Optional[str] = None) -> Tuple[int, str, str]:
        """
        Show new and archived crash dumps
        """
        assert self.crashes is not None
        return self._do_ls(self.crashes.values(), format)

    @CLIReadCommand('crash ls-new')
    @with_crashes
    def do_ls_new(self, format: Optional[str] = None) -> Tuple[int, str, str]:
        """
        Show new crash dumps
        """
        assert self.crashes is not None
        t = [crash for crash in self.crashes.values()
             if 'archived' not in crash]
        return self._do_ls(t, format)

    @CLICommand('crash rm')
    @with_crashes
    def do_rm(self, id: str) -> Tuple[int, str, str]:
        """
        Remove a saved crash <id>
        """
        # Removing an unknown id is not an error.
        crashid = id
        assert self.crashes is not None
        if crashid in self.crashes:
            del self.crashes[crashid]
            key = 'crash/%s' % crashid
            self.set_store(key, None)       # removes key
            self._refresh_health_checks()
        return 0, '', ''

    @CLICommand('crash prune')
    @with_crashes
    def do_prune(self, keep: int) -> Tuple[int, str, str]:
        """
        Remove crashes older than <keep> days
        """
        self._prune(keep * datetime.timedelta(days=1).total_seconds())
        return 0, '', ''

    def _prune(self, seconds: float) -> None:
        """Delete every crash whose timestamp is at least *seconds* old.

        Callers are expected to hold ``crashes_lock``.
        """
        now = datetime.datetime.utcnow()
        cutoff = now - datetime.timedelta(seconds=seconds)
        # make a copy of the list, since we'll modify self.crashes below
        to_prune = list(self.timestamp_filter(lambda ts: ts <= cutoff))
        assert self.crashes is not None
        for crashid, _ in to_prune:
            del self.crashes[crashid]
            key = 'crash/%s' % crashid
            self.set_store(key, None)
        if to_prune:
            self._refresh_health_checks()

    @CLIWriteCommand('crash archive')
    @with_crashes
    def do_archive(self, id: str) -> Tuple[int, str, str]:
        """
        Acknowledge a crash and silence health warning(s)
        """
        # Archiving an already archived crash is a no-op.
        crashid = id
        assert self.crashes is not None
        crash = self.crashes.get(crashid)
        if not crash:
            return errno.EINVAL, '', 'crash archive: %s not found' % crashid
        if not crash.get('archived'):
            self._mark_archived(crashid, crash)
            self._refresh_health_checks()
        return 0, '', ''

    @CLIWriteCommand('crash archive-all')
    @with_crashes
    def do_archive_all(self) -> Tuple[int, str, str]:
        """
        Acknowledge all new crashes and silence health warning(s)
        """
        assert self.crashes is not None
        for crashid, crash in self.crashes.items():
            if not crash.get('archived'):
                self._mark_archived(crashid, crash)
        self._refresh_health_checks()
        return 0, '', ''

    @CLIReadCommand('crash stat')
    @with_crashes
    def do_stat(self) -> Tuple[int, str, str]:
        """
        Summarize recorded crashes
        """
        # age in days for reporting, ordered smallest first
        AGE_IN_DAYS = [1, 3, 7]
        retlines = list()

        BinnedStatsT = Dict[str, Union[int, datetime.datetime, List[str]]]

        def binstr(bindict: BinnedStatsT) -> str:
            """Format one age bin; an empty bin yields an empty string."""
            binlines = list()
            id_list = cast(List[str], bindict['idlist'])
            count = len(id_list)
            if count:
                binlines.append(
                    '%d older than %s days old:' % (count, bindict['age'])
                )
                for crashid in id_list:
                    binlines.append(crashid)
            return '\n'.join(binlines)

        total = 0
        now = datetime.datetime.utcnow()
        bins: List[BinnedStatsT] = []
        for age in AGE_IN_DAYS:
            agelimit = now - datetime.timedelta(days=age)
            bins.append({
                'age': age,
                'agelimit': agelimit,
                'idlist': list()
            })

        assert self.crashes is not None
        for crashid, crash in self.crashes.items():
            total += 1
            stamp = self.time_from_string(cast(str, crash['timestamp']))
            # Bins are cumulative: a crash is listed in every bin whose age
            # limit it exceeds (e.g. a 10-day-old crash appears in all three).
            for bindict in bins:
                if stamp <= cast(datetime.datetime, bindict['agelimit']):
                    cast(List[str], bindict['idlist']).append(crashid)

        retlines.append('%d crashes recorded' % total)

        for bindict in bins:
            retlines.append(binstr(bindict))
        return 0, '\n'.join(retlines), ''

    @CLIReadCommand('crash json_report')
    @with_crashes
    def do_json_report(self, hours: int) -> Tuple[int, str, str]:
        """
        Crashes in the last <hours> hours
        """
        # Return a machine readable summary of recent crashes: a JSON object
        # mapping process name to the number of crashes whose timestamp lies
        # within the last <hours> hours.
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        report: DefaultDict[str, int] = defaultdict(lambda: 0)
        assert self.crashes is not None
        for _, crash in self.timestamp_filter(lambda ts: ts >= cutoff):
            pname = cast(str, crash.get("process_name", "unknown"))
            if not pname:
                pname = "unknown"
            report[pname] += 1

        # NOTE(review): the report is returned in the third tuple slot, where
        # the other handlers put error text; left as-is so existing consumers
        # keep working -- confirm whether this is intentional.
        return 0, '', json.dumps(report, sort_keys=True)

    def self_test(self) -> None:
        """Check that both supported timestamp layouts parse identically.

        :raises RuntimeError: if either layout yields an unexpected value
        """
        # test time conversion
        timestr = '2018-06-22T20:35:38.058818Z'
        old_timestr = '2018-06-22 20:35:38.058818Z'
        dt = self.time_from_string(timestr)
        if dt != datetime.datetime(2018, 6, 22, 20, 35, 38, 58818):
            raise RuntimeError('time_from_string() failed')
        dt = self.time_from_string(old_timestr)
        if dt != datetime.datetime(2018, 6, 22, 20, 35, 38, 58818):
            raise RuntimeError('time_from_string() (old) failed')