]>
Commit | Line | Data |
---|---|---|
9f95a23c | 1 | import hashlib |
11fdf7f2 TL |
2 | from mgr_module import MgrModule |
3 | import datetime | |
4 | import errno | |
5 | import json | |
11fdf7f2 | 6 | from collections import defaultdict |
eafe8130 | 7 | from prettytable import PrettyTable |
9f95a23c | 8 | import re |
eafe8130 | 9 | from threading import Event |
11fdf7f2 TL |
10 | |
11 | ||
9f95a23c TL |
# strptime layout tried first when parsing a crash timestamp ('T' separator).
DATEFMT = '%Y-%m-%dT%H:%M:%S.%f'
# Fallback strptime layout (space separator) tried when DATEFMT fails.
OLD_DATEFMT = '%Y-%m-%d %H:%M:%S.%f'

# Bounds, in seconds, on how long serve() sleeps between passes.
MAX_WAIT = 10 * 60
MIN_WAIT = 60
11fdf7f2 TL |
17 | |
18 | class Module(MgrModule): | |
eafe8130 TL |
# Options declared by this module.  config_notify() copies the current
# value of each one onto the instance under the option's own name.
MODULE_OPTIONS = [
    {
        'name': 'warn_recent_interval',
        'type': 'secs',
        'default': 14 * 24 * 60 * 60,  # 14 days, in seconds
        'desc': 'time interval in which to warn about recent crashes',
        'runtime': True,
    },
    {
        'name': 'retain_interval',
        'type': 'secs',
        'default': 365 * 24 * 60 * 60,  # 365 days, in seconds
        'desc': 'how long to retain crashes before pruning them',
        'runtime': True,
    },
]
11fdf7f2 TL |
35 | |
def __init__(self, *args, **kwargs):
    """Initialise the base class, then this module's in-memory state."""
    super(Module, self).__init__(*args, **kwargs)
    # serve() sleeps on this event; shutdown() sets it to wake the loop.
    self.event = Event()
    # serve() keeps looping for as long as this stays true.
    self.run = True
    # Crash cache keyed by crash id; None until _load_crashes() fills it.
    self.crashes = None
41 | ||
def shutdown(self):
    """Ask the serve() loop to stop and wake it if it is sleeping."""
    # Clear the flag before signalling, so that a woken loop sees it.
    self.run = False
    self.event.set()
45 | ||
def serve(self):
    """
    Background loop: refresh health checks and prune old crashes.

    Reads the module options once, then repeats until self.run is false,
    sleeping on self.event between passes.
    """
    self.config_notify()
    while self.run:
        self._refresh_health_checks()
        self._prune(self.retain_interval)
        # Sleep for 1% of the warning window, clamped to the range
        # [MIN_WAIT, MAX_WAIT] seconds.
        lower_bounded = max(self.warn_recent_interval / 100, MIN_WAIT)
        pause = min(MAX_WAIT, lower_bounded)
        self.event.wait(pause)
        self.event.clear()
54 | ||
def config_notify(self):
    """Copy each option in MODULE_OPTIONS onto self and log its value."""
    for opt in self.MODULE_OPTIONS:
        name = opt['name']
        setattr(self, name, self.get_module_option(name))
        self.log.debug(' mgr option %s = %s', name, getattr(self, name))
62 | ||
def _load_crashes(self):
    """Rebuild self.crashes from every store entry under 'crash/'."""
    stored = self.get_store_prefix('crash/')
    loaded = {}
    for key, blob in stored.items():
        # key[6:] drops the first six characters, i.e. the 'crash/' prefix
        loaded[key[6:]] = json.loads(blob)
    self.crashes = loaded
66 | ||
def _refresh_health_checks(self):
    """
    Recompute the RECENT_CRASH health check and publish it.

    A crash counts as recent when its timestamp is newer than
    warn_recent_interval seconds ago and it has no 'archived' key.
    An empty set of checks is published when nothing qualifies.
    """
    if not self.crashes:
        self._load_crashes()
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(
        seconds=self.warn_recent_interval)

    recent = {}
    for crashid, crash in self.crashes.items():
        if not self.time_from_string(crash['timestamp']) > cutoff:
            continue
        if 'archived' in crash:
            continue
        recent[crashid] = crash

    num = len(recent)
    health_checks = {}
    if recent:
        detail = []
        for crash in recent.values():
            who = crash.get('entity_name', 'unidentified daemon')
            where = crash.get('utsname_hostname', '(unknown)')
            when = crash.get('timestamp', 'unknown time')
            detail.append('%s crashed on host %s at %s' % (who, where, when))
        if num > 30:
            # list only the first 30 and summarise the remainder
            detail = detail[0:30]
            detail.append('and %d more' % (num - 30))
        self.log.debug('detail %s' % detail)
        health_checks['RECENT_CRASH'] = {
            'severity': 'warning',
            'summary': '%d daemons have recently crashed' % (num),
            'count': num,
            'detail': detail,
        }
    self.set_health_checks(health_checks)
11fdf7f2 TL |
96 | |
def handle_command(self, inbuf, command):
    """
    Dispatch a command to the matching handler in self.COMMANDS.

    :param inbuf: input buffer passed through to the handler
    :param command: dict describing the command; command['prefix'] selects
                    the handler
    :returns: the handler's result, or (errno.EINVAL, '', message) when no
              entry in COMMANDS matches the prefix
    """
    if not self.crashes:
        self._load_crashes()
    prefix = command['prefix']
    # Must start out as None: without this, a prefix that matches nothing
    # reaches the check below with the name unbound and raises
    # UnboundLocalError instead of reporting an unknown command.
    handler = None
    for cmd in self.COMMANDS:
        # the first entry whose command string begins with the prefix wins
        if cmd['cmd'].startswith(prefix):
            handler = cmd['handler']
            break
    if handler is None:
        return errno.EINVAL, '', 'unknown command %s' % prefix

    return handler(self, command, inbuf)
108 | ||
def time_from_string(self, timestr):
    """
    Parse a crash timestamp string into a naive datetime.

    DATEFMT is tried first and OLD_DATEFMT second; a ValueError from the
    second attempt propagates to the caller.
    """
    # drop the 'Z' timezone indication, it's always UTC
    trimmed = timestr.rstrip('Z')
    parse = datetime.datetime.strptime
    try:
        parsed = parse(trimmed, DATEFMT)
    except ValueError:
        parsed = parse(trimmed, OLD_DATEFMT)
    return parsed
11fdf7f2 | 116 | |
eafe8130 TL |
def validate_crash_metadata(self, inbuf):
    """
    Decode a JSON crash report and check its required fields.

    :param inbuf: JSON text of the crash metadata
    :returns: the decoded metadata
    :raises AttributeError: if 'crash_id' or 'timestamp' is missing

    Errors from json.loads() and time_from_string() are not caught here;
    they propagate to the caller.
    """
    metadata = json.loads(inbuf)
    required = ('crash_id', 'timestamp')
    absent = [field for field in required if field not in metadata]
    if absent:
        # report the first missing field, in the order listed above
        raise AttributeError("missing '%s' field" % absent[0])
    # Called only for its side effect of raising on an unparsable timestamp.
    self.time_from_string(metadata['timestamp'])
    return metadata
125 | ||
11fdf7f2 TL |
def timestamp_filter(self, f):
    """
    Select crash reports by timestamp.

    :param f: predicate called with each crash's parsed timestamp; return
              true to keep that crash
    :returns: lazy iterator over the (crashid, crash) pairs of
              self.crashes for which f returned true
    """
    def keep(item):
        # item is a (crashid, crash) pair; only the crash body is needed
        return f(self.time_from_string(item[1]["timestamp"]))
    return filter(keep, self.crashes.items())
11fdf7f2 | 138 | |
9f95a23c TL |
139 | # stack signature helpers |
140 | ||
def sanitize_backtrace(self, bt):
    """
    Reduce each backtrace record to its leading function name.

    For every record: cut at the last space and keep the left part, drop
    that part's first and last character (the surrounding parentheses),
    then keep what precedes the first '+'.

    :param bt: iterable of backtrace record strings
    :returns: list with one sanitized name per record
    """
    return [
        record.rsplit(' ', 1)[0][1:-1].split('+')[0]
        for record in bt
    ]
150 | ||
# (?s) lets '.' match newlines.  Group 1 is everything before " thread ";
# group 2 starts at the last ": " after " time " and stops before the
# final newline.
ASSERT_MATCHEXPR = re.compile(r'(?s)(.*) thread .* time .*(: .*)\n')

def sanitize_assert_msg(self, msg):
    """
    Strip the variable parts out of an assert message.

    Keeps everything up to "thread" and then after-and-including the last
    colon-space.  This skips the thread id, timestamp, and file:lineno,
    because file is already in the beginning, and lineno may vary.

    :param msg: raw assert message
    :returns: the two kept pieces joined together, or msg unchanged when
              it does not have the expected layout
    """
    matched = self.ASSERT_MATCHEXPR.match(msg)
    if matched is None:
        # match() returns None for an unexpected layout; calling .groups()
        # on it raised AttributeError.  Fall back to the raw text so a
        # result is still produced.
        return msg
    return ''.join(matched.groups())
159 | ||
def calc_sig(self, bt, assert_msg):
    """
    Compute a SHA-256 signature for a crash.

    The digest covers each name returned by sanitize_backtrace(bt), in
    order, followed by the sanitized assert message when one is given.

    :param bt: backtrace records, passed to sanitize_backtrace()
    :param assert_msg: assert message, or a false value to leave it out
    :returns: lowercase hexadecimal digest string
    """
    digest = hashlib.sha256()
    for name in self.sanitize_backtrace(bt):
        digest.update(name.encode())
    if assert_msg:
        cleaned = self.sanitize_assert_msg(assert_msg)
        digest.update(cleaned.encode())
    return digest.hexdigest()
167 | ||
11fdf7f2 TL |
168 | # command handlers |
169 | ||
def do_info(self, cmd, inbuf):
    """
    Handler for 'crash info': return one crash's metadata as JSON.

    :param cmd: command dict; cmd['id'] names the crash
    :param inbuf: unused
    :returns: (0, json_text, '') or (errno.EINVAL, '', message) when the
              crash is unknown
    """
    crashid = cmd['id']
    crash = self.crashes.get(crashid)
    if crash:
        return 0, json.dumps(crash, indent=4, sort_keys=True), ''
    return errno.EINVAL, '', 'crash info: %s not found' % crashid
177 | ||
def do_post(self, cmd, inbuf):
    """
    Handler for 'crash post': record a new crash report.

    :param cmd: command dict (not consulted)
    :param inbuf: JSON text of the crash metadata
    :returns: (0, '', '') on success, including when the crash id is
              already known (the report is then ignored), or
              (errno.EINVAL, '', message) when the metadata is malformed
    """
    try:
        metadata = self.validate_crash_metadata(inbuf)
    except Exception as e:
        return errno.EINVAL, '', 'malformed crash metadata: %s' % e
    if 'backtrace' in metadata:
        # The signature helpers index and split the client-supplied
        # backtrace / assert_msg and can raise on unexpected shapes.
        # Report that the same way as any other malformed field instead
        # of letting the exception escape the handler.
        try:
            metadata['stack_sig'] = self.calc_sig(
                metadata.get('backtrace'), metadata.get('assert_msg'))
        except Exception as e:
            return errno.EINVAL, '', 'malformed crash metadata: %s' % e
    crashid = metadata['crash_id']

    if crashid not in self.crashes:
        self.crashes[crashid] = metadata
        key = 'crash/%s' % crashid
        self.set_store(key, json.dumps(metadata))
        self._refresh_health_checks()
    return 0, '', ''
194 | ||
eafe8130 TL |
def ls(self):
    """Return do_ls() output for a plain 'crash ls', loading crashes first if needed."""
    if not self.crashes:
        self._load_crashes()
    listing_cmd = {'prefix': 'crash ls'}
    return self.do_ls(listing_cmd, '')
199 | ||
def do_ls(self, cmd, inbuf):
    """
    Handler for 'crash ls' and 'crash ls-new': list recorded crashes.

    With prefix 'crash ls' every crash is listed; with any other prefix
    only crashes without an 'archived' key are.  Entries are sorted by
    'crash_id'.  Output is JSON when cmd['format'] is 'json' or
    'json-pretty', otherwise a borderless table in which '*' marks
    crashes that are not archived.

    :param cmd: command dict; reads 'prefix' and optional 'format'
    :param inbuf: unused
    :returns: (0, listing, '')
    """
    if cmd['prefix'] == 'crash ls':
        selected = self.crashes.values()
    else:
        selected = [crash for crash in self.crashes.values()
                    if 'archived' not in crash]
    ordered = sorted(selected, key=lambda crash: crash.get('crash_id'))

    if cmd.get('format') in ('json', 'json-pretty'):
        return 0, json.dumps(ordered, indent=4, sort_keys=True), ''

    table = PrettyTable(['ID', 'ENTITY', 'NEW'],
                        border=False)
    table.left_padding_width = 0
    table.right_padding_width = 2
    table.align['ID'] = 'l'
    table.align['ENTITY'] = 'l'
    for crash in ordered:
        new_marker = '' if 'archived' in crash else '*'
        table.add_row([crash.get('crash_id'),
                       crash.get('entity_name','unknown'),
                       new_marker])
    return 0, table.get_string(), ''
11fdf7f2 TL |
221 | |
def do_rm(self, cmd, inbuf):
    """
    Handler for 'crash rm': forget one crash.

    :param cmd: command dict; cmd['id'] names the crash
    :param inbuf: unused
    :returns: (0, '', '') whether or not the crash existed
    """
    crashid = cmd['id']
    if crashid not in self.crashes:
        return 0, '', ''
    del self.crashes[crashid]
    self.set_store('crash/%s' % crashid, None)  # removes key
    self._refresh_health_checks()
    return 0, '', ''
230 | ||
def do_prune(self, cmd, inbuf):
    """
    Handler for 'crash prune': drop crashes older than <keep> days.

    :param cmd: command dict; cmd['keep'] is the number of days
    :param inbuf: unused
    :returns: (0, '', '') or (errno.EINVAL, '', message) when keep is not
              an integer
    """
    requested = cmd['keep']
    try:
        days = int(requested)
    except ValueError:
        return errno.EINVAL, '', 'keep argument must be integer'

    # _prune() takes seconds
    self._prune(days * 60*60*24)
    return 0, '', ''
11fdf7f2 | 240 | |
eafe8130 TL |
def _prune(self, seconds):
    """
    Remove every crash whose timestamp is at least `seconds` old.

    Each removed crash is deleted from self.crashes and from the store;
    health checks are refreshed only when something was removed.

    :param seconds: age threshold in seconds
    """
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(seconds=seconds)
    # Snapshot the ids first, since self.crashes is modified below.
    expired = [crashid for crashid, _ in
               self.timestamp_filter(lambda ts: ts <= cutoff)]
    for crashid in expired:
        del self.crashes[crashid]
        self.set_store('crash/%s' % crashid, None)
    if expired:
        self._refresh_health_checks()
254 | ||
def do_archive(self, cmd, inbuf):
    """
    Handler for 'crash archive': mark one crash as archived.

    A crash that already has a truthy 'archived' value is left untouched.

    :param cmd: command dict; cmd['id'] names the crash
    :param inbuf: unused
    :returns: (0, '', '') or (errno.EINVAL, '', message) when the crash is
              unknown
    """
    crashid = cmd['id']
    crash = self.crashes.get(crashid)
    if not crash:
        return errno.EINVAL, '', 'crash info: %s not found' % crashid
    if crash.get('archived'):
        return 0, '', ''
    # stamp with the current UTC time, then persist and re-evaluate health
    crash['archived'] = str(datetime.datetime.utcnow())
    self.crashes[crashid] = crash
    self.set_store('crash/%s' % crashid, json.dumps(crash))
    self._refresh_health_checks()
    return 0, '', ''
11fdf7f2 | 267 | |
eafe8130 TL |
def do_archive_all(self, cmd, inbuf):
    """
    Handler for 'crash archive-all': archive every unarchived crash.

    Health checks are refreshed once, after all crashes are processed.

    :param cmd: unused
    :param inbuf: unused
    :returns: (0, '', '')
    """
    pending = [(crashid, crash) for crashid, crash in self.crashes.items()
               if not crash.get('archived')]
    for crashid, crash in pending:
        # each crash gets its own timestamp, taken when it is processed
        crash['archived'] = str(datetime.datetime.utcnow())
        self.crashes[crashid] = crash
        self.set_store('crash/%s' % crashid, json.dumps(crash))
    self._refresh_health_checks()
    return 0, '', ''
277 | ||
def do_stat(self, cmd, inbuf):
    """
    Handler for 'crash stat': summarise recorded crashes by age.

    The first output line is the total count.  One section follows per
    age threshold (1, 3 and 7 days), listing the ids of the crashes at
    least that old.  The sections are cumulative: a crash old enough for
    a larger threshold is also listed under every smaller one.  A
    threshold with no crashes contributes an empty line.

    :param cmd: unused
    :param inbuf: unused
    :returns: (0, report_text, '')
    """
    # age in days for reporting, ordered smallest first
    ages = [1, 3, 7]
    now = datetime.datetime.utcnow()
    bins = [
        {
            'age': age,
            'agelimit': now - datetime.timedelta(days=age),
            'idlist': [],
        }
        for age in ages
    ]

    total = 0
    for crashid, crash in self.crashes.items():
        total += 1
        stamp = self.time_from_string(crash['timestamp'])
        # every threshold is checked, so one crash may land in several bins
        for bindict in bins:
            if stamp <= bindict['agelimit']:
                bindict['idlist'].append(crashid)

    retlines = ['%d crashes recorded' % total]
    for bindict in bins:
        ids = bindict['idlist']
        if ids:
            heading = '%d older than %s days old:' % (len(ids), bindict['age'])
            retlines.append('\n'.join([heading] + ids))
        else:
            retlines.append('')
    return 0, '\n'.join(retlines), ''
318 | ||
def do_json_report(self, cmd, inbuf):
    """
    Return a machine readable summary of recent crashes.

    Counts, per 'process_name', the crashes whose timestamp falls within
    the last <hours> hours.  A missing or empty process name is counted
    under "unknown".

    :param cmd: command dict; cmd['hours'] is the size of the window
    :param inbuf: unused
    :returns: (0, '', json_text) mapping process name to crash count, or
              (errno.EINVAL, '', message) when hours is not usable
    """
    try:
        hours = int(cmd['hours'])
        # An absurdly large window overflows timedelta/datetime; treat it
        # like any other unusable argument.
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
    except (ValueError, OverflowError):
        return errno.EINVAL, '', '<hours> argument must be integer'

    report = defaultdict(int)
    for crash in self.crashes.values():
        # Previously <hours> was parsed and then ignored, so every stored
        # crash was counted; honour the requested window.
        if self.time_from_string(crash['timestamp']) < cutoff:
            continue
        pname = crash.get("process_name") or "unknown"
        report[pname] += 1

    # NOTE(review): the JSON goes in the third tuple slot, where the other
    # handlers in this file put their error text.  Left unchanged so
    # existing consumers keep working -- confirm whether this is intended.
    return 0, '', json.dumps(report, sort_keys=True)
11fdf7f2 TL |
336 | |
def self_test(self):
    """
    Check that time_from_string() parses both accepted timestamp layouts.

    :raises RuntimeError: if either sample parses to the wrong datetime
    """
    # test time conversion
    expected = datetime.datetime(2018, 6, 22, 20, 35, 38, 58818)
    samples = (
        ('2018-06-22T20:35:38.058818Z', 'time_from_string() failed'),
        ('2018-06-22 20:35:38.058818Z', 'time_from_string() (old) failed'),
    )
    for timestr, failure in samples:
        if self.time_from_string(timestr) != expected:
            raise RuntimeError(failure)
11fdf7f2 TL |
347 | |
# Command table consulted by handle_command().  Order matters: the first
# entry whose 'cmd' string starts with the requested prefix is used, so
# 'crash ls' must stay ahead of 'crash ls-new' and 'crash archive' ahead
# of 'crash archive-all'.
COMMANDS = [
    # --- inspection ---
    {
        'cmd': 'crash info name=id,type=CephString',
        'desc': 'show crash dump metadata',
        'perm': 'r',
        'handler': do_info,
    },
    {
        'cmd': 'crash ls',
        'desc': 'Show new and archived crash dumps',
        'perm': 'r',
        'handler': do_ls,
    },
    {
        'cmd': 'crash ls-new',
        'desc': 'Show new crash dumps',
        'perm': 'r',
        'handler': do_ls,
    },
    # --- adding and removing reports ---
    {
        'cmd': 'crash post',
        'desc': 'Add a crash dump (use -i <jsonfile>)',
        'perm': 'rw',
        'handler': do_post,
    },
    {
        'cmd': 'crash prune name=keep,type=CephString',
        'desc': 'Remove crashes older than <keep> days',
        'perm': 'rw',
        'handler': do_prune,
    },
    {
        'cmd': 'crash rm name=id,type=CephString',
        'desc': 'Remove a saved crash <id>',
        'perm': 'rw',
        'handler': do_rm,
    },
    # --- summaries ---
    {
        'cmd': 'crash stat',
        'desc': 'Summarize recorded crashes',
        'perm': 'r',
        'handler': do_stat,
    },
    {
        'cmd': 'crash json_report name=hours,type=CephString',
        'desc': 'Crashes in the last <hours> hours',
        'perm': 'r',
        'handler': do_json_report,
    },
    # --- acknowledging crashes ---
    {
        'cmd': 'crash archive name=id,type=CephString',
        'desc': 'Acknowledge a crash and silence health warning(s)',
        'perm': 'w',
        'handler': do_archive,
    },
    {
        'cmd': 'crash archive-all',
        'desc': 'Acknowledge all new crashes and silence health warning(s)',
        'perm': 'w',
        'handler': do_archive_all,
    },
]