# Source: git.proxmox.com Git - ceph.git/blame - ceph/src/pybind/mgr/crash/module.py
# Commit message: import 15.2.0 Octopus source
# Path: [ceph.git] / ceph / src / pybind / mgr / crash / module.py
# Blame view columns: Commit, Line, Data
import datetime
import errno
import hashlib
import json
import re
from collections import defaultdict
from threading import Event

from prettytable import PrettyTable

from mgr_module import MgrModule


# Timestamp layout tried first when parsing a crash 'timestamp'
# ('T' separator); a trailing 'Z' is stripped before parsing.
DATEFMT = '%Y-%m-%dT%H:%M:%S.%f'
# Fallback layout (space separator) tried when DATEFMT does not match.
OLD_DATEFMT = '%Y-%m-%d %H:%M:%S.%f'

# Upper and lower bounds, in seconds, on the sleep between passes of the
# serve() loop.
MAX_WAIT = 600
MIN_WAIT = 60

class Module(MgrModule):
    """Manager module that records, lists, archives and prunes crash reports."""

    # Options of this module; config_notify() copies the current value of
    # each one onto an instance attribute with the same name.
    MODULE_OPTIONS = [
        {
            'name': 'warn_recent_interval',
            'type': 'secs',
            'default': 60*60*24*14,
            'desc': 'time interval in which to warn about recent crashes',
            'runtime': True,
        },
        {
            'name': 'retain_interval',
            'type': 'secs',
            'default': 60*60*24 * 365,
            'desc': 'how long to retain crashes before pruning them',
            'runtime': True,
        },
    ]

36 def __init__(self, *args, **kwargs):
37 super(Module, self).__init__(*args, **kwargs)
eafe8130
TL
38 self.crashes = None
39 self.run = True
40 self.event = Event()
41
42 def shutdown(self):
43 self.run = False
44 self.event.set()
45
46 def serve(self):
47 self.config_notify()
48 while self.run:
49 self._refresh_health_checks()
50 self._prune(self.retain_interval)
51 wait = min(MAX_WAIT, max(self.warn_recent_interval / 100, MIN_WAIT))
52 self.event.wait(wait)
53 self.event.clear()
54
55 def config_notify(self):
56 for opt in self.MODULE_OPTIONS:
57 setattr(self,
58 opt['name'],
9f95a23c 59 self.get_module_option(opt['name']))
eafe8130
TL
60 self.log.debug(' mgr option %s = %s',
61 opt['name'], getattr(self, opt['name']))
62
63 def _load_crashes(self):
64 raw = self.get_store_prefix('crash/')
65 self.crashes = {k[6:]: json.loads(m) for (k, m) in raw.items()}
66
67 def _refresh_health_checks(self):
68 if not self.crashes:
69 self._load_crashes()
70 cutoff = datetime.datetime.utcnow() - datetime.timedelta(
71 seconds=self.warn_recent_interval)
72 recent = {
73 crashid: crash for crashid, crash in self.crashes.items()
74 if self.time_from_string(crash['timestamp']) > cutoff and 'archived' not in crash
75 }
76 num = len(recent)
77 health_checks = {}
78 if recent:
79 detail = [
80 '%s crashed on host %s at %s' % (
81 crash.get('entity_name', 'unidentified daemon'),
82 crash.get('utsname_hostname', '(unknown)'),
83 crash.get('timestamp', 'unknown time'))
84 for (_, crash) in recent.items()]
85 if num > 30:
86 detail = detail[0:30]
87 detail.append('and %d more' % (num - 30))
88 self.log.debug('detail %s' % detail)
89 health_checks['RECENT_CRASH'] = {
90 'severity': 'warning',
91 'summary': '%d daemons have recently crashed' % (num),
9f95a23c 92 'count': num,
eafe8130
TL
93 'detail': detail,
94 }
95 self.set_health_checks(health_checks)
11fdf7f2
TL
96
97 def handle_command(self, inbuf, command):
eafe8130
TL
98 if not self.crashes:
99 self._load_crashes()
11fdf7f2
TL
100 for cmd in self.COMMANDS:
101 if cmd['cmd'].startswith(command['prefix']):
102 handler = cmd['handler']
103 break
104 if handler is None:
105 return errno.EINVAL, '', 'unknown command %s' % command['prefix']
106
107 return handler(self, command, inbuf)
108
eafe8130 109 def time_from_string(self, timestr):
11fdf7f2
TL
110 # drop the 'Z' timezone indication, it's always UTC
111 timestr = timestr.rstrip('Z')
9f95a23c
TL
112 try:
113 return datetime.datetime.strptime(timestr, DATEFMT)
114 except ValueError:
115 return datetime.datetime.strptime(timestr, OLD_DATEFMT)
11fdf7f2 116
eafe8130
TL
117 def validate_crash_metadata(self, inbuf):
118 # raise any exceptions to caller
119 metadata = json.loads(inbuf)
120 for f in ['crash_id', 'timestamp']:
121 if f not in metadata:
122 raise AttributeError("missing '%s' field" % f)
123 time = self.time_from_string(metadata['timestamp'])
124 return metadata
125
11fdf7f2
TL
126 def timestamp_filter(self, f):
127 """
128 Filter crash reports by timestamp.
129
130 :param f: f(time) return true to keep crash report
131 :returns: crash reports for which f(time) returns true
132 """
133 def inner(pair):
eafe8130
TL
134 _, crash = pair
135 time = self.time_from_string(crash["timestamp"])
11fdf7f2 136 return f(time)
eafe8130 137 return filter(inner, self.crashes.items())
11fdf7f2 138
    # stack signature helpers

141 def sanitize_backtrace(self, bt):
142 ret = list()
143 for func_record in bt:
144 # split into two fields on last space, take the first one,
145 # strip off leading ( and trailing )
146 func_plus_offset = func_record.rsplit(' ', 1)[0][1:-1]
147 ret.append(func_plus_offset.split('+')[0])
148
149 return ret
150
151 ASSERT_MATCHEXPR = re.compile(r'(?s)(.*) thread .* time .*(: .*)\n')
152
153 def sanitize_assert_msg(self, msg):
154 # (?s) allows matching newline. get everything up to "thread" and
155 # then after-and-including the last colon-space. This skips the
156 # thread id, timestamp, and file:lineno, because file is already in
157 # the beginning, and lineno may vary.
158 return ''.join(self.ASSERT_MATCHEXPR.match(msg).groups())
159
160 def calc_sig(self, bt, assert_msg):
161 sig = hashlib.sha256()
162 for func in self.sanitize_backtrace(bt):
163 sig.update(func.encode())
164 if assert_msg:
165 sig.update(self.sanitize_assert_msg(assert_msg).encode())
166 return ''.join('%02x' % c for c in sig.digest())
167
    # command handlers

170 def do_info(self, cmd, inbuf):
171 crashid = cmd['id']
eafe8130
TL
172 crash = self.crashes.get(crashid)
173 if not crash:
11fdf7f2 174 return errno.EINVAL, '', 'crash info: %s not found' % crashid
9f95a23c 175 val = json.dumps(crash, indent=4, sort_keys=True)
11fdf7f2
TL
176 return 0, val, ''
177
178 def do_post(self, cmd, inbuf):
179 try:
180 metadata = self.validate_crash_metadata(inbuf)
181 except Exception as e:
182 return errno.EINVAL, '', 'malformed crash metadata: %s' % e
9f95a23c
TL
183 if 'backtrace' in metadata:
184 metadata['stack_sig'] = self.calc_sig(
185 metadata.get('backtrace'), metadata.get('assert_msg'))
11fdf7f2 186 crashid = metadata['crash_id']
eafe8130
TL
187
188 if crashid not in self.crashes:
189 self.crashes[crashid] = metadata
190 key = 'crash/%s' % crashid
191 self.set_store(key, json.dumps(metadata))
192 self._refresh_health_checks()
11fdf7f2
TL
193 return 0, '', ''
194
eafe8130
TL
195 def ls(self):
196 if not self.crashes:
197 self._load_crashes()
198 return self.do_ls({'prefix': 'crash ls'}, '')
199
11fdf7f2 200 def do_ls(self, cmd, inbuf):
eafe8130
TL
201 if cmd['prefix'] == 'crash ls':
202 t = self.crashes.values()
203 else:
204 t = [crash for crashid, crash in self.crashes.items()
205 if 'archived' not in crash]
206 r = sorted(t, key=lambda i: i.get('crash_id'))
207 if cmd.get('format') == 'json' or cmd.get('format') == 'json-pretty':
9f95a23c 208 return 0, json.dumps(r, indent=4, sort_keys=True), ''
eafe8130
TL
209 else:
210 table = PrettyTable(['ID', 'ENTITY', 'NEW'],
211 border=False)
212 table.left_padding_width = 0
9f95a23c 213 table.right_padding_width = 2
eafe8130
TL
214 table.align['ID'] = 'l'
215 table.align['ENTITY'] = 'l'
216 for c in r:
217 table.add_row([c.get('crash_id'),
218 c.get('entity_name','unknown'),
219 '' if 'archived' in c else '*'])
220 return 0, table.get_string(), ''
11fdf7f2
TL
221
222 def do_rm(self, cmd, inbuf):
223 crashid = cmd['id']
eafe8130
TL
224 if crashid in self.crashes:
225 del self.crashes[crashid]
226 key = 'crash/%s' % crashid
227 self.set_store(key, None) # removes key
228 self._refresh_health_checks()
11fdf7f2
TL
229 return 0, '', ''
230
231 def do_prune(self, cmd, inbuf):
11fdf7f2
TL
232 keep = cmd['keep']
233 try:
234 keep = int(keep)
235 except ValueError:
236 return errno.EINVAL, '', 'keep argument must be integer'
237
eafe8130
TL
238 self._prune(keep * 60*60*24)
239 return 0, '', ''
11fdf7f2 240
eafe8130
TL
241 def _prune(self, seconds):
242 now = datetime.datetime.utcnow()
243 cutoff = now - datetime.timedelta(seconds=seconds)
244 removed_any = False
245 # make a copy of the list, since we'll modify self.crashes below
246 to_prune = list(self.timestamp_filter(lambda ts: ts <= cutoff))
247 for crashid, crash in to_prune:
248 del self.crashes[crashid]
249 key = 'crash/%s' % crashid
11fdf7f2 250 self.set_store(key, None)
eafe8130
TL
251 removed_any = True
252 if removed_any:
253 self._refresh_health_checks()
254
255 def do_archive(self, cmd, inbuf):
256 crashid = cmd['id']
257 crash = self.crashes.get(crashid)
258 if not crash:
259 return errno.EINVAL, '', 'crash info: %s not found' % crashid
260 if not crash.get('archived'):
261 crash['archived'] = str(datetime.datetime.utcnow())
262 self.crashes[crashid] = crash
263 key = 'crash/%s' % crashid
264 self.set_store(key, json.dumps(crash))
265 self._refresh_health_checks()
266 return 0, '', ''
11fdf7f2 267
eafe8130
TL
268 def do_archive_all(self, cmd, inbuf):
269 for crashid, crash in self.crashes.items():
270 if not crash.get('archived'):
271 crash['archived'] = str(datetime.datetime.utcnow())
272 self.crashes[crashid] = crash
273 key = 'crash/%s' % crashid
274 self.set_store(key, json.dumps(crash))
275 self._refresh_health_checks()
11fdf7f2
TL
276 return 0, '', ''
277
278 def do_stat(self, cmd, inbuf):
279 # age in days for reporting, ordered smallest first
280 bins = [1, 3, 7]
281 retlines = list()
282
283 def binstr(bindict):
284 binlines = list()
285 count = len(bindict['idlist'])
286 if count:
287 binlines.append(
288 '%d older than %s days old:' % (count, bindict['age'])
289 )
290 for crashid in bindict['idlist']:
291 binlines.append(crashid)
292 return '\n'.join(binlines)
293
294 total = 0
295 now = datetime.datetime.utcnow()
296 for i, age in enumerate(bins):
297 agelimit = now - datetime.timedelta(days=age)
298 bins[i] = {
299 'age': age,
300 'agelimit': agelimit,
301 'idlist': list()
302 }
303
eafe8130 304 for crashid, crash in self.crashes.items():
11fdf7f2 305 total += 1
eafe8130 306 stamp = self.time_from_string(crash['timestamp'])
11fdf7f2
TL
307 for i, bindict in enumerate(bins):
308 if stamp <= bindict['agelimit']:
309 bindict['idlist'].append(crashid)
310 # don't count this one again
311 continue
312
313 retlines.append('%d crashes recorded' % total)
314
315 for bindict in bins:
316 retlines.append(binstr(bindict))
317 return 0, '\n'.join(retlines), ''
318
319 def do_json_report(self, cmd, inbuf):
320 """
321 Return a machine readable summary of recent crashes.
322 """
323 try:
324 hours = int(cmd['hours'])
325 except ValueError:
326 return errno.EINVAL, '', '<hours> argument must be integer'
327
328 report = defaultdict(lambda: 0)
eafe8130
TL
329 for crashid, crash in self.crashes.items():
330 pname = crash.get("process_name", "unknown")
11fdf7f2
TL
331 if not pname:
332 pname = "unknown"
333 report[pname] += 1
334
9f95a23c 335 return 0, '', json.dumps(report, sort_keys=True)
11fdf7f2
TL
336
337 def self_test(self):
338 # test time conversion
9f95a23c
TL
339 timestr = '2018-06-22T20:35:38.058818Z'
340 old_timestr = '2018-06-22 20:35:38.058818Z'
11fdf7f2
TL
341 dt = self.time_from_string(timestr)
342 if dt != datetime.datetime(2018, 6, 22, 20, 35, 38, 58818):
343 raise RuntimeError('time_from_string() failed')
9f95a23c
TL
344 dt = self.time_from_string(old_timestr)
345 if dt != datetime.datetime(2018, 6, 22, 20, 35, 38, 58818):
346 raise RuntimeError('time_from_string() (old) failed')
11fdf7f2
TL
347
348 COMMANDS = [
349 {
350 'cmd': 'crash info name=id,type=CephString',
351 'desc': 'show crash dump metadata',
352 'perm': 'r',
353 'handler': do_info,
354 },
355 {
356 'cmd': 'crash ls',
eafe8130
TL
357 'desc': 'Show new and archived crash dumps',
358 'perm': 'r',
359 'handler': do_ls,
360 },
361 {
362 'cmd': 'crash ls-new',
363 'desc': 'Show new crash dumps',
11fdf7f2
TL
364 'perm': 'r',
365 'handler': do_ls,
366 },
367 {
368 'cmd': 'crash post',
369 'desc': 'Add a crash dump (use -i <jsonfile>)',
370 'perm': 'rw',
371 'handler': do_post,
372 },
373 {
374 'cmd': 'crash prune name=keep,type=CephString',
375 'desc': 'Remove crashes older than <keep> days',
376 'perm': 'rw',
377 'handler': do_prune,
378 },
379 {
380 'cmd': 'crash rm name=id,type=CephString',
381 'desc': 'Remove a saved crash <id>',
382 'perm': 'rw',
383 'handler': do_rm,
384 },
385 {
386 'cmd': 'crash stat',
387 'desc': 'Summarize recorded crashes',
388 'perm': 'r',
389 'handler': do_stat,
390 },
391 {
392 'cmd': 'crash json_report name=hours,type=CephString',
393 'desc': 'Crashes in the last <hours> hours',
394 'perm': 'r',
395 'handler': do_json_report,
396 },
eafe8130
TL
397 {
398 'cmd': 'crash archive name=id,type=CephString',
399 'desc': 'Acknowledge a crash and silence health warning(s)',
400 'perm': 'w',
401 'handler': do_archive,
402 },
403 {
404 'cmd': 'crash archive-all',
405 'desc': 'Acknowledge all new crashes and silence health warning(s)',
406 'perm': 'w',
407 'handler': do_archive_all,
408 },
11fdf7f2 409 ]