# ceph/src/pybind/mgr/crash/module.py
# (Ceph 15.2.0 "Octopus" source, imported via the git.proxmox.com mirror)
1 import hashlib
2 from mgr_module import MgrModule
3 import datetime
4 import errno
5 import json
6 from collections import defaultdict
7 from prettytable import PrettyTable
8 import re
9 from threading import Event
10
11
# Timestamp formats found in crash metadata: DATEFMT uses a 'T' separator;
# OLD_DATEFMT matches records written by older Ceph releases (space separator).
DATEFMT = '%Y-%m-%dT%H:%M:%S.%f'
OLD_DATEFMT = '%Y-%m-%d %H:%M:%S.%f'

# Clamp bounds (seconds) for the sleep between serve() iterations.
MAX_WAIT = 600
MIN_WAIT = 60
17
class Module(MgrModule):
    """Track daemon crash dumps: store, list, archive, prune, and raise
    a RECENT_CRASH health warning for new, unacknowledged crashes."""

    MODULE_OPTIONS = [
        {
            'name': 'warn_recent_interval',
            'type': 'secs',
            'default': 60*60*24*14,  # two weeks
            'desc': 'time interval in which to warn about recent crashes',
            'runtime': True,
        },
        {
            'name': 'retain_interval',
            'type': 'secs',
            'default': 60*60*24 * 365,  # one year
            'desc': 'how long to retain crashes before pruning them',
            'runtime': True,
        },
    ]
35
36 def __init__(self, *args, **kwargs):
37 super(Module, self).__init__(*args, **kwargs)
38 self.crashes = None
39 self.run = True
40 self.event = Event()
41
42 def shutdown(self):
43 self.run = False
44 self.event.set()
45
46 def serve(self):
47 self.config_notify()
48 while self.run:
49 self._refresh_health_checks()
50 self._prune(self.retain_interval)
51 wait = min(MAX_WAIT, max(self.warn_recent_interval / 100, MIN_WAIT))
52 self.event.wait(wait)
53 self.event.clear()
54
55 def config_notify(self):
56 for opt in self.MODULE_OPTIONS:
57 setattr(self,
58 opt['name'],
59 self.get_module_option(opt['name']))
60 self.log.debug(' mgr option %s = %s',
61 opt['name'], getattr(self, opt['name']))
62
63 def _load_crashes(self):
64 raw = self.get_store_prefix('crash/')
65 self.crashes = {k[6:]: json.loads(m) for (k, m) in raw.items()}
66
67 def _refresh_health_checks(self):
68 if not self.crashes:
69 self._load_crashes()
70 cutoff = datetime.datetime.utcnow() - datetime.timedelta(
71 seconds=self.warn_recent_interval)
72 recent = {
73 crashid: crash for crashid, crash in self.crashes.items()
74 if self.time_from_string(crash['timestamp']) > cutoff and 'archived' not in crash
75 }
76 num = len(recent)
77 health_checks = {}
78 if recent:
79 detail = [
80 '%s crashed on host %s at %s' % (
81 crash.get('entity_name', 'unidentified daemon'),
82 crash.get('utsname_hostname', '(unknown)'),
83 crash.get('timestamp', 'unknown time'))
84 for (_, crash) in recent.items()]
85 if num > 30:
86 detail = detail[0:30]
87 detail.append('and %d more' % (num - 30))
88 self.log.debug('detail %s' % detail)
89 health_checks['RECENT_CRASH'] = {
90 'severity': 'warning',
91 'summary': '%d daemons have recently crashed' % (num),
92 'count': num,
93 'detail': detail,
94 }
95 self.set_health_checks(health_checks)
96
97 def handle_command(self, inbuf, command):
98 if not self.crashes:
99 self._load_crashes()
100 for cmd in self.COMMANDS:
101 if cmd['cmd'].startswith(command['prefix']):
102 handler = cmd['handler']
103 break
104 if handler is None:
105 return errno.EINVAL, '', 'unknown command %s' % command['prefix']
106
107 return handler(self, command, inbuf)
108
109 def time_from_string(self, timestr):
110 # drop the 'Z' timezone indication, it's always UTC
111 timestr = timestr.rstrip('Z')
112 try:
113 return datetime.datetime.strptime(timestr, DATEFMT)
114 except ValueError:
115 return datetime.datetime.strptime(timestr, OLD_DATEFMT)
116
117 def validate_crash_metadata(self, inbuf):
118 # raise any exceptions to caller
119 metadata = json.loads(inbuf)
120 for f in ['crash_id', 'timestamp']:
121 if f not in metadata:
122 raise AttributeError("missing '%s' field" % f)
123 time = self.time_from_string(metadata['timestamp'])
124 return metadata
125
126 def timestamp_filter(self, f):
127 """
128 Filter crash reports by timestamp.
129
130 :param f: f(time) return true to keep crash report
131 :returns: crash reports for which f(time) returns true
132 """
133 def inner(pair):
134 _, crash = pair
135 time = self.time_from_string(crash["timestamp"])
136 return f(time)
137 return filter(inner, self.crashes.items())
138
139 # stack signature helpers
140
141 def sanitize_backtrace(self, bt):
142 ret = list()
143 for func_record in bt:
144 # split into two fields on last space, take the first one,
145 # strip off leading ( and trailing )
146 func_plus_offset = func_record.rsplit(' ', 1)[0][1:-1]
147 ret.append(func_plus_offset.split('+')[0])
148
149 return ret
150
    # Captures (1) everything before " thread " and (2) the final ": ..."
    # chunk, dropping the thread id / timestamp / file:lineno in between.
    ASSERT_MATCHEXPR = re.compile(r'(?s)(.*) thread .* time .*(: .*)\n')

    def sanitize_assert_msg(self, msg):
        """Strip run-varying parts (thread id, time, lineno) from an assert message."""
        # (?s) allows matching newline. get everything up to "thread" and
        # then after-and-including the last colon-space. This skips the
        # thread id, timestamp, and file:lineno, because file is already in
        # the beginning, and lineno may vary.
        # NOTE(review): .match() returns None for a non-matching msg, which
        # would raise AttributeError here — presumably callers only pass real
        # assert messages; confirm before hardening.
        return ''.join(self.ASSERT_MATCHEXPR.match(msg).groups())
159
160 def calc_sig(self, bt, assert_msg):
161 sig = hashlib.sha256()
162 for func in self.sanitize_backtrace(bt):
163 sig.update(func.encode())
164 if assert_msg:
165 sig.update(self.sanitize_assert_msg(assert_msg).encode())
166 return ''.join('%02x' % c for c in sig.digest())
167
168 # command handlers
169
170 def do_info(self, cmd, inbuf):
171 crashid = cmd['id']
172 crash = self.crashes.get(crashid)
173 if not crash:
174 return errno.EINVAL, '', 'crash info: %s not found' % crashid
175 val = json.dumps(crash, indent=4, sort_keys=True)
176 return 0, val, ''
177
178 def do_post(self, cmd, inbuf):
179 try:
180 metadata = self.validate_crash_metadata(inbuf)
181 except Exception as e:
182 return errno.EINVAL, '', 'malformed crash metadata: %s' % e
183 if 'backtrace' in metadata:
184 metadata['stack_sig'] = self.calc_sig(
185 metadata.get('backtrace'), metadata.get('assert_msg'))
186 crashid = metadata['crash_id']
187
188 if crashid not in self.crashes:
189 self.crashes[crashid] = metadata
190 key = 'crash/%s' % crashid
191 self.set_store(key, json.dumps(metadata))
192 self._refresh_health_checks()
193 return 0, '', ''
194
195 def ls(self):
196 if not self.crashes:
197 self._load_crashes()
198 return self.do_ls({'prefix': 'crash ls'}, '')
199
200 def do_ls(self, cmd, inbuf):
201 if cmd['prefix'] == 'crash ls':
202 t = self.crashes.values()
203 else:
204 t = [crash for crashid, crash in self.crashes.items()
205 if 'archived' not in crash]
206 r = sorted(t, key=lambda i: i.get('crash_id'))
207 if cmd.get('format') == 'json' or cmd.get('format') == 'json-pretty':
208 return 0, json.dumps(r, indent=4, sort_keys=True), ''
209 else:
210 table = PrettyTable(['ID', 'ENTITY', 'NEW'],
211 border=False)
212 table.left_padding_width = 0
213 table.right_padding_width = 2
214 table.align['ID'] = 'l'
215 table.align['ENTITY'] = 'l'
216 for c in r:
217 table.add_row([c.get('crash_id'),
218 c.get('entity_name','unknown'),
219 '' if 'archived' in c else '*'])
220 return 0, table.get_string(), ''
221
222 def do_rm(self, cmd, inbuf):
223 crashid = cmd['id']
224 if crashid in self.crashes:
225 del self.crashes[crashid]
226 key = 'crash/%s' % crashid
227 self.set_store(key, None) # removes key
228 self._refresh_health_checks()
229 return 0, '', ''
230
231 def do_prune(self, cmd, inbuf):
232 keep = cmd['keep']
233 try:
234 keep = int(keep)
235 except ValueError:
236 return errno.EINVAL, '', 'keep argument must be integer'
237
238 self._prune(keep * 60*60*24)
239 return 0, '', ''
240
241 def _prune(self, seconds):
242 now = datetime.datetime.utcnow()
243 cutoff = now - datetime.timedelta(seconds=seconds)
244 removed_any = False
245 # make a copy of the list, since we'll modify self.crashes below
246 to_prune = list(self.timestamp_filter(lambda ts: ts <= cutoff))
247 for crashid, crash in to_prune:
248 del self.crashes[crashid]
249 key = 'crash/%s' % crashid
250 self.set_store(key, None)
251 removed_any = True
252 if removed_any:
253 self._refresh_health_checks()
254
255 def do_archive(self, cmd, inbuf):
256 crashid = cmd['id']
257 crash = self.crashes.get(crashid)
258 if not crash:
259 return errno.EINVAL, '', 'crash info: %s not found' % crashid
260 if not crash.get('archived'):
261 crash['archived'] = str(datetime.datetime.utcnow())
262 self.crashes[crashid] = crash
263 key = 'crash/%s' % crashid
264 self.set_store(key, json.dumps(crash))
265 self._refresh_health_checks()
266 return 0, '', ''
267
268 def do_archive_all(self, cmd, inbuf):
269 for crashid, crash in self.crashes.items():
270 if not crash.get('archived'):
271 crash['archived'] = str(datetime.datetime.utcnow())
272 self.crashes[crashid] = crash
273 key = 'crash/%s' % crashid
274 self.set_store(key, json.dumps(crash))
275 self._refresh_health_checks()
276 return 0, '', ''
277
278 def do_stat(self, cmd, inbuf):
279 # age in days for reporting, ordered smallest first
280 bins = [1, 3, 7]
281 retlines = list()
282
283 def binstr(bindict):
284 binlines = list()
285 count = len(bindict['idlist'])
286 if count:
287 binlines.append(
288 '%d older than %s days old:' % (count, bindict['age'])
289 )
290 for crashid in bindict['idlist']:
291 binlines.append(crashid)
292 return '\n'.join(binlines)
293
294 total = 0
295 now = datetime.datetime.utcnow()
296 for i, age in enumerate(bins):
297 agelimit = now - datetime.timedelta(days=age)
298 bins[i] = {
299 'age': age,
300 'agelimit': agelimit,
301 'idlist': list()
302 }
303
304 for crashid, crash in self.crashes.items():
305 total += 1
306 stamp = self.time_from_string(crash['timestamp'])
307 for i, bindict in enumerate(bins):
308 if stamp <= bindict['agelimit']:
309 bindict['idlist'].append(crashid)
310 # don't count this one again
311 continue
312
313 retlines.append('%d crashes recorded' % total)
314
315 for bindict in bins:
316 retlines.append(binstr(bindict))
317 return 0, '\n'.join(retlines), ''
318
319 def do_json_report(self, cmd, inbuf):
320 """
321 Return a machine readable summary of recent crashes.
322 """
323 try:
324 hours = int(cmd['hours'])
325 except ValueError:
326 return errno.EINVAL, '', '<hours> argument must be integer'
327
328 report = defaultdict(lambda: 0)
329 for crashid, crash in self.crashes.items():
330 pname = crash.get("process_name", "unknown")
331 if not pname:
332 pname = "unknown"
333 report[pname] += 1
334
335 return 0, '', json.dumps(report, sort_keys=True)
336
337 def self_test(self):
338 # test time conversion
339 timestr = '2018-06-22T20:35:38.058818Z'
340 old_timestr = '2018-06-22 20:35:38.058818Z'
341 dt = self.time_from_string(timestr)
342 if dt != datetime.datetime(2018, 6, 22, 20, 35, 38, 58818):
343 raise RuntimeError('time_from_string() failed')
344 dt = self.time_from_string(old_timestr)
345 if dt != datetime.datetime(2018, 6, 22, 20, 35, 38, 58818):
346 raise RuntimeError('time_from_string() (old) failed')
347
    # Command table consumed by handle_command(): each entry maps a command
    # signature (prefix plus typed arguments) to one of the handler methods
    # defined above. 'perm' is the required capability (r / w / rw).
    COMMANDS = [
        {
            'cmd': 'crash info name=id,type=CephString',
            'desc': 'show crash dump metadata',
            'perm': 'r',
            'handler': do_info,
        },
        {
            'cmd': 'crash ls',
            'desc': 'Show new and archived crash dumps',
            'perm': 'r',
            'handler': do_ls,
        },
        {
            'cmd': 'crash ls-new',
            'desc': 'Show new crash dumps',
            'perm': 'r',
            'handler': do_ls,
        },
        {
            'cmd': 'crash post',
            'desc': 'Add a crash dump (use -i <jsonfile>)',
            'perm': 'rw',
            'handler': do_post,
        },
        {
            'cmd': 'crash prune name=keep,type=CephString',
            'desc': 'Remove crashes older than <keep> days',
            'perm': 'rw',
            'handler': do_prune,
        },
        {
            'cmd': 'crash rm name=id,type=CephString',
            'desc': 'Remove a saved crash <id>',
            'perm': 'rw',
            'handler': do_rm,
        },
        {
            'cmd': 'crash stat',
            'desc': 'Summarize recorded crashes',
            'perm': 'r',
            'handler': do_stat,
        },
        {
            'cmd': 'crash json_report name=hours,type=CephString',
            'desc': 'Crashes in the last <hours> hours',
            'perm': 'r',
            'handler': do_json_report,
        },
        {
            'cmd': 'crash archive name=id,type=CephString',
            'desc': 'Acknowledge a crash and silence health warning(s)',
            'perm': 'w',
            'handler': do_archive,
        },
        {
            'cmd': 'crash archive-all',
            'desc': 'Acknowledge all new crashes and silence health warning(s)',
            'perm': 'w',
            'handler': do_archive_all,
        },
    ]