]> git.proxmox.com Git - ceph.git/blame - ceph/src/pybind/mgr/crash/module.py
import ceph 14.2.5
[ceph.git] / ceph / src / pybind / mgr / crash / module.py
CommitLineData
11fdf7f2
TL
1from mgr_module import MgrModule
2import datetime
3import errno
4import json
11fdf7f2 5from collections import defaultdict
eafe8130
TL
6from prettytable import PrettyTable
7from threading import Event
11fdf7f2
TL
8
9
# strptime() layout of crash timestamps once the trailing 'Z' has been
# stripped (see Module.time_from_string)
DATEFMT = '%Y-%m-%d %H:%M:%S.%f'

# upper and lower bounds, in seconds, on how long Module.serve() waits
# between two refresh/prune passes
MAX_WAIT = 10 * 60
MIN_WAIT = 60
11fdf7f2
TL
14
class Module(MgrModule):
    """Store daemon crash reports and warn about recent, unarchived ones.

    Crash metadata is persisted in the module key/value store under keys
    of the form ``crash/<crash_id>`` and cached in ``self.crashes``.
    """

    MODULE_OPTIONS = [
        {
            'name': 'warn_recent_interval',
            'type': 'secs',
            'default': 60*60*24*14,
            'desc': 'time interval in which to warn about recent crashes',
            'runtime': True,
        },
        {
            'name': 'retain_interval',
            'type': 'secs',
            'default': 60*60*24 * 365,
            'desc': 'how long to retain crashes before pruning them',
            'runtime': True,
        },
    ]

    def __init__(self, *args, **kwargs):
        super(Module, self).__init__(*args, **kwargs)
        # crash_id -> metadata dict; None until loaded from the store
        self.crashes = None
        # cleared by shutdown() to end the serve() loop
        self.run = True
        # set by shutdown() to wake serve() out of its wait
        self.event = Event()

    def shutdown(self):
        """Ask the serve() loop to stop and wake it up if it is waiting."""
        self.run = False
        self.event.set()

    def serve(self):
        """
        Refresh health checks and prune old crashes until shutdown().

        The pause between passes is 1/100 of ``warn_recent_interval``,
        clamped to the range [MIN_WAIT, MAX_WAIT] seconds.
        """
        self.config_notify()
        while self.run:
            self._refresh_health_checks()
            self._prune(self.retain_interval)
            wait = min(MAX_WAIT, max(self.warn_recent_interval / 100, MIN_WAIT))
            self.event.wait(wait)
            self.event.clear()

    def config_notify(self):
        """
        Copy every entry of MODULE_OPTIONS onto an attribute of the same
        name.

        The declared default is used only when no value is available
        (``None``), so an explicitly configured ``0`` is honoured rather
        than being replaced by the default.
        """
        for opt in self.MODULE_OPTIONS:
            name = opt['name']
            value = self.get_module_option(name)
            if value is None:
                value = opt['default']
            setattr(self, name, value)
            self.log.debug(' mgr option %s = %s', name, value)

    def _load_crashes(self):
        """Populate ``self.crashes`` from the ``crash/`` keys of the store."""
        prefix = 'crash/'
        raw = self.get_store_prefix(prefix)
        # strip the key prefix so the cache is indexed by crash id
        self.crashes = {
            key[len(prefix):]: json.loads(val) for (key, val) in raw.items()
        }

    def _refresh_health_checks(self):
        """
        Recompute and publish the RECENT_CRASH health check.

        A crash counts as recent when its timestamp is newer than
        ``warn_recent_interval`` seconds ago and it carries no
        ``archived`` key.  An empty set of checks is published when there
        is none.
        """
        if not self.crashes:
            self._load_crashes()
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(
            seconds=self.warn_recent_interval)
        recent = [
            crash for crash in self.crashes.values()
            if (self.time_from_string(crash['timestamp']) > cutoff
                and 'archived' not in crash)
        ]
        num = len(recent)
        health_checks = {}
        if recent:
            detail = [
                '%s crashed on host %s at %s' % (
                    crash.get('entity_name', 'unidentified daemon'),
                    crash.get('utsname_hostname', '(unknown)'),
                    crash.get('timestamp', 'unknown time'))
                for crash in recent]
            # cap the detail list so the health report stays readable
            if num > 30:
                detail = detail[0:30]
                detail.append('and %d more' % (num - 30))
            self.log.debug('detail %s', detail)
            health_checks['RECENT_CRASH'] = {
                'severity': 'warning',
                'summary': '%d daemons have recently crashed' % (num),
                'detail': detail,
            }
        self.set_health_checks(health_checks)

    def handle_command(self, inbuf, command):
        """
        Dispatch ``command`` to the first COMMANDS entry whose ``cmd``
        string starts with ``command['prefix']``.

        :param inbuf: input buffer passed through to the handler
        :param command: dict holding at least ``prefix``
        :returns: the handler's (retval, stdout, stderr) tuple, or
                  (EINVAL, '', message) when no entry matches
        """
        if not self.crashes:
            self._load_crashes()
        # must be pre-set: without it an unmatched prefix would raise
        # UnboundLocalError below instead of returning EINVAL
        handler = None
        for cmd in self.COMMANDS:
            if cmd['cmd'].startswith(command['prefix']):
                handler = cmd['handler']
                break
        if handler is None:
            return errno.EINVAL, '', 'unknown command %s' % command['prefix']

        # handlers are stored as plain functions, so pass self explicitly
        return handler(self, command, inbuf)

    def time_from_string(self, timestr):
        """
        Parse a crash timestamp string into a naive datetime.

        :param timestr: timestamp in DATEFMT, optionally suffixed with 'Z'
        :raises ValueError: if the string does not match DATEFMT
        """
        # drop the 'Z' timezone indication, it's always UTC
        timestr = timestr.rstrip('Z')
        return datetime.datetime.strptime(timestr, DATEFMT)

    def validate_crash_metadata(self, inbuf):
        """
        Decode ``inbuf`` as JSON and check the mandatory crash fields.

        :param inbuf: JSON-encoded crash metadata
        :returns: the decoded metadata
        :raises AttributeError: if ``crash_id`` or ``timestamp`` is missing
        :raises ValueError: on malformed JSON or an unparseable timestamp
        """
        # raise any exceptions to caller
        metadata = json.loads(inbuf)
        for f in ['crash_id', 'timestamp']:
            if f not in metadata:
                raise AttributeError("missing '%s' field" % f)
        # parsed only to validate the format; the result is not needed
        self.time_from_string(metadata['timestamp'])
        return metadata

    def timestamp_filter(self, f):
        """
        Filter crash reports by timestamp.

        :param f: f(time) return true to keep crash report
        :returns: (crash_id, crash) pairs for which f(time) returns true
        """
        def inner(pair):
            _, crash = pair
            time = self.time_from_string(crash["timestamp"])
            return f(time)
        return filter(inner, self.crashes.items())

    # command handlers

    def do_info(self, cmd, inbuf):
        """Return the metadata of crash ``cmd['id']`` as indented JSON."""
        crashid = cmd['id']
        crash = self.crashes.get(crashid)
        if not crash:
            return errno.EINVAL, '', 'crash info: %s not found' % crashid
        val = json.dumps(crash, indent=4)
        return 0, val, ''

    def do_post(self, cmd, inbuf):
        """
        Record the crash described by the JSON in ``inbuf``.

        A crash id that is already known is left untouched.
        """
        try:
            metadata = self.validate_crash_metadata(inbuf)
        except Exception as e:
            return errno.EINVAL, '', 'malformed crash metadata: %s' % e
        crashid = metadata['crash_id']

        if crashid not in self.crashes:
            self.crashes[crashid] = metadata
            key = 'crash/%s' % crashid
            self.set_store(key, json.dumps(metadata))
            self._refresh_health_checks()
        return 0, '', ''

    def ls(self):
        """Return the result of ``crash ls``, loading crashes if needed."""
        if not self.crashes:
            self._load_crashes()
        return self.do_ls({'prefix': 'crash ls'}, '')

    def do_ls(self, cmd, inbuf):
        """
        List crashes sorted by crash id.

        The prefix ``crash ls`` lists every crash; any other prefix lists
        only crashes without an ``archived`` key.  The output is JSON when
        ``cmd['format']`` is ``json`` or ``json-pretty``, a table
        otherwise.
        """
        if cmd['prefix'] == 'crash ls':
            t = self.crashes.values()
        else:
            t = [crash for crash in self.crashes.values()
                 if 'archived' not in crash]
        r = sorted(t, key=lambda i: i.get('crash_id'))
        if cmd.get('format') in ('json', 'json-pretty'):
            return 0, json.dumps(r, indent=4), ''

        table = PrettyTable(['ID', 'ENTITY', 'NEW'],
                            border=False)
        table.left_padding_width = 0
        table.right_padding_width = 1
        table.align['ID'] = 'l'
        table.align['ENTITY'] = 'l'
        for c in r:
            table.add_row([c.get('crash_id'),
                           c.get('entity_name', 'unknown'),
                           '' if 'archived' in c else '*'])
        return 0, table.get_string(), ''

    def do_rm(self, cmd, inbuf):
        """Remove crash ``cmd['id']``; an unknown id is not an error."""
        crashid = cmd['id']
        if crashid in self.crashes:
            del self.crashes[crashid]
            key = 'crash/%s' % crashid
            self.set_store(key, None)       # removes key
            self._refresh_health_checks()
        return 0, '', ''

    def do_prune(self, cmd, inbuf):
        """Remove crashes older than ``cmd['keep']`` days."""
        keep = cmd['keep']
        try:
            keep = int(keep)
        except (ValueError, TypeError):
            return errno.EINVAL, '', 'keep argument must be integer'

        self._prune(keep * 60*60*24)
        return 0, '', ''

    def _prune(self, seconds):
        """
        Remove every crash whose timestamp is at least ``seconds`` old.

        Health checks are refreshed only if something was removed.
        """
        now = datetime.datetime.utcnow()
        cutoff = now - datetime.timedelta(seconds=seconds)
        # make a copy of the list, since we'll modify self.crashes below
        to_prune = list(self.timestamp_filter(lambda ts: ts <= cutoff))
        for crashid, _ in to_prune:
            del self.crashes[crashid]
            key = 'crash/%s' % crashid
            self.set_store(key, None)
        if to_prune:
            self._refresh_health_checks()

    def do_archive(self, cmd, inbuf):
        """
        Mark crash ``cmd['id']`` as archived with the current UTC time.

        An already archived crash is left unchanged.
        """
        crashid = cmd['id']
        crash = self.crashes.get(crashid)
        if not crash:
            return errno.EINVAL, '', 'crash info: %s not found' % crashid
        if not crash.get('archived'):
            # crash is the cached dict itself, so this updates the cache
            crash['archived'] = str(datetime.datetime.utcnow())
            key = 'crash/%s' % crashid
            self.set_store(key, json.dumps(crash))
            self._refresh_health_checks()
        return 0, '', ''

    def do_archive_all(self, cmd, inbuf):
        """Mark every not-yet-archived crash as archived."""
        for crashid, crash in self.crashes.items():
            if not crash.get('archived'):
                # in-place update of the cached dict; no keys are added
                # or removed, so iterating over the cache stays safe
                crash['archived'] = str(datetime.datetime.utcnow())
                key = 'crash/%s' % crashid
                self.set_store(key, json.dumps(crash))
        self._refresh_health_checks()
        return 0, '', ''

    def do_stat(self, cmd, inbuf):
        """
        Summarize recorded crashes by age.

        Reports the total number of crashes followed by, for each age
        threshold, the ids of the crashes at least that old.
        """
        # age in days for reporting, ordered smallest first
        ages = [1, 3, 7]
        now = datetime.datetime.utcnow()
        bins = [
            {
                'age': age,
                'agelimit': now - datetime.timedelta(days=age),
                'idlist': list(),
            }
            for age in ages
        ]

        def binstr(bindict):
            # an empty bin renders as an empty string
            binlines = list()
            count = len(bindict['idlist'])
            if count:
                binlines.append(
                    '%d older than %s days old:' % (count, bindict['age'])
                )
                for crashid in bindict['idlist']:
                    binlines.append(crashid)
            return '\n'.join(binlines)

        total = 0
        for crashid, crash in self.crashes.items():
            total += 1
            stamp = self.time_from_string(crash['timestamp'])
            # bins are cumulative: a crash is listed under every
            # threshold it is older than
            for bindict in bins:
                if stamp <= bindict['agelimit']:
                    bindict['idlist'].append(crashid)

        retlines = ['%d crashes recorded' % total]
        retlines.extend(binstr(bindict) for bindict in bins)
        return 0, '\n'.join(retlines), ''

    def do_json_report(self, cmd, inbuf):
        """
        Return a machine readable summary of recent crashes.

        Counts, per ``process_name``, the crashes whose timestamp lies
        within the last ``cmd['hours']`` hours.
        """
        try:
            hours = int(cmd['hours'])
        except (ValueError, TypeError):
            return errno.EINVAL, '', '<hours> argument must be integer'

        now = datetime.datetime.utcnow()
        max_age = hours * 60.0 * 60.0

        report = defaultdict(int)
        # compare ages in seconds so that a huge <hours> value cannot
        # overflow timedelta arithmetic
        for _, crash in self.timestamp_filter(
                lambda ts: (now - ts).total_seconds() < max_age):
            pname = crash.get("process_name", "unknown")
            if not pname:
                pname = "unknown"
            report[pname] += 1

        # NOTE(review): the JSON is returned in the third tuple slot,
        # where the other handlers put their error text; kept as-is for
        # compatibility - confirm with callers before moving it.
        return 0, '', json.dumps(report)

    def self_test(self):
        """Check that time_from_string() parses a known timestamp."""
        # test time conversion
        timestr = '2018-06-22 20:35:38.058818Z'
        dt = self.time_from_string(timestr)
        if dt != datetime.datetime(2018, 6, 22, 20, 35, 38, 58818):
            raise RuntimeError('time_from_string() failed')

    # Must stay below the handlers: the entries reference the functions
    # defined above.  handle_command() picks the first entry whose 'cmd'
    # starts with the requested prefix, so order matters for commands
    # sharing a prefix (e.g. 'crash ls' before 'crash ls-new').
    COMMANDS = [
        {
            'cmd': 'crash info name=id,type=CephString',
            'desc': 'show crash dump metadata',
            'perm': 'r',
            'handler': do_info,
        },
        {
            'cmd': 'crash ls',
            'desc': 'Show new and archived crash dumps',
            'perm': 'r',
            'handler': do_ls,
        },
        {
            'cmd': 'crash ls-new',
            'desc': 'Show new crash dumps',
            'perm': 'r',
            'handler': do_ls,
        },
        {
            'cmd': 'crash post',
            'desc': 'Add a crash dump (use -i <jsonfile>)',
            'perm': 'rw',
            'handler': do_post,
        },
        {
            'cmd': 'crash prune name=keep,type=CephString',
            'desc': 'Remove crashes older than <keep> days',
            'perm': 'rw',
            'handler': do_prune,
        },
        {
            'cmd': 'crash rm name=id,type=CephString',
            'desc': 'Remove a saved crash <id>',
            'perm': 'rw',
            'handler': do_rm,
        },
        {
            'cmd': 'crash stat',
            'desc': 'Summarize recorded crashes',
            'perm': 'r',
            'handler': do_stat,
        },
        {
            'cmd': 'crash json_report name=hours,type=CephString',
            'desc': 'Crashes in the last <hours> hours',
            'perm': 'r',
            'handler': do_json_report,
        },
        {
            'cmd': 'crash archive name=id,type=CephString',
            'desc': 'Acknowledge a crash and silence health warning(s)',
            'perm': 'w',
            'handler': do_archive,
        },
        {
            'cmd': 'crash archive-all',
            'desc': 'Acknowledge all new crashes and silence health warning(s)',
            'perm': 'w',
            'handler': do_archive_all,
        },
    ]