]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | from mgr_module import MgrModule |
2 | import datetime | |
3 | import errno | |
4 | import json | |
11fdf7f2 | 5 | from collections import defaultdict |
eafe8130 TL |
6 | from prettytable import PrettyTable |
7 | from threading import Event | |
11fdf7f2 TL |
8 | |
9 | ||
# strptime() layout of crash timestamps once the trailing 'Z' is removed.
DATEFMT = '%Y-%m-%d %H:%M:%S.%f'

# Lower and upper bound, in seconds, for the sleep between two passes of
# the serve() loop.
MIN_WAIT = 60
MAX_WAIT = 600
11fdf7f2 TL |
14 | |
15 | class Module(MgrModule): | |
eafe8130 TL |
# Runtime-tunable options; config_notify() copies each one onto the
# instance as an attribute named after the option.
MODULE_OPTIONS = [
    {
        # Un-archived crashes newer than this raise RECENT_CRASH.
        'name': 'warn_recent_interval',
        'type': 'secs',
        'default': 14 * 24 * 60 * 60,
        'desc': 'time interval in which to warn about recent crashes',
        'runtime': True,
    },
    {
        # Crashes older than this are removed by the serve() loop.
        'name': 'retain_interval',
        'type': 'secs',
        'default': 365 * 24 * 60 * 60,
        'desc': 'how long to retain crashes before pruning them',
        'runtime': True,
    },
]
11fdf7f2 TL |
32 | |
def __init__(self, *args, **kwargs):
    """Set up the module state; all arguments go to the base class."""
    super(Module, self).__init__(*args, **kwargs)
    # Used by shutdown() to interrupt the sleep in serve().
    self.event = Event()
    # serve() keeps looping while this is true.
    self.run = True
    # Crash cache keyed by crash id; None until _load_crashes() fills it.
    self.crashes = None
38 | ||
def shutdown(self):
    """Ask the serve() loop to stop and wake it if it is waiting."""
    # Clear the flag first so the loop sees it as soon as it wakes up.
    self.run = False
    self.event.set()
42 | ||
def serve(self):
    """
    Background loop: refresh health checks and prune old crashes.

    Loads the module options once, then repeats until ``self.run`` is
    false, sleeping on ``self.event`` between passes.
    """
    self.config_notify()
    while self.run:
        self._refresh_health_checks()
        self._prune(self.retain_interval)
        # Sleep for 1% of the warning interval, but never less than
        # MIN_WAIT nor more than MAX_WAIT seconds.
        pause = max(self.warn_recent_interval / 100, MIN_WAIT)
        pause = min(MAX_WAIT, pause)
        self.event.wait(pause)
        self.event.clear()
51 | ||
def config_notify(self):
    """
    Copy every option in MODULE_OPTIONS onto the instance.

    Each option becomes an attribute named after the option. The
    declared default is used only when get_module_option() returns
    None, so an explicitly configured falsy value such as 0 is kept
    rather than silently replaced by the default.
    """
    for opt in self.MODULE_OPTIONS:
        name = opt['name']
        value = self.get_module_option(name)
        # NOTE(review): presumably None means "not configured" - confirm
        # against MgrModule.get_module_option().
        if value is None:
            value = opt['default']
        setattr(self, name, value)
        self.log.debug(' mgr option %s = %s', name, value)
59 | ||
def _load_crashes(self):
    """Rebuild ``self.crashes`` from the ``crash/`` keys in the store."""
    skip = len('crash/')
    stored = self.get_store_prefix('crash/')
    loaded = {}
    for key, blob in stored.items():
        # Drop the 'crash/' prefix so the cache is keyed by crash id.
        loaded[key[skip:]] = json.loads(blob)
    self.crashes = loaded
63 | ||
def _refresh_health_checks(self):
    """
    Recompute and publish the RECENT_CRASH health check.

    A crash counts as recent when it has no 'archived' key and its
    timestamp is newer than ``warn_recent_interval`` seconds ago. When
    there are none, an empty check set is published.
    """
    max_detail = 30
    # Load only when the cache was never populated; an empty dict is a
    # valid "no crashes" state and must not trigger a store read.
    if self.crashes is None:
        self._load_crashes()
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(
        seconds=self.warn_recent_interval)
    # Test 'archived' first so archived entries skip timestamp parsing.
    recent = [
        crash for crash in self.crashes.values()
        if 'archived' not in crash
        and self.time_from_string(crash['timestamp']) > cutoff
    ]
    health_checks = {}
    if recent:
        num = len(recent)
        detail = [
            '%s crashed on host %s at %s' % (
                crash.get('entity_name', 'unidentified daemon'),
                crash.get('utsname_hostname', '(unknown)'),
                crash.get('timestamp', 'unknown time'))
            for crash in recent[:max_detail]]
        if num > max_detail:
            detail.append('and %d more' % (num - max_detail))
        self.log.debug('detail %s', detail)
        health_checks['RECENT_CRASH'] = {
            'severity': 'warning',
            'summary': '%d daemons have recently crashed' % (num),
            'detail': detail,
        }
    self.set_health_checks(health_checks)
11fdf7f2 TL |
92 | |
def handle_command(self, inbuf, command):
    """
    Dispatch a command to the handler registered in COMMANDS.

    :param inbuf: input buffer, passed through to the handler
    :param command: dict holding at least 'prefix'
    :returns: the handler's result, or (EINVAL, '', message) when no
              entry in COMMANDS matches the prefix
    """
    if self.crashes is None:
        self._load_crashes()
    prefix = command['prefix']
    # Must start as None: otherwise an unmatched prefix leaves the name
    # unbound and the check below raises instead of returning EINVAL.
    handler = None
    for cmd in self.COMMANDS:
        signature = cmd['cmd']
        # Match whole words only, so a prefix never selects a command
        # that merely begins with the same characters.
        if signature == prefix or signature.startswith(prefix + ' '):
            handler = cmd['handler']
            break
    if handler is None:
        return errno.EINVAL, '', 'unknown command %s' % prefix

    return handler(self, command, inbuf)
104 | ||
def time_from_string(self, timestr):
    """
    Parse a crash timestamp string into a naive datetime.

    :param timestr: timestamp laid out as DATEFMT, optionally ending in 'Z'
    :raises ValueError: if the string does not match DATEFMT
    """
    # The trailing 'Z' only marks the value as UTC (it always is), so it
    # is removed before parsing.
    return datetime.datetime.strptime(timestr.rstrip('Z'), DATEFMT)
109 | ||
eafe8130 TL |
def validate_crash_metadata(self, inbuf):
    """
    Parse and sanity-check posted crash metadata.

    All exceptions propagate to the caller.

    :param inbuf: JSON text describing one crash
    :returns: the decoded metadata dict
    :raises ValueError: invalid JSON or unparsable timestamp
    :raises AttributeError: not a JSON object, or a required field
                            ('crash_id', 'timestamp') is missing
    """
    metadata = json.loads(inbuf)
    # A list or string would pass the membership test below and then
    # fail with an unrelated TypeError; reject it up front.
    if not isinstance(metadata, dict):
        raise AttributeError('crash metadata must be a JSON object')
    for field in ('crash_id', 'timestamp'):
        if field not in metadata:
            raise AttributeError("missing '%s' field" % field)
    # Parsed only to validate the format; the result is not needed.
    self.time_from_string(metadata['timestamp'])
    return metadata
118 | ||
11fdf7f2 TL |
def timestamp_filter(self, f):
    """
    Select crash reports by their timestamp.

    :param f: predicate taking the parsed timestamp; true keeps the report
    :returns: the (crashid, crash) pairs of self.crashes for which f is
              true, as produced by the built-in filter()
    """
    return filter(
        lambda item: f(self.time_from_string(item[1]["timestamp"])),
        self.crashes.items())
11fdf7f2 TL |
131 | |
132 | # command handlers | |
133 | ||
def do_info(self, cmd, inbuf):
    """
    Return the stored metadata of one crash as indented JSON.

    :param cmd: command dict; 'id' names the crash
    :returns: (0, json, '') or (EINVAL, '', message) when not found
    """
    crashid = cmd['id']
    crash = self.crashes.get(crashid)
    if crash:
        return 0, json.dumps(crash, indent=4), ''
    return errno.EINVAL, '', 'crash info: %s not found' % crashid
141 | ||
def do_post(self, cmd, inbuf):
    """
    Record a new crash from the JSON metadata in ``inbuf``.

    A crash whose id is already known is left untouched.

    :returns: (0, '', '') on success, (EINVAL, '', message) when the
              metadata fails validation
    """
    try:
        metadata = self.validate_crash_metadata(inbuf)
    except Exception as e:
        return errno.EINVAL, '', 'malformed crash metadata: %s' % e

    crashid = metadata['crash_id']
    if crashid in self.crashes:
        # Already recorded: nothing to store.
        return 0, '', ''

    self.crashes[crashid] = metadata
    self.set_store('crash/%s' % crashid, json.dumps(metadata))
    self._refresh_health_checks()
    return 0, '', ''
155 | ||
eafe8130 TL |
def ls(self):
    """List all crashes by calling do_ls() with a 'crash ls' command."""
    if not self.crashes:
        self._load_crashes()
    request = {'prefix': 'crash ls'}
    return self.do_ls(request, '')
160 | ||
def do_ls(self, cmd, inbuf):
    """
    List crashes sorted by crash id.

    The 'crash ls' prefix lists every crash; any other prefix lists only
    crashes without an 'archived' key. A 'format' of 'json' or
    'json-pretty' yields indented JSON, otherwise a borderless table
    with ID, ENTITY and NEW columns ('*' marks un-archived crashes).
    """
    if cmd['prefix'] == 'crash ls':
        selected = self.crashes.values()
    else:
        selected = [entry for entry in self.crashes.values()
                    if 'archived' not in entry]
    ordered = sorted(selected, key=lambda entry: entry.get('crash_id'))

    if cmd.get('format') in ('json', 'json-pretty'):
        return 0, json.dumps(ordered, indent=4), ''

    table = PrettyTable(['ID', 'ENTITY', 'NEW'],
                        border=False)
    table.left_padding_width = 0
    table.right_padding_width = 1
    table.align['ID'] = 'l'
    table.align['ENTITY'] = 'l'
    for entry in ordered:
        marker = '' if 'archived' in entry else '*'
        table.add_row([entry.get('crash_id'),
                       entry.get('entity_name', 'unknown'),
                       marker])
    return 0, table.get_string(), ''
11fdf7f2 TL |
182 | |
def do_rm(self, cmd, inbuf):
    """
    Remove one crash; an unknown id is not an error.

    :param cmd: command dict; 'id' names the crash
    :returns: always (0, '', '')
    """
    crashid = cmd['id']
    if crashid not in self.crashes:
        return 0, '', ''

    del self.crashes[crashid]
    # Storing None removes the key.
    self.set_store('crash/%s' % crashid, None)
    self._refresh_health_checks()
    return 0, '', ''
191 | ||
def do_prune(self, cmd, inbuf):
    """
    Remove crashes older than ``cmd['keep']`` days.

    :returns: (0, '', '') on success, (EINVAL, '', message) when 'keep'
              is not an integer
    """
    raw_days = cmd['keep']
    try:
        days = int(raw_days)
    except ValueError:
        return errno.EINVAL, '', 'keep argument must be integer'

    # _prune() works in seconds.
    self._prune(days * 60*60*24)
    return 0, '', ''
11fdf7f2 | 201 | |
eafe8130 TL |
def _prune(self, seconds):
    """
    Delete every crash whose timestamp is at least ``seconds`` old.

    Removes the crash from ``self.crashes`` and from the store, and
    refreshes the health checks if anything was deleted.
    """
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(seconds=seconds)
    # Materialise the ids first: self.crashes is modified in the loop.
    expired = [crashid for crashid, _ in
               self.timestamp_filter(lambda ts: ts <= cutoff)]
    for crashid in expired:
        del self.crashes[crashid]
        self.set_store('crash/%s' % crashid, None)
    if expired:
        self._refresh_health_checks()
215 | ||
def do_archive(self, cmd, inbuf):
    """
    Mark one crash as archived.

    A crash not yet archived gets an 'archived' field holding the
    current UTC time as a string and is written back to the store.

    :returns: (0, '', '') or (EINVAL, '', message) when not found
    """
    crashid = cmd['id']
    crash = self.crashes.get(crashid)
    if not crash:
        return errno.EINVAL, '', 'crash info: %s not found' % crashid

    already_archived = crash.get('archived')
    if not already_archived:
        crash['archived'] = str(datetime.datetime.utcnow())
        self.crashes[crashid] = crash
        self.set_store('crash/%s' % crashid, json.dumps(crash))
        self._refresh_health_checks()
    return 0, '', ''
11fdf7f2 | 228 | |
eafe8130 TL |
def do_archive_all(self, cmd, inbuf):
    """
    Mark every crash that is not yet archived as archived.

    Each one gets an 'archived' field holding the current UTC time as a
    string and is written back to the store.

    :returns: always (0, '', '')
    """
    for crashid, crash in self.crashes.items():
        if crash.get('archived'):
            continue
        crash['archived'] = str(datetime.datetime.utcnow())
        # Re-assigning an existing key does not resize the dict, so this
        # is safe while iterating over it.
        self.crashes[crashid] = crash
        self.set_store('crash/%s' % crashid, json.dumps(crash))
    self._refresh_health_checks()
    return 0, '', ''
238 | ||
def do_stat(self, cmd, inbuf):
    """
    Summarise recorded crashes by age.

    The first output line is the total count. One line group follows
    per age threshold (1, 3 and 7 days), naming the crashes at least
    that old. Thresholds are cumulative: a crash older than 7 days is
    listed under all three. A threshold with no crashes contributes an
    empty line.

    :returns: (0, text, '')
    """
    # Age thresholds in days, smallest first.
    ages = (1, 3, 7)
    now = datetime.datetime.utcnow()
    bins = [
        {
            'age': age,
            'agelimit': now - datetime.timedelta(days=age),
            'idlist': [],
        }
        for age in ages
    ]

    def binstr(bindict):
        """Render one bin: a header line, then one crash id per line."""
        idlist = bindict['idlist']
        if not idlist:
            return ''
        header = '%d older than %s days old:' % (len(idlist), bindict['age'])
        return '\n'.join([header] + idlist)

    total = 0
    for crashid, crash in self.crashes.items():
        total += 1
        stamp = self.time_from_string(crash['timestamp'])
        # Deliberately no early exit: every bin whose limit the crash
        # exceeds lists it.
        for bindict in bins:
            if stamp <= bindict['agelimit']:
                bindict['idlist'].append(crashid)

    retlines = ['%d crashes recorded' % total]
    retlines.extend(binstr(bindict) for bindict in bins)
    return 0, '\n'.join(retlines), ''
279 | ||
def do_json_report(self, cmd, inbuf):
    """
    Return a machine readable summary of recent crashes.

    Counts, per process name, the crashes whose timestamp falls within
    the last ``cmd['hours']`` hours. A missing or empty 'process_name'
    is counted as "unknown".

    :returns: (0, '', json) - the JSON text is in the third element,
              as before - or (EINVAL, '', message) when 'hours' is not
              an integer
    """
    try:
        hours = int(cmd['hours'])
    except ValueError:
        return errno.EINVAL, '', '<hours> argument must be integer'

    try:
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
    except OverflowError:
        # Window too large for datetime to represent: include everything.
        cutoff = datetime.datetime.min

    report = defaultdict(int)
    for crash in self.crashes.values():
        # Honour <hours>: skip crashes older than the window.
        if self.time_from_string(crash['timestamp']) < cutoff:
            continue
        pname = crash.get("process_name") or "unknown"
        report[pname] += 1

    return 0, '', json.dumps(report)
297 | ||
def self_test(self):
    """
    Check that time_from_string() parses a known timestamp.

    :raises RuntimeError: if the parsed value is not the expected one
    """
    sample = '2018-06-22 20:35:38.058818Z'
    expected = datetime.datetime(2018, 6, 22, 20, 35, 38, 58818)
    if self.time_from_string(sample) != expected:
        raise RuntimeError('time_from_string() failed')
304 | ||
# Command table read by handle_command(): the first entry whose 'cmd'
# matches the request prefix wins, so the order is significant.
COMMANDS = [
    # --- inspection ---
    {
        'cmd': 'crash info name=id,type=CephString',
        'desc': 'show crash dump metadata',
        'perm': 'r',
        'handler': do_info,
    },
    {
        'cmd': 'crash ls',
        'desc': 'Show new and archived crash dumps',
        'perm': 'r',
        'handler': do_ls,
    },
    {
        'cmd': 'crash ls-new',
        'desc': 'Show new crash dumps',
        'perm': 'r',
        'handler': do_ls,
    },
    # --- adding and removing ---
    {
        'cmd': 'crash post',
        'desc': 'Add a crash dump (use -i <jsonfile>)',
        'perm': 'rw',
        'handler': do_post,
    },
    {
        'cmd': 'crash prune name=keep,type=CephString',
        'desc': 'Remove crashes older than <keep> days',
        'perm': 'rw',
        'handler': do_prune,
    },
    {
        'cmd': 'crash rm name=id,type=CephString',
        'desc': 'Remove a saved crash <id>',
        'perm': 'rw',
        'handler': do_rm,
    },
    # --- reporting ---
    {
        'cmd': 'crash stat',
        'desc': 'Summarize recorded crashes',
        'perm': 'r',
        'handler': do_stat,
    },
    {
        'cmd': 'crash json_report name=hours,type=CephString',
        'desc': 'Crashes in the last <hours> hours',
        'perm': 'r',
        'handler': do_json_report,
    },
    # --- acknowledging ---
    {
        'cmd': 'crash archive name=id,type=CephString',
        'desc': 'Acknowledge a crash and silence health warning(s)',
        'perm': 'w',
        'handler': do_archive,
    },
    {
        'cmd': 'crash archive-all',
        'desc': 'Acknowledge all new crashes and silence health warning(s)',
        'perm': 'w',
        'handler': do_archive_all,
    },
]