]>
git.proxmox.com Git - ceph.git/blob - ceph/src/pybind/mgr/crash/module.py
2 from mgr_module
import MgrModule
6 from collections
import defaultdict
7 from prettytable
import PrettyTable
9 from threading
import Event
# strptime() pattern for crash timestamps that separate date and time with
# 'T', e.g. 2018-06-22T20:35:38.058818 (any trailing 'Z' is stripped first).
DATEFMT = '%Y-%m-%dT%H:%M:%S.%f'
# Older pattern that separates date and time with a space,
# e.g. 2018-06-22 20:35:38.058818.
OLD_DATEFMT = '%Y-%m-%d %H:%M:%S.%f'
18 class Module(MgrModule
):
21 'name': 'warn_recent_interval',
23 'default': 60*60*24*14,
24 'desc': 'time interval in which to warn about recent crashes',
28 'name': 'retain_interval',
30 'default': 60*60*24 * 365,
31 'desc': 'how long to retain crashes before pruning them',
36 def __init__(self
, *args
, **kwargs
):
37 super(Module
, self
).__init
__(*args
, **kwargs
)
49 self
._refresh
_health
_checks
()
50 self
._prune
(self
.retain_interval
)
51 wait
= min(MAX_WAIT
, max(self
.warn_recent_interval
/ 100, MIN_WAIT
))
55 def config_notify(self
):
56 for opt
in self
.MODULE_OPTIONS
:
59 self
.get_module_option(opt
['name']))
60 self
.log
.debug(' mgr option %s = %s',
61 opt
['name'], getattr(self
, opt
['name']))
def _load_crashes(self):
    """Rebuild ``self.crashes`` from the entries stored under ``crash/``.

    Every stored key has the ``crash/`` prefix stripped to give the crash
    id, and every stored value is decoded from JSON.  Any previous content
    of ``self.crashes`` is replaced.
    """
    prefix = 'crash/'
    raw = self.get_store_prefix(prefix)
    # Slice by the prefix length rather than a bare 6 so the two stay in
    # sync if the prefix ever changes.
    self.crashes = {
        key[len(prefix):]: json.loads(blob) for key, blob in raw.items()
    }
67 def _refresh_health_checks(self
):
70 cutoff
= datetime
.datetime
.utcnow() - datetime
.timedelta(
71 seconds
=self
.warn_recent_interval
)
73 crashid
: crash
for crashid
, crash
in self
.crashes
.items()
74 if self
.time_from_string(crash
['timestamp']) > cutoff
and 'archived' not in crash
80 '%s crashed on host %s at %s' % (
81 crash
.get('entity_name', 'unidentified daemon'),
82 crash
.get('utsname_hostname', '(unknown)'),
83 crash
.get('timestamp', 'unknown time'))
84 for (_
, crash
) in recent
.items()]
87 detail
.append('and %d more' % (num
- 30))
88 self
.log
.debug('detail %s' % detail
)
89 health_checks
['RECENT_CRASH'] = {
90 'severity': 'warning',
91 'summary': '%d daemons have recently crashed' % (num
),
95 self
.set_health_checks(health_checks
)
97 def handle_command(self
, inbuf
, command
):
100 for cmd
in self
.COMMANDS
:
101 if cmd
['cmd'].startswith(command
['prefix']):
102 handler
= cmd
['handler']
105 return errno
.EINVAL
, '', 'unknown command %s' % command
['prefix']
107 return handler(self
, command
, inbuf
)
def time_from_string(self, timestr):
    """Parse a crash timestamp string into a naive ``datetime``.

    :param timestr: timestamp in ``DATEFMT`` or ``OLD_DATEFMT`` layout,
        optionally followed by ``Z``
    :returns: the parsed ``datetime.datetime`` (no tzinfo attached)
    :raises ValueError: if the string matches neither layout
    """
    # drop the 'Z' timezone indication, it's always UTC
    timestr = timestr.rstrip('Z')
    try:
        return datetime.datetime.strptime(timestr, DATEFMT)
    except ValueError:
        # Not the 'T'-separated layout; fall back to the older
        # space-separated one and let its ValueError propagate.
        return datetime.datetime.strptime(timestr, OLD_DATEFMT)
def validate_crash_metadata(self, inbuf):
    """Decode a posted crash report and check its mandatory fields.

    :param inbuf: JSON text describing the crash
    :returns: the decoded metadata
    :raises AttributeError: if ``crash_id`` or ``timestamp`` is absent

    Errors from JSON decoding or from parsing the timestamp are not
    caught here.
    """
    # raise any exceptions to caller
    metadata = json.loads(inbuf)
    for f in ['crash_id', 'timestamp']:
        if f not in metadata:
            raise AttributeError("missing '%s' field" % f)
    # Parsed purely to verify the format; the resulting value is unused.
    self.time_from_string(metadata['timestamp'])
    return metadata
126 def timestamp_filter(self
, f
):
128 Filter crash reports by timestamp.
130 :param f: f(time) return true to keep crash report
131 :returns: crash reports for which f(time) returns true
135 time
= self
.time_from_string(crash
["timestamp"])
137 return filter(inner
, self
.crashes
.items())
139 # stack signature helpers
def sanitize_backtrace(self, bt):
    """Reduce each backtrace record to the text before its first ``+``.

    :param bt: iterable of frame strings such as ``"(func()+0x12) [0xaddr]"``
    :returns: list with one trimmed string per input record, in order
    """
    ret = []
    for func_record in bt:
        # split into two fields on last space, take the first one,
        # strip off leading ( and trailing )
        func_plus_offset = func_record.rsplit(' ', 1)[0][1:-1]
        # keep only what precedes the '+offset' suffix
        ret.append(func_plus_offset.split('+')[0])
    return ret
# Group 1: everything before " thread "; group 2: the last ": ..." clause
# that is followed by a newline.  (?s) lets '.' span newlines.
ASSERT_MATCHEXPR = re.compile(r'(?s)(.*) thread .* time .*(: .*)\n')


def sanitize_assert_msg(self, msg):
    """Strip the run-specific parts out of an assert message.

    :param msg: assert message text
    :returns: concatenation of the two groups captured by
        ``ASSERT_MATCHEXPR``
    :raises AttributeError: if ``msg`` does not match the expression
        (``match()`` returns ``None`` in that case)
    """
    # (?s) allows matching newline. get everything up to "thread" and
    # then after-and-including the last colon-space. This skips the
    # thread id, timestamp, and file:lineno, because file is already in
    # the beginning, and lineno may vary.
    return ''.join(self.ASSERT_MATCHEXPR.match(msg).groups())
def calc_sig(self, bt, assert_msg):
    """Compute a SHA-256 signature for a crash.

    The digest is fed the sanitized backtrace entries in order, followed
    by the sanitized assert message when one is supplied.

    :param bt: backtrace records, passed to ``sanitize_backtrace``
    :param assert_msg: assert message text, or a false value if the crash
        has none
    :returns: lowercase hexadecimal digest string
    """
    sig = hashlib.sha256()
    for func in self.sanitize_backtrace(bt):
        sig.update(func.encode())
    # The caller passes metadata.get('assert_msg'), which may be None, so
    # only sanitize when a message is actually present.
    if assert_msg:
        sig.update(self.sanitize_assert_msg(assert_msg).encode())
    return sig.hexdigest()
def do_info(self, cmd, inbuf):
    """Handle ``crash info``: dump one crash's metadata as JSON.

    :param cmd: command dict; ``cmd['id']`` is the crash id to look up
    :param inbuf: unused
    :returns: ``(0, json_text, '')`` on success, or
        ``(errno.EINVAL, '', message)`` when the id is not known
    """
    crashid = cmd['id']
    crash = self.crashes.get(crashid)
    if not crash:
        return errno.EINVAL, '', 'crash info: %s not found' % crashid
    val = json.dumps(crash, indent=4, sort_keys=True)
    return 0, val, ''
178 def do_post(self
, cmd
, inbuf
):
180 metadata
= self
.validate_crash_metadata(inbuf
)
181 except Exception as e
:
182 return errno
.EINVAL
, '', 'malformed crash metadata: %s' % e
183 if 'backtrace' in metadata
:
184 metadata
['stack_sig'] = self
.calc_sig(
185 metadata
.get('backtrace'), metadata
.get('assert_msg'))
186 crashid
= metadata
['crash_id']
188 if crashid
not in self
.crashes
:
189 self
.crashes
[crashid
] = metadata
190 key
= 'crash/%s' % crashid
191 self
.set_store(key
, json
.dumps(metadata
))
192 self
._refresh
_health
_checks
()
198 return self
.do_ls({'prefix': 'crash ls'}, '')
200 def do_ls(self
, cmd
, inbuf
):
201 if cmd
['prefix'] == 'crash ls':
202 t
= self
.crashes
.values()
204 t
= [crash
for crashid
, crash
in self
.crashes
.items()
205 if 'archived' not in crash
]
206 r
= sorted(t
, key
=lambda i
: i
.get('crash_id'))
207 if cmd
.get('format') == 'json' or cmd
.get('format') == 'json-pretty':
208 return 0, json
.dumps(r
, indent
=4, sort_keys
=True), ''
210 table
= PrettyTable(['ID', 'ENTITY', 'NEW'],
212 table
.left_padding_width
= 0
213 table
.right_padding_width
= 2
214 table
.align
['ID'] = 'l'
215 table
.align
['ENTITY'] = 'l'
217 table
.add_row([c
.get('crash_id'),
218 c
.get('entity_name','unknown'),
219 '' if 'archived' in c
else '*'])
220 return 0, table
.get_string(), ''
def do_rm(self, cmd, inbuf):
    """Handle ``crash rm``: forget one saved crash.

    :param cmd: command dict; ``cmd['id']`` is the crash id to remove
    :param inbuf: unused
    :returns: ``(0, '', '')`` whether or not the id was known
    """
    crashid = cmd['id']
    if crashid in self.crashes:
        del self.crashes[crashid]
        key = 'crash/%s' % crashid
        self.set_store(key, None)  # removes key
        self._refresh_health_checks()
    return 0, '', ''
231 def do_prune(self
, cmd
, inbuf
):
236 return errno
.EINVAL
, '', 'keep argument must be integer'
238 self
._prune
(keep
* 60*60*24)
241 def _prune(self
, seconds
):
242 now
= datetime
.datetime
.utcnow()
243 cutoff
= now
- datetime
.timedelta(seconds
=seconds
)
245 # make a copy of the list, since we'll modify self.crashes below
246 to_prune
= list(self
.timestamp_filter(lambda ts
: ts
<= cutoff
))
247 for crashid
, crash
in to_prune
:
248 del self
.crashes
[crashid
]
249 key
= 'crash/%s' % crashid
250 self
.set_store(key
, None)
253 self
._refresh
_health
_checks
()
def do_archive(self, cmd, inbuf):
    """Handle ``crash archive``: mark one crash as acknowledged.

    A crash that is not yet archived gets an ``archived`` field holding the
    current UTC time as a string, is written back to the store, and the
    health checks are refreshed.  An already archived crash is left alone.

    :param cmd: command dict; ``cmd['id']`` is the crash id to archive
    :param inbuf: unused
    :returns: ``(0, '', '')`` on success, or
        ``(errno.EINVAL, '', message)`` when the id is not known
    """
    crashid = cmd['id']
    crash = self.crashes.get(crashid)
    if not crash:
        return errno.EINVAL, '', 'crash info: %s not found' % crashid
    if not crash.get('archived'):
        # `crash` is the dict held in self.crashes, so updating it in
        # place is enough; no re-assignment into the mapping is needed.
        crash['archived'] = str(datetime.datetime.utcnow())
        key = 'crash/%s' % crashid
        self.set_store(key, json.dumps(crash))
        self._refresh_health_checks()
    return 0, '', ''
def do_archive_all(self, cmd, inbuf):
    """Handle ``crash archive-all``: acknowledge every new crash.

    Each crash without a truthy ``archived`` field gets one holding the
    current UTC time as a string and is written back to the store.  The
    health checks are refreshed once afterwards.

    :param cmd: unused
    :param inbuf: unused
    :returns: ``(0, '', '')``
    """
    for crashid, crash in self.crashes.items():
        if crash.get('archived'):
            continue
        # `crash` is the dict stored in self.crashes; update it in place.
        crash['archived'] = str(datetime.datetime.utcnow())
        key = 'crash/%s' % crashid
        self.set_store(key, json.dumps(crash))
    self._refresh_health_checks()
    return 0, '', ''
278 def do_stat(self
, cmd
, inbuf
):
279 # age in days for reporting, ordered smallest first
285 count
= len(bindict
['idlist'])
288 '%d older than %s days old:' % (count
, bindict
['age'])
290 for crashid
in bindict
['idlist']:
291 binlines
.append(crashid
)
292 return '\n'.join(binlines
)
295 now
= datetime
.datetime
.utcnow()
296 for i
, age
in enumerate(bins
):
297 agelimit
= now
- datetime
.timedelta(days
=age
)
300 'agelimit': agelimit
,
304 for crashid
, crash
in self
.crashes
.items():
306 stamp
= self
.time_from_string(crash
['timestamp'])
307 for i
, bindict
in enumerate(bins
):
308 if stamp
<= bindict
['agelimit']:
309 bindict
['idlist'].append(crashid
)
310 # don't count this one again
313 retlines
.append('%d crashes recorded' % total
)
316 retlines
.append(binstr(bindict
))
317 return 0, '\n'.join(retlines
), ''
319 def do_json_report(self
, cmd
, inbuf
):
321 Return a machine readable summary of recent crashes.
324 hours
= int(cmd
['hours'])
326 return errno
.EINVAL
, '', '<hours> argument must be integer'
328 report
= defaultdict(lambda: 0)
329 for crashid
, crash
in self
.crashes
.items():
330 pname
= crash
.get("process_name", "unknown")
335 return 0, '', json
.dumps(report
, sort_keys
=True)
338 # test time conversion
339 timestr
= '2018-06-22T20:35:38.058818Z'
340 old_timestr
= '2018-06-22 20:35:38.058818Z'
341 dt
= self
.time_from_string(timestr
)
342 if dt
!= datetime
.datetime(2018, 6, 22, 20, 35, 38, 58818):
343 raise RuntimeError('time_from_string() failed')
344 dt
= self
.time_from_string(old_timestr
)
345 if dt
!= datetime
.datetime(2018, 6, 22, 20, 35, 38, 58818):
346 raise RuntimeError('time_from_string() (old) failed')
350 'cmd': 'crash info name=id,type=CephString',
351 'desc': 'show crash dump metadata',
357 'desc': 'Show new and archived crash dumps',
362 'cmd': 'crash ls-new',
363 'desc': 'Show new crash dumps',
369 'desc': 'Add a crash dump (use -i <jsonfile>)',
374 'cmd': 'crash prune name=keep,type=CephString',
375 'desc': 'Remove crashes older than <keep> days',
380 'cmd': 'crash rm name=id,type=CephString',
381 'desc': 'Remove a saved crash <id>',
387 'desc': 'Summarize recorded crashes',
392 'cmd': 'crash json_report name=hours,type=CephString',
393 'desc': 'Crashes in the last <hours> hours',
395 'handler': do_json_report
,
398 'cmd': 'crash archive name=id,type=CephString',
399 'desc': 'Acknowledge a crash and silence health warning(s)',
401 'handler': do_archive
,
404 'cmd': 'crash archive-all',
405 'desc': 'Acknowledge all new crashes and silence health warning(s)',
407 'handler': do_archive_all
,