# ceph/src/pybind/mgr/crash/module.py
1 from mgr_module
import MgrModule
5 from collections
import defaultdict
6 from prettytable
import PrettyTable
7 from threading
import Event
# strftime/strptime pattern for crash 'timestamp' fields: naive UTC with
# microsecond precision. Stored timestamps carry a trailing 'Z' which is
# stripped before parsing (see time_from_string).
DATEFMT = '%Y-%m-%d %H:%M:%S.%f'
15 class Module(MgrModule
):
18 'name': 'warn_recent_interval',
20 'default': 60*60*24*14,
21 'desc': 'time interval in which to warn about recent crashes',
25 'name': 'retain_interval',
27 'default': 60*60*24 * 365,
28 'desc': 'how long to retain crashes before pruning them',
33 def __init__(self
, *args
, **kwargs
):
34 super(Module
, self
).__init
__(*args
, **kwargs
)
46 self
._refresh
_health
_checks
()
47 self
._prune
(self
.retain_interval
)
48 wait
= min(MAX_WAIT
, max(self
.warn_recent_interval
/ 100, MIN_WAIT
))
52 def config_notify(self
):
53 for opt
in self
.MODULE_OPTIONS
:
56 self
.get_module_option(opt
['name']) or opt
['default'])
57 self
.log
.debug(' mgr option %s = %s',
58 opt
['name'], getattr(self
, opt
['name']))
60 def _load_crashes(self
):
61 raw
= self
.get_store_prefix('crash/')
62 self
.crashes
= {k
[6:]: json
.loads(m
) for (k
, m
) in raw
.items()}
64 def _refresh_health_checks(self
):
67 cutoff
= datetime
.datetime
.utcnow() - datetime
.timedelta(
68 seconds
=self
.warn_recent_interval
)
70 crashid
: crash
for crashid
, crash
in self
.crashes
.items()
71 if self
.time_from_string(crash
['timestamp']) > cutoff
and 'archived' not in crash
77 '%s crashed on host %s at %s' % (
78 crash
.get('entity_name', 'unidentified daemon'),
79 crash
.get('utsname_hostname', '(unknown)'),
80 crash
.get('timestamp', 'unknown time'))
81 for (_
, crash
) in recent
.items()]
84 detail
.append('and %d more' % (num
- 30))
85 self
.log
.debug('detail %s' % detail
)
86 health_checks
['RECENT_CRASH'] = {
87 'severity': 'warning',
88 'summary': '%d daemons have recently crashed' % (num
),
91 self
.set_health_checks(health_checks
)
93 def handle_command(self
, inbuf
, command
):
96 for cmd
in self
.COMMANDS
:
97 if cmd
['cmd'].startswith(command
['prefix']):
98 handler
= cmd
['handler']
101 return errno
.EINVAL
, '', 'unknown command %s' % command
['prefix']
103 return handler(self
, command
, inbuf
)
105 def time_from_string(self
, timestr
):
106 # drop the 'Z' timezone indication, it's always UTC
107 timestr
= timestr
.rstrip('Z')
108 return datetime
.datetime
.strptime(timestr
, DATEFMT
)
110 def validate_crash_metadata(self
, inbuf
):
111 # raise any exceptions to caller
112 metadata
= json
.loads(inbuf
)
113 for f
in ['crash_id', 'timestamp']:
114 if f
not in metadata
:
115 raise AttributeError("missing '%s' field" % f
)
116 time
= self
.time_from_string(metadata
['timestamp'])
119 def timestamp_filter(self
, f
):
121 Filter crash reports by timestamp.
123 :param f: f(time) return true to keep crash report
124 :returns: crash reports for which f(time) returns true
128 time
= self
.time_from_string(crash
["timestamp"])
130 return filter(inner
, self
.crashes
.items())
134 def do_info(self
, cmd
, inbuf
):
136 crash
= self
.crashes
.get(crashid
)
138 return errno
.EINVAL
, '', 'crash info: %s not found' % crashid
139 val
= json
.dumps(crash
, indent
=4)
142 def do_post(self
, cmd
, inbuf
):
144 metadata
= self
.validate_crash_metadata(inbuf
)
145 except Exception as e
:
146 return errno
.EINVAL
, '', 'malformed crash metadata: %s' % e
147 crashid
= metadata
['crash_id']
149 if crashid
not in self
.crashes
:
150 self
.crashes
[crashid
] = metadata
151 key
= 'crash/%s' % crashid
152 self
.set_store(key
, json
.dumps(metadata
))
153 self
._refresh
_health
_checks
()
159 return self
.do_ls({'prefix': 'crash ls'}, '')
161 def do_ls(self
, cmd
, inbuf
):
162 if cmd
['prefix'] == 'crash ls':
163 t
= self
.crashes
.values()
165 t
= [crash
for crashid
, crash
in self
.crashes
.items()
166 if 'archived' not in crash
]
167 r
= sorted(t
, key
=lambda i
: i
.get('crash_id'))
168 if cmd
.get('format') == 'json' or cmd
.get('format') == 'json-pretty':
169 return 0, json
.dumps(r
, indent
=4), ''
171 table
= PrettyTable(['ID', 'ENTITY', 'NEW'],
173 table
.left_padding_width
= 0
174 table
.right_padding_width
= 1
175 table
.align
['ID'] = 'l'
176 table
.align
['ENTITY'] = 'l'
178 table
.add_row([c
.get('crash_id'),
179 c
.get('entity_name','unknown'),
180 '' if 'archived' in c
else '*'])
181 return 0, table
.get_string(), ''
183 def do_rm(self
, cmd
, inbuf
):
185 if crashid
in self
.crashes
:
186 del self
.crashes
[crashid
]
187 key
= 'crash/%s' % crashid
188 self
.set_store(key
, None) # removes key
189 self
._refresh
_health
_checks
()
192 def do_prune(self
, cmd
, inbuf
):
197 return errno
.EINVAL
, '', 'keep argument must be integer'
199 self
._prune
(keep
* 60*60*24)
202 def _prune(self
, seconds
):
203 now
= datetime
.datetime
.utcnow()
204 cutoff
= now
- datetime
.timedelta(seconds
=seconds
)
206 # make a copy of the list, since we'll modify self.crashes below
207 to_prune
= list(self
.timestamp_filter(lambda ts
: ts
<= cutoff
))
208 for crashid
, crash
in to_prune
:
209 del self
.crashes
[crashid
]
210 key
= 'crash/%s' % crashid
211 self
.set_store(key
, None)
214 self
._refresh
_health
_checks
()
216 def do_archive(self
, cmd
, inbuf
):
218 crash
= self
.crashes
.get(crashid
)
220 return errno
.EINVAL
, '', 'crash info: %s not found' % crashid
221 if not crash
.get('archived'):
222 crash
['archived'] = str(datetime
.datetime
.utcnow())
223 self
.crashes
[crashid
] = crash
224 key
= 'crash/%s' % crashid
225 self
.set_store(key
, json
.dumps(crash
))
226 self
._refresh
_health
_checks
()
229 def do_archive_all(self
, cmd
, inbuf
):
230 for crashid
, crash
in self
.crashes
.items():
231 if not crash
.get('archived'):
232 crash
['archived'] = str(datetime
.datetime
.utcnow())
233 self
.crashes
[crashid
] = crash
234 key
= 'crash/%s' % crashid
235 self
.set_store(key
, json
.dumps(crash
))
236 self
._refresh
_health
_checks
()
239 def do_stat(self
, cmd
, inbuf
):
240 # age in days for reporting, ordered smallest first
246 count
= len(bindict
['idlist'])
249 '%d older than %s days old:' % (count
, bindict
['age'])
251 for crashid
in bindict
['idlist']:
252 binlines
.append(crashid
)
253 return '\n'.join(binlines
)
256 now
= datetime
.datetime
.utcnow()
257 for i
, age
in enumerate(bins
):
258 agelimit
= now
- datetime
.timedelta(days
=age
)
261 'agelimit': agelimit
,
265 for crashid
, crash
in self
.crashes
.items():
267 stamp
= self
.time_from_string(crash
['timestamp'])
268 for i
, bindict
in enumerate(bins
):
269 if stamp
<= bindict
['agelimit']:
270 bindict
['idlist'].append(crashid
)
271 # don't count this one again
274 retlines
.append('%d crashes recorded' % total
)
277 retlines
.append(binstr(bindict
))
278 return 0, '\n'.join(retlines
), ''
280 def do_json_report(self
, cmd
, inbuf
):
282 Return a machine readable summary of recent crashes.
285 hours
= int(cmd
['hours'])
287 return errno
.EINVAL
, '', '<hours> argument must be integer'
289 report
= defaultdict(lambda: 0)
290 for crashid
, crash
in self
.crashes
.items():
291 pname
= crash
.get("process_name", "unknown")
296 return 0, '', json
.dumps(report
)
299 # test time conversion
300 timestr
= '2018-06-22 20:35:38.058818Z'
301 dt
= self
.time_from_string(timestr
)
302 if dt
!= datetime
.datetime(2018, 6, 22, 20, 35, 38, 58818):
303 raise RuntimeError('time_from_string() failed')
307 'cmd': 'crash info name=id,type=CephString',
308 'desc': 'show crash dump metadata',
314 'desc': 'Show new and archived crash dumps',
319 'cmd': 'crash ls-new',
320 'desc': 'Show new crash dumps',
326 'desc': 'Add a crash dump (use -i <jsonfile>)',
331 'cmd': 'crash prune name=keep,type=CephString',
332 'desc': 'Remove crashes older than <keep> days',
337 'cmd': 'crash rm name=id,type=CephString',
338 'desc': 'Remove a saved crash <id>',
344 'desc': 'Summarize recorded crashes',
349 'cmd': 'crash json_report name=hours,type=CephString',
350 'desc': 'Crashes in the last <hours> hours',
352 'handler': do_json_report
,
355 'cmd': 'crash archive name=id,type=CephString',
356 'desc': 'Acknowledge a crash and silence health warning(s)',
358 'handler': do_archive
,
361 'cmd': 'crash archive-all',
362 'desc': 'Acknowledge all new crashes and silence health warning(s)',
364 'handler': do_archive_all
,