]> git.proxmox.com Git - ceph.git/blob - ceph/src/ceph-crash.in
ae0e4f516464fd4d63a96d497cfe3d7545f89eda
[ceph.git] / ceph / src / ceph-crash.in
1 #!@Python3_EXECUTABLE@
2 # -*- mode:python -*-
3 # vim: ts=4 sw=4 smarttab expandtab
4
5 import argparse
6 import logging
7 import os
8 import signal
9 import socket
10 import subprocess
11 import sys
12 import time
13
# Emit INFO and above by default; main() switches this logger to DEBUG
# when '--log-level DEBUG' is given.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger('ceph-crash')

# Ceph entity names passed to 'ceph -n' by post_crash(), tried in this order
# until one post succeeds.  main() replaces the whole list with a single
# entry when '--name' is given.
auth_names = ['client.crash.%s' % socket.gethostname(),
              'client.crash',
              'client.admin']
20
def parse_args():
    """Parse the command line (``sys.argv``) and return the argparse namespace.

    The namespace carries ``path`` (str), ``delay`` (float, minutes),
    ``name`` (str or None) and ``log_level`` (str or None).
    """
    # (option strings, add_argument keyword settings), in display order.
    option_table = (
        (('-p', '--path'),
         dict(default='/var/lib/ceph/crash',
              help='base path to monitor for crash dumps')),
        (('-d', '--delay'),
         dict(default=10.0, type=float,
              help='minutes to delay between scans (0 to exit after one)')),
        (('--name', '-n'),
         dict(help='ceph name to authenticate as (default: try client.crash, client.admin)')),
        (('--log-level', '-l'),
         dict(help='log level output (default: INFO), support INFO or DEBUG')),
    )

    cli = argparse.ArgumentParser()
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()
38
39
def post_crash(path):
    """Post the crash report stored in directory *path* to the cluster.

    The contents of ``<path>/meta`` are piped to
    ``timeout 30 ceph -n <name> crash post -i -`` once for each entry of the
    module-level ``auth_names`` list, stopping at the first name for which
    the command exits with status 0.

    Returns the exit status of the last attempt (0 means the report was
    posted; 0 is also returned when ``auth_names`` is empty).
    Raises OSError if the meta file cannot be read or the command cannot
    be started.
    """
    # The report is the same for every attempt, so read it once, and do it
    # before spawning anything so a read error leaves no child process behind.
    with open(os.path.join(path, 'meta'), 'rb') as f:
        meta = f.read()

    rc = 0
    for n in auth_names:
        pr = subprocess.Popen(
            args=['timeout', '30', 'ceph',
                  '-n', n,
                  'crash', 'post', '-i', '-'],
            stdin=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # communicate() returns a (stdout, stderr) tuple of bytes; stdout is
        # not piped here, so only the second element carries data.  Comparing
        # the tuple itself against "" would always report a failure.
        _, err = pr.communicate(input=meta)
        stderr = err.decode(errors='replace')
        rc = pr.wait()
        if rc != 0 or stderr != "":
            log.warning('post %s as %s failed: %s' % (path, n, stderr))
        if rc == 0:
            break
    return rc
59
60
def scrape_path(path):
    """Post every finished crash dump found directly under *path*.

    An entry of *path* is treated as a crash dump when it contains a ``meta``
    file, and as finished when it also contains a ``done`` file.  Each
    finished dump is handed to post_crash(); when that returns 0 the dump
    directory is moved to ``<path>/posted/<entry>``.

    Dumps that are still unfinished after a one second grace period are
    skipped for this scan; the remaining entries are still processed.
    Returns None.
    """
    for p in os.listdir(path):
        crashpath = os.path.join(path, p)
        metapath = os.path.join(crashpath, 'meta')
        donepath = os.path.join(crashpath, 'done')
        if not os.path.isfile(metapath):
            # not a crash dump (e.g. the 'posted' directory itself)
            continue
        if not os.path.isfile(donepath):
            # hang out just for a bit; either we interrupted the dump
            # or the daemon crashed before finishing it
            time.sleep(1)
            if not os.path.isfile(donepath):
                # Skip only this dump.  Returning here would abort the whole
                # scan, so a dump that never completes could keep later
                # entries from ever being posted.
                continue
        # ok, we can process this one
        rc = post_crash(crashpath)
        if rc == 0:
            os.rename(crashpath, os.path.join(path, 'posted/', p))
            log.debug(
                "posted %s and renamed %s -> %s " %
                (metapath, p, os.path.join('posted/', p))
            )
81
def handler(signum, frame=None):
    """Signal handler: report the signal number and exit with status 0.

    Python calls handlers installed with signal.signal() with two arguments,
    the signal number and the current stack frame, so *frame* must be
    accepted even though it is unused.  It defaults to None so the function
    can still be called with the signal number alone.

    Raises SystemExit (code 0).
    """
    print('*** Interrupted with signal %d ***' % signum)
    sys.exit(0)
85
def main():
    """Run the crash scraper: set up signals and options, then scan in a loop.

    Installs ``handler`` for SIGINT and SIGTERM, applies the command-line
    options, waits (retrying every 30 seconds) until ``<path>/posted``
    exists, and then calls scrape_path() repeatedly, sleeping ``--delay``
    minutes between scans.  With a delay of 0 it exits with status 0 after
    the first scan.
    """
    global auth_names

    # exit code 0 on SIGINT, SIGTERM
    for signo in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signo, handler)

    opts = parse_args()
    if opts.log_level == 'DEBUG':
        log.setLevel(logging.DEBUG)

    # An explicit --name replaces the default list of names to try.
    if opts.name:
        auth_names = [opts.name]

    posted_dir = os.path.join(opts.path, 'posted')
    while True:
        if os.path.isdir(posted_dir):
            break
        log.error("directory %s does not exist; please create" % posted_dir)
        time.sleep(30)

    log.info("monitoring path %s, delay %ds" % (opts.path, opts.delay * 60.0))
    single_pass = opts.delay == 0
    pause_seconds = opts.delay * 60
    while True:
        scrape_path(opts.path)
        if single_pass:
            sys.exit(0)
        time.sleep(pause_seconds)
111
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()