]> git.proxmox.com Git - ceph.git/blame - ceph/src/ceph-crash.in
import 15.2.1 Octopus source
[ceph.git] / ceph / src / ceph-crash.in
CommitLineData
9f95a23c 1#!@Python3_EXECUTABLE@
11fdf7f2
TL
2# -*- mode:python -*-
3# vim: ts=4 sw=4 smarttab expandtab
4
5import argparse
6import logging
7import os
9f95a23c 8import signal
eafe8130 9import socket
11fdf7f2
TL
10import subprocess
11import sys
12import time
13
# Configure root logging at INFO level and create this tool's logger.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger('ceph-crash')

# Entity names tried, in this order, when posting a crash report:
# the host-specific crash client, the generic crash client, then admin.
auth_names = [
    'client.crash.' + socket.gethostname(),
    'client.crash',
    'client.admin',
]
11fdf7f2
TL
20
def parse_args():
    """Parse the process command line for the crash scraper.

    Returns:
        argparse.Namespace with the attributes ``path`` (str, base
        directory to scan), ``delay`` (float, minutes between scans) and
        ``name`` (str or None, entity name to authenticate as).
    """
    cli = argparse.ArgumentParser()

    cli.add_argument('-p', '--path',
                     default='/var/lib/ceph/crash',
                     help='base path to monitor for crash dumps')

    cli.add_argument('-d', '--delay',
                     type=float,
                     default=10.0,
                     help='minutes to delay between scans '
                          '(0 to exit after one)')

    cli.add_argument('--name', '-n',
                     help='ceph name to authenticate as '
                          '(default: try client.crash, client.admin)')

    options = cli.parse_args()
    return options
34
35
def post_crash(path):
    """Post one crash report to the cluster with ``ceph crash post``.

    The ``meta`` file inside *path* is piped to
    ``ceph -n <name> crash post -i -`` once for each entry of the
    module-level ``auth_names`` list, in order, stopping at the first
    attempt that exits with status 0.  Every attempt is wrapped in
    ``timeout 30``.

    Args:
        path: crash directory that contains the ``meta`` file.

    Returns:
        Exit status of the last attempt; 0 means the report was posted.
        (0 is also returned when ``auth_names`` is empty, because no
        attempt was made.)

    Raises:
        OSError: if ``meta`` cannot be read or the command cannot be
            spawned.
    """
    # Read the report once, before spawning anything: every attempt sends
    # the same bytes, and a failed read must not leave a child process
    # behind with open pipes or leak the file handle.
    with open(os.path.join(path, 'meta'), 'rb') as f:
        meta = f.read()

    rc = 0
    for n in auth_names:
        pr = subprocess.Popen(
            args=['timeout', '30', 'ceph',
                  '-n', n,
                  'crash', 'post', '-i', '-'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # communicate() feeds stdin, drains both pipes and waits for the
        # child, so returncode is already set afterwards.
        _, stderr = pr.communicate(input=meta)
        rc = pr.returncode
        if rc == 0:
            break
        # stderr is bytes; decode it so the log shows readable text
        # instead of a b'...' repr.
        log.warning('post %s as %s failed: %s',
                    path, n, stderr.decode('utf-8', 'replace').strip())
    return rc
56
57
def scrape_path(path):
    """Post every finished crash dump found directly under *path*.

    An entry of *path* is treated as a crash dump when it contains a
    ``meta`` file, and as finished when it also contains a ``done`` file.
    Each finished dump is handed to ``post_crash``; on success (return
    value 0) its directory is renamed into ``<path>/posted/``.

    Args:
        path: base directory whose entries are examined.

    Raises:
        OSError: if *path* cannot be listed or a rename fails.
    """
    for p in os.listdir(path):
        crashpath = os.path.join(path, p)
        metapath = os.path.join(crashpath, 'meta')
        donepath = os.path.join(crashpath, 'done')

        if not os.path.isfile(metapath):
            # not a crash dump directory (e.g. 'posted' itself)
            continue

        if not os.path.isfile(donepath):
            # hang out just for a bit; either we interrupted the dump
            # or the daemon crashed before finishing it
            time.sleep(1)
            if not os.path.isfile(donepath):
                # Still incomplete: skip only this dump.  Returning here
                # would let one dump that never gets its 'done' file block
                # every entry listed after it, on every scan.
                continue

        # ok, we can process this one
        rc = post_crash(crashpath)
        if rc != 0:
            # leave it in place so the next scan retries
            continue

        os.rename(crashpath, os.path.join(path, 'posted/', p))
        log.debug(
            "posted %s and renamed %s -> %s ",
            metapath, p, os.path.join('posted/', p),
        )
78
9f95a23c
TL
def handler(signum, frame):
    """Signal handler: report the signal on stdout, then exit with status 0.

    Args:
        signum: number of the signal that was delivered.
        frame: stack frame that was interrupted (unused).

    Raises:
        SystemExit: always, with exit code 0.
    """
    notice = '*** Interrupted with signal %d ***' % signum
    print(notice)
    raise SystemExit(0)
11fdf7f2
TL
82
def main():
    """Entry point: scan the crash directory periodically and post dumps.

    Installs handlers so SIGINT and SIGTERM end the process with exit
    code 0, parses the command line, waits until ``<path>/posted`` exists,
    then calls ``scrape_path`` in a loop, sleeping ``--delay`` minutes
    between scans.  A delay of 0 exits after a single scan.
    """
    # --name must replace the module-level list that post_crash() reads;
    # without this declaration the assignment below would only create a
    # local variable and the option would be silently ignored.
    global auth_names

    # exit code 0 on SIGINT, SIGTERM
    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    args = parse_args()
    postdir = os.path.join(args.path, 'posted')
    if args.name:
        auth_names = [args.name]

    # Posted dumps are moved into postdir, so wait for it rather than
    # scanning without a place to put them.
    while not os.path.isdir(postdir):
        log.error("directory %s does not exist; please create", postdir)
        time.sleep(30)

    log.info("monitoring path %s, delay %ds", args.path, args.delay * 60.0)
    while True:
        scrape_path(args.path)
        if args.delay == 0:
            sys.exit(0)
        # delay is given in minutes
        time.sleep(args.delay * 60)


if __name__ == "__main__":
    main()