git.proxmox.com Git - ceph.git/blob - ceph/src/ceph-crash.in
bump version to 18.2.2-pve1
[ceph.git] / ceph / src / ceph-crash.in
1 #!@Python3_EXECUTABLE@
2 # -*- mode:python -*-
3 # vim: ts=4 sw=4 smarttab expandtab
4
5 import argparse
6 import grp
7 import logging
8 import os
9 import pwd
10 import signal
11 import socket
12 import subprocess
13 import sys
14 import time
15
# Module-wide logger; INFO by default, main() may raise it to DEBUG.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger('ceph-crash')

# Ceph entity names tried in order when posting a crash report: the
# host-specific crash client first, then the generic one, then admin.
# main() replaces this list when --name is given.
auth_names = [
    'client.crash.%s' % socket.gethostname(),
    'client.crash',
    'client.admin',
]
22
23
def parse_args():
    """Parse the command line and return the resulting namespace.

    Recognised options (attribute name on the returned namespace):
      -p/--path       (path)       base directory holding crash dumps
      -d/--delay      (delay)      minutes between scans, as a float
      --name/-n       (name)       single ceph entity name to use
      --log-level/-l  (log_level)  requested log level
    """
    option_table = (
        (('-p', '--path'), {
            'default': '/var/lib/ceph/crash',
            'help': 'base path to monitor for crash dumps',
        }),
        (('-d', '--delay'), {
            'default': 10.0,
            'type': float,
            'help': 'minutes to delay between scans (0 to exit after one)',
        }),
        (('--name', '-n'), {
            'help': 'ceph name to authenticate as '
                    '(default: try client.crash, client.admin)',
        }),
        (('--log-level', '-l'), {
            'help': 'log level output (default: INFO), support INFO or DEBUG',
        }),
    )

    cli = argparse.ArgumentParser()
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()
42
43
def post_crash(path):
    """Post the crash report stored in *path* to the cluster.

    The report is the ``meta`` file inside *path*.  It is fed on stdin to
    ``ceph -n <name> crash post -i -`` once per entry of the module-level
    ``auth_names`` list, stopping at the first name for which the command
    exits with status 0.  Each attempt is wrapped in ``timeout 30``.

    :param path: directory of a single crash dump
    :returns: exit status of the last attempt (0 means the report was posted)
    :raises OSError: if the ``meta`` file cannot be opened or read
    """
    # Read the report once, up front: a missing/unreadable meta file then
    # fails before any child process has been spawned, and the file handle
    # is always closed.
    with open(os.path.join(path, 'meta'), 'rb') as f:
        meta = f.read()

    rc = 0
    for n in auth_names:
        pr = subprocess.run(
            args=['timeout', '30', 'ceph',
                  '-n', n,
                  'crash', 'post', '-i', '-'],
            input=meta,
            stderr=subprocess.PIPE,
        )
        # The child's output is only used for logging, so never let an
        # undecodable byte abort the post.
        stderr = pr.stderr.decode(errors='replace')
        rc = pr.returncode
        if rc != 0 or stderr != "":
            log.warning('post %s as %s failed: %s', path, n, stderr)
        if rc == 0:
            break
    return rc
64
65
def scrape_path(path):
    """Scan *path* once and post every completed crash dump found there.

    Each entry of *path* is treated as a crash directory.  A directory is
    posted when it contains a ``meta`` file and a ``done`` marker; after a
    successful post it is moved to ``<path>/posted/<entry>``.  Unreadable
    entries, entries without ``meta`` and dumps that are still incomplete
    are skipped so that they cannot hold up the remaining entries.

    :param path: base directory containing the crash directories
    """
    for p in os.listdir(path):
        crashpath = os.path.join(path, p)
        if not os.access(crashpath, os.R_OK):
            log.warning('unable to read crash path %s' % (crashpath))
            continue

        metapath = os.path.join(crashpath, 'meta')
        donepath = os.path.join(crashpath, 'done')
        if not os.path.isfile(metapath):
            continue

        if not os.path.isfile(donepath):
            # hang out just for a bit; either we interrupted the dump
            # or the daemon crashed before finishing it
            time.sleep(1)
            if not os.path.isfile(donepath):
                # Skip only this dump.  Bailing out of the whole scan here
                # would let one never-finished dump block every crash that
                # happens to be listed after it, on every scan.
                log.debug('crash dump %s is incomplete; skipping', crashpath)
                continue

        # ok, we can process this one
        rc = post_crash(crashpath)
        if rc == 0:
            os.rename(crashpath, os.path.join(path, 'posted/', p))
            log.debug(
                "posted %s and renamed %s -> %s " %
                (metapath, p, os.path.join('posted/', p))
            )
89
90
def handler(signum, frame):
    """Signal handler: report the received signal and exit with status 0.

    :param signum: number of the signal being handled
    :param frame: interrupted stack frame (unused)
    """
    notice = '*** Interrupted with signal %d ***' % signum
    print(notice)
    raise SystemExit(0)
94
95
def drop_privs():
    """Switch from root to the ``ceph`` user and group.

    Does nothing unless the process runs with uid 0.  If looking up the
    ``ceph`` account/group or changing credentials fails, the error is
    logged and the process exits with status 1.
    """
    if os.getuid() != 0:
        return

    try:
        target_uid = pwd.getpwnam("ceph").pw_uid
        target_gid = grp.getgrnam("ceph").gr_gid
        # Group changes come first; the uid is switched last.
        os.setgroups([])
        os.setgid(target_gid)
        os.setuid(target_uid)
    except Exception as e:
        log.error(f"Unable to drop privileges: {e}")
        sys.exit(1)
107
108
def main():
    """Entry point: drop privileges, then scan for crash dumps in a loop.

    Steps, in order: drop root privileges, install SIGINT/SIGTERM handlers
    that exit with status 0, parse the command line, wait until the
    ``posted`` directory exists, run ``ceph -s`` once, then repeatedly call
    scrape_path() with ``--delay`` minutes between scans.  A delay of 0
    exits with status 0 after the first scan.
    """
    global auth_names

    # run as unprivileged ceph user
    drop_privs()

    # exit code 0 on SIGINT, SIGTERM
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, handler)

    args = parse_args()
    if args.log_level == 'DEBUG':
        log.setLevel(logging.DEBUG)

    # An explicit --name replaces the default list of names to try.
    if args.name:
        auth_names = [args.name]

    # Successfully posted crashes are moved here; wait for it to exist.
    posted_dir = os.path.join(args.path, 'posted')
    while not os.path.isdir(posted_dir):
        log.error("directory %s does not exist; please create" % posted_dir)
        time.sleep(30)

    log.info("pinging cluster to exercise our key")
    subprocess.Popen(args=['timeout', '30', 'ceph', '-s']).wait()

    delay_seconds = args.delay * 60.0
    single_pass = args.delay == 0
    log.info("monitoring path %s, delay %ds" % (args.path, delay_seconds))
    while True:
        try:
            scrape_path(args.path)
        except Exception as e:
            # A failed scan is logged; the next iteration retries.
            log.error(f"Error scraping {args.path}: {e}")
        if single_pass:
            sys.exit(0)
        time.sleep(delay_seconds)
144
145
# Run main() only when executed as a script, not on import.
if __name__ == "__main__":
    main()