]>
Commit | Line | Data |
---|---|---|
9f95a23c TL |
1 | import logging |
2 | import signal | |
3 | import time | |
4 | ||
5 | from gevent import sleep | |
6 | from gevent.greenlet import Greenlet | |
7 | from gevent.event import Event | |
8 | ||
9 | log = logging.getLogger(__name__) | |
10 | ||
class DaemonWatchdog(Greenlet):
    """
    DaemonWatchdog::

    Watch Ceph daemons for failures. If an extended failure is detected (i.e.
    not intentional), then the watchdog will unmount file systems and send
    SIGTERM to all daemons. The duration of an extended failure is configurable
    with the ``daemon_timeout`` option of the ``watchdog`` section.

    ceph:
      watchdog:
        daemon_restart [default: no]: set to "normal" to restart a daemon
                                      after a "normal" exit (status==0).

        daemon_timeout [default: 300]: number of seconds a daemon
                                       is allowed to be failed before the
                                       watchdog will bark.

    The watchdog also barks as soon as any of the supplied thrashers reports
    an exception.
    """

    # Daemon roles in the order bark() sends SIGTERM to them.
    _BARK_ROLES = ('osd', 'mds', 'mon', 'rgw', 'mgr')
    # Daemon roles in the order watch() polls them for failures.
    _WATCH_ROLES = ('osd', 'mon', 'mds', 'rgw', 'mgr')

    def __init__(self, ctx, config, thrashers):
        """
        :param ctx: test context; ``ctx.config``, ``ctx.daemons`` and
                    ``ctx.mounts`` are read by the watchdog.
        :param config: task configuration; only the ``cluster`` key is read
                       (default ``'ceph'``).
        :param thrashers: iterable of objects exposing ``exception`` and
                          ``name``; a non-None ``exception`` triggers a bark.
        """
        super(DaemonWatchdog, self).__init__()
        self.config = ctx.config.get('watchdog', {})
        self.ctx = ctx
        # Exception raised by watch(), if any; recorded by _run().
        self.e = None
        self.logger = log.getChild('daemon_watchdog')
        self.cluster = config.get('cluster', 'ceph')
        self.name = 'watchdog'
        self.stopping = Event()
        self.thrashers = thrashers

    def _run(self):
        """Greenlet entry point: run watch() and record any exception."""
        try:
            self.watch()
        except Exception as e:
            # See _run exception comment for MDSThrasher
            self.e = e
            self.logger.exception("exception:")
            # allow successful completion so gevent doesn't see an exception...

    def log(self, x):
        """Write data to logger"""
        self.logger.info(x)

    def stop(self):
        """Ask the watch loop to exit; it notices on its next iteration."""
        self.stopping.set()

    def _select_daemons(self, roles, finished):
        """
        Return the daemons of the given roles that are marked running and
        whose process ``finished`` flag has the requested truth value.

        ``finished=True`` selects failed daemons (expected to run, but the
        process has exited); ``finished=False`` selects live daemons.
        """
        selected = []
        for role in roles:
            candidates = self.ctx.daemons.iter_daemons_of_role(
                role, cluster=self.cluster)
            for daemon in candidates:
                # running() is checked first so proc is only touched for
                # daemons that are supposed to be up.
                if daemon.running() and bool(daemon.proc.finished) == finished:
                    selected.append(daemon)
        return selected

    def bark(self):
        """
        Tear the cluster down: force-unmount every mount, then send SIGTERM
        to every daemon that is still alive. Both steps are best-effort;
        ordinary errors are logged and ignored so one failure does not stop
        the rest of the teardown.
        """
        self.log("BARK! unmounting mounts and killing all daemons")
        for mount in self.ctx.mounts.values():
            try:
                mount.umount_wait(force=True)
            except Exception:
                # Not a bare except: BaseException subclasses such as
                # KeyboardInterrupt must still be able to stop us.
                self.logger.exception("ignoring exception:")

        for daemon in self._select_daemons(self._BARK_ROLES, finished=False):
            try:
                daemon.signal(signal.SIGTERM)
            except Exception:
                self.logger.exception("ignoring exception:")

    def watch(self):
        """
        Poll daemons and thrashers every 5 seconds until stop() is called.

        A daemon that stays failed for longer than ``daemon_timeout`` seconds,
        or any thrasher with a recorded exception, makes the watchdog bark()
        and return. If ``daemon_restart`` is ``'normal'``, failed daemons that
        exited with status 0 are restarted instead of being left down.
        """
        self.log("watchdog starting")
        daemon_timeout = int(self.config.get('daemon_timeout', 300))
        daemon_restart = self.config.get('daemon_restart', False)
        # Maps "role.id" -> (daemon, time the failure was first observed).
        daemon_failure_time = {}
        while not self.stopping.is_set():
            bark = False
            now = time.time()

            failed = [
                (daemon.role + '.' + daemon.id_, daemon)
                for daemon in self._select_daemons(self._WATCH_ROLES,
                                                   finished=True)
            ]

            for name, daemon in failed:
                dt = daemon_failure_time.setdefault(name, (daemon, now))
                assert dt[0] is daemon
                delta = now - dt[1]
                self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta))
                if delta > daemon_timeout:
                    bark = True
                if daemon_restart == 'normal' and daemon.proc.exitstatus == 0:
                    self.log(f"attempting to restart daemon {name}")
                    daemon.restart()

            # If a daemon is no longer failed, remove it from tracking.
            # The set is built once so each membership test is O(1).
            failed_names = {name for name, _ in failed}
            for name in list(daemon_failure_time):
                if name not in failed_names:
                    self.log("daemon {name} has been restored".format(name=name))
                    del daemon_failure_time[name]

            for thrasher in self.thrashers:
                if thrasher.exception is not None:
                    self.log("{name} failed".format(name=thrasher.name))
                    bark = True

            if bark:
                self.bark()
                return

            sleep(5)

        self.log("watchdog finished")