]> git.proxmox.com Git - ceph.git/blob - ceph/qa/tasks/daemonwatchdog.py
f72ccd7cef3883edf4277426262a914068af348e
[ceph.git] / ceph / qa / tasks / daemonwatchdog.py
1 import logging
2 import signal
3 import time
4
5 from gevent import sleep
6 from gevent.greenlet import Greenlet
7 from gevent.event import Event
8
# Module-level logger; DaemonWatchdog derives its own child logger
# ('daemon_watchdog') from this one.
log = logging.getLogger(__name__)
10
class DaemonWatchdog(Greenlet):
    """
    DaemonWatchdog::

    Watch Ceph daemons for failures. If an extended failure is detected (i.e.
    not intentional), then the watchdog will unmount file systems and send
    SIGTERM to all daemons. The duration of an extended failure is configurable
    with the ``daemon_timeout`` option of the ``watchdog`` section.

    ceph:
      watchdog:
        daemon_restart [default: no]: restart daemon if "normal" exit (status==0).
                                      Only the value "normal" enables this.

        daemon_timeout [default: 300]: number of seconds a daemon
                                       is allowed to be failed before the
                                       watchdog will bark.
    """

    # Roles terminated by bark(), in the order their daemons are signalled.
    _BARK_ROLES = ('osd', 'mds', 'mon', 'rgw', 'mgr')
    # Roles polled by watch(), in the order their failures are reported.
    _WATCH_ROLES = ('osd', 'mon', 'mds', 'rgw', 'mgr')

    def __init__(self, ctx, config, thrashers):
        """
        Set up the watchdog; it does nothing until the greenlet is started.

        :param ctx: test context. Its ``config`` mapping supplies the optional
                    ``watchdog`` section; ``mounts`` and ``daemons`` are read
                    later by bark() and watch().
        :param config: task configuration mapping; only the optional
                       ``cluster`` key (default ``'ceph'``) is read here.
        :param thrashers: iterable of objects exposing ``exception`` and
                          ``name``; a thrasher whose ``exception`` is not None
                          makes the watchdog bark.
        """
        super(DaemonWatchdog, self).__init__()
        self.config = ctx.config.get('watchdog', {})
        self.ctx = ctx
        # Exception that terminated watch(), if any (recorded by _run()).
        self.e = None
        self.logger = log.getChild('daemon_watchdog')
        self.cluster = config.get('cluster', 'ceph')
        self.name = 'watchdog'
        # Set by stop() to ask the polling loop in watch() to exit.
        self.stopping = Event()
        self.thrashers = thrashers

    def _run(self):
        """Greenlet body: run watch(), recording instead of raising any error."""
        try:
            self.watch()
        except Exception as e:
            # See _run exception comment for MDSThrasher
            self.e = e
            self.logger.exception("exception:")
            # allow successful completion so gevent doesn't see an exception...

    def log(self, x):
        """Write data to logger"""
        self.logger.info(x)

    def stop(self):
        """Request that watch() exit; it notices on its next loop check."""
        self.stopping.set()

    def _iter_daemons(self, roles):
        """Yield this cluster's daemons of each role in ``roles``, in order."""
        for role in roles:
            yield from self.ctx.daemons.iter_daemons_of_role(
                role, cluster=self.cluster)

    def bark(self):
        """
        Force-unmount every mount and send SIGTERM to every live daemon.

        Both phases are best effort: a failure on one mount or daemon is
        logged and the remaining ones are still processed.
        """
        self.log("BARK! unmounting mounts and killing all daemons")
        for mount in self.ctx.mounts.values():
            try:
                mount.umount_wait(force=True)
            except Exception:
                # Catch Exception only, not a bare except, so that
                # KeyboardInterrupt/SystemExit/GreenletExit still propagate.
                self.logger.exception("ignoring exception:")

        # Snapshot the daemons still alive before signalling any of them.
        daemons = [
            daemon for daemon in self._iter_daemons(self._BARK_ROLES)
            if daemon.running() and not daemon.proc.finished
        ]

        for daemon in daemons:
            try:
                daemon.signal(signal.SIGTERM)
            except Exception:
                self.logger.exception("ignoring exception:")

    def watch(self):
        """
        Poll daemons and thrashers every 5 seconds until stopped or barking.

        A daemon counts as failed when it is marked running but its process
        has finished. If any daemon stays failed longer than ``daemon_timeout``
        seconds, or any thrasher reports an exception, bark() is called and
        the method returns. When ``daemon_restart`` is ``'normal'``, a failed
        daemon whose exit status is 0 is restarted.
        """
        self.log("watchdog starting")
        daemon_timeout = int(self.config.get('daemon_timeout', 300))
        daemon_restart = self.config.get('daemon_restart', False)
        # Maps "role.id" -> (daemon, time the failure was first observed).
        daemon_failure_time = {}
        while not self.stopping.is_set():
            bark = False
            now = time.time()

            # Materialize the failures first: restart() below changes state.
            daemon_failures = [
                daemon for daemon in self._iter_daemons(self._WATCH_ROLES)
                if daemon.running() and daemon.proc.finished
            ]

            failed_names = set()
            for daemon in daemon_failures:
                name = daemon.role + '.' + daemon.id_
                failed_names.add(name)
                dt = daemon_failure_time.setdefault(name, (daemon, now))
                assert dt[0] is daemon
                delta = now - dt[1]
                self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta))
                if delta > daemon_timeout:
                    bark = True
                if daemon_restart == 'normal' and daemon.proc.exitstatus == 0:
                    self.log(f"attempting to restart daemon {name}")
                    daemon.restart()

            # If a daemon is no longer failed, remove it from tracking:
            for name in list(daemon_failure_time):
                if name not in failed_names:
                    self.log("daemon {name} has been restored".format(name=name))
                    del daemon_failure_time[name]

            for thrasher in self.thrashers:
                if thrasher.exception is not None:
                    self.log("{name} failed".format(name=thrasher.name))
                    bark = True

            if bark:
                self.bark()
                return

            sleep(5)

        self.log("watchdog finished")