# git.proxmox.com Git - ceph.git/blob - ceph/qa/tasks/daemonwatchdog.py
# import quincy beta 17.1.0
# [ceph.git] / ceph / qa / tasks / daemonwatchdog.py
import logging
import signal
import time

from gevent import sleep
from gevent.greenlet import Greenlet
from gevent.event import Event

# Module-level logger; DaemonWatchdog derives its own child logger from it.
log = logging.getLogger(__name__)

class DaemonWatchdog(Greenlet):
    """
    DaemonWatchdog::

    Watch Ceph daemons for failures. If an extended failure is detected (i.e.
    not intentional), then the watchdog will unmount file systems and send
    SIGTERM to all daemons. The duration of an extended failure is configurable
    with the ``daemon_timeout`` option of the ``watchdog`` section.

    ceph:
      watchdog:
        daemon_restart [default: no]: set to "normal" to restart a daemon
                                      after a "normal" exit (status==0).

        daemon_timeout [default: 300]: number of seconds a daemon
                                       is allowed to be failed before the
                                       watchdog will bark.
    """

    # Roles supervised by the watchdog. The two orderings are kept separate
    # on purpose: bark() signals daemons in _KILL_ORDER, while watch() reports
    # (and optionally restarts) failed daemons in _WATCH_ORDER.
    _KILL_ORDER = ('osd', 'mds', 'mon', 'rgw', 'mgr')
    _WATCH_ORDER = ('osd', 'mon', 'mds', 'rgw', 'mgr')

    def __init__(self, ctx, config, thrashers):
        """
        :param ctx: run context; ``ctx.config['watchdog']`` supplies the
                    watchdog options and ``ctx.daemons`` the daemons to watch.
        :param config: task configuration; only its ``cluster`` key is read
                       (defaults to ``'ceph'``).
        :param thrashers: iterable of objects exposing ``exception`` and
                          ``name``; a non-None ``exception`` makes the
                          watchdog bark.
        """
        super().__init__()
        self.config = ctx.config.get('watchdog', {})
        self.ctx = ctx
        # Exception that terminated watch(), if any (set by _run()).
        self.e = None
        self.logger = log.getChild('daemon_watchdog')
        self.cluster = config.get('cluster', 'ceph')
        self.name = 'watchdog'
        self.stopping = Event()
        self.thrashers = thrashers

    def _run(self):
        """Greenlet entry point: run watch() and record any exception."""
        try:
            self.watch()
        except Exception as e:
            # See _run exception comment for MDSThrasher
            self.e = e
            self.logger.exception("exception:")
            # allow successful completion so gevent doesn't see an exception...

    def log(self, x):
        """Write data to logger"""
        self.logger.info(x)

    def stop(self):
        """Ask the watch() loop to exit; it checks the flag once per cycle."""
        self.stopping.set()

    def _daemons(self, roles, finished):
        """
        Collect daemons of the given roles, in role order.

        A daemon is selected when ``daemon.running()`` is true and the
        truthiness of ``daemon.proc.finished`` equals ``finished``.

        :param roles: iterable of role names to query.
        :param finished: True to select daemons whose process has finished,
                         False to select those whose process has not.
        :returns: list of matching daemon objects.
        """
        selected = []
        for role in roles:
            candidates = self.ctx.daemons.iter_daemons_of_role(
                role, cluster=self.cluster)
            for daemon in candidates:
                # running() is tested first, exactly as the original filters
                # did, so proc is only dereferenced when running() is true.
                if daemon.running() and bool(daemon.proc.finished) == finished:
                    selected.append(daemon)
        return selected

    def bark(self):
        """
        Force-unmount every mount and send SIGTERM to every live daemon.

        This is best effort: failures of individual mounts or daemons are
        logged and ignored so the remaining ones are still handled.
        """
        self.log("BARK! unmounting mounts and killing all daemons")
        if hasattr(self.ctx, 'mounts'):
            for mount in self.ctx.mounts.values():
                try:
                    mount.umount_wait(force=True)
                # Exception rather than a bare except, so GreenletExit,
                # KeyboardInterrupt and SystemExit still propagate.
                except Exception:
                    self.logger.exception("ignoring exception:")

        # Snapshot the live daemons before signalling any of them.
        for daemon in self._daemons(self._KILL_ORDER, finished=False):
            try:
                daemon.signal(signal.SIGTERM)
            except Exception:
                self.logger.exception("ignoring exception:")

    def watch(self):
        """
        Poll the daemons every 5 seconds until stopped or until barking.

        Each cycle records when a daemon was first seen failed, optionally
        restarts daemons that exited with status 0 (``daemon_restart:
        normal``), forgets daemons that are no longer failed, and calls
        bark() and returns if any daemon has been failed for longer than
        ``daemon_timeout`` seconds or any thrasher reports an exception.
        """
        self.log("watchdog starting")
        daemon_timeout = int(self.config.get('daemon_timeout', 300))
        daemon_restart = self.config.get('daemon_restart', False)
        # name -> (daemon, time the failure was first observed)
        daemon_failure_time = {}
        while not self.stopping.is_set():
            bark = False
            now = time.time()

            # Names of daemons failed in this cycle, gathered once so the
            # "restored" check below is a set lookup instead of rebuilding
            # a list for every tracked daemon.
            failed_names = set()
            for daemon in self._daemons(self._WATCH_ORDER, finished=True):
                name = f"{daemon.role}.{daemon.id_}"
                failed_names.add(name)
                dt = daemon_failure_time.setdefault(name, (daemon, now))
                # Invariant: a tracked name always maps to the same daemon
                # object. A violation surfaces through _run() as self.e.
                assert dt[0] is daemon
                delta = now - dt[1]
                self.log(f"daemon {name} is failed for ~{delta:.0f}s")
                if delta > daemon_timeout:
                    bark = True
                if daemon_restart == 'normal' and daemon.proc.exitstatus == 0:
                    self.log(f"attempting to restart daemon {name}")
                    daemon.restart()

            # If a daemon is no longer failed, remove it from tracking:
            for name in list(daemon_failure_time):
                if name not in failed_names:
                    self.log(f"daemon {name} has been restored")
                    del daemon_failure_time[name]

            for thrasher in self.thrashers:
                if thrasher.exception is not None:
                    self.log(f"{thrasher.name} failed")
                    bark = True

            if bark:
                self.bark()
                return

            sleep(5)

        self.log("watchdog finished")