# Source: git.proxmox.com Git - ceph.git/blame - ceph/qa/tasks/daemonwatchdog.py
# Revision: update source to Ceph Pacific 16.2.2
# Path: [ceph.git] / ceph / qa / tasks / daemonwatchdog.py
# Blame view columns: Commit, Line, Data (first annotated commit 9f95a23c, author TL)
import logging
import signal
import time

from gevent import sleep
from gevent.greenlet import Greenlet
from gevent.event import Event

log = logging.getLogger(__name__)


class DaemonWatchdog(Greenlet):
    """
    DaemonWatchdog::

    Watch Ceph daemons for failures. If an extended failure is detected (i.e.
    not intentional), then the watchdog will unmount file systems and send
    SIGTERM to all daemons. The duration of an extended failure is configurable
    with the ``daemon_timeout`` option below.

    Options are read from ``ctx.config['watchdog']``:

    ceph:
      watchdog:
        daemon_restart [default: no]: set to ``normal`` to restart a daemon
                                      after a "normal" exit (status==0).

        daemon_timeout [default: 300]: number of seconds a daemon
                                       is allowed to be failed before the
                                       watchdog will bark.

    A daemon counts as "failed" when ``daemon.running()`` is true while
    ``daemon.proc.finished`` is also true.
    """

    # Role orders are kept exactly as the original per-role statements listed
    # them, so SIGTERM delivery order and failure-log order do not change.
    _BARK_ROLES = ('osd', 'mds', 'mon', 'rgw', 'mgr')
    _WATCH_ROLES = ('osd', 'mon', 'mds', 'rgw', 'mgr')

    def __init__(self, ctx, config, thrashers):
        """
        :param ctx: test context; this class reads ``ctx.config``,
                    ``ctx.daemons`` and ``ctx.mounts`` from it.
        :param config: task configuration mapping; only ``cluster`` is read
                       (default ``'ceph'``).
        :param thrashers: iterable of objects exposing ``exception`` and
                          ``name``; a non-None ``exception`` triggers a bark.
        """
        super(DaemonWatchdog, self).__init__()
        # `or {}` also covers a `watchdog:` key that is present but empty
        # (None), which would otherwise break the .get() calls in watch().
        self.config = ctx.config.get('watchdog') or {}
        self.ctx = ctx
        self.e = None
        self.logger = log.getChild('daemon_watchdog')
        self.cluster = config.get('cluster', 'ceph')
        self.name = 'watchdog'
        self.stopping = Event()
        self.thrashers = thrashers

    def _run(self):
        """Greenlet body: run watch() and record, rather than raise, any error."""
        try:
            self.watch()
        except Exception as e:
            # See _run exception comment for MDSThrasher
            self.e = e
            self.logger.exception("exception:")
            # allow successful completion so gevent doesn't see an exception...

    def log(self, x):
        """Write data to logger"""
        self.logger.info(x)

    def stop(self):
        """Ask watch() to exit; it notices at the top of its next poll."""
        self.stopping.set()

    def _iter_daemons(self, roles):
        """Yield this cluster's daemons for each role in *roles*, in order."""
        for role in roles:
            yield from self.ctx.daemons.iter_daemons_of_role(role, cluster=self.cluster)

    def bark(self):
        """
        Force-unmount every mount in ``ctx.mounts`` and send SIGTERM to every
        daemon that is running and whose process has not finished.

        Both steps are best-effort: a failure on one mount or daemon is logged
        and the remaining ones are still processed.
        """
        self.log("BARK! unmounting mounts and killing all daemons")
        for mount in self.ctx.mounts.values():
            try:
                mount.umount_wait(force=True)
            except BaseException:
                # Same reach as the original bare `except:`; kept this broad
                # on purpose so one bad mount cannot abort the cleanup.
                self.logger.exception("ignoring exception:")

        # Snapshot the targets before signalling any of them.
        daemons = [
            daemon for daemon in self._iter_daemons(self._BARK_ROLES)
            if daemon.running() and not daemon.proc.finished
        ]

        for daemon in daemons:
            try:
                daemon.signal(signal.SIGTERM)
            except BaseException:
                # Same reach as the original bare `except:` (best-effort).
                self.logger.exception("ignoring exception:")

    def watch(self):
        """
        Poll daemons and thrashers every 5 seconds until stop() is called or
        a bark happens.

        Barks (and returns) when a daemon has been failed for longer than
        ``daemon_timeout`` seconds, or when any thrasher reports an exception.
        """
        self.log("watchdog starting")
        daemon_timeout = int(self.config.get('daemon_timeout', 300))
        daemon_restart = self.config.get('daemon_restart', False)
        # name -> (daemon object, time the failure was first observed)
        daemon_failure_time = {}
        while not self.stopping.is_set():
            bark = False
            now = time.time()

            daemon_failures = [
                daemon for daemon in self._iter_daemons(self._WATCH_ROLES)
                if daemon.running() and daemon.proc.finished
            ]

            # Collected once per poll so the "restored" check below is a set
            # lookup instead of rebuilding a name list for every tracked entry.
            failed_names = set()
            for daemon in daemon_failures:
                name = daemon.role + '.' + daemon.id_
                failed_names.add(name)
                dt = daemon_failure_time.setdefault(name, (daemon, now))
                assert dt[0] is daemon
                delta = now-dt[1]
                self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta))
                if delta > daemon_timeout:
                    bark = True
                if daemon_restart == 'normal' and daemon.proc.exitstatus == 0:
                    self.log(f"attempting to restart daemon {name}")
                    daemon.restart()

            # If a daemon is no longer failed, remove it from tracking:
            for name in list(daemon_failure_time):
                if name not in failed_names:
                    self.log("daemon {name} has been restored".format(name=name))
                    del daemon_failure_time[name]

            for thrasher in self.thrashers:
                if thrasher.exception is not None:
                    self.log("{name} failed".format(name=thrasher.name))
                    bark = True

            if bark:
                self.bark()
                return

            sleep(5)

        self.log("watchdog finished")