]>
git.proxmox.com Git - ceph.git/blob - ceph/qa/tasks/daemonwatchdog.py
5 from gevent
import sleep
6 from gevent
.greenlet
import Greenlet
7 from gevent
.event
import Event
9 log
= logging
.getLogger(__name__
)
11 class DaemonWatchdog(Greenlet
):
15 Watch Ceph daemons for failures. If an extended failure is detected (i.e.
16 not intentional), then the watchdog will unmount file systems and send
17 SIGTERM to all daemons. The duration of an extended failure is configurable
18 with watchdog_daemon_timeout.
22 daemon_restart [default: no]: restart daemon if "normal" exit (status==0).
24 daemon_timeout [default: 300]: number of seconds a daemon
25 is allowed to be failed before the
29 def __init__(self
, ctx
, config
, thrashers
):
30 super(DaemonWatchdog
, self
).__init
__()
31 self
.config
= ctx
.config
.get('watchdog', {})
34 self
.logger
= log
.getChild('daemon_watchdog')
35 self
.cluster
= config
.get('cluster', 'ceph')
36 self
.name
= 'watchdog'
37 self
.stopping
= Event()
38 self
.thrashers
= thrashers
43 except Exception as e
:
44 # See _run exception comment for MDSThrasher
46 self
.logger
.exception("exception:")
47 # allow successful completion so gevent doesn't see an exception...
50 """Write data to logger"""
57 self
.log("BARK! unmounting mounts and killing all daemons")
58 if hasattr(self
.ctx
, 'mounts'):
59 for mount
in self
.ctx
.mounts
.values():
61 mount
.umount_wait(force
=True)
63 self
.logger
.exception("ignoring exception:")
65 daemons
.extend(filter(lambda daemon
: daemon
.running() and not daemon
.proc
.finished
, self
.ctx
.daemons
.iter_daemons_of_role('osd', cluster
=self
.cluster
)))
66 daemons
.extend(filter(lambda daemon
: daemon
.running() and not daemon
.proc
.finished
, self
.ctx
.daemons
.iter_daemons_of_role('mds', cluster
=self
.cluster
)))
67 daemons
.extend(filter(lambda daemon
: daemon
.running() and not daemon
.proc
.finished
, self
.ctx
.daemons
.iter_daemons_of_role('mon', cluster
=self
.cluster
)))
68 daemons
.extend(filter(lambda daemon
: daemon
.running() and not daemon
.proc
.finished
, self
.ctx
.daemons
.iter_daemons_of_role('rgw', cluster
=self
.cluster
)))
69 daemons
.extend(filter(lambda daemon
: daemon
.running() and not daemon
.proc
.finished
, self
.ctx
.daemons
.iter_daemons_of_role('mgr', cluster
=self
.cluster
)))
71 for daemon
in daemons
:
73 daemon
.signal(signal
.SIGTERM
)
75 self
.logger
.exception("ignoring exception:")
78 self
.log("watchdog starting")
79 daemon_timeout
= int(self
.config
.get('daemon_timeout', 300))
80 daemon_restart
= self
.config
.get('daemon_restart', False)
81 daemon_failure_time
= {}
82 while not self
.stopping
.is_set():
86 osds
= self
.ctx
.daemons
.iter_daemons_of_role('osd', cluster
=self
.cluster
)
87 mons
= self
.ctx
.daemons
.iter_daemons_of_role('mon', cluster
=self
.cluster
)
88 mdss
= self
.ctx
.daemons
.iter_daemons_of_role('mds', cluster
=self
.cluster
)
89 rgws
= self
.ctx
.daemons
.iter_daemons_of_role('rgw', cluster
=self
.cluster
)
90 mgrs
= self
.ctx
.daemons
.iter_daemons_of_role('mgr', cluster
=self
.cluster
)
93 daemon_failures
.extend(filter(lambda daemon
: daemon
.running() and daemon
.proc
.finished
, osds
))
94 daemon_failures
.extend(filter(lambda daemon
: daemon
.running() and daemon
.proc
.finished
, mons
))
95 daemon_failures
.extend(filter(lambda daemon
: daemon
.running() and daemon
.proc
.finished
, mdss
))
96 daemon_failures
.extend(filter(lambda daemon
: daemon
.running() and daemon
.proc
.finished
, rgws
))
97 daemon_failures
.extend(filter(lambda daemon
: daemon
.running() and daemon
.proc
.finished
, mgrs
))
99 for daemon
in daemon_failures
:
100 name
= daemon
.role
+ '.' + daemon
.id_
101 dt
= daemon_failure_time
.setdefault(name
, (daemon
, now
))
102 assert dt
[0] is daemon
104 self
.log("daemon {name} is failed for ~{t:.0f}s".format(name
=name
, t
=delta
))
105 if delta
> daemon_timeout
:
107 if daemon_restart
== 'normal' and daemon
.proc
.exitstatus
== 0:
108 self
.log(f
"attempting to restart daemon {name}")
111 # If a daemon is no longer failed, remove it from tracking:
112 for name
in list(daemon_failure_time
.keys()):
113 if name
not in [d
.role
+ '.' + d
.id_
for d
in daemon_failures
]:
114 self
.log("daemon {name} has been restored".format(name
=name
))
115 del daemon_failure_time
[name
]
117 for thrasher
in self
.thrashers
:
118 if thrasher
.exception
is not None:
119 self
.log("{name} failed".format(name
=thrasher
.name
))
128 self
.log("watchdog finished")