]> git.proxmox.com Git - ceph.git/blob - ceph/qa/tasks/rbd_mirror_thrash.py
import 15.2.0 Octopus source
[ceph.git] / ceph / qa / tasks / rbd_mirror_thrash.py
1 """
2 Task for thrashing rbd-mirror daemons
3 """
4
5 import contextlib
6 import logging
7 import random
8 import signal
9 import socket
10 import time
11
12 from gevent import sleep
13 from gevent.greenlet import Greenlet
14 from gevent.event import Event
15
16 from teuthology.exceptions import CommandFailedError
17 from teuthology.orchestra import run
18 from tasks.thrasher import Thrasher
19
20 log = logging.getLogger(__name__)
21
22
class RBDMirrorThrasher(Thrasher, Greenlet):
    """
    RBDMirrorThrasher::

    The RBDMirrorThrasher thrashes rbd-mirror daemons during execution of other
    tasks (workunits, etc).

    The config is optional.  Many of the config parameters are a maximum value
    to use when selecting a random value from a range.  The config is a dict
    containing some or all of:

    cluster: [default: ceph] cluster to thrash

    max_thrash: [default: 1] the maximum number of active rbd-mirror daemons per
      cluster will be thrashed at any given time.

    min_thrash_delay: [default: 60] minimum number of seconds to delay before
      thrashing again.

    max_thrash_delay: [default: 120] maximum number of seconds to delay before
      thrashing again.

    max_revive_delay: [default: 10] maximum number of seconds to delay before
      bringing back a thrashed rbd-mirror daemon.

    randomize: [default: true] enables randomization and use the max/min values.
      When disabled, the max delay values are used as-is.

    seed: [no default] seed the random number generator

    Examples::

      The following example disables randomization, and uses the max delay
      values:

      tasks:
      - ceph:
      - rbd_mirror_thrash:
          randomize: False
          max_thrash_delay: 10
    """

    def __init__(self, ctx, config, cluster, daemons):
        """
        Store the task context and read the thrash tunables from ``config``.

        :param ctx: task context object (stored, not otherwise used here)
        :param config: dict of options; see the class docstring for keys
        :param cluster: name of the cluster whose daemons are thrashed
        :param daemons: sequence of rbd-mirror daemon handles to thrash; must
                        be non-empty, since do_thrash() divides by its length
        """
        super(RBDMirrorThrasher, self).__init__()

        self.ctx = ctx
        self.config = config
        self.cluster = cluster
        self.daemons = daemons

        self.logger = log
        self.name = 'thrasher.rbd_mirror.[{cluster}]'.format(cluster=cluster)
        # Set by stop(); polled and waited on by do_thrash().
        self.stopping = Event()

        self.randomize = bool(self.config.get('randomize', True))
        self.max_thrash = int(self.config.get('max_thrash', 1))
        self.min_thrash_delay = float(self.config.get('min_thrash_delay', 60.0))
        self.max_thrash_delay = float(self.config.get('max_thrash_delay', 120.0))
        self.max_revive_delay = float(self.config.get('max_revive_delay', 10.0))

    def _run(self):
        """
        Run do_thrash(), recording (rather than propagating) any exception.
        """
        try:
            self.do_thrash()
        except Exception as e:
            # See _run exception comment for MDSThrasher
            self.set_thrasher_exception(e)
            self.logger.exception("exception:")
            # Allow successful completion so gevent doesn't see an exception.
            # The DaemonWatchdog will observe the error and tear down the test.

    def log(self, x):
        """Write data to logger assigned to this RBDMirrorThrasher"""
        self.logger.info(x)

    def stop(self):
        """Ask the thrash loop to exit by setting the ``stopping`` event."""
        self.stopping.set()

    def _pick_delay(self, low, high):
        """
        Return the delay, in seconds, to use for the next wait.

        With randomization enabled the value is drawn uniformly from the
        range bounded by ``low`` and ``high``; otherwise ``high`` is returned
        unchanged.

        random.uniform() is used instead of random.randrange() because the
        configured bounds are floats: randrange() is an integer API that
        rejects non-integral (and, on newer Pythons, all) float arguments and
        raises ValueError on an empty range such as low == high.
        """
        if not self.randomize:
            return high
        return random.uniform(low, high)

    def do_thrash(self):
        """
        Perform the random thrashing action

        Loops until stop() is called.  Each iteration waits for a thrash
        delay, sends SIGTERM to up to ``max_thrash`` randomly selected
        daemons, waits for a revive delay, waits for the signalled daemons to
        exit, and then starts them again.  Kill statistics are logged once the
        loop ends.
        """

        self.log('starting thrash for cluster {cluster}'.format(cluster=self.cluster))
        stats = {
            "kill": 0,
        }

        while not self.stopping.is_set():
            delay = self._pick_delay(self.min_thrash_delay,
                                     self.max_thrash_delay)

            if delay > 0.0:
                self.log('waiting for {delay} secs before thrashing'.format(delay=delay))
                # Waiting on the event (not sleeping) lets stop() cut the
                # delay short.
                self.stopping.wait(delay)
                if self.stopping.is_set():
                    continue

            killed_daemons = []

            # Each daemon is selected with probability 1/len(daemons).
            weight = 1.0 / len(self.daemons)
            count = 0
            for daemon in self.daemons:
                skip = random.uniform(0.0, 1.0)
                if weight <= skip:
                    self.log('skipping daemon {label} with skip ({skip}) > weight ({weight})'.format(
                        label=daemon.id_, skip=skip, weight=weight))
                    continue

                self.log('kill {label}'.format(label=daemon.id_))
                try:
                    daemon.signal(signal.SIGTERM)
                except socket.error:
                    # Best effort: the daemon is still treated as killed and
                    # goes through the wait/reset/start sequence below.
                    pass
                killed_daemons.append(daemon)
                stats['kill'] += 1

                # if we've reached max_thrash, we're done
                count += 1
                if count >= self.max_thrash:
                    break

            if killed_daemons:
                # wait for a while before restarting
                delay = self._pick_delay(0.0, self.max_revive_delay)

                self.log('waiting for {delay} secs before reviving daemons'.format(delay=delay))
                sleep(delay)

                for daemon in killed_daemons:
                    self.log('waiting for {label}'.format(label=daemon.id_))
                    try:
                        run.wait([daemon.proc], timeout=600)
                    except CommandFailedError:
                        # A failed exit status is ignored for a daemon we
                        # terminated on purpose.
                        pass
                    except BaseException:
                        # Anything else means the daemon could not be stopped
                        # cleanly; always re-raised after the abort attempt.
                        self.log('Failed to stop {label}'.format(label=daemon.id_))

                        try:
                            # try to capture a core dump
                            daemon.signal(signal.SIGABRT)
                        except socket.error:
                            pass
                        raise
                    finally:
                        daemon.reset()

                for daemon in killed_daemons:
                    self.log('reviving {label}'.format(label=daemon.id_))
                    daemon.start()

        for stat in stats:
            self.log("stat['{key}'] = {value}".format(key=stat, value=stats[stat]))
177
@contextlib.contextmanager
def task(ctx, config):
    """
    Stress test the rbd-mirror by thrashing while another task/workunit
    is running.

    Please refer to RBDMirrorThrasher class for further information on the
    available options.

    :param ctx: task context; must provide ``ctx.daemons`` and
                ``ctx.ceph[cluster].thrashers``
    :param config: dict of options, or None for all defaults
    :raises AssertionError: if ``config`` is not a dict or the cluster has no
                            rbd-mirror daemons
    :raises RuntimeError: on exit, if the thrasher recorded an exception
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'rbd_mirror_thrash task only accepts a dict for configuration'

    cluster = config.get('cluster', 'ceph')
    daemons = list(ctx.daemons.iter_daemons_of_role('rbd-mirror', cluster))
    assert len(daemons) > 0, \
        'rbd_mirror_thrash task requires at least 1 rbd-mirror daemon'

    # choose random seed
    if 'seed' in config:
        seed = int(config['seed'])
    else:
        seed = int(time.time())
    # Logged so a run can be repeated with the same seed.
    log.info('rbd_mirror_thrash using random seed: {seed}'.format(seed=seed))
    random.seed(seed)

    thrasher = RBDMirrorThrasher(ctx, config, cluster, daemons)
    thrasher.start()
    ctx.ceph[cluster].thrashers.append(thrasher)

    try:
        log.debug('Yielding')
        yield
    finally:
        log.info('joining rbd_mirror_thrash')
        thrasher.stop()
        # Join before inspecting the exception: the thrasher may still be in
        # the middle of an iteration when stop() is called, and a failure
        # there would otherwise go unnoticed.  This also guarantees the
        # thrasher is always joined, even when it has failed.
        thrasher.join()
        log.info('done joining')
        if thrasher.exception is not None:
            raise RuntimeError('error during thrashing')