# Source: ceph.git -- ceph/qa/tasks/mon_thrash.py (monitor thrasher task),
# recovered from a git web view; capture artifacts are annotated below.
import contextlib
import json
import logging
import math
import random
import time

from teuthology import misc as teuthology
from tasks import ceph_manager
from tasks.cephfs.filesystem import MDSCluster
from tasks.thrasher import Thrasher

log = logging.getLogger(__name__)
def _get_mons(ctx):
    """
    Get monitor names from the context value, with the leading
    'mon.' prefix stripped off.

    :param ctx: teuthology run context
    :returns: list of monitor ids
    """
    prefix = 'mon.'
    return [name[len(prefix):] for name in teuthology.get_mon_names(ctx)]
class MonitorThrasher(Thrasher):
    """
    Monitor thrasher: repeatedly kill (or freeze) monitors and revive them
    while a workload runs.

    How it works (per iteration)::

    - wait for quorum to be formed
    - sleep for 'revive_delay' seconds
    - wait for quorum to be formed
    - sleep for 'thrash_delay' seconds

    Options::

    seed            Seed to use on the RNG to reproduce a previous
                    behaviour (default: None; i.e., not set)
    revive_delay    Number of seconds to wait before reviving
                    the monitor (default: 10)
    thrash_delay    Number of seconds to wait in-between
                    test iterations (default: 0)
    store_thrash    Thrash monitor store before killing the monitor
                    being thrashed (default: False)
    store_thrash_probability  Probability of thrashing a monitor's store
                    (default: 50, per __init__)
    thrash_many     Thrash multiple monitors instead of just one. If
                    'maintain_quorum' is set to False, then we will
                    thrash up to as many monitors as there are
                    available. (default: False)
    maintain_quorum Always maintain quorum, taking care on how many
                    monitors we kill during the thrashing. If we
                    happen to only have one or two monitors configured,
                    if this option is set to True, then we won't run
                    this task as we cannot guarantee maintenance of
                    quorum. Setting it to false however would allow the
                    task to run with as many as just one single monitor.
                    (default: True, per __init__)
    freeze_mon_probability: how often to freeze the mon instead of killing
                    it (default: 10, per __init__)
    freeze_mon_duration: how many seconds to freeze the mon (default: 15)
    scrub           Scrub after each iteration (default: True)
    check_mds_failover  Check if mds failover happened (default: False)

    Note: if 'store_thrash' is set to True, then 'maintain_quorum' must also
    be set to True (enforced by an assert in __init__).

    Surviving example-config fragment from the original docstring::

        store_thrash_probability: 40
        check_mds_failover: True
    """
def __init__(self, ctx, manager, config, name, logger):
    """
    Read the task config, seed the RNG for reproducibility, validate the
    option combination, and spawn the background thrashing greenlet.

    :param ctx: teuthology run context
    :param manager: ceph manager used to kill/revive/signal monitors
    :param config: task configuration dict (may be None)
    :param name: thrasher name (forwarded to the Thrasher base class)
    :param logger: logger for this thrasher

    NOTE(review): several lines of this method were lost in the mangled
    capture; each gap is marked below.  No surviving code token was altered.
    """
    super(MonitorThrasher, self).__init__()

    # NOTE(review): lines missing here -- presumably the assignments of
    # self.ctx, self.logger, self.config, self.name and the stop flag
    # (self.stopping) that later methods read.  TODO confirm upstream.
    self.manager = manager
    # start from a healthy cluster
    self.manager.wait_for_clean()

    if self.config is None:
    # NOTE(review): the body of the guard above (defaulting the config)
    # is missing from this view.

    """ Test reproducibility """
    # use the configured seed, else fall back to the current time
    self.random_seed = self.config.get('seed', None)
    if self.random_seed is None:
        self.random_seed = int(time.time())

    self.rng = random.Random()
    self.rng.seed(int(self.random_seed))

    """ Monitor thrashing """
    self.revive_delay = float(self.config.get('revive_delay', 10.0))
    self.thrash_delay = float(self.config.get('thrash_delay', 0.0))

    self.thrash_many = self.config.get('thrash_many', False)
    self.maintain_quorum = self.config.get('maintain_quorum', True)

    self.scrub = self.config.get('scrub', True)

    self.freeze_mon_probability = float(self.config.get('freeze_mon_probability', 10))
    self.freeze_mon_duration = float(self.config.get('freeze_mon_duration', 15.0))

    # refuse configurations where no monitor may be killed at all
    assert self.max_killable() > 0, \
        'Unable to kill at least one monitor with the current config.'

    """ Store thrashing """
    self.store_thrash = self.config.get('store_thrash', False)
    self.store_thrash_probability = int(
        self.config.get('store_thrash_probability', 50))
    if self.store_thrash:
        assert self.store_thrash_probability > 0, \
            'store_thrash is set, probability must be > 0'
        # store sync only makes sense while a quorum survives
        assert self.maintain_quorum, \
            'store_thrash = true must imply maintain_quorum = true'

    self.mds_failover = self.config.get('check_mds_failover', False)
    if self.mds_failover:
        self.mds_cluster = MDSCluster(ctx)

    # run the thrashing loop in a background greenlet
    self.thread = gevent.spawn(self.do_thrash)
# NOTE(review): two methods were lost in the mangled capture; only their
# docstring fragments survive here:
#   a logging helper -- "locally log info messages"
#   a stop/join helper -- "Break out of this processes thrashing loop."
def should_thrash_store(self):
    """
    If allowed, indicate that we should thrash a certain percentage of
    the time as determined by the store_thrash_probability value.

    :returns: True when the monitor's store should be thrashed this
              iteration
    """
    if not self.store_thrash:
        # Store thrashing disabled.  The body of this guard was dropped
        # in the mangled capture; 'return False' is the only behaviour
        # consistent with the "If allowed" contract above.
        return False
    # randrange(0, 101) yields 0..100 inclusive of 100's predecessor,
    # so probability >= 101 always fires and 0 never does.
    return self.rng.randrange(0, 101) < self.store_thrash_probability
def thrash_store(self, mon):
    """
    Thrash the monitor specified: force a store sync on it and check the
    JSON status reply of the command.

    :param mon: monitor to thrash
    :raises AssertionError: if the sync_force command reports a non-zero
                            'ret' in its reply
    """
    self.log('thrashing mon.{id} store'.format(id=mon))
    out = self.manager.raw_cluster_cmd(
        'tell', 'mon.%s' % mon, 'sync_force',
        '--yes-i-really-mean-it')
    # The assignment of 'j' was lost in the mangled capture: 'out' is the
    # command's JSON reply, asserted on below -- restored here.
    j = json.loads(out)
    assert j['ret'] == 0, \
        'error forcing store sync on mon.{id}:\n{ret}'.format(
            id=mon, ret=out)
def should_freeze_mon(self):
    """
    Indicate that we should freeze a certain percentage of the time,
    as determined by the freeze_mon_probability value.
    """
    roll = self.rng.randrange(0, 101)
    return roll < self.freeze_mon_probability
def freeze_mon(self, mon):
    """
    Suspend the specified monitor by delivering the STOP signal to it.

    :param mon: id of the monitor to freeze
    """
    sigstop = 19  # SIGSTOP on Linux
    log.info('Sending STOP to mon %s', mon)
    self.manager.signal_mon(mon, sigstop)
def unfreeze_mon(self, mon):
    """
    Resume the specified monitor by delivering the CONT signal to it.

    :param mon: id of the monitor to unfreeze
    """
    sigcont = 18  # SIGCONT on Linux
    log.info('Sending CONT to mon %s', mon)
    self.manager.signal_mon(mon, sigcont)
def kill_mon(self, mon):
    """
    Kill the monitor specified via the ceph manager.

    :param mon: id of the monitor to kill
    """
    msg = 'killing mon.{id}'.format(id=mon)
    self.log(msg)
    self.manager.kill_mon(mon)
def revive_mon(self, mon):
    """
    Revive the monitor specified.

    :param mon: id of the monitor to revive
    """
    # The original logged 'killing mon.{id}' here as well -- an obvious
    # copy/paste slip from kill_mon(); only the correct message is kept.
    self.log('reviving mon.{id}'.format(id=mon))
    self.manager.revive_mon(mon)
def max_killable(self):
    """
    Return the maximum number of monitors we can kill.

    :returns: number of killable monitors, honouring 'maintain_quorum'
    """
    m = len(_get_mons(self.ctx))
    if self.maintain_quorum:
        # keep a majority alive: with m mons we may kill at most
        # ceil(m/2) - 1 of them (clamped to be non-negative)
        return max(math.ceil(m/2.0)-1, 0)
    else:
        # The else-branch was lost in the mangled capture; without it the
        # method fell through returning None.  Restored per the class
        # docstring: with maintain_quorum off we may "thrash up to as
        # many monitors as there are available".
        return m
def do_thrash(self):
    """
    _do_thrash() wrapper.

    NOTE(review): the 'def' and 'try' lines were lost in the mangled
    capture and are restored here around the surviving except-handler;
    the method name is grounded by gevent.spawn(self.do_thrash) in
    __init__ and the wrapped call by the "_do_thrash() wrapper" docstring.
    """
    try:
        self._do_thrash()
    except Exception as e:
        # See _run exception comment for MDSThrasher
        self.set_thrasher_exception(e)
        self.logger.exception("exception:")
        # Allow successful completion so gevent doesn't see an exception.
        # The DaemonWatchdog will observe the error and tear down the test.
def _do_thrash(self):
    """
    Continuously loop and thrash the monitors.

    Each iteration: verify full quorum, pick a random subset of monitors
    to kill (and possibly freeze), take them down, optionally re-check the
    reduced quorum, revive/unfreeze everything, scrub, then sleep before
    the next round.

    NOTE(review): many lines of this method were lost in the mangled
    capture; every gap is marked with a NOTE below.  No surviving code
    token was altered.
    """
    #status before mon thrashing
    if self.mds_failover:
        oldstatus = self.mds_cluster.status()

    self.log('start thrashing')
    # log the effective configuration once at startup
    self.log('seed: {s}, revive delay: {r}, thrash delay: {t} '\
             'thrash many: {tm}, maintain quorum: {mq} '\
             'store thrash: {st}, probability: {stp} '\
             'freeze mon: prob {fp} duration {fd}'.format(
                 s=self.random_seed,r=self.revive_delay,t=self.thrash_delay,
                 tm=self.thrash_many, mq=self.maintain_quorum,
                 st=self.store_thrash,stp=self.store_thrash_probability,
                 fp=self.freeze_mon_probability,fd=self.freeze_mon_duration,
    # NOTE(review): the closing of the .format(...) call above is missing
    # from this view.

    while not self.stopping:
        mons = _get_mons(self.ctx)
        # require full quorum before each round
        self.manager.wait_for_mon_quorum_size(len(mons))
        self.log('making sure all monitors are in the quorum')
        # NOTE(review): the loop header iterating the mons ('for m in
        # mons:'?) is missing from this view.
        s = self.manager.get_mon_status(m)
        assert s['state'] == 'leader' or s['state'] == 'peon'
        assert len(s['quorum']) == len(mons)

        # pick how many, and which, monitors to take down this round
        kill_up_to = self.rng.randrange(1, self.max_killable()+1)
        mons_to_kill = self.rng.sample(mons, kill_up_to)
        self.log('monitors to thrash: {m}'.format(m=mons_to_kill))

        # NOTE(review): the initialization of 'mons_to_freeze' and the
        # loop header it is filled from are missing from this view.
        if mon in mons_to_kill:
            # NOTE(review): the body of the guard above ('continue'?) is
            # missing from this view.
            if self.should_freeze_mon():
                mons_to_freeze.append(mon)
        self.log('monitors to freeze: {m}'.format(m=mons_to_freeze))

        for mon in mons_to_kill:
            self.log('thrashing mon.{m}'.format(m=mon))
            """ we only thrash stores if we are maintaining quorum """
            if self.should_thrash_store() and self.maintain_quorum:
                self.thrash_store(mon)
            # NOTE(review): the call that actually kills the mon is
            # missing from this view.

        for mon in mons_to_freeze:
            # NOTE(review): the call that freezes the mon is missing from
            # this view.
        self.log('waiting for {delay} secs to unfreeze mons'.format(
            delay=self.freeze_mon_duration))
        time.sleep(self.freeze_mon_duration)
        for mon in mons_to_freeze:
            self.unfreeze_mon(mon)

        if self.maintain_quorum:
            # the surviving mons must re-form a smaller quorum
            self.manager.wait_for_mon_quorum_size(len(mons)-len(mons_to_kill))
            # NOTE(review): the loop header iterating the mons is missing
            # from this view.
            if m in mons_to_kill:
                # NOTE(review): the body of the guard above ('continue'?)
                # is missing from this view.
            s = self.manager.get_mon_status(m)
            assert s['state'] == 'leader' or s['state'] == 'peon'
            assert len(s['quorum']) == len(mons)-len(mons_to_kill)

        self.log('waiting for {delay} secs before reviving monitors'.format(
            delay=self.revive_delay))
        time.sleep(self.revive_delay)

        for mon in mons_to_kill:
            # NOTE(review): the revive call is missing from this view.

        for mon in mons_to_freeze:
            # NOTE(review): lines are missing here; this second
            # freeze-handling pass mirrors the one above.
        self.log('waiting for {delay} secs to unfreeze mons'.format(
            delay=self.freeze_mon_duration))
        time.sleep(self.freeze_mon_duration)
        for mon in mons_to_freeze:
            self.unfreeze_mon(mon)

        # everything revived: full quorum must re-form
        self.manager.wait_for_mon_quorum_size(len(mons))
        # NOTE(review): the loop header iterating the mons is missing
        # from this view.
        s = self.manager.get_mon_status(m)
        assert s['state'] == 'leader' or s['state'] == 'peon'
        assert len(s['quorum']) == len(mons)

        # NOTE(review): the 'if self.scrub:' guard and the 'try:' line
        # matching the except below are missing from this view.
        self.log('triggering scrub')
        self.manager.raw_cluster_cmd('mon', 'scrub')
        except Exception as e:
            # scrub is best-effort; failures are logged and ignored
            log.warning("Ignoring exception while triggering scrub: %s", e)

        if self.thrash_delay > 0.0:
            self.log('waiting for {delay} secs before continuing thrashing'.format(
                delay=self.thrash_delay))
            time.sleep(self.thrash_delay)

    #status after thrashing
    if self.mds_failover:
        status = self.mds_cluster.status()
        assert not oldstatus.hadfailover(status), \
        # NOTE(review): the assertion message is missing from this view.
@contextlib.contextmanager
def task(ctx, config):
    """
    Stress test the monitor by thrashing them while another task/workunit
    is running.

    Please refer to the MonitorThrasher class for further information on the
    available options.

    NOTE(review): several lines of this function were lost in the mangled
    capture; each gap is marked below.  No surviving code token was altered.
    """
    assert isinstance(config, dict), \
        'mon_thrash task only accepts a dict for configuration'
    # quorum-maintaining thrashing needs an odd-sized cluster of >= 3
    assert len(_get_mons(ctx)) > 2, \
        'mon_thrash task requires at least 3 monitors'

    if 'cluster' not in config:
        config['cluster'] = 'ceph'

    log.info('Beginning mon_thrash...')
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
    manager = ceph_manager.CephManager(
        # NOTE(review): the arguments on the surrounding lines (likely the
        # mon remote and ctx) are missing from this view.
        logger=log.getChild('ceph_manager'),
        # NOTE(review): the closing of the CephManager(...) call is
        # missing from this view.
    thrash_proc = MonitorThrasher(ctx,
        manager, config, "MonitorThrasher",
        logger=log.getChild('mon_thrasher'))
    # register with the per-cluster thrasher list so the watchdog sees it
    ctx.ceph[config['cluster']].thrashers.append(thrash_proc)

    log.debug('Yielding')
    # NOTE(review): the try/yield/finally lines of this context manager
    # are missing from this view.
    log.info('joining mon_thrasher')
    thrash_proc.do_join()
    mons = _get_mons(ctx)
    manager.wait_for_mon_quorum_size(len(mons))