]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | """ |
2 | Monitor thrash | |
3 | """ | |
4 | import logging | |
5 | import contextlib | |
7c673cae FG |
6 | import random |
7 | import time | |
8 | import gevent | |
9 | import json | |
10 | import math | |
11 | from teuthology import misc as teuthology | |
9f95a23c TL |
12 | from tasks import ceph_manager |
13 | from tasks.cephfs.filesystem import MDSCluster | |
14 | from tasks.thrasher import Thrasher | |
7c673cae FG |
15 | |
16 | log = logging.getLogger(__name__) | |
17 | ||
def _get_mons(ctx):
    """
    Return the names of all monitors known to the run context, with the
    'mon.' role prefix stripped off.
    """
    prefix = 'mon.'
    names = []
    for role in teuthology.get_mon_names(ctx):
        names.append(role[len(prefix):])
    return names
24 | ||
class MonitorThrasher(Thrasher):
    """
    How it works::

    - pick a monitor
    - kill it
    - wait for quorum to be formed
    - sleep for 'revive_delay' seconds
    - revive monitor
    - wait for quorum to be formed
    - sleep for 'thrash_delay' seconds

    Options::

    seed                      Seed to use on the RNG to reproduce a previous
                              behaviour (default: None; i.e., not set)
    revive_delay              Number of seconds to wait before reviving
                              the monitor (default: 10)
    thrash_delay              Number of seconds to wait in-between
                              test iterations (default: 0)
    store_thrash              Thrash monitor store before killing the monitor
                              being thrashed (default: False)
    store_thrash_probability  Probability of thrashing a monitor's store
                              (default: 50)
    thrash_many               Thrash multiple monitors instead of just one. If
                              'maintain_quorum' is set to False, then we will
                              thrash up to as many monitors as there are
                              available. (default: False)
    maintain_quorum           Always maintain quorum, taking care on how many
                              monitors we kill during the thrashing. If we
                              happen to only have one or two monitors
                              configured, if this option is set to True, then
                              we won't run this task as we cannot guarantee
                              maintenance of quorum. Setting it to false
                              however would allow the task to run with as many
                              as just one single monitor. (default: True)
    freeze_mon_probability    how often to freeze the mon instead of killing
                              it, in % (default: 10)
    freeze_mon_duration       how many seconds to freeze the mon (default: 15)
    scrub                     Scrub after each iteration (default: True)
    check_mds_failover        Check if mds failover happened (default: False)

    Note: if 'store_thrash' is set to True, then 'maintain_quorum' must also
    be set to True.

    For example::

    tasks:
    - ceph:
    - mon_thrash:
        revive_delay: 20
        thrash_delay: 1
        store_thrash: true
        store_thrash_probability: 40
        seed: 31337
        maintain_quorum: true
        thrash_many: true
        check_mds_failover: True
    - ceph-fuse:
    - workunit:
        clients:
          all:
            - mon/workloadgen.sh
    """
    def __init__(self, ctx, manager, config, name, logger):
        """
        Initialize the thrasher and spawn the background thrashing greenlet.

        :param ctx: teuthology run context
        :param manager: CephManager instance used to drive the cluster
        :param config: task configuration dict (may be None)
        :param name: identifying name for this thrasher instance
        :param logger: logger used by the log() helper
        """
        super(MonitorThrasher, self).__init__()

        self.ctx = ctx
        self.manager = manager
        # Start from a healthy cluster before we begin breaking it.
        self.manager.wait_for_clean()

        self.stopping = False
        self.logger = logger
        self.config = config
        self.name = name

        if self.config is None:
            self.config = dict()

        # Test reproducibility: seeding the RNG with a previous run's seed
        # replays the same kill/freeze sequence.
        self.random_seed = self.config.get('seed', None)

        if self.random_seed is None:
            self.random_seed = int(time.time())

        self.rng = random.Random()
        self.rng.seed(int(self.random_seed))

        # Monitor thrashing knobs
        self.revive_delay = float(self.config.get('revive_delay', 10.0))
        self.thrash_delay = float(self.config.get('thrash_delay', 0.0))

        self.thrash_many = self.config.get('thrash_many', False)
        self.maintain_quorum = self.config.get('maintain_quorum', True)

        self.scrub = self.config.get('scrub', True)

        self.freeze_mon_probability = float(self.config.get('freeze_mon_probability', 10))
        self.freeze_mon_duration = float(self.config.get('freeze_mon_duration', 15.0))

        assert self.max_killable() > 0, \
            'Unable to kill at least one monitor with the current config.'

        # Store thrashing knobs
        self.store_thrash = self.config.get('store_thrash', False)
        self.store_thrash_probability = int(
            self.config.get('store_thrash_probability', 50))
        if self.store_thrash:
            assert self.store_thrash_probability > 0, \
                'store_thrash is set, probability must be > 0'
            assert self.maintain_quorum, \
                'store_thrash = true must imply maintain_quorum = true'

        # MDS failover detection
        self.mds_failover = self.config.get('check_mds_failover', False)

        if self.mds_failover:
            self.mds_cluster = MDSCluster(ctx)

        self.thread = gevent.spawn(self.do_thrash)

    def log(self, x):
        """
        locally log info messages
        """
        self.logger.info(x)

    def do_join(self):
        """
        Break out of this processes thrashing loop.
        """
        self.stopping = True
        # get() re-raises any exception the greenlet terminated with.
        self.thread.get()

    def should_thrash_store(self):
        """
        If allowed, indicate that we should thrash a certain percentage of
        the time as determined by the store_thrash_probability value.
        """
        if not self.store_thrash:
            return False
        return self.rng.randrange(0, 101) < self.store_thrash_probability

    def thrash_store(self, mon):
        """
        Thrash the monitor specified.
        :param mon: monitor to thrash
        """
        self.log('thrashing mon.{id} store'.format(id=mon))
        out = self.manager.raw_cluster_cmd(
            'tell', 'mon.%s' % mon, 'sync_force',
            '--yes-i-really-mean-it')
        j = json.loads(out)
        assert j['ret'] == 0, \
            'error forcing store sync on mon.{id}:\n{ret}'.format(
                id=mon,ret=out)

    def should_freeze_mon(self):
        """
        Indicate that we should freeze a certain percentage of the time
        as determined by the freeze_mon_probability value.
        """
        return self.rng.randrange(0, 101) < self.freeze_mon_probability

    def freeze_mon(self, mon):
        """
        Send STOP signal to freeze the monitor.
        """
        log.info('Sending STOP to mon %s', mon)
        self.manager.signal_mon(mon, 19)  # STOP

    def unfreeze_mon(self, mon):
        """
        Send CONT signal to unfreeze the monitor.
        """
        log.info('Sending CONT to mon %s', mon)
        self.manager.signal_mon(mon, 18)  # CONT

    def kill_mon(self, mon):
        """
        Kill the monitor specified
        """
        self.log('killing mon.{id}'.format(id=mon))
        self.manager.kill_mon(mon)

    def revive_mon(self, mon):
        """
        Revive the monitor specified
        """
        # NOTE: a stray duplicate 'killing mon.{id}' log line was removed
        # here; this method only revives.
        self.log('reviving mon.{id}'.format(id=mon))
        self.manager.revive_mon(mon)

    def max_killable(self):
        """
        Return the maximum number of monitors we can kill.
        """
        m = len(_get_mons(self.ctx))
        if self.maintain_quorum:
            # Keep a strict majority alive: with m mons we may take down at
            # most ceil(m/2)-1 of them.
            return max(math.ceil(m/2.0)-1, 0)
        else:
            return m

    def do_thrash(self):
        """
        _do_thrash() wrapper.
        """
        try:
            self._do_thrash()
        except Exception as e:
            # See _run exception comment for MDSThrasher
            self.set_thrasher_exception(e)
            self.logger.exception("exception:")
            # Allow successful completion so gevent doesn't see an exception.
            # The DaemonWatchdog will observe the error and tear down the test.

    def _do_thrash(self):
        """
        Continuously loop and thrash the monitors.
        """
        # status before mon thrashing
        if self.mds_failover:
            oldstatus = self.mds_cluster.status()

        self.log('start thrashing')
        self.log('seed: {s}, revive delay: {r}, thrash delay: {t} '\
                   'thrash many: {tm}, maintain quorum: {mq} '\
                   'store thrash: {st}, probability: {stp} '\
                   'freeze mon: prob {fp} duration {fd}'.format(
                s=self.random_seed,r=self.revive_delay,t=self.thrash_delay,
                tm=self.thrash_many, mq=self.maintain_quorum,
                st=self.store_thrash,stp=self.store_thrash_probability,
                fp=self.freeze_mon_probability,fd=self.freeze_mon_duration,
                ))

        while not self.stopping:
            mons = _get_mons(self.ctx)
            self.manager.wait_for_mon_quorum_size(len(mons))
            self.log('making sure all monitors are in the quorum')
            for m in mons:
                s = self.manager.get_mon_status(m)
                assert s['state'] == 'leader' or s['state'] == 'peon'
                assert len(s['quorum']) == len(mons)

            kill_up_to = self.rng.randrange(1, self.max_killable()+1)
            mons_to_kill = self.rng.sample(mons, kill_up_to)
            self.log('monitors to thrash: {m}'.format(m=mons_to_kill))

            # Surviving monitors may additionally be frozen (SIGSTOP'd).
            mons_to_freeze = []
            for mon in mons:
                if mon in mons_to_kill:
                    continue
                if self.should_freeze_mon():
                    mons_to_freeze.append(mon)
            self.log('monitors to freeze: {m}'.format(m=mons_to_freeze))

            for mon in mons_to_kill:
                self.log('thrashing mon.{m}'.format(m=mon))

                # we only thrash stores if we are maintaining quorum
                if self.should_thrash_store() and self.maintain_quorum:
                    self.thrash_store(mon)

                self.kill_mon(mon)

            if mons_to_freeze:
                for mon in mons_to_freeze:
                    self.freeze_mon(mon)
                self.log('waiting for {delay} secs to unfreeze mons'.format(
                    delay=self.freeze_mon_duration))
                time.sleep(self.freeze_mon_duration)
                for mon in mons_to_freeze:
                    self.unfreeze_mon(mon)

            if self.maintain_quorum:
                self.manager.wait_for_mon_quorum_size(len(mons)-len(mons_to_kill))
                for m in mons:
                    if m in mons_to_kill:
                        continue
                    s = self.manager.get_mon_status(m)
                    assert s['state'] == 'leader' or s['state'] == 'peon'
                    assert len(s['quorum']) == len(mons)-len(mons_to_kill)

            self.log('waiting for {delay} secs before reviving monitors'.format(
                delay=self.revive_delay))
            time.sleep(self.revive_delay)

            for mon in mons_to_kill:
                self.revive_mon(mon)
            # do more freezes
            if mons_to_freeze:
                for mon in mons_to_freeze:
                    self.freeze_mon(mon)
                self.log('waiting for {delay} secs to unfreeze mons'.format(
                    delay=self.freeze_mon_duration))
                time.sleep(self.freeze_mon_duration)
                for mon in mons_to_freeze:
                    self.unfreeze_mon(mon)

            self.manager.wait_for_mon_quorum_size(len(mons))
            for m in mons:
                s = self.manager.get_mon_status(m)
                assert s['state'] == 'leader' or s['state'] == 'peon'
                assert len(s['quorum']) == len(mons)

            if self.scrub:
                self.log('triggering scrub')
                try:
                    self.manager.raw_cluster_cmd('mon', 'scrub')
                except Exception as e:
                    # Best-effort: scrub failures must not abort the thrasher.
                    log.warning("Ignoring exception while triggering scrub: %s", e)

            if self.thrash_delay > 0.0:
                self.log('waiting for {delay} secs before continuing thrashing'.format(
                    delay=self.thrash_delay))
                time.sleep(self.thrash_delay)

        # status after thrashing
        if self.mds_failover:
            status = self.mds_cluster.status()
            assert not oldstatus.hadfailover(status), \
                'MDS Failover'
347 | ||
7c673cae FG |
@contextlib.contextmanager
def task(ctx, config):
    """
    Stress test the monitor by thrashing them while another task/workunit
    is running.

    Please refer to MonitorThrasher class for further information on the
    available options.
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'mon_thrash task only accepts a dict for configuration'
    assert len(_get_mons(ctx)) > 2, \
        'mon_thrash task requires at least 3 monitors'

    # Fall back to the default cluster name when none was given.
    config.setdefault('cluster', 'ceph')

    log.info('Beginning mon_thrash...')
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
    manager = ceph_manager.CephManager(
        mon_remote,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )
    thrasher = MonitorThrasher(
        ctx, manager, config, "MonitorThrasher",
        logger=log.getChild('mon_thrasher'))
    # Register with the cluster so the watchdog can observe failures.
    ctx.ceph[config['cluster']].thrashers.append(thrasher)
    try:
        log.debug('Yielding')
        yield
    finally:
        # Stop thrashing and wait for full quorum before handing back.
        log.info('joining mon_thrasher')
        thrasher.do_join()
        manager.wait_for_mon_quorum_size(len(_get_mons(ctx)))