]>
Commit | Line | Data |
---|---|---|
f67539c2 TL |
1 | """ |
2 | Thrash mds by simulating failures | |
3 | """ | |
4 | import logging | |
5 | import contextlib | |
6 | ||
7 | from gevent import sleep, GreenletExit | |
8 | from gevent.greenlet import Greenlet | |
9 | from gevent.event import Event | |
10 | from teuthology import misc as teuthology | |
11 | ||
12 | from tasks import ceph_manager | |
13 | from tasks.cephfs.filesystem import MDSCluster, Filesystem | |
14 | from tasks.thrasher import Thrasher | |
15 | ||
16 | log = logging.getLogger(__name__) | |
17 | ||
class ForwardScrubber(Thrasher, Greenlet):
    """
    ForwardScrubber::

    Repeatedly forward-scrubs a file-system in the background while other
    tasks (workunits, etc.) are running against it.
    """

    def __init__(self, fs, scrub_timeout=300, sleep_between_iterations=1):
        super().__init__()

        self.fs = fs
        self.logger = log.getChild(f'fs.[{fs.name}]')
        self.name = f'thrasher.fs.[{fs.name}]'
        self.stopping = Event()
        self.scrub_timeout = scrub_timeout
        self.sleep_between_iterations = sleep_between_iterations

    def _run(self):
        # Greenlet entry point: run the scrub loop, stashing any failure
        # for later inspection instead of letting gevent see it.
        try:
            self.do_scrub()
        except Exception as e:
            self.set_thrasher_exception(e)
            self.logger.exception("exception:")
            # allow successful completion so gevent doesn't see an exception...

    def stop(self):
        # Ask do_scrub() to leave its loop at the next iteration check.
        self.stopping.set()

    def do_scrub(self):
        """
        Scrub the file-system in a loop until stop() is called.
        """
        self.logger.info(f'start scrubbing fs: {self.fs.name}')

        try:
            while not self.stopping.is_set():
                self._scrub()
                sleep(self.sleep_between_iterations)
        except GreenletExit:
            pass

        self.logger.info(f'end scrubbing fs: {self.fs.name}')

    def _scrub(self, path="/", recursive=True):
        # Start one asynchronous scrub and wait for its tag to complete.
        self.logger.info(f"scrubbing fs: {self.fs.name}")

        opts = ["force", "recursive"] if recursive else ["force"]
        out_json = self.fs.run_scrub(["start", path, ",".join(opts)])
        assert out_json is not None

        tag = out_json['scrub_tag']
        assert tag is not None
        assert out_json['return_code'] == 0
        assert out_json['mode'] == 'asynchronous'

        return self.fs.wait_until_scrub_complete(tag=tag, sleep=30,
                                                 timeout=self.scrub_timeout)
78 | ||
def stop_all_fwd_scrubbers(thrashers):
    # Stop and join every ForwardScrubber among *thrashers*, re-raising
    # any exception a scrubber recorded while running.
    scrubbers = (t for t in thrashers if isinstance(t, ForwardScrubber))
    for scrubber in scrubbers:
        scrubber.stop()
        scrubber.join()
        if scrubber.exception is not None:
            raise RuntimeError(f"error during scrub thrashing: {scrubber.exception}")
87 | ||
88 | ||
@contextlib.contextmanager
def task(ctx, config):
    """
    Stress test the mds by running scrub iterations while another task/workunit
    is running.

    Both config keys are optional and default to the ForwardScrubber
    defaults (scrub_timeout=300, sleep_between_iterations=1).

    Example config:

    - fwd_scrub:
      scrub_timeout: 300
      sleep_between_iterations: 1
    """

    mds_cluster = MDSCluster(ctx)

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'fwd_scrub task only accepts a dict for configuration'
    mdslist = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))
    assert len(mdslist) > 0, \
        'fwd_scrub task requires at least 1 metadata server'

    (first,) = ctx.cluster.only(f'mds.{mdslist[0]}').remotes.keys()
    manager = ceph_manager.CephManager(
        first, ctx=ctx, logger=log.getChild('ceph_manager'),
    )

    # make sure everyone is in active, standby, or standby-replay
    log.info('Wait for all MDSs to reach steady state...')
    status = mds_cluster.status()
    while True:
        steady = True
        for info in status.get_all():
            state = info['state']
            if state not in ('up:active', 'up:standby', 'up:standby-replay'):
                steady = False
                break
        if steady:
            break
        sleep(2)
        status = mds_cluster.status()

    log.info('Ready to start scrub thrashing')

    manager.wait_for_clean()
    assert manager.is_clean()

    if 'cluster' not in config:
        config['cluster'] = 'ceph'

    for fs in status.get_filesystems():
        # Use .get() so an empty/partial config falls back to the
        # ForwardScrubber defaults instead of raising KeyError.
        fwd_scrubber = ForwardScrubber(Filesystem(ctx, fscid=fs['id']),
                                       config.get('scrub_timeout', 300),
                                       config.get('sleep_between_iterations', 1))
        fwd_scrubber.start()
        ctx.ceph[config['cluster']].thrashers.append(fwd_scrubber)

    try:
        log.debug('Yielding')
        yield
    finally:
        log.info('joining ForwardScrubbers')
        stop_all_fwd_scrubbers(ctx.ceph[config['cluster']].thrashers)
        log.info('done joining')