[ceph.git] / ceph / qa / tasks / mds_creation_failure.py

# FIXME: this file has many undefined vars which are accessed!
# flake8: noqa
import logging
import contextlib
import time
from tasks import ceph_manager
from teuthology import misc
from teuthology.orchestra.run import CommandFailedError, Raw

log = logging.getLogger(__name__)


@contextlib.contextmanager
def task(ctx, config):
    """
    Go through filesystem creation with a synthetic failure in an MDS
    in its 'up:creating' state, to exercise the retry behaviour.

    The sequence is: stop and fail the single MDS, remove and re-create the
    "default" filesystem, start the MDS with ``--mds_kill_create_at=1`` so
    that it dies during creation, check that the MDS is still reported as
    'up:creating', then restart it without the flag and wait for
    'up:active'.  Control is then yielded once so that nested tasks run
    against the recovered cluster; there is no teardown step.

    :param ctx: teuthology run context; ``ctx.cluster`` and ``ctx.daemons``
                are read here.
    :param config: task configuration; not used by this task.
    :raises RuntimeError: if the cluster does not have exactly one MDS role.
    :raises CommandFailedError: re-raised if the killed MDS exits with a
                                status other than 1.
    """
    # Imported locally so that this function brings its own dependency into
    # scope; the `tasks` package is already imported at module level.
    from tasks.cephfs.filesystem import Filesystem

    # Grab handles to the teuthology objects of interest
    mdslist = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
    if len(mdslist) != 1:
        # Require exactly one MDS, the code path for creation failure when
        # a standby is available is different
        raise RuntimeError("This task requires exactly one MDS")

    mds_id = mdslist[0]
    (mds_remote,) = ctx.cluster.only('mds.{_id}'.format(_id=mds_id)).remotes.keys()
    manager = ceph_manager.CephManager(
        mds_remote, ctx=ctx, logger=log.getChild('ceph_manager'),
    )

    # This is a plain function, not a method, so there is no `self`: the
    # filesystem handle and the MDS daemon handle have to be bound here.
    # Previously `self.fs` and `mds` were used without being defined, which
    # raised NameError as soon as the "Stop MDS" step was reached.
    # NOTE(review): assumes Filesystem(ctx, name=...) attaches to the named
    # filesystem without creating it -- confirm against
    # tasks/cephfs/filesystem.py.
    fs = Filesystem(ctx, name="default")
    # NOTE(review): assumes ctx.daemons.get_daemon(type, id) returns the
    # daemon handle providing restart_with_args()/wait_for_exit()/restart()
    # -- confirm against teuthology's daemon group API.
    mds = ctx.daemons.get_daemon('mds', mds_id)

    # Stop MDS
    fs.set_max_mds(0)
    fs.mds_stop(mds_id)
    fs.mds_fail(mds_id)

    # Reset the filesystem so that next start will go into CREATING
    manager.raw_cluster_cmd('fs', 'rm', "default", "--yes-i-really-mean-it")
    manager.raw_cluster_cmd('fs', 'new', "default", "metadata", "data")

    # Start the MDS with mds_kill_create_at set, it will crash during creation
    mds.restart_with_args(["--mds_kill_create_at=1"])
    try:
        mds.wait_for_exit()
    except CommandFailedError as e:
        if e.exitstatus == 1:
            log.info("MDS creation killed as expected")
        else:
            log.error("Unexpected status code %s", e.exitstatus)
            raise

    # Since I have intentionally caused a crash, I will clean up the resulting core
    # file to avoid task.internal.coredump seeing it as a failure.
    log.info("Removing core file from synthetic MDS failure")
    # Raw() keeps the glob unquoted so that the remote shell expands it.
    core_glob = "{archive}/coredump/*.core".format(archive=misc.get_archive_dir(ctx))
    mds_remote.run(args=['rm', '-f', Raw(core_glob)])

    # It should have left the MDS map state still in CREATING
    status = fs.status().get_mds(mds_id)
    assert status['state'] == 'up:creating'

    # Start the MDS again without the kill flag set, it should proceed with creation successfully
    mds.restart()

    # Wait for state ACTIVE
    fs.wait_for_state("up:active", timeout=120, mds_id=mds_id)

    # The system should be back up in a happy healthy state, go ahead and run any further tasks
    # inside this context.
    yield
Commit	Line	Data
9f95a23c TL	1	# FIXME: this file has many undefined vars which are accessed!
9f95a23c TL	2	# flake8: noqa
7c673cae FG	3	import logging
	4	import contextlib
	5	import time
e306af50	6	from tasks import ceph_manager
7c673cae FG	7	from teuthology import misc
	8	from teuthology.orchestra.run import CommandFailedError, Raw
	9
	10	log = logging.getLogger(__name__)
	11
	12
	13	@contextlib.contextmanager
	14	def task(ctx, config):
	15	"""
	16	Go through filesystem creation with a synthetic failure in an MDS
	17	in its 'up:creating' state, to exercise the retry behaviour.
	18	"""
	19	# Grab handles to the teuthology objects of interest
	20	mdslist = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
	21	if len(mdslist) != 1:
	22	# Require exactly one MDS, the code path for creation failure when
	23	# a standby is available is different
	24	raise RuntimeError("This task requires exactly one MDS")
	25
	26	mds_id = mdslist[0]
9f95a23c	27	(mds_remote,) = ctx.cluster.only('mds.{_id}'.format(_id=mds_id)).remotes.keys()
7c673cae FG	28	manager = ceph_manager.CephManager(
	29	mds_remote, ctx=ctx, logger=log.getChild('ceph_manager'),
	30	)
	31
	32	# Stop MDS
11fdf7f2 TL	33	self.fs.set_max_mds(0)
	34	self.fs.mds_stop(mds_id)
	35	self.fs.mds_fail(mds_id)
7c673cae FG	36
	37	# Reset the filesystem so that next start will go into CREATING
	38	manager.raw_cluster_cmd('fs', 'rm', "default", "--yes-i-really-mean-it")
	39	manager.raw_cluster_cmd('fs', 'new', "default", "metadata", "data")
	40
	41	# Start the MDS with mds_kill_create_at set, it will crash during creation
	42	mds.restart_with_args(["--mds_kill_create_at=1"])
	43	try:
	44	mds.wait_for_exit()
	45	except CommandFailedError as e:
	46	if e.exitstatus == 1:
	47	log.info("MDS creation killed as expected")
	48	else:
	49	log.error("Unexpected status code %s" % e.exitstatus)
	50	raise
	51
	52	# Since I have intentionally caused a crash, I will clean up the resulting core
	53	# file to avoid task.internal.coredump seeing it as a failure.
	54	log.info("Removing core file from synthetic MDS failure")
	55	mds_remote.run(args=['rm', '-f', Raw("{archive}/coredump/*.core".format(archive=misc.get_archive_dir(ctx)))])
	56
	57	# It should have left the MDS map state still in CREATING
11fdf7f2	58	status = self.fs.status().get_mds(mds_id)
7c673cae FG	59	assert status['state'] == 'up:creating'
	60
	61	# Start the MDS again without the kill flag set, it should proceed with creation successfully
	62	mds.restart()
	63
	64	# Wait for state ACTIVE
11fdf7f2	65	self.fs.wait_for_state("up:active", timeout=120, mds_id=mds_id)
7c673cae FG	66
	67	# The system should be back up in a happy healthy state, go ahead and run any further tasks
	68	# inside this context.
	69	yield