# flake8: noqa
import logging
import contextlib

from tasks import ceph_manager
from tasks.cephfs.filesystem import Filesystem
from teuthology import misc
from teuthology.exceptions import CommandFailedError
from teuthology.orchestra.run import Raw

log = logging.getLogger(__name__)


@contextlib.contextmanager
def task(ctx, config):
    """
    Go through filesystem creation with a synthetic failure in an MDS
    in its 'up:creating' state, to exercise the retry behaviour.
    """
    # Grab handles to the teuthology objects of interest
    mdslist = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
    if len(mdslist) != 1:
        # Require exactly one MDS, the code path for creation failure when
        # a standby is available is different
        raise RuntimeError("This task requires exactly one MDS")

    mds_id = mdslist[0]
    (mds_remote,) = ctx.cluster.only('mds.{_id}'.format(_id=mds_id)).remotes.keys()
    manager = ceph_manager.CephManager(
        mds_remote, ctx=ctx, logger=log.getChild('ceph_manager'),
    )

    # Stop the MDS; `fs` is the Filesystem helper from tasks.cephfs.filesystem
    # and `mds` is the daemon handle from teuthology's daemon registry
    mds = ctx.daemons.get_daemon('mds', mds_id)
    fs = Filesystem(ctx)
    fs.set_max_mds(0)
    fs.mds_stop(mds_id)
    fs.mds_fail(mds_id)

    # Reset the filesystem so that next start will go into CREATING
    manager.raw_cluster_cmd('fs', 'rm', "default", "--yes-i-really-mean-it")
    manager.raw_cluster_cmd('fs', 'new', "default", "metadata", "data")
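    # (assumes pools named "metadata" and "data" already exist in the cluster;
    # this task does not create them)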

    # Start the MDS with mds_kill_create_at set, it will crash during creation
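    # (mds_kill_create_at is a debug MDS config option; a value of 1 makes the
    # daemon bail out at an early point in its create sequence. The exact kill
    # point is an assumption here; see the MDS source for specifics.)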
    mds.restart_with_args(["--mds_kill_create_at=1"])
    try:
        mds.wait_for_exit()
    except CommandFailedError as e:
        if e.exitstatus == 1:
            log.info("MDS creation killed as expected")
        else:
            log.error("Unexpected status code %s" % e.exitstatus)
            raise

    # Since I have intentionally caused a crash, I will clean up the resulting core
    # file to avoid task.internal.coredump seeing it as a failure.
    log.info("Removing core file from synthetic MDS failure")
    mds_remote.run(args=['rm', '-f', Raw("{archive}/coredump/*.core".format(archive=misc.get_archive_dir(ctx)))])

    # It should have left the MDS map state still in CREATING
    status = fs.status().get_mds(mds_id)
    assert status['state'] == 'up:creating'

    # Start the MDS again without the kill flag set, it should proceed with creation successfully
    mds.restart()

    # Wait for state ACTIVE
    fs.wait_for_state("up:active", timeout=120, mds_id=mds_id)

    # The system should be back up in a happy healthy state, go ahead and run any further tasks
    # inside this context.
    yield