# flake8: noqa
import logging
import contextlib

from tasks import ceph_manager
from tasks.cephfs.filesystem import Filesystem
from teuthology import misc
from teuthology.exceptions import CommandFailedError
from teuthology.orchestra.run import Raw

log = logging.getLogger(__name__)


@contextlib.contextmanager
def task(ctx, config):
    """
    Go through filesystem creation with a synthetic failure in an MDS
    in its 'up:creating' state, to exercise the retry behaviour.
    """
    # Grab handles to the teuthology objects of interest
    mdslist = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
    if len(mdslist) != 1:
        # Require exactly one MDS, the code path for creation failure when
        # a standby is available is different
        raise RuntimeError("This task requires exactly one MDS")

    mds_id = mdslist[0]
    (mds_remote,) = ctx.cluster.only('mds.{_id}'.format(_id=mds_id)).remotes.keys()
    manager = ceph_manager.CephManager(
        mds_remote, ctx=ctx, logger=log.getChild('ceph_manager'),
    )

    # Stop the MDS; `fs` is the Filesystem helper from tasks.cephfs.filesystem
    # and `mds` is the daemon handle from teuthology's daemon registry
    mds = ctx.daemons.get_daemon('mds', mds_id)
    fs = Filesystem(ctx)
    fs.set_max_mds(0)
    fs.mds_stop(mds_id)
    fs.mds_fail(mds_id)

    # Reset the filesystem so that next start will go into CREATING
    manager.raw_cluster_cmd('fs', 'rm', "default", "--yes-i-really-mean-it")
    manager.raw_cluster_cmd('fs', 'new', "default", "metadata", "data")
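    # (assumes pools named "metadata" and "data" already exist in the cluster;
    # this task does not create them)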

    # Start the MDS with mds_kill_create_at set, it will crash during creation
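    # (mds_kill_create_at is a debug MDS config option; a value of 1 makes the
    # daemon bail out at an early point in its create sequence. The exact kill
    # point is an assumption here; see the MDS source for specifics.)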
    mds.restart_with_args(["--mds_kill_create_at=1"])
    try:
        mds.wait_for_exit()
    except CommandFailedError as e:
        if e.exitstatus == 1:
            log.info("MDS creation killed as expected")
        else:
            log.error("Unexpected status code %s" % e.exitstatus)
            raise

    # Since I have intentionally caused a crash, I will clean up the resulting core
    # file to avoid task.internal.coredump seeing it as a failure.
    log.info("Removing core file from synthetic MDS failure")
    mds_remote.run(args=['rm', '-f', Raw("{archive}/coredump/*.core".format(archive=misc.get_archive_dir(ctx)))])

    # It should have left the MDS map state still in CREATING
    status = fs.status().get_mds(mds_id)
    assert status['state'] == 'up:creating'

    # Start the MDS again without the kill flag set, it should proceed with creation successfully
    mds.restart()

    # Wait for state ACTIVE
    fs.wait_for_state("up:active", timeout=120, mds_id=mds_id)

    # The system should be back up in a happy healthy state, go ahead and run any further tasks
    # inside this context.
    yield