]> git.proxmox.com Git - ceph.git/blob - ceph/qa/tasks/mds_creation_failure.py
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / qa / tasks / mds_creation_failure.py
1
2 import logging
3 import contextlib
4 import time
5 import ceph_manager
6 from teuthology import misc
7 from teuthology.orchestra.run import CommandFailedError, Raw
8
9 log = logging.getLogger(__name__)
10
11
12 @contextlib.contextmanager
13 def task(ctx, config):
14 """
15 Go through filesystem creation with a synthetic failure in an MDS
16 in its 'up:creating' state, to exercise the retry behaviour.
17 """
18 # Grab handles to the teuthology objects of interest
19 mdslist = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
20 if len(mdslist) != 1:
21 # Require exactly one MDS, the code path for creation failure when
22 # a standby is available is different
23 raise RuntimeError("This task requires exactly one MDS")
24
25 mds_id = mdslist[0]
26 (mds_remote,) = ctx.cluster.only('mds.{_id}'.format(_id=mds_id)).remotes.iterkeys()
27 manager = ceph_manager.CephManager(
28 mds_remote, ctx=ctx, logger=log.getChild('ceph_manager'),
29 )
30
31 # Stop MDS
32 manager.raw_cluster_cmd('mds', 'set', "max_mds", "0")
33 mds = ctx.daemons.get_daemon('mds', mds_id)
34 mds.stop()
35 manager.raw_cluster_cmd('mds', 'fail', mds_id)
36
37 # Reset the filesystem so that next start will go into CREATING
38 manager.raw_cluster_cmd('fs', 'rm', "default", "--yes-i-really-mean-it")
39 manager.raw_cluster_cmd('fs', 'new', "default", "metadata", "data")
40
41 # Start the MDS with mds_kill_create_at set, it will crash during creation
42 mds.restart_with_args(["--mds_kill_create_at=1"])
43 try:
44 mds.wait_for_exit()
45 except CommandFailedError as e:
46 if e.exitstatus == 1:
47 log.info("MDS creation killed as expected")
48 else:
49 log.error("Unexpected status code %s" % e.exitstatus)
50 raise
51
52 # Since I have intentionally caused a crash, I will clean up the resulting core
53 # file to avoid task.internal.coredump seeing it as a failure.
54 log.info("Removing core file from synthetic MDS failure")
55 mds_remote.run(args=['rm', '-f', Raw("{archive}/coredump/*.core".format(archive=misc.get_archive_dir(ctx)))])
56
57 # It should have left the MDS map state still in CREATING
58 status = manager.get_mds_status(mds_id)
59 assert status['state'] == 'up:creating'
60
61 # Start the MDS again without the kill flag set, it should proceed with creation successfully
62 mds.restart()
63
64 # Wait for state ACTIVE
65 t = 0
66 create_timeout = 120
67 while True:
68 status = manager.get_mds_status(mds_id)
69 if status['state'] == 'up:active':
70 log.info("MDS creation completed successfully")
71 break
72 elif status['state'] == 'up:creating':
73 log.info("MDS still in creating state")
74 if t > create_timeout:
75 log.error("Creating did not complete within %ss" % create_timeout)
76 raise RuntimeError("Creating did not complete within %ss" % create_timeout)
77 t += 1
78 time.sleep(1)
79 else:
80 log.error("Unexpected MDS state: %s" % status['state'])
81 assert(status['state'] in ['up:active', 'up:creating'])
82
83 # The system should be back up in a happy healthy state, go ahead and run any further tasks
84 # inside this context.
85 yield