import time
import signal
import logging
import operator
-from random import randint
+from random import randint, choice
-from cephfs_test_case import CephFSTestCase
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
from teuthology.exceptions import CommandFailedError
from tasks.cephfs.fuse_mount import FuseMount
current = sorted(current, key=operator.itemgetter('name'))
log.info("current = %s", current)
self.assertEqual(len(current), len(target))
- for i in xrange(len(current)):
+ for i in range(len(current)):
for attr in target[i]:
self.assertIn(attr, current[i])
self.assertEqual(target[i][attr], current[i][attr])
except AssertionError as e:
log.debug("%s", e)
return False
- status = self.wait_until_true(takeover, 30)
+ self.wait_until_true(takeover, 30)
def test_join_fs_runtime(self):
"""
That setting mds_join_fs at runtime affects the cluster layout.
"""
def test_join_fs_vanilla(self):
"""
That a vanilla standby is preferred over others with mds_join_fs set to another fs.
"""
- self.fs.set_allow_multifs()
fs2 = self.mds_cluster.newfs(name="cephfs2")
status, target = self._verify_init()
active = self.fs.get_active_names(status=status)[0]
def test_join_fs_last_resort(self):
"""
That a standby with mds_join_fs set to another fs is still used if necessary.
"""
status, target = self._verify_init()
- active = self.fs.get_active_names(status=status)[0]
standbys = [info['name'] for info in status.get_standbys()]
for mds in standbys:
self.config_set('mds.'+mds, 'mds_join_fs', 'cephfs2')
- self.fs.set_allow_multifs()
fs2 = self.mds_cluster.newfs(name="cephfs2")
for mds in standbys:
self._change_target_state(target, mds, {'join_fscid': fs2.id})
self._reach_target(target)
class TestClusterResize(CephFSTestCase):
- CLIENTS_REQUIRED = 1
+ CLIENTS_REQUIRED = 0
MDSS_REQUIRED = 3
- def grow(self, n):
- grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
- fscid = self.fs.id
- status = self.fs.status()
- log.info("status = {0}".format(status))
-
- original_ranks = set([info['gid'] for info in status.get_ranks(fscid)])
- _ = set([info['gid'] for info in status.get_standbys()])
-
- oldmax = self.fs.get_var('max_mds')
- self.assertTrue(n > oldmax)
- self.fs.set_max_mds(n)
-
- log.info("Waiting for cluster to grow.")
- status = self.fs.wait_for_daemons(timeout=60+grace*2)
- ranks = set([info['gid'] for info in status.get_ranks(fscid)])
- self.assertTrue(original_ranks.issubset(ranks) and len(ranks) == n)
- return status
-
- def shrink(self, n):
- grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
- fscid = self.fs.id
- status = self.fs.status()
- log.info("status = {0}".format(status))
-
- original_ranks = set([info['gid'] for info in status.get_ranks(fscid)])
- _ = set([info['gid'] for info in status.get_standbys()])
-
- oldmax = self.fs.get_var('max_mds')
- self.assertTrue(n < oldmax)
- self.fs.set_max_mds(n)
-
- # Wait until the monitor finishes stopping ranks >= n
- log.info("Waiting for cluster to shink.")
- status = self.fs.wait_for_daemons(timeout=60+grace*2)
- ranks = set([info['gid'] for info in status.get_ranks(fscid)])
- self.assertTrue(ranks.issubset(original_ranks) and len(ranks) == n)
- return status
-
-
def test_grow(self):
"""
That the MDS cluster grows after increasing max_mds.
"""
# Need all my standbys up as well as the active daemons
# self.wait_for_daemon_start() necessary?
- self.grow(2)
- self.grow(3)
+ self.fs.grow(2)
+ self.fs.grow(3)
def test_shrink(self):
"""
That the MDS cluster shrinks automatically after decreasing max_mds.
"""
- self.grow(3)
- self.shrink(1)
+ self.fs.grow(3)
+ self.fs.shrink(1)
def test_up_less_than_max(self):
"""
That a health warning is generated when max_mds is greater than the active daemon count.
"""
status = self.fs.status()
mdss = [info['gid'] for info in status.get_all()]
self.fs.set_max_mds(len(mdss)+1)
self.wait_for_health("MDS_UP_LESS_THAN_MAX", 30)
- self.shrink(2)
+ self.fs.shrink(2)
self.wait_for_health_clear(30)
def test_down_health(self):
"""
That marking a FS down does not generate a health warning
"""
- self.mount_a.umount_wait()
-
self.fs.set_down()
try:
self.wait_for_health("", 30)
def test_down_twice(self):
"""
That marking a FS down twice does not wipe old_max_mds.
"""
- self.mount_a.umount_wait()
-
- self.grow(2)
+ self.fs.grow(2)
self.fs.set_down()
self.fs.wait_for_daemons()
self.fs.set_down(False)
def test_down_grow(self):
"""
That setting max_mds undoes down.
"""
- self.mount_a.umount_wait()
-
self.fs.set_down()
self.fs.wait_for_daemons()
- self.grow(2)
+ self.fs.grow(2)
self.fs.wait_for_daemons()
def test_down(self):
"""
That down setting toggles and sets max_mds appropriately.
"""
- self.mount_a.umount_wait()
-
self.fs.set_down()
self.fs.wait_for_daemons()
self.assertEqual(self.fs.get_var("max_mds"), 0)
def test_hole(self):
"""
That a hole cannot be created in the FS ranks.
"""
fscid = self.fs.id
- self.grow(2)
+ self.fs.grow(2)
+ # Now add a delay which should slow down how quickly rank 1 stops
+ self.config_set('mds', 'ms_inject_delay_max', '5.0')
+ self.config_set('mds', 'ms_inject_delay_probability', '1.0')
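+ # Messenger delay injection (probability 1.0, up to 5s per message)
+ # should keep rank 1 in its stopping sequence long enough for the
+ # checks below.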
self.fs.set_max_mds(1)
log.info("status = {0}".format(self.fs.status()))
- self.fs.set_max_mds(3)
# Don't wait for rank 1 to stop
+ self.fs.set_max_mds(3)
+ log.info("status = {0}".format(self.fs.status()))
+ # Now check that the mons didn't try to promote a standby to rank 2
self.fs.set_max_mds(2)
- # Prevent another MDS from taking rank 1
- # XXX This is a little racy because rank 1 may have stopped and a
- # standby assigned to rank 1 before joinable=0 is set.
- self.fs.set_joinable(False) # XXX keep in mind changing max_mds clears this flag
-
+ status = self.fs.status()
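+ # With the delay injected above, rank 1 should still be winding down;
+ # max_mds=2 should settle on ranks {0,1} without a standby ever being
+ # promoted to rank 2.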
try:
status = self.fs.wait_for_daemons(timeout=90)
- raise RuntimeError("should not be able to successfully shrink cluster!")
- except:
- # could not shrink to max_mds=2 and reach 2 actives (because joinable=False)
- status = self.fs.status()
ranks = set([info['rank'] for info in status.get_ranks(fscid)])
- self.assertTrue(ranks == set([0]))
+ self.assertEqual(ranks, set([0, 1]))
finally:
log.info("status = {0}".format(status))
class TestFailover(CephFSTestCase):
CLIENTS_REQUIRED = 1
MDSS_REQUIRED = 2
+ def test_repeated_boot(self):
+ """
+ That multiple boot messages do not result in the MDS getting evicted.
+ """
+
+ interval = 10
+ self.config_set("mon", "paxos_propose_interval", interval)
+
+ mds = choice(list(self.fs.status().get_all()))
+
+ with self.assert_cluster_log(f"daemon mds.{mds['name']} restarted", present=False):
+ # Avoid a beacon to the monitors with down:dne by restarting:
+ self.fs.mds_fail(mds_id=mds['name'])
+ # `ceph mds fail` won't return until the FSMap is committed, double-check:
+ self.assertIsNone(self.fs.status().get_mds_gid(mds['gid']))
+ time.sleep(2) # for mds to restart and accept asok commands
+ status1 = self.fs.mds_asok(['status'], mds_id=mds['name'])
+ time.sleep(interval*1.5)
+ status2 = self.fs.mds_asok(['status'], mds_id=mds['name'])
+ self.assertEqual(status1['id'], status2['id'])
+
def test_simple(self):
"""
That when the active MDS is killed, a standby MDS is promoted into
its rank. This is also exercised, less directly, in thrashing tests.
"""
- # Need all my standbys up as well as the active daemons
- self.wait_for_daemon_start()
-
(original_active, ) = self.fs.get_active_names()
original_standbys = self.mds_cluster.get_standby_daemons()
# Kill the rank 0 daemon's physical process
self.fs.mds_stop(original_active)
- grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
# Wait until the monitors promote a replacement
def promoted():
- active = self.fs.get_active_names()
- return active and active[0] in original_standbys
+ ranks = list(self.fs.get_ranks())
+ return len(ranks) > 0 and ranks[0]['name'] in original_standbys
log.info("Waiting for promotion of one of the original standbys {0}".format(
original_standbys))
- self.wait_until_true(
- promoted,
- timeout=grace*2)
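+ # Filesystem.beacon_timeout derives its value from the mons' configured
+ # mds_beacon_grace, replacing the ad-hoc grace*2 computations above.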
+ self.wait_until_true(promoted, timeout=self.fs.beacon_timeout)
# Start the original rank 0 daemon up again, and see that it becomes a standby
self.fs.mds_restart(original_active)
if not require_active:
self.skipTest("fuse_require_active_mds is not set")
- grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
# Check it's not laggy to begin with
(original_active, ) = self.fs.get_active_names()
self.assertNotIn("laggy_since", self.fs.status().get_mds(original_active))
self.mounts[0].umount_wait()
# Control: that we can mount and unmount usually, while the cluster is healthy
- self.mounts[0].mount()
- self.mounts[0].wait_until_mounted()
+ self.mounts[0].mount_wait()
self.mounts[0].umount_wait()
# Stop the daemon processes
return True
- self.wait_until_true(laggy, grace * 2)
+ self.wait_until_true(laggy, self.fs.beacon_timeout)
with self.assertRaises(CommandFailedError):
- self.mounts[0].mount()
+ self.mounts[0].mount_wait()
def test_standby_count_wanted(self):
"""
# Need all my standbys up as well as the active daemons
self.wait_for_daemon_start()
- grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
standbys = self.mds_cluster.get_standby_daemons()
self.assertGreaterEqual(len(standbys), 1)
self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)))
# Kill a standby and check for warning
victim = standbys.pop()
self.fs.mds_stop(victim)
- log.info("waiting for insufficient standby daemon warning")
- self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)
+ self.wait_for_health("MDS_INSUFFICIENT_STANDBY", self.fs.beacon_timeout)
# Restart the standby, see that it rejoins as a standby, and check that health clears
self.fs.mds_restart(victim)
standbys = self.mds_cluster.get_standby_daemons()
self.assertGreaterEqual(len(standbys), 1)
self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)+1))
- log.info("waiting for insufficient standby daemon warning")
- self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)
+ self.wait_for_health("MDS_INSUFFICIENT_STANDBY", self.fs.beacon_timeout)
# Set it to 0
self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', '0')
self.mount_a.umount_wait()
- grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
monc_timeout = float(self.fs.get_config("mon_client_ping_timeout", service_type="mds"))
mds_0 = self.fs.get_rank(rank=0, status=status)
self.fs.rank_signal(signal.SIGSTOP, rank=0, status=status)
self.wait_until_true(
lambda: "laggy_since" in self.fs.get_rank(),
- timeout=grace * 2
+ timeout=self.fs.beacon_timeout
)
self.fs.rank_fail(rank=1)
self.fs.rank_signal(signal.SIGCONT, rank=0)
self.wait_until_true(
lambda: "laggy_since" not in self.fs.get_rank(rank=0),
- timeout=grace * 2
+ timeout=self.fs.beacon_timeout
)
# mds.b will be stuck at 'reconnect' state if snapserver gets confused
self.assertEqual(mds_0['gid'], self.fs.get_rank(rank=0)['gid'])
self.fs.rank_freeze(False, rank=0)
+ def test_connect_bootstrapping(self):
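+ # Exercise failover with mds_connect_bootstrapping enabled while
+ # mds_sleep_rank_change delays the rank-change path.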
+ self.config_set("mds", "mds_sleep_rank_change", 10000000.0)
+ self.config_set("mds", "mds_connect_bootstrapping", True)
+ self.fs.set_max_mds(2)
+ self.fs.wait_for_daemons()
+ self.fs.rank_fail(rank=0)
+ # rank 0 will get stuck in up:resolve, see https://tracker.ceph.com/issues/53194
+ self.fs.wait_for_daemons()
+
+
class TestStandbyReplay(CephFSTestCase):
+ CLIENTS_REQUIRED = 0
MDSS_REQUIRED = 4
def _confirm_no_replay(self):
time.sleep(30)
self._confirm_single_replay()
+ def test_standby_replay_damaged(self):
+ """
+ That a standby-replay daemon can correctly mark its rank damaged.
+ """
+
+ self._confirm_no_replay()
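+ # mds_standby_replay_damaged is a testing knob: the standby-replay
+ # daemon reports damage, which should mark the whole rank damaged.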
+ self.config_set("mds", "mds_standby_replay_damaged", True)
+ self.fs.set_allow_standby_replay(True)
+ self.wait_until_true(
+ lambda: len(self.fs.get_damaged()) > 0,
+ timeout=30
+ )
+ status = self.fs.status()
+ self.assertListEqual([], list(self.fs.get_ranks(status=status)))
+ self.assertListEqual([0], self.fs.get_damaged(status=status))
+
+ def test_standby_replay_disable(self):
+ """
+ That turning off allow_standby_replay fails all standby-replay daemons.
+ """
+
+ self._confirm_no_replay()
+ self.fs.set_allow_standby_replay(True)
+ time.sleep(30)
+ self._confirm_single_replay()
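+ # Clearing the flag should remove the standby-replay daemon from the
+ # rank; it rejoins as a regular standby.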
+ self.fs.set_allow_standby_replay(False)
+ self._confirm_no_replay()
+
def test_standby_replay_singleton_fail(self):
"""
That failures don't violate singleton constraint.
self.fs.mds_restart(mds_id=victim['name'])
status = self._confirm_single_replay(status=status)
+ def test_standby_replay_prepare_beacon(self):
+ """
+ That a MDSMonitor::prepare_beacon handles standby-replay daemons
+ correctly without removing the standby. (Note, usually a standby-replay
+ beacon will just be replied to by MDSMonitor::preprocess_beacon.)
+ """
+
+ status = self._confirm_no_replay()
+ self.fs.set_max_mds(1)
+ self.fs.set_allow_standby_replay(True)
+ status = self._confirm_single_replay()
+ replays = list(status.get_replays(self.fs.id))
+ self.assertEqual(len(replays), 1)
+ self.config_set('mds.'+replays[0]['name'], 'mds_inject_health_dummy', True)
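+ # The injected dummy health metric changes the beacon, so it cannot be
+ # answered by the preprocess_beacon fast path and must be handled by
+ # prepare_beacon.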
+ time.sleep(10) # for something not to happen...
+ status = self._confirm_single_replay()
+ replays2 = list(status.get_replays(self.fs.id))
+ self.assertEqual(replays[0]['gid'], replays2[0]['gid'])
+
def test_rank_stopped(self):
"""
That when a rank is STOPPED, standby replays for
that rank get torn down
"""
fs_a, fs_b = self._setup_two()
# Mount a client on fs_a
- self.mount_a.mount(mount_fs_name=fs_a.name)
+ self.mount_a.mount_wait(cephfs_name=fs_a.name)
self.mount_a.write_n_mb("pad.bin", 1)
self.mount_a.write_n_mb("test.bin", 2)
a_created_ino = self.mount_a.path_to_ino("test.bin")
self.mount_a.create_files()
# Mount a client on fs_b
- self.mount_b.mount(mount_fs_name=fs_b.name)
+ self.mount_b.mount_wait(cephfs_name=fs_b.name)
self.mount_b.write_n_mb("test.bin", 1)
b_created_ino = self.mount_b.path_to_ino("test.bin")
self.mount_b.create_files()