git.proxmox.com ceph.git: import quincy beta 17.1.0

diff --git a/ceph/qa/tasks/cephfs/test_failover.py b/ceph/qa/tasks/cephfs/test_failover.py
index f2032b23741086e331a7ad1e0be3f99e40fce9d4..7147807bf521cc4e770d5eae1c5affdbae601391 100644
--- a/ceph/qa/tasks/cephfs/test_failover.py
+++ b/ceph/qa/tasks/cephfs/test_failover.py
@@ -103,8 +103,6 @@ class TestClusterAffinity(CephFSTestCase):
         """
         That a vanilla standby is preferred over others with mds_join_fs set to another fs.
         """
-        # After Octopus is EOL, we can remove this setting:
-        self.fs.set_allow_multifs()
         fs2 = self.mds_cluster.newfs(name="cephfs2")
         status, target = self._verify_init()
         active = self.fs.get_active_names(status=status)[0]
@@ -129,8 +127,6 @@ class TestClusterAffinity(CephFSTestCase):
         standbys = [info['name'] for info in status.get_standbys()]
         for mds in standbys:
             self.config_set('mds.'+mds, 'mds_join_fs', 'cephfs2')
-        # After Octopus is EOL, we can remove this setting:
-        self.fs.set_allow_multifs()
         fs2 = self.mds_cluster.newfs(name="cephfs2")
         for mds in standbys:
             self._change_target_state(target, mds, {'join_fscid': fs2.id})
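
Both hunks above drop the self.fs.set_allow_multifs() call; the removed comment notes it was only kept around until Octopus reached end of life, since newer monitors no longer need the flag flipped before a second filesystem can be created. As a hedged sketch only, the helper presumably wraps the mon flag that enables multiple filesystems; the real method lives in qa/tasks/cephfs/filesystem.py and may differ:

    # Hypothetical sketch of the helper being removed here (a method of the
    # Filesystem test wrapper): flip the mon flag that allows >1 filesystem.
    def set_allow_multifs(self, yes=True):
        self.mon_manager.raw_cluster_cmd(
            'fs', 'flag', 'set', 'enable_multiple', str(yes).lower(),
            '--yes-i-really-mean-it')
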
@@ -310,27 +306,20 @@ class TestFailover(CephFSTestCase):
         in thrashing tests.
         """
 
-        # Need all my standbys up as well as the active daemons
-        self.wait_for_daemon_start()
-
         (original_active, ) = self.fs.get_active_names()
         original_standbys = self.mds_cluster.get_standby_daemons()
 
         # Kill the rank 0 daemon's physical process
         self.fs.mds_stop(original_active)
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
         # Wait until the monitor promotes his replacement
         def promoted():
-            active = self.fs.get_active_names()
-            return active and active[0] in original_standbys
+            ranks = list(self.fs.get_ranks())
+            return len(ranks) > 0 and ranks[0]['name'] in original_standbys
 
         log.info("Waiting for promotion of one of the original standbys {0}".format(
             original_standbys))
-        self.wait_until_true(
-            promoted,
-            timeout=grace*2)
+        self.wait_until_true(promoted, timeout=self.fs.beacon_timeout)
 
         # Start the original rank 0 daemon up again, see that he becomes a standby
         self.fs.mds_restart(original_active)
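
This hunk carries the two changes that repeat through the rest of the file: the promotion check now reads rank entries from the MDS map via get_ranks() rather than relying on get_active_names(), and the per-test mds_beacon_grace lookup with its grace*2 timeout is replaced by self.fs.beacon_timeout. A rough sketch of the property the tests now assume on the Filesystem helper; the actual definition in qa/tasks/cephfs/filesystem.py may compute its slack differently:

    # Sketch (assumption): one shared timeout for "the mons have noticed the
    # missed beacons", derived from the mon-side grace period plus a buffer.
    @property
    def beacon_timeout(self):
        grace = float(self.get_config("mds_beacon_grace", service_type="mon"))
        return grace * 2 + 15
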
@@ -352,8 +341,6 @@ class TestFailover(CephFSTestCase):
         if not require_active:
             self.skipTest("fuse_require_active_mds is not set")
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
         # Check it's not laggy to begin with
         (original_active, ) = self.fs.get_active_names()
         self.assertNotIn("laggy_since", self.fs.status().get_mds(original_active))
@@ -376,7 +363,7 @@ class TestFailover(CephFSTestCase):
 
             return True
 
-        self.wait_until_true(laggy, grace * 2)
+        self.wait_until_true(laggy, self.fs.beacon_timeout)
         with self.assertRaises(CommandFailedError):
             self.mounts[0].mount_wait()
 
@@ -388,8 +375,6 @@ class TestFailover(CephFSTestCase):
         # Need all my standbys up as well as the active daemons
         self.wait_for_daemon_start()
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
         standbys = self.mds_cluster.get_standby_daemons()
         self.assertGreaterEqual(len(standbys), 1)
         self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)))
@@ -397,8 +382,7 @@ class TestFailover(CephFSTestCase):
         # Kill a standby and check for warning
         victim = standbys.pop()
         self.fs.mds_stop(victim)
-        log.info("waiting for insufficient standby daemon warning")
-        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)
+        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", self.fs.beacon_timeout)
 
         # restart the standby, see that he becomes a standby, check health clears
         self.fs.mds_restart(victim)
@@ -412,8 +396,7 @@ class TestFailover(CephFSTestCase):
         standbys = self.mds_cluster.get_standby_daemons()
         self.assertGreaterEqual(len(standbys), 1)
         self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)+1))
-        log.info("waiting for insufficient standby daemon warning")
-        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)
+        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", self.fs.beacon_timeout)
 
         # Set it to 0
         self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', '0')
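
Both standby-count checks above now wait self.fs.beacon_timeout for the MDS_INSUFFICIENT_STANDBY warning, and the redundant log lines go away. For illustration only (wait_for_health is provided by the CephFS test base class and its implementation may differ), the condition being polled boils down to the warning code appearing among the active health checks:

    import json

    # Sketch (assumption): does the named health code currently appear in the
    # JSON output of "ceph health detail"?
    def health_has(fs, code):
        out = fs.mon_manager.raw_cluster_cmd('health', 'detail', '--format=json')
        return code in json.loads(out).get('checks', {})
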
@@ -429,7 +412,6 @@ class TestFailover(CephFSTestCase):
 
         self.mount_a.umount_wait()
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
         monc_timeout = float(self.fs.get_config("mon_client_ping_timeout", service_type="mds"))
 
         mds_0 = self.fs.get_rank(rank=0, status=status)
@@ -437,7 +419,7 @@ class TestFailover(CephFSTestCase):
         self.fs.rank_signal(signal.SIGSTOP, rank=0, status=status)
         self.wait_until_true(
             lambda: "laggy_since" in self.fs.get_rank(),
-            timeout=grace * 2
+            timeout=self.fs.beacon_timeout
         )
 
         self.fs.rank_fail(rank=1)
@@ -450,7 +432,7 @@ class TestFailover(CephFSTestCase):
         self.fs.rank_signal(signal.SIGCONT, rank=0)
         self.wait_until_true(
             lambda: "laggy_since" not in self.fs.get_rank(rank=0),
-            timeout=grace * 2
+            timeout=self.fs.beacon_timeout
         )
 
         # mds.b will be stuck at 'reconnect' state if snapserver gets confused
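
The SIGSTOP/SIGCONT waits in this test switch to self.fs.beacon_timeout as well. For orientation, laggy_since is a field the monitors attach to a rank's MDS-map entry once its beacons stop arriving, and get_rank() surfaces that entry. A hedged sketch of an equivalent lookup; the helper name is hypothetical and the field names are taken from "ceph fs dump --format=json":

    import json

    # Hypothetical helper: is the given rank of this filesystem currently
    # marked laggy by the mons?
    def rank_is_laggy(fs, rank=0):
        dump = json.loads(fs.mon_manager.raw_cluster_cmd('fs', 'dump', '--format=json'))
        for fs_entry in dump['filesystems']:
            if fs_entry['mdsmap']['fs_name'] != fs.name:
                continue
            for info in fs_entry['mdsmap']['info'].values():
                if info['rank'] == rank:
                    return 'laggy_since' in info
        return False
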
@@ -459,6 +441,16 @@ class TestFailover(CephFSTestCase):
         self.assertEqual(mds_0['gid'], self.fs.get_rank(rank=0)['gid'])
         self.fs.rank_freeze(False, rank=0)
 
+    def test_connect_bootstrapping(self):
+        self.config_set("mds", "mds_sleep_rank_change", 10000000.0)
+        self.config_set("mds", "mds_connect_bootstrapping", True)
+        self.fs.set_max_mds(2)
+        self.fs.wait_for_daemons()
+        self.fs.rank_fail(rank=0)
+        # rank 0 will get stuck in up:resolve, see https://tracker.ceph.com/issues/53194
+        self.fs.wait_for_daemons()
+
+
 class TestStandbyReplay(CephFSTestCase):
     CLIENTS_REQUIRED = 0
     MDSS_REQUIRED = 4
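
The new test_connect_bootstrapping case added above pins mds_sleep_rank_change very high and turns on mds_connect_bootstrapping before failing rank 0; the trailing wait_for_daemons() would time out if the rank got stuck in up:resolve, the regression tracked in the linked issue. As a reference sketch only (assuming config_set() wraps the "ceph config set" command), the two settings correspond to:

    # Equivalent mon commands for the two config_set() calls (assumption):
    #   ceph config set mds mds_sleep_rank_change 10000000.0
    #   ceph config set mds mds_connect_bootstrapping true
    self.fs.mon_manager.raw_cluster_cmd(
        'config', 'set', 'mds', 'mds_sleep_rank_change', '10000000.0')
    self.fs.mon_manager.raw_cluster_cmd(
        'config', 'set', 'mds', 'mds_connect_bootstrapping', 'true')
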