]>
git.proxmox.com Git - ceph.git/blob - ceph/qa/tasks/mgr/test_failover.py
4 from tasks
.mgr
.mgr_test_case
import MgrTestCase
7 log
= logging
.getLogger(__name__
)
10 class TestFailover(MgrTestCase
):
13 def test_timeout(self
):
15 That when an active mgr stops responding, a standby is promoted
16 after mon_mgr_beacon_grace.
19 # Query which mgr is active
20 original_active
= self
.mgr_cluster
.get_active_id()
21 original_standbys
= self
.mgr_cluster
.get_standby_ids()
24 self
.mgr_cluster
.mgr_stop(original_active
)
26 # Assert that the other mgr becomes active
28 lambda: self
.mgr_cluster
.get_active_id() in original_standbys
,
32 self
.mgr_cluster
.mgr_restart(original_active
)
34 lambda: original_active
in self
.mgr_cluster
.get_standby_ids(),
38 def test_timeout_nostandby(self
):
40 That when an active mgr stops responding, and no standby is
41 available, the active mgr is removed from the map anyway.
43 # Query which mgr is active
44 original_active
= self
.mgr_cluster
.get_active_id()
45 original_standbys
= self
.mgr_cluster
.get_standby_ids()
47 for s
in original_standbys
:
48 self
.mgr_cluster
.mgr_stop(s
)
49 self
.mgr_cluster
.mgr_fail(s
)
51 self
.assertListEqual(self
.mgr_cluster
.get_standby_ids(), [])
52 self
.assertEqual(self
.mgr_cluster
.get_active_id(), original_active
)
54 grace
= int(self
.mgr_cluster
.get_config("mon_mgr_beacon_grace"))
55 log
.info("Should time out in about {0} seconds".format(grace
))
57 self
.mgr_cluster
.mgr_stop(original_active
)
59 # Now wait for the mon to notice the mgr is gone and remove it
61 self
.wait_until_equal(
62 lambda: self
.mgr_cluster
.get_active_id(),
67 self
.assertListEqual(self
.mgr_cluster
.get_standby_ids(), [])
68 self
.assertEqual(self
.mgr_cluster
.get_active_id(), "")
70 def test_explicit_fail(self
):
72 That when a user explicitly fails a daemon, a standby immediately takes over.
76 # Query which mgr is active
77 original_active
= self
.mgr_cluster
.get_active_id()
78 original_standbys
= self
.mgr_cluster
.get_standby_ids()
80 self
.mgr_cluster
.mgr_fail(original_active
)
82 # A standby should take over
84 lambda: self
.mgr_cluster
.get_active_id() in original_standbys
,
88 # The one we failed should come back as a standby (it was only failed, not stopped)
91 lambda: original_active
in self
.mgr_cluster
.get_standby_ids(),
95 # We should be able to fail back over again: this exercises
96 # our re-initialization of the python runtime within
97 # a single process lifetime.
99 # Get rid of any bystander standbys so that the original_active
100 # will be selected as next active.
101 new_active
= self
.mgr_cluster
.get_active_id()
102 for daemon
in original_standbys
:
103 if daemon
!= new_active
:
104 self
.mgr_cluster
.mgr_stop(daemon
)
105 self
.mgr_cluster
.mgr_fail(daemon
)
107 self
.assertListEqual(self
.mgr_cluster
.get_standby_ids(),
110 self
.mgr_cluster
.mgr_stop(new_active
)
111 self
.mgr_cluster
.mgr_fail(new_active
)
113 self
.assertEqual(self
.mgr_cluster
.get_active_id(), original_active
)
114 self
.assertEqual(self
.mgr_cluster
.get_standby_ids(), [])
116 def test_standby_timeout(self
):
118 That when a standby daemon stops sending beacons, it is
119 removed from the list of standbys
122 original_active
= self
.mgr_cluster
.get_active_id()
123 original_standbys
= self
.mgr_cluster
.get_standby_ids()
125 victim
= original_standbys
[0]
126 self
.mgr_cluster
.mgr_stop(victim
)
128 expect_standbys
= set(original_standbys
) - {victim}
130 self
.wait_until_true(
131 lambda: set(self
.mgr_cluster
.get_standby_ids()) == expect_standbys
,
134 self
.assertEqual(self
.mgr_cluster
.get_active_id(), original_active
)