# ceph/qa/tasks/mgr/test_failover.py
#
# Tests for ceph-mgr failover behaviour: active/standby promotion,
# beacon-grace timeouts, and explicit `mgr fail`.
import json
import logging

from tasks.mgr.mgr_test_case import MgrTestCase


# Module-level logger, keyed to this module's dotted path.
log = logging.getLogger(__name__)
class TestFailover(MgrTestCase):
    """
    Exercise ceph-mgr failover: promotion of a standby when the active
    daemon stops, dies, or is explicitly failed, and removal of silent
    standbys from the mgr map.
    """

    def setUp(self):
        super(TestFailover, self).setUp()

    def test_timeout(self):
        """
        That when an active mgr stops responding, a standby is promoted
        after mon_mgr_beacon_grace.
        """
        # Query which mgr is active
        original_active = self.mgr_cluster.get_active_id()
        original_standbys = self.mgr_cluster.get_standby_ids()

        # Stop the active daemon so it stops sending beacons.
        self.mgr_cluster.mgr_stop(original_active)

        # Assert that the other mgr becomes active
        # NOTE(review): the timeout value was lost in the garbled source;
        # 60s is a reconstruction — confirm against upstream.
        self.wait_until_true(
            lambda: self.mgr_cluster.get_active_id() in original_standbys,
            timeout=60)

        # Bring the old active back; it should rejoin as a standby.
        self.mgr_cluster.mgr_restart(original_active)
        self.wait_until_true(
            lambda: original_active in self.mgr_cluster.get_standby_ids(),
            timeout=10)

    def test_timeout_nostandby(self):
        """
        That when an active mgr stop responding, and no standby is
        available, the active mgr is removed from the map anyway.
        """
        # Query which mgr is active
        original_active = self.mgr_cluster.get_active_id()
        original_standbys = self.mgr_cluster.get_standby_ids()

        # Kill off every standby so nothing can be promoted.
        for s in original_standbys:
            self.mgr_cluster.mgr_stop(s)
            self.mgr_cluster.mgr_fail(s)

        self.assertListEqual(self.mgr_cluster.get_standby_ids(), [])
        self.assertEqual(self.mgr_cluster.get_active_id(), original_active)

        grace = int(self.mgr_cluster.get_config("mon_mgr_beacon_grace"))
        log.info("Should time out in about {0} seconds".format(grace))

        self.mgr_cluster.mgr_stop(original_active)

        # Now wait for the mon to notice the mgr is gone and remove it
        # from the map.
        # NOTE(review): timeout reconstructed as twice the beacon grace —
        # confirm against upstream.
        self.wait_until_equal(
            lambda: self.mgr_cluster.get_active_id(),
            "",
            timeout=grace * 2)

        # With no standbys and no beacons, the map should be empty.
        self.assertListEqual(self.mgr_cluster.get_standby_ids(), [])
        self.assertEqual(self.mgr_cluster.get_active_id(), "")

    def test_explicit_fail(self):
        """
        That when a user explicitly fails a daemon, a standby immediately
        replaces it.
        """
        # Query which mgr is active
        original_active = self.mgr_cluster.get_active_id()
        original_standbys = self.mgr_cluster.get_standby_ids()

        self.mgr_cluster.mgr_fail(original_active)

        # A standby should take over
        # NOTE(review): timeout values in this method were lost in the
        # garbled source; reconstructed — confirm against upstream.
        self.wait_until_true(
            lambda: self.mgr_cluster.get_active_id() in original_standbys,
            timeout=60)

        # The one we failed should come back as a standby (he isn't
        # really dead, only marked failed in the map)
        self.wait_until_true(
            lambda: original_active in self.mgr_cluster.get_standby_ids(),
            timeout=10)

        # Both daemons should have fully populated metadata
        # (regression test for http://tracker.ceph.com/issues/21260)
        meta = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd(
            "mgr", "metadata"))
        id_to_meta = dict([(i['name'], i) for i in meta])
        for i in [original_active] + original_standbys:
            self.assertIn(i, id_to_meta)
            self.assertIn('ceph_version', id_to_meta[i])

        # We should be able to fail back over again: the exercises
        # our re-initialization of the python runtime within
        # a single process lifetime.

        # Get rid of any bystander standbys so that the original_active
        # will be selected as next active.
        new_active = self.mgr_cluster.get_active_id()
        for daemon in original_standbys:
            if daemon != new_active:
                self.mgr_cluster.mgr_stop(daemon)
                self.mgr_cluster.mgr_fail(daemon)

        self.assertListEqual(self.mgr_cluster.get_standby_ids(),
                             [original_active])

        self.mgr_cluster.mgr_stop(new_active)
        self.mgr_cluster.mgr_fail(new_active)

        # With everything else gone, the original active must be re-elected.
        self.assertEqual(self.mgr_cluster.get_active_id(), original_active)
        self.assertEqual(self.mgr_cluster.get_standby_ids(), [])

    def test_standby_timeout(self):
        """
        That when a standby daemon stops sending beacons, it is
        removed from the list of standbys
        """
        original_active = self.mgr_cluster.get_active_id()
        original_standbys = self.mgr_cluster.get_standby_ids()

        victim = original_standbys[0]
        self.mgr_cluster.mgr_stop(victim)

        expect_standbys = set(original_standbys) - {victim}

        # NOTE(review): timeout reconstructed; confirm against upstream.
        self.wait_until_true(
            lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
            timeout=60)
        # The active daemon must be unaffected by a standby's disappearance.
        self.assertEqual(self.mgr_cluster.get_active_id(), original_active)