git.proxmox.com Git - ceph.git/blob - ceph/qa/tasks/mgr/test_failover.py
bump version to 12.2.0-pve1
[ceph.git] / ceph / qa / tasks / mgr / test_failover.py
1
2 import logging
3
4 from tasks.mgr.mgr_test_case import MgrTestCase
5
6
# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)
8
9
class TestFailover(MgrTestCase):
    """
    Failover tests for the mgr daemons.

    Each test stops and/or explicitly fails one or more daemons through
    self.mgr_cluster and then checks which daemon ids are reported as
    active and as standby.
    """

    # NOTE(review): presumably read by MgrTestCase to require at least two
    # mgr daemons (one active plus one standby) -- confirm in mgr_test_case.
    MGRS_REQUIRED = 2

    def test_timeout(self):
        """
        That when an active mgr stops responding, a standby is promoted
        after mon_mgr_beacon_grace.
        """
        cluster = self.mgr_cluster

        # Record who holds which role before anything is disturbed
        original_active = cluster.get_active_id()
        original_standbys = cluster.get_standby_ids()

        def standby_promoted():
            return cluster.get_active_id() in original_standbys

        def old_active_is_standby():
            return original_active in cluster.get_standby_ids()

        # Stop the active daemon; one of the former standbys has to be
        # reported as active within 60 seconds
        cluster.mgr_stop(original_active)
        self.wait_until_true(standby_promoted, timeout=60)

        # Once restarted, the former active should show up among the
        # standbys within 10 seconds
        cluster.mgr_restart(original_active)
        self.wait_until_true(old_active_is_standby, timeout=10)

    def test_timeout_nostandby(self):
        """
        That when an active mgr stops responding, and no standby is
        available, the active mgr is removed from the map anyway.
        """
        cluster = self.mgr_cluster

        # Record who holds which role before anything is disturbed
        original_active = cluster.get_active_id()
        original_standbys = cluster.get_standby_ids()

        # Stop and fail every standby so that nothing is left to promote
        for standby_id in original_standbys:
            cluster.mgr_stop(standby_id)
            cluster.mgr_fail(standby_id)

        # Only the original active daemon should remain
        self.assertListEqual(cluster.get_standby_ids(), [])
        self.assertEqual(cluster.get_active_id(), original_active)

        grace = int(cluster.get_config("mon_mgr_beacon_grace"))
        log.info("Should time out in about {0} seconds".format(grace))

        cluster.mgr_stop(original_active)

        def current_active():
            return cluster.get_active_id()

        # Wait for the mon to notice the mgr is gone and drop it from the
        # map (an empty active id); allow twice the configured grace.
        self.wait_until_equal(current_active, "", timeout=grace * 2)

        # Nobody is left, neither as standby nor as active
        self.assertListEqual(cluster.get_standby_ids(), [])
        self.assertEqual(cluster.get_active_id(), "")

    def test_explicit_fail(self):
        """
        That when a user explicitly fails a daemon, a standby immediately
        replaces it.
        """
        cluster = self.mgr_cluster

        # Record who holds which role before anything is disturbed
        original_active = cluster.get_active_id()
        original_standbys = cluster.get_standby_ids()

        def standby_promoted():
            return cluster.get_active_id() in original_standbys

        def old_active_is_standby():
            return original_active in cluster.get_standby_ids()

        cluster.mgr_fail(original_active)

        # One of the former standbys should take over
        self.wait_until_true(standby_promoted, timeout=60)

        # The daemon we failed was never stopped, so it should come back
        # as a standby
        self.wait_until_true(old_active_is_standby, timeout=10)

        # We should be able to fail back over again: this exercises
        # our re-initialization of the python runtime within
        # a single process lifetime.

        # Remove every bystander standby first, so that original_active
        # is the one selected as the next active.
        new_active = cluster.get_active_id()
        bystanders = [d for d in original_standbys if d != new_active]
        for bystander in bystanders:
            cluster.mgr_stop(bystander)
            cluster.mgr_fail(bystander)

        self.assertListEqual(cluster.get_standby_ids(),
                             [original_active])

        # Take out the current active; the original one must take over
        cluster.mgr_stop(new_active)
        cluster.mgr_fail(new_active)

        self.assertEqual(cluster.get_active_id(), original_active)
        self.assertEqual(cluster.get_standby_ids(), [])

    def test_standby_timeout(self):
        """
        That when a standby daemon stops sending beacons, it is
        removed from the list of standbys.
        """
        cluster = self.mgr_cluster

        original_active = cluster.get_active_id()
        original_standbys = cluster.get_standby_ids()

        # The first listed standby is the one we stop
        victim = original_standbys[0]
        cluster.mgr_stop(victim)

        # Everything that was a standby before, minus the victim
        expect_standbys = set(original_standbys)
        expect_standbys.discard(victim)

        def victim_dropped():
            return set(cluster.get_standby_ids()) == expect_standbys

        self.wait_until_true(victim_dropped, timeout=60)

        # Losing a standby must not change which daemon is active
        self.assertEqual(cluster.get_active_id(), original_active)