git.proxmox.com Git - ceph.git/blob - ceph/qa/tasks/mgr/test_failover.py
import 15.2.4
[ceph.git] / ceph / qa / tasks / mgr / test_failover.py
1
2 import logging
3 import json
4
5 from tasks.mgr.mgr_test_case import MgrTestCase
6
7
# Module-level logger named after this module; the tests use it for
# informational progress messages.
log = logging.getLogger(__name__)
9
10
class TestFailover(MgrTestCase):
    """
    Failover scenarios for mgr daemons: beacon timeouts of the active
    and of a standby, and explicit failure of the active.
    """
    # Interpreted by the MgrTestCase base class (not visible here);
    # presumably the minimum number of mgr daemons these tests need.
    MGRS_REQUIRED = 2

    def setUp(self):
        super(TestFailover, self).setUp()
        self.setup_mgrs()

    def _stop_and_fail(self, mgr_id):
        """
        Stop the daemon ``mgr_id`` and then mark it as failed, in that
        order.
        """
        mgrs = self.mgr_cluster
        mgrs.mgr_stop(mgr_id)
        mgrs.mgr_fail(mgr_id)

    def test_timeout(self):
        """
        That a standby gets promoted, after mon_mgr_beacon_grace, once
        the active mgr stops responding.
        """
        mgrs = self.mgr_cluster

        # Record who holds which role before disturbing anything
        active_before = mgrs.get_active_id()
        standbys_before = mgrs.get_standby_ids()

        # Only stop the active daemon; it is not explicitly failed
        mgrs.mgr_stop(active_before)

        def standby_promoted():
            return mgrs.get_active_id() in standbys_before

        # One of the former standbys has to take over
        self.wait_until_true(standby_promoted, timeout=60)

        # Restart the stopped daemon and expect it back as a standby
        mgrs.mgr_restart(active_before)

        def old_active_is_standby():
            return active_before in mgrs.get_standby_ids()

        self.wait_until_true(old_active_is_standby, timeout=10)

    def test_timeout_nostandby(self):
        """
        That the active mgr is still dropped from the map when it stops
        responding and there is no standby left to replace it.
        """
        mgrs = self.mgr_cluster

        # Record who holds which role before disturbing anything
        active_before = mgrs.get_active_id()
        standbys_before = mgrs.get_standby_ids()

        # Take every standby out of the picture
        for standby in standbys_before:
            self._stop_and_fail(standby)

        self.assertListEqual(mgrs.get_standby_ids(), [])
        self.assertEqual(mgrs.get_active_id(), active_before)

        grace = int(mgrs.get_config("mon_mgr_beacon_grace"))
        log.info("Should time out in about {0} seconds".format(grace))

        mgrs.mgr_stop(active_before)

        # Give the mon up to twice the grace period to notice that the
        # mgr is gone and to report an empty active id.
        self.wait_until_equal(
            mgrs.get_active_id,
            "",
            timeout=grace * 2
        )

        self.assertListEqual(mgrs.get_standby_ids(), [])
        self.assertEqual(mgrs.get_active_id(), "")

    def test_explicit_fail(self):
        """
        That a standby takes over when the user explicitly fails the
        active daemon, and that failing back again works too.
        :return:
        """
        mgrs = self.mgr_cluster

        # Record who holds which role before disturbing anything
        active_before = mgrs.get_active_id()
        standbys_before = mgrs.get_standby_ids()

        mgrs.mgr_fail(active_before)

        def standby_promoted():
            return mgrs.get_active_id() in standbys_before

        # One of the former standbys has to take over
        self.wait_until_true(standby_promoted, timeout=60)

        def old_active_is_standby():
            return active_before in mgrs.get_standby_ids()

        # The failed daemon was never stopped, so it is expected to
        # reappear as a standby
        self.wait_until_true(old_active_is_standby, timeout=10)

        # Every daemon must report fully populated metadata
        # (regression test for http://tracker.ceph.com/issues/21260)
        raw_meta = self.mgr_cluster.mon_manager.raw_cluster_cmd(
            "mgr", "metadata")
        meta = json.loads(raw_meta)
        id_to_meta = {entry['name']: entry for entry in meta}
        for mgr_id in [active_before] + standbys_before:
            self.assertIn(mgr_id, id_to_meta)
            self.assertIn('ceph_version', id_to_meta[mgr_id])

        # Failing back over again exercises re-initialization of the
        # python runtime within a single process lifetime.

        # Remove the standbys that did not get promoted, so that the
        # original active is the only candidate for the next promotion.
        promoted = mgrs.get_active_id()
        bystanders = [d for d in standbys_before if d != promoted]
        for bystander in bystanders:
            self._stop_and_fail(bystander)

        self.assertListEqual(mgrs.get_standby_ids(),
                             [active_before])

        self._stop_and_fail(promoted)

        self.assertEqual(mgrs.get_active_id(), active_before)
        self.assertEqual(mgrs.get_standby_ids(), [])

    def test_standby_timeout(self):
        """
        That a standby daemon which stops sending beacons disappears
        from the list of standbys.
        :return:
        """
        mgrs = self.mgr_cluster

        active_before = mgrs.get_active_id()
        standbys_before = mgrs.get_standby_ids()

        # Stop the first standby only; it is not explicitly failed
        victim = standbys_before[0]
        mgrs.mgr_stop(victim)

        # Everything that was a standby before, minus the victim
        expect_standbys = set(standbys_before)
        expect_standbys.discard(victim)

        def victim_dropped():
            return set(mgrs.get_standby_ids()) == expect_standbys

        self.wait_until_true(victim_dropped, timeout=60)

        # The active daemon must not have changed in the meantime
        self.assertEqual(mgrs.get_active_id(), active_before)