import time
import signal
import json
import logging
from unittest import case, SkipTest

from cephfs_test_case import CephFSTestCase
from teuthology.exceptions import CommandFailedError
from teuthology import misc as teuthology
from tasks.cephfs.fuse_mount import FuseMount

log = logging.getLogger(__name__)


class TestFailover(CephFSTestCase):
    CLIENTS_REQUIRED = 1
    MDSS_REQUIRED = 2

    def test_simple(self):
        """
        That when the active MDS is killed, a standby MDS is promoted into
        its rank after the grace period.

        This is just a simple unit test, the harder cases are covered
        in thrashing tests.
        """

        # Need all my standbys up as well as the active daemons
        self.wait_for_daemon_start()

        (original_active, ) = self.fs.get_active_names()
        original_standbys = self.mds_cluster.get_standby_daemons()

        # Kill the rank 0 daemon's physical process
        self.fs.mds_stop(original_active)

        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))

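        # mds_beacon_grace is how long the mons tolerate missing MDS beacons
        # before declaring a daemon laggy and promoting a standby, so grace*2
        # is a generous bound on how long the promotion should take.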
        # Wait until the monitor promotes its replacement
        def promoted():
            active = self.fs.get_active_names()
            return active and active[0] in original_standbys

        log.info("Waiting for promotion of one of the original standbys {0}".format(
            original_standbys))
        self.wait_until_true(
            promoted,
            timeout=grace*2)

        # Start the original rank 0 daemon up again, see that it becomes a standby
        self.fs.mds_restart(original_active)
        self.wait_until_true(
            lambda: original_active in self.mds_cluster.get_standby_daemons(),
            timeout=60  # Approximately long enough for MDS to start and mon to notice
        )

    def test_client_abort(self):
        """
        That a client will respect fuse_require_active_mds and error out
        when the cluster appears to be unavailable.
        """

        if not isinstance(self.mount_a, FuseMount):
            raise SkipTest("Requires FUSE client to inject client metadata")

        require_active = self.fs.get_config("fuse_require_active_mds", service_type="mon").lower() == "true"
        if not require_active:
            raise case.SkipTest("fuse_require_active_mds is not set")

        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))

        # Check it's not laggy to begin with
        (original_active, ) = self.fs.get_active_names()
        self.assertNotIn("laggy_since", self.fs.mon_manager.get_mds_status(original_active))

        self.mounts[0].umount_wait()

        # Control: check that we can mount and unmount normally while the cluster is healthy
        self.mounts[0].mount()
        self.mounts[0].wait_until_mounted()
        self.mounts[0].umount_wait()

        # Stop the daemon processes
        self.fs.mds_stop()

        # Wait for everyone to go laggy
        def laggy():
            mdsmap = self.fs.get_mds_map()
            for info in mdsmap['info'].values():
                if "laggy_since" not in info:
                    return False

            return True

        self.wait_until_true(laggy, grace * 2)
        with self.assertRaises(CommandFailedError):
            self.mounts[0].mount()

    def test_standby_count_wanted(self):
        """
        That cluster health warnings are generated when insufficient standbys are available.
        """

        # Need all my standbys up as well as the active daemons
        self.wait_for_daemon_start()

        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))

        standbys = self.mds_cluster.get_standby_daemons()
        self.assertGreaterEqual(len(standbys), 1)
        self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)))

        # Kill a standby and check for warning
        victim = standbys.pop()
        self.fs.mds_stop(victim)
        log.info("waiting for insufficient standby daemon warning")
        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)

        # restart the standby, see that it becomes a standby, check health clears
        self.fs.mds_restart(victim)
        self.wait_until_true(
            lambda: victim in self.mds_cluster.get_standby_daemons(),
            timeout=60  # Approximately long enough for MDS to start and mon to notice
        )
        self.wait_for_health_clear(timeout=30)

        # Set it to one greater than the number of standbys ever seen
        standbys = self.mds_cluster.get_standby_daemons()
        self.assertGreaterEqual(len(standbys), 1)
        self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)+1))
        log.info("waiting for insufficient standby daemon warning")
        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)

        # Set it to 0, which disables the standby count check
        self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', '0')
        self.wait_for_health_clear(timeout=30)

    def test_discontinuous_mdsmap(self):
        """
        That a discontinuous mdsmap does not affect failover.
        See http://tracker.ceph.com/issues/24856.
        """
        mds_ids = sorted(self.mds_cluster.mds_ids)
        mds_a, mds_b = mds_ids[0:2]
        # Assign MDSs to fixed ranks, to prevent a standby MDS from replacing the frozen MDS
        rank = 0
        for mds_id in mds_ids:
            self.set_conf("mds.{0}".format(mds_id), "mds_standby_for_rank", str(rank))
            rank += 1
        self.mds_cluster.mds_restart()
        self.fs.wait_for_daemons()

        self.fs.set_max_mds(2)
        self.fs.wait_for_state('up:active', rank=1)

        # Drop the 'export prep' message, so the import stays in the 'discovered' state
        self.fs.mds_asok(['config', 'set', 'mds_inject_migrator_message_loss', '82'], mds_id=mds_b)

        self.mount_a.run_shell(["mkdir", "a"])
        self.mount_a.setfattr("a", "ceph.dir.pin", "1")
        self.mount_a.umount_wait()

        # Should be long enough for the export to start
        time.sleep(30)

        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
        monc_timeout = float(self.fs.get_config("mon_client_ping_timeout", service_type="mds"))

        # Freeze mds_b
        self.mds_cluster.mds_signal(mds_b, signal.SIGSTOP)
        self.wait_until_true(
            lambda: "laggy_since" in self.fs.mon_manager.get_mds_status(mds_b),
            timeout=grace * 2
        )

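        # Restart mds_a (rank 0); it will wait in up:resolve because it needs a
        # resolve message from rank 1 (the frozen mds_b) before it can proceed.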
        self.mds_cluster.mds_restart(mds_a)
        self.fs.wait_for_state('up:resolve', rank=0, timeout=30)

        # Make sure mds_b's monitor connection gets reset
        time.sleep(monc_timeout * 2)

        # Unfreeze mds_b; it will receive a discontinuous mdsmap
        self.mds_cluster.mds_signal(mds_b, signal.SIGCONT)
        self.wait_until_true(
            lambda: "laggy_since" not in self.fs.mon_manager.get_mds_status(mds_b),
            timeout=grace * 2
        )

        # Check that mds_b sends a 'resolve' message to mds_a; if not, mds_a can't become active
        self.fs.wait_for_state('up:active', rank=0, timeout=30)

class TestStandbyReplay(CephFSTestCase):
    MDSS_REQUIRED = 4
    REQUIRE_FILESYSTEM = False

    def set_standby_for(self, leader, follower, replay):
        self.set_conf("mds.{0}".format(follower), "mds_standby_for_name", leader)
        if replay:
            self.set_conf("mds.{0}".format(follower), "mds_standby_replay", "true")

    def get_info_by_name(self, mds_name):
        status = self.mds_cluster.status()
        info = status.get_mds(mds_name)
        if info is None:
            log.warn(str(status))
            raise RuntimeError("MDS '{0}' not found".format(mds_name))
        else:
            return info

    def test_standby_replay_unused(self):
        # Pick out exactly 3 daemons to be run during test
        use_daemons = sorted(self.mds_cluster.mds_ids[0:3])
        mds_a, mds_b, mds_c = use_daemons
        log.info("Using MDS daemons: {0}".format(use_daemons))

        # B and C should both follow A, but only one will
        # really get into standby replay state.
        self.set_standby_for(mds_a, mds_b, True)
        self.set_standby_for(mds_a, mds_c, True)

        # Create FS and start A
        fs_a = self.mds_cluster.newfs("alpha")
        self.mds_cluster.mds_restart(mds_a)
        fs_a.wait_for_daemons()
        self.assertEqual(fs_a.get_active_names(), [mds_a])

        # Start B; it should go into standby replay
        self.mds_cluster.mds_restart(mds_b)
        self.wait_for_daemon_start([mds_b])
        info_b = self.get_info_by_name(mds_b)
        self.assertEqual(info_b['state'], "up:standby-replay")
        self.assertEqual(info_b['standby_for_name'], mds_a)
        self.assertEqual(info_b['rank'], 0)

        # Start C; it should go into standby (*not* replay)
        self.mds_cluster.mds_restart(mds_c)
        self.wait_for_daemon_start([mds_c])
        info_c = self.get_info_by_name(mds_c)
        self.assertEqual(info_c['state'], "up:standby")
        self.assertEqual(info_c['standby_for_name'], mds_a)
        self.assertEqual(info_c['rank'], -1)

        # Kill B; C should go into standby replay
        self.mds_cluster.mds_stop(mds_b)
        self.mds_cluster.mds_fail(mds_b)
        self.wait_until_equal(
            lambda: self.get_info_by_name(mds_c)['state'],
            "up:standby-replay",
            60)
        info_c = self.get_info_by_name(mds_c)
        self.assertEqual(info_c['state'], "up:standby-replay")
        self.assertEqual(info_c['standby_for_name'], mds_a)
        self.assertEqual(info_c['rank'], 0)

    def test_standby_failure(self):
        """
        That the failure of a standby-replay daemon happens cleanly
        and doesn't interrupt anything else.
        """
        # Pick out exactly 2 daemons to be run during test
        use_daemons = sorted(self.mds_cluster.mds_ids[0:2])
        mds_a, mds_b = use_daemons
        log.info("Using MDS daemons: {0}".format(use_daemons))

        # Configure a pair of MDSs that are standby for each other
        self.set_standby_for(mds_a, mds_b, True)
        self.set_standby_for(mds_b, mds_a, False)

        # Create FS alpha and get mds_a to come up as active
        fs_a = self.mds_cluster.newfs("alpha")
        self.mds_cluster.mds_restart(mds_a)
        fs_a.wait_for_daemons()
        self.assertEqual(fs_a.get_active_names(), [mds_a])

        # Start the standby
        self.mds_cluster.mds_restart(mds_b)
        self.wait_for_daemon_start([mds_b])

        # See the standby come up as the correct rank
        info_b = self.get_info_by_name(mds_b)
        self.assertEqual(info_b['state'], "up:standby-replay")
        self.assertEqual(info_b['standby_for_name'], mds_a)
        self.assertEqual(info_b['rank'], 0)

        # Kill the standby
        self.mds_cluster.mds_stop(mds_b)
        self.mds_cluster.mds_fail(mds_b)

        # See that the standby is gone and the active remains
        self.assertEqual(fs_a.get_active_names(), [mds_a])
        mds_map = fs_a.get_mds_map()
        self.assertEqual(len(mds_map['info']), 1)
        self.assertEqual(mds_map['failed'], [])
        self.assertEqual(mds_map['damaged'], [])
        self.assertEqual(mds_map['stopped'], [])

    def test_rank_stopped(self):
        """
        That when a rank is STOPPED, standby replays for
        that rank get torn down
        """
        # Pick out exactly 4 daemons to be run during test
        use_daemons = sorted(self.mds_cluster.mds_ids[0:4])
        mds_a, mds_b, mds_a_s, mds_b_s = use_daemons
        log.info("Using MDS daemons: {0}".format(use_daemons))

        # a and b both get a standby
        self.set_standby_for(mds_a, mds_a_s, True)
        self.set_standby_for(mds_b, mds_b_s, True)

        # Create FS alpha and get mds_a to come up as active
        fs_a = self.mds_cluster.newfs("alpha")
        fs_a.set_max_mds(2)

        self.mds_cluster.mds_restart(mds_a)
        self.wait_until_equal(lambda: fs_a.get_active_names(), [mds_a], 30)
        self.mds_cluster.mds_restart(mds_b)
        fs_a.wait_for_daemons()
        self.assertEqual(sorted(fs_a.get_active_names()), [mds_a, mds_b])

        # Start the standbys
        self.mds_cluster.mds_restart(mds_b_s)
        self.wait_for_daemon_start([mds_b_s])
        self.mds_cluster.mds_restart(mds_a_s)
        self.wait_for_daemon_start([mds_a_s])
        info_b_s = self.get_info_by_name(mds_b_s)
        self.assertEqual(info_b_s['state'], "up:standby-replay")
        info_a_s = self.get_info_by_name(mds_a_s)
        self.assertEqual(info_a_s['state'], "up:standby-replay")

        # Shrink the cluster
        fs_a.set_max_mds(1)
        fs_a.mon_manager.raw_cluster_cmd("mds", "stop", "{0}:1".format(fs_a.name))
        self.wait_until_equal(
            lambda: fs_a.get_active_names(), [mds_a],
            60
        )

        # Both 'b' and 'b_s' should go back to being standbys
        self.wait_until_equal(
            lambda: self.mds_cluster.get_standby_daemons(), {mds_b, mds_b_s},
            60
        )


class TestMultiFilesystems(CephFSTestCase):
    CLIENTS_REQUIRED = 2
    MDSS_REQUIRED = 4

    # We'll create our own filesystems and start our own daemons
    REQUIRE_FILESYSTEM = False

    def setUp(self):
        super(TestMultiFilesystems, self).setUp()
        self.mds_cluster.mon_manager.raw_cluster_cmd("fs", "flag", "set",
                                                     "enable_multiple", "true",
                                                     "--yes-i-really-mean-it")

    def _setup_two(self):
        fs_a = self.mds_cluster.newfs("alpha")
        fs_b = self.mds_cluster.newfs("bravo")

        self.mds_cluster.mds_restart()

        # Wait for both filesystems to go healthy
        fs_a.wait_for_daemons()
        fs_b.wait_for_daemons()

        # Reconfigure client auth caps
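        # Each filesystem has its own data pool, so the caps below grant rw on
        # both pools; otherwise a client could mount one fs but not the other.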
        for mount in self.mounts:
            self.mds_cluster.mon_manager.raw_cluster_cmd_result(
                'auth', 'caps', "client.{0}".format(mount.client_id),
                'mds', 'allow',
                'mon', 'allow r',
                'osd', 'allow rw pool={0}, allow rw pool={1}'.format(
                    fs_a.get_data_pool_name(), fs_b.get_data_pool_name()))

        return fs_a, fs_b

    def test_clients(self):
        fs_a, fs_b = self._setup_two()

        # Mount a client on fs_a
        self.mount_a.mount(mount_fs_name=fs_a.name)
        self.mount_a.write_n_mb("pad.bin", 1)
        self.mount_a.write_n_mb("test.bin", 2)
        a_created_ino = self.mount_a.path_to_ino("test.bin")
        self.mount_a.create_files()

        # Mount a client on fs_b
        self.mount_b.mount(mount_fs_name=fs_b.name)
        self.mount_b.write_n_mb("test.bin", 1)
        b_created_ino = self.mount_b.path_to_ino("test.bin")
        self.mount_b.create_files()

        # Check that a non-default filesystem mount survives an MDS
        # failover (i.e. that map subscription is continuous, not
        # just the first time), reproduces #16022
        old_fs_b_mds = fs_b.get_active_names()[0]
        self.mds_cluster.mds_stop(old_fs_b_mds)
        self.mds_cluster.mds_fail(old_fs_b_mds)
        fs_b.wait_for_daemons()
        background = self.mount_b.write_background()
        # Raise exception if the write doesn't finish (i.e. if client
        # has not kept up with MDS failure)
        try:
            self.wait_until_true(lambda: background.finished, timeout=30)
        except RuntimeError:
            # The mount is stuck, we'll have to force it to fail cleanly
            background.stdin.close()
            self.mount_b.umount_wait(force=True)
            raise

        self.mount_a.umount_wait()
        self.mount_b.umount_wait()

        # See that the client's files went into the correct pool
        self.assertTrue(fs_a.data_objects_present(a_created_ino, 1024 * 1024))
        self.assertTrue(fs_b.data_objects_present(b_created_ino, 1024 * 1024))

    def test_standby(self):
        fs_a, fs_b = self._setup_two()

        # Assert that the remaining two MDS daemons are now standbys
        a_daemons = fs_a.get_active_names()
        b_daemons = fs_b.get_active_names()
        self.assertEqual(len(a_daemons), 1)
        self.assertEqual(len(b_daemons), 1)
        original_a = a_daemons[0]
        original_b = b_daemons[0]
        expect_standby_daemons = set(self.mds_cluster.mds_ids) - (set(a_daemons) | set(b_daemons))

        # Need all my standbys up as well as the active daemons
        self.wait_for_daemon_start()
        self.assertEqual(expect_standby_daemons, self.mds_cluster.get_standby_daemons())

        # Kill fs_a's active MDS, see a standby take over
        self.mds_cluster.mds_stop(original_a)
        self.mds_cluster.mon_manager.raw_cluster_cmd("mds", "fail", original_a)
        self.wait_until_equal(lambda: len(fs_a.get_active_names()), 1, 30,
                              reject_fn=lambda v: v > 1)
        # Assert that it's a *different* daemon that has now appeared in the map for fs_a
        self.assertNotEqual(fs_a.get_active_names()[0], original_a)

        # Kill fs_b's active MDS, see a standby take over
        self.mds_cluster.mds_stop(original_b)
        self.mds_cluster.mon_manager.raw_cluster_cmd("mds", "fail", original_b)
        self.wait_until_equal(lambda: len(fs_b.get_active_names()), 1, 30,
                              reject_fn=lambda v: v > 1)
        # Assert that it's a *different* daemon that has now appeared in the map for fs_b
        self.assertNotEqual(fs_b.get_active_names()[0], original_b)

        # Both of the original active daemons should be gone, and all standbys used up
        self.assertEqual(self.mds_cluster.get_standby_daemons(), set())

        # Restart the ones I killed, see them reappear as standbys
        self.mds_cluster.mds_restart(original_a)
        self.mds_cluster.mds_restart(original_b)
        self.wait_until_true(
            lambda: {original_a, original_b} == self.mds_cluster.get_standby_daemons(),
            timeout=30
        )

    def test_grow_shrink(self):
        # Usual setup...
        fs_a, fs_b = self._setup_two()

        # Increase max_mds on fs_b, see a standby take up the role
        fs_b.set_max_mds(2)
        self.wait_until_equal(lambda: len(fs_b.get_active_names()), 2, 30,
                              reject_fn=lambda v: v > 2 or v < 1)

        # Increase max_mds on fs_a, see a standby take up the role
        fs_a.set_max_mds(2)
        self.wait_until_equal(lambda: len(fs_a.get_active_names()), 2, 30,
                              reject_fn=lambda v: v > 2 or v < 1)

        # Shrink fs_b back to 1, see a daemon go back to standby
        fs_b.set_max_mds(1)
        fs_b.deactivate(1)
        self.wait_until_equal(lambda: len(fs_b.get_active_names()), 1, 30,
                              reject_fn=lambda v: v > 2 or v < 1)

        # Grow fs_a up to 3, see the former fs_b daemon join it.
        fs_a.set_max_mds(3)
        self.wait_until_equal(lambda: len(fs_a.get_active_names()), 3, 60,
                              reject_fn=lambda v: v > 3 or v < 2)

    def test_standby_for_name(self):
        # Pick out exactly 4 daemons to be run during test
        use_daemons = sorted(self.mds_cluster.mds_ids[0:4])
        mds_a, mds_b, mds_c, mds_d = use_daemons
        log.info("Using MDS daemons: {0}".format(use_daemons))

        def set_standby_for(leader, follower, replay):
            self.set_conf("mds.{0}".format(follower), "mds_standby_for_name", leader)
            if replay:
                self.set_conf("mds.{0}".format(follower), "mds_standby_replay", "true")

        # Configure two pairs of MDSs that are standby for each other
        set_standby_for(mds_a, mds_b, True)
        set_standby_for(mds_b, mds_a, False)
        set_standby_for(mds_c, mds_d, True)
        set_standby_for(mds_d, mds_c, False)

        # Create FS alpha and get mds_a to come up as active
        fs_a = self.mds_cluster.newfs("alpha")
        self.mds_cluster.mds_restart(mds_a)
        fs_a.wait_for_daemons()
        self.assertEqual(fs_a.get_active_names(), [mds_a])

        # Create FS bravo and get mds_c to come up as active
        fs_b = self.mds_cluster.newfs("bravo")
        self.mds_cluster.mds_restart(mds_c)
        fs_b.wait_for_daemons()
        self.assertEqual(fs_b.get_active_names(), [mds_c])

        # Start the standbys
        self.mds_cluster.mds_restart(mds_b)
        self.mds_cluster.mds_restart(mds_d)
        self.wait_for_daemon_start([mds_b, mds_d])

        def get_info_by_name(fs, mds_name):
            mds_map = fs.get_mds_map()
            for gid_str, info in mds_map['info'].items():
                if info['name'] == mds_name:
                    return info

            log.warn(json.dumps(mds_map, indent=2))
            raise RuntimeError("MDS '{0}' not found in filesystem MDSMap".format(mds_name))

        # See both standbys come up as standby replay for the correct ranks
        # mds_b should be in filesystem alpha following mds_a
        info_b = get_info_by_name(fs_a, mds_b)
        self.assertEqual(info_b['state'], "up:standby-replay")
        self.assertEqual(info_b['standby_for_name'], mds_a)
        self.assertEqual(info_b['rank'], 0)
        # mds_d should be in filesystem bravo following mds_c
        info_d = get_info_by_name(fs_b, mds_d)
        self.assertEqual(info_d['state'], "up:standby-replay")
        self.assertEqual(info_d['standby_for_name'], mds_c)
        self.assertEqual(info_d['rank'], 0)

        # Kill both active daemons
        self.mds_cluster.mds_stop(mds_a)
        self.mds_cluster.mds_fail(mds_a)
        self.mds_cluster.mds_stop(mds_c)
        self.mds_cluster.mds_fail(mds_c)

        # Wait for standbys to take over
        fs_a.wait_for_daemons()
        self.assertEqual(fs_a.get_active_names(), [mds_b])
        fs_b.wait_for_daemons()
        self.assertEqual(fs_b.get_active_names(), [mds_d])

        # Start the original active daemons up again
        self.mds_cluster.mds_restart(mds_a)
        self.mds_cluster.mds_restart(mds_c)
        self.wait_for_daemon_start([mds_a, mds_c])

        self.assertEqual(set(self.mds_cluster.get_standby_daemons()),
                         {mds_a, mds_c})

    def test_standby_for_rank(self):
        use_daemons = sorted(self.mds_cluster.mds_ids[0:4])
        mds_a, mds_b, mds_c, mds_d = use_daemons
        log.info("Using MDS daemons: {0}".format(use_daemons))

        def set_standby_for(leader_rank, leader_fs, follower_id):
            self.set_conf("mds.{0}".format(follower_id),
                          "mds_standby_for_rank", leader_rank)

            fscid = leader_fs.get_namespace_id()
            self.set_conf("mds.{0}".format(follower_id),
                          "mds_standby_for_fscid", fscid)

        fs_a = self.mds_cluster.newfs("alpha")
        fs_b = self.mds_cluster.newfs("bravo")
        set_standby_for(0, fs_a, mds_a)
        set_standby_for(0, fs_a, mds_b)
        set_standby_for(0, fs_b, mds_c)
        set_standby_for(0, fs_b, mds_d)

        self.mds_cluster.mds_restart(mds_a)
        fs_a.wait_for_daemons()
        self.assertEqual(fs_a.get_active_names(), [mds_a])

        self.mds_cluster.mds_restart(mds_c)
        fs_b.wait_for_daemons()
        self.assertEqual(fs_b.get_active_names(), [mds_c])

        self.mds_cluster.mds_restart(mds_b)
        self.mds_cluster.mds_restart(mds_d)
        self.wait_for_daemon_start([mds_b, mds_d])

        self.mds_cluster.mds_stop(mds_a)
        self.mds_cluster.mds_fail(mds_a)
        self.mds_cluster.mds_stop(mds_c)
        self.mds_cluster.mds_fail(mds_c)

        fs_a.wait_for_daemons()
        self.assertEqual(fs_a.get_active_names(), [mds_b])
        fs_b.wait_for_daemons()
        self.assertEqual(fs_b.get_active_names(), [mds_d])

    def test_standby_for_fscid(self):
        """
        That I can set a standby FSCID with no rank, and the result is
        that daemons join any rank for that filesystem.
        """
        use_daemons = sorted(self.mds_cluster.mds_ids[0:4])
        mds_a, mds_b, mds_c, mds_d = use_daemons

        log.info("Using MDS daemons: {0}".format(use_daemons))

        def set_standby_for(leader_fs, follower_id):
            fscid = leader_fs.get_namespace_id()
            self.set_conf("mds.{0}".format(follower_id),
                          "mds_standby_for_fscid", fscid)

        # Create two filesystems which should have two ranks each
        fs_a = self.mds_cluster.newfs("alpha")

        fs_b = self.mds_cluster.newfs("bravo")

        fs_a.set_max_mds(2)
        fs_b.set_max_mds(2)

        # Set all the daemons to have a FSCID assignment but no other
        # standby preferences.
        set_standby_for(fs_a, mds_a)
        set_standby_for(fs_a, mds_b)
        set_standby_for(fs_b, mds_c)
        set_standby_for(fs_b, mds_d)

        # Now when we start all daemons at once, they should fall into
        # ranks in the right filesystem
        self.mds_cluster.mds_restart(mds_a)
        self.mds_cluster.mds_restart(mds_b)
        self.mds_cluster.mds_restart(mds_c)
        self.mds_cluster.mds_restart(mds_d)
        self.wait_for_daemon_start([mds_a, mds_b, mds_c, mds_d])
        fs_a.wait_for_daemons()
        fs_b.wait_for_daemons()
        self.assertEqual(set(fs_a.get_active_names()), {mds_a, mds_b})
        self.assertEqual(set(fs_b.get_active_names()), {mds_c, mds_d})

    def test_standby_for_invalid_fscid(self):
        """
        That an invalid standby_fscid does not cause a mon crash
        """
        use_daemons = sorted(self.mds_cluster.mds_ids[0:3])
        mds_a, mds_b, mds_c = use_daemons
        log.info("Using MDS daemons: {0}".format(use_daemons))

        def set_standby_for_rank(leader_rank, follower_id):
            self.set_conf("mds.{0}".format(follower_id),
                          "mds_standby_for_rank", leader_rank)

        # Create one fs
        fs_a = self.mds_cluster.newfs("cephfs")

        # Get configured mons in the cluster, so we can see if any
        # crashed later.
        configured_mons = fs_a.mon_manager.get_mon_quorum()

        # Set all the daemons to have a rank assignment but no other
        # standby preferences.
        set_standby_for_rank(0, mds_a)
        set_standby_for_rank(0, mds_b)

        # Set the third daemon to have an invalid fscid assignment and no
        # other standby preferences
        invalid_fscid = 123
        self.set_conf("mds.{0}".format(mds_c), "mds_standby_for_fscid", invalid_fscid)

        # Restart all the daemons so the standby preferences are applied
        self.mds_cluster.mds_restart(mds_a)
        self.mds_cluster.mds_restart(mds_b)
        self.mds_cluster.mds_restart(mds_c)
        self.wait_for_daemon_start([mds_a, mds_b, mds_c])

        # Stop the active MDS daemon of the fs
        if fs_a.get_active_names() == [mds_a]:
            self.mds_cluster.mds_stop(mds_a)
            self.mds_cluster.mds_fail(mds_a)
            fs_a.wait_for_daemons()
        else:
            self.mds_cluster.mds_stop(mds_b)
            self.mds_cluster.mds_fail(mds_b)
            fs_a.wait_for_daemons()

        # Get the active mons from the cluster
        active_mons = fs_a.mon_manager.get_mon_quorum()

        # Check that the quorum mons match the configured mons
        self.assertEqual(active_mons, configured_mons,
                         "Not all mons are in quorum; the invalid standby fscid test failed!")