"""
Teuthology task for exercising CephFS client recovery
"""

import logging
from textwrap import dedent
import time
import distutils.version as version
import re
import os

from teuthology.orchestra import run
from teuthology.orchestra.run import CommandFailedError, ConnectionLostError
from tasks.cephfs.fuse_mount import FuseMount
from tasks.cephfs.cephfs_test_case import CephFSTestCase
from teuthology.packaging import get_package_version

log = logging.getLogger(__name__)


# Arbitrary timeouts for operations involving restarting
# an MDS or waiting for it to come up
MDS_RESTART_GRACE = 60


class TestClientNetworkRecovery(CephFSTestCase):
    REQUIRE_KCLIENT_REMOTE = True
    REQUIRE_ONE_CLIENT_REMOTE = True

    LOAD_SETTINGS = ["mds_reconnect_timeout", "ms_max_backoff"]

    # Environment references
    mds_reconnect_timeout = None
    ms_max_backoff = None
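    # The LOAD_SETTINGS entries above are read from the running cluster by the
    # test harness during test setup and stored on attributes of the same name,
    # replacing these None placeholders.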

    def test_network_death(self):
        """
        Simulate software freeze or temporary network failure.

        Check that the client blocks I/O during failure, and completes
        I/O after failure.
        """
        session_timeout = self.fs.get_var("session_timeout")
        self.fs.mds_asok(['config', 'set', 'mds_defer_session_stale', 'false'])
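        # With mds_defer_session_stale disabled, the MDS is expected to mark the
        # session stale as soon as session_timeout expires, which the state
        # assertions below depend on.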

        # We only need one client
        self.mount_b.umount_wait()

        # Initially our one client session should be visible
        client_id = self.mount_a.get_global_id()
        ls_data = self._session_list()
        self.assert_session_count(1, ls_data)
        self.assertEqual(ls_data[0]['id'], client_id)
        self.assert_session_state(client_id, "open")

        # ...and capable of doing I/O without blocking
        self.mount_a.create_files()

        # ...but if we turn off the network
        self.fs.set_clients_block(True)

        # ...and try and start an I/O
        write_blocked = self.mount_a.write_background()

        # ...then it should block
        self.assertFalse(write_blocked.finished)
        self.assert_session_state(client_id, "open")
        time.sleep(session_timeout * 1.5)  # Long enough for MDS to consider session stale
        self.assertFalse(write_blocked.finished)
        self.assert_session_state(client_id, "stale")

        # ...until we re-enable I/O
        self.fs.set_clients_block(False)

        # ...when it should complete promptly
        a = time.time()
        self.wait_until_true(lambda: write_blocked.finished, self.ms_max_backoff * 2)
        write_blocked.wait()  # Already know we're finished, wait() to raise exception on errors
        recovery_time = time.time() - a
        log.info("recovery time: {0}".format(recovery_time))
        self.assert_session_state(client_id, "open")


class TestClientRecovery(CephFSTestCase):
    REQUIRE_KCLIENT_REMOTE = True

    LOAD_SETTINGS = ["mds_reconnect_timeout", "ms_max_backoff"]

    # Environment references
    mds_reconnect_timeout = None
    ms_max_backoff = None

    def test_basic(self):
        # Check that two clients come up healthy and see each others' files
        # =====================================================
        self.mount_a.create_files()
        self.mount_a.check_files()
        self.mount_a.umount_wait()

        self.mount_b.check_files()

        self.mount_a.mount_wait()

        # Check that the admin socket interface is correctly reporting
        # two sessions
        # =====================================================
        ls_data = self._session_list()
        self.assert_session_count(2, ls_data)

        self.assertSetEqual(
            set([l['id'] for l in ls_data]),
            {self.mount_a.get_global_id(), self.mount_b.get_global_id()}
        )

    def test_restart(self):
        # Check that after an MDS restart both clients reconnect and continue
        # to do I/O
        # =====================================================
        self.fs.mds_fail_restart()
        self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)

        self.mount_a.create_destroy()
        self.mount_b.create_destroy()

    def _session_num_caps(self, client_id):
        ls_data = self.fs.mds_asok(['session', 'ls'])
        return int(self._session_by_id(ls_data).get(client_id, {'num_caps': None})['num_caps'])
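
    # After a restart the MDS enters up:reconnect and waits up to
    # mds_reconnect_timeout for previously connected clients to re-establish
    # their sessions before going active; the next two tests exercise that
    # window.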
    def test_reconnect_timeout(self):
        # Check that if I stop an MDS and a client goes away, the MDS waits
        # for the reconnect period

        self.fs.mds_stop()
        self.fs.mds_fail()
        mount_a_client_id = self.mount_a.get_global_id()
        self.mount_a.umount_wait(force=True)

        self.fs.mds_restart()

        self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
        # Check that the MDS locally reports its state correctly
        status = self.fs.mds_asok(['status'])
        self.assertIn("reconnect_status", status)

        ls_data = self._session_list()
        self.assert_session_count(2, ls_data)

        # The session for the dead client should have the 'reconnect' flag set
        self.assertTrue(self.get_session(mount_a_client_id)['reconnecting'])

        # Wait for the reconnect state to clear, this should take the
        # reconnect timeout period.
        in_reconnect_for = self.fs.wait_for_state('up:active', timeout=self.mds_reconnect_timeout * 2)
        # Check that the period we waited to enter active is within a factor
        # of two of the reconnect timeout.
        self.assertGreater(in_reconnect_for, self.mds_reconnect_timeout // 2,
                           "Should have been in reconnect phase for {0} but only took {1}".format(
                               self.mds_reconnect_timeout, in_reconnect_for
                           ))

        self.assert_session_count(1)

        # Check that the client that timed out during reconnect can
        # mount again and do I/O
        self.mount_a.mount_wait()
        self.mount_a.create_destroy()

        self.assert_session_count(2)

    def test_reconnect_eviction(self):
        # Eviction during reconnect
        # =========================
        mount_a_client_id = self.mount_a.get_global_id()

        self.fs.mds_stop()
        self.fs.mds_fail()

        # The mount goes away while the MDS is offline
        self.mount_a.kill()

        self.fs.mds_restart()

        # Enter reconnect phase
        self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
        self.assert_session_count(2)

        # Evict the stuck client
        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
        self.assert_session_count(1)

        # Observe that we proceed to active phase without waiting full reconnect timeout
        evict_til_active = self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
        # Once we evict the troublemaker, the reconnect phase should complete
        # in well under the reconnect timeout.
        self.assertLess(evict_til_active, self.mds_reconnect_timeout * 0.5,
                        "reconnect did not complete soon enough after eviction, took {0}".format(
                            evict_til_active
                        ))

        # We killed earlier so must clean up before trying to use again
        self.mount_a.kill_cleanup()

        # Bring the client back
        self.mount_a.mount_wait()
        self.mount_a.create_destroy()
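
    # A client that stops responding keeps its capabilities until the MDS marks
    # its session stale after session_timeout.  _test_stale_caps opens a file on
    # mount_a, kills that client, and checks that a write from mount_b is only
    # unblocked once the session goes stale, for both read and write caps.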
    def _test_stale_caps(self, write):
        session_timeout = self.fs.get_var("session_timeout")

        # Capability release from stale session
        # =====================================
        if write:
            cap_holder = self.mount_a.open_background()
        else:
            self.mount_a.run_shell(["touch", "background_file"])
            self.mount_a.umount_wait()
            self.mount_a.mount_wait()
            cap_holder = self.mount_a.open_background(write=False)

        self.assert_session_count(2)
        mount_a_gid = self.mount_a.get_global_id()

        # Wait for the file to be visible from another client, indicating
        # that mount_a has completed its network ops
        self.mount_b.wait_for_visible()

        # Simulate client death
        self.mount_a.kill()

        # wait for it to die so it doesn't voluntarily release buffer cap
        time.sleep(5)

        # Now, after session_timeout seconds, the waiter should
        # complete their operation when the MDS marks the holder's
        # session stale.
        cap_waiter = self.mount_b.write_background()
        a = time.time()
        cap_waiter.wait()
        b = time.time()

        # Should have succeeded
        self.assertEqual(cap_waiter.exitstatus, 0)

        if write:
            self.assert_session_count(1)
        else:
            self.assert_session_state(mount_a_gid, "stale")

        cap_waited = b - a
        log.info("cap_waiter waited {0}s".format(cap_waited))
        self.assertTrue(session_timeout / 2.0 <= cap_waited <= session_timeout * 2.0,
                        "Capability handover took {0}, expected approx {1}".format(
                            cap_waited, session_timeout
                        ))

        cap_holder.stdin.close()
        try:
            cap_holder.wait()
        except (CommandFailedError, ConnectionLostError):
            # We killed it (and possibly its node), so it raises an error
            pass

        # teardown() doesn't quite handle this case cleanly, so help it out
        self.mount_a.kill_cleanup()

        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

    def test_stale_read_caps(self):
        self._test_stale_caps(False)

    def test_stale_write_caps(self):
        self._test_stale_caps(True)
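
    # Eviction is the fast path: when the cap holder is explicitly evicted, its
    # capabilities should be handed to the waiter immediately rather than after
    # session_timeout, which is what test_evicted_caps asserts.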
    def test_evicted_caps(self):
        # Eviction while holding a capability
        # ===================================
        session_timeout = self.fs.get_var("session_timeout")

        # Take out a write capability on a file on client A,
        # and then immediately kill it.
        cap_holder = self.mount_a.open_background()
        mount_a_client_id = self.mount_a.get_global_id()

        # Wait for the file to be visible from another client, indicating
        # that mount_a has completed its network ops
        self.mount_b.wait_for_visible()

        # Simulate client death
        self.mount_a.kill()

        # wait for it to die so it doesn't voluntarily release buffer cap
        time.sleep(5)

        # The waiter should get stuck waiting for the capability
        # held on the MDS by the now-dead client A
        cap_waiter = self.mount_b.write_background()
        time.sleep(5)
        self.assertFalse(cap_waiter.finished)

        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
        # Now, because I evicted the old holder of the capability, it should
        # immediately get handed over to the waiter
        a = time.time()
        cap_waiter.wait()
        b = time.time()
        cap_waited = b - a
        log.info("cap_waiter waited {0}s".format(cap_waited))
        # This is the check that it happened 'now' rather than waiting
        # for the session timeout
        self.assertLess(cap_waited, session_timeout / 2.0,
                        "Capability handover took {0}, expected less than {1}".format(
                            cap_waited, session_timeout / 2.0
                        ))

        cap_holder.stdin.close()
        try:
            cap_holder.wait()
        except (CommandFailedError, ConnectionLostError):
            # We killed it (and possibly its node), so it raises an error
            pass

        self.mount_a.kill_cleanup()

        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

    def test_trim_caps(self):
        # Trim capability when reconnecting MDS
        # ===================================
        count = 500
        # Create lots of files
        for i in range(count):
            self.mount_a.run_shell(["touch", "f{0}".format(i)])

        # Populate mount_b's cache
        self.mount_b.run_shell(["ls", "-l"])

        client_id = self.mount_b.get_global_id()
        num_caps = self._session_num_caps(client_id)
        self.assertGreaterEqual(num_caps, count)

        # Restart MDS. client should trim its cache when reconnecting to the MDS
        self.fs.mds_fail_restart()
        self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)

        num_caps = self._session_num_caps(client_id)
        self.assertLess(num_caps, count,
                        "should have less than {0} capabilities, have {1}".format(
                            count, num_caps
                        ))
    def _is_flockable(self):
        a_version_str = get_package_version(self.mount_a.client_remote, "fuse")
        b_version_str = get_package_version(self.mount_b.client_remote, "fuse")
        flock_version_str = "2.9"

        version_regex = re.compile(r"[0-9\.]+")
        a_result = version_regex.match(a_version_str)
        self.assertTrue(a_result)
        b_result = version_regex.match(b_version_str)
        self.assertTrue(b_result)
        a_version = version.StrictVersion(a_result.group())
        b_version = version.StrictVersion(b_result.group())
        flock_version = version.StrictVersion(flock_version_str)

        if a_version >= flock_version and b_version >= flock_version:
            log.info("flock locks are available")
            return True
        else:
            log.info("not testing flock locks, machines have versions {av} and {bv}".format(
                av=a_version_str, bv=b_version_str))
            return False

    def test_filelock(self):
        """
        Check that file lock doesn't get lost after an MDS restart
        """
        flockable = self._is_flockable()
        lock_holder = self.mount_a.lock_background(do_flock=flockable)

        self.mount_b.wait_for_visible("background_file-2")
        self.mount_b.check_filelock(do_flock=flockable)

        self.fs.mds_fail_restart()
        self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)

        self.mount_b.check_filelock(do_flock=flockable)

        # Tear down the background process
        lock_holder.stdin.close()
        try:
            lock_holder.wait()
        except (CommandFailedError, ConnectionLostError):
            # We killed it, so it raises an error
            pass

    def test_filelock_eviction(self):
        """
        Check that file lock held by evicted client is given to
        waiting client.
        """
        if not self._is_flockable():
            self.skipTest("flock is not available")

        lock_holder = self.mount_a.lock_background()
        self.mount_b.wait_for_visible("background_file-2")
        self.mount_b.check_filelock()

        lock_taker = self.mount_b.lock_and_release()
        # Check the taker is waiting (doesn't get it immediately)
        self.assertFalse(lock_holder.finished)
        self.assertFalse(lock_taker.finished)

        mount_a_client_id = self.mount_a.get_global_id()
        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])

        # Evicting mount_a should let mount_b's attempt to take the lock
        # succeed
        self.wait_until_true(lambda: lock_taker.finished, timeout=10)

        # teardown() doesn't quite handle this case cleanly, so help it out
        self.mount_a.kill()
        self.mount_a.kill_cleanup()

        # Bring the client back
        self.mount_a.mount_wait()

    def test_dir_fsync(self):
        self._test_fsync(True)

    def test_create_fsync(self):
        self._test_fsync(False)
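
    # Both wrappers drive _test_fsync: with dirfsync=True the fsync is issued on
    # the directory fd after creating the child file, otherwise on the child
    # file itself.  Either way the new dentry must be visible to another client
    # even though the writer and the MDS are killed right after fsync returns.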
    def _test_fsync(self, dirfsync):
        """
        Test that calls to fsync guarantee visibility of metadata to another
        client immediately after the fsyncing client dies.
        """
        # Leave this guy out until he's needed
        self.mount_b.umount_wait()

        # Create dir + child dentry on client A, and fsync the dir
        path = os.path.join(self.mount_a.mountpoint, "subdir")
        self.mount_a.run_python(
            dedent("""
                import os
                import time

                path = "{path}"

                print("Starting creation...")
                start = time.time()

                os.mkdir(path)
                dfd = os.open(path, os.O_DIRECTORY)

                fd = open(os.path.join(path, "childfile"), "w")
                print("Finished creation in {{0}}s".format(time.time() - start))

                print("Starting fsync...")
                start = time.time()
                if {dirfsync}:
                    os.fsync(dfd)
                else:
                    os.fsync(fd.fileno())
                print("Finished fsync in {{0}}s".format(time.time() - start))
            """.format(path=path, dirfsync=str(dirfsync)))
        )

        # Immediately kill the MDS and then client A
        self.fs.mds_stop()
        self.fs.mds_fail()
        self.mount_a.kill()
        self.mount_a.kill_cleanup()

        # Restart the MDS. Wait for it to come up, it'll have to time out in clientreplay
        self.fs.mds_restart()
        log.info("Waiting for reconnect...")
        self.fs.wait_for_state("up:reconnect")
        log.info("Waiting for active...")
        self.fs.wait_for_state("up:active", timeout=MDS_RESTART_GRACE + self.mds_reconnect_timeout)
        log.info("Reached active...")

        # Is the child dentry visible from mount B?
        self.mount_b.mount_wait()
        self.mount_b.run_shell(["ls", "subdir/childfile"])

    def test_unmount_for_evicted_client(self):
        """Test if client hangs on unmount after evicting the client."""
        mount_a_client_id = self.mount_a.get_global_id()
        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])

        self.mount_a.umount_wait(require_clean=True, timeout=30)
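
    # test_stale_renew freezes mount_b's ceph-fuse process with SIGSTOP so it
    # stops renewing its session; the MDS should eventually mark the session
    # stale, invalidating mount_b's caps, so that after SIGCONT it must refetch
    # state and can see files created in the meantime.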
    def test_stale_renew(self):
        if not isinstance(self.mount_a, FuseMount):
            self.skipTest("Require FUSE client to handle signal STOP/CONT")

        session_timeout = self.fs.get_var("session_timeout")

        self.mount_a.run_shell(["mkdir", "testdir"])
        self.mount_a.run_shell(["touch", "testdir/file1"])
        # populate readdir cache
        self.mount_a.run_shell(["ls", "testdir"])
        self.mount_b.run_shell(["ls", "testdir"])

        # check if readdir cache is effective
        initial_readdirs = self.fs.mds_asok(['perf', 'dump', 'mds_server', 'req_readdir_latency'])
        self.mount_b.run_shell(["ls", "testdir"])
        current_readdirs = self.fs.mds_asok(['perf', 'dump', 'mds_server', 'req_readdir_latency'])
        self.assertEqual(current_readdirs, initial_readdirs)

        mount_b_gid = self.mount_b.get_global_id()
        mount_b_pid = self.mount_b.get_client_pid()
        # stop ceph-fuse process of mount_b
        self.mount_b.client_remote.run(args=["sudo", "kill", "-STOP", mount_b_pid])

        self.assert_session_state(mount_b_gid, "open")
        time.sleep(session_timeout * 1.5)  # Long enough for MDS to consider session stale

        self.mount_a.run_shell(["touch", "testdir/file2"])
        self.assert_session_state(mount_b_gid, "stale")

        # resume ceph-fuse process of mount_b
        self.mount_b.client_remote.run(args=["sudo", "kill", "-CONT", mount_b_pid])
        # Is the new file visible from mount_b? (caps become invalid after session stale)
        self.mount_b.run_shell(["ls", "testdir/file2"])
    def test_abort_conn(self):
        """
        Check that abort_conn() skips closing mds sessions.
        """
        if not isinstance(self.mount_a, FuseMount):
            self.skipTest("Testing libcephfs function")

        self.fs.mds_asok(['config', 'set', 'mds_defer_session_stale', 'false'])
        session_timeout = self.fs.get_var("session_timeout")

        self.mount_a.umount_wait()
        self.mount_b.umount_wait()

        gid_str = self.mount_a.run_python(dedent("""
            import cephfs as libcephfs
            cephfs = libcephfs.LibCephFS(conffile='')
            cephfs.mount()
            client_id = cephfs.get_instance_id()
            cephfs.abort_conn()
            print(client_id)
            """))
        gid = int(gid_str)

        self.assert_session_state(gid, "open")
        time.sleep(session_timeout * 1.5)  # Long enough for MDS to consider session stale
        self.assert_session_state(gid, "stale")

    def test_dont_mark_unresponsive_client_stale(self):
        """
        Test that an unresponsive client holding caps is not marked stale or
        evicted unless another client wants its caps.
        """
        if not isinstance(self.mount_a, FuseMount):
            self.skipTest("Require FUSE client to handle signal STOP/CONT")

        # XXX: To conduct this test we need at least two clients since a
        # single client is never evicted by MDS.
        SESSION_TIMEOUT = 30
        SESSION_AUTOCLOSE = 50
        time_at_beg = time.time()
        mount_a_gid = self.mount_a.get_global_id()
        _ = self.mount_a.client_pid
        self.fs.set_var('session_timeout', SESSION_TIMEOUT)
        self.fs.set_var('session_autoclose', SESSION_AUTOCLOSE)
        self.assert_session_count(2, self.fs.mds_asok(['session', 'ls']))

        # test that client holding cap not required by any other client is not
        # marked stale when it becomes unresponsive.
        self.mount_a.run_shell(['mkdir', 'dir'])
        self.mount_a.send_signal('sigstop')
        time.sleep(SESSION_TIMEOUT + 2)
        self.assert_session_state(mount_a_gid, "open")

        # test that other clients have to wait to get the caps from
        # unresponsive client until session_autoclose.
        self.mount_b.run_shell(['stat', 'dir'])
        self.assert_session_count(1, self.fs.mds_asok(['session', 'ls']))
        self.assertLess(time.time(), time_at_beg + SESSION_AUTOCLOSE)

        self.mount_a.send_signal('sigcont')
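
    # 'session config <gid> timeout' overrides the global session_timeout for a
    # single client session; doubling it for mount_a means the killed client
    # should still be listed after 1.5x the default timeout and only disappear
    # once the longer per-session timeout has elapsed.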
    def test_config_session_timeout(self):
        self.fs.mds_asok(['config', 'set', 'mds_defer_session_stale', 'false'])
        session_timeout = self.fs.get_var("session_timeout")
        mount_a_gid = self.mount_a.get_global_id()

        self.fs.mds_asok(['session', 'config', '%s' % mount_a_gid, 'timeout', '%s' % (session_timeout * 2)])

        self.mount_a.kill()

        self.assert_session_count(2)

        time.sleep(session_timeout * 1.5)
        self.assert_session_state(mount_a_gid, "open")

        time.sleep(session_timeout)
        self.assert_session_count(1)

        self.mount_a.kill_cleanup()
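
    # Eviction blacklists the client.  A FUSE client mounted with
    # client_reconnect_stale can try to re-establish its session afterwards,
    # while the kernel client's recover_session=clean option is meant to recover
    # by discarding dirty data and lost locks; file descriptors opened before
    # the blacklisting are expected to fail as described in the docstring below.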
    def test_reconnect_after_blacklisted(self):
        """
        Test reconnect after blacklisted.
        - writing to a fd that was opened before blacklist should return -EBADF
        - reading/writing to a file with lost file locks should return -EIO
        - readonly fd should continue to work
        """
        self.mount_a.umount_wait()

        if isinstance(self.mount_a, FuseMount):
            self.mount_a.mount(mount_options=['--client_reconnect_stale=1', '--fuse_disable_pagecache=1'])
        else:
            try:
                self.mount_a.mount(mount_options=['recover_session=clean'])
            except CommandFailedError:
                self.mount_a.kill_cleanup()
                self.skipTest("Not implemented in current kernel")

        self.mount_a.wait_until_mounted()

        path = os.path.join(self.mount_a.mountpoint, 'testfile_reconnect_after_blacklisted')
        pyscript = dedent("""
            import os
            import sys
            import time
            import fcntl
            import errno

            fd1 = os.open("{path}.1", os.O_RDWR | os.O_CREAT, 0O666)
            fd2 = os.open("{path}.1", os.O_RDONLY)
            fd3 = os.open("{path}.2", os.O_RDWR | os.O_CREAT, 0O666)
            fd4 = os.open("{path}.2", os.O_RDONLY)

            os.write(fd1, b'content')
            os.write(fd3, b'content')
            fcntl.flock(fd4, fcntl.LOCK_SH | fcntl.LOCK_NB)

            print("waiting for eviction")
            sys.stdout.flush()
            sys.stdin.readline()

            # wait for mds to close session
            time.sleep(10)

            # trigger 'open session' message. kclient relies on 'session reject' message
            # to detect if itself is blacklisted
            try:
                os.stat("{path}.1")
            except OSError:
                pass

            # wait for auto reconnect
            time.sleep(10)

            try:
                os.write(fd1, b'content')
            except OSError as e:
                if e.errno != errno.EBADF:
                    raise
            else:
                raise RuntimeError("write() failed to raise error")

            # readonly fd should still work
            os.read(fd2, 1)

            try:
                os.read(fd4, 1)
            except OSError as e:
                if e.errno != errno.EIO:
                    raise
            else:
                raise RuntimeError("read() failed to raise error")
            """).format(path=path)
        rproc = self.mount_a.client_remote.run(
            args=['sudo', 'python3', '-c', pyscript],
            wait=False, stdin=run.PIPE, stdout=run.PIPE)

        # Wait until the script has opened its files and taken its lock
        rproc.stdout.readline()

        mount_a_client_id = self.mount_a.get_global_id()
        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])

        # Tell the script to continue now that the session has been evicted
        rproc.stdin.writelines(['done\n'])
        rproc.stdin.flush()

        rproc.wait()
        self.assertEqual(rproc.exitstatus, 0)