"""
Teuthology task for exercising CephFS client recovery
"""

import logging
from textwrap import dedent
import time
import distutils.version as version
import re
import os

from teuthology.orchestra.run import CommandFailedError, ConnectionLostError
from tasks.cephfs.fuse_mount import FuseMount
from tasks.cephfs.cephfs_test_case import CephFSTestCase
from teuthology.packaging import get_package_version
from unittest import SkipTest

log = logging.getLogger(__name__)


# Arbitrary timeouts for operations involving restarting
# an MDS or waiting for it to come up
MDS_RESTART_GRACE = 60


class TestClientNetworkRecovery(CephFSTestCase):
    REQUIRE_KCLIENT_REMOTE = True
    REQUIRE_ONE_CLIENT_REMOTE = True
    CLIENTS_REQUIRED = 2

    LOAD_SETTINGS = ["mds_session_timeout", "mds_reconnect_timeout", "ms_max_backoff"]

    # Environment references
    mds_session_timeout = None
    mds_reconnect_timeout = None
    ms_max_backoff = None

    def test_network_death(self):
        """
        Simulate software freeze or temporary network failure.

        Check that the client blocks I/O during failure, and completes
        I/O after failure.
        """

        # We only need one client
        self.mount_b.umount_wait()

        # Initially our one client session should be visible
        client_id = self.mount_a.get_global_id()
        ls_data = self._session_list()
        self.assert_session_count(1, ls_data)
        self.assertEqual(ls_data[0]['id'], client_id)
        self.assert_session_state(client_id, "open")

        # ...and capable of doing I/O without blocking
        self.mount_a.create_files()

        # ...but if we turn off the network
        self.fs.set_clients_block(True)

        # ...and try and start an I/O
        write_blocked = self.mount_a.write_background()

        # ...then it should block
        self.assertFalse(write_blocked.finished)
        self.assert_session_state(client_id, "open")
        time.sleep(self.mds_session_timeout * 1.5)  # Long enough for MDS to consider session stale
        self.assertFalse(write_blocked.finished)
        self.assert_session_state(client_id, "stale")

        # ...until we re-enable I/O
        self.fs.set_clients_block(False)

        # ...when it should complete promptly
        a = time.time()
        self.wait_until_true(lambda: write_blocked.finished, self.ms_max_backoff * 2)
        write_blocked.wait()  # Already know we're finished, wait() to raise exception on errors
        recovery_time = time.time() - a
        log.info("recovery time: {0}".format(recovery_time))
        self.assert_session_state(client_id, "open")


class TestClientRecovery(CephFSTestCase):
    REQUIRE_KCLIENT_REMOTE = True
    CLIENTS_REQUIRED = 2

    LOAD_SETTINGS = ["mds_session_timeout", "mds_reconnect_timeout", "ms_max_backoff"]

    # Environment references
    mds_session_timeout = None
    mds_reconnect_timeout = None
    ms_max_backoff = None

    def test_basic(self):
        # Check that two clients come up healthy and see each others' files
        # =====================================================
        self.mount_a.create_files()
        self.mount_a.check_files()
        self.mount_a.umount_wait()

        self.mount_b.check_files()

        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

        # Check that the admin socket interface is correctly reporting
        # two sessions
        # =====================================================
        ls_data = self._session_list()
        self.assert_session_count(2, ls_data)

        self.assertSetEqual(
            set([l['id'] for l in ls_data]),
            {self.mount_a.get_global_id(), self.mount_b.get_global_id()}
        )

    def test_restart(self):
        # Check that after an MDS restart both clients reconnect and continue
        # to do I/O
        # =====================================================
        self.fs.mds_fail_restart()
        self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)

        self.mount_a.create_destroy()
        self.mount_b.create_destroy()

    def _session_num_caps(self, client_id):
        ls_data = self.fs.mds_asok(['session', 'ls'])
        return int(self._session_by_id(ls_data).get(client_id, {'num_caps': None})['num_caps'])

    def test_reconnect_timeout(self):
        # Reconnect timeout
        # =================
        # Check that if I stop an MDS and a client goes away, the MDS waits
        # for the reconnect period
        self.fs.mds_stop()
        self.fs.mds_fail()

        mount_a_client_id = self.mount_a.get_global_id()
        self.mount_a.umount_wait(force=True)

        self.fs.mds_restart()

        self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
        # Check that the MDS locally reports its state correctly
        status = self.fs.mds_asok(['status'])
        self.assertIn("reconnect_status", status)

        ls_data = self._session_list()
        self.assert_session_count(2, ls_data)

        # The session for the dead client should have the 'reconnect' flag set
        self.assertTrue(self.get_session(mount_a_client_id)['reconnecting'])

        # Wait for the reconnect state to clear, this should take the
        # reconnect timeout period.
        in_reconnect_for = self.fs.wait_for_state('up:active', timeout=self.mds_reconnect_timeout * 2)
        # Check that the period we waited to enter active is within a factor
        # of two of the reconnect timeout.
        self.assertGreater(in_reconnect_for, self.mds_reconnect_timeout / 2,
                           "Should have been in reconnect phase for {0} but only took {1}".format(
                               self.mds_reconnect_timeout, in_reconnect_for
                           ))

        self.assert_session_count(1)

        # Check that the client that timed out during reconnect can
        # mount again and do I/O
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()
        self.mount_a.create_destroy()

        self.assert_session_count(2)

    def test_reconnect_eviction(self):
        # Eviction during reconnect
        # =========================
        mount_a_client_id = self.mount_a.get_global_id()

        self.fs.mds_stop()
        self.fs.mds_fail()

        # The mount goes away while the MDS is offline
        self.mount_a.kill()

        self.fs.mds_restart()

        # Enter reconnect phase
        self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
        self.assert_session_count(2)

        # Evict the stuck client
        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
        self.assert_session_count(1)

        # Observe that we proceed to active phase without waiting full reconnect timeout
        evict_til_active = self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
        # Once we evict the troublemaker, the reconnect phase should complete
        # in well under the reconnect timeout.
        self.assertLess(evict_til_active, self.mds_reconnect_timeout * 0.5,
                        "reconnect did not complete soon enough after eviction, took {0}".format(
                            evict_til_active
                        ))

        # We killed earlier so must clean up before trying to use again
        self.mount_a.kill_cleanup()

        # Bring the client back
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()
        self.mount_a.create_destroy()

    def test_stale_caps(self):
        # Capability release from stale session
        # =====================================
        cap_holder = self.mount_a.open_background()

        # Wait for the file to be visible from another client, indicating
        # that mount_a has completed its network ops
        self.mount_b.wait_for_visible()

        # Simulate client death
        self.mount_a.kill()

        try:
            # Now, after mds_session_timeout seconds, the waiter should
            # complete their operation when the MDS marks the holder's
            # session stale.
            cap_waiter = self.mount_b.write_background()
            a = time.time()
            cap_waiter.wait()
            b = time.time()

            # Should have succeeded
            self.assertEqual(cap_waiter.exitstatus, 0)

            cap_waited = b - a
            log.info("cap_waiter waited {0}s".format(cap_waited))
            self.assertTrue(self.mds_session_timeout / 2.0 <= cap_waited <= self.mds_session_timeout * 2.0,
                            "Capability handover took {0}, expected approx {1}".format(
                                cap_waited, self.mds_session_timeout
                            ))

            cap_holder.stdin.close()
            try:
                cap_holder.wait()
            except (CommandFailedError, ConnectionLostError):
                # We killed it (and possibly its node), so it raises an error
                pass
        finally:
            # teardown() doesn't quite handle this case cleanly, so help it out
            self.mount_a.kill_cleanup()

        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

    def test_evicted_caps(self):
        # Eviction while holding a capability
        # ===================================

        # Take out a write capability on a file on client A,
        # and then immediately kill it.
        cap_holder = self.mount_a.open_background()
        mount_a_client_id = self.mount_a.get_global_id()

        # Wait for the file to be visible from another client, indicating
        # that mount_a has completed its network ops
        self.mount_b.wait_for_visible()

        # Simulate client death
        self.mount_a.kill()

        try:
            # The waiter should get stuck waiting for the capability
            # held on the MDS by the now-dead client A
            cap_waiter = self.mount_b.write_background()

            self.assertFalse(cap_waiter.finished)

            self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
            # Now, because I evicted the old holder of the capability, it should
            # immediately get handed over to the waiter
            a = time.time()
            cap_waiter.wait()
            b = time.time()
            cap_waited = b - a
            log.info("cap_waiter waited {0}s".format(cap_waited))
            # This is the check that it happened 'now' rather than waiting
            # for the session timeout
            self.assertLess(cap_waited, self.mds_session_timeout / 2.0,
                            "Capability handover took {0}, expected less than {1}".format(
                                cap_waited, self.mds_session_timeout / 2.0
                            ))

            cap_holder.stdin.close()
            try:
                cap_holder.wait()
            except (CommandFailedError, ConnectionLostError):
                # We killed it (and possibly its node), so it raises an error
                pass
        finally:
            self.mount_a.kill_cleanup()

        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

    def test_trim_caps(self):
        # Trim capability when reconnecting MDS
        # ===================================

        count = 500
        # Create lots of files
        for i in range(count):
            self.mount_a.run_shell(["touch", "f{0}".format(i)])

        # Populate mount_b's cache
        self.mount_b.run_shell(["ls", "-l"])

        client_id = self.mount_b.get_global_id()
        num_caps = self._session_num_caps(client_id)
        self.assertGreaterEqual(num_caps, count)

        # Restart MDS. client should trim its cache when reconnecting to the MDS
        self.fs.mds_fail_restart()
        self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)

        num_caps = self._session_num_caps(client_id)
        self.assertLess(num_caps, count,
                        "should have less than {0} capabilities, have {1}".format(
                            count, num_caps
                        ))

    def _is_flockable(self):
        a_version_str = get_package_version(self.mount_a.client_remote, "fuse")
        b_version_str = get_package_version(self.mount_b.client_remote, "fuse")
        flock_version_str = "2.9"

        version_regex = re.compile(r"[0-9\.]+")
        a_result = version_regex.match(a_version_str)
        self.assertTrue(a_result)
        b_result = version_regex.match(b_version_str)
        self.assertTrue(b_result)
        a_version = version.StrictVersion(a_result.group())
        b_version = version.StrictVersion(b_result.group())
        flock_version = version.StrictVersion(flock_version_str)

        if a_version >= flock_version and b_version >= flock_version:
            log.info("flock locks are available")
            return True
        else:
            log.info("not testing flock locks, machines have versions {av} and {bv}".format(
                av=a_version_str, bv=b_version_str))
            return False

    def test_filelock(self):
        """
        Check that file lock doesn't get lost after an MDS restart
        """
        flockable = self._is_flockable()
        lock_holder = self.mount_a.lock_background(do_flock=flockable)

        self.mount_b.wait_for_visible("background_file-2")
        self.mount_b.check_filelock(do_flock=flockable)

        self.fs.mds_fail_restart()
        self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)

        self.mount_b.check_filelock(do_flock=flockable)

        # Tear down the background process
        lock_holder.stdin.close()
        try:
            lock_holder.wait()
        except (CommandFailedError, ConnectionLostError):
            # We killed it, so it raises an error
            pass

    def test_filelock_eviction(self):
        """
        Check that file lock held by evicted client is given to
        waiting client.
        """
        if not self._is_flockable():
            self.skipTest("flock is not available")

        lock_holder = self.mount_a.lock_background()
        self.mount_b.wait_for_visible("background_file-2")
        self.mount_b.check_filelock()

        lock_taker = self.mount_b.lock_and_release()
        # Check the taker is waiting (doesn't get it immediately)
        time.sleep(2)
        self.assertFalse(lock_holder.finished)
        self.assertFalse(lock_taker.finished)

        mount_a_client_id = self.mount_a.get_global_id()
        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])

        # Evicting mount_a should let mount_b's attempt to take the lock
        # succeed
        self.wait_until_true(lambda: lock_taker.finished, timeout=10)

        # teardown() doesn't quite handle this case cleanly, so help it out
        self.mount_a.kill()
        self.mount_a.kill_cleanup()

        # Bring the client back
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

    def test_dir_fsync(self):
        self._test_fsync(True)

    def test_create_fsync(self):
        self._test_fsync(False)

    def _test_fsync(self, dirfsync):
        """
        That calls to fsync guarantee visibility of metadata to another
        client immediately after the fsyncing client dies.
        """

        # Leave this guy out until he's needed
        self.mount_b.umount_wait()

        # Create dir + child dentry on client A, and fsync the dir
        path = os.path.join(self.mount_a.mountpoint, "subdir")
        self.mount_a.run_python(dedent("""
            import os
            import time

            path = "{path}"

            print "Starting creation..."
            start = time.time()

            os.mkdir(path)
            dfd = os.open(path, os.O_DIRECTORY)

            fd = open(os.path.join(path, "childfile"), "w")
            print "Finished creation in {{0}}s".format(time.time() - start)

            print "Starting fsync..."
            start = time.time()
            if {dirfsync}:
                os.fsync(dfd)
            else:
                os.fsync(fd)
            print "Finished fsync in {{0}}s".format(time.time() - start)
        """.format(path=path, dirfsync=str(dirfsync))))

        # Immediately kill the MDS and then client A
        self.fs.mds_stop()
        self.fs.mds_fail()
        self.mount_a.kill()
        self.mount_a.kill_cleanup()

        # Restart the MDS. Wait for it to come up, it'll have to time out in clientreplay
        self.fs.mds_restart()
        log.info("Waiting for reconnect...")
        self.fs.wait_for_state("up:reconnect")
        log.info("Waiting for active...")
        self.fs.wait_for_state("up:active", timeout=MDS_RESTART_GRACE + self.mds_reconnect_timeout)
        log.info("Reached active...")

        # Is the child dentry visible from mount B?
        self.mount_b.mount()
        self.mount_b.wait_until_mounted()
        self.mount_b.run_shell(["ls", "subdir/childfile"])

    def test_stale_renew(self):
        if not isinstance(self.mount_a, FuseMount):
            raise SkipTest("Require FUSE client to handle signal STOP/CONT")

        self.mount_a.run_shell(["mkdir", "testdir"])
        self.mount_a.run_shell(["touch", "testdir/file1"])
        # populate readdir cache
        self.mount_a.run_shell(["ls", "testdir"])
        self.mount_b.run_shell(["ls", "testdir"])

        # check if readdir cache is effective
        initial_readdirs = self.fs.mds_asok(['perf', 'dump', 'mds_server', 'req_readdir_latency'])
        self.mount_b.run_shell(["ls", "testdir"])
        current_readdirs = self.fs.mds_asok(['perf', 'dump', 'mds_server', 'req_readdir_latency'])
        self.assertEqual(current_readdirs, initial_readdirs)

        mount_b_gid = self.mount_b.get_global_id()
        mount_b_pid = self.mount_b.get_client_pid()
        # stop ceph-fuse process of mount_b
        self.mount_b.client_remote.run(args=["sudo", "kill", "-STOP", mount_b_pid])

        self.assert_session_state(mount_b_gid, "open")
        time.sleep(self.mds_session_timeout * 1.5)  # Long enough for MDS to consider session stale
        self.assert_session_state(mount_b_gid, "stale")

        self.mount_a.run_shell(["touch", "testdir/file2"])

        # resume ceph-fuse process of mount_b
        self.mount_b.client_remote.run(args=["sudo", "kill", "-CONT", mount_b_pid])
        # Is the new file visible from mount_b? (caps become invalid after session stale)
        self.mount_b.run_shell(["ls", "testdir/file2"])

    def test_unmount_for_evicted_client(self):
        """Test if client hangs on unmount after evicting the client."""
        mount_a_client_id = self.mount_a.get_global_id()
        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])

        self.mount_a.umount_wait(require_clean=True, timeout=30)