[ceph.git] / ceph / qa / tasks / cephfs / test_misc.py

from io import StringIO

from tasks.cephfs.fuse_mount import FuseMount
from tasks.cephfs.cephfs_test_case import CephFSTestCase
from teuthology.exceptions import CommandFailedError
from textwrap import dedent
from threading import Thread
import errno
import platform
import time
import json
import logging
import os

log = logging.getLogger(__name__)

class TestMisc(CephFSTestCase):
    CLIENTS_REQUIRED = 2

    def test_statfs_on_deleted_fs(self):
        """
        That statfs does not cause monitors to SIGSEGV after fs deletion.
        """

        self.mount_b.umount_wait()
        self.mount_a.run_shell_payload("stat -f .")
        self.fs.delete_all_filesystems()
        # This will hang either way, run in background.
        p = self.mount_a.run_shell_payload("stat -f .", wait=False, timeout=60, check_status=False)
        time.sleep(30)
        self.assertFalse(p.finished)
        # the process is stuck in uninterruptible sleep, just kill the mount
        self.mount_a.umount_wait(force=True)
        p.wait()

    def test_fuse_mount_on_already_mounted_path(self):
        if platform.system() != "Linux":
            self.skipTest("Require Linux platform")

        if not isinstance(self.mount_a, FuseMount):
            self.skipTest("Require FUSE client")

        # Try to mount already mounted path
        # expecting EBUSY error
        try:
            mount_cmd = ['sudo'] + self.mount_a._mount_bin + [self.mount_a.hostfs_mntpt]
            self.mount_a.client_remote.run(args=mount_cmd, stderr=StringIO(),
                    stdout=StringIO(), timeout=60, omit_sudo=False)
        except CommandFailedError as e:
            self.assertEqual(e.exitstatus, errno.EBUSY)
        else:
            self.fail("Expected EBUSY")

    def test_getattr_caps(self):
        """
        Check if MDS recognizes the 'mask' parameter of open request.
        The parameter allows client to request caps when opening file
        """

        if not isinstance(self.mount_a, FuseMount):
            self.skipTest("Require FUSE client")

        # Enable debug. Client will requests CEPH_CAP_XATTR_SHARED
        # on lookup/open
        self.mount_b.umount_wait()
        self.set_conf('client', 'client debug getattr caps', 'true')
        self.mount_b.mount_wait()

        # create a file and hold it open. MDS will issue CEPH_CAP_EXCL_*
        # to mount_a
        p = self.mount_a.open_background("testfile")
        self.mount_b.wait_for_visible("testfile")

        # this triggers a lookup request and an open request. The debug
        # code will check if lookup/open reply contains xattrs
        self.mount_b.run_shell(["cat", "testfile"])

        self.mount_a.kill_background(p)

    def test_root_rctime(self):
        """
        Check that the root inode has a non-default rctime on startup.
        """

        t = time.time()
        rctime = self.mount_a.getfattr(".", "ceph.dir.rctime")
        log.info("rctime = {}".format(rctime))
        self.assertGreaterEqual(float(rctime), t - 10)

    def test_fs_new(self):
        self.mount_a.umount_wait()
        self.mount_b.umount_wait()

        data_pool_name = self.fs.get_data_pool_name()

        self.fs.fail()

        self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
                                            '--yes-i-really-mean-it')

        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
                                            self.fs.metadata_pool_name,
                                            self.fs.metadata_pool_name,
                                            '--yes-i-really-really-mean-it')
        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
                                            self.fs.metadata_pool_name,
                                            '--pg_num_min', str(self.fs.pg_num_min))

        # insert a garbage object
        self.fs.radosm(["put", "foo", "-"], stdin=StringIO("bar"))

        def get_pool_df(fs, name):
            try:
                return fs.get_pool_df(name)['objects'] > 0
            except RuntimeError:
                return False

        self.wait_until_true(lambda: get_pool_df(self.fs, self.fs.metadata_pool_name), timeout=30)

        try:
            self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name,
                                                self.fs.metadata_pool_name,
                                                data_pool_name)
        except CommandFailedError as e:
            self.assertEqual(e.exitstatus, errno.EINVAL)
        else:
            raise AssertionError("Expected EINVAL")

        self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name,
                                            self.fs.metadata_pool_name,
                                            data_pool_name, "--force")

        self.fs.mon_manager.raw_cluster_cmd('fs', 'fail', self.fs.name)

        self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
                                            '--yes-i-really-mean-it')

        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
                                            self.fs.metadata_pool_name,
                                            self.fs.metadata_pool_name,
                                            '--yes-i-really-really-mean-it')
        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
                                            self.fs.metadata_pool_name,
                                            '--pg_num_min', str(self.fs.pg_num_min))
        self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name,
                                            self.fs.metadata_pool_name,
                                            data_pool_name)

    def test_cap_revoke_nonresponder(self):
        """
        Check that a client is evicted if it has not responded to cap revoke
        request for configured number of seconds.
        """
        session_timeout = self.fs.get_var("session_timeout")
        eviction_timeout = session_timeout / 2.0

        self.fs.mds_asok(['config', 'set', 'mds_cap_revoke_eviction_timeout',
                          str(eviction_timeout)])

        cap_holder = self.mount_a.open_background()

        # Wait for the file to be visible from another client, indicating
        # that mount_a has completed its network ops
        self.mount_b.wait_for_visible()

        # Simulate client death
        self.mount_a.suspend_netns()

        try:
            # The waiter should get stuck waiting for the capability
            # held on the MDS by the now-dead client A
            cap_waiter = self.mount_b.write_background()

            a = time.time()
            time.sleep(eviction_timeout)
            cap_waiter.wait()
            b = time.time()
            cap_waited = b - a
            log.info("cap_waiter waited {0}s".format(cap_waited))

            # check if the cap is transferred before session timeout kicked in.
            # this is a good enough check to ensure that the client got evicted
            # by the cap auto evicter rather than transitioning to stale state
            # and then getting evicted.
            self.assertLess(cap_waited, session_timeout,
                            "Capability handover took {0}, expected less than {1}".format(
                                cap_waited, session_timeout
                            ))

            self.assertTrue(self.mds_cluster.is_addr_blocklisted(
                self.mount_a.get_global_addr()))
            self.mount_a._kill_background(cap_holder)
        finally:
            self.mount_a.resume_netns()

    def test_filtered_df(self):
        pool_name = self.fs.get_data_pool_name()
        raw_df = self.fs.get_pool_df(pool_name)
        raw_avail = float(raw_df["max_avail"])
        out = self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'get',
                                                  pool_name, 'size',
                                                  '-f', 'json-pretty')
        _ = json.loads(out)

        proc = self.mount_a.run_shell(['df', '.'])
        output = proc.stdout.getvalue()
        fs_avail = output.split('\n')[1].split()[3]
        fs_avail = float(fs_avail) * 1024

        ratio = raw_avail / fs_avail
        assert 0.9 < ratio < 1.1

    def test_dump_inode(self):
        info = self.fs.mds_asok(['dump', 'inode', '1'])
        assert(info['path'] == "/")

    def test_dump_inode_hexademical(self):
        self.mount_a.run_shell(["mkdir", "-p", "foo"])
        ino = self.mount_a.path_to_ino("foo")
        assert type(ino) is int
        info = self.fs.mds_asok(['dump', 'inode', hex(ino)])
        assert info['path'] == "/foo"

    def test_fs_lsflags(self):
        """
        Check that the lsflags displays the default state and the new state of flags
        """
        # Set some flags
        self.fs.set_joinable(False)
        self.fs.set_allow_new_snaps(False)
        self.fs.set_allow_standby_replay(True)

        lsflags = json.loads(self.fs.mon_manager.raw_cluster_cmd('fs', 'lsflags',
                                                                 self.fs.name,
                                                                 "--format=json-pretty"))
        self.assertEqual(lsflags["joinable"], False)
        self.assertEqual(lsflags["allow_snaps"], False)
        self.assertEqual(lsflags["allow_multimds_snaps"], True)
        self.assertEqual(lsflags["allow_standby_replay"], True)

    def _test_sync_stuck_for_around_5s(self, dir_path, file_sync=False):
        self.mount_a.run_shell(["mkdir", dir_path])

        sync_dir_pyscript = dedent("""
                import os

                path = "{path}"
                dfd = os.open(path, os.O_DIRECTORY)
                os.fsync(dfd)
                os.close(dfd)
            """.format(path=dir_path))

        # run create/delete directories and test the sync time duration
        for i in range(300):
            for j in range(5):
                self.mount_a.run_shell(["mkdir", os.path.join(dir_path, f"{i}_{j}")])
            start = time.time()
            if file_sync:
                self.mount_a.run_shell(['python3', '-c', sync_dir_pyscript])
            else:
                self.mount_a.run_shell(["sync"])
            duration = time.time() - start
            log.info(f"sync mkdir i = {i}, duration = {duration}")
            self.assertLess(duration, 4)

            for j in range(5):
                self.mount_a.run_shell(["rm", "-rf", os.path.join(dir_path, f"{i}_{j}")])
            start = time.time()
            if file_sync:
                self.mount_a.run_shell(['python3', '-c', sync_dir_pyscript])
            else:
                self.mount_a.run_shell(["sync"])
            duration = time.time() - start
            log.info(f"sync rmdir i = {i}, duration = {duration}")
            self.assertLess(duration, 4)

        self.mount_a.run_shell(["rm", "-rf", dir_path])

    def test_filesystem_sync_stuck_for_around_5s(self):
        """
        To check whether the fsync will be stuck to wait for the mdlog to be
        flushed for at most 5 seconds.
        """

        dir_path = "filesystem_sync_do_not_wait_mdlog_testdir"
        self._test_sync_stuck_for_around_5s(dir_path)

    def test_file_sync_stuck_for_around_5s(self):
        """
        To check whether the filesystem sync will be stuck to wait for the
        mdlog to be flushed for at most 5 seconds.
        """

        dir_path = "file_sync_do_not_wait_mdlog_testdir"
        self._test_sync_stuck_for_around_5s(dir_path, True)

    def test_file_filesystem_sync_crash(self):
        """
        To check whether the kernel crashes when doing the file/filesystem sync.
        """

        stop_thread = False
        dir_path = "file_filesystem_sync_crash_testdir"
        self.mount_a.run_shell(["mkdir", dir_path])

        def mkdir_rmdir_thread(mount, path):
            #global stop_thread

            log.info(" mkdir_rmdir_thread starting...")
            num = 0
            while not stop_thread:
                n = num
                m = num
                for __ in range(10):
                    mount.run_shell(["mkdir", os.path.join(path, f"{n}")])
                    n += 1
                for __ in range(10):
                    mount.run_shell(["rm", "-rf", os.path.join(path, f"{m}")])
                    m += 1
                num += 10
            log.info(" mkdir_rmdir_thread stopped")

        def filesystem_sync_thread(mount, path):
            #global stop_thread

            log.info(" filesystem_sync_thread starting...")
            while not stop_thread:
                mount.run_shell(["sync"])
            log.info(" filesystem_sync_thread stopped")

        def file_sync_thread(mount, path):
            #global stop_thread

            log.info(" file_sync_thread starting...")
            pyscript = dedent("""
                    import os

                    path = "{path}"
                    dfd = os.open(path, os.O_DIRECTORY)
                    os.fsync(dfd)
                    os.close(dfd)
                """.format(path=path))

            while not stop_thread:
                mount.run_shell(['python3', '-c', pyscript])
            log.info(" file_sync_thread stopped")

        td1 = Thread(target=mkdir_rmdir_thread, args=(self.mount_a, dir_path,))
        td2 = Thread(target=filesystem_sync_thread, args=(self.mount_a, dir_path,))
        td3 = Thread(target=file_sync_thread, args=(self.mount_a, dir_path,))

        td1.start()
        td2.start()
        td3.start()
        time.sleep(1200) # run 20 minutes
        stop_thread = True
        td1.join()
        td2.join()
        td3.join()
        self.mount_a.run_shell(["rm", "-rf", dir_path])


class TestCacheDrop(CephFSTestCase):
    CLIENTS_REQUIRED = 1

    def _run_drop_cache_cmd(self, timeout=None):
        result = None
        args = ["cache", "drop"]
        if timeout is not None:
            args.append(str(timeout))
        result = self.fs.rank_tell(args)
        return result

    def _setup(self, max_caps=20, threshold=400):
        # create some files
        self.mount_a.create_n_files("dc-dir/dc-file", 1000, sync=True)

        # Reduce this so the MDS doesn't rkcall the maximum for simple tests
        self.fs.rank_asok(['config', 'set', 'mds_recall_max_caps', str(max_caps)])
        self.fs.rank_asok(['config', 'set', 'mds_recall_max_decay_threshold', str(threshold)])

    def test_drop_cache_command(self):
        """
        Basic test for checking drop cache command.
        Confirm it halts without a timeout.
        Note that the cache size post trimming is not checked here.
        """
        mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client"))
        self._setup()
        result = self._run_drop_cache_cmd()
        self.assertEqual(result['client_recall']['return_code'], 0)
        self.assertEqual(result['flush_journal']['return_code'], 0)
        # It should take at least 1 second
        self.assertGreater(result['duration'], 1)
        self.assertGreaterEqual(result['trim_cache']['trimmed'], 1000-2*mds_min_caps_per_client)

    def test_drop_cache_command_timeout(self):
        """
        Basic test for checking drop cache command.
        Confirm recall halts early via a timeout.
        Note that the cache size post trimming is not checked here.
        """
        self._setup()
        result = self._run_drop_cache_cmd(timeout=10)
        self.assertEqual(result['client_recall']['return_code'], -errno.ETIMEDOUT)
        self.assertEqual(result['flush_journal']['return_code'], 0)
        self.assertGreater(result['duration'], 10)
        self.assertGreaterEqual(result['trim_cache']['trimmed'], 100) # we did something, right?

    def test_drop_cache_command_dead_timeout(self):
        """
        Check drop cache command with non-responding client using tell
        interface. Note that the cache size post trimming is not checked
        here.
        """
        self._setup()
        self.mount_a.suspend_netns()
        # Note: recall is subject to the timeout. The journal flush will
        # be delayed due to the client being dead.
        result = self._run_drop_cache_cmd(timeout=5)
        self.assertEqual(result['client_recall']['return_code'], -errno.ETIMEDOUT)
        self.assertEqual(result['flush_journal']['return_code'], 0)
        self.assertGreater(result['duration'], 5)
        self.assertLess(result['duration'], 120)
        # Note: result['trim_cache']['trimmed'] may be >0 because dropping the
        # cache now causes the Locker to drive eviction of stale clients (a
        # stale session will be autoclosed at mdsmap['session_timeout']). The
        # particular operation causing this is journal flush which causes the
        # MDS to wait wait for cap revoke.
        #self.assertEqual(0, result['trim_cache']['trimmed'])
        self.mount_a.resume_netns()

    def test_drop_cache_command_dead(self):
        """
        Check drop cache command with non-responding client using tell
        interface. Note that the cache size post trimming is not checked
        here.
        """
        self._setup()
        self.mount_a.suspend_netns()
        result = self._run_drop_cache_cmd()
        self.assertEqual(result['client_recall']['return_code'], 0)
        self.assertEqual(result['flush_journal']['return_code'], 0)
        self.assertGreater(result['duration'], 5)
        self.assertLess(result['duration'], 120)
        # Note: result['trim_cache']['trimmed'] may be >0 because dropping the
        # cache now causes the Locker to drive eviction of stale clients (a
        # stale session will be autoclosed at mdsmap['session_timeout']). The
        # particular operation causing this is journal flush which causes the
        # MDS to wait wait for cap revoke.
        self.mount_a.resume_netns()
Commit	Line	Data
f67539c2	1	from io import StringIO
7c673cae	2
7c673cae FG	3	from tasks.cephfs.fuse_mount import FuseMount
7c673cae FG	4	from tasks.cephfs.cephfs_test_case import CephFSTestCase
20effc67	5	from teuthology.exceptions import CommandFailedError
33c7a0ef TL	6	from textwrap import dedent
33c7a0ef TL	7	from threading import Thread
7c673cae	8	import errno
33c7a0ef	9	import platform
7c673cae	10	import time
d2e6a577	11	import json
91327a77	12	import logging
33c7a0ef	13	import os
7c673cae	14
91327a77	15	log = logging.getLogger(__name__)
31f18b77	16
7c673cae FG	17	class TestMisc(CephFSTestCase):
	18	CLIENTS_REQUIRED = 2
	19
f91f0fd5 TL	20	def test_statfs_on_deleted_fs(self):
	21	"""
	22	That statfs does not cause monitors to SIGSEGV after fs deletion.
	23	"""
	24
	25	self.mount_b.umount_wait()
	26	self.mount_a.run_shell_payload("stat -f .")
	27	self.fs.delete_all_filesystems()
	28	# This will hang either way, run in background.
	29	p = self.mount_a.run_shell_payload("stat -f .", wait=False, timeout=60, check_status=False)
	30	time.sleep(30)
	31	self.assertFalse(p.finished)
	32	# the process is stuck in uninterruptible sleep, just kill the mount
	33	self.mount_a.umount_wait(force=True)
	34	p.wait()
	35
33c7a0ef TL	36	def test_fuse_mount_on_already_mounted_path(self):
	37	if platform.system() != "Linux":
	38	self.skipTest("Require Linux platform")
	39
	40	if not isinstance(self.mount_a, FuseMount):
	41	self.skipTest("Require FUSE client")
	42
	43	# Try to mount already mounted path
	44	# expecting EBUSY error
	45	try:
	46	mount_cmd = ['sudo'] + self.mount_a._mount_bin + [self.mount_a.hostfs_mntpt]
	47	self.mount_a.client_remote.run(args=mount_cmd, stderr=StringIO(),
	48	stdout=StringIO(), timeout=60, omit_sudo=False)
	49	except CommandFailedError as e:
	50	self.assertEqual(e.exitstatus, errno.EBUSY)
	51	else:
	52	self.fail("Expected EBUSY")
	53
7c673cae FG	54	def test_getattr_caps(self):
	55	"""
	56	Check if MDS recognizes the 'mask' parameter of open request.
11fdf7f2	57	The parameter allows client to request caps when opening file
7c673cae FG	58	"""
	59
	60	if not isinstance(self.mount_a, FuseMount):
9f95a23c	61	self.skipTest("Require FUSE client")
7c673cae FG	62
	63	# Enable debug. Client will requests CEPH_CAP_XATTR_SHARED
	64	# on lookup/open
	65	self.mount_b.umount_wait()
	66	self.set_conf('client', 'client debug getattr caps', 'true')
e306af50	67	self.mount_b.mount_wait()
7c673cae FG	68
	69	# create a file and hold it open. MDS will issue CEPH_CAP_EXCL_*
	70	# to mount_a
	71	p = self.mount_a.open_background("testfile")
	72	self.mount_b.wait_for_visible("testfile")
	73
11fdf7f2	74	# this triggers a lookup request and an open request. The debug
7c673cae FG	75	# code will check if lookup/open reply contains xattrs
	76	self.mount_b.run_shell(["cat", "testfile"])
	77
	78	self.mount_a.kill_background(p)
	79
f64942e4 AA	80	def test_root_rctime(self):
	81	"""
	82	Check that the root inode has a non-default rctime on startup.
	83	"""
	84
	85	t = time.time()
	86	rctime = self.mount_a.getfattr(".", "ceph.dir.rctime")
	87	log.info("rctime = {}".format(rctime))
e306af50	88	self.assertGreaterEqual(float(rctime), t - 10)
f64942e4	89
7c673cae	90	def test_fs_new(self):
a8e16298 TL	91	self.mount_a.umount_wait()
	92	self.mount_b.umount_wait()
	93
7c673cae FG	94	data_pool_name = self.fs.get_data_pool_name()
7c673cae FG	95
f67539c2	96	self.fs.fail()
7c673cae FG	97
	98	self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
	99	'--yes-i-really-mean-it')
	100
	101	self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
	102	self.fs.metadata_pool_name,
	103	self.fs.metadata_pool_name,
	104	'--yes-i-really-really-mean-it')
	105	self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
	106	self.fs.metadata_pool_name,
522d829b	107	'--pg_num_min', str(self.fs.pg_num_min))
7c673cae	108
f67539c2 TL	109	# insert a garbage object
f67539c2 TL	110	self.fs.radosm(["put", "foo", "-"], stdin=StringIO("bar"))
7c673cae	111
224ce89b WB	112	def get_pool_df(fs, name):
	113	try:
	114	return fs.get_pool_df(name)['objects'] > 0
9f95a23c	115	except RuntimeError:
224ce89b	116	return False
7c673cae	117
224ce89b	118	self.wait_until_true(lambda: get_pool_df(self.fs, self.fs.metadata_pool_name), timeout=30)
7c673cae FG	119
	120	try:
	121	self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name,
	122	self.fs.metadata_pool_name,
	123	data_pool_name)
	124	except CommandFailedError as e:
	125	self.assertEqual(e.exitstatus, errno.EINVAL)
	126	else:
	127	raise AssertionError("Expected EINVAL")
	128
	129	self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name,
	130	self.fs.metadata_pool_name,
	131	data_pool_name, "--force")
	132
f67539c2 TL	133	self.fs.mon_manager.raw_cluster_cmd('fs', 'fail', self.fs.name)
f67539c2 TL	134
7c673cae FG	135	self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
	136	'--yes-i-really-mean-it')
	137
7c673cae FG	138	self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
	139	self.fs.metadata_pool_name,
	140	self.fs.metadata_pool_name,
	141	'--yes-i-really-really-mean-it')
	142	self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
	143	self.fs.metadata_pool_name,
522d829b	144	'--pg_num_min', str(self.fs.pg_num_min))
7c673cae FG	145	self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name,
	146	self.fs.metadata_pool_name,
	147	data_pool_name)
	148
91327a77 AA	149	def test_cap_revoke_nonresponder(self):
	150	"""
	151	Check that a client is evicted if it has not responded to cap revoke
	152	request for configured number of seconds.
	153	"""
	154	session_timeout = self.fs.get_var("session_timeout")
	155	eviction_timeout = session_timeout / 2.0
	156
	157	self.fs.mds_asok(['config', 'set', 'mds_cap_revoke_eviction_timeout',
	158	str(eviction_timeout)])
	159
	160	cap_holder = self.mount_a.open_background()
	161
	162	# Wait for the file to be visible from another client, indicating
	163	# that mount_a has completed its network ops
	164	self.mount_b.wait_for_visible()
	165
	166	# Simulate client death
f67539c2	167	self.mount_a.suspend_netns()
91327a77 AA	168
	169	try:
	170	# The waiter should get stuck waiting for the capability
	171	# held on the MDS by the now-dead client A
	172	cap_waiter = self.mount_b.write_background()
	173
	174	a = time.time()
	175	time.sleep(eviction_timeout)
	176	cap_waiter.wait()
	177	b = time.time()
	178	cap_waited = b - a
	179	log.info("cap_waiter waited {0}s".format(cap_waited))
	180
	181	# check if the cap is transferred before session timeout kicked in.
	182	# this is a good enough check to ensure that the client got evicted
	183	# by the cap auto evicter rather than transitioning to stale state
	184	# and then getting evicted.
	185	self.assertLess(cap_waited, session_timeout,
	186	"Capability handover took {0}, expected less than {1}".format(
	187	cap_waited, session_timeout
	188	))
	189
f67539c2 TL	190	self.assertTrue(self.mds_cluster.is_addr_blocklisted(
	191	self.mount_a.get_global_addr()))
	192	self.mount_a._kill_background(cap_holder)
91327a77	193	finally:
f67539c2	194	self.mount_a.resume_netns()
91327a77	195
d2e6a577 FG	196	def test_filtered_df(self):
	197	pool_name = self.fs.get_data_pool_name()
	198	raw_df = self.fs.get_pool_df(pool_name)
	199	raw_avail = float(raw_df["max_avail"])
	200	out = self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'get',
	201	pool_name, 'size',
	202	'-f', 'json-pretty')
9f95a23c	203	_ = json.loads(out)
d2e6a577 FG	204
	205	proc = self.mount_a.run_shell(['df', '.'])
	206	output = proc.stdout.getvalue()
	207	fs_avail = output.split('\n')[1].split()[3]
	208	fs_avail = float(fs_avail) * 1024
	209
181888fb	210	ratio = raw_avail / fs_avail
d2e6a577	211	assert 0.9 < ratio < 1.1
f64942e4	212
11fdf7f2 TL	213	def test_dump_inode(self):
	214	info = self.fs.mds_asok(['dump', 'inode', '1'])
	215	assert(info['path'] == "/")
f64942e4	216
11fdf7f2 TL	217	def test_dump_inode_hexademical(self):
	218	self.mount_a.run_shell(["mkdir", "-p", "foo"])
	219	ino = self.mount_a.path_to_ino("foo")
	220	assert type(ino) is int
	221	info = self.fs.mds_asok(['dump', 'inode', hex(ino)])
	222	assert info['path'] == "/foo"
f64942e4	223
20effc67 TL	224	def test_fs_lsflags(self):
	225	"""
	226	Check that the lsflags displays the default state and the new state of flags
	227	"""
	228	# Set some flags
	229	self.fs.set_joinable(False)
	230	self.fs.set_allow_new_snaps(False)
	231	self.fs.set_allow_standby_replay(True)
	232
	233	lsflags = json.loads(self.fs.mon_manager.raw_cluster_cmd('fs', 'lsflags',
	234	self.fs.name,
	235	"--format=json-pretty"))
	236	self.assertEqual(lsflags["joinable"], False)
	237	self.assertEqual(lsflags["allow_snaps"], False)
	238	self.assertEqual(lsflags["allow_multimds_snaps"], True)
	239	self.assertEqual(lsflags["allow_standby_replay"], True)
f64942e4	240
33c7a0ef TL	241	def _test_sync_stuck_for_around_5s(self, dir_path, file_sync=False):
	242	self.mount_a.run_shell(["mkdir", dir_path])
	243
	244	sync_dir_pyscript = dedent("""
	245	import os
	246
	247	path = "{path}"
	248	dfd = os.open(path, os.O_DIRECTORY)
	249	os.fsync(dfd)
	250	os.close(dfd)
	251	""".format(path=dir_path))
	252
	253	# run create/delete directories and test the sync time duration
	254	for i in range(300):
	255	for j in range(5):
	256	self.mount_a.run_shell(["mkdir", os.path.join(dir_path, f"{i}_{j}")])
	257	start = time.time()
	258	if file_sync:
	259	self.mount_a.run_shell(['python3', '-c', sync_dir_pyscript])
	260	else:
	261	self.mount_a.run_shell(["sync"])
	262	duration = time.time() - start
	263	log.info(f"sync mkdir i = {i}, duration = {duration}")
	264	self.assertLess(duration, 4)
	265
	266	for j in range(5):
	267	self.mount_a.run_shell(["rm", "-rf", os.path.join(dir_path, f"{i}_{j}")])
	268	start = time.time()
	269	if file_sync:
	270	self.mount_a.run_shell(['python3', '-c', sync_dir_pyscript])
	271	else:
	272	self.mount_a.run_shell(["sync"])
	273	duration = time.time() - start
	274	log.info(f"sync rmdir i = {i}, duration = {duration}")
	275	self.assertLess(duration, 4)
	276
	277	self.mount_a.run_shell(["rm", "-rf", dir_path])
	278
	279	def test_filesystem_sync_stuck_for_around_5s(self):
	280	"""
	281	To check whether the fsync will be stuck to wait for the mdlog to be
	282	flushed for at most 5 seconds.
	283	"""
	284
	285	dir_path = "filesystem_sync_do_not_wait_mdlog_testdir"
	286	self._test_sync_stuck_for_around_5s(dir_path)
	287
	288	def test_file_sync_stuck_for_around_5s(self):
	289	"""
	290	To check whether the filesystem sync will be stuck to wait for the
	291	mdlog to be flushed for at most 5 seconds.
	292	"""
	293
	294	dir_path = "file_sync_do_not_wait_mdlog_testdir"
	295	self._test_sync_stuck_for_around_5s(dir_path, True)
	296
	297	def test_file_filesystem_sync_crash(self):
	298	"""
	299	To check whether the kernel crashes when doing the file/filesystem sync.
	300	"""
	301
	302	stop_thread = False
	303	dir_path = "file_filesystem_sync_crash_testdir"
	304	self.mount_a.run_shell(["mkdir", dir_path])
305
306	def mkdir_rmdir_thread(mount, path):
307	#global stop_thread
308
309	log.info(" mkdir_rmdir_thread starting...")
310	num = 0
311	while not stop_thread:
312	n = num
313	m = num
314	for __ in range(10):
315	mount.run_shell(["mkdir", os.path.join(path, f"{n}")])
316	n += 1
317	for __ in range(10):
318	mount.run_shell(["rm", "-rf", os.path.join(path, f"{m}")])
319	m += 1
320	num += 10
321	log.info(" mkdir_rmdir_thread stopped")
322
323	def filesystem_sync_thread(mount, path):
324	#global stop_thread
325
326	log.info(" filesystem_sync_thread starting...")
327	while not stop_thread:
328	mount.run_shell(["sync"])
329	log.info(" filesystem_sync_thread stopped")
330
331	def file_sync_thread(mount, path):
332	#global stop_thread
333
334	log.info(" file_sync_thread starting...")
335	pyscript = dedent("""
336	import os
337
338	path = "{path}"
339	dfd = os.open(path, os.O_DIRECTORY)
340	os.fsync(dfd)
341	os.close(dfd)
342	""".format(path=path))
343
344	while not stop_thread:
345	mount.run_shell(['python3', '-c', pyscript])
346	log.info(" file_sync_thread stopped")
347
348	td1 = Thread(target=mkdir_rmdir_thread, args=(self.mount_a, dir_path,))
349	td2 = Thread(target=filesystem_sync_thread, args=(self.mount_a, dir_path,))
350	td3 = Thread(target=file_sync_thread, args=(self.mount_a, dir_path,))
351
352	td1.start()
353	td2.start()
354	td3.start()
355	time.sleep(1200) # run 20 minutes
356	stop_thread = True
357	td1.join()
358	td2.join()
359	td3.join()
360	self.mount_a.run_shell(["rm", "-rf", dir_path])
361
362
11fdf7f2 TL	363	class TestCacheDrop(CephFSTestCase):
11fdf7f2 TL	364	CLIENTS_REQUIRED = 1
f64942e4	365
11fdf7f2 TL	366	def _run_drop_cache_cmd(self, timeout=None):
11fdf7f2 TL	367	result = None
f67539c2	368	args = ["cache", "drop"]
11fdf7f2	369	if timeout is not None:
f67539c2 TL	370	args.append(str(timeout))
	371	result = self.fs.rank_tell(args)
	372	return result
f64942e4	373
11fdf7f2	374	def _setup(self, max_caps=20, threshold=400):
f64942e4	375	# create some files
11fdf7f2	376	self.mount_a.create_n_files("dc-dir/dc-file", 1000, sync=True)
f64942e4	377
11fdf7f2 TL	378	# Reduce this so the MDS doesn't rkcall the maximum for simple tests
	379	self.fs.rank_asok(['config', 'set', 'mds_recall_max_caps', str(max_caps)])
	380	self.fs.rank_asok(['config', 'set', 'mds_recall_max_decay_threshold', str(threshold)])
f64942e4	381
11fdf7f2	382	def test_drop_cache_command(self):
f64942e4	383	"""
11fdf7f2 TL	384	Basic test for checking drop cache command.
11fdf7f2 TL	385	Confirm it halts without a timeout.
f64942e4 AA	386	Note that the cache size post trimming is not checked here.
f64942e4 AA	387	"""
11fdf7f2 TL	388	mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client"))
	389	self._setup()
	390	result = self._run_drop_cache_cmd()
92f5a8d4 TL	391	self.assertEqual(result['client_recall']['return_code'], 0)
92f5a8d4 TL	392	self.assertEqual(result['flush_journal']['return_code'], 0)
11fdf7f2	393	# It should take at least 1 second
92f5a8d4	394	self.assertGreater(result['duration'], 1)
11fdf7f2 TL	395	self.assertGreaterEqual(result['trim_cache']['trimmed'], 1000-2*mds_min_caps_per_client)
	396
	397	def test_drop_cache_command_timeout(self):
f64942e4	398	"""
11fdf7f2 TL	399	Basic test for checking drop cache command.
11fdf7f2 TL	400	Confirm recall halts early via a timeout.
f64942e4 AA	401	Note that the cache size post trimming is not checked here.
f64942e4 AA	402	"""
11fdf7f2 TL	403	self._setup()
11fdf7f2 TL	404	result = self._run_drop_cache_cmd(timeout=10)
92f5a8d4 TL	405	self.assertEqual(result['client_recall']['return_code'], -errno.ETIMEDOUT)
	406	self.assertEqual(result['flush_journal']['return_code'], 0)
	407	self.assertGreater(result['duration'], 10)
11fdf7f2 TL	408	self.assertGreaterEqual(result['trim_cache']['trimmed'], 100) # we did something, right?
	409
	410	def test_drop_cache_command_dead_timeout(self):
f64942e4	411	"""
11fdf7f2 TL	412	Check drop cache command with non-responding client using tell
	413	interface. Note that the cache size post trimming is not checked
	414	here.
f64942e4	415	"""
11fdf7f2	416	self._setup()
f67539c2	417	self.mount_a.suspend_netns()
11fdf7f2 TL	418	# Note: recall is subject to the timeout. The journal flush will
	419	# be delayed due to the client being dead.
	420	result = self._run_drop_cache_cmd(timeout=5)
92f5a8d4 TL	421	self.assertEqual(result['client_recall']['return_code'], -errno.ETIMEDOUT)
	422	self.assertEqual(result['flush_journal']['return_code'], 0)
	423	self.assertGreater(result['duration'], 5)
	424	self.assertLess(result['duration'], 120)
	425	# Note: result['trim_cache']['trimmed'] may be >0 because dropping the
	426	# cache now causes the Locker to drive eviction of stale clients (a
	427	# stale session will be autoclosed at mdsmap['session_timeout']). The
	428	# particular operation causing this is journal flush which causes the
	429	# MDS to wait wait for cap revoke.
	430	#self.assertEqual(0, result['trim_cache']['trimmed'])
f67539c2	431	self.mount_a.resume_netns()
f64942e4	432
11fdf7f2	433	def test_drop_cache_command_dead(self):
f64942e4 AA	434	"""
	435	Check drop cache command with non-responding client using tell
	436	interface. Note that the cache size post trimming is not checked
	437	here.
	438	"""
11fdf7f2	439	self._setup()
f67539c2	440	self.mount_a.suspend_netns()
11fdf7f2	441	result = self._run_drop_cache_cmd()
92f5a8d4 TL	442	self.assertEqual(result['client_recall']['return_code'], 0)
	443	self.assertEqual(result['flush_journal']['return_code'], 0)
	444	self.assertGreater(result['duration'], 5)
	445	self.assertLess(result['duration'], 120)
	446	# Note: result['trim_cache']['trimmed'] may be >0 because dropping the
	447	# cache now causes the Locker to drive eviction of stale clients (a
	448	# stale session will be autoclosed at mdsmap['session_timeout']). The
	449	# particular operation causing this is journal flush which causes the
	450	# MDS to wait wait for cap revoke.
f67539c2	451	self.mount_a.resume_netns()