[ceph.git] / ceph / qa / tasks / cephfs / cephfs_test_case.py

import json
import logging
import os
import re

from shlex import split as shlex_split

from tasks.ceph_test_case import CephTestCase

from teuthology import contextutil
from teuthology.orchestra import run
from teuthology.exceptions import CommandFailedError

log = logging.getLogger(__name__)

def classhook(m):
    def dec(cls):
        getattr(cls, m)()
        return cls
    return dec

def for_teuthology(f):
    """
    Decorator that adds an "is_for_teuthology" attribute to the wrapped function
    """
    f.is_for_teuthology = True
    return f


def needs_trimming(f):
    """
    Mark fn as requiring a client capable of trimming its cache (i.e. for ceph-fuse
    this means it needs to be able to run as root, currently)
    """
    f.needs_trimming = True
    return f


class MountDetails():

    def __init__(self, mntobj):
        self.client_id = mntobj.client_id
        self.client_keyring_path = mntobj.client_keyring_path
        self.client_remote = mntobj.client_remote
        self.cephfs_name = mntobj.cephfs_name
        self.cephfs_mntpt = mntobj.cephfs_mntpt
        self.hostfs_mntpt = mntobj.hostfs_mntpt

    def restore(self, mntobj):
        mntobj.client_id = self.client_id
        mntobj.client_keyring_path = self.client_keyring_path
        mntobj.client_remote = self.client_remote
        mntobj.cephfs_name = self.cephfs_name
        mntobj.cephfs_mntpt = self.cephfs_mntpt
        mntobj.hostfs_mntpt = self.hostfs_mntpt


class CephFSTestCase(CephTestCase):
    """
    Test case for Ceph FS, requires caller to populate Filesystem and Mounts,
    into the fs, mount_a, mount_b class attributes (setting mount_b is optional)

    Handles resetting the cluster under test between tests.
    """

    # FIXME weird explicit naming
    mount_a = None
    mount_b = None
    recovery_mount = None

    # Declarative test requirements: subclasses should override these to indicate
    # their special needs.  If not met, tests will be skipped.
    CLIENTS_REQUIRED = 1
    MDSS_REQUIRED = 1
    REQUIRE_ONE_CLIENT_REMOTE = False

    # Whether to create the default filesystem during setUp
    REQUIRE_FILESYSTEM = True

    # create a backup filesystem if required.
    # required REQUIRE_FILESYSTEM enabled
    REQUIRE_BACKUP_FILESYSTEM = False

    LOAD_SETTINGS = [] # type: ignore

    def _save_mount_details(self):
        """
        XXX: Tests may change details of mount objects, so let's stash them so
        that these details are restored later to ensure smooth setUps and
        tearDowns for upcoming tests.
        """
        self._orig_mount_details = [MountDetails(m) for m in self.mounts]
        log.info(self._orig_mount_details)

    def _remove_blocklist(self):
        # In case anything is in the OSD blocklist list, clear it out.  This is to avoid
        # the OSD map changing in the background (due to blocklist expiry) while tests run.
        try:
            self.mds_cluster.mon_manager.run_cluster_cmd(args="osd blocklist clear")
        except CommandFailedError:
            # Fallback for older Ceph cluster
            try:
                blocklist = json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd("osd",
                                      "dump", "--format=json-pretty"))['blocklist']
                log.info(f"Removing {len(blocklist)} blocklist entries")
                for addr, blocklisted_at in blocklist.items():
                    self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blocklist", "rm", addr)
            except KeyError:
                # Fallback for more older Ceph clusters, who will use 'blacklist' instead.
                blacklist = json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd("osd",
                                      "dump", "--format=json-pretty"))['blacklist']
                log.info(f"Removing {len(blacklist)} blacklist entries")
                for addr, blocklisted_at in blacklist.items():
                    self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "rm", addr)

    def setUp(self):
        super(CephFSTestCase, self).setUp()

        self.config_set('mon', 'mon_allow_pool_delete', True)

        if len(self.mds_cluster.mds_ids) < self.MDSS_REQUIRED:
            self.skipTest("Only have {0} MDSs, require {1}".format(
                len(self.mds_cluster.mds_ids), self.MDSS_REQUIRED
            ))

        if len(self.mounts) < self.CLIENTS_REQUIRED:
            self.skipTest("Only have {0} clients, require {1}".format(
                len(self.mounts), self.CLIENTS_REQUIRED
            ))

        if self.REQUIRE_ONE_CLIENT_REMOTE:
            if self.mounts[0].client_remote.hostname in self.mds_cluster.get_mds_hostnames():
                self.skipTest("Require first client to be on separate server from MDSs")

        # Create friendly mount_a, mount_b attrs
        for i in range(0, self.CLIENTS_REQUIRED):
            setattr(self, "mount_{0}".format(chr(ord('a') + i)), self.mounts[i])

        self.mds_cluster.clear_firewall()

        # Unmount all clients, we are about to blow away the filesystem
        for mount in self.mounts:
            if mount.is_mounted():
                mount.umount_wait(force=True)
        self._save_mount_details()

        # To avoid any issues with e.g. unlink bugs, we destroy and recreate
        # the filesystem rather than just doing a rm -rf of files
        self.mds_cluster.delete_all_filesystems()
        self.mds_cluster.mds_restart() # to reset any run-time configs, etc.
        self.fs = None # is now invalid!
        self.backup_fs = None
        self.recovery_fs = None

        self._remove_blocklist()

        client_mount_ids = [m.client_id for m in self.mounts]
        # In case there were any extra auth identities around from a previous
        # test, delete them
        for entry in self.auth_list():
            ent_type, ent_id = entry['entity'].split(".")
            if ent_type == "client" and ent_id not in client_mount_ids and not (ent_id == "admin" or ent_id[:6] == 'mirror'):
                self.mds_cluster.mon_manager.raw_cluster_cmd("auth", "del", entry['entity'])

        if self.REQUIRE_FILESYSTEM:
            self.fs = self.mds_cluster.newfs(create=True)

            # In case some test messed with auth caps, reset them
            for client_id in client_mount_ids:
                cmd = ['auth', 'caps', f'client.{client_id}', 'mon','allow r',
                       'osd', f'allow rw tag cephfs data={self.fs.name}',
                       'mds', 'allow']

                if self.run_cluster_cmd_result(cmd) == 0:
                    break

                cmd[1] = 'add'
                if self.run_cluster_cmd_result(cmd) != 0:
                    raise RuntimeError(f'Failed to create new client {cmd[2]}')

            # wait for ranks to become active
            self.fs.wait_for_daemons()

            # Mount the requested number of clients
            for i in range(0, self.CLIENTS_REQUIRED):
                self.mounts[i].mount_wait()

        if self.REQUIRE_BACKUP_FILESYSTEM:
            if not self.REQUIRE_FILESYSTEM:
                self.skipTest("backup filesystem requires a primary filesystem as well")
            self.fs.mon_manager.raw_cluster_cmd('fs', 'flag', 'set',
                                                'enable_multiple', 'true',
                                                '--yes-i-really-mean-it')
            self.backup_fs = self.mds_cluster.newfs(name="backup_fs")
            self.backup_fs.wait_for_daemons()

        # Load an config settings of interest
        for setting in self.LOAD_SETTINGS:
            setattr(self, setting, float(self.fs.mds_asok(
                ['config', 'get', setting], list(self.mds_cluster.mds_ids)[0]
            )[setting]))

        self.configs_set = set()

    def tearDown(self):
        self.mds_cluster.clear_firewall()
        for m in self.mounts:
            m.teardown()

        # To prevent failover messages during Unwind of ceph task
        self.mds_cluster.delete_all_filesystems()

        for m, md in zip(self.mounts, self._orig_mount_details):
            md.restore(m)

        for subsys, key in self.configs_set:
            self.mds_cluster.clear_ceph_conf(subsys, key)

        return super(CephFSTestCase, self).tearDown()

    def set_conf(self, subsys, key, value):
        self.configs_set.add((subsys, key))
        self.mds_cluster.set_ceph_conf(subsys, key, value)

    def auth_list(self):
        """
        Convenience wrapper on "ceph auth ls"
        """
        return json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd(
            "auth", "ls", "--format=json-pretty"
        ))['auth_dump']

    def assert_session_count(self, expected, ls_data=None, mds_id=None):
        if ls_data is None:
            ls_data = self.fs.mds_asok(['session', 'ls'], mds_id=mds_id)

        alive_count = len([s for s in ls_data if s['state'] != 'killing'])

        self.assertEqual(expected, alive_count, "Expected {0} sessions, found {1}".format(
            expected, alive_count
        ))

    def assert_session_state(self, client_id,  expected_state):
        self.assertEqual(
            self._session_by_id(
                self.fs.mds_asok(['session', 'ls'])).get(client_id, {'state': None})['state'],
            expected_state)

    def get_session_data(self, client_id):
        return self._session_by_id(client_id)

    def _session_list(self):
        ls_data = self.fs.mds_asok(['session', 'ls'])
        ls_data = [s for s in ls_data if s['state'] not in ['stale', 'closed']]
        return ls_data

    def get_session(self, client_id, session_ls=None):
        if session_ls is None:
            session_ls = self.fs.mds_asok(['session', 'ls'])

        return self._session_by_id(session_ls)[client_id]

    def _session_by_id(self, session_ls):
        return dict([(s['id'], s) for s in session_ls])

    def perf_dump(self, rank=None, status=None):
        return self.fs.rank_asok(['perf', 'dump'], rank=rank, status=status)

    def wait_until_evicted(self, client_id, timeout=30):
        def is_client_evicted():
            ls = self._session_list()
            for s in ls:
                if s['id'] == client_id:
                    return False
            return True
        self.wait_until_true(is_client_evicted, timeout)

    def wait_for_daemon_start(self, daemon_ids=None):
        """
        Wait until all the daemons appear in the FSMap, either assigned
        MDS ranks or in the list of standbys
        """
        def get_daemon_names():
            return [info['name'] for info in self.mds_cluster.status().get_all()]

        if daemon_ids is None:
            daemon_ids = self.mds_cluster.mds_ids

        try:
            self.wait_until_true(
                lambda: set(daemon_ids) & set(get_daemon_names()) == set(daemon_ids),
                timeout=30
            )
        except RuntimeError:
            log.warning("Timeout waiting for daemons {0}, while we have {1}".format(
                daemon_ids, get_daemon_names()
            ))
            raise

    def delete_mds_coredump(self, daemon_id):
        # delete coredump file, otherwise teuthology.internal.coredump will
        # catch it later and treat it as a failure.
        core_pattern = self.mds_cluster.mds_daemons[daemon_id].remote.sh(
            "sudo sysctl -n kernel.core_pattern")
        core_dir = os.path.dirname(core_pattern.strip())
        if core_dir:  # Non-default core_pattern with a directory in it
            # We have seen a core_pattern that looks like it's from teuthology's coredump
            # task, so proceed to clear out the core file
            if core_dir[0] == '|':
                log.info("Piped core dumps to program {0}, skip cleaning".format(core_dir[1:]))
                return;

            log.info("Clearing core from directory: {0}".format(core_dir))

            # Verify that we see the expected single coredump
            ls_output = self.mds_cluster.mds_daemons[daemon_id].remote.sh([
                "cd", core_dir, run.Raw('&&'),
                "sudo", "ls", run.Raw('|'), "sudo", "xargs", "file"
            ])
            cores = [l.partition(":")[0]
                     for l in ls_output.strip().split("\n")
                     if re.match(r'.*ceph-mds.* -i +{0}'.format(daemon_id), l)]

            log.info("Enumerated cores: {0}".format(cores))
            self.assertEqual(len(cores), 1)

            log.info("Found core file {0}, deleting it".format(cores[0]))

            self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
                "cd", core_dir, run.Raw('&&'), "sudo", "rm", "-f", cores[0]
            ])
        else:
            log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")

    def _get_subtrees(self, status=None, rank=None, path=None):
        if path is None:
            path = "/"
        try:
            with contextutil.safe_while(sleep=1, tries=3) as proceed:
                while proceed():
                    try:
                        if rank == "all":
                            subtrees = []
                            for r in self.fs.get_ranks(status=status):
                                s = self.fs.rank_asok(["get", "subtrees"], status=status, rank=r['rank'])
                                s = filter(lambda s: s['auth_first'] == r['rank'] and s['auth_second'] == -2, s)
                                subtrees += s
                        else:
                            subtrees = self.fs.rank_asok(["get", "subtrees"], status=status, rank=rank)
                        subtrees = filter(lambda s: s['dir']['path'].startswith(path), subtrees)
                        return list(subtrees)
                    except CommandFailedError as e:
                        # Sometimes we get transient errors
                        if e.exitstatus == 22:
                            pass
                        else:
                            raise
        except contextutil.MaxWhileTries as e:
            raise RuntimeError(f"could not get subtree state from rank {rank}") from e

    def _wait_subtrees(self, test, status=None, rank=None, timeout=30, sleep=2, action=None, path=None):
        test = sorted(test)
        try:
            with contextutil.safe_while(sleep=sleep, tries=timeout//sleep) as proceed:
                while proceed():
                    subtrees = self._get_subtrees(status=status, rank=rank, path=path)
                    filtered = sorted([(s['dir']['path'], s['auth_first']) for s in subtrees])
                    log.info("%s =?= %s", filtered, test)
                    if filtered == test:
                        # Confirm export_pin in output is correct:
                        for s in subtrees:
                            if s['export_pin_target'] >= 0:
                                self.assertTrue(s['export_pin_target'] == s['auth_first'])
                        return subtrees
                    if action is not None:
                        action()
        except contextutil.MaxWhileTries as e:
            raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank)) from e

    def _wait_until_scrub_complete(self, path="/", recursive=True, timeout=100):
        out_json = self.fs.run_scrub(["start", path] + ["recursive"] if recursive else [])
        if not self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"],
                                                 sleep=10, timeout=timeout):
            log.info("timed out waiting for scrub to complete")

    def _wait_distributed_subtrees(self, count, status=None, rank=None, path=None):
        try:
            with contextutil.safe_while(sleep=5, tries=20) as proceed:
                while proceed():
                    subtrees = self._get_subtrees(status=status, rank=rank, path=path)
                    subtrees = list(filter(lambda s: s['distributed_ephemeral_pin'] == True and
                                                     s['auth_first'] == s['export_pin_target'],
                                           subtrees))
                    log.info(f"len={len(subtrees)} {subtrees}")
                    if len(subtrees) >= count:
                        return subtrees
        except contextutil.MaxWhileTries as e:
            raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank)) from e

    def _wait_random_subtrees(self, count, status=None, rank=None, path=None):
        try:
            with contextutil.safe_while(sleep=5, tries=20) as proceed:
                while proceed():
                    subtrees = self._get_subtrees(status=status, rank=rank, path=path)
                    subtrees = list(filter(lambda s: s['random_ephemeral_pin'] == True and
                                                     s['auth_first'] == s['export_pin_target'],
                                           subtrees))
                    log.info(f"len={len(subtrees)} {subtrees}")
                    if len(subtrees) >= count:
                        return subtrees
        except contextutil.MaxWhileTries as e:
            raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank)) from e

    def run_cluster_cmd(self, cmd):
        if isinstance(cmd, str):
            cmd = shlex_split(cmd)
        return self.fs.mon_manager.raw_cluster_cmd(*cmd)

    def run_cluster_cmd_result(self, cmd):
        if isinstance(cmd, str):
            cmd = shlex_split(cmd)
        return self.fs.mon_manager.raw_cluster_cmd_result(*cmd)

    def create_client(self, client_id, moncap=None, osdcap=None, mdscap=None):
        if not (moncap or osdcap or mdscap):
            if self.fs:
                return self.fs.authorize(client_id, ('/', 'rw'))
            else:
                raise RuntimeError('no caps were passed and the default FS '
                                   'is not created yet to allow client auth '
                                   'for it.')

        cmd = ['auth', 'add', f'client.{client_id}']
        if moncap:
            cmd += ['mon', moncap]
        if osdcap:
            cmd += ['osd', osdcap]
        if mdscap:
            cmd += ['mds', mdscap]

        self.run_cluster_cmd(cmd)
        return self.run_cluster_cmd(f'auth get {self.client_name}')
Commit	Line	Data
7c673cae FG	1	import json
7c673cae FG	2	import logging
7c673cae FG	3	import os
7c673cae FG	4	import re
7c673cae	5
f67539c2	6	from shlex import split as shlex_split
f67539c2 TL	7
f67539c2 TL	8	from tasks.ceph_test_case import CephTestCase
7c673cae	9
f6b5b4d7	10	from teuthology import contextutil
7c673cae	11	from teuthology.orchestra import run
20effc67	12	from teuthology.exceptions import CommandFailedError
7c673cae FG	13
	14	log = logging.getLogger(__name__)
	15
aee94f69 TL	16	def classhook(m):
	17	def dec(cls):
	18	getattr(cls, m)()
	19	return cls
	20	return dec
	21
7c673cae FG	22	def for_teuthology(f):
	23	"""
	24	Decorator that adds an "is_for_teuthology" attribute to the wrapped function
	25	"""
	26	f.is_for_teuthology = True
	27	return f
	28
	29
	30	def needs_trimming(f):
	31	"""
	32	Mark fn as requiring a client capable of trimming its cache (i.e. for ceph-fuse
	33	this means it needs to be able to run as root, currently)
	34	"""
	35	f.needs_trimming = True
	36	return f
	37
	38
f67539c2 TL	39	class MountDetails():
	40
	41	def __init__(self, mntobj):
	42	self.client_id = mntobj.client_id
	43	self.client_keyring_path = mntobj.client_keyring_path
	44	self.client_remote = mntobj.client_remote
	45	self.cephfs_name = mntobj.cephfs_name
	46	self.cephfs_mntpt = mntobj.cephfs_mntpt
	47	self.hostfs_mntpt = mntobj.hostfs_mntpt
	48
	49	def restore(self, mntobj):
	50	mntobj.client_id = self.client_id
	51	mntobj.client_keyring_path = self.client_keyring_path
	52	mntobj.client_remote = self.client_remote
	53	mntobj.cephfs_name = self.cephfs_name
	54	mntobj.cephfs_mntpt = self.cephfs_mntpt
	55	mntobj.hostfs_mntpt = self.hostfs_mntpt
	56
	57
7c673cae FG	58	class CephFSTestCase(CephTestCase):
	59	"""
	60	Test case for Ceph FS, requires caller to populate Filesystem and Mounts,
	61	into the fs, mount_a, mount_b class attributes (setting mount_b is optional)
	62
	63	Handles resetting the cluster under test between tests.
	64	"""
	65
	66	# FIXME weird explicit naming
	67	mount_a = None
	68	mount_b = None
181888fb	69	recovery_mount = None
7c673cae FG	70
	71	# Declarative test requirements: subclasses should override these to indicate
	72	# their special needs. If not met, tests will be skipped.
	73	CLIENTS_REQUIRED = 1
	74	MDSS_REQUIRED = 1
7c673cae	75	REQUIRE_ONE_CLIENT_REMOTE = False
7c673cae FG	76
	77	# Whether to create the default filesystem during setUp
	78	REQUIRE_FILESYSTEM = True
	79
f67539c2 TL	80	# create a backup filesystem if required.
	81	# required REQUIRE_FILESYSTEM enabled
	82	REQUIRE_BACKUP_FILESYSTEM = False
	83
9f95a23c	84	LOAD_SETTINGS = [] # type: ignore
7c673cae	85
f67539c2 TL	86	def _save_mount_details(self):
	87	"""
	88	XXX: Tests may change details of mount objects, so let's stash them so
	89	that these details are restored later to ensure smooth setUps and
	90	tearDowns for upcoming tests.
	91	"""
	92	self._orig_mount_details = [MountDetails(m) for m in self.mounts]
	93	log.info(self._orig_mount_details)
	94
39ae355f TL	95	def _remove_blocklist(self):
	96	# In case anything is in the OSD blocklist list, clear it out. This is to avoid
	97	# the OSD map changing in the background (due to blocklist expiry) while tests run.
	98	try:
	99	self.mds_cluster.mon_manager.run_cluster_cmd(args="osd blocklist clear")
	100	except CommandFailedError:
	101	# Fallback for older Ceph cluster
	102	try:
	103	blocklist = json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd("osd",
	104	"dump", "--format=json-pretty"))['blocklist']
	105	log.info(f"Removing {len(blocklist)} blocklist entries")
	106	for addr, blocklisted_at in blocklist.items():
	107	self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blocklist", "rm", addr)
	108	except KeyError:
	109	# Fallback for more older Ceph clusters, who will use 'blacklist' instead.
	110	blacklist = json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd("osd",
	111	"dump", "--format=json-pretty"))['blacklist']
	112	log.info(f"Removing {len(blacklist)} blacklist entries")
	113	for addr, blocklisted_at in blacklist.items():
	114	self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "rm", addr)
	115
7c673cae FG	116	def setUp(self):
	117	super(CephFSTestCase, self).setUp()
	118
f91f0fd5 TL	119	self.config_set('mon', 'mon_allow_pool_delete', True)
f91f0fd5 TL	120
7c673cae	121	if len(self.mds_cluster.mds_ids) < self.MDSS_REQUIRED:
9f95a23c	122	self.skipTest("Only have {0} MDSs, require {1}".format(
7c673cae FG	123	len(self.mds_cluster.mds_ids), self.MDSS_REQUIRED
	124	))
	125
	126	if len(self.mounts) < self.CLIENTS_REQUIRED:
9f95a23c	127	self.skipTest("Only have {0} clients, require {1}".format(
7c673cae FG	128	len(self.mounts), self.CLIENTS_REQUIRED
	129	))
	130
7c673cae FG	131	if self.REQUIRE_ONE_CLIENT_REMOTE:
7c673cae FG	132	if self.mounts[0].client_remote.hostname in self.mds_cluster.get_mds_hostnames():
9f95a23c	133	self.skipTest("Require first client to be on separate server from MDSs")
7c673cae	134
7c673cae FG	135	# Create friendly mount_a, mount_b attrs
	136	for i in range(0, self.CLIENTS_REQUIRED):
	137	setattr(self, "mount_{0}".format(chr(ord('a') + i)), self.mounts[i])
	138
	139	self.mds_cluster.clear_firewall()
	140
	141	# Unmount all clients, we are about to blow away the filesystem
	142	for mount in self.mounts:
	143	if mount.is_mounted():
	144	mount.umount_wait(force=True)
f67539c2	145	self._save_mount_details()
7c673cae FG	146
	147	# To avoid any issues with e.g. unlink bugs, we destroy and recreate
	148	# the filesystem rather than just doing a rm -rf of files
7c673cae	149	self.mds_cluster.delete_all_filesystems()
92f5a8d4	150	self.mds_cluster.mds_restart() # to reset any run-time configs, etc.
7c673cae	151	self.fs = None # is now invalid!
f67539c2	152	self.backup_fs = None
181888fb	153	self.recovery_fs = None
7c673cae	154
39ae355f	155	self._remove_blocklist()
7c673cae FG	156
7c673cae FG	157	client_mount_ids = [m.client_id for m in self.mounts]
7c673cae FG	158	# In case there were any extra auth identities around from a previous
	159	# test, delete them
	160	for entry in self.auth_list():
	161	ent_type, ent_id = entry['entity'].split(".")
f67539c2	162	if ent_type == "client" and ent_id not in client_mount_ids and not (ent_id == "admin" or ent_id[:6] == 'mirror'):
7c673cae FG	163	self.mds_cluster.mon_manager.raw_cluster_cmd("auth", "del", entry['entity'])
	164
	165	if self.REQUIRE_FILESYSTEM:
181888fb	166	self.fs = self.mds_cluster.newfs(create=True)
7c673cae FG	167
	168	# In case some test messed with auth caps, reset them
	169	for client_id in client_mount_ids:
f67539c2	170	cmd = ['auth', 'caps', f'client.{client_id}', 'mon','allow r',
1e59de90	171	'osd', f'allow rw tag cephfs data={self.fs.name}',
f67539c2 TL	172	'mds', 'allow']
	173
	174	if self.run_cluster_cmd_result(cmd) == 0:
	175	break
	176
	177	cmd[1] = 'add'
	178	if self.run_cluster_cmd_result(cmd) != 0:
	179	raise RuntimeError(f'Failed to create new client {cmd[2]}')
7c673cae	180
92f5a8d4	181	# wait for ranks to become active
7c673cae FG	182	self.fs.wait_for_daemons()
	183
	184	# Mount the requested number of clients
	185	for i in range(0, self.CLIENTS_REQUIRED):
e306af50	186	self.mounts[i].mount_wait()
7c673cae	187
f67539c2 TL	188	if self.REQUIRE_BACKUP_FILESYSTEM:
	189	if not self.REQUIRE_FILESYSTEM:
	190	self.skipTest("backup filesystem requires a primary filesystem as well")
	191	self.fs.mon_manager.raw_cluster_cmd('fs', 'flag', 'set',
	192	'enable_multiple', 'true',
	193	'--yes-i-really-mean-it')
	194	self.backup_fs = self.mds_cluster.newfs(name="backup_fs")
	195	self.backup_fs.wait_for_daemons()
	196
7c673cae FG	197	# Load an config settings of interest
7c673cae FG	198	for setting in self.LOAD_SETTINGS:
c07f9fc5	199	setattr(self, setting, float(self.fs.mds_asok(
e306af50	200	['config', 'get', setting], list(self.mds_cluster.mds_ids)[0]
7c673cae FG	201	)[setting]))
	202
	203	self.configs_set = set()
	204
	205	def tearDown(self):
7c673cae FG	206	self.mds_cluster.clear_firewall()
	207	for m in self.mounts:
	208	m.teardown()
	209
f67539c2 TL	210	# To prevent failover messages during Unwind of ceph task
	211	self.mds_cluster.delete_all_filesystems()
	212
	213	for m, md in zip(self.mounts, self._orig_mount_details):
	214	md.restore(m)
7c673cae FG	215
	216	for subsys, key in self.configs_set:
	217	self.mds_cluster.clear_ceph_conf(subsys, key)
	218
9f95a23c TL	219	return super(CephFSTestCase, self).tearDown()
9f95a23c TL	220
7c673cae FG	221	def set_conf(self, subsys, key, value):
	222	self.configs_set.add((subsys, key))
	223	self.mds_cluster.set_ceph_conf(subsys, key, value)
	224
	225	def auth_list(self):
	226	"""
c07f9fc5	227	Convenience wrapper on "ceph auth ls"
7c673cae FG	228	"""
7c673cae FG	229	return json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd(
c07f9fc5	230	"auth", "ls", "--format=json-pretty"
7c673cae FG	231	))['auth_dump']
	232
	233	def assert_session_count(self, expected, ls_data=None, mds_id=None):
	234	if ls_data is None:
	235	ls_data = self.fs.mds_asok(['session', 'ls'], mds_id=mds_id)
	236
31f18b77 FG	237	alive_count = len([s for s in ls_data if s['state'] != 'killing'])
	238
	239	self.assertEqual(expected, alive_count, "Expected {0} sessions, found {1}".format(
	240	expected, alive_count
7c673cae FG	241	))
	242
	243	def assert_session_state(self, client_id, expected_state):
	244	self.assertEqual(
	245	self._session_by_id(
	246	self.fs.mds_asok(['session', 'ls'])).get(client_id, {'state': None})['state'],
	247	expected_state)
	248
	249	def get_session_data(self, client_id):
	250	return self._session_by_id(client_id)
	251
	252	def _session_list(self):
	253	ls_data = self.fs.mds_asok(['session', 'ls'])
	254	ls_data = [s for s in ls_data if s['state'] not in ['stale', 'closed']]
	255	return ls_data
	256
	257	def get_session(self, client_id, session_ls=None):
	258	if session_ls is None:
	259	session_ls = self.fs.mds_asok(['session', 'ls'])
	260
	261	return self._session_by_id(session_ls)[client_id]
	262
	263	def _session_by_id(self, session_ls):
	264	return dict([(s['id'], s) for s in session_ls])
	265
adb31ebb TL	266	def perf_dump(self, rank=None, status=None):
	267	return self.fs.rank_asok(['perf', 'dump'], rank=rank, status=status)
	268
92f5a8d4 TL	269	def wait_until_evicted(self, client_id, timeout=30):
	270	def is_client_evicted():
	271	ls = self._session_list()
	272	for s in ls:
	273	if s['id'] == client_id:
	274	return False
	275	return True
	276	self.wait_until_true(is_client_evicted, timeout)
	277
7c673cae FG	278	def wait_for_daemon_start(self, daemon_ids=None):
	279	"""
	280	Wait until all the daemons appear in the FSMap, either assigned
	281	MDS ranks or in the list of standbys
	282	"""
	283	def get_daemon_names():
	284	return [info['name'] for info in self.mds_cluster.status().get_all()]
	285
	286	if daemon_ids is None:
	287	daemon_ids = self.mds_cluster.mds_ids
	288
	289	try:
	290	self.wait_until_true(
	291	lambda: set(daemon_ids) & set(get_daemon_names()) == set(daemon_ids),
	292	timeout=30
	293	)
	294	except RuntimeError:
e306af50	295	log.warning("Timeout waiting for daemons {0}, while we have {1}".format(
7c673cae FG	296	daemon_ids, get_daemon_names()
	297	))
	298	raise
	299
11fdf7f2 TL	300	def delete_mds_coredump(self, daemon_id):
	301	# delete coredump file, otherwise teuthology.internal.coredump will
	302	# catch it later and treat it as a failure.
e306af50 TL	303	core_pattern = self.mds_cluster.mds_daemons[daemon_id].remote.sh(
	304	"sudo sysctl -n kernel.core_pattern")
	305	core_dir = os.path.dirname(core_pattern.strip())
11fdf7f2 TL	306	if core_dir: # Non-default core_pattern with a directory in it
	307	# We have seen a core_pattern that looks like it's from teuthology's coredump
	308	# task, so proceed to clear out the core file
f6b5b4d7 TL	309	if core_dir[0] == '\|':
	310	log.info("Piped core dumps to program {0}, skip cleaning".format(core_dir[1:]))
	311	return;
	312
11fdf7f2 TL	313	log.info("Clearing core from directory: {0}".format(core_dir))
	314
	315	# Verify that we see the expected single coredump
e306af50	316	ls_output = self.mds_cluster.mds_daemons[daemon_id].remote.sh([
11fdf7f2 TL	317	"cd", core_dir, run.Raw('&&'),
11fdf7f2 TL	318	"sudo", "ls", run.Raw('\|'), "sudo", "xargs", "file"
e306af50	319	])
11fdf7f2	320	cores = [l.partition(":")[0]
e306af50	321	for l in ls_output.strip().split("\n")
11fdf7f2 TL	322	if re.match(r'.ceph-mds. -i +{0}'.format(daemon_id), l)]
	323
	324	log.info("Enumerated cores: {0}".format(cores))
	325	self.assertEqual(len(cores), 1)
	326
	327	log.info("Found core file {0}, deleting it".format(cores[0]))
	328
	329	self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
	330	"cd", core_dir, run.Raw('&&'), "sudo", "rm", "-f", cores[0]
	331	])
7c673cae	332	else:
11fdf7f2	333	log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")
81eedcae	334
f6b5b4d7 TL	335	def _get_subtrees(self, status=None, rank=None, path=None):
	336	if path is None:
	337	path = "/"
	338	try:
	339	with contextutil.safe_while(sleep=1, tries=3) as proceed:
	340	while proceed():
	341	try:
	342	if rank == "all":
	343	subtrees = []
	344	for r in self.fs.get_ranks(status=status):
	345	s = self.fs.rank_asok(["get", "subtrees"], status=status, rank=r['rank'])
	346	s = filter(lambda s: s['auth_first'] == r['rank'] and s['auth_second'] == -2, s)
	347	subtrees += s
	348	else:
	349	subtrees = self.fs.rank_asok(["get", "subtrees"], status=status, rank=rank)
	350	subtrees = filter(lambda s: s['dir']['path'].startswith(path), subtrees)
	351	return list(subtrees)
	352	except CommandFailedError as e:
	353	# Sometimes we get transient errors
	354	if e.exitstatus == 22:
	355	pass
	356	else:
	357	raise
	358	except contextutil.MaxWhileTries as e:
	359	raise RuntimeError(f"could not get subtree state from rank {rank}") from e
	360
	361	def _wait_subtrees(self, test, status=None, rank=None, timeout=30, sleep=2, action=None, path=None):
81eedcae	362	test = sorted(test)
f6b5b4d7 TL	363	try:
	364	with contextutil.safe_while(sleep=sleep, tries=timeout//sleep) as proceed:
	365	while proceed():
	366	subtrees = self._get_subtrees(status=status, rank=rank, path=path)
	367	filtered = sorted([(s['dir']['path'], s['auth_first']) for s in subtrees])
	368	log.info("%s =?= %s", filtered, test)
	369	if filtered == test:
	370	# Confirm export_pin in output is correct:
	371	for s in subtrees:
f67539c2 TL	372	if s['export_pin_target'] >= 0:
f67539c2 TL	373	self.assertTrue(s['export_pin_target'] == s['auth_first'])
f6b5b4d7 TL	374	return subtrees
	375	if action is not None:
	376	action()
	377	except contextutil.MaxWhileTries as e:
	378	raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank)) from e
e306af50	379
f67539c2 TL	380	def _wait_until_scrub_complete(self, path="/", recursive=True, timeout=100):
	381	out_json = self.fs.run_scrub(["start", path] + ["recursive"] if recursive else [])
	382	if not self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"],
	383	sleep=10, timeout=timeout):
	384	log.info("timed out waiting for scrub to complete")
e306af50	385
f6b5b4d7 TL	386	def _wait_distributed_subtrees(self, count, status=None, rank=None, path=None):
	387	try:
	388	with contextutil.safe_while(sleep=5, tries=20) as proceed:
	389	while proceed():
	390	subtrees = self._get_subtrees(status=status, rank=rank, path=path)
f67539c2 TL	391	subtrees = list(filter(lambda s: s['distributed_ephemeral_pin'] == True and
	392	s['auth_first'] == s['export_pin_target'],
	393	subtrees))
f6b5b4d7 TL	394	log.info(f"len={len(subtrees)} {subtrees}")
	395	if len(subtrees) >= count:
	396	return subtrees
	397	except contextutil.MaxWhileTries as e:
	398	raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank)) from e
	399
	400	def _wait_random_subtrees(self, count, status=None, rank=None, path=None):
	401	try:
	402	with contextutil.safe_while(sleep=5, tries=20) as proceed:
	403	while proceed():
	404	subtrees = self._get_subtrees(status=status, rank=rank, path=path)
f67539c2 TL	405	subtrees = list(filter(lambda s: s['random_ephemeral_pin'] == True and
	406	s['auth_first'] == s['export_pin_target'],
	407	subtrees))
f6b5b4d7 TL	408	log.info(f"len={len(subtrees)} {subtrees}")
	409	if len(subtrees) >= count:
	410	return subtrees
	411	except contextutil.MaxWhileTries as e:
	412	raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank)) from e
f67539c2 TL	413
	414	def run_cluster_cmd(self, cmd):
	415	if isinstance(cmd, str):
	416	cmd = shlex_split(cmd)
	417	return self.fs.mon_manager.raw_cluster_cmd(*cmd)
	418
	419	def run_cluster_cmd_result(self, cmd):
	420	if isinstance(cmd, str):
	421	cmd = shlex_split(cmd)
	422	return self.fs.mon_manager.raw_cluster_cmd_result(*cmd)
	423
	424	def create_client(self, client_id, moncap=None, osdcap=None, mdscap=None):
	425	if not (moncap or osdcap or mdscap):
	426	if self.fs:
	427	return self.fs.authorize(client_id, ('/', 'rw'))
	428	else:
	429	raise RuntimeError('no caps were passed and the default FS '
	430	'is not created yet to allow client auth '
	431	'for it.')
	432
	433	cmd = ['auth', 'add', f'client.{client_id}']
	434	if moncap:
	435	cmd += ['mon', moncap]
	436	if osdcap:
	437	cmd += ['osd', osdcap]
	438	if mdscap:
	439	cmd += ['mds', mdscap]
	440
	441	self.run_cluster_cmd(cmd)
	442	return self.run_cluster_cmd(f'auth get {self.client_name}')