import json
import logging
from unittest import case
from tasks.ceph_test_case import CephTestCase
import os
import re
from StringIO import StringIO

from tasks.cephfs.fuse_mount import FuseMount

from teuthology.orchestra import run
from teuthology.orchestra.run import CommandFailedError


log = logging.getLogger(__name__)


def for_teuthology(f):
    """
    Decorator that adds an "is_for_teuthology" attribute to the wrapped function
    """
    f.is_for_teuthology = True
    return f
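
# Usage sketch (hypothetical test, not from this file): the attribute set by
# @for_teuthology lets the runner filter out tests that are only appropriate
# for full teuthology runs, e.g.:
#
#   class TestExample(CephFSTestCase):
#       @for_teuthology
#       def test_very_long_running(self):
#           ...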


def needs_trimming(f):
    """
    Mark fn as requiring a client capable of trimming its cache (i.e. for
    ceph-fuse this currently means it needs to be able to run as root)
    """
    f.needs_trimming = True
    return f
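
# Usage sketch (hypothetical): a test that relies on dropping the client
# cache would be marked like so, and skipped where the client cannot trim:
#
#   @needs_trimming
#   def test_client_cache_trim(self):
#       ...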


class CephFSTestCase(CephTestCase):
    """
    Test case for CephFS.  Requires the caller to populate the fs, mount_a
    and mount_b class attributes with a Filesystem and Mounts (setting
    mount_b is optional).

    Handles resetting the cluster under test between tests.
    """

    # FIXME weird explicit naming
    mount_a = None
    mount_b = None

    # Declarative test requirements: subclasses should override these to indicate
    # their special needs.  If not met, tests will be skipped.
    CLIENTS_REQUIRED = 1
    MDSS_REQUIRED = 1
    REQUIRE_KCLIENT_REMOTE = False
    REQUIRE_ONE_CLIENT_REMOTE = False
    REQUIRE_MEMSTORE = False

    # Whether to create the default filesystem during setUp
    REQUIRE_FILESYSTEM = True

    LOAD_SETTINGS = []
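
    # Example (hypothetical subclass, not from this file) of overriding the
    # declarative requirements above:
    #
    #   class TestMultiClient(CephFSTestCase):
    #       CLIENTS_REQUIRED = 2
    #       MDSS_REQUIRED = 2
    #       REQUIRE_MEMSTORE = True  # e.g. for tests that fill the cluster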

    def setUp(self):
        super(CephFSTestCase, self).setUp()

        if len(self.mds_cluster.mds_ids) < self.MDSS_REQUIRED:
            raise case.SkipTest("Only have {0} MDSs, require {1}".format(
                len(self.mds_cluster.mds_ids), self.MDSS_REQUIRED
            ))

        if len(self.mounts) < self.CLIENTS_REQUIRED:
            raise case.SkipTest("Only have {0} clients, require {1}".format(
                len(self.mounts), self.CLIENTS_REQUIRED
            ))

        if self.REQUIRE_KCLIENT_REMOTE:
            if not isinstance(self.mounts[0], FuseMount) or not isinstance(self.mounts[1], FuseMount):
                # kclient kill() power cycles nodes, so requires clients to each be on
                # their own node
                if self.mounts[0].client_remote.hostname == self.mounts[1].client_remote.hostname:
                    raise case.SkipTest("kclient clients must be on separate nodes")

        if self.REQUIRE_ONE_CLIENT_REMOTE:
            if self.mounts[0].client_remote.hostname in self.mds_cluster.get_mds_hostnames():
                raise case.SkipTest("Require first client to be on a separate server from MDSs")

        if self.REQUIRE_MEMSTORE:
            objectstore = self.mds_cluster.get_config("osd_objectstore", "osd")
            if objectstore != "memstore":
                # You certainly *could* run this on a real OSD, but you don't want to sit
                # here for hours waiting for the test to fill up a 1TB drive!
                raise case.SkipTest("Require `memstore` OSD backend to simulate full drives")

        # Create friendly mount_a, mount_b attrs
        for i in range(0, self.CLIENTS_REQUIRED):
            setattr(self, "mount_{0}".format(chr(ord('a') + i)), self.mounts[i])

        self.mds_cluster.clear_firewall()

        # Unmount all clients, we are about to blow away the filesystem
        for mount in self.mounts:
            if mount.is_mounted():
                mount.umount_wait(force=True)

        # To avoid any issues with e.g. unlink bugs, we destroy and recreate
        # the filesystem rather than just doing a rm -rf of files
        self.mds_cluster.mds_stop()
        self.mds_cluster.mds_fail()
        self.mds_cluster.delete_all_filesystems()
        self.fs = None  # is now invalid!

        # In case the previous filesystem had filled up the RADOS cluster, wait for that
        # flag to pass.
        osd_mon_report_interval_max = int(self.mds_cluster.get_config("osd_mon_report_interval_max", service_type='osd'))
        self.wait_until_true(lambda: not self.mds_cluster.is_full(),
                             timeout=osd_mon_report_interval_max * 5)

        # In case anything is in the OSD blacklist, clear it out.  This is to avoid
        # the OSD map changing in the background (due to blacklist expiry) while tests run.
        try:
            self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "clear")
        except CommandFailedError:
            # Fallback for older Ceph clusters
            blacklist = json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd("osd",
                                   "dump", "--format=json-pretty"))['blacklist']
            log.info("Removing {0} blacklist entries".format(len(blacklist)))
            for addr, blacklisted_at in blacklist.items():
                self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "rm", addr)

        client_mount_ids = [m.client_id for m in self.mounts]
        # In case the test changes the IDs of clients, stash them so that we can
        # reset in tearDown
        self._original_client_ids = client_mount_ids
        log.info(client_mount_ids)

        # In case there were any extra auth identities around from a previous
        # test, delete them
        for entry in self.auth_list():
            ent_type, ent_id = entry['entity'].split(".")
            if ent_type == "client" and ent_id not in client_mount_ids and ent_id != "admin":
                self.mds_cluster.mon_manager.raw_cluster_cmd("auth", "del", entry['entity'])

        if self.REQUIRE_FILESYSTEM:
            self.fs = self.mds_cluster.newfs(True)
            self.fs.mds_restart()

            # In case some test messed with auth caps, reset them
            for client_id in client_mount_ids:
                self.mds_cluster.mon_manager.raw_cluster_cmd_result(
                    'auth', 'caps', "client.{0}".format(client_id),
                    'mds', 'allow',
                    'mon', 'allow r',
                    'osd', 'allow rw pool={0}'.format(self.fs.get_data_pool_name()))

            # wait for mds restart to complete...
            self.fs.wait_for_daemons()

            # Mount the requested number of clients
            for i in range(0, self.CLIENTS_REQUIRED):
                self.mounts[i].mount()
                self.mounts[i].wait_until_mounted()

        # Load any config settings of interest
        for setting in self.LOAD_SETTINGS:
            setattr(self, setting, float(self.fs.mds_asok(
                ['config', 'get', setting], self.mds_cluster.mds_ids[0]
            )[setting]))

        self.configs_set = set()

    def tearDown(self):
        super(CephFSTestCase, self).tearDown()

        self.mds_cluster.clear_firewall()
        for m in self.mounts:
            m.teardown()

        for i, m in enumerate(self.mounts):
            m.client_id = self._original_client_ids[i]

        for subsys, key in self.configs_set:
            self.mds_cluster.clear_ceph_conf(subsys, key)

    def set_conf(self, subsys, key, value):
        self.configs_set.add((subsys, key))
        self.mds_cluster.set_ceph_conf(subsys, key, value)
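
    # Note: pairs registered via set_conf are reverted in tearDown through
    # clear_ceph_conf.  Hypothetical usage from a test:
    #   self.set_conf('mds', 'mds_log_max_segments', '2')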

    def auth_list(self):
        """
        Convenience wrapper on "ceph auth ls"
        """
        return json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd(
            "auth", "ls", "--format=json-pretty"
        ))['auth_dump']

    def assert_session_count(self, expected, ls_data=None, mds_id=None):
        if ls_data is None:
            ls_data = self.fs.mds_asok(['session', 'ls'], mds_id=mds_id)

        alive_count = len([s for s in ls_data if s['state'] != 'killing'])

        self.assertEqual(expected, alive_count, "Expected {0} sessions, found {1}".format(
            expected, alive_count
        ))

    def assert_session_state(self, client_id, expected_state):
        self.assertEqual(
            self._session_by_id(
                self.fs.mds_asok(['session', 'ls'])).get(client_id, {'state': None})['state'],
            expected_state)

    def get_session_data(self, client_id):
        return self.get_session(client_id)

    def _session_list(self):
        ls_data = self.fs.mds_asok(['session', 'ls'])
        ls_data = [s for s in ls_data if s['state'] not in ['stale', 'closed']]
        return ls_data

    def get_session(self, client_id, session_ls=None):
        if session_ls is None:
            session_ls = self.fs.mds_asok(['session', 'ls'])

        return self._session_by_id(session_ls)[client_id]

    def _session_by_id(self, session_ls):
        return dict([(s['id'], s) for s in session_ls])
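
    # For reference, these helpers only rely on each "session ls" entry
    # carrying 'id' and 'state' keys; a simplified (assumed) entry:
    #   {'id': 4235, 'state': 'open', ...}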

    def wait_for_daemon_start(self, daemon_ids=None):
        """
        Wait until all the daemons appear in the FSMap, either assigned
        MDS ranks or in the list of standbys
        """
        def get_daemon_names():
            return [info['name'] for info in self.mds_cluster.status().get_all()]

        if daemon_ids is None:
            daemon_ids = self.mds_cluster.mds_ids

        try:
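            # i.e. wait until daemon_ids is a subset of the names present
            # in the FSMap (ranks or standbys)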
            self.wait_until_true(
                lambda: set(daemon_ids) & set(get_daemon_names()) == set(daemon_ids),
                timeout=30
            )
        except RuntimeError:
            log.warn("Timeout waiting for daemons {0}, while we have {1}".format(
                daemon_ids, get_daemon_names()
            ))
            raise

    def assert_mds_crash(self, daemon_id):
        """
        Assert that a particular MDS daemon crashes (block until
        it does)
        """
        try:
            self.mds_cluster.mds_daemons[daemon_id].proc.wait()
        except CommandFailedError as e:
            log.info("MDS '{0}' crashed with status {1} as expected".format(daemon_id, e.exitstatus))
            self.mds_cluster.mds_daemons[daemon_id].proc = None

            # Go remove the coredump from the crash, otherwise teuthology.internal.coredump will
            # catch it later and treat it as a failure.
            p = self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
                "sudo", "sysctl", "-n", "kernel.core_pattern"], stdout=StringIO())
            core_pattern = p.stdout.getvalue().strip()
            if os.path.dirname(core_pattern):  # Non-default core_pattern with a directory in it
                # We have seen a core_pattern that looks like it's from teuthology's coredump
                # task, so proceed to clear out the core file
                log.info("Clearing core from pattern: {0}".format(core_pattern))

                # Determine the PID of the crashed MDS by inspecting the MDSMap; it had
                # to talk to the mons to get assigned a rank to reach the point of crashing
                addr = self.mds_cluster.mon_manager.get_mds_status(daemon_id)['addr']
                pid_str = addr.split("/")[1]
                log.info("Determined crasher PID was {0}".format(pid_str))

                # Substitute PID into core_pattern to get a glob
                core_glob = core_pattern.replace("%p", pid_str)
                core_glob = re.sub("%[a-z]", "*", core_glob)  # Match all for all other % tokens
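                # e.g. (hypothetical values) a core_pattern of "/tmp/cores/%e.%p.core"
                # with PID 1234 yields the glob "/tmp/cores/*.1234.core"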

                # Verify that we see the expected single coredump matching the expected pattern
                ls_proc = self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
                    "sudo", "ls", run.Raw(core_glob)
                ], stdout=StringIO())
                cores = [f for f in ls_proc.stdout.getvalue().strip().split("\n") if f]
                log.info("Enumerated cores: {0}".format(cores))
                self.assertEqual(len(cores), 1)

                log.info("Found core file {0}, deleting it".format(cores[0]))

                self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
                    "sudo", "rm", "-f", cores[0]
                ])
            else:
                log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")

        else:
            raise AssertionError("MDS daemon '{0}' did not crash as expected".format(daemon_id))