]>
Commit | Line | Data |
---|---|---|
94b18763 FG |
1 | """ |
2 | Test CephFS scrub (distinct from OSD scrub) functionality | |
3 | """ | |
f67539c2 TL |
4 | |
5 | from io import BytesIO | |
94b18763 | 6 | import logging |
94b18763 FG |
7 | from collections import namedtuple |
8 | ||
9f95a23c | 9 | from tasks.cephfs.cephfs_test_case import CephFSTestCase |
94b18763 FG |
10 | |
11 | log = logging.getLogger(__name__) | |
12 | ||
13 | ValidationError = namedtuple("ValidationError", ["exception", "backtrace"]) | |
14 | ||
15 | ||
class Workload(CephFSTestCase):
    """
    Base class for a scrub workload: subclasses write some files to the
    mount, damage the filesystem pools, and then validate what a repair
    scrub was able to recover.
    """
    def __init__(self, test, filesystem, mount):
        super().__init__()
        self._test = test
        self._mount = mount
        self._filesystem = filesystem
        self._initial_state = None
        # Accumulate backtraces for every failed validation, and return them.
        # Backtraces are rather verbose, but we only see them when something
        # breaks, and they let us see which check failed without having to
        # decorate each check with a string.
        self._errors = []

    def write(self):
        """
        Write the workload files to the mount.
        """
        raise NotImplementedError()

    def validate(self):
        """
        Read from the mount and validate that the workload files are present
        (i.e. have survived or been reconstructed from the test scenario).
        """
        raise NotImplementedError()

    def damage(self):
        """
        Damage the filesystem pools in ways that will be interesting to
        recover from.  By default just wipe everything in the metadata pool.
        """
        # Delete every object in the metadata pool
        metadata_pool = self._filesystem.get_metadata_pool_name()
        self._filesystem.rados(["purge", metadata_pool, '--yes-i-really-really-mean-it'])

    def flush(self):
        """
        Called after client unmount, after write: flush whatever you want.
        """
        self._filesystem.mds_asok(["flush", "journal"])
57 | ||
58 | ||
class BacktraceWorkload(Workload):
    """
    Single file in a single directory: wipe the file's backtrace xattr and
    check that scrub restores it.
    """
    def write(self):
        self._mount.run_shell(["mkdir", "subdir"])
        self._mount.write_n_mb("subdir/sixmegs", 6)

    def validate(self):
        # Flush the journal so the backtrace is persisted, then check that
        # the innermost ancestor dentry of the inode is the file itself.
        file_stat = self._mount.stat("subdir/sixmegs")
        self._filesystem.mds_asok(["flush", "journal"])
        backtrace = self._filesystem.read_backtrace(file_stat['st_ino'])
        self.assertEqual(backtrace['ancestors'][0]['dname'], 'sixmegs')
        return self._errors

    def damage(self):
        # Overwrite the on-disk "parent" xattr (the backtrace) with an
        # empty value.
        file_stat = self._mount.stat("subdir/sixmegs")
        self._filesystem.mds_asok(["flush", "journal"])
        self._filesystem._write_data_xattr(file_stat['st_ino'], "parent", "")

    def create_files(self, nfiles=1000):
        self._mount.create_n_files("scrub-new-files/file", nfiles)
82 | ||
94b18763 FG |
83 | |
class DupInodeWorkload(Workload):
    """
    Duplicate an inode and try scrubbing it twice.
    """

    def write(self):
        self._mount.run_shell(["mkdir", "parent"])
        self._mount.run_shell(["mkdir", "parent/child"])
        self._mount.write_n_mb("parent/parentfile", 6)
        self._mount.write_n_mb("parent/child/childfile", 6)

    def damage(self):
        # Take the MDS offline so we can edit the metadata pool directly.
        self._mount.umount_wait()
        self._filesystem.mds_asok(["flush", "journal"])
        self._filesystem.fail()
        # Copy the dentry omap value for parentfile into a second dentry
        # ("shadow") in the same dirfrag, creating a duplicate inode.
        dentry_value = self._filesystem.radosmo(["getomapval", "10000000000.00000000", "parentfile_head", "-"])
        self._filesystem.radosm(["setomapval", "10000000000.00000000", "shadow_head"], stdin=BytesIO(dentry_value))
        # The MDS would normally refuse to load such metadata; allow it for
        # the purposes of this test.
        self._test.config_set('mds', 'mds_hack_allow_loading_invalid_metadata', True)
        self._filesystem.set_joinable()
        self._filesystem.wait_for_daemons()

    def validate(self):
        # A recursive repair scrub must complete and leave the daemons healthy.
        scrub_status = self._filesystem.run_scrub(["start", "/", "recursive,repair"])
        self.assertNotEqual(scrub_status, None)
        self.assertEqual(scrub_status["return_code"], 0)
        self.assertEqual(self._filesystem.wait_until_scrub_complete(tag=scrub_status["scrub_tag"]), True)
        self.assertTrue(self._filesystem.are_daemons_healthy())
        return self._errors
112 | ||
113 | ||
class TestScrub(CephFSTestCase):
    MDSS_REQUIRED = 1

    def setUp(self):
        super().setUp()

    def _scrub(self, workload, workers=1):
        """
        Write the workload's files, let the workload damage the pools, then
        run a recursive repair scrub and check that the workload's files
        still validate cleanly afterwards.
        """
        # First, inject some files
        workload.write()

        # These options are off by default, but in QA we need to explicitly
        # disable them (QA turns them on).
        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)

        # Apply any data damage the workload wants
        workload.damage()

        # Kick off a recursive repair scrub from the root and wait for it.
        scrub_status = self.fs.run_scrub(["start", "/", "recursive,repair"])
        self.assertNotEqual(scrub_status, None)
        self.assertEqual(scrub_status["return_code"], 0)
        self.assertEqual(self.fs.wait_until_scrub_complete(tag=scrub_status["scrub_tag"]), True)

        # See that the files are present and correct
        errors = workload.validate()
        if not errors:
            return
        log.error("Validation errors found: {0}".format(len(errors)))
        for error in errors:
            log.error(error.exception)
            log.error(error.backtrace)
        raise AssertionError("Validation failed, first error: {0}\n{1}".format(
            errors[0].exception, errors[0].backtrace
        ))

    def _get_damage_count(self, damage_type='backtrace'):
        """
        Count the entries of the given type in the MDS damage table.
        """
        damage_entries = self.fs.rank_tell(["damage", "ls"])
        self.assertNotEqual(damage_entries, None)
        return sum(1 for entry in damage_entries
                   if entry['damage_type'] == damage_type)

    def _scrub_new_files(self, workload):
        """
        That scrubbing new files does not lead to errors.
        """
        workload.create_files(1000)
        self.fs.wait_until_scrub_complete()
        self.assertEqual(self._get_damage_count(), 0)

    def test_scrub_backtrace_for_new_files(self):
        self._scrub_new_files(BacktraceWorkload(self, self.fs, self.mount_a))

    def test_scrub_backtrace(self):
        self._scrub(BacktraceWorkload(self, self.fs, self.mount_a))

    def test_scrub_dup_inode(self):
        self._scrub(DupInodeWorkload(self, self.fs, self.mount_a))