1 | """ |
2 | Test our tools for recovering metadata from the data pool into an alternate pool | |
3 | """ | |
181888fb FG |
4 | |
5 | import logging | |
181888fb | 6 | import traceback |
9f95a23c | 7 | from collections import namedtuple |
181888fb FG |
8 | |
9 | from teuthology.orchestra.run import CommandFailedError | |
9f95a23c | 10 | from tasks.cephfs.cephfs_test_case import CephFSTestCase |
181888fb FG |
11 | |
12 | log = logging.getLogger(__name__) | |
13 | ||
14 | ||
15 | ValidationError = namedtuple("ValidationError", ["exception", "backtrace"]) | |
16 | ||
17 | ||
class OverlayWorkload(object):
    def __init__(self, orig_fs, recovery_fs, orig_mount, recovery_mount):
        self._orig_fs = orig_fs
        self._recovery_fs = recovery_fs
        self._orig_mount = orig_mount
        self._recovery_mount = recovery_mount
        self._initial_state = None

        # Accumulate backtraces for every failed validation, and return them. Backtraces
        # are rather verbose, but we only see them when something breaks, and they
        # let us see which check failed without having to decorate each check with
        # a string
        self._errors = []

    def assert_equal(self, a, b):
        try:
            if a != b:
                raise AssertionError("{0} != {1}".format(a, b))
        except AssertionError as e:
            self._errors.append(
                ValidationError(e, traceback.format_exc(3))
            )

    def write(self):
        """
        Write the workload files to the mount
        """
        raise NotImplementedError()

    def validate(self):
        """
        Read from the mount and validate that the workload files are present (i.e. have
        survived or been reconstructed from the test scenario)
        """
        raise NotImplementedError()

    def damage(self):
        """
        Damage the filesystem pools in ways that will be interesting to recover from. By
        default just wipe everything in the metadata pool
        """
        # Delete every object in the metadata pool
        objects = self._orig_fs.rados(["ls"]).split("\n")
        for o in objects:
            self._orig_fs.rados(["rm", o])

    def flush(self):
        """
        Called after client unmount, after write: flush whatever you want
        """
        self._orig_fs.mds_asok(["flush", "journal"])
        self._recovery_fs.mds_asok(["flush", "journal"])


class SimpleOverlayWorkload(OverlayWorkload):
    """
    Single file, single directory, check that it gets recovered and so does its size
    """
    def write(self):
        self._orig_mount.run_shell(["mkdir", "subdir"])
        self._orig_mount.write_n_mb("subdir/sixmegs", 6)
        self._initial_state = self._orig_mount.stat("subdir/sixmegs")

    def validate(self):
        self._recovery_mount.run_shell(["ls", "subdir"])
        st = self._recovery_mount.stat("subdir/sixmegs")
        self.assert_equal(st['st_size'], self._initial_state['st_size'])
        return self._errors

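
# A minimal extra sketch (purely illustrative, not used by any test below): another
# OverlayWorkload covering more than one file. Names like "subdir/onemeg" are
# assumptions, not part of the original test.
class TwoFileOverlayWorkload(OverlayWorkload):
    """
    Two files in one directory, check that both are recovered along with their sizes
    """
    def write(self):
        self._orig_mount.run_shell(["mkdir", "subdir"])
        self._orig_mount.write_n_mb("subdir/sixmegs", 6)
        self._orig_mount.write_n_mb("subdir/onemeg", 1)
        self._initial_state = {
            "sixmegs": self._orig_mount.stat("subdir/sixmegs"),
            "onemeg": self._orig_mount.stat("subdir/onemeg"),
        }

    def validate(self):
        self._recovery_mount.run_shell(["ls", "subdir"])
        for name, orig_st in self._initial_state.items():
            st = self._recovery_mount.stat("subdir/{0}".format(name))
            self.assert_equal(st['st_size'], orig_st['st_size'])
        return self._errors
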
class TestRecoveryPool(CephFSTestCase):
    MDSS_REQUIRED = 2
    CLIENTS_REQUIRED = 2
    REQUIRE_RECOVERY_FILESYSTEM = True

    def is_marked_damaged(self, rank):
        mds_map = self.fs.get_mds_map()
        return rank in mds_map['damaged']

    def _rebuild_metadata(self, workload, other_pool=None, workers=1):
        """
        Test that when all objects in the metadata pool are removed, we can rebuild
        a metadata pool based on the contents of a data pool, and a client can see
        and read our files.
        """

        # First, inject some files

        workload.write()

        # Unmount the clients and flush the journal: the tool should also cope with
        # situations where there is dirty metadata, but we'll test that separately
        self.mount_a.umount_wait()
        self.mount_b.umount_wait()
        workload.flush()

        # Create the alternate pool if requested
        recovery_fs = self.recovery_fs.name
        recovery_pool = self.recovery_fs.get_metadata_pool_name()
        self.recovery_fs.data_scan(['init', '--force-init',
                                    '--filesystem', recovery_fs,
                                    '--alternate-pool', recovery_pool])
        self.recovery_fs.mon_manager.raw_cluster_cmd('-s')
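        # Reset rank 0's session, snap and inode tables in the recovery filesystem so
        # the rebuilt metadata starts from a clean slate (cephfs-table-tool reset).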
        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "session"])
        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "snap"])
        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "inode"])

        # Stop the MDS
        self.fs.mds_stop()
        self.fs.mds_fail()

        # After recovery, we need the MDS to not be strict about stats (in production these options
        # are off by default, but in QA we need to explicitly disable them)
        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)

        # Apply any data damage the workload wants
        workload.damage()

        # Reset the MDS map in case multiple ranks were in play: recovery procedure
        # only understands how to rebuild metadata under rank 0
        self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
                                            '--yes-i-really-mean-it')

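        # Likewise reset rank 0's session, snap and inode tables in the original
        # filesystem before rebuilding its metadata.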
        self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
        self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
        self.fs.table_tool([self.fs.name + ":0", "reset", "inode"])

        # Run the recovery procedure
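        # Roughly speaking, the helper calls below correspond to the cephfs-data-scan /
        # cephfs-journal-tool disaster-recovery commands (exact invocations shown here
        # are an assumption, included only for orientation), e.g.:
        #   cephfs-data-scan scan_extents --alternate-pool <pool> --filesystem <fs> <data pool>
        #   cephfs-data-scan scan_inodes --alternate-pool <pool> --filesystem <fs> <data pool>
        #   cephfs-journal-tool event recover_dentries list --alternate-pool <pool>
        #   cephfs-journal-tool journal reset --force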
        if False:
            with self.assertRaises(CommandFailedError):
                # Normal reset should fail when no objects are present; we'll use --force instead
                self.fs.journal_tool(["journal", "reset"], 0)

        self.fs.mds_stop()
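        # scan_extents walks the data pool to recover file sizes and mtimes;
        # scan_inodes then rebuilds inodes and dentries from backtraces, writing the
        # results into the alternate (recovery) metadata pool.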
        self.fs.data_scan(['scan_extents', '--alternate-pool',
                           recovery_pool, '--filesystem', self.fs.name,
                           self.fs.get_data_pool_name()])
        self.fs.data_scan(['scan_inodes', '--alternate-pool',
                           recovery_pool, '--filesystem', self.fs.name,
                           '--force-corrupt', '--force-init',
                           self.fs.get_data_pool_name()])
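        # Replay dentries from the original journal into the alternate pool as well.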
        self.fs.journal_tool(['event', 'recover_dentries', 'list',
                              '--alternate-pool', recovery_pool], 0)

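        # Repeat the init / scan_inodes / recover_dentries pass against the original
        # filesystem's own metadata pool.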
        self.fs.data_scan(['init', '--force-init', '--filesystem',
                           self.fs.name])
        self.fs.data_scan(['scan_inodes', '--filesystem', self.fs.name,
                           '--force-corrupt', '--force-init',
                           self.fs.get_data_pool_name()])
        self.fs.journal_tool(['event', 'recover_dentries', 'list'], 0)

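        # Reset both journals (--force, since their contents are no longer usable)
        # and tell the monitors the recovery filesystem's rank 0 is repaired.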
        self.recovery_fs.journal_tool(['journal', 'reset', '--force'], 0)
        self.fs.journal_tool(['journal', 'reset', '--force'], 0)
        self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired',
                                            recovery_fs + ":0")

        # Mark the MDS repaired
        self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')

        # Start the MDS daemons on both filesystems
        self.fs.mds_restart()
        self.recovery_fs.mds_restart()
        self.fs.wait_for_daemons()
        self.recovery_fs.wait_for_daemons()
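        # Turn up MDS debug logging, then run a recursive scrub with repair on each
        # active rank of the recovery filesystem to fix up remaining inconsistencies
        # (e.g. recursive statistics).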
        status = self.recovery_fs.status()
        for rank in self.recovery_fs.get_ranks(status=status):
            self.fs.mon_manager.raw_cluster_cmd('tell', "mds." + rank['name'],
                                                'injectargs', '--debug-mds=20')
            self.fs.rank_tell(['scrub', 'start', '/', 'recursive', 'repair'], rank=rank['rank'], status=status)
        log.info(str(self.mds_cluster.status()))

        # Mount the clients: mount_a on the original filesystem, mount_b on the
        # recovery filesystem
        self.mount_a.mount_wait()
        self.mount_b.mount_wait(mount_fs_name=recovery_fs)

        # See that the files are present and correct
        errors = workload.validate()
        if errors:
            log.error("Validation errors found: {0}".format(len(errors)))
            for e in errors:
                log.error(e.exception)
                log.error(e.backtrace)
            raise AssertionError("Validation failed, first error: {0}\n{1}".format(
                errors[0].exception, errors[0].backtrace
            ))

    def test_rebuild_simple(self):
        self._rebuild_metadata(SimpleOverlayWorkload(self.fs, self.recovery_fs,
                                                     self.mount_a, self.mount_b))