from io import BytesIO, StringIO
import json
import logging
import errno
import re
from teuthology.contextutil import MaxWhileTries
from teuthology.exceptions import CommandFailedError
from teuthology.orchestra.run import wait
from tasks.cephfs.fuse_mount import FuseMount
from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology

DAMAGED_ON_START = "damaged_on_start"
DAMAGED_ON_LS = "damaged_on_ls"
CRASHED = "server crashed"
NO_DAMAGE = "no damage"
READONLY = "readonly"
FAILED_CLIENT = "client failed"
FAILED_SERVER = "server failed"

# An EIO in response to a stat from the client
EIO_ON_LS = "eio"

# An EIO, but nothing in the damage table (never what we expect)
EIO_NO_DAMAGE = "eio without damage entry"


log = logging.getLogger(__name__)


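# Rough guide to the rank-0 metadata object names this test pokes at,
# derived from CephFS's per-rank inode layout (illustrative, not exhaustive):
#   1.00000000         root directory dirfrag
#   100.00000000       rank 0's "mydir" (~mds0), which holds the stray dirs
#   600.00000000 ...   stray directory dirfrags (inodes 0x600-0x609)
#   200.00000000       rank 0's journal header
#   400.00000000       rank 0's JournalPointer
#   500.00000000       rank 0's PurgeQueue journal header
#   mds0_sessionmap    rank 0's session table
#   mds0_openfiles.0   rank 0's open file table
#   <ino-hex>.<frag>   ordinary dirfrags, e.g. 10000000000.00000000 is
#                      typically the first user-created directory here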
class TestDamage(CephFSTestCase):
    def _simple_workload_write(self):
        self.mount_a.run_shell(["mkdir", "subdir"])
        self.mount_a.write_n_mb("subdir/sixmegs", 6)
        return self.mount_a.stat("subdir/sixmegs")

    def is_marked_damaged(self, rank):
        mds_map = self.fs.get_mds_map()
        return rank in mds_map['damaged']

    @for_teuthology  # 459s
    def test_object_deletion(self):
        """
        That the MDS has a clean 'damaged' response to loss of any single metadata object
        """

        self._simple_workload_write()

        # Hmm, actually it would be nice to permute whether the metadata pool
        # state contains sessions or not, but for the moment close this session
        # to avoid waiting through reconnect on every MDS start.
        self.mount_a.umount_wait()
        for mds_name in self.fs.get_active_names():
            self.fs.mds_asok(["flush", "journal"], mds_name)

        self.fs.fail()

        serialized = self.fs.radosmo(['export', '-'])

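        # `rados export` serializes the entire metadata pool into one blob;
        # re-importing it before each mutation (below) restores a pristine
        # starting state without rebuilding the filesystem.
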
        def is_ignored(obj_id, dentry=None):
            """
            A filter to avoid redundantly mutating many similar objects (e.g.
            stray dirfrags) or similar dentries (e.g. stray dir dentries)
            """
            if re.match(r"60.\.00000000", obj_id) and obj_id != "600.00000000":
                return True

            if dentry and obj_id == "100.00000000":
                if re.match(r"stray.+_head", dentry) and dentry != "stray0_head":
                    return True

            return False

        def get_path(obj_id, dentry=None):
            """
            What filesystem path does this object or dentry correspond to? I.e.
            what should I poke to see EIO after damaging it?
            """

            if obj_id == "1.00000000" and dentry == "subdir_head":
                return "./subdir"
            elif obj_id == "10000000000.00000000" and dentry == "sixmegs_head":
                return "./subdir/sixmegs"

            # None means that ls will do an "ls -R" in the hope of seeing some errors
            return None

        objects = self.fs.radosmo(["ls"], stdout=StringIO()).strip().split("\n")
        objects = [o for o in objects if not is_ignored(o)]

        # Find all objects with an OMAP header
        omap_header_objs = []
        for o in objects:
            header = self.fs.radosmo(["getomapheader", o], stdout=StringIO())
            # The rados CLI wraps the header output in a hex-printed style
            header_bytes = int(re.match(r"header \((.+) bytes\)", header).group(1))
            if header_bytes > 0:
                omap_header_objs.append(o)

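        # (For reference, `rados getomapheader` prints a line roughly like
        # "header (274 bytes) :" followed by a hexdump; the regex above only
        # needs the byte count.)
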
        # Find all OMAP key/vals
        omap_keys = []
        for o in objects:
            keys_str = self.fs.radosmo(["listomapkeys", o], stdout=StringIO())
            if keys_str:
                for key in keys_str.strip().split("\n"):
                    if not is_ignored(o, key):
                        omap_keys.append((o, key))

        # Find objects that have data in their bodies
        data_objects = []
        for obj_id in objects:
            stat_out = self.fs.radosmo(["stat", obj_id], stdout=StringIO())
            size = int(re.match(".+, size (.+)$", stat_out).group(1))
            if size > 0:
                data_objects.append(obj_id)

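        # (`rados stat` output looks roughly like
        # "cephfs_metadata/601.00000000 mtime 2021-04-20T00:00:00.000000+0000, size 90",
        # so the regex above picks out the trailing size field.)
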
        # Define the various forms of damage we will inflict
        class MetadataMutation(object):
            def __init__(self, obj_id_, desc_, mutate_fn_, expectation_, ls_path=None):
                self.obj_id = obj_id_
                self.desc = desc_
                self.mutate_fn = mutate_fn_
                self.expectation = expectation_
                if ls_path is None:
                    self.ls_path = "."
                else:
                    self.ls_path = ls_path

            def __eq__(self, other):
                return self.desc == other.desc

            def __hash__(self):
                return hash(self.desc)

        junk = "deadbeef" * 10
        mutations = []

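        # The mutation lambdas below use default arguments (e.g. `lambda o=o:`)
        # to bind the loop variable at definition time; a plain closure over
        # `o` would late-bind and every mutation would hit the last object.
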
        # Removals
        for o in objects:
            if o in [
                    # JournalPointers are auto-replaced if missing (same path as upgrade)
                    "400.00000000",
                    # Missing dirfrags for non-system dirs result in an empty directory
                    "10000000000.00000000",
                    # PurgeQueue is auto-created if not found on startup
                    "500.00000000",
                    # Open file table is auto-created if not found on startup
                    "mds0_openfiles.0"
            ]:
                expectation = NO_DAMAGE
            else:
                expectation = DAMAGED_ON_START

            log.info("Expectation on rm '{0}' will be '{1}'".format(
                o, expectation
            ))

            mutations.append(MetadataMutation(
                o,
                "Delete {0}".format(o),
                lambda o=o: self.fs.radosm(["rm", o]),
                expectation
            ))

        # Blatant corruptions
        for obj_id in data_objects:
            if obj_id == "500.00000000":
                # purge queue corruption results in read-only FS
                mutations.append(MetadataMutation(
                    obj_id,
                    "Corrupt {0}".format(obj_id),
                    lambda o=obj_id: self.fs.radosm(["put", o, "-"], stdin=StringIO(junk)),
                    READONLY
                ))
            else:
                mutations.append(MetadataMutation(
                    obj_id,
                    "Corrupt {0}".format(obj_id),
                    lambda o=obj_id: self.fs.radosm(["put", o, "-"], stdin=StringIO(junk)),
                    DAMAGED_ON_START
                ))

        # Truncations
        for o in data_objects:
            if o == "500.00000000":
                # The PurgeQueue is allowed to be empty: Journaler interprets
                # an empty header object as an empty journal.
                expectation = NO_DAMAGE
            else:
                expectation = DAMAGED_ON_START

            mutations.append(
                MetadataMutation(
                    o,
                    "Truncate {0}".format(o),
                    lambda o=o: self.fs.radosm(["truncate", o, "0"]),
                    expectation
                ))

        # OMAP value corruptions
        for o, k in omap_keys:
            if o.startswith("100."):
                # Anything in rank 0's 'mydir'
                expectation = DAMAGED_ON_START
            else:
                expectation = EIO_ON_LS

            mutations.append(
                MetadataMutation(
                    o,
                    "Corrupt omap key {0}:{1}".format(o, k),
                    lambda o=o, k=k: self.fs.radosm(["setomapval", o, k, junk]),
                    expectation,
                    get_path(o, k)
                )
            )

        # OMAP header corruptions
        for o in omap_header_objs:
            if re.match(r"60.\.00000000", o) \
                    or o in ["1.00000000", "100.00000000", "mds0_sessionmap"]:
                expectation = DAMAGED_ON_START
            else:
                expectation = NO_DAMAGE

            log.info("Expectation on corrupt header '{0}' will be '{1}'".format(
                o, expectation
            ))

            mutations.append(
                MetadataMutation(
                    o,
                    "Corrupt omap header on {0}".format(o),
                    lambda o=o: self.fs.radosm(["setomapheader", o, junk]),
                    expectation
                )
            )

        results = {}

        for mutation in mutations:
            log.info("Applying mutation '{0}'".format(mutation.desc))

            # Reset MDS state
            self.mount_a.umount_wait(force=True)
            self.fs.fail()
            self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')

            # Reset RADOS pool state
            self.fs.radosm(['import', '-'], stdin=BytesIO(serialized))

            # Inject the mutation
            mutation.mutate_fn()

            # Try starting the MDS
            self.fs.set_joinable()

            # How long we'll wait between starting a daemon and expecting
            # it to make it through startup, and potentially declare itself
            # damaged to the mon cluster.
            startup_timeout = 60

            if mutation.expectation not in (EIO_ON_LS, DAMAGED_ON_LS, NO_DAMAGE):
                if mutation.expectation == DAMAGED_ON_START:
                    # The MDS may pass through active before making it to damaged
                    try:
                        self.wait_until_true(lambda: self.is_marked_damaged(0), startup_timeout)
                    except RuntimeError:
                        pass

                # Wait for MDS to either come up or go into damaged state
                try:
                    self.wait_until_true(lambda: self.is_marked_damaged(0) or self.fs.are_daemons_healthy(), startup_timeout)
                except RuntimeError:
                    crashed = False
                    # Didn't make it to healthy or damaged, did it crash?
                    for daemon_id, daemon in self.fs.mds_daemons.items():
                        if daemon.proc and daemon.proc.finished:
                            crashed = True
                            log.error("Daemon {0} crashed!".format(daemon_id))
                            daemon.proc = None  # So that subsequent stop() doesn't raise an error
                    if not crashed:
                        # Didn't go healthy, didn't go damaged, didn't crash, so what?
                        raise
                    else:
                        log.info("Result: Mutation '{0}' led to crash".format(mutation.desc))
                        results[mutation] = CRASHED
                        continue
                if self.is_marked_damaged(0):
                    log.info("Result: Mutation '{0}' led to DAMAGED state".format(mutation.desc))
                    results[mutation] = DAMAGED_ON_START
                    continue
                else:
                    log.info("Mutation '{0}' did not prevent MDS startup, attempting ls...".format(mutation.desc))
            else:
                try:
                    self.wait_until_true(self.fs.are_daemons_healthy, 60)
                except RuntimeError:
                    log.info("Result: Mutation '{0}' should have left us healthy, actually not.".format(mutation.desc))
                    if self.is_marked_damaged(0):
                        results[mutation] = DAMAGED_ON_START
                    else:
                        results[mutation] = FAILED_SERVER
                    continue
                log.info("Daemons came up after mutation '{0}', proceeding to ls".format(mutation.desc))

            # MDS is up, should go damaged on ls or client mount
            self.mount_a.mount_wait()
            if mutation.ls_path == ".":
                proc = self.mount_a.run_shell(["ls", "-R", mutation.ls_path], wait=False)
            else:
                proc = self.mount_a.stat(mutation.ls_path, wait=False)

            if mutation.expectation == DAMAGED_ON_LS:
                try:
                    self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
                    log.info("Result: Mutation '{0}' led to DAMAGED state after ls".format(mutation.desc))
                    results[mutation] = DAMAGED_ON_LS
                except RuntimeError:
                    if self.fs.are_daemons_healthy():
                        log.error("Result: Failed to go damaged on mutation '{0}', actually went active".format(
                            mutation.desc))
                        results[mutation] = NO_DAMAGE
                    else:
                        log.error("Result: Failed to go damaged on mutation '{0}'".format(mutation.desc))
                        results[mutation] = FAILED_SERVER
            elif mutation.expectation == READONLY:
                proc = self.mount_a.run_shell(["mkdir", "foo"], wait=False)
                try:
                    proc.wait()
                except CommandFailedError:
                    stderr = proc.stderr.getvalue()
                    log.info(stderr)
                    if "read-only file system" not in stderr.lower():
                        raise
            else:
                try:
                    wait([proc], 20)
                    log.info("Result: Mutation '{0}' did not cause DAMAGED state".format(mutation.desc))
                    results[mutation] = NO_DAMAGE
                except MaxWhileTries:
                    log.info("Result: Failed to complete client IO on mutation '{0}'".format(mutation.desc))
                    results[mutation] = FAILED_CLIENT
                except CommandFailedError as e:
                    if e.exitstatus == errno.EIO:
                        log.info("Result: EIO on client")
                        results[mutation] = EIO_ON_LS
                    else:
                        log.info("Result: unexpected error {0} on client".format(e))
                        results[mutation] = FAILED_CLIENT

            if mutation.expectation == EIO_ON_LS:
                # EIOs mean something handled by DamageTable: assert that it has
                # been populated
                damage = json.loads(
                    self.fs.mon_manager.raw_cluster_cmd(
                        'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), "damage", "ls", '--format=json-pretty'))
                if len(damage) == 0:
                    results[mutation] = EIO_NO_DAMAGE

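            # (A `damage ls` entry is a dict roughly of the form
            # {"id": 3816905741, "damage_type": "dentry", "ino": 1099511627776,
            #  "dname": "sixmegs", ...}; an empty list means the MDS recorded
            # no damage for this mutation.)
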
        failures = [(mutation, result) for (mutation, result) in results.items() if mutation.expectation != result]
        if failures:
            log.error("{0} mutations had unexpected outcomes:".format(len(failures)))
            for mutation, result in failures:
                log.error("  Expected '{0}' actually '{1}' from '{2}'".format(
                    mutation.expectation, result, mutation.desc
                ))
            raise RuntimeError("{0} mutations had unexpected outcomes".format(len(failures)))
        else:
            log.info("All {0} mutations had expected outcomes".format(len(mutations)))

    def test_damaged_dentry(self):
        # Damage to dentries is interesting because it leaves the
        # directory's `complete` flag in a subtle state: we have
        # marked the dir complete so that folks can access it, but
        # in actual fact a dentry is missing
        self.mount_a.run_shell(["mkdir", "subdir/"])

        self.mount_a.run_shell(["touch", "subdir/file_undamaged"])
        self.mount_a.run_shell(["touch", "subdir/file_to_be_damaged"])

        subdir_ino = self.mount_a.path_to_ino("subdir")

        self.mount_a.umount_wait()
        for mds_name in self.fs.get_active_names():
            self.fs.mds_asok(["flush", "journal"], mds_name)

        self.fs.fail()

        # Corrupt a dentry
        junk = "deadbeef" * 10
        dirfrag_obj = "{0:x}.00000000".format(subdir_ino)
        self.fs.radosm(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk])

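        # (The dirfrag object written above is named "<ino-hex>.<frag>", with
        # frag 00000000 being the unsplit whole-directory fragment; each
        # dentry is an omap key of the form "<name>_<snap>", so
        # "file_to_be_damaged_head" is the live, non-snapshot version.)
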
        # Start up and try to list it
        self.fs.set_joinable()
        self.fs.wait_for_daemons()

        self.mount_a.mount_wait()
        dentries = self.mount_a.ls("subdir/")

        # The damaged guy should have disappeared
        self.assertEqual(dentries, ["file_undamaged"])

        # I should get ENOENT if I try to read it normally, because
        # the dir is considered complete
        try:
            self.mount_a.stat("subdir/file_to_be_damaged", wait=True)
        except CommandFailedError as e:
            self.assertEqual(e.exitstatus, errno.ENOENT)
        else:
            raise AssertionError("Expected ENOENT")

        # The fact that there is damage should have been recorded
        damage = json.loads(
            self.fs.mon_manager.raw_cluster_cmd(
                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
                "damage", "ls", '--format=json-pretty'))
        self.assertEqual(len(damage), 1)
        damage_id = damage[0]['id']

        # If I try to create a dentry with the same name as the damaged guy
        # then that should be forbidden
        try:
            self.mount_a.touch("subdir/file_to_be_damaged")
        except CommandFailedError as e:
            self.assertEqual(e.exitstatus, errno.EIO)
        else:
            raise AssertionError("Expected EIO")

        # Attempting that touch will clear the client's complete flag; now
        # when I stat it I'll get EIO instead of ENOENT
        try:
            self.mount_a.stat("subdir/file_to_be_damaged", wait=True)
        except CommandFailedError as e:
            if isinstance(self.mount_a, FuseMount):
                self.assertEqual(e.exitstatus, errno.EIO)
            else:
                # Old kernel client handles this case differently
                self.assertIn(e.exitstatus, [errno.ENOENT, errno.EIO])
        else:
            raise AssertionError("Expected EIO")

        nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files")
        self.assertEqual(nfiles, "2")

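        # (ceph.dir.files is a virtual xattr through which the MDS reports
        # its file count for the directory; it still says "2" because only
        # the dentry was corrupted, not the directory's accounted stats.)
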
        self.mount_a.umount_wait()

        # Now repair the stats
        scrub_json = self.fs.run_scrub(["start", "/subdir", "repair"])
        log.info(json.dumps(scrub_json, indent=2))

        self.assertNotEqual(scrub_json, None)
        self.assertEqual(scrub_json["return_code"], 0)
        self.assertEqual(self.fs.wait_until_scrub_complete(tag=scrub_json["scrub_tag"]), True)

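        # (`scrub start` returns JSON roughly like
        # {"return_code": 0, "scrub_tag": "<uuid>", "mode": "asynchronous"};
        # the tag is what wait_until_scrub_complete polls for.)
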
        # Check that the file count is now correct
        self.mount_a.mount_wait()
        nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files")
        self.assertEqual(nfiles, "1")

        # Clean up the omap object
        self.fs.radosm(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk])

        # Clean up the damagetable entry
        self.fs.mon_manager.raw_cluster_cmd(
            'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
            "damage", "rm", "{did}".format(did=damage_id))

        # Now I should be able to create a file with the same name as the
        # damaged guy if I want.
        self.mount_a.touch("subdir/file_to_be_damaged")

    def test_open_ino_errors(self):
        """
        That errors encountered during opening inos are properly propagated
        """

        self.mount_a.run_shell(["mkdir", "dir1"])
        self.mount_a.run_shell(["touch", "dir1/file1"])
        self.mount_a.run_shell(["mkdir", "dir2"])
        self.mount_a.run_shell(["touch", "dir2/file2"])
        self.mount_a.run_shell(["mkdir", "testdir"])
        self.mount_a.run_shell(["ln", "dir1/file1", "testdir/hardlink1"])
        self.mount_a.run_shell(["ln", "dir2/file2", "testdir/hardlink2"])

        file1_ino = self.mount_a.path_to_ino("dir1/file1")
        file2_ino = self.mount_a.path_to_ino("dir2/file2")
        dir2_ino = self.mount_a.path_to_ino("dir2")

        # Ensure everything is written to backing store
        self.mount_a.umount_wait()
        self.fs.mds_asok(["flush", "journal"])

        # Drop everything from the MDS cache
        self.fs.fail()
        self.fs.journal_tool(['journal', 'reset'], 0)
        self.fs.set_joinable()
        self.fs.wait_for_daemons()

        self.mount_a.mount_wait()

        # Case 1: un-decodeable backtrace

        # Validate that the backtrace is present and decodable
        self.fs.read_backtrace(file1_ino)
        # Go corrupt the backtrace of dir1/file1 (used for resolving
        # testdir/hardlink1).
        self.fs._write_data_xattr(file1_ino, "parent", "rhubarb")

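        # (A backtrace is an encoded ancestor list stored in the "parent"
        # xattr of the inode's first data object; resolving a hardlink opens
        # its target by ino, which follows that backtrace, so scribbling over
        # it should make the open-by-ino fail.)
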
        # Check that touching the hardlink gives EIO
        ran = self.mount_a.run_shell(["stat", "testdir/hardlink1"], wait=False)
        try:
            ran.wait()
        except CommandFailedError:
            self.assertTrue("Input/output error" in ran.stderr.getvalue())

        # Check that an entry is created in the damage table
        damage = json.loads(
            self.fs.mon_manager.raw_cluster_cmd(
                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
                "damage", "ls", '--format=json-pretty'))
        self.assertEqual(len(damage), 1)
        self.assertEqual(damage[0]['damage_type'], "backtrace")
        self.assertEqual(damage[0]['ino'], file1_ino)

        self.fs.mon_manager.raw_cluster_cmd(
            'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
            "damage", "rm", str(damage[0]['id']))

        # Case 2: missing dirfrag for the target inode

        self.fs.radosm(["rm", "{0:x}.00000000".format(dir2_ino)])

        # Check that touching the hardlink gives EIO
        ran = self.mount_a.run_shell(["stat", "testdir/hardlink2"], wait=False)
        try:
            ran.wait()
        except CommandFailedError:
            self.assertTrue("Input/output error" in ran.stderr.getvalue())

        # Check that two entries are created in the damage table; their
        # ordering is not guaranteed, so accept either
        damage = json.loads(
            self.fs.mon_manager.raw_cluster_cmd(
                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
                "damage", "ls", '--format=json-pretty'))
        self.assertEqual(len(damage), 2)
        if damage[0]['damage_type'] == "backtrace":
            self.assertEqual(damage[0]['ino'], file2_ino)
            self.assertEqual(damage[1]['damage_type'], "dir_frag")
            self.assertEqual(damage[1]['ino'], dir2_ino)
        else:
            self.assertEqual(damage[0]['damage_type'], "dir_frag")
            self.assertEqual(damage[0]['ino'], dir2_ino)
            self.assertEqual(damage[1]['damage_type'], "backtrace")
            self.assertEqual(damage[1]['ino'], file2_ino)

        for entry in damage:
            self.fs.mon_manager.raw_cluster_cmd(
                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
                "damage", "rm", str(entry['id']))