"""
Test that the forward scrub functionality can traverse metadata and apply
requested tags, on well-formed metadata.

This is *not* the real testing for forward scrub, which will need to test
how the functionality responds to damaged metadata.

"""
import logging
import json
import errno

from collections import namedtuple
from io import BytesIO
from textwrap import dedent

from teuthology.exceptions import CommandFailedError
from tasks.cephfs.cephfs_test_case import CephFSTestCase

import struct

log = logging.getLogger(__name__)


ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])


class TestForwardScrub(CephFSTestCase):
    MDSS_REQUIRED = 1

    def _read_str_xattr(self, pool, obj, attr):
        """
        Read a ceph-encoded string from a rados xattr
        """
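        # A "ceph-encoded" string (as this helper assumes) is a 32-bit length
        # prefix followed by the raw bytes, e.g. b'\x05\x00\x00\x00mytag'
        # decodes to "mytag" on a little-endian host.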
        output = self.fs.mon_manager.do_rados(["getxattr", obj, attr], pool=pool,
                                              stdout=BytesIO()).stdout.getvalue()
        strlen = struct.unpack('i', output[0:4])[0]
        return output[4:(4 + strlen)].decode(encoding='ascii')

    def _get_paths_to_ino(self):
        inos = {}
        p = self.mount_a.run_shell(["find", "./"])
        paths = p.stdout.getvalue().strip().split()
        for path in paths:
            inos[path] = self.mount_a.path_to_ino(path)

        return inos

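    # The MDS_DAMAGE health check is raised while the MDS has entries in its
    # damage table; the repair tests below wait for it to appear and clear.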
    def _is_MDS_damage(self):
        return "MDS_DAMAGE" in self.mds_cluster.mon_manager.get_mon_health()['checks']

    def test_apply_tag(self):
        self.mount_a.run_shell(["mkdir", "parentdir"])
        self.mount_a.run_shell(["mkdir", "parentdir/childdir"])
        self.mount_a.run_shell(["touch", "rfile"])
        self.mount_a.run_shell(["touch", "parentdir/pfile"])
        self.mount_a.run_shell(["touch", "parentdir/childdir/cfile"])

        # Build a structure mapping path to inode, as we will later want
        # to check object by object and objects are named after ino number
        inos = self._get_paths_to_ino()

        # Flush metadata: this is a friendly test of forward scrub so we're skipping
        # the part where it's meant to cope with dirty metadata
        self.mount_a.umount_wait()
        self.fs.mds_asok(["flush", "journal"])

        tag = "mytag"

        # Execute tagging forward scrub
        self.fs.mds_asok(["tag", "path", "/parentdir", tag])
        # Wait for completion
        import time
        time.sleep(10)
        # FIXME watching clog isn't a nice mechanism for this, once we have a ScrubMap we'll
        # watch that instead

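        # Objects are named "<ino in hex>.00000000"; dirfrag objects live in
        # the metadata pool while file data objects live in the data pool,
        # which is why the two loops below look in different pools.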
        # Check that dirs were tagged
        for dirpath in ["./parentdir", "./parentdir/childdir"]:
            self.assertTagged(inos[dirpath], tag, self.fs.get_metadata_pool_name())

        # Check that files were tagged
        for filepath in ["./parentdir/pfile", "./parentdir/childdir/cfile"]:
            self.assertTagged(inos[filepath], tag, self.fs.get_data_pool_name())

        # This file wasn't in the tag path, so it shouldn't have been tagged
        self.assertUntagged(inos["./rfile"])

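    # assertTagged/assertUntagged read the "scrub_tag" xattr straight off the
    # inode's first RADOS object; a missing xattr makes the underlying
    # `rados getxattr` fail, which is what assertUntagged relies on.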
    def assertUntagged(self, ino):
        file_obj_name = "{0:x}.00000000".format(ino)
        with self.assertRaises(CommandFailedError):
            self._read_str_xattr(
                self.fs.get_data_pool_name(),
                file_obj_name,
                "scrub_tag"
            )

    def assertTagged(self, ino, tag, pool):
        file_obj_name = "{0:x}.00000000".format(ino)
        wrote = self._read_str_xattr(
            pool,
            file_obj_name,
            "scrub_tag"
        )
        self.assertEqual(wrote, tag)

    def _validate_linkage(self, expected):
        inos = self._get_paths_to_ino()
        try:
            self.assertDictEqual(inos, expected)
        except AssertionError:
            log.error("Expected: {0}".format(json.dumps(expected, indent=2)))
            log.error("Actual: {0}".format(json.dumps(inos, indent=2)))
            raise

    def test_orphan_scan(self):
        # Create some files whose metadata we will flush
        self.mount_a.run_python(dedent("""
            import os
            mount_point = "{mount_point}"
            parent = os.path.join(mount_point, "parent")
            os.mkdir(parent)
            flushed = os.path.join(parent, "flushed")
            os.mkdir(flushed)
            for f in ["alpha", "bravo", "charlie"]:
                open(os.path.join(flushed, f), 'w').write(f)
            """.format(mount_point=self.mount_a.mountpoint)))

        inos = self._get_paths_to_ino()

        # Flush journal
        # Umount before flush to avoid cap releases putting
        # things we don't want in the journal later.
        self.mount_a.umount_wait()
        self.fs.flush()

        # Create a new inode that's just in the log, i.e. would
        # look orphaned to backward scan if backward scan wisnae
        # respectin' tha scrub_tag xattr.
        self.mount_a.mount_wait()
        self.mount_a.run_shell(["mkdir", "parent/unflushed"])
        self.mount_a.run_shell(["dd", "if=/dev/urandom",
                                "of=./parent/unflushed/jfile",
                                "bs=1M", "count=8"])
        inos["./parent/unflushed"] = self.mount_a.path_to_ino("./parent/unflushed")
        inos["./parent/unflushed/jfile"] = self.mount_a.path_to_ino("./parent/unflushed/jfile")
        self.mount_a.umount_wait()

        # Orphan an inode by deleting its dentry
        # Our victim will be.... bravo.
        self.mount_a.umount_wait()
        self.fs.fail()
        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
        frag_obj_id = "{0:x}.00000000".format(inos["./parent/flushed"])
        self.fs.radosm(["rmomapkey", frag_obj_id, "bravo_head"])
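        # Dentries are stored as omap keys ("<name>_head") on the dirfrag
        # object, so removing "bravo_head" deletes bravo's dentry on disk and
        # leaves its inode and data objects orphaned.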

        self.fs.set_joinable()
        self.fs.wait_for_daemons()

        # See that the orphaned file is indeed missing from a client's POV
        self.mount_a.mount_wait()
        damaged_state = self._get_paths_to_ino()
        self.assertNotIn("./parent/flushed/bravo", damaged_state)
        self.mount_a.umount_wait()

        # Run a tagging forward scrub
        tag = "mytag123"
        self.fs.rank_asok(["tag", "path", "/parent", tag])

        # See that the orphan wisnae tagged
        self.assertUntagged(inos['./parent/flushed/bravo'])

        # See that the flushed-metadata-and-still-present files are tagged
        self.assertTagged(inos['./parent/flushed/alpha'], tag, self.fs.get_data_pool_name())
        self.assertTagged(inos['./parent/flushed/charlie'], tag, self.fs.get_data_pool_name())

        # See that the journalled-but-not-flushed file *was* tagged
        self.assertTagged(inos['./parent/unflushed/jfile'], tag, self.fs.get_data_pool_name())

        # OK, now we are going to run cephfs-data-scan. It's necessary to
        # have a clean journal, otherwise replay will blow up on mismatched
        # inotable versions (due to scan_links)
        self.fs.flush()
        self.fs.fail()
        self.fs.journal_tool(["journal", "reset", "--force"], 0)

        # Run cephfs-data-scan targeting only orphans
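        # (scan_inodes with --filter-tag skips objects that already carry the
        # scrub tag, so only the untagged orphan, bravo, should be reinjected.)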
        self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])
        self.fs.data_scan([
            "scan_inodes",
            "--filter-tag", tag,
            self.fs.get_data_pool_name()
        ])
        self.fs.data_scan(["scan_links"])

        # After in-place injection stats should be kosher again
        self.fs.set_ceph_conf('mds', 'mds verify scatter', True)
        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', True)

        # And we should have all the same linkage we started with,
        # and no lost+found, and no extra inodes!
        self.fs.set_joinable()
        self.fs.wait_for_daemons()
        self.mount_a.mount_wait()
        self._validate_linkage(inos)

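    # Stash the raw per-rank inode tables ("mds<rank>_inotable" objects in the
    # metadata pool) so test_inotable_sync can put stale copies back later.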
    def _stash_inotable(self):
        # Get all active ranks
        ranks = self.fs.get_all_mds_rank()

        inotable_dict = {}
        for rank in ranks:
            inotable_oid = "mds{rank:d}_".format(rank=rank) + "inotable"
            log.info("Trying to fetch inotable object: " + inotable_oid)

            inotable_raw = self.fs.radosmo(['get', inotable_oid, '-'])
            inotable_dict[inotable_oid] = inotable_raw
        return inotable_dict

    def test_inotable_sync(self):
        self.mount_a.write_n_mb("file1_sixmegs", 6)

        # Flush journal
        self.mount_a.umount_wait()
        self.fs.mds_asok(["flush", "journal"])

        inotable_copy = self._stash_inotable()

        self.mount_a.mount_wait()

        self.mount_a.write_n_mb("file2_sixmegs", 6)
        self.mount_a.write_n_mb("file3_sixmegs", 6)

        inos = self._get_paths_to_ino()

        # Flush journal
        self.mount_a.umount_wait()
        self.fs.mds_asok(["flush", "journal"])

        self.mount_a.umount_wait()

        with self.assert_cluster_log("inode table repaired", invert_match=True):
            out_json = self.fs.run_scrub(["start", "/", "repair,recursive"])
            self.assertNotEqual(out_json, None)
            self.assertEqual(out_json["return_code"], 0)
            self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)

        self.fs.fail()

        # Truncate the journal (to ensure the inotable on disk
        # is all that will be in the InoTable in memory)

        self.fs.journal_tool(["event", "splice",
                              "--inode={0}".format(inos["./file2_sixmegs"]), "summary"], 0)

        self.fs.journal_tool(["event", "splice",
                              "--inode={0}".format(inos["./file3_sixmegs"]), "summary"], 0)

        # Revert to old inotable.
        for key, value in inotable_copy.items():
            self.fs.radosm(["put", key, "-"], stdin=BytesIO(value))

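        # At this point the on-disk inotable predates file2/file3 even though
        # their dentries have been flushed, so the repair scrub below is
        # expected to log "inode table repaired" (the earlier scrub, run
        # before the revert, asserted that no such repair happened).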
        self.fs.set_joinable()
        self.fs.wait_for_daemons()

        with self.assert_cluster_log("inode table repaired"):
            out_json = self.fs.run_scrub(["start", "/", "repair,recursive"])
            self.assertNotEqual(out_json, None)
            self.assertEqual(out_json["return_code"], 0)
            self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)

        self.fs.fail()
        table_text = self.fs.table_tool(["0", "show", "inode"])
        table = json.loads(table_text)
        self.assertGreater(
            table['0']['data']['inotable']['free'][0]['start'],
            inos['./file3_sixmegs'])

    def test_backtrace_repair(self):
        """
        That the MDS can repair an inode's backtrace in the data pool
        if it is found to be damaged.
        """
        # Create a file for subsequent checks
        self.mount_a.run_shell(["mkdir", "parent_a"])
        self.mount_a.run_shell(["touch", "parent_a/alpha"])
        file_ino = self.mount_a.path_to_ino("parent_a/alpha")

        # Check that the backtrace and layout are written after the initial flush
        self.fs.mds_asok(["flush", "journal"])
        backtrace = self.fs.read_backtrace(file_ino)
        self.assertEqual(['alpha', 'parent_a'],
                         [a['dname'] for a in backtrace['ancestors']])

        # Go corrupt the backtrace
        self.fs._write_data_xattr(file_ino, "parent",
                                  "oh i'm sorry did i overwrite your xattr?")
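        # The backtrace lives in the "parent" xattr of the inode's first data
        # object; overwriting it with junk is what the scrub below reports as
        # "bad backtrace on inode".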

        with self.assert_cluster_log("bad backtrace on inode"):
            out_json = self.fs.run_scrub(["start", "/", "repair,recursive"])
            self.assertNotEqual(out_json, None)
            self.assertEqual(out_json["return_code"], 0)
            self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)

        self.fs.mds_asok(["flush", "journal"])
        backtrace = self.fs.read_backtrace(file_ino)
        self.assertEqual(['alpha', 'parent_a'],
                         [a['dname'] for a in backtrace['ancestors']])

    def test_health_status_after_dentry_repair(self):
        """
        Test that the damage health status is cleared
        after the damaged dentry is repaired
        """
        # Create a file for checks
        self.mount_a.run_shell(["mkdir", "subdir/"])

        self.mount_a.run_shell(["touch", "subdir/file_undamaged"])
        self.mount_a.run_shell(["touch", "subdir/file_to_be_damaged"])

        subdir_ino = self.mount_a.path_to_ino("subdir")

        self.mount_a.umount_wait()
        for mds_name in self.fs.get_active_names():
            self.fs.mds_asok(["flush", "journal"], mds_name)

        self.fs.fail()

        # Corrupt a dentry
        junk = "deadbeef" * 10
        dirfrag_obj = "{0:x}.00000000".format(subdir_ino)
        self.fs.radosm(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk])
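        # Overwriting the "file_to_be_damaged_head" omap value makes that
        # dentry undecodable, so the MDS drops it when loading the dirfrag
        # and a "dentry" entry is expected in the damage table below.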

        # Start up and try to list it
        self.fs.set_joinable()
        self.fs.wait_for_daemons()

        self.mount_a.mount_wait()
        dentries = self.mount_a.ls("subdir/")

        # The damaged dentry should have disappeared
        self.assertEqual(dentries, ["file_undamaged"])

        # I should get ENOENT if I try to read it normally, because
        # the dir is considered complete
        try:
            self.mount_a.stat("subdir/file_to_be_damaged", wait=True)
        except CommandFailedError as e:
            self.assertEqual(e.exitstatus, errno.ENOENT)
        else:
            raise AssertionError("Expected ENOENT")

        nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files")
        self.assertEqual(nfiles, "2")

        self.mount_a.umount_wait()

        out_json = self.fs.run_scrub(["start", "/subdir", "recursive"])
        self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)

        # Check that an entry for dentry damage is created in the damage table
        damage = json.loads(
            self.fs.mon_manager.raw_cluster_cmd(
                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
                "damage", "ls", '--format=json-pretty'))
        self.assertEqual(len(damage), 1)
        self.assertEqual(damage[0]['damage_type'], "dentry")
        self.wait_until_true(lambda: self._is_MDS_damage(), timeout=100)

        out_json = self.fs.run_scrub(["start", "/subdir", "repair,recursive"])
        self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)

        # Check that the entry is cleared from the damage table
        damage = json.loads(
            self.fs.mon_manager.raw_cluster_cmd(
                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
                "damage", "ls", '--format=json-pretty'))
        self.assertEqual(len(damage), 0)
        self.wait_until_true(lambda: not self._is_MDS_damage(), timeout=100)

        self.mount_a.mount_wait()

        # Check that the file count is now correct
        nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files")
        self.assertEqual(nfiles, "1")

        # Clean up the omap object
        self.fs.radosm(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk])

    def test_health_status_after_dirfrag_repair(self):
        """
        Test that the damage health status is cleared
        after the damaged dirfrag is repaired
        """
        self.mount_a.run_shell(["mkdir", "dir"])
        self.mount_a.run_shell(["touch", "dir/file"])
        self.mount_a.run_shell(["mkdir", "testdir"])
        self.mount_a.run_shell(["ln", "dir/file", "testdir/hardlink"])

        dir_ino = self.mount_a.path_to_ino("dir")

        # Ensure everything is written to backing store
        self.mount_a.umount_wait()
        self.fs.mds_asok(["flush", "journal"])

        # Drop everything from the MDS cache
        self.fs.fail()

        self.fs.radosm(["rm", "{0:x}.00000000".format(dir_ino)])
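        # Deleting the dirfrag object wholesale loses every dentry in "dir",
        # so the recursive scrub below ends up with three damage-table
        # entries, one of which is of type "dir_frag".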

        self.fs.journal_tool(['journal', 'reset'], 0)
        self.fs.set_joinable()
        self.fs.wait_for_daemons()
        self.mount_a.mount_wait()

        # Check that touching the hardlink gives EIO
        ran = self.mount_a.run_shell(["stat", "testdir/hardlink"], wait=False)
        try:
            ran.wait()
        except CommandFailedError:
            self.assertTrue("Input/output error" in ran.stderr.getvalue())

        out_json = self.fs.run_scrub(["start", "/dir", "recursive"])
        self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)

        # Check that an entry is created in the damage table
        damage = json.loads(
            self.fs.mon_manager.raw_cluster_cmd(
                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
                "damage", "ls", '--format=json-pretty'))
        self.assertEqual(len(damage), 3)
        damage_types = {entry['damage_type'] for entry in damage}
        self.assertIn("dir_frag", damage_types)
        self.wait_until_true(lambda: self._is_MDS_damage(), timeout=100)

        out_json = self.fs.run_scrub(["start", "/dir", "recursive,repair"])
        self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)

        # Check that the entry is cleared from the damage table
        damage = json.loads(
            self.fs.mon_manager.raw_cluster_cmd(
                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
                "damage", "ls", '--format=json-pretty'))
        self.assertEqual(len(damage), 1)
        self.assertNotEqual(damage[0]['damage_type'], "dir_frag")

        self.mount_a.umount_wait()
        self.fs.mds_asok(["flush", "journal"])
        self.fs.fail()

        # Run cephfs-data-scan to rebuild the lost dirfrag
        self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])
        self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()])
        self.fs.data_scan(["scan_links"])

        self.fs.set_joinable()
        self.fs.wait_for_daemons()
        self.mount_a.mount_wait()

        out_json = self.fs.run_scrub(["start", "/dir", "recursive,repair"])
        self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
        damage = json.loads(
            self.fs.mon_manager.raw_cluster_cmd(
                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
                "damage", "ls", '--format=json-pretty'))
        self.assertEqual(len(damage), 0)
        self.wait_until_true(lambda: not self._is_MDS_damage(), timeout=100)

    def test_health_status_after_backtrace_repair(self):
        """
        Test that the damage health status is cleared
        after the damaged backtrace is repaired
        """
        # Create a file for checks
        self.mount_a.run_shell(["mkdir", "dir_test"])
        self.mount_a.run_shell(["touch", "dir_test/file"])
        file_ino = self.mount_a.path_to_ino("dir_test/file")

        # Check that the backtrace and layout are written after the initial flush
        self.fs.mds_asok(["flush", "journal"])
        backtrace = self.fs.read_backtrace(file_ino)
        self.assertEqual(['file', 'dir_test'],
                         [a['dname'] for a in backtrace['ancestors']])

        # Corrupt the backtrace
        self.fs._write_data_xattr(file_ino, "parent",
                                  "The backtrace is corrupted")

        out_json = self.fs.run_scrub(["start", "/", "recursive"])
        self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)

        # Check that an entry for backtrace damage is created in the damage table
        damage = json.loads(
            self.fs.mon_manager.raw_cluster_cmd(
                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
                "damage", "ls", '--format=json-pretty'))
        self.assertEqual(len(damage), 1)
        self.assertEqual(damage[0]['damage_type'], "backtrace")
        self.wait_until_true(lambda: self._is_MDS_damage(), timeout=100)

        out_json = self.fs.run_scrub(["start", "/", "repair,recursive,force"])
        self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)

        # Check that the entry is cleared from the damage table
        damage = json.loads(
            self.fs.mon_manager.raw_cluster_cmd(
                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
                "damage", "ls", '--format=json-pretty'))
        self.assertEqual(len(damage), 0)
        self.wait_until_true(lambda: not self._is_MDS_damage(), timeout=100)