from tasks.cephfs.cephfs_test_case import CephFSTestCase
from teuthology.orchestra import run

import logging
log = logging.getLogger(__name__)


class TestFragmentation(CephFSTestCase):
    CLIENTS_REQUIRED = 1
    MDSS_REQUIRED = 1

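    # Helpers: dir_split and dir_merge are cumulative counters in the MDS
    # "mds" perf section, read here over the admin socket.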
    def get_splits(self):
        return self.fs.mds_asok(['perf', 'dump', 'mds'])['mds']['dir_split']

    def get_merges(self):
        return self.fs.mds_asok(['perf', 'dump', 'mds'])['mds']['dir_merge']

    def get_dir_ino(self, path):
        """
        Return the cached inode entry for `path` from the MDS cache dump,
        asserting that it is present.
        """
        dir_cache = self.fs.read_cache(path, 0)
        dir_ino = None
        dir_inono = self.mount_a.path_to_ino(path.strip("/"))
        for ino in dir_cache:
            if ino['ino'] == dir_inono:
                dir_ino = ino
                break
        self.assertIsNotNone(dir_ino)
        return dir_ino

    def _configure(self, **kwargs):
        """
        Apply kwargs as MDS configuration settings, enable dirfrags
        and restart the MDSs.
        """
        kwargs['mds_bal_frag'] = "true"

        for k, v in kwargs.items():
            self.ceph_cluster.set_ceph_conf("mds", k, v.__str__())

        self.fs.set_allow_dirfrags(True)

        self.mds_cluster.mds_fail_restart()
        self.fs.wait_for_daemons()

    def test_oversize(self):
        """
        That a directory is split when it becomes too large.
        """

        split_size = 20
        merge_size = 5

        self._configure(
            mds_bal_split_size=split_size,
            mds_bal_merge_size=merge_size,
            mds_bal_split_bits=1
        )

        self.assertEqual(self.get_splits(), 0)

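        # Writing one more dentry than mds_bal_split_size pushes the single
        # dirfrag past its split threshold, so exactly one split is expected.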
        self.mount_a.create_n_files("splitdir/file", split_size + 1)

        self.wait_until_true(
            lambda: self.get_splits() == 1,
            timeout=30
        )

        frags = self.get_dir_ino("/splitdir")['dirfrags']
        self.assertEqual(len(frags), 2)
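        # Assumption behind the checks below: 0x10000000000 is the first
        # inode number handed out for client-created entries, and ".0*"/".1*"
        # are the two one-bit child fragments produced by the split.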
        self.assertEqual(frags[0]['dirfrag'], "0x10000000000.0*")
        self.assertEqual(frags[1]['dirfrag'], "0x10000000000.1*")
        self.assertEqual(
            sum([len(f['dentries']) for f in frags]),
            split_size + 1
        )

        self.assertEqual(self.get_merges(), 0)

        self.mount_a.run_shell(["rm", "-f", run.Raw("splitdir/file*")])

        self.wait_until_true(
            lambda: self.get_merges() == 1,
            timeout=30
        )

        self.assertEqual(len(self.get_dir_ino("/splitdir")["dirfrags"]), 1)

    def test_rapid_creation(self):
        """
        That the fast-splitting limit of 1.5x normal limit is
        applied when creating dentries quickly.
        """

        split_size = 100
        merge_size = 1

        self._configure(
            mds_bal_split_size=split_size,
            mds_bal_merge_size=merge_size,
            mds_bal_split_bits=3,
            mds_bal_fragment_size_max=int(split_size * 1.5 + 2)
        )

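        # The fragment size cap is set only just above the 1.5x fast-split
        # threshold: if the immediate split did not kick in, the directory
        # would hit mds_bal_fragment_size_max (where the MDS rejects new
        # entries) and the create workload below would fail outright.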
        # We test this only at a single split level.  If a client was sending
        # IO so fast that it hit a second split before the first split
        # was complete, it could violate mds_bal_fragment_size_max -- there
        # is a window where the child dirfrags of a split are unfrozen
        # (so they can grow), but still have STATE_FRAGMENTING (so they
        # can't be split).

        # By writing 4x the split size when the split bits are set
        # to 3 (i.e. an 8-way split), I am reasonably sure to see precisely
        # one split.  The test is to check whether that split
        # happens soon enough that the client doesn't exceed
        # 2x the split_size (the "immediate" split mode should
        # kick in at 1.5x the split size).

        self.assertEqual(self.get_splits(), 0)
        self.mount_a.create_n_files("splitdir/file", split_size * 4)
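        # reject_fn makes the wait fail immediately if the split counter ever
        # overshoots 1, rather than waiting out the full timeout.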
        self.wait_until_equal(
            self.get_splits,
            1,
            reject_fn=lambda s: s > 1,
            timeout=30
        )

    def test_deep_split(self):
        """
        That when the directory grows many times larger than split size,
        the fragments get split again.
        """

        split_size = 100
        merge_size = 1  # i.e. don't merge a frag unless it's empty
        split_bits = 1

        branch_factor = 2**split_bits

        # Arbitrary: how many levels shall we try fragmenting before
        # ending the test?
        max_depth = 5

        self._configure(
            mds_bal_split_size=split_size,
            mds_bal_merge_size=merge_size,
            mds_bal_split_bits=split_bits
        )

        # Each iteration we will create another level of fragments.  The
        # placement of dentries into fragments is by hashes (i.e. pseudo
        # random), so we rely on statistics to get the behaviour that
        # by writing about 1.5x as many dentries as the split_size times
        # the number of frags, we will get them all to exceed their
        # split size and trigger a split.
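        # For example, with split_size=100 and split_bits=1 (branch_factor=2):
        #   depth 0: 150 files in 1 frag   -> +1 split  -> 2 frags
        #   depth 1: 300 files in 2 frags  -> +2 splits -> 4 frags
        #   depth 2: 600 files in 4 frags  -> +4 splits -> 8 frags
        # so splits_expected grows by branch_factor**depth on each pass.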
        depth = 0
        files_written = 0
        splits_expected = 0
        while depth < max_depth:
            log.info("Writing files for depth {0}".format(depth))
            target_files = branch_factor**depth * int(split_size * 1.5)
            create_files = target_files - files_written

            self.ceph_cluster.mon_manager.raw_cluster_cmd("log",
                "{0} Writing {1} files (depth={2})".format(
                    self.__class__.__name__, create_files, depth
                ))
            self.mount_a.create_n_files("splitdir/file_{0}".format(depth),
                                        create_files)
            self.ceph_cluster.mon_manager.raw_cluster_cmd("log",
                "{0} Done".format(self.__class__.__name__))

            files_written += create_files
            log.info("Now have {0} files".format(files_written))

            splits_expected += branch_factor**depth
            log.info("Waiting to see {0} splits".format(splits_expected))
            try:
                self.wait_until_equal(
                    self.get_splits,
                    splits_expected,
                    timeout=30,
                    reject_fn=lambda x: x > splits_expected
                )

                frags = self.get_dir_ino("/splitdir")['dirfrags']
                self.assertEqual(len(frags), branch_factor**(depth+1))
                self.assertEqual(
                    sum([len(f['dentries']) for f in frags]),
                    target_files
                )
            except:
                # On failures, log what fragmentation we actually ended
                # up with.  This block is just for logging; at the end
                # we raise the exception again.
                frags = self.get_dir_ino("/splitdir")['dirfrags']
                log.info("depth={0} splits_expected={1} files_written={2}".format(
                    depth, splits_expected, files_written
                ))
                log.info("Dirfrags:")
                for f in frags:
                    log.info("{0}: {1}".format(
                        f['dirfrag'], len(f['dentries'])
                    ))
                raise

            depth += 1

        # Remember the inode number because we will be checking for
        # objects later.
        dir_inode_no = self.mount_a.path_to_ino("splitdir")

        self.mount_a.run_shell(["rm", "-rf", "splitdir/"])
        self.mount_a.umount_wait()

        self.fs.mds_asok(['flush', 'journal'])

        # Wait for all strays to purge
        self.wait_until_equal(
            lambda: self.fs.mds_asok(['perf', 'dump', 'mds_cache']
                                     )['mds_cache']['num_strays'],
            0,
            timeout=1200
        )
        # Check that the metadata pool objects for all the myriad
        # child fragments are gone
        metadata_objs = self.fs.rados(["ls"])
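        # Dirfrag objects in the metadata pool are named "<inode hex>.<frag>",
        # so anything prefixed with this directory's inode number (in hex) is
        # a leftover fragment object.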
        frag_objs = []
        for o in metadata_objs:
            if o.startswith("{0:x}.".format(dir_inode_no)):
                frag_objs.append(o)
        self.assertListEqual(frag_objs, [])