]> git.proxmox.com Git - ceph.git/blame - ceph/qa/tasks/cephfs/test_journal_migration.py
bump version to 18.2.2-pve1
[ceph.git] / ceph / qa / tasks / cephfs / test_journal_migration.py
CommitLineData
7c673cae 1
7c673cae
FG
2from tasks.cephfs.cephfs_test_case import CephFSTestCase
3from tasks.workunit import task as workunit
4
# Journal format version identifiers, passed as the value of the
# 'mds journal format' conf option below (0 = legacy on-disk format,
# 1 = the newer "resilient" format the test migrates to).
JOURNAL_FORMAT_LEGACY = 0
JOURNAL_FORMAT_RESILIENT = 1
class TestJournalMigration(CephFSTestCase):
    """Exercise the MDS journal format migration path.

    Creates a filesystem with the legacy journal format, generates journal
    traffic, then restarts the MDS with the resilient format configured and
    verifies that the journal is rewritten, readable, and still writable.
    """
    CLIENTS_REQUIRED = 1
    # Two MDS daemons: one active rank plus one standby-replay daemon.
    MDSS_REQUIRED = 2

    def test_journal_migration(self):
        old_journal_version = JOURNAL_FORMAT_LEGACY
        new_journal_version = JOURNAL_FORMAT_RESILIENT

        self.mount_a.umount_wait()
        self.fs.mds_stop()

        # Create a filesystem using the older journal format.
        self.fs.set_ceph_conf('mds', 'mds journal format', old_journal_version)
        self.fs.mds_restart()
        self.fs.recreate()

        # Enable standby replay, to cover the bug case #8811 where
        # a standby replay might mistakenly end up trying to rewrite
        # the journal at the same time as an active daemon.
        self.fs.set_allow_standby_replay(True)

        status = self.fs.wait_for_daemons()

        # Sanity: a standby-replay daemon actually came up.
        self.assertTrue(self.fs.get_replay(status=status) is not None)

        # Do some client work so that the log is populated with something.
        with self.mount_a.mounted_wait():
            self.mount_a.create_files()
            self.mount_a.check_files()  # sanity, this should always pass

            # Run a more substantial workunit so that the length of the log to be
            # converted is going to span at least a few segments
            workunit(self.ctx, {
                'clients': {
                    "client.{0}".format(self.mount_a.client_id): ["suites/fsstress.sh"],
                },
                "timeout": "3h"
            })

        # Modify the ceph.conf to ask the MDS to use the new journal format.
        self.fs.set_ceph_conf('mds', 'mds journal format', new_journal_version)

        # Restart the MDS.
        self.fs.mds_fail_restart()

        # This ensures that all daemons come up into a valid state
        status = self.fs.wait_for_daemons()

        # Check that files created in the initial client workload are still visible
        # in a client mount.
        with self.mount_a.mounted_wait():
            self.mount_a.check_files()

        # Verify that the journal really has been rewritten.
        journal_version = self.fs.get_journal_version()
        if journal_version != new_journal_version:
            # NOTE: journal_version is an int; previously this formatted
            # `journal_version()` which raised TypeError instead of the
            # intended RuntimeError on a failed migration.
            raise RuntimeError("Journal was not upgraded, version should be {0} but is {1}".format(
                new_journal_version, journal_version
            ))

        # Verify that cephfs-journal-tool can now read the rewritten journal
        inspect_out = self.fs.journal_tool(["journal", "inspect"], 0)
        if not inspect_out.endswith(": OK"):
            raise RuntimeError("Unexpected journal-tool result: '{0}'".format(
                inspect_out
            ))

        # Dump all journal events to JSON on the tool host and count them
        # remotely, rather than transferring the (large) dump back.
        self.fs.journal_tool(["event", "get", "json",
                              "--path", "/tmp/journal.json"], 0)
        p = self.fs.tool_remote.sh([
            "python3",
            "-c",
            "import json; print(len(json.load(open('/tmp/journal.json'))))"
        ])
        event_count = int(p.strip())
        if event_count < 1000:
            # Approximate value of "lots", expected from having run fsstress
            raise RuntimeError("Unexpectedly few journal events: {0}".format(event_count))

        # Do some client work to check that writing the log is still working
        with self.mount_a.mounted_wait():
            workunit(self.ctx, {
                'clients': {
                    "client.{0}".format(self.mount_a.client_id): ["fs/misc/trivial_sync.sh"],
                },
                "timeout": "3h"
            })

        # Check that both an active and a standby replay are still up
        status = self.fs.status()
        self.assertEqual(len(list(self.fs.get_replays(status=status))), 1)
        self.assertEqual(len(list(self.fs.get_ranks(status=status))), 1)