]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | |
2 | from StringIO import StringIO | |
3 | from tasks.cephfs.cephfs_test_case import CephFSTestCase | |
4 | from tasks.workunit import task as workunit | |
5 | ||
# On-disk journal format identifiers, as accepted by the
# 'mds journal format' MDS config option used below.
JOURNAL_FORMAT_LEGACY = 0
JOURNAL_FORMAT_RESILIENT = 1
8 | ||
9 | ||
class TestJournalMigration(CephFSTestCase):
    """
    Exercise the MDS journal format migration path: create a filesystem
    whose journal is written in the legacy format, populate it with client
    work, then restart the MDS with 'mds journal format' set to the
    resilient version and verify that the journal was rewritten, remains
    readable by cephfs-journal-tool, and that clients and the
    standby-replay daemon survive the migration.
    """
    CLIENTS_REQUIRED = 1
    MDSS_REQUIRED = 2

    def test_journal_migration(self):
        old_journal_version = JOURNAL_FORMAT_LEGACY
        new_journal_version = JOURNAL_FORMAT_RESILIENT

        # Pick out two daemons to use
        mds_a, mds_b = sorted(self.mds_cluster.mds_ids[0:2])

        self.mount_a.umount_wait()
        self.fs.mds_stop()

        # Enable standby replay, to cover the bug case #8811 where
        # a standby replay might mistakenly end up trying to rewrite
        # the journal at the same time as an active daemon.
        self.fs.set_ceph_conf('mds', 'mds standby replay', "true")
        self.fs.set_ceph_conf('mds', 'mds standby for rank', "0")

        # Create a filesystem using the older journal format.
        self.fs.set_ceph_conf('mds', 'mds journal format', old_journal_version)
        self.fs.recreate()
        self.fs.mds_restart(mds_id=mds_a)
        self.fs.wait_for_daemons()
        self.assertEqual(self.fs.get_active_names(), [mds_a])

        def replay_names():
            # Names of the daemons currently in standby-replay for this fs
            return [s['name']
                    for s in self.fs.status().get_replays(fscid = self.fs.id)]

        # Start the standby and wait for it to come up
        self.fs.mds_restart(mds_id=mds_b)
        self.wait_until_equal(
            replay_names,
            [mds_b],
            timeout = 30)

        # Do some client work so that the log is populated with something.
        with self.mount_a.mounted():
            self.mount_a.create_files()
            self.mount_a.check_files()  # sanity, this should always pass

            # Run a more substantial workunit so that the length of the log
            # to be converted is going to span at least a few segments
            workunit(self.ctx, {
                'clients': {
                    "client.{0}".format(self.mount_a.client_id): ["suites/fsstress.sh"],
                },
                "timeout": "3h"
            })

        # Modify the ceph.conf to ask the MDS to use the new journal format.
        self.fs.set_ceph_conf('mds', 'mds journal format', new_journal_version)

        # Restart the MDS.
        self.fs.mds_fail_restart(mds_id=mds_a)
        self.fs.mds_fail_restart(mds_id=mds_b)

        # This ensures that all daemons come up into a valid state
        self.fs.wait_for_daemons()

        # Check that files created in the initial client workload are still visible
        # in a client mount.
        with self.mount_a.mounted():
            self.mount_a.check_files()

        # Verify that the journal really has been rewritten.
        journal_version = self.fs.get_journal_version()
        if journal_version != new_journal_version:
            # FIX: journal_version is an int, not a callable -- the original
            # code did journal_version() here, which would have raised
            # TypeError instead of the intended RuntimeError.
            raise RuntimeError("Journal was not upgraded, version should be {0} but is {1}".format(
                new_journal_version, journal_version
            ))

        # Verify that cephfs-journal-tool can now read the rewritten journal
        inspect_out = self.fs.journal_tool(["journal", "inspect"])
        if not inspect_out.endswith(": OK"):
            raise RuntimeError("Unexpected journal-tool result: '{0}'".format(
                inspect_out
            ))

        # Dump all journal events to JSON and count them remotely; fsstress
        # should have generated well over a thousand.
        self.fs.journal_tool(["event", "get", "json", "--path", "/tmp/journal.json"])
        p = self.fs.tool_remote.run(
            args=[
                "python",
                "-c",
                "import json; print len(json.load(open('/tmp/journal.json')))"
            ],
            stdout=StringIO())
        event_count = int(p.stdout.getvalue().strip())
        if event_count < 1000:
            # Approximate value of "lots", expected from having run fsstress
            raise RuntimeError("Unexpectedly few journal events: {0}".format(event_count))

        # Do some client work to check that writing the log is still working
        with self.mount_a.mounted():
            workunit(self.ctx, {
                'clients': {
                    "client.{0}".format(self.mount_a.client_id): ["fs/misc/trivial_sync.sh"],
                },
                "timeout": "3h"
            })

        # Check that both an active and a standby replay are still up
        self.assertEqual(len(replay_names()), 1)
        self.assertEqual(len(self.fs.get_active_names()), 1)
        self.assertTrue(self.mds_cluster.mds_daemons[mds_a].running())
        self.assertTrue(self.mds_cluster.mds_daemons[mds_b].running())