from io import StringIO

from tasks.cephfs.cephfs_test_case import CephFSTestCase
from tasks.workunit import task as workunit

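# MDS on-disk journal format versions: 0 is the original legacy layout and
# 1 is the newer "resilient" format that this test migrates to.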
JOURNAL_FORMAT_LEGACY = 0
JOURNAL_FORMAT_RESILIENT = 1


class TestJournalMigration(CephFSTestCase):
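    # Two MDS daemons: one to hold the active rank and one to follow it in
    # standby replay once standby replay is enabled below.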
    CLIENTS_REQUIRED = 1
    MDSS_REQUIRED = 2

    def test_journal_migration(self):
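        """
        Create a filesystem with the legacy journal format, populate its
        journal with client work, then restart the MDS with the resilient
        format configured and check that the journal is rewritten without
        losing any client-visible data.
        """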
        old_journal_version = JOURNAL_FORMAT_LEGACY
        new_journal_version = JOURNAL_FORMAT_RESILIENT

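        # Unmount the client and stop the MDS so that the filesystem can be
        # recreated from scratch with the legacy journal format.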
        self.mount_a.umount_wait()
        self.fs.mds_stop()

        # Create a filesystem using the older journal format.
        self.fs.set_ceph_conf('mds', 'mds journal format', old_journal_version)
        self.fs.mds_restart()
        self.fs.recreate()

        # Enable standby replay, to cover the bug case #8811 where
        # a standby replay might mistakenly end up trying to rewrite
        # the journal at the same time as an active daemon.
        self.fs.set_allow_standby_replay(True)

        status = self.fs.wait_for_daemons()

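        # Confirm that a standby replay daemon is actually following the
        # active rank before the migration begins.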
        self.assertIsNotNone(self.fs.get_replay(status=status))

        # Do some client work so that the log is populated with something.
        with self.mount_a.mounted():
            self.mount_a.create_files()
            self.mount_a.check_files()  # sanity check; this should always pass

            # Run a more substantial workunit so that the length of the log
            # to be converted spans at least a few segments
            workunit(self.ctx, {
                'clients': {
                    "client.{0}".format(self.mount_a.client_id): ["suites/fsstress.sh"],
                },
                "timeout": "3h"
            })

        # Modify the ceph.conf to ask the MDS to use the new journal format.
        self.fs.set_ceph_conf('mds', 'mds journal format', new_journal_version)

        # Restart the MDS.
        self.fs.mds_fail_restart()

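        # The MDS rewrites the legacy journal in the new format during
        # replay on this restart (verified below).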
        # This ensures that all daemons come up into a valid state
        status = self.fs.wait_for_daemons()

        # Check that files created in the initial client workload are still
        # visible in a client mount.
        with self.mount_a.mounted():
            self.mount_a.check_files()

        # Verify that the journal really has been rewritten.
        journal_version = self.fs.get_journal_version()
        if journal_version != new_journal_version:
            raise RuntimeError("Journal was not upgraded, version should be {0} but is {1}".format(
                new_journal_version, journal_version
            ))

        # Verify that cephfs-journal-tool can now read the rewritten journal
        inspect_out = self.fs.journal_tool(["journal", "inspect"], 0)
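        # "journal inspect" ends its integrity report with ": OK" when the
        # journal is readable and undamaged; anything else is a failure.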
        if not inspect_out.endswith(": OK"):
            raise RuntimeError("Unexpected journal-tool result: '{0}'".format(
                inspect_out
            ))

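        # Dump every journal event to a JSON file on the tool host, then
        # count the entries to confirm that the rewritten journal still
        # contains the workload's events.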
        self.fs.journal_tool(["event", "get", "json",
                              "--path", "/tmp/journal.json"], 0)
        p = self.fs.tool_remote.run(
            args=[
                "python3",
                "-c",
                "import json; print(len(json.load(open('/tmp/journal.json'))))"
            ],
            stdout=StringIO())
        event_count = int(p.stdout.getvalue().strip())
        if event_count < 1000:
            # Approximate value of "lots", expected from having run fsstress
            raise RuntimeError("Unexpectedly few journal events: {0}".format(event_count))

        # Do some client work to check that writing the log is still working
        with self.mount_a.mounted():
            workunit(self.ctx, {
                'clients': {
                    "client.{0}".format(self.mount_a.client_id): ["fs/misc/trivial_sync.sh"],
                },
                "timeout": "3h"
            })

        # Check that both an active and a standby replay are still up
        status = self.fs.status()
        self.assertEqual(len(list(self.fs.get_replays(status=status))), 1)
        self.assertEqual(len(list(self.fs.get_ranks(status=status))), 1)