ceph/qa/tasks/cephfs/test_scrub.py
1 """
2 Test CephFS scrub (distinct from OSD scrub) functionality
3 """
4
5 from io import BytesIO
6 import logging
7 from collections import namedtuple
8
9 from tasks.cephfs.cephfs_test_case import CephFSTestCase
10
11 log = logging.getLogger(__name__)
12
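# A ValidationError pairs an exception with its backtrace. Workload subclasses
# may append these to self._errors during validate(); TestScrub._scrub() logs
# whatever is returned and raises an AssertionError on the first entry.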
ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])


class Workload(CephFSTestCase):
    def __init__(self, test, filesystem, mount):
        super().__init__()
        self._test = test
        self._mount = mount
        self._filesystem = filesystem
        self._initial_state = None

        # Accumulate backtraces for every failed validation, and return them. Backtraces
        # are rather verbose, but we only see them when something breaks, and they
        # let us see which check failed without having to decorate each check with
        # a string
        self._errors = []

    def write(self):
        """
        Write the workload files to the mount
        """
        raise NotImplementedError()

    def validate(self):
        """
        Read from the mount and validate that the workload files are present (i.e. have
        survived or been reconstructed from the test scenario)
        """
        raise NotImplementedError()

    def damage(self):
        """
        Damage the filesystem pools in ways that will be interesting to recover from. By
        default just wipe everything in the metadata pool
        """
        # Delete every object in the metadata pool
        pool = self._filesystem.get_metadata_pool_name()
        self._filesystem.rados(["purge", pool, '--yes-i-really-really-mean-it'])

    def flush(self):
        """
        Called after client unmount, after write: flush whatever you want
        """
        self._filesystem.mds_asok(["flush", "journal"])


class BacktraceWorkload(Workload):
    """
    Single file, single directory, wipe the backtrace and check it.
    """
    def write(self):
        self._mount.run_shell(["mkdir", "subdir"])
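        # write_n_mb gives the file 6 MB of data, so the inode has objects in the
        # data pool; the backtrace this workload later wipes and checks is stored
        # as the "parent" xattr on the inode's first data object.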
        self._mount.write_n_mb("subdir/sixmegs", 6)

    def validate(self):
        st = self._mount.stat("subdir/sixmegs")
        self._filesystem.mds_asok(["flush", "journal"])
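        # Read the backtrace back from the data pool: its 'ancestors' list starts
        # with the file's own dentry, so the first entry's 'dname' should be the
        # file name itself.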
        bt = self._filesystem.read_backtrace(st['st_ino'])
        parent = bt['ancestors'][0]['dname']
        self.assertEqual(parent, 'sixmegs')
        return self._errors

    def damage(self):
        st = self._mount.stat("subdir/sixmegs")
        self._filesystem.mds_asok(["flush", "journal"])
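        # Blank out the "parent" xattr on the file's first data object, wiping the
        # backtrace; a later "scrub start / recursive,repair" should detect this
        # and rewrite it, which validate() then checks.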
        self._filesystem._write_data_xattr(st['st_ino'], "parent", "")

    def create_files(self, nfiles=1000):
        self._mount.create_n_files("scrub-new-files/file", nfiles)


class DupInodeWorkload(Workload):
    """
    Duplicate an inode and try scrubbing it twice.
87 """
88
89 def write(self):
90 self._mount.run_shell(["mkdir", "parent"])
91 self._mount.run_shell(["mkdir", "parent/child"])
92 self._mount.write_n_mb("parent/parentfile", 6)
93 self._mount.write_n_mb("parent/child/childfile", 6)
94
95 def damage(self):
96 self._mount.umount_wait()
97 self._filesystem.mds_asok(["flush", "journal"])
98 self._filesystem.fail()
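        # With the MDS down, edit the metadata pool directly: 10000000000.00000000
        # is the dirfrag object of the "parent" directory (the first client-created
        # inode is normally 0x10000000000). Copy the omap value of its primary
        # dentry key "parentfile_head" to a new key "shadow_head", so that two
        # dentries now reference the same inode.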
        d = self._filesystem.radosmo(["getomapval", "10000000000.00000000", "parentfile_head", "-"])
        self._filesystem.radosm(["setomapval", "10000000000.00000000", "shadow_head"], stdin=BytesIO(d))
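        # The MDS would normally refuse to load such invalid metadata; this hack
        # option lets it come back up anyway so that scrub can be run against the
        # duplicate.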
        self._test.config_set('mds', 'mds_hack_allow_loading_invalid_metadata', True)
        self._filesystem.set_joinable()
        self._filesystem.wait_for_daemons()

    def validate(self):
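        # Start a recursive repair scrub from the root and wait for it to finish;
        # the MDS daemons should remain healthy despite the duplicate dentry
        # injected by damage().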
        out_json = self._filesystem.run_scrub(["start", "/", "recursive,repair"])
        self.assertNotEqual(out_json, None)
        self.assertEqual(out_json["return_code"], 0)
        self.assertEqual(self._filesystem.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
        self.assertTrue(self._filesystem.are_daemons_healthy())
        return self._errors


class TestScrub(CephFSTestCase):
    MDSS_REQUIRED = 1

    def setUp(self):
        super().setUp()

    def _scrub(self, workload, workers=1):
121 """
122 That when all objects in metadata pool are removed, we can rebuild a metadata pool
123 based on the contents of a data pool, and a client can see and read our files.
124 """

        # First, inject some files

        workload.write()

        # We need the MDS not to be strict about scatter/stat accounting (in production
        # these options are off by default, but in QA we need to explicitly disable them)
        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)

        # Apply any data damage the workload wants
        workload.damage()

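        # Kick off a recursive repair scrub from the root. run_scrub returns the
        # MDS's decoded JSON reply, which should contain at least the "return_code"
        # and the "scrub_tag" used below to wait for completion.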
        out_json = self.fs.run_scrub(["start", "/", "recursive,repair"])
        self.assertNotEqual(out_json, None)
        self.assertEqual(out_json["return_code"], 0)
        self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)

        # See that the files are present and correct
        errors = workload.validate()
        if errors:
            log.error("Validation errors found: {0}".format(len(errors)))
            for e in errors:
                log.error(e.exception)
                log.error(e.backtrace)
            raise AssertionError("Validation failed, first error: {0}\n{1}".format(
                errors[0].exception, errors[0].backtrace
            ))

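    # "damage ls" reports the metadata damage entries the MDS has recorded; each
    # entry is a dict with (at least) a 'damage_type' field such as 'backtrace',
    # 'dentry' or 'dir_frag'. We only count entries of the requested type.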
    def _get_damage_count(self, damage_type='backtrace'):
        out_json = self.fs.rank_tell(["damage", "ls"])
        self.assertNotEqual(out_json, None)

        damage_count = 0
        for it in out_json:
            if it['damage_type'] == damage_type:
                damage_count += 1
        return damage_count

    def _scrub_new_files(self, workload):
        """
        That scrubbing new files does not lead to errors
        """
        workload.create_files(1000)
        self.fs.wait_until_scrub_complete()
        self.assertEqual(self._get_damage_count(), 0)

    def test_scrub_backtrace_for_new_files(self):
        self._scrub_new_files(BacktraceWorkload(self, self.fs, self.mount_a))

    def test_scrub_backtrace(self):
        self._scrub(BacktraceWorkload(self, self.fs, self.mount_a))

    def test_scrub_dup_inode(self):
        self._scrub(DupInodeWorkload(self, self.fs, self.mount_a))