ceph/qa/tasks/cephfs/test_scrub.py
1 """
2 Test CephFS scrub (distinct from OSD scrub) functionality
3 """
4 import logging
5 from collections import namedtuple
6
7 from tasks.cephfs.cephfs_test_case import CephFSTestCase
8
9 log = logging.getLogger(__name__)
10
11 ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
12
13
14 class Workload(CephFSTestCase):
15 def __init__(self, filesystem, mount):
16 super().__init__()
17 self._mount = mount
18 self._filesystem = filesystem
19 self._initial_state = None
20
21 # Accumulate backtraces for every failed validation, and return them. Backtraces
22 # are rather verbose, but we only see them when something breaks, and they
23 # let us see which check failed without having to decorate each check with
24 # a string
25 self._errors = []
26
27 def write(self):
28 """
29 Write the workload files to the mount
30 """
31 raise NotImplementedError()
32
33 def validate(self):
34 """
35 Read from the mount and validate that the workload files are present (i.e. have
36 survived or been reconstructed from the test scenario)
37 """
38 raise NotImplementedError()
39
40 def damage(self):
41 """
42 Damage the filesystem pools in ways that will be interesting to recover from. By
43 default just wipe everything in the metadata pool
44 """
45 # Delete every object in the metadata pool
46 objects = self._filesystem.rados(["ls"]).split("\n")
47 for o in objects:
48 self._filesystem.rados(["rm", o])
49
50 def flush(self):
51 """
52 Called after client unmount, after write: flush whatever you want
53 """
        self._filesystem.mds_asok(["flush", "journal"])


class BacktraceWorkload(Workload):
    """
    Single file, single directory, wipe the backtrace and check it.
    """
    def write(self):
        self._mount.run_shell(["mkdir", "subdir"])
        self._mount.write_n_mb("subdir/sixmegs", 6)

    def validate(self):
        st = self._mount.stat("subdir/sixmegs")
        self._filesystem.mds_asok(["flush", "journal"])
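        # The backtrace lives in the "parent" xattr of the file's first data
        # object; ancestors[0] is the dentry of the inode itself, so its dname
        # should be the file's own name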
        bt = self._filesystem.read_backtrace(st['st_ino'])
        parent = bt['ancestors'][0]['dname']
        self.assertEqual(parent, 'sixmegs')
        return self._errors

    def damage(self):
        st = self._mount.stat("subdir/sixmegs")
        self._filesystem.mds_asok(["flush", "journal"])
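        # Blank out the "parent" xattr (the backtrace) on the file's first
        # data object so that scrub has something to detect and repair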
        self._filesystem._write_data_xattr(st['st_ino'], "parent", "")

    def create_files(self, nfiles=1000):
        self._mount.create_n_files("scrub-new-files/file", nfiles)


class DupInodeWorkload(Workload):
    """
    Duplicate an inode and try scrubbing it twice.
85 """
86
87 def write(self):
88 self._mount.run_shell(["mkdir", "parent"])
89 self._mount.run_shell(["mkdir", "parent/child"])
90 self._mount.write_n_mb("parent/parentfile", 6)
91 self._mount.write_n_mb("parent/child/childfile", 6)
92
93 def damage(self):
94 temp_bin_path = "/tmp/10000000000.00000000_omap.bin"
95 self._mount.umount_wait()
96 self._filesystem.mds_asok(["flush", "journal"])
97 self._filesystem.mds_stop()
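        # Duplicate a dentry: read the omap value for "parentfile" out of the
        # "parent" directory's dirfrag object (inode 0x10000000000) and write
        # it back under a second dentry name, so two dentries now reference
        # the same inode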
        self._filesystem.rados(["getomapval", "10000000000.00000000",
                                "parentfile_head", temp_bin_path])
        self._filesystem.rados(["setomapval", "10000000000.00000000",
                                "shadow_head"], stdin_file=temp_bin_path)
        self._filesystem.set_ceph_conf('mds', 'mds hack allow loading invalid metadata', True)
        self._filesystem.mds_restart()
        self._filesystem.wait_for_daemons()

    def validate(self):
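        # A recursive repair scrub across the duplicated dentry should complete
        # without crashing the MDS or leaving the cluster unhealthy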
        out_json = self._filesystem.rank_tell(["scrub", "start", "/", "recursive", "repair"])
        self.assertNotEqual(out_json, None)
        self.assertTrue(self._filesystem.are_daemons_healthy())
        return self._errors


class TestScrub(CephFSTestCase):
    MDSS_REQUIRED = 1

    def setUp(self):
        super().setUp()

    def _scrub(self, workload, workers=1):
120 """
121 That when all objects in metadata pool are removed, we can rebuild a metadata pool
122 based on the contents of a data pool, and a client can see and read our files.
123 """

        # First, inject some files

        workload.write()

        # The MDS should not be strict about scatter/rstat accounting while we
        # repair (in production these checks are off by default, but in QA we
        # need to explicitly disable them)
        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)

        # Apply any data damage the workload wants
        workload.damage()

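        # Kick off a recursive scrub with repair from the root of the filesystem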
        out_json = self.fs.rank_tell(["scrub", "start", "/", "recursive", "repair"])
        self.assertNotEqual(out_json, None)

        # See that the files are present and correct
        errors = workload.validate()
        if errors:
            log.error("Validation errors found: {0}".format(len(errors)))
            for e in errors:
                log.error(e.exception)
                log.error(e.backtrace)
            raise AssertionError("Validation failed, first error: {0}\n{1}".format(
                errors[0].exception, errors[0].backtrace
            ))

    def _get_damage_count(self, damage_type='backtrace'):
        out_json = self.fs.rank_tell(["damage", "ls"])
        self.assertNotEqual(out_json, None)

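        # "damage ls" returns a list of damage entries; count those whose
        # damage_type matches the one requested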
        damage_count = 0
        for it in out_json:
            if it['damage_type'] == damage_type:
                damage_count += 1
        return damage_count

    def _scrub_new_files(self, workload):
        """
        That scrubbing new files does not lead to errors
        """
        workload.create_files(1000)
        self._wait_until_scrub_complete()
        self.assertEqual(self._get_damage_count(), 0)

    def test_scrub_backtrace_for_new_files(self):
        self._scrub_new_files(BacktraceWorkload(self.fs, self.mount_a))

    def test_scrub_backtrace(self):
        self._scrub(BacktraceWorkload(self.fs, self.mount_a))

    def test_scrub_dup_inode(self):
        self._scrub(DupInodeWorkload(self.fs, self.mount_a))