]> git.proxmox.com Git - ceph.git/blob - ceph/qa/tasks/cephfs/test_scrub.py
update sources to 12.2.8
[ceph.git] / ceph / qa / tasks / cephfs / test_scrub.py
1 """
2 Test CephFS scrub (distinct from OSD scrub) functionality
3 """
4 import logging
5 import os
6 import traceback
7 from collections import namedtuple
8
9 from teuthology.orchestra.run import CommandFailedError
10 from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
11
12 log = logging.getLogger(__name__)
13
14 ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
15
16
class Workload(CephFSTestCase):
    """
    Base class for one scrub scenario: a subclass writes some files,
    damages the filesystem in an interesting way, and validates the
    result after TestScrub has run a repair scrub.
    """

    def __init__(self, filesystem, mount):
        # NOTE(review): deliberately does not call super().__init__() --
        # Workload objects are driven manually by TestScrub rather than
        # collected as test cases; they appear to inherit CephFSTestCase
        # only for its assert* helpers.  Confirm before changing.
        self._mount = mount
        self._filesystem = filesystem
        self._initial_state = None

        # Accumulate backtraces for every failed validation, and return them. Backtraces
        # are rather verbose, but we only see them when something breaks, and they
        # let us see which check failed without having to decorate each check with
        # a string
        self._errors = []

    def write(self):
        """
        Write the workload files to the mount
        """
        raise NotImplementedError()

    def validate(self):
        """
        Read from the mount and validate that the workload files are present (i.e. have
        survived or been reconstructed from the test scenario)

        :return: the list of accumulated ValidationError tuples (empty on success)
        """
        raise NotImplementedError()

    def damage(self):
        """
        Damage the filesystem pools in ways that will be interesting to recover from. By
        default just wipe everything in the metadata pool
        """
        # Delete every object in the metadata pool.  `rados ls` output is
        # newline-terminated, so split("\n") can yield empty strings; skip
        # them so we never issue a bogus `rados rm ""`.
        objects = self._filesystem.rados(["ls"]).split("\n")
        for o in objects:
            if o:
                self._filesystem.rados(["rm", o])

    def flush(self):
        """
        Called after client unmount, after write: flush whatever you want
        """
        self._filesystem.mds_asok(["flush", "journal"])
57
58
class BacktraceWorkload(Workload):
    """
    One file inside one directory: wipe the file's backtrace xattr and
    check that scrub restores it.
    """

    def write(self):
        self._mount.run_shell(["mkdir", "subdir"])
        self._mount.write_n_mb("subdir/sixmegs", 6)

    def validate(self):
        stat_result = self._mount.stat("subdir/sixmegs")
        self._filesystem.mds_asok(["flush", "journal"])
        backtrace = self._filesystem.read_backtrace(stat_result['st_ino'])
        # The first ancestor entry is the inode's own dentry, so its dname
        # must be the file's name.
        self.assertEqual(backtrace['ancestors'][0]['dname'], 'sixmegs')
        return self._errors

    def damage(self):
        stat_result = self._mount.stat("subdir/sixmegs")
        self._filesystem.mds_asok(["flush", "journal"])
        # Blank out the "parent" xattr (the backtrace) on the file's data object.
        self._filesystem._write_data_xattr(stat_result['st_ino'], "parent", "")
79
80
class DupInodeWorkload(Workload):
    """
    Duplicate an inode and try scrubbing it twice.
    """

    def write(self):
        # Build a small tree with a file in the parent directory, so the
        # parent dirfrag has a dentry we can duplicate in damage().
        self._mount.run_shell(["mkdir", "parent"])
        self._mount.run_shell(["mkdir", "parent/child"])
        self._mount.write_n_mb("parent/parentfile", 6)
        self._mount.write_n_mb("parent/child/childfile", 6)

    def damage(self):
        """
        Create a duplicate dentry: copy the omap value of "parentfile_head"
        into a new key "shadow_head" on the same dirfrag object, so two
        dentries point at the same inode.
        """
        # Scratch file used to carry the omap value between the two rados calls.
        temp_bin_path = "/tmp/10000000000.00000000_omap.bin"
        # Order matters here: unmount the client and flush the journal so the
        # dirfrag object is up to date, then stop the MDS before editing the
        # metadata pool behind its back.
        self._mount.umount()
        self._filesystem.mds_asok(["flush", "journal"])
        self._filesystem.mds_stop()
        # "10000000000.00000000" is presumably the dirfrag object of the
        # first client-created directory ("parent") -- relies on deterministic
        # inode numbering in this test environment; TODO confirm.
        self._filesystem.rados(["getomapval", "10000000000.00000000",
                                "parentfile_head", temp_bin_path])
        self._filesystem.rados(["setomapval", "10000000000.00000000",
                                "shadow_head"], stdin_file=temp_bin_path)
        # Without this the MDS would refuse to load the now-invalid metadata.
        self._filesystem.set_ceph_conf('mds', 'mds hack allow loading invalid metadata', True)
        self._filesystem.mds_restart()
        self._filesystem.wait_for_daemons()

    def validate(self):
        # A recursive repair scrub must complete and leave the daemons healthy
        # despite the duplicated dentry.
        out_json = self._filesystem.mds_asok(["scrub_path", "/", "recursive", "repair"])
        self.assertNotEqual(out_json, None)
        self.assertTrue(self._filesystem.are_daemons_healthy())
        return self._errors
110
111
class TestScrub(CephFSTestCase):
    MDSS_REQUIRED = 1

    def _scrub(self, workload, workers=1):
        """
        Drive one scrub scenario: write the workload's files, apply its
        damage, run a recursive repair scrub from the root, then let the
        workload validate what the scrub left behind.

        :param workload: a Workload instance providing write/damage/validate
        :param workers: currently unused
        :raises AssertionError: if the workload reports validation errors
        """
        # First, inject some files
        workload.write()

        # Relax MDS stat strictness while we damage metadata (in production
        # these options are off by default, but in QA we need to explicitly
        # disable them)
        for option in ('mds verify scatter', 'mds debug scatterstat'):
            self.fs.set_ceph_conf('mds', option, False)

        # Apply any data damage the workload wants
        workload.damage()

        # Recursive repair scrub over the whole namespace
        out_json = self.fs.mds_asok(["scrub_path", "/", "recursive", "repair"])
        self.assertNotEqual(out_json, None)

        # See that the files are present and correct
        errors = workload.validate()
        if not errors:
            return

        log.error("Validation errors found: {0}".format(len(errors)))
        for err in errors:
            log.error(err.exception)
            log.error(err.backtrace)
        raise AssertionError("Validation failed, first error: {0}\n{1}".format(
            errors[0].exception, errors[0].backtrace
        ))

    def test_scrub_backtrace(self):
        self._scrub(BacktraceWorkload(self.fs, self.mount_a))

    def test_scrub_dup_inode(self):
        self._scrub(DupInodeWorkload(self.fs, self.mount_a))