ceph/qa/tasks/cephfs/test_scrub.py
1 """
2 Test CephFS scrub (distinct from OSD scrub) functionality
3 """
4
5 from io import BytesIO
6 import logging
7 from collections import namedtuple
8
9 from tasks.cephfs.cephfs_test_case import CephFSTestCase
10
11 log = logging.getLogger(__name__)
12
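# A ValidationError pairs an exception with its backtrace. Workload subclasses
# may append these to self._errors during validate(); TestScrub._scrub() logs
# whatever is returned and raises an AssertionError on the first entry.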
ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])


class Workload(CephFSTestCase):
    def __init__(self, test, filesystem, mount):
        super().__init__()
        self._test = test
        self._mount = mount
        self._filesystem = filesystem
        self._initial_state = None

        # Accumulate backtraces for every failed validation, and return them. Backtraces
        # are rather verbose, but we only see them when something breaks, and they
        # let us see which check failed without having to decorate each check with
        # a string
        self._errors = []

    def write(self):
        """
        Write the workload files to the mount
        """
        raise NotImplementedError()

    def validate(self):
        """
        Read from the mount and validate that the workload files are present (i.e. have
        survived or been reconstructed from the test scenario)
        """
        raise NotImplementedError()

    def damage(self):
        """
        Damage the filesystem pools in ways that will be interesting to recover from. By
        default just wipe everything in the metadata pool
        """
        # Delete every object in the metadata pool
        pool = self._filesystem.get_metadata_pool_name()
        self._filesystem.rados(["purge", pool, '--yes-i-really-really-mean-it'])

    def flush(self):
        """
        Called after client unmount, after write: flush whatever you want
        """
        self._filesystem.mds_asok(["flush", "journal"])


class BacktraceWorkload(Workload):
    """
    Single file, single directory, wipe the backtrace and check it.
    """
    def write(self):
        self._mount.run_shell(["mkdir", "subdir"])
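        # write_n_mb gives the file 6 MB of data, so the inode has objects in the
        # data pool; the backtrace this workload later wipes and checks is stored
        # as the "parent" xattr on the inode's first data object.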
        self._mount.write_n_mb("subdir/sixmegs", 6)

    def validate(self):
        st = self._mount.stat("subdir/sixmegs")
        self._filesystem.mds_asok(["flush", "journal"])
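        # Read the backtrace back from the data pool: its 'ancestors' list starts
        # with the file's own dentry, so the first entry's 'dname' should be the
        # file name itself.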
        bt = self._filesystem.read_backtrace(st['st_ino'])
        parent = bt['ancestors'][0]['dname']
        self.assertEqual(parent, 'sixmegs')
        return self._errors

    def damage(self):
        st = self._mount.stat("subdir/sixmegs")
        self._filesystem.mds_asok(["flush", "journal"])
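        # Blank out the "parent" xattr on the file's first data object, wiping the
        # backtrace; a later "scrub start / recursive,repair" should detect this
        # and rewrite it, which validate() then checks.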
        self._filesystem._write_data_xattr(st['st_ino'], "parent", "")

    def create_files(self, nfiles=1000):
        self._mount.create_n_files("scrub-new-files/file", nfiles)


class DupInodeWorkload(Workload):
    """
    Duplicate an inode and try scrubbing it twice.
87 """
88
89 def write(self):
90 self._mount.run_shell(["mkdir", "parent"])
91 self._mount.run_shell(["mkdir", "parent/child"])
92 self._mount.write_n_mb("parent/parentfile", 6)
93 self._mount.write_n_mb("parent/child/childfile", 6)
94
95 def damage(self):
96 self._mount.umount_wait()
97 self._filesystem.mds_asok(["flush", "journal"])
98 self._filesystem.fail()
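        # With the MDS down, edit the metadata pool directly: 10000000000.00000000
        # is the dirfrag object of the "parent" directory (the first client-created
        # inode is normally 0x10000000000). Copy the omap value of its primary
        # dentry key "parentfile_head" to a new key "shadow_head", so that two
        # dentries now reference the same inode.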
        d = self._filesystem.radosmo(["getomapval", "10000000000.00000000", "parentfile_head", "-"])
        self._filesystem.radosm(["setomapval", "10000000000.00000000", "shadow_head"], stdin=BytesIO(d))
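        # The MDS would normally refuse to load such invalid metadata; this hack
        # option lets it come back up anyway so that scrub can be run against the
        # duplicate.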
        self._test.config_set('mds', 'mds_hack_allow_loading_invalid_metadata', True)
        self._filesystem.set_joinable()
        self._filesystem.wait_for_daemons()

    def validate(self):
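        # Start a recursive repair scrub from the root and wait for it to finish;
        # the MDS daemons should remain healthy despite the duplicate dentry
        # injected by damage().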
        out_json = self._filesystem.run_scrub(["start", "/", "recursive,repair"])
        self.assertNotEqual(out_json, None)
        self.assertEqual(out_json["return_code"], 0)
        self.assertEqual(self._filesystem.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
        self.assertTrue(self._filesystem.are_daemons_healthy())
        return self._errors


class TestScrub(CephFSTestCase):
    MDSS_REQUIRED = 1

    def setUp(self):
        super().setUp()

    def _scrub(self, workload, workers=1):
121 """
122 That when all objects in metadata pool are removed, we can rebuild a metadata pool
123 based on the contents of a data pool, and a client can see and read our files.
124 """

        # First, inject some files

        workload.write()

        # We need the MDS not to be strict about scatter/stat accounting (in production
        # these options are off by default, but in QA we need to explicitly disable them)
        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)

        # Apply any data damage the workload wants
        workload.damage()

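        # Kick off a recursive repair scrub from the root. run_scrub returns the
        # MDS's decoded JSON reply, which should contain at least the "return_code"
        # and the "scrub_tag" used below to wait for completion.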
        out_json = self.fs.run_scrub(["start", "/", "recursive,repair"])
        self.assertNotEqual(out_json, None)
        self.assertEqual(out_json["return_code"], 0)
        self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)

        # See that the files are present and correct
        errors = workload.validate()
        if errors:
            log.error("Validation errors found: {0}".format(len(errors)))
            for e in errors:
                log.error(e.exception)
                log.error(e.backtrace)
            raise AssertionError("Validation failed, first error: {0}\n{1}".format(
                errors[0].exception, errors[0].backtrace
            ))

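    # "damage ls" reports the metadata damage entries the MDS has recorded; each
    # entry is a dict with (at least) a 'damage_type' field such as 'backtrace',
    # 'dentry' or 'dir_frag'. We only count entries of the requested type.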
    def _get_damage_count(self, damage_type='backtrace'):
        out_json = self.fs.rank_tell(["damage", "ls"])
        self.assertNotEqual(out_json, None)

        damage_count = 0
        for it in out_json:
            if it['damage_type'] == damage_type:
                damage_count += 1
        return damage_count

    def _scrub_new_files(self, workload):
        """
        That scrubbing new files does not lead to errors
        """
        workload.create_files(1000)
        self.fs.wait_until_scrub_complete()
        self.assertEqual(self._get_damage_count(), 0)

    def test_scrub_backtrace_for_new_files(self):
        self._scrub_new_files(BacktraceWorkload(self, self.fs, self.mount_a))

    def test_scrub_backtrace(self):
        self._scrub(BacktraceWorkload(self, self.fs, self.mount_a))

    def test_scrub_dup_inode(self):
        self._scrub(DupInodeWorkload(self, self.fs, self.mount_a))