]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | |
2 | """ | |
3 | Test our tools for recovering metadata from the data pool | |
4 | """ | |
5 | import json | |
6 | ||
7 | import logging | |
8 | import os | |
11fdf7f2 | 9 | import time |
7c673cae | 10 | import traceback |
e306af50 | 11 | |
f67539c2 | 12 | from io import BytesIO, StringIO |
7c673cae | 13 | from collections import namedtuple, defaultdict |
e306af50 | 14 | from textwrap import dedent |
7c673cae FG |
15 | |
16 | from teuthology.orchestra.run import CommandFailedError | |
17 | from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology | |
18 | ||
19 | log = logging.getLogger(__name__) | |
20 | ||
21 | ||
# One failed check: `exception` is the AssertionError (or, in some workloads,
# a plain message string) and `backtrace` is the formatted traceback captured
# where the check failed.
ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
23 | ||
24 | ||
class Workload(object):
    """
    Base class for a metadata-recovery workload: subclasses write files,
    optionally damage the pools, and then validate what the recovery
    tooling reconstructed.
    """
    def __init__(self, filesystem, mount):
        self._filesystem = filesystem
        self._mount = mount
        self._initial_state = None

        # Accumulate backtraces for every failed validation, and return them.
        # Backtraces are rather verbose, but we only see them when something
        # breaks, and they let us see which check failed without having to
        # decorate each check with a string.
        self._errors = []

    def assert_equal(self, a, b):
        """Record (not raise) a ValidationError if ``a != b``."""
        try:
            if a != b:
                raise AssertionError("{0} != {1}".format(a, b))
        except AssertionError as e:
            self._errors.append(ValidationError(e, traceback.format_exc(3)))

    def write(self):
        """
        Write the workload files to the mount
        """
        raise NotImplementedError()

    def validate(self):
        """
        Read from the mount and validate that the workload files are present
        (i.e. have survived or been reconstructed from the test scenario)
        """
        raise NotImplementedError()

    def damage(self):
        """
        Damage the filesystem pools in ways that will be interesting to
        recover from.  By default just wipe everything in the metadata pool.
        """
        # Delete every object in the metadata pool
        metadata_pool = self._filesystem.get_metadata_pool_name()
        self._filesystem.rados(["purge", metadata_pool, '--yes-i-really-really-mean-it'])

    def flush(self):
        """
        Called after client unmount, after write: flush whatever you want
        """
        self._filesystem.mds_asok(["flush", "journal"])
73 | ||
74 | ||
class SimpleWorkload(Workload):
    """
    One file inside one directory: check that both come back and that the
    file's size is recovered too.
    """
    def write(self):
        mount = self._mount
        mount.run_shell(["mkdir", "subdir"])
        mount.write_n_mb("subdir/sixmegs", 6)
        self._initial_state = mount.stat("subdir/sixmegs")

    def validate(self):
        self._mount.run_shell(["ls", "subdir"])
        recovered = self._mount.stat("subdir/sixmegs")
        self.assert_equal(recovered['st_size'], self._initial_state['st_size'])
        return self._errors
89 | ||
90 | ||
class MovedFile(Workload):
    """
    A file moved after its backtrace was flushed: recovery should rebuild it
    at the (stale) location recorded in the backtrace.
    """
    def write(self):
        # Lay down a file whose backtrace disagrees with its eventual
        # position in the metadata: flush the journal (persisting the
        # backtrace), then move the file.  Recovery will reconstruct it in
        # its original position according to that backtrace.
        mount = self._mount
        mount.run_shell(["mkdir", "subdir_alpha"])
        mount.run_shell(["mkdir", "subdir_bravo"])
        mount.write_n_mb("subdir_alpha/sixmegs", 6)
        self._filesystem.mds_asok(["flush", "journal"])
        mount.run_shell(["mv", "subdir_alpha/sixmegs", "subdir_bravo/sixmegs"])
        self._initial_state = mount.stat("subdir_bravo/sixmegs")

    def flush(self):
        # Deliberately a no-op: flushing here would write an up-to-date
        # backtrace and defeat the point of the test.
        pass

    def validate(self):
        self.assert_equal(self._mount.ls(), ["subdir_alpha"])
        recovered = self._mount.stat("subdir_alpha/sixmegs")
        self.assert_equal(recovered['st_size'], self._initial_state['st_size'])
        return self._errors
111 | ||
112 | ||
class BacktracelessFile(Workload):
    """
    A file whose backtrace was never persisted: recovery should link it into
    lost+found under its hex inode number, preserving its size.
    """
    def write(self):
        mount = self._mount
        mount.run_shell(["mkdir", "subdir"])
        mount.write_n_mb("subdir/sixmegs", 6)
        self._initial_state = mount.stat("subdir/sixmegs")

    def flush(self):
        # Never flush metadata, so backtrace won't be written
        pass

    def validate(self):
        ino_name = "%x" % self._initial_state["st_ino"]

        # The inode should be linked into lost+found because we had no path for it
        self.assert_equal(self._mount.ls(), ["lost+found"])
        self.assert_equal(self._mount.ls("lost+found"), [ino_name])
        found_st = self._mount.stat("lost+found/{ino_name}".format(ino_name=ino_name))

        # We might not have got the name or path, but we should still get the size
        self.assert_equal(found_st['st_size'], self._initial_state['st_size'])

        return self._errors
135 | ||
136 | ||
class StripedStashedLayout(Workload):
    """
    Files written under a striped directory layout.  Files whose metadata was
    flushed carry the layout in an xattr and should be recovered intact;
    a file written without a flush has no stashed layout and should end up in
    lost+found with unreadable (wrong-layout) contents.
    """
    def __init__(self, fs, m):
        super(StripedStashedLayout, self).__init__(fs, m)

        # Nice small stripes so we can quickly do our writes+validates
        # sc: stripe_count, ss: stripe_unit, os: object_size
        self.sc = 4
        self.ss = 65536
        self.os = 262144

        # File sizes chosen to straddle object/stripe boundaries
        self.interesting_sizes = [
            # Exactly stripe_count objects will exist
            self.os * self.sc,
            # Fewer than stripe_count objects will exist
            self.os * self.sc // 2,
            self.os * (self.sc - 1) + self.os // 2,
            self.os * (self.sc - 1) + self.os // 2 - 1,
            self.os * (self.sc + 1) + self.os // 2,
            self.os * (self.sc + 1) + self.os // 2 + 1,
            # More than stripe_count objects will exist
            self.os * self.sc + self.os * self.sc // 2
        ]

    def write(self):
        mount = self._mount

        # Create a dir with a striped layout set on it
        mount.run_shell(["mkdir", "stripey"])
        layout_value = "stripe_unit={ss} stripe_count={sc} object_size={os} pool={pool}".format(
            ss=self.ss, os=self.os, sc=self.sc,
            pool=self._filesystem.get_data_pool_name()
        )
        mount.setfattr("./stripey", "ceph.dir.layout", layout_value)

        # Write files, then flush metadata so that its layout gets written into an xattr
        for idx, size in enumerate(self.interesting_sizes):
            mount.write_test_pattern("stripey/flushed_file_{0}".format(idx), size)
            # This is really just validating the validator
            mount.validate_test_pattern("stripey/flushed_file_{0}".format(idx), size)
        self._filesystem.mds_asok(["flush", "journal"])

        # Write another file in the same way, but this time don't flush the metadata,
        # so that it won't have the layout xattr
        mount.write_test_pattern("stripey/unflushed_file", 1024 * 512)
        mount.validate_test_pattern("stripey/unflushed_file", 1024 * 512)

        self._initial_state = {
            "unflushed_ino": mount.path_to_ino("stripey/unflushed_file")
        }

    def flush(self):
        # Pass because we already selectively flushed during write
        pass

    def validate(self):
        # The flushed files should have been recovered into their original
        # locations with the correct layout: read back correct data
        for idx, size in enumerate(self.interesting_sizes):
            try:
                self._mount.validate_test_pattern("stripey/flushed_file_{0}".format(idx), size)
            except CommandFailedError as e:
                self._errors.append(
                    ValidationError("File {0} (size {1}): {2}".format(idx, size, e), traceback.format_exc(3))
                )

        # The unflushed file should have been recovered into lost+found without
        # the correct layout: read back junk
        ino_name = "%x" % self._initial_state["unflushed_ino"]
        self.assert_equal(self._mount.ls("lost+found"), [ino_name])
        try:
            self._mount.validate_test_pattern(os.path.join("lost+found", ino_name), 1024 * 512)
        except CommandFailedError:
            # Expected: data is not readable with the default layout
            pass
        else:
            self._errors.append(
                ValidationError("Unexpectedly valid data in unflushed striped file", "")
            )

        return self._errors
214 | ||
215 | ||
class ManyFilesWorkload(Workload):
    """
    Many 6MB test-pattern files in one directory; used to exercise the
    parallel (multi-worker) recovery path.
    """
    def __init__(self, filesystem, mount, file_count):
        super(ManyFilesWorkload, self).__init__(filesystem, mount)
        self.file_count = file_count

    def write(self):
        size = 6 * 1024 * 1024
        self._mount.run_shell(["mkdir", "subdir"])
        for idx in range(0, self.file_count):
            self._mount.write_test_pattern("subdir/{0}".format(idx), size)

    def validate(self):
        size = 6 * 1024 * 1024
        for idx in range(0, self.file_count):
            try:
                self._mount.validate_test_pattern("subdir/{0}".format(idx), size)
            except CommandFailedError as e:
                self._errors.append(
                    ValidationError("File {0}: {1}".format(idx, e), traceback.format_exc(3))
                )

        return self._errors
236 | ||
237 | ||
class MovedDir(Workload):
    """
    A directory moved between journal flushes, so two files carry backtraces
    claiming two different parents for it.  Exactly one backtrace should win
    and the directory must end up with single linkage.
    """
    def write(self):
        # Create a nested dir that we will then move. Two files with two different
        # backtraces referring to the moved dir, claiming two different locations for
        # it. We will see that only one backtrace wins and the dir ends up with
        # single linkage.
        self._mount.run_shell(["mkdir", "-p", "grandmother/parent"])
        self._mount.write_n_mb("grandmother/parent/orig_pos_file", 1)
        self._filesystem.mds_asok(["flush", "journal"])
        self._mount.run_shell(["mkdir", "grandfather"])
        self._mount.run_shell(["mv", "grandmother/parent", "grandfather"])
        self._mount.write_n_mb("grandfather/parent/new_pos_file", 2)
        self._filesystem.mds_asok(["flush", "journal"])

        self._initial_state = (
            self._mount.stat("grandfather/parent/orig_pos_file"),
            self._mount.stat("grandfather/parent/new_pos_file")
        )

    def validate(self):
        # Exactly one of the two candidate parents should exist at the root
        root_files = self._mount.ls()
        self.assert_equal(len(root_files), 1)
        self.assert_equal(root_files[0] in ["grandfather", "grandmother"], True)
        winner = root_files[0]
        st_opf = self._mount.stat("{0}/parent/orig_pos_file".format(winner))
        st_npf = self._mount.stat("{0}/parent/new_pos_file".format(winner))

        self.assert_equal(st_opf['st_size'], self._initial_state[0]['st_size'])
        self.assert_equal(st_npf['st_size'], self._initial_state[1]['st_size'])

        # Bug fix: previously this returned None, so _rebuild_metadata's
        # `if errors:` check silently ignored every recorded failure above.
        # Return the accumulated errors like the other workloads do.
        return self._errors
267 | ||
268 | ||
class MissingZerothObject(Workload):
    """
    A file whose zeroth data object (the one carrying the backtrace xattr)
    is removed on top of the metadata-pool wipe; recovery should still place
    the inode in lost+found with the correct size.
    """
    def write(self):
        self._mount.run_shell(["mkdir", "subdir"])
        self._mount.write_n_mb("subdir/sixmegs", 6)
        self._initial_state = self._mount.stat("subdir/sixmegs")

    def damage(self):
        # Wipe the metadata pool as usual...
        super(MissingZerothObject, self).damage()
        # ...then also remove the file's zeroth object from the data pool
        zeroth_id = "{0:x}.00000000".format(self._initial_state['st_ino'])
        self._filesystem.rados(["rm", zeroth_id], pool=self._filesystem.get_data_pool_name())

    def validate(self):
        # With no backtrace, the inode is recovered into lost+found under
        # its hex inode number
        st = self._mount.stat("lost+found/{0:x}".format(self._initial_state['st_ino']))
        self.assert_equal(st['st_size'], self._initial_state['st_size'])

        # Bug fix: previously this returned None, so _rebuild_metadata's
        # `if errors:` check silently ignored any recorded failure above.
        return self._errors
283 | ||
284 | ||
class NonDefaultLayout(Workload):
    """
    Check that the reconstruction copes with files that have a different
    object size in their layout
    """
    def write(self):
        self._mount.run_shell(["touch", "datafile"])
        self._mount.setfattr("./datafile", "ceph.file.layout.object_size", "8388608")
        self._mount.run_shell(["dd", "if=/dev/urandom", "of=./datafile", "bs=1M", "count=32"])
        self._initial_state = self._mount.stat("datafile")

    def validate(self):
        # Check we got the layout reconstructed properly
        object_size = int(self._mount.getfattr(
            "./datafile", "ceph.file.layout.object_size"))
        self.assert_equal(object_size, 8388608)

        # Check we got the file size reconstructed properly
        st = self._mount.stat("datafile")
        self.assert_equal(st['st_size'], self._initial_state['st_size'])

        # Bug fix: previously this returned None, so _rebuild_metadata's
        # `if errors:` check silently ignored any recorded failure above.
        return self._errors
305 | ||
306 | ||
class TestDataScan(CephFSTestCase):
    """
    Tests for the cephfs-data-scan recovery tooling: rebuilding a metadata
    pool from the data pool, injecting dentries into fragmented directories,
    pg_files queries, and scan_links repairs of linkage, inotable and
    snaptable.
    """
    # Two MDS daemons: test_rebuild_inotable needs two active ranks, and the
    # rebuild tests need a standby available once rank 0 is marked damaged.
    MDSS_REQUIRED = 2

    def is_marked_damaged(self, rank):
        """Return True if `rank` appears in the MDS map's damaged set."""
        mds_map = self.fs.get_mds_map()
        return rank in mds_map['damaged']

    def _rebuild_metadata(self, workload, workers=1):
        """
        That when all objects in metadata pool are removed, we can rebuild a metadata pool
        based on the contents of a data pool, and a client can see and read our files.

        :param workload: a Workload instance driving write/damage/validate
        :param workers: worker_count passed to cephfs-data-scan scans
        """

        # First, inject some files

        workload.write()

        # Unmount the client and flush the journal: the tool should also cope with
        # situations where there is dirty metadata, but we'll test that separately
        self.mount_a.umount_wait()
        workload.flush()

        # Stop the MDS
        self.fs.fail()

        # After recovery, we need the MDS to not be strict about stats (in production these options
        # are off by default, but in QA we need to explicitly disable them)
        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)

        # Apply any data damage the workload wants
        workload.damage()

        # Reset the MDS map in case multiple ranks were in play: recovery procedure
        # only understands how to rebuild metadata under rank 0
        self.fs.reset()

        self.fs.set_joinable() # redundant with reset

        def get_state(mds_id):
            # State string for this daemon, or None if it is absent from the map
            info = self.mds_cluster.get_mds_info(mds_id)
            return info['state'] if info is not None else None

        # The wiped rank 0 should get marked damaged, leaving every daemon
        # as a standby.
        # NOTE(review): the lambda below captures mds_id late, but it is
        # fully evaluated by wait_until_equal within the same loop
        # iteration, so the late binding is harmless here.
        self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
        for mds_id in self.fs.mds_ids:
            self.wait_until_equal(
                    lambda: get_state(mds_id),
                    "up:standby",
                    timeout=60)

        # Wipe the session/snap/inode tables before rebuilding
        self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
        self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
        self.fs.table_tool([self.fs.name + ":0", "reset", "inode"])

        # Run the recovery procedure
        # NOTE(review): this branch is deliberately disabled (`if False`);
        # it documents that a non-forced journal reset would fail with no
        # objects present, which is why --force is used below.
        if False:
            with self.assertRaises(CommandFailedError):
                # Normal reset should fail when no objects are present, we'll use --force instead
                self.fs.journal_tool(["journal", "reset"], 0)

        self.fs.journal_tool(["journal", "reset", "--force"], 0)
        self.fs.data_scan(["init"])
        self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()], worker_count=workers)
        self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()], worker_count=workers)

        # Mark the MDS repaired
        self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')

        # Start the MDS
        self.fs.mds_restart()
        self.fs.wait_for_daemons()
        log.info(str(self.mds_cluster.status()))

        # Mount a client
        self.mount_a.mount_wait()

        # See that the files are present and correct
        errors = workload.validate()
        if errors:
            log.error("Validation errors found: {0}".format(len(errors)))
            for e in errors:
                log.error(e.exception)
                log.error(e.backtrace)
            raise AssertionError("Validation failed, first error: {0}\n{1}".format(
                errors[0].exception, errors[0].backtrace
            ))

    def test_rebuild_simple(self):
        """Rebuild a single file/directory from the data pool."""
        self._rebuild_metadata(SimpleWorkload(self.fs, self.mount_a))

    def test_rebuild_moved_file(self):
        """Rebuild a file whose backtrace points at a stale location."""
        self._rebuild_metadata(MovedFile(self.fs, self.mount_a))

    def test_rebuild_backtraceless(self):
        """Rebuild a file with no backtrace (lands in lost+found)."""
        self._rebuild_metadata(BacktracelessFile(self.fs, self.mount_a))

    def test_rebuild_moved_dir(self):
        """Rebuild a dir with two conflicting backtraces claiming it."""
        self._rebuild_metadata(MovedDir(self.fs, self.mount_a))

    def test_rebuild_missing_zeroth(self):
        """Rebuild a file whose zeroth data object was removed."""
        self._rebuild_metadata(MissingZerothObject(self.fs, self.mount_a))

    def test_rebuild_nondefault_layout(self):
        """Rebuild a file with a non-default object_size layout."""
        self._rebuild_metadata(NonDefaultLayout(self.fs, self.mount_a))

    def test_stashed_layout(self):
        """Rebuild striped files, flushed and unflushed layouts."""
        self._rebuild_metadata(StripedStashedLayout(self.fs, self.mount_a))

    def _dirfrag_keys(self, object_id):
        """Return the omap keys (dentries) of a dirfrag object, or []."""
        keys_str = self.fs.radosmo(["listomapkeys", object_id], stdout=StringIO())
        if keys_str:
            return keys_str.strip().split("\n")
        else:
            return []

    def test_fragmented_injection(self):
        """
        That when injecting a dentry into a fragmented directory, we put it in the right fragment.
        """

        file_count = 100
        file_names = ["%s" % n for n in range(0, file_count)]

        # Make sure and disable dirfrag auto merging and splitting
        self.fs.set_ceph_conf('mds', 'mds bal merge size', 0)
        self.fs.set_ceph_conf('mds', 'mds bal split size', 100 * file_count)

        # Create a directory of `file_count` files, each named after its
        # decimal number and containing the string of its decimal number
        self.mount_a.run_python(dedent("""
        import os
        path = os.path.join("{path}", "subdir")
        os.mkdir(path)
        for n in range(0, {file_count}):
            open(os.path.join(path, "%s" % n), 'w').write("%s" % n)
        """.format(
            path=self.mount_a.mountpoint,
            file_count=file_count
        )))

        dir_ino = self.mount_a.path_to_ino("subdir")

        # Only one MDS should be active!
        self.assertEqual(len(self.fs.get_active_names()), 1)

        # Ensure that one directory is fragmented
        mds_id = self.fs.get_active_names()[0]
        self.fs.mds_asok(["dirfrag", "split", "/subdir", "0/0", "1"], mds_id)

        # Flush journal and stop MDS
        self.mount_a.umount_wait()
        self.fs.mds_asok(["flush", "journal"], mds_id)
        self.fs.fail()

        # Pick a dentry and wipe out its key
        # Because I did a 1 bit split, I know one frag will be named <inode>.01000000
        frag_obj_id = "{0:x}.01000000".format(dir_ino)
        keys = self._dirfrag_keys(frag_obj_id)
        victim_key = keys[7]  # arbitrary choice
        log.info("victim_key={0}".format(victim_key))
        # Dentry name is the omap key with the "_head" suffix stripped
        victim_dentry = victim_key.split("_head")[0]
        self.fs.radosm(["rmomapkey", frag_obj_id, victim_key])

        # Start filesystem back up, observe that the file appears to be gone in an `ls`
        self.fs.set_joinable()
        self.fs.wait_for_daemons()
        self.mount_a.mount_wait()
        files = self.mount_a.run_shell(["ls", "subdir/"]).stdout.getvalue().strip().split("\n")
        self.assertListEqual(sorted(files), sorted(list(set(file_names) - set([victim_dentry]))))

        # Stop the filesystem
        self.mount_a.umount_wait()
        self.fs.fail()

        # Run data-scan, observe that it inserts our dentry back into the correct fragment
        # by checking the omap now has the dentry's key again
        self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])
        self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()])
        self.fs.data_scan(["scan_links"])
        self.assertIn(victim_key, self._dirfrag_keys(frag_obj_id))

        # Start the filesystem and check that the dentry we deleted is now once again visible
        # and points to the correct file data.
        self.fs.set_joinable()
        self.fs.wait_for_daemons()
        self.mount_a.mount_wait()
        out = self.mount_a.run_shell(["cat", "subdir/{0}".format(victim_dentry)]).stdout.getvalue().strip()
        self.assertEqual(out, victim_dentry)

        # Finally, close the loop by checking our injected dentry survives a merge
        mds_id = self.fs.get_active_names()[0]
        self.mount_a.ls("subdir")  # Do an ls to ensure both frags are in cache so the merge will work
        self.fs.mds_asok(["dirfrag", "merge", "/subdir", "0/0"], mds_id)
        self.fs.mds_asok(["flush", "journal"], mds_id)
        frag_obj_id = "{0:x}.00000000".format(dir_ino)
        keys = self._dirfrag_keys(frag_obj_id)
        self.assertListEqual(sorted(keys), sorted(["%s_head" % f for f in file_names]))

        # run scrub to update and make sure rstat.rbytes info in subdir inode and dirfrag
        # are matched
        out_json = self.fs.run_scrub(["start", "/subdir", "repair,recursive"])
        self.assertNotEqual(out_json, None)
        self.assertEqual(out_json["return_code"], 0)
        self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)

        # Remove the whole 'subdir' directory
        self.mount_a.run_shell(["rm", "-rf", "subdir/"])

    @for_teuthology
    def test_parallel_execution(self):
        """Exercise the multi-worker data-scan path (7 workers, 25 files)."""
        self._rebuild_metadata(ManyFilesWorkload(self.fs, self.mount_a, 25), workers=7)

    def test_pg_files(self):
        """
        That the pg files command tells us which files are associated with
        a particular PG
        """
        file_count = 20
        self.mount_a.run_shell(["mkdir", "mydir"])
        self.mount_a.create_n_files("mydir/myfile", file_count)

        # Some files elsewhere in the system that we will ignore
        # to check that the tool is filtering properly
        self.mount_a.run_shell(["mkdir", "otherdir"])
        self.mount_a.create_n_files("otherdir/otherfile", file_count)

        pgs_to_files = defaultdict(list)
        # Rough (slow) reimplementation of the logic: map each file's zeroth
        # object to its PG via `ceph osd map`
        for i in range(0, file_count):
            file_path = "mydir/myfile_{0}".format(i)
            ino = self.mount_a.path_to_ino(file_path)
            obj = "{0:x}.{1:08x}".format(ino, 0)
            pgid = json.loads(self.fs.mon_manager.raw_cluster_cmd(
                "osd", "map", self.fs.get_data_pool_name(), obj,
                "--format=json-pretty"
            ))['pgid']
            pgs_to_files[pgid].append(file_path)
            log.info("{0}: {1}".format(file_path, pgid))

        pg_count = self.fs.pgs_per_fs_pool
        for pg_n in range(0, pg_count):
            pg_str = "{0}.{1:x}".format(self.fs.get_data_pool_id(), pg_n)
            out = self.fs.data_scan(["pg_files", "mydir", pg_str])
            lines = [l for l in out.split("\n") if l]
            log.info("{0}: {1}".format(pg_str, lines))
            self.assertSetEqual(set(lines), set(pgs_to_files[pg_str]))

    def test_rebuild_linkage(self):
        """
        The scan_links command fixes linkage errors
        """
        self.mount_a.run_shell(["mkdir", "testdir1"])
        self.mount_a.run_shell(["mkdir", "testdir2"])
        dir1_ino = self.mount_a.path_to_ino("testdir1")
        dir2_ino = self.mount_a.path_to_ino("testdir2")
        dirfrag1_oid = "{0:x}.00000000".format(dir1_ino)
        dirfrag2_oid = "{0:x}.00000000".format(dir2_ino)

        self.mount_a.run_shell(["touch", "testdir1/file1"])
        self.mount_a.run_shell(["ln", "testdir1/file1", "testdir1/link1"])
        self.mount_a.run_shell(["ln", "testdir1/file1", "testdir2/link2"])

        mds_id = self.fs.get_active_names()[0]
        self.fs.mds_asok(["flush", "journal"], mds_id)

        dirfrag1_keys = self._dirfrag_keys(dirfrag1_oid)

        # introduce duplicated primary link: copy file1's dentry omap value
        # from testdir1's dirfrag into testdir2's
        file1_key = "file1_head"
        self.assertIn(file1_key, dirfrag1_keys)
        file1_omap_data = self.fs.radosmo(["getomapval", dirfrag1_oid, file1_key, '-'])
        self.fs.radosm(["setomapval", dirfrag2_oid, file1_key], stdin=BytesIO(file1_omap_data))
        self.assertIn(file1_key, self._dirfrag_keys(dirfrag2_oid))

        # remove a remote link, make inode link count incorrect
        link1_key = 'link1_head'
        self.assertIn(link1_key, dirfrag1_keys)
        self.fs.radosm(["rmomapkey", dirfrag1_oid, link1_key])

        # increase good primary link's version
        self.mount_a.run_shell(["touch", "testdir1/file1"])
        self.mount_a.umount_wait()

        self.fs.mds_asok(["flush", "journal"], mds_id)
        self.fs.fail()

        # repair linkage errors
        self.fs.data_scan(["scan_links"])

        # primary link in testdir2 was deleted?
        self.assertNotIn(file1_key, self._dirfrag_keys(dirfrag2_oid))

        self.fs.set_joinable()
        self.fs.wait_for_daemons()

        self.mount_a.mount_wait()

        # link count was adjusted? (file1 + remaining remote link link2)
        file1_nlink = self.mount_a.path_to_nlink("testdir1/file1")
        self.assertEqual(file1_nlink, 2)

    def test_rebuild_inotable(self):
        """
        The scan_links command repair inotables
        """
        self.fs.set_max_mds(2)
        self.fs.wait_for_daemons()

        active_mds_names = self.fs.get_active_names()
        mds0_id = active_mds_names[0]
        mds1_id = active_mds_names[1]

        self.mount_a.run_shell(["mkdir", "dir1"])
        dir_ino = self.mount_a.path_to_ino("dir1")
        # Pin dir1 to rank 1 so subsequent inode allocations in it come
        # from mds.1's inotable
        self.mount_a.setfattr("dir1", "ceph.dir.pin", "1")
        # wait for subtree migration

        file_ino = 0;
        while True:
            time.sleep(1)
            # allocate an inode from mds.1
            self.mount_a.run_shell(["touch", "dir1/file1"])
            file_ino = self.mount_a.path_to_ino("dir1/file1")
            # NOTE(review): 2 << 40 appears to be the start of rank 1's
            # inode-number range; loop until the migration has happened and
            # the inode really came from mds.1 — confirm against MDS inode
            # allocation ranges.
            if file_ino >= (2 << 40):
                break
            self.mount_a.run_shell(["rm", "-f", "dir1/file1"])

        self.mount_a.umount_wait()

        self.fs.mds_asok(["flush", "journal"], mds0_id)
        self.fs.mds_asok(["flush", "journal"], mds1_id)
        self.fs.fail()

        # Destroy both ranks' inotables, then let scan_links rebuild them
        self.fs.radosm(["rm", "mds0_inotable"])
        self.fs.radosm(["rm", "mds1_inotable"])

        self.fs.data_scan(["scan_links", "--filesystem", self.fs.name])

        # The rebuilt free ranges must start above the inos we allocated
        mds0_inotable = json.loads(self.fs.table_tool([self.fs.name + ":0", "show", "inode"]))
        self.assertGreaterEqual(
            mds0_inotable['0']['data']['inotable']['free'][0]['start'], dir_ino)

        mds1_inotable = json.loads(self.fs.table_tool([self.fs.name + ":1", "show", "inode"]))
        self.assertGreaterEqual(
            mds1_inotable['1']['data']['inotable']['free'][0]['start'], file_ino)

    def test_rebuild_snaptable(self):
        """
        The scan_links command repair snaptable
        """
        self.fs.set_allow_new_snaps(True)

        # Create one surviving snapshot and one removed snapshot
        self.mount_a.run_shell(["mkdir", "dir1"])
        self.mount_a.run_shell(["mkdir", "dir1/.snap/s1"])
        self.mount_a.run_shell(["mkdir", "dir1/.snap/s2"])
        self.mount_a.run_shell(["rmdir", "dir1/.snap/s2"])

        self.mount_a.umount_wait()

        mds0_id = self.fs.get_active_names()[0]
        self.fs.mds_asok(["flush", "journal"], mds0_id)

        # wait for mds to update removed snaps
        time.sleep(10)

        old_snaptable = json.loads(self.fs.table_tool([self.fs.name + ":0", "show", "snap"]))
        # stamps may have minor difference
        for item in old_snaptable['snapserver']['snaps']:
            del item['stamp']

        # Destroy the snaptable, then let scan_links rebuild it
        self.fs.radosm(["rm", "mds_snaptable"])
        self.fs.data_scan(["scan_links", "--filesystem", self.fs.name])

        new_snaptable = json.loads(self.fs.table_tool([self.fs.name + ":0", "show", "snap"]))
        for item in new_snaptable['snapserver']['snaps']:
            del item['stamp']
        # Rebuilt table must not reuse old snap ids and must preserve the
        # surviving snapshots (modulo stamps, stripped above)
        self.assertGreaterEqual(
            new_snaptable['snapserver']['last_snap'], old_snaptable['snapserver']['last_snap'])
        self.assertEqual(
            new_snaptable['snapserver']['snaps'], old_snaptable['snapserver']['snaps'])