"""
Test our tools for recovering metadata from the data pool
"""
import json
import logging
import os
import time
import traceback
from collections import namedtuple, defaultdict
from textwrap import dedent

from teuthology.orchestra.run import CommandFailedError
from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology

log = logging.getLogger(__name__)


ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])


class Workload(object):
    def __init__(self, filesystem, mount):
        self._mount = mount
        self._filesystem = filesystem
        self._initial_state = None

        # Accumulate backtraces for every failed validation, and return them.
        # Backtraces are rather verbose, but we only see them when something
        # breaks, and they let us see which check failed without having to
        # decorate each check with a string.
        self._errors = []

    def assert_equal(self, a, b):
        try:
            if a != b:
                raise AssertionError("{0} != {1}".format(a, b))
        except AssertionError as e:
            self._errors.append(
                ValidationError(e, traceback.format_exc(3))
            )

    def write(self):
        """
        Write the workload files to the mount
        """
        raise NotImplementedError()

    def validate(self):
        """
        Read from the mount and validate that the workload files are present
        (i.e. have survived or been reconstructed from the test scenario)
        """
        raise NotImplementedError()

    def damage(self):
        """
        Damage the filesystem pools in ways that will be interesting to
        recover from.  By default, just wipe everything in the metadata pool.
        """
        # Delete every object in the metadata pool
        objects = self._filesystem.rados(["ls"]).split("\n")
        for o in objects:
            self._filesystem.rados(["rm", o])

    def flush(self):
        """
        Called after client unmount, after write: flush whatever you want
        """
        self._filesystem.mds_asok(["flush", "journal"])


class SimpleWorkload(Workload):
    """
    Single file, single directory, check that it gets recovered and so does
    its size
    """
    def write(self):
        self._mount.run_shell(["mkdir", "subdir"])
        self._mount.write_n_mb("subdir/sixmegs", 6)
        self._initial_state = self._mount.stat("subdir/sixmegs")

    def validate(self):
        self._mount.run_shell(["ls", "subdir"])
        st = self._mount.stat("subdir/sixmegs")
        self.assert_equal(st['st_size'], self._initial_state['st_size'])
        return self._errors


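# A note on backtraces: CephFS stores each file's "backtrace" (its ancestor
# path) as an xattr on the file's first data object.  Backtraces are written
# lazily, when the journal is flushed, which is what the next few workloads
# exploit: by controlling when we flush, we control whether the data pool's
# backtraces agree with the metadata we are about to destroy.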
class MovedFile(Workload):
    def write(self):
        # Create a file whose backtrace disagrees with its eventual position
        # in the metadata.  We will see that it gets reconstructed in its
        # original position according to its backtrace.
        self._mount.run_shell(["mkdir", "subdir_alpha"])
        self._mount.run_shell(["mkdir", "subdir_bravo"])
        self._mount.write_n_mb("subdir_alpha/sixmegs", 6)
        self._filesystem.mds_asok(["flush", "journal"])
        self._mount.run_shell(["mv", "subdir_alpha/sixmegs", "subdir_bravo/sixmegs"])
        self._initial_state = self._mount.stat("subdir_bravo/sixmegs")

    def flush(self):
        pass

    def validate(self):
        self.assert_equal(self._mount.ls(), ["subdir_alpha"])
        st = self._mount.stat("subdir_alpha/sixmegs")
        self.assert_equal(st['st_size'], self._initial_state['st_size'])
        return self._errors


class BacktracelessFile(Workload):
    def write(self):
        self._mount.run_shell(["mkdir", "subdir"])
        self._mount.write_n_mb("subdir/sixmegs", 6)
        self._initial_state = self._mount.stat("subdir/sixmegs")

    def flush(self):
        # Never flush metadata, so backtrace won't be written
        pass

    def validate(self):
        ino_name = "%x" % self._initial_state["st_ino"]

        # The inode should be linked into lost+found because we had no path for it
        self.assert_equal(self._mount.ls(), ["lost+found"])
        self.assert_equal(self._mount.ls("lost+found"), [ino_name])
        st = self._mount.stat("lost+found/{ino_name}".format(ino_name=ino_name))

        # We might not have got the name or path, but we should still get the size
        self.assert_equal(st['st_size'], self._initial_state['st_size'])

        return self._errors


class StripedStashedLayout(Workload):
    def __init__(self, fs, m):
        super(StripedStashedLayout, self).__init__(fs, m)

        # Nice small stripes so we can quickly do our writes+validates
        self.sc = 4
        self.ss = 65536
        self.os = 262144

        self.interesting_sizes = [
            # Exactly stripe_count objects will exist
            self.os * self.sc,
            # Fewer than stripe_count objects will exist
            self.os * self.sc // 2,
            self.os * (self.sc - 1) + self.os // 2,
            self.os * (self.sc - 1) + self.os // 2 - 1,
            self.os * (self.sc + 1) + self.os // 2,
            self.os * (self.sc + 1) + self.os // 2 + 1,
            # More than stripe_count objects will exist
            self.os * self.sc + self.os * self.sc // 2
        ]
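
        # A worked example of the layout arithmetic: with stripe_unit=65536,
        # stripe_count=4 and object_size=262144, a file of os * sc = 1048576
        # bytes is 16 stripe units, i.e. four complete stripes, filling all
        # four objects of the first object set to exactly object_size.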

    def write(self):
        # Create a dir with a striped layout set on it
        self._mount.run_shell(["mkdir", "stripey"])

        self._mount.setfattr("./stripey", "ceph.dir.layout",
            "stripe_unit={ss} stripe_count={sc} object_size={os} pool={pool}".format(
                ss=self.ss, os=self.os, sc=self.sc,
                pool=self._filesystem.get_data_pool_name()
            ))

        # Write files, then flush metadata so that its layout gets written
        # into an xattr
        for i, n_bytes in enumerate(self.interesting_sizes):
            self._mount.write_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes)
            # This is really just validating the validator
            self._mount.validate_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes)
        self._filesystem.mds_asok(["flush", "journal"])

        # Write another file in the same way, but this time don't flush the
        # metadata, so that it won't have the layout xattr
        self._mount.write_test_pattern("stripey/unflushed_file", 1024 * 512)
        self._mount.validate_test_pattern("stripey/unflushed_file", 1024 * 512)

        self._initial_state = {
            "unflushed_ino": self._mount.path_to_ino("stripey/unflushed_file")
        }

    def flush(self):
        # Pass because we already selectively flushed during write
        pass

    def validate(self):
        # The flushed files should have been recovered into their original
        # locations with the correct layout: read back correct data
        for i, n_bytes in enumerate(self.interesting_sizes):
            try:
                self._mount.validate_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes)
            except CommandFailedError as e:
                self._errors.append(
                    ValidationError("File {0} (size {1}): {2}".format(i, n_bytes, e), traceback.format_exc(3))
                )

        # The unflushed file should have been recovered into lost+found without
        # the correct layout: reading it back should yield junk
        ino_name = "%x" % self._initial_state["unflushed_ino"]
        self.assert_equal(self._mount.ls("lost+found"), [ino_name])
        try:
            self._mount.validate_test_pattern(os.path.join("lost+found", ino_name), 1024 * 512)
        except CommandFailedError:
            pass
        else:
            self._errors.append(
                ValidationError("Unexpectedly valid data in unflushed striped file", "")
            )

        return self._errors


class ManyFilesWorkload(Workload):
    def __init__(self, filesystem, mount, file_count):
        super(ManyFilesWorkload, self).__init__(filesystem, mount)
        self.file_count = file_count

    def write(self):
        self._mount.run_shell(["mkdir", "subdir"])
        for n in range(0, self.file_count):
            self._mount.write_test_pattern("subdir/{0}".format(n), 6 * 1024 * 1024)

    def validate(self):
        for n in range(0, self.file_count):
            try:
                self._mount.validate_test_pattern("subdir/{0}".format(n), 6 * 1024 * 1024)
            except CommandFailedError as e:
                self._errors.append(
                    ValidationError("File {0}: {1}".format(n, e), traceback.format_exc(3))
                )

        return self._errors


class MovedDir(Workload):
    def write(self):
        # Create a nested dir that we will then move.  Two files with two
        # different backtraces referring to the moved dir, claiming two
        # different locations for it.  We will see that only one backtrace
        # wins and the dir ends up with single linkage.
        self._mount.run_shell(["mkdir", "-p", "grandmother/parent"])
        self._mount.write_n_mb("grandmother/parent/orig_pos_file", 1)
        self._filesystem.mds_asok(["flush", "journal"])
        self._mount.run_shell(["mkdir", "grandfather"])
        self._mount.run_shell(["mv", "grandmother/parent", "grandfather"])
        self._mount.write_n_mb("grandfather/parent/new_pos_file", 2)
        self._filesystem.mds_asok(["flush", "journal"])

        self._initial_state = (
            self._mount.stat("grandfather/parent/orig_pos_file"),
            self._mount.stat("grandfather/parent/new_pos_file")
        )

    def validate(self):
        root_files = self._mount.ls()
        self.assert_equal(len(root_files), 1)
        self.assert_equal(root_files[0] in ["grandfather", "grandmother"], True)
        winner = root_files[0]
        st_opf = self._mount.stat("{0}/parent/orig_pos_file".format(winner))
        st_npf = self._mount.stat("{0}/parent/new_pos_file".format(winner))

        self.assert_equal(st_opf['st_size'], self._initial_state[0]['st_size'])
        self.assert_equal(st_npf['st_size'], self._initial_state[1]['st_size'])
        return self._errors


class MissingZerothObject(Workload):
    def write(self):
        self._mount.run_shell(["mkdir", "subdir"])
        self._mount.write_n_mb("subdir/sixmegs", 6)
        self._initial_state = self._mount.stat("subdir/sixmegs")

    def damage(self):
        super(MissingZerothObject, self).damage()
        zeroth_id = "{0:x}.00000000".format(self._initial_state['st_ino'])
        self._filesystem.rados(["rm", zeroth_id], pool=self._filesystem.get_data_pool_name())

    def validate(self):
        st = self._mount.stat("lost+found/{0:x}".format(self._initial_state['st_ino']))
        self.assert_equal(st['st_size'], self._initial_state['st_size'])
        return self._errors


class NonDefaultLayout(Workload):
    """
    Check that the reconstruction copes with files that have a different
    object size in their layout
    """
    def write(self):
        self._mount.run_shell(["touch", "datafile"])
        self._mount.setfattr("./datafile", "ceph.file.layout.object_size", "8388608")
        self._mount.run_shell(["dd", "if=/dev/urandom", "of=./datafile", "bs=1M", "count=32"])
        self._initial_state = self._mount.stat("datafile")

    def validate(self):
        # Check that the layout was reconstructed properly
        object_size = int(self._mount.getfattr(
            "./datafile", "ceph.file.layout.object_size"))
        self.assert_equal(object_size, 8388608)

        # Check that the file size was reconstructed properly
        st = self._mount.stat("datafile")
        self.assert_equal(st['st_size'], self._initial_state['st_size'])
        return self._errors


class TestDataScan(CephFSTestCase):
    MDSS_REQUIRED = 2

    def is_marked_damaged(self, rank):
        mds_map = self.fs.get_mds_map()
        return rank in mds_map['damaged']

    def _rebuild_metadata(self, workload, workers=1):
        """
        That when all objects in the metadata pool are removed, we can rebuild
        the metadata pool from the contents of the data pool, and a client can
        see and read our files.
        """

        # First, inject some files

        workload.write()

        # Unmount the client and flush the journal: the tool should also cope
        # with situations where there is dirty metadata, but we'll test that
        # separately
        self.mount_a.umount_wait()
        workload.flush()

        # Stop the MDS
        self.fs.mds_stop()
        self.fs.mds_fail()

        # After recovery, we need the MDS to not be strict about stats (in
        # production these options are off by default, but in QA we need to
        # explicitly disable them)
        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)

        # Apply any data damage the workload wants
        workload.damage()

        # Reset the MDS map in case multiple ranks were in play: the recovery
        # procedure only understands how to rebuild metadata under rank 0
        self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
                                            '--yes-i-really-mean-it')

        self.fs.mds_restart()

        def get_state(mds_id):
            info = self.mds_cluster.get_mds_info(mds_id)
            return info['state'] if info is not None else None

        self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
        for mds_id in self.fs.mds_ids:
            self.wait_until_equal(
                lambda: get_state(mds_id),
                "up:standby",
                timeout=60)

        self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
        self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
        self.fs.table_tool([self.fs.name + ":0", "reset", "inode"])

        # Run the recovery procedure
        if False:
            with self.assertRaises(CommandFailedError):
                # Normal reset should fail when no objects are present; we'll
                # use --force instead
                self.fs.journal_tool(["journal", "reset"], 0)

        self.fs.journal_tool(["journal", "reset", "--force"], 0)
        self.fs.data_scan(["init"])
        self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()], worker_count=workers)
        self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()], worker_count=workers)
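        # (scan_extents reconstructs each file's size and mtime from its data
        # objects; scan_inodes then injects inodes into the metadata pool,
        # using backtrace xattrs to re-link files at their original paths and
        # falling back to lost+found where no backtrace was written.)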

        # Mark the MDS repaired
        self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')

        # Start the MDS
        self.fs.mds_restart()
        self.fs.wait_for_daemons()
        log.info(str(self.mds_cluster.status()))

        # Mount a client
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

        # See that the files are present and correct
        errors = workload.validate()
        if errors:
            log.error("Validation errors found: {0}".format(len(errors)))
            for e in errors:
                log.error(e.exception)
                log.error(e.backtrace)
            raise AssertionError("Validation failed, first error: {0}\n{1}".format(
                errors[0].exception, errors[0].backtrace
            ))

    def test_rebuild_simple(self):
        self._rebuild_metadata(SimpleWorkload(self.fs, self.mount_a))

    def test_rebuild_moved_file(self):
        self._rebuild_metadata(MovedFile(self.fs, self.mount_a))

    def test_rebuild_backtraceless(self):
        self._rebuild_metadata(BacktracelessFile(self.fs, self.mount_a))

    def test_rebuild_moved_dir(self):
        self._rebuild_metadata(MovedDir(self.fs, self.mount_a))

    def test_rebuild_missing_zeroth(self):
        self._rebuild_metadata(MissingZerothObject(self.fs, self.mount_a))

    def test_rebuild_nondefault_layout(self):
        self._rebuild_metadata(NonDefaultLayout(self.fs, self.mount_a))

    def test_stashed_layout(self):
        self._rebuild_metadata(StripedStashedLayout(self.fs, self.mount_a))

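    # Each directory fragment is stored as a single metadata pool object named
    # "<dir inode hex>.<frag id>", with one omap entry per dentry (keys look
    # like "<name>_head"), so listing the omap keys lists the dentries in that
    # fragment.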
    def _dirfrag_keys(self, object_id):
        keys_str = self.fs.rados(["listomapkeys", object_id])
        if keys_str:
            return keys_str.split("\n")
        else:
            return []

    def test_fragmented_injection(self):
        """
        That when injecting a dentry into a fragmented directory, we put it in
        the right fragment.
        """

        file_count = 100
        file_names = ["%s" % n for n in range(0, file_count)]

        # Create a directory of `file_count` files, each named after its
        # decimal number and containing the string of its decimal number
        self.mount_a.run_python(dedent("""
        import os
        path = os.path.join("{path}", "subdir")
        os.mkdir(path)
        for n in range(0, {file_count}):
            open(os.path.join(path, "%s" % n), 'w').write("%s" % n)
        """.format(
            path=self.mount_a.mountpoint,
            file_count=file_count
        )))

        dir_ino = self.mount_a.path_to_ino("subdir")

        # Only one MDS should be active!
        self.assertEqual(len(self.fs.get_active_names()), 1)

        # Ensure that one directory is fragmented
        mds_id = self.fs.get_active_names()[0]
        self.fs.mds_asok(["dirfrag", "split", "/subdir", "0/0", "1"], mds_id)

        # Flush journal and stop MDS
        self.mount_a.umount_wait()
        self.fs.mds_asok(["flush", "journal"], mds_id)
        self.fs.mds_stop()
        self.fs.mds_fail()

        # Pick a dentry and wipe out its key.
        # Because I did a 1 bit split, I know one frag will be named <inode>.01000000
        frag_obj_id = "{0:x}.01000000".format(dir_ino)
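        # (The frag suffix is a hex-encoded frag_t which, to the best of our
        # understanding, packs the split depth into the top byte and the
        # left-justified frag value into the low 24 bits, so after one 1-bit
        # split the two halves encode as 01000000 and 01800000.)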
        keys = self._dirfrag_keys(frag_obj_id)
        victim_key = keys[7]  # arbitrary choice
        log.info("victim_key={0}".format(victim_key))
        victim_dentry = victim_key.split("_head")[0]
        self.fs.rados(["rmomapkey", frag_obj_id, victim_key])

        # Start filesystem back up, observe that the file appears to be gone in an `ls`
        self.fs.mds_restart()
        self.fs.wait_for_daemons()
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()
        files = self.mount_a.run_shell(["ls", "subdir/"]).stdout.getvalue().strip().split("\n")
        self.assertListEqual(sorted(files), sorted(list(set(file_names) - set([victim_dentry]))))

        # Stop the filesystem
        self.mount_a.umount_wait()
        self.fs.mds_stop()
        self.fs.mds_fail()

        # Run data-scan, observe that it inserts our dentry back into the correct fragment
        # by checking the omap now has the dentry's key again
        self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])
        self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()])
        self.assertIn(victim_key, self._dirfrag_keys(frag_obj_id))

        # Start the filesystem and check that the dentry we deleted is now once again visible
        # and points to the correct file data.
        self.fs.mds_restart()
        self.fs.wait_for_daemons()
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()
        out = self.mount_a.run_shell(["cat", "subdir/{0}".format(victim_dentry)]).stdout.getvalue().strip()
        self.assertEqual(out, victim_dentry)

        # Finally, close the loop by checking our injected dentry survives a merge
        mds_id = self.fs.get_active_names()[0]
        self.mount_a.ls("subdir")  # Do an ls to ensure both frags are in cache so the merge will work
        self.fs.mds_asok(["dirfrag", "merge", "/subdir", "0/0"], mds_id)
        self.fs.mds_asok(["flush", "journal"], mds_id)
        frag_obj_id = "{0:x}.00000000".format(dir_ino)
        keys = self._dirfrag_keys(frag_obj_id)
        self.assertListEqual(sorted(keys), sorted(["%s_head" % f for f in file_names]))

    @for_teuthology
    def test_parallel_execution(self):
        self._rebuild_metadata(ManyFilesWorkload(self.fs, self.mount_a, 25), workers=7)

    def test_pg_files(self):
        """
        That the pg_files command tells us which files are associated with
        a particular PG
        """
        file_count = 20
        self.mount_a.run_shell(["mkdir", "mydir"])
        self.mount_a.create_n_files("mydir/myfile", file_count)

        # Some files elsewhere in the system that we will ignore
        # to check that the tool is filtering properly
        self.mount_a.run_shell(["mkdir", "otherdir"])
        self.mount_a.create_n_files("otherdir/otherfile", file_count)

        pgs_to_files = defaultdict(list)
        # Rough (slow) reimplementation of the logic
        for i in range(0, file_count):
            file_path = "mydir/myfile_{0}".format(i)
            ino = self.mount_a.path_to_ino(file_path)
            obj = "{0:x}.{1:08x}".format(ino, 0)
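            # "ceph osd map <pool> <object>" reports the PG that CRUSH maps
            # this object name onto, letting us build the expected
            # PG -> files mapping independently of the tool under test.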
            pgid = json.loads(self.fs.mon_manager.raw_cluster_cmd(
                "osd", "map", self.fs.get_data_pool_name(), obj,
                "--format=json-pretty"
            ))['pgid']
            pgs_to_files[pgid].append(file_path)
            log.info("{0}: {1}".format(file_path, pgid))

        pg_count = self.fs.get_pgs_per_fs_pool()
        for pg_n in range(0, pg_count):
            pg_str = "{0}.{1}".format(self.fs.get_data_pool_id(), pg_n)
            out = self.fs.data_scan(["pg_files", "mydir", pg_str])
            lines = [l for l in out.split("\n") if l]
            log.info("{0}: {1}".format(pg_str, lines))
            self.assertSetEqual(set(lines), set(pgs_to_files[pg_str]))

    def test_rebuild_linkage(self):
        """
        That the scan_links command fixes linkage errors
        """
        self.mount_a.run_shell(["mkdir", "testdir1"])
        self.mount_a.run_shell(["mkdir", "testdir2"])
        dir1_ino = self.mount_a.path_to_ino("testdir1")
        dir2_ino = self.mount_a.path_to_ino("testdir2")
        dirfrag1_oid = "{0:x}.00000000".format(dir1_ino)
        dirfrag2_oid = "{0:x}.00000000".format(dir2_ino)

        self.mount_a.run_shell(["touch", "testdir1/file1"])
        self.mount_a.run_shell(["ln", "testdir1/file1", "testdir1/link1"])
        self.mount_a.run_shell(["ln", "testdir1/file1", "testdir2/link2"])

        mds_id = self.fs.get_active_names()[0]
        self.fs.mds_asok(["flush", "journal"], mds_id)

        dirfrag1_keys = self._dirfrag_keys(dirfrag1_oid)

        # Introduce a duplicated primary link
        file1_key = "file1_head"
        self.assertIn(file1_key, dirfrag1_keys)
        file1_omap_data = self.fs.rados(["getomapval", dirfrag1_oid, file1_key, '-'])
        self.fs.rados(["setomapval", dirfrag2_oid, file1_key], stdin_data=file1_omap_data)
        self.assertIn(file1_key, self._dirfrag_keys(dirfrag2_oid))

        # Remove a remote link, making the inode's link count incorrect
        link1_key = 'link1_head'
        self.assertIn(link1_key, dirfrag1_keys)
        self.fs.rados(["rmomapkey", dirfrag1_oid, link1_key])

        # Increase the good primary link's version
        self.mount_a.run_shell(["touch", "testdir1/file1"])
        self.mount_a.umount_wait()

        self.fs.mds_asok(["flush", "journal"], mds_id)
        self.fs.mds_stop()
        self.fs.mds_fail()

        # Repair the linkage errors
        self.fs.data_scan(["scan_links"])
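        # (When scan_links finds the same inode claimed by more than one
        # primary dentry, it keeps the dentry with the highest version and
        # deletes the rest; the touch above bumped the version of the link in
        # testdir1, so the duplicate we injected into testdir2 is the one that
        # should be removed.  Link counts are also recomputed.)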

        # The duplicated primary link in testdir2 should have been deleted
        self.assertNotIn(file1_key, self._dirfrag_keys(dirfrag2_oid))

        self.fs.mds_restart()
        self.fs.wait_for_daemons()

        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

        # The link count should have been adjusted back to two (the primary
        # link plus the surviving remote link in testdir2)
        file1_nlink = self.mount_a.path_to_nlink("testdir1/file1")
        self.assertEqual(file1_nlink, 2)

    def test_rebuild_inotable(self):
        """
        That the scan_links command repairs the inotables
        """
        self.fs.set_max_mds(2)
        self.fs.wait_for_daemons()

        active_mds_names = self.fs.get_active_names()
        mds0_id = active_mds_names[0]
        mds1_id = active_mds_names[1]

        self.mount_a.run_shell(["mkdir", "dir1"])
        dir_ino = self.mount_a.path_to_ino("dir1")
        self.mount_a.setfattr("dir1", "ceph.dir.pin", "1")
        # wait for subtree migration

        file_ino = 0
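        # (Each MDS rank hands out inodes from its own inotable range, which
        # starts at (rank + 1) << 40, so an inode number at or above 2 << 40
        # tells us the file was allocated by rank 1, i.e. the pinned subtree
        # has migrated.)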
        while True:
            time.sleep(1)
            # Allocate an inode from mds.1
            self.mount_a.run_shell(["touch", "dir1/file1"])
            file_ino = self.mount_a.path_to_ino("dir1/file1")
            if file_ino >= (2 << 40):
                break
            self.mount_a.run_shell(["rm", "-f", "dir1/file1"])

        self.mount_a.umount_wait()

        self.fs.mds_asok(["flush", "journal"], mds0_id)
        self.fs.mds_asok(["flush", "journal"], mds1_id)
        self.mds_cluster.mds_stop()

        self.fs.rados(["rm", "mds0_inotable"])
        self.fs.rados(["rm", "mds1_inotable"])

        self.fs.data_scan(["scan_links", "--filesystem", self.fs.name])

        mds0_inotable = json.loads(self.fs.table_tool([self.fs.name + ":0", "show", "inode"]))
        self.assertGreaterEqual(
            mds0_inotable['0']['data']['inotable']['free'][0]['start'], dir_ino)

        mds1_inotable = json.loads(self.fs.table_tool([self.fs.name + ":1", "show", "inode"]))
        self.assertGreaterEqual(
            mds1_inotable['1']['data']['inotable']['free'][0]['start'], file_ino)

    def test_rebuild_snaptable(self):
        """
        That the scan_links command repairs the snaptable
        """
        self.fs.set_allow_new_snaps(True)

        self.mount_a.run_shell(["mkdir", "dir1"])
        self.mount_a.run_shell(["mkdir", "dir1/.snap/s1"])
        self.mount_a.run_shell(["mkdir", "dir1/.snap/s2"])
        self.mount_a.run_shell(["rmdir", "dir1/.snap/s2"])

        self.mount_a.umount_wait()

        mds0_id = self.fs.get_active_names()[0]
        self.fs.mds_asok(["flush", "journal"], mds0_id)

        # Wait for the MDS to update its record of removed snaps
        time.sleep(10)

        old_snaptable = json.loads(self.fs.table_tool([self.fs.name + ":0", "show", "snap"]))
        # Stamps may differ slightly, so ignore them when comparing
        for item in old_snaptable['snapserver']['snaps']:
            del item['stamp']

        self.fs.rados(["rm", "mds_snaptable"])
        self.fs.data_scan(["scan_links", "--filesystem", self.fs.name])
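        # (scan_links, in effect, rediscovers the snapshots referenced by the
        # metadata it walks and rebuilds the snap server's table from them, so
        # the surviving snapshot should reappear and last_snap should be no
        # lower than before.)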

        new_snaptable = json.loads(self.fs.table_tool([self.fs.name + ":0", "show", "snap"]))
        for item in new_snaptable['snapserver']['snaps']:
            del item['stamp']
        self.assertGreaterEqual(
            new_snaptable['snapserver']['last_snap'], old_snaptable['snapserver']['last_snap'])
        self.assertEqual(
            new_snaptable['snapserver']['snaps'], old_snaptable['snapserver']['snaps'])