]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | |
2 | """ | |
3 | Test our tools for recovering the content of damaged journals | |
4 | """ | |
5 | ||
6 | import json | |
7 | import logging | |
8 | from textwrap import dedent | |
9 | import time | |
10 | ||
11 | from teuthology.exceptions import CommandFailedError, ConnectionLostError | |
12 | from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO | |
13 | from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology | |
14 | from tasks.workunit import task as workunit | |
15 | ||
16 | log = logging.getLogger(__name__) | |
17 | ||
18 | ||
class TestJournalRepair(CephFSTestCase):
    """
    Exercise the disaster-recovery tooling (cephfs-journal-tool,
    cephfs-table-tool) for recovering the content of damaged journals.
    """
    # Two MDS daemons are needed because test_reset runs the filesystem
    # with max_mds=2 (one directory pinned to each rank).
    MDSS_REQUIRED = 2
22 | def test_inject_to_empty(self): | |
23 | """ | |
24 | That when some dentries in the journal but nothing is in | |
25 | the backing store, we correctly populate the backing store | |
26 | from the journalled dentries. | |
27 | """ | |
28 | ||
29 | # Inject metadata operations | |
30 | self.mount_a.run_shell(["touch", "rootfile"]) | |
31 | self.mount_a.run_shell(["mkdir", "subdir"]) | |
32 | self.mount_a.run_shell(["touch", "subdir/subdirfile"]) | |
33 | # There are several different paths for handling hardlinks, depending | |
34 | # on whether an existing dentry (being overwritten) is also a hardlink | |
35 | self.mount_a.run_shell(["mkdir", "linkdir"]) | |
36 | ||
37 | # Test inode -> remote transition for a dentry | |
38 | self.mount_a.run_shell(["touch", "linkdir/link0"]) | |
39 | self.mount_a.run_shell(["rm", "-f", "linkdir/link0"]) | |
40 | self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link0"]) | |
41 | ||
42 | # Test nothing -> remote transition | |
43 | self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link1"]) | |
44 | ||
45 | # Test remote -> inode transition | |
46 | self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link2"]) | |
47 | self.mount_a.run_shell(["rm", "-f", "linkdir/link2"]) | |
48 | self.mount_a.run_shell(["touch", "linkdir/link2"]) | |
49 | ||
50 | # Test remote -> diff remote transition | |
51 | self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link3"]) | |
52 | self.mount_a.run_shell(["rm", "-f", "linkdir/link3"]) | |
53 | self.mount_a.run_shell(["ln", "rootfile", "linkdir/link3"]) | |
54 | ||
55 | # Test an empty directory | |
56 | self.mount_a.run_shell(["mkdir", "subdir/subsubdir"]) | |
57 | self.mount_a.run_shell(["sync"]) | |
58 | ||
59 | # Before we unmount, make a note of the inode numbers, later we will | |
60 | # check that they match what we recover from the journal | |
61 | rootfile_ino = self.mount_a.path_to_ino("rootfile") | |
62 | subdir_ino = self.mount_a.path_to_ino("subdir") | |
63 | linkdir_ino = self.mount_a.path_to_ino("linkdir") | |
64 | subdirfile_ino = self.mount_a.path_to_ino("subdir/subdirfile") | |
65 | subsubdir_ino = self.mount_a.path_to_ino("subdir/subsubdir") | |
66 | ||
67 | self.mount_a.umount_wait() | |
68 | ||
69 | # Stop the MDS | |
70 | self.fs.mds_stop() | |
71 | self.fs.mds_fail() | |
72 | ||
73 | # Now, the journal should contain the operations, but the backing | |
74 | # store shouldn't | |
75 | with self.assertRaises(ObjectNotFound): | |
76 | self.fs.list_dirfrag(subdir_ino) | |
77 | self.assertEqual(self.fs.list_dirfrag(ROOT_INO), []) | |
78 | ||
79 | # Execute the dentry recovery, this should populate the backing store | |
80 | self.fs.journal_tool(['event', 'recover_dentries', 'list']) | |
81 | ||
82 | # Dentries in ROOT_INO are present | |
83 | self.assertEqual(sorted(self.fs.list_dirfrag(ROOT_INO)), sorted(['rootfile_head', 'subdir_head', 'linkdir_head'])) | |
84 | self.assertEqual(self.fs.list_dirfrag(subdir_ino), ['subdirfile_head', 'subsubdir_head']) | |
85 | self.assertEqual(sorted(self.fs.list_dirfrag(linkdir_ino)), | |
86 | sorted(['link0_head', 'link1_head', 'link2_head', 'link3_head'])) | |
87 | ||
88 | # Now check the MDS can read what we wrote: truncate the journal | |
89 | # and start the mds. | |
90 | self.fs.journal_tool(['journal', 'reset']) | |
91 | self.fs.mds_fail_restart() | |
92 | self.fs.wait_for_daemons() | |
93 | ||
94 | # List files | |
95 | self.mount_a.mount() | |
96 | self.mount_a.wait_until_mounted() | |
97 | ||
98 | # First ls -R to populate MDCache, such that hardlinks will | |
99 | # resolve properly (recover_dentries does not create backtraces, | |
100 | # so ordinarily hardlinks to inodes that happen not to have backtraces | |
101 | # will be invisible in readdir). | |
102 | # FIXME: hook in forward scrub here to regenerate backtraces | |
103 | proc = self.mount_a.run_shell(['ls', '-R']) | |
104 | self.mount_a.umount_wait() # remount to clear client cache before our second ls | |
105 | self.mount_a.mount() | |
106 | self.mount_a.wait_until_mounted() | |
107 | ||
108 | proc = self.mount_a.run_shell(['ls', '-R']) | |
109 | self.assertEqual(proc.stdout.getvalue().strip(), | |
110 | dedent(""" | |
111 | .: | |
112 | linkdir | |
113 | rootfile | |
114 | subdir | |
115 | ||
116 | ./linkdir: | |
117 | link0 | |
118 | link1 | |
119 | link2 | |
120 | link3 | |
121 | ||
122 | ./subdir: | |
123 | subdirfile | |
124 | subsubdir | |
125 | ||
126 | ./subdir/subsubdir: | |
127 | """).strip()) | |
128 | ||
129 | # Check the correct inos were preserved by path | |
130 | self.assertEqual(rootfile_ino, self.mount_a.path_to_ino("rootfile")) | |
131 | self.assertEqual(subdir_ino, self.mount_a.path_to_ino("subdir")) | |
132 | self.assertEqual(subdirfile_ino, self.mount_a.path_to_ino("subdir/subdirfile")) | |
133 | self.assertEqual(subsubdir_ino, self.mount_a.path_to_ino("subdir/subsubdir")) | |
134 | ||
135 | # Check that the hard link handling came out correctly | |
136 | self.assertEqual(self.mount_a.path_to_ino("linkdir/link0"), subdirfile_ino) | |
137 | self.assertEqual(self.mount_a.path_to_ino("linkdir/link1"), subdirfile_ino) | |
138 | self.assertNotEqual(self.mount_a.path_to_ino("linkdir/link2"), subdirfile_ino) | |
139 | self.assertEqual(self.mount_a.path_to_ino("linkdir/link3"), rootfile_ino) | |
140 | ||
141 | # Create a new file, ensure it is not issued the same ino as one of the | |
142 | # recovered ones | |
143 | self.mount_a.run_shell(["touch", "afterwards"]) | |
144 | new_ino = self.mount_a.path_to_ino("afterwards") | |
145 | self.assertNotIn(new_ino, [rootfile_ino, subdir_ino, subdirfile_ino]) | |
146 | ||
147 | # Check that we can do metadata ops in the recovered directory | |
148 | self.mount_a.run_shell(["touch", "subdir/subsubdir/subsubdirfile"]) | |
149 | ||
    @for_teuthology # 308s
    def test_reset(self):
        """
        That after forcibly modifying the backing store, we can get back into
        a good state by resetting the MDSMap.

        The scenario is that we have two active MDSs, and we lose the journals. Once
        we have completely lost confidence in the integrity of the metadata, we want to
        return the system to a single-MDS state to go into a scrub to recover what we
        can.
        """

        # Set max_mds to 2
        self.fs.set_max_mds(2)

        # See that we have two active MDSs
        self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
                              reject_fn=lambda v: v > 2 or v < 1)
        active_mds_names = self.fs.get_active_names()

        # Switch off any unneeded MDS daemons
        # (standbys would otherwise take over the rank we deliberately damage)
        for unneeded_mds in set(self.mds_cluster.mds_ids) - set(active_mds_names):
            self.mds_cluster.mds_stop(unneeded_mds)
            self.mds_cluster.mds_fail(unneeded_mds)

        # Create a dir on each rank
        self.mount_a.run_shell(["mkdir", "alpha"])
        self.mount_a.run_shell(["mkdir", "bravo"])
        self.mount_a.setfattr("alpha/", "ceph.dir.pin", "0")
        self.mount_a.setfattr("bravo/", "ceph.dir.pin", "1")

        def subtrees_assigned():
            # True once rank 0 reports /bravo as a subtree owned by rank 1
            got_subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=active_mds_names[0])

            for s in got_subtrees:
                if s['dir']['path'] == '/bravo':
                    if s['auth_first'] == 1:
                        return True
                    else:
                        # Should not happen
                        raise RuntimeError("/bravo is subtree but not rank 1!")

            return False

        # Ensure the pinning has taken effect and the /bravo dir is now
        # migrated to rank 1.
        self.wait_until_true(subtrees_assigned, 30)

        # Do some IO (this should be split across ranks according to
        # the rank-pinned dirs)
        self.mount_a.create_n_files("alpha/file", 1000)
        self.mount_a.create_n_files("bravo/file", 1000)

        # Flush the journals so that we have some backing store data
        # belonging to one MDS, and some to the other MDS.
        for mds_name in active_mds_names:
            self.fs.mds_asok(["flush", "journal"], mds_name)

        # Stop (hard) the second MDS daemon
        self.fs.mds_stop(active_mds_names[1])

        # Wipe out the tables for MDS rank 1 so that it is broken and can't start
        # (this is the simulated failure that we will demonstrate that the disaster
        # recovery tools can get us back from)
        self.fs.erase_metadata_objects(prefix="mds1_")

        # Try to access files from the client
        blocked_ls = self.mount_a.run_shell(["ls", "-R"], wait=False)

        # Check that this "ls -R" blocked rather than completing: indicates
        # it got stuck trying to access subtrees which were on the now-dead MDS.
        log.info("Sleeping to check ls is blocked...")
        time.sleep(60)
        self.assertFalse(blocked_ls.finished)

        # This mount is now useless because it will depend on MDS rank 1, and MDS rank 1
        # is not coming back.  Kill it.
        log.info("Killing mount, it's blocked on the MDS we killed")
        self.mount_a.kill()
        self.mount_a.kill_cleanup()
        try:
            # Now that the mount is dead, the ls -R should error out.
            blocked_ls.wait()
        except (CommandFailedError, ConnectionLostError):
            # The ConnectionLostError case is for kernel client, where
            # killing the mount also means killing the node.
            pass

        # See that the second MDS will crash when it starts and tries to
        # acquire rank 1
        damaged_id = active_mds_names[1]
        self.fs.mds_restart(damaged_id)

        # The daemon taking the damaged rank should start starting, then
        # restart back into standby after asking the mon to mark the rank
        # damaged.
        def is_marked_damaged():
            mds_map = self.fs.get_mds_map()
            return 1 in mds_map['damaged']

        self.wait_until_true(is_marked_damaged, 60)

        def get_state():
            # None while the daemon is absent from the map
            info = self.mds_cluster.get_mds_info(damaged_id)
            return info['state'] if info is not None else None

        self.wait_until_equal(
            get_state,
            "up:standby",
            timeout=60)

        self.fs.mds_stop(damaged_id)
        self.fs.mds_fail(damaged_id)

        # Now give up and go through a disaster recovery procedure
        self.fs.mds_stop(active_mds_names[0])
        self.fs.mds_fail(active_mds_names[0])
        # Invoke recover_dentries quietly, because otherwise log spews millions of lines
        self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=0, quiet=True)
        self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=1, quiet=True)
        self.fs.table_tool(["0", "reset", "session"])
        self.fs.journal_tool(["journal", "reset"], rank=0)
        self.fs.erase_mds_objects(1)
        self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
                                            '--yes-i-really-mean-it')

        # Bring an MDS back online, mount a client, and see that we can walk the full
        # filesystem tree again
        self.fs.mds_fail_restart(active_mds_names[0])
        self.wait_until_equal(lambda: self.fs.get_active_names(), [active_mds_names[0]], 30,
                              reject_fn=lambda v: len(v) > 1)
        self.mount_a.mount()
        self.mount_a.run_shell(["ls", "-R"], wait=True)
283 | ||
284 | def test_table_tool(self): | |
285 | active_mdss = self.fs.get_active_names() | |
286 | self.assertEqual(len(active_mdss), 1) | |
287 | mds_name = active_mdss[0] | |
288 | ||
289 | self.mount_a.run_shell(["touch", "foo"]) | |
290 | self.fs.mds_asok(["flush", "journal"], mds_name) | |
291 | ||
292 | log.info(self.fs.table_tool(["all", "show", "inode"])) | |
293 | log.info(self.fs.table_tool(["all", "show", "snap"])) | |
294 | log.info(self.fs.table_tool(["all", "show", "session"])) | |
295 | ||
296 | # Inode table should always be the same because initial state | |
297 | # and choice of inode are deterministic. | |
298 | # Should see one inode consumed | |
299 | self.assertEqual( | |
300 | json.loads(self.fs.table_tool(["all", "show", "inode"])), | |
301 | {"0": { | |
302 | "data": { | |
303 | "version": 2, | |
304 | "inotable": { | |
305 | "projected_free": [ | |
306 | {"start": 1099511628777, | |
307 | "len": 1099511626775}], | |
308 | "free": [ | |
309 | {"start": 1099511628777, | |
310 | "len": 1099511626775}]}}, | |
311 | "result": 0}} | |
312 | ||
313 | ) | |
314 | ||
315 | # Should see one session | |
316 | session_data = json.loads(self.fs.table_tool( | |
317 | ["all", "show", "session"])) | |
318 | self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 1) | |
319 | self.assertEqual(session_data["0"]["result"], 0) | |
320 | ||
321 | # Should see no snaps | |
322 | self.assertEqual( | |
323 | json.loads(self.fs.table_tool(["all", "show", "snap"])), | |
324 | {"version": 0, | |
325 | "snapserver": {"last_snap": 1, | |
326 | "pending_noop": [], | |
327 | "snaps": [], | |
328 | "need_to_purge": {}, | |
329 | "pending_update": [], | |
330 | "pending_destroy": []}, | |
331 | "result": 0} | |
332 | ) | |
333 | ||
334 | # Reset everything | |
335 | for table in ["session", "inode", "snap"]: | |
336 | self.fs.table_tool(["all", "reset", table]) | |
337 | ||
338 | log.info(self.fs.table_tool(["all", "show", "inode"])) | |
339 | log.info(self.fs.table_tool(["all", "show", "snap"])) | |
340 | log.info(self.fs.table_tool(["all", "show", "session"])) | |
341 | ||
342 | # Should see 0 sessions | |
343 | session_data = json.loads(self.fs.table_tool( | |
344 | ["all", "show", "session"])) | |
345 | self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 0) | |
346 | self.assertEqual(session_data["0"]["result"], 0) | |
347 | ||
348 | # Should see entire inode range now marked free | |
349 | self.assertEqual( | |
350 | json.loads(self.fs.table_tool(["all", "show", "inode"])), | |
351 | {"0": {"data": {"version": 1, | |
352 | "inotable": {"projected_free": [ | |
353 | {"start": 1099511627776, | |
354 | "len": 1099511627776}], | |
355 | "free": [ | |
356 | {"start": 1099511627776, | |
357 | "len": 1099511627776}]}}, | |
358 | "result": 0}} | |
359 | ) | |
360 | ||
361 | # Should see no snaps | |
362 | self.assertEqual( | |
363 | json.loads(self.fs.table_tool(["all", "show", "snap"])), | |
364 | {"version": 1, | |
365 | "snapserver": {"last_snap": 1, | |
366 | "pending_noop": [], | |
367 | "snaps": [], | |
368 | "need_to_purge": {}, | |
369 | "pending_update": [], | |
370 | "pending_destroy": []}, | |
371 | "result": 0} | |
372 | ) | |
373 | ||
374 | def test_table_tool_take_inos(self): | |
375 | initial_range_start = 1099511627776 | |
376 | initial_range_len = 1099511627776 | |
377 | # Initially a completely clear range | |
378 | self.assertEqual( | |
379 | json.loads(self.fs.table_tool(["all", "show", "inode"])), | |
380 | {"0": {"data": {"version": 0, | |
381 | "inotable": {"projected_free": [ | |
382 | {"start": initial_range_start, | |
383 | "len": initial_range_len}], | |
384 | "free": [ | |
385 | {"start": initial_range_start, | |
386 | "len": initial_range_len}]}}, | |
387 | "result": 0}} | |
388 | ) | |
389 | ||
390 | # Remove some | |
391 | self.assertEqual( | |
392 | json.loads(self.fs.table_tool(["all", "take_inos", "{0}".format(initial_range_start + 100)])), | |
393 | {"0": {"data": {"version": 1, | |
394 | "inotable": {"projected_free": [ | |
395 | {"start": initial_range_start + 101, | |
396 | "len": initial_range_len - 101}], | |
397 | "free": [ | |
398 | {"start": initial_range_start + 101, | |
399 | "len": initial_range_len - 101}]}}, | |
400 | "result": 0}} | |
401 | ) | |
402 | ||
403 | @for_teuthology # Hack: "for_teuthology" because .sh doesn't work outside teuth | |
404 | def test_journal_smoke(self): | |
405 | workunit(self.ctx, { | |
406 | 'clients': { | |
407 | "client.{0}".format(self.mount_a.client_id): [ | |
408 | "fs/misc/trivial_sync.sh"], | |
409 | }, | |
410 | "timeout": "1h" | |
411 | }) | |
412 | ||
413 | for mount in self.mounts: | |
414 | mount.umount_wait() | |
415 | ||
416 | self.fs.mds_stop() | |
417 | self.fs.mds_fail() | |
418 | ||
419 | # journal tool smoke | |
420 | workunit(self.ctx, { | |
421 | 'clients': { | |
422 | "client.{0}".format(self.mount_a.client_id): [ | |
423 | "suites/cephfs_journal_tool_smoke.sh"], | |
424 | }, | |
425 | "timeout": "1h" | |
426 | }) | |
427 | ||
428 | ||
429 | ||
430 | self.fs.mds_restart() | |
431 | self.fs.wait_for_daemons() | |
432 | ||
433 | self.mount_a.mount() | |
434 | ||
435 | # trivial sync moutn a | |
436 | workunit(self.ctx, { | |
437 | 'clients': { | |
438 | "client.{0}".format(self.mount_a.client_id): [ | |
439 | "fs/misc/trivial_sync.sh"], | |
440 | }, | |
441 | "timeout": "1h" | |
442 | }) | |
443 |