]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | |
2 | """ | |
3 | Test our tools for recovering the content of damaged journals | |
4 | """ | |
5 | ||
6 | import json | |
7 | import logging | |
8 | from textwrap import dedent | |
9 | import time | |
10 | ||
11 | from teuthology.exceptions import CommandFailedError, ConnectionLostError | |
12 | from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO | |
13 | from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology | |
14 | from tasks.workunit import task as workunit | |
15 | ||
16 | log = logging.getLogger(__name__) | |
17 | ||
18 | ||
class TestJournalRepair(CephFSTestCase):
    """
    Exercise the disaster-recovery tooling for damaged MDS journals:
    cephfs-journal-tool (dentry recovery, journal reset) and
    cephfs-table-tool (session/inode/snap table show/reset/take_inos).
    """
    # Two MDS daemons are required so that test_reset can run a
    # two-active-rank (max_mds=2) failure scenario.
    MDSS_REQUIRED = 2

    def test_inject_to_empty(self):
        """
        That when some dentries are in the journal but nothing is in
        the backing store, we correctly populate the backing store
        from the journalled dentries.
        """

        # Inject metadata operations: they land in the MDS journal but are
        # not yet flushed to the backing store.
        self.mount_a.run_shell(["touch", "rootfile"])
        self.mount_a.run_shell(["mkdir", "subdir"])
        self.mount_a.run_shell(["touch", "subdir/subdirfile"])
        # There are several different paths for handling hardlinks, depending
        # on whether an existing dentry (being overwritten) is also a hardlink
        self.mount_a.run_shell(["mkdir", "linkdir"])

        # Test inode -> remote transition for a dentry
        self.mount_a.run_shell(["touch", "linkdir/link0"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link0"])
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link0"])

        # Test nothing -> remote transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link1"])

        # Test remote -> inode transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link2"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link2"])
        self.mount_a.run_shell(["touch", "linkdir/link2"])

        # Test remote -> different remote transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link3"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link3"])
        self.mount_a.run_shell(["ln", "rootfile", "linkdir/link3"])

        # Test an empty directory
        self.mount_a.run_shell(["mkdir", "subdir/subsubdir"])
        self.mount_a.run_shell(["sync"])

        # Before we unmount, make a note of the inode numbers, later we will
        # check that they match what we recover from the journal
        rootfile_ino = self.mount_a.path_to_ino("rootfile")
        subdir_ino = self.mount_a.path_to_ino("subdir")
        linkdir_ino = self.mount_a.path_to_ino("linkdir")
        subdirfile_ino = self.mount_a.path_to_ino("subdir/subdirfile")
        subsubdir_ino = self.mount_a.path_to_ino("subdir/subsubdir")

        self.mount_a.umount_wait()

        # Stop the MDS so the journal is quiescent before we run the
        # offline tools against it.
        self.fs.mds_stop()
        self.fs.mds_fail()

        # Now, the journal should contain the operations, but the backing
        # store shouldn't: subdir's dirfrag object does not exist yet, and
        # the root dirfrag is still empty.
        with self.assertRaises(ObjectNotFound):
            self.fs.list_dirfrag(subdir_ino)
        self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])

        # Execute the dentry recovery, this should populate the backing store
        self.fs.journal_tool(['event', 'recover_dentries', 'list'])

        # Dentries in ROOT_INO are present
        self.assertEqual(sorted(self.fs.list_dirfrag(ROOT_INO)), sorted(['rootfile_head', 'subdir_head', 'linkdir_head']))
        self.assertEqual(self.fs.list_dirfrag(subdir_ino), ['subdirfile_head', 'subsubdir_head'])
        self.assertEqual(sorted(self.fs.list_dirfrag(linkdir_ino)),
                         sorted(['link0_head', 'link1_head', 'link2_head', 'link3_head']))

        # Now check the MDS can read what we wrote: truncate the journal
        # and start the mds.
        self.fs.journal_tool(['journal', 'reset'])
        self.fs.mds_fail_restart()
        self.fs.wait_for_daemons()

        # List files via a freshly mounted client.
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

        # First ls -R to populate MDCache, such that hardlinks will
        # resolve properly (recover_dentries does not create backtraces,
        # so ordinarily hardlinks to inodes that happen not to have backtraces
        # will be invisible in readdir).
        # FIXME: hook in forward scrub here to regenerate backtraces
        proc = self.mount_a.run_shell(['ls', '-R'])
        self.mount_a.umount_wait()  # remount to clear client cache before our second ls
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

        # The second ls -R should now show the complete recovered tree.
        proc = self.mount_a.run_shell(['ls', '-R'])
        self.assertEqual(proc.stdout.getvalue().strip(),
                         dedent("""
                         .:
                         linkdir
                         rootfile
                         subdir

                         ./linkdir:
                         link0
                         link1
                         link2
                         link3

                         ./subdir:
                         subdirfile
                         subsubdir

                         ./subdir/subsubdir:
                         """).strip())

        # Check the correct inos were preserved by path
        self.assertEqual(rootfile_ino, self.mount_a.path_to_ino("rootfile"))
        self.assertEqual(subdir_ino, self.mount_a.path_to_ino("subdir"))
        self.assertEqual(subdirfile_ino, self.mount_a.path_to_ino("subdir/subdirfile"))
        self.assertEqual(subsubdir_ino, self.mount_a.path_to_ino("subdir/subsubdir"))

        # Check that the hard link handling came out correctly:
        # link0/link1 should still point at subdirfile, link2 became a
        # fresh inode, link3 was re-targeted at rootfile.
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link0"), subdirfile_ino)
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link1"), subdirfile_ino)
        self.assertNotEqual(self.mount_a.path_to_ino("linkdir/link2"), subdirfile_ino)
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link3"), rootfile_ino)

        # Create a new file, ensure it is not issued the same ino as one of the
        # recovered ones
        self.mount_a.run_shell(["touch", "afterwards"])
        new_ino = self.mount_a.path_to_ino("afterwards")
        self.assertNotIn(new_ino, [rootfile_ino, subdir_ino, subdirfile_ino])

        # Check that we can do metadata ops in the recovered directory
        self.mount_a.run_shell(["touch", "subdir/subsubdir/subsubdirfile"])

    @for_teuthology  # 308s
    def test_reset(self):
        """
        That after forcibly modifying the backing store, we can get back into
        a good state by resetting the MDSMap.

        The scenario is that we have two active MDSs, and we lose the journals. Once
        we have completely lost confidence in the integrity of the metadata, we want to
        return the system to a single-MDS state to go into a scrub to recover what we
        can.
        """

        # Set max_mds to 2
        self.fs.set_allow_multimds(True)
        self.fs.set_max_mds(2)

        # See that we have two active MDSs
        self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
                              reject_fn=lambda v: v > 2 or v < 1)
        active_mds_names = self.fs.get_active_names()

        # Switch off any unneeded MDS daemons so no standby can sneak in
        # and take over a rank mid-test.
        for unneeded_mds in set(self.mds_cluster.mds_ids) - set(active_mds_names):
            self.mds_cluster.mds_stop(unneeded_mds)
            self.mds_cluster.mds_fail(unneeded_mds)

        # Create a dir on each rank, using ceph.dir.pin to force the
        # subtree placement.
        self.mount_a.run_shell(["mkdir", "alpha"])
        self.mount_a.run_shell(["mkdir", "bravo"])
        self.mount_a.setfattr("alpha/", "ceph.dir.pin", "0")
        self.mount_a.setfattr("bravo/", "ceph.dir.pin", "1")

        def subtrees_assigned():
            # True once rank 0's subtree map shows /bravo owned by rank 1.
            got_subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=active_mds_names[0])

            for s in got_subtrees:
                if s['dir']['path'] == '/bravo':
                    if s['auth_first'] == 1:
                        return True
                    else:
                        # Should not happen
                        raise RuntimeError("/bravo is subtree but not rank 1!")

            return False

        # Ensure the pinning has taken effect and the /bravo dir is now
        # migrated to rank 1.
        self.wait_until_true(subtrees_assigned, 30)

        # Do some IO (this should be split across ranks according to
        # the rank-pinned dirs)
        self.mount_a.create_n_files("alpha/file", 1000)
        self.mount_a.create_n_files("bravo/file", 1000)

        # Flush the journals so that we have some backing store data
        # belonging to one MDS, and some to the other MDS.
        for mds_name in active_mds_names:
            self.fs.mds_asok(["flush", "journal"], mds_name)

        # Stop (hard) the second MDS daemon
        self.fs.mds_stop(active_mds_names[1])

        # Wipe out the tables for MDS rank 1 so that it is broken and can't start
        # (this is the simulated failure that we will demonstrate that the disaster
        # recovery tools can get us back from)
        self.fs.erase_metadata_objects(prefix="mds1_")

        # Try to access files from the client
        blocked_ls = self.mount_a.run_shell(["ls", "-R"], wait=False)

        # Check that this "ls -R" blocked rather than completing: indicates
        # it got stuck trying to access subtrees which were on the now-dead MDS.
        log.info("Sleeping to check ls is blocked...")
        time.sleep(60)
        self.assertFalse(blocked_ls.finished)

        # This mount is now useless because it will depend on MDS rank 1, and MDS rank 1
        # is not coming back. Kill it.
        log.info("Killing mount, it's blocked on the MDS we killed")
        self.mount_a.kill()
        self.mount_a.kill_cleanup()
        try:
            # Now that the mount is dead, the ls -R should error out.
            blocked_ls.wait()
        except (CommandFailedError, ConnectionLostError):
            # The ConnectionLostError case is for kernel client, where
            # killing the mount also means killing the node.
            pass

        # See that the second MDS will crash when it starts and tries to
        # acquire rank 1
        damaged_id = active_mds_names[1]
        self.fs.mds_restart(damaged_id)

        # The daemon taking the damaged rank should start starting, then
        # restart back into standby after asking the mon to mark the rank
        # damaged.
        def is_marked_damaged():
            mds_map = self.fs.get_mds_map()
            return 1 in mds_map['damaged']

        self.wait_until_true(is_marked_damaged, 60)

        def get_state():
            # None if the daemon has no entry in the map at all.
            info = self.mds_cluster.get_mds_info(damaged_id)
            return info['state'] if info is not None else None

        self.wait_until_equal(
            get_state,
            "up:standby",
            timeout=60)

        self.fs.mds_stop(damaged_id)
        self.fs.mds_fail(damaged_id)

        # Now give up and go through a disaster recovery procedure
        self.fs.mds_stop(active_mds_names[0])
        self.fs.mds_fail(active_mds_names[0])
        # Invoke recover_dentries quietly, because otherwise log spews millions of lines
        self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=0, quiet=True)
        self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=1, quiet=True)
        self.fs.table_tool(["0", "reset", "session"])
        self.fs.journal_tool(["journal", "reset"], rank=0)
        self.fs.erase_mds_objects(1)
        self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
                                            '--yes-i-really-mean-it')

        # Bring an MDS back online, mount a client, and see that we can walk the full
        # filesystem tree again
        self.fs.mds_fail_restart(active_mds_names[0])
        self.wait_until_equal(lambda: self.fs.get_active_names(), [active_mds_names[0]], 30,
                              reject_fn=lambda v: len(v) > 1)
        self.mount_a.mount()
        self.mount_a.run_shell(["ls", "-R"], wait=True)

    def test_table_tool(self):
        """
        Smoke-test cephfs-table-tool: show the inode/snap/session tables
        after some activity, then reset them and verify the post-reset state.
        """
        active_mdss = self.fs.get_active_names()
        self.assertEqual(len(active_mdss), 1)
        mds_name = active_mdss[0]

        # One metadata op, flushed so it is reflected in the tables.
        self.mount_a.run_shell(["touch", "foo"])
        self.fs.mds_asok(["flush", "journal"], mds_name)

        log.info(self.fs.table_tool(["all", "show", "inode"]))
        log.info(self.fs.table_tool(["all", "show", "snap"]))
        log.info(self.fs.table_tool(["all", "show", "session"]))

        # Inode table should always be the same because initial state
        # and choice of inode are deterministic.
        # Should see one inode consumed
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {
                "data": {
                    "version": 2,
                    "inotable": {
                        "projected_free": [
                            {"start": 1099511628777,
                             "len": 1099511626775}],
                        "free": [
                            {"start": 1099511628777,
                             "len": 1099511626775}]}},
                "result": 0}}

        )

        # Should see one session
        session_data = json.loads(self.fs.table_tool(
            ["all", "show", "session"]))
        self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 1)
        self.assertEqual(session_data["0"]["result"], 0)

        # Should see no snaps
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "snap"])),
            {"version": 0,
             "snapserver": {"last_snap": 1,
                            "pending_noop": [],
                            "snaps": [],
                            "need_to_purge": {},
                            "pending_update": [],
                            "pending_destroy": []},
             "result": 0}
        )

        # Reset everything
        for table in ["session", "inode", "snap"]:
            self.fs.table_tool(["all", "reset", table])

        log.info(self.fs.table_tool(["all", "show", "inode"]))
        log.info(self.fs.table_tool(["all", "show", "snap"]))
        log.info(self.fs.table_tool(["all", "show", "session"]))

        # Should see 0 sessions
        session_data = json.loads(self.fs.table_tool(
            ["all", "show", "session"]))
        self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 0)
        self.assertEqual(session_data["0"]["result"], 0)

        # Should see entire inode range now marked free
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {"data": {"version": 1,
                            "inotable": {"projected_free": [
                                {"start": 1099511627776,
                                 "len": 1099511627776}],
                                "free": [
                                    {"start": 1099511627776,
                                     "len": 1099511627776}]}},
                   "result": 0}}
        )

        # Should see no snaps
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "snap"])),
            {"version": 1,
             "snapserver": {"last_snap": 1,
                            "pending_noop": [],
                            "snaps": [],
                            "need_to_purge": {},
                            "pending_update": [],
                            "pending_destroy": []},
             "result": 0}
        )

    def test_table_tool_take_inos(self):
        """
        That cephfs-table-tool take_inos removes the requested range from
        the head of the inode table's free list.
        """
        initial_range_start = 1099511627776
        initial_range_len = 1099511627776
        # Initially a completely clear range
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {"data": {"version": 0,
                            "inotable": {"projected_free": [
                                {"start": initial_range_start,
                                 "len": initial_range_len}],
                                "free": [
                                    {"start": initial_range_start,
                                     "len": initial_range_len}]}},
                   "result": 0}}
        )

        # Remove some: taking inos up to start+100 consumes 101 inos
        # (the range is inclusive), so the free range shrinks by 101.
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "take_inos", "{0}".format(initial_range_start + 100)])),
            {"0": {"data": {"version": 1,
                            "inotable": {"projected_free": [
                                {"start": initial_range_start + 101,
                                 "len": initial_range_len - 101}],
                                "free": [
                                    {"start": initial_range_start + 101,
                                     "len": initial_range_len - 101}]}},
                   "result": 0}}
        )

    @for_teuthology  # Hack: "for_teuthology" because .sh doesn't work outside teuth
    def test_journal_smoke(self):
        """
        Run the cephfs_journal_tool_smoke.sh workunit against a quiesced
        MDS, then verify the filesystem still works afterwards.
        """
        # Generate some metadata activity first.
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "fs/misc/trivial_sync.sh"],
            },
            "timeout": "1h"
        })

        # Unmount all clients and stop the MDS so the journal tool has
        # exclusive access to the journal.
        for mount in self.mounts:
            mount.umount_wait()

        self.fs.mds_stop()
        self.fs.mds_fail()

        # journal tool smoke
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "suites/cephfs_journal_tool_smoke.sh"],
            },
            "timeout": "1h"
        })

        self.fs.mds_restart()
        self.fs.wait_for_daemons()

        self.mount_a.mount()

        # trivial sync on mount a: prove the fs is still usable
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "fs/misc/trivial_sync.sh"],
            },
            "timeout": "1h"
        })
435 | ||
436 | # trivial sync moutn a | |
437 | workunit(self.ctx, { | |
438 | 'clients': { | |
439 | "client.{0}".format(self.mount_a.client_id): [ | |
440 | "fs/misc/trivial_sync.sh"], | |
441 | }, | |
442 | "timeout": "1h" | |
443 | }) | |
444 |