"""
Test our tools for recovering the content of damaged journals
"""

import json
import logging
import time
from textwrap import dedent

from teuthology.exceptions import CommandFailedError, ConnectionLostError
from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
from tasks.workunit import task as workunit

log = logging.getLogger(__name__)


class TestJournalRepair(CephFSTestCase):
    MDSS_REQUIRED = 2

    def test_inject_to_empty(self):
        """
        That when some dentries are in the journal but nothing is in
        the backing store, we correctly populate the backing store
        from the journalled dentries.
        """

        # Inject metadata operations
        self.mount_a.run_shell(["touch", "rootfile"])
        self.mount_a.run_shell(["mkdir", "subdir"])
        self.mount_a.run_shell(["touch", "subdir/subdirfile"])
        # There are several different paths for handling hardlinks, depending
        # on whether an existing dentry (being overwritten) is also a hardlink
        self.mount_a.run_shell(["mkdir", "linkdir"])

        # Test inode -> remote transition for a dentry
        self.mount_a.run_shell(["touch", "linkdir/link0"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link0"])
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link0"])

        # Test nothing -> remote transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link1"])

        # Test remote -> inode transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link2"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link2"])
        self.mount_a.run_shell(["touch", "linkdir/link2"])

        # Test remote -> diff remote transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link3"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link3"])
        self.mount_a.run_shell(["ln", "rootfile", "linkdir/link3"])

        # Test an empty directory
        self.mount_a.run_shell(["mkdir", "subdir/subsubdir"])
        self.mount_a.run_shell(["sync"])

        # Before we unmount, make a note of the inode numbers, later we will
        # check that they match what we recover from the journal
        rootfile_ino = self.mount_a.path_to_ino("rootfile")
        subdir_ino = self.mount_a.path_to_ino("subdir")
        linkdir_ino = self.mount_a.path_to_ino("linkdir")
        subdirfile_ino = self.mount_a.path_to_ino("subdir/subdirfile")
        subsubdir_ino = self.mount_a.path_to_ino("subdir/subsubdir")

        self.mount_a.umount_wait()

        # Stop the MDS without flushing its journal, so that the operations
        # above exist only in the journal
        self.fs.mds_stop()
        self.fs.mds_fail()

        # Now, the journal should contain the operations, but the backing
        # store shouldn't
        with self.assertRaises(ObjectNotFound):
            self.fs.list_dirfrag(subdir_ino)
        self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
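        # (list_dirfrag reads the dirfrag object's omap from the metadata
        # pool, roughly `rados listomapkeys <ino-in-hex>.00000000` for the
        # default frag, so ObjectNotFound here means the directory never
        # reached the backing store.)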

        # Execute the dentry recovery, this should populate the backing store
        self.fs.journal_tool(['event', 'recover_dentries', 'list'])
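        # (journal_tool drives the cephfs-journal-tool CLI; the call above
        # corresponds to `cephfs-journal-tool event recover_dentries list`.)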

        # Dentries in ROOT_INO are present
        self.assertEqual(sorted(self.fs.list_dirfrag(ROOT_INO)),
                         sorted(['rootfile_head', 'subdir_head', 'linkdir_head']))
        self.assertEqual(self.fs.list_dirfrag(subdir_ino),
                         ['subdirfile_head', 'subsubdir_head'])
        self.assertEqual(sorted(self.fs.list_dirfrag(linkdir_ino)),
                         sorted(['link0_head', 'link1_head', 'link2_head', 'link3_head']))
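        # (Dentry keys in a dirfrag's omap carry a snapid suffix; "_head" is
        # the live, non-snapshot version of each dentry.)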

        # Now check the MDS can read what we wrote: truncate the journal
        # and bring the MDS back up on top of the recovered backing store
        self.fs.journal_tool(['journal', 'reset'])
        self.fs.mds_fail_restart()
        self.fs.wait_for_daemons()

        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

        # First ls -R to populate MDCache, such that hardlinks will
        # resolve properly (recover_dentries does not create backtraces,
        # so ordinarily hardlinks to inodes that happen not to have backtraces
        # will be invisible in readdir).
        # FIXME: hook in forward scrub here to regenerate backtraces
        proc = self.mount_a.run_shell(['ls', '-R'])
        self.mount_a.umount_wait()  # remount to clear client cache before our second ls
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

        proc = self.mount_a.run_shell(['ls', '-R'])
        self.assertEqual(proc.stdout.getvalue().strip(),
                         dedent("""
                         .:
                         linkdir
                         rootfile
                         subdir

                         ./linkdir:
                         link0
                         link1
                         link2
                         link3

                         ./subdir:
                         subdirfile
                         subsubdir

                         ./subdir/subsubdir:
                         """).strip())

        # Check the correct inos were preserved by path
        self.assertEqual(rootfile_ino, self.mount_a.path_to_ino("rootfile"))
        self.assertEqual(subdir_ino, self.mount_a.path_to_ino("subdir"))
        self.assertEqual(subdirfile_ino, self.mount_a.path_to_ino("subdir/subdirfile"))
        self.assertEqual(subsubdir_ino, self.mount_a.path_to_ino("subdir/subsubdir"))

        # Check that the hard link handling came out correctly
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link0"), subdirfile_ino)
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link1"), subdirfile_ino)
        self.assertNotEqual(self.mount_a.path_to_ino("linkdir/link2"), subdirfile_ino)
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link3"), rootfile_ino)

        # Create a new file, ensure it is not issued the same ino as one of the
        # recovered ones
        self.mount_a.run_shell(["touch", "afterwards"])
        new_ino = self.mount_a.path_to_ino("afterwards")
        self.assertNotIn(new_ino, [rootfile_ino, subdir_ino, subdirfile_ino])

        # Check that we can do metadata ops in the recovered directory
        self.mount_a.run_shell(["touch", "subdir/subsubdir/subsubdirfile"])

    @for_teuthology  # 308s
    def test_reset(self):
        """
        That after forcibly modifying the backing store, we can get back into
        a good state by resetting the MDSMap.

        The scenario is that we have two active MDSs, and we lose the journals. Once
        we have completely lost confidence in the integrity of the metadata, we want to
        return the system to a single-MDS state to go into a scrub to recover what we
        can.
        """

        self.fs.set_allow_multimds(True)
        self.fs.set_max_mds(2)
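        # (These helpers wrap the `ceph fs set <name> allow_multimds` and
        # `ceph fs set <name> max_mds` commands respectively.)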

        # See that we have two active MDSs
        self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
                              reject_fn=lambda v: v > 2 or v < 1)
        active_mds_names = self.fs.get_active_names()

        # Switch off any unneeded MDS daemons
        for unneeded_mds in set(self.mds_cluster.mds_ids) - set(active_mds_names):
            self.mds_cluster.mds_stop(unneeded_mds)
            self.mds_cluster.mds_fail(unneeded_mds)

        # Create a dir on each rank
        self.mount_a.run_shell(["mkdir", "alpha"])
        self.mount_a.run_shell(["mkdir", "bravo"])
        self.mount_a.setfattr("alpha/", "ceph.dir.pin", "0")
        self.mount_a.setfattr("bravo/", "ceph.dir.pin", "1")

        def subtrees_assigned():
            got_subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=active_mds_names[0])

            for s in got_subtrees:
                if s['dir']['path'] == '/bravo':
                    if s['auth_first'] == 1:
                        return True
                    else:
                        # Should not happen
                        raise RuntimeError("/bravo is subtree but not rank 1!")

            return False

        # Ensure the pinning has taken effect and the /bravo dir is now
        # migrated to rank 1.
        self.wait_until_true(subtrees_assigned, 30)

        # Do some IO (this should be split across ranks according to
        # the rank-pinned dirs)
        self.mount_a.create_n_files("alpha/file", 1000)
        self.mount_a.create_n_files("bravo/file", 1000)

        # Flush the journals so that we have some backing store data
        # belonging to one MDS, and some to the other MDS.
        for mds_name in active_mds_names:
            self.fs.mds_asok(["flush", "journal"], mds_name)

        # Stop (hard) the second MDS daemon
        self.fs.mds_stop(active_mds_names[1])

        # Wipe out the tables for MDS rank 1 so that it is broken and can't start
        # (this is the simulated failure that we will demonstrate that the disaster
        # recovery tools can get us back from)
        self.fs.erase_metadata_objects(prefix="mds1_")

        # Try to access files from the client
        blocked_ls = self.mount_a.run_shell(["ls", "-R"], wait=False)

        # Check that this "ls -R" blocked rather than completing: indicates
        # it got stuck trying to access subtrees which were on the now-dead MDS.
        log.info("Sleeping to check ls is blocked...")
        time.sleep(60)
        self.assertFalse(blocked_ls.finished)

        # This mount is now useless because it will depend on MDS rank 1, and MDS rank 1
        # is not coming back. Kill it.
        log.info("Killing mount, it's blocked on the MDS we killed")
        self.mount_a.kill()
        self.mount_a.kill_cleanup()
        try:
            # Now that the mount is dead, the ls -R should error out.
            blocked_ls.wait()
        except (CommandFailedError, ConnectionLostError):
            # The ConnectionLostError case is for kernel client, where
            # killing the mount also means killing the node.
            pass

        # See that the second MDS will crash when it starts and tries to
        # come up with its rank 1 metadata missing
        damaged_id = active_mds_names[1]
        self.fs.mds_restart(damaged_id)

        # The daemon taking the damaged rank should start starting, then
        # restart back into standby after asking the mon to mark the rank
        # damaged.
        def is_marked_damaged():
            mds_map = self.fs.get_mds_map()
            return 1 in mds_map['damaged']

        self.wait_until_true(is_marked_damaged, 60)

        def get_state():
            info = self.mds_cluster.get_mds_info(damaged_id)
            return info['state'] if info is not None else None

        self.wait_until_equal(
            get_state,
            "up:standby",
            timeout=60)

        self.fs.mds_stop(damaged_id)
        self.fs.mds_fail(damaged_id)

        # Now give up and go through a disaster recovery procedure
        self.fs.mds_stop(active_mds_names[0])
        self.fs.mds_fail(active_mds_names[0])
        # Invoke recover_dentries quietly, because otherwise log spews millions of lines
        self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=0, quiet=True)
        self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=1, quiet=True)
        self.fs.table_tool(["0", "reset", "session"])
        self.fs.journal_tool(["journal", "reset"], rank=0)
        self.fs.erase_mds_objects(1)
        self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
                                            '--yes-i-really-mean-it')
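        # (`ceph fs reset` collapses the MDS map back to a single rank, which
        # is what lets us abandon the unrecoverable rank 1 entirely.)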

        # Bring an MDS back online, mount a client, and see that we can walk the full
        # filesystem tree again
        self.fs.mds_fail_restart(active_mds_names[0])
        self.wait_until_equal(lambda: self.fs.get_active_names(), [active_mds_names[0]], 30,
                              reject_fn=lambda v: len(v) > 1)
        self.mount_a.mount()
        self.mount_a.run_shell(["ls", "-R"], wait=True)

    def test_table_tool(self):
        active_mdss = self.fs.get_active_names()
        self.assertEqual(len(active_mdss), 1)
        mds_name = active_mdss[0]

        self.mount_a.run_shell(["touch", "foo"])
        self.fs.mds_asok(["flush", "journal"], mds_name)

        log.info(self.fs.table_tool(["all", "show", "inode"]))
        log.info(self.fs.table_tool(["all", "show", "snap"]))
        log.info(self.fs.table_tool(["all", "show", "session"]))

        # Inode table should always be the same because initial state
        # and choice of inode are deterministic.
        # Should see one inode consumed
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {"data": {"version": 2,
                            "inotable": {"projected_free": [
                                {"start": 1099511628777,
                                 "len": 1099511626775}],
                                "free": [
                                    {"start": 1099511628777,
                                     "len": 1099511626775}]}},
                   "result": 0}}
        )

        # Should see one session
        session_data = json.loads(self.fs.table_tool(
            ["all", "show", "session"]))
        self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 1)
        self.assertEqual(session_data["0"]["result"], 0)

        # Should see no snaps
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "snap"])),
            {"version": 1,
             "snapserver": {"last_snap": 1,
                            "pending_noop": [],
                            "snaps": [],
                            "need_to_purge": {},
                            "pending_update": [],
                            "pending_destroy": []},
             "result": 0}
        )

        # Reset everything
        for table in ["session", "inode", "snap"]:
            self.fs.table_tool(["all", "reset", table])

        log.info(self.fs.table_tool(["all", "show", "inode"]))
        log.info(self.fs.table_tool(["all", "show", "snap"]))
        log.info(self.fs.table_tool(["all", "show", "session"]))

        # Should see 0 sessions
        session_data = json.loads(self.fs.table_tool(
            ["all", "show", "session"]))
        self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 0)
        self.assertEqual(session_data["0"]["result"], 0)

        # Should see entire inode range now marked free
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {"data": {"version": 1,
                            "inotable": {"projected_free": [
                                {"start": 1099511627776,
                                 "len": 1099511627776}],
                                "free": [
                                    {"start": 1099511627776,
                                     "len": 1099511627776}]}},
                   "result": 0}}
        )

        # Should see no snaps
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "snap"])),
            {"version": 1,
             "snapserver": {"last_snap": 1,
                            "pending_noop": [],
                            "snaps": [],
                            "need_to_purge": {},
                            "pending_update": [],
                            "pending_destroy": []},
             "result": 0}
        )

    def test_table_tool_take_inos(self):
        initial_range_start = 1099511627776
        initial_range_len = 1099511627776
        # Initially a completely clear range
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {"data": {"version": 0,
                            "inotable": {"projected_free": [
                                {"start": initial_range_start,
                                 "len": initial_range_len}],
                                "free": [
                                    {"start": initial_range_start,
                                     "len": initial_range_len}]}},
                   "result": 0}}
        )

        # Remove some
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "take_inos", "{0}".format(initial_range_start + 100)])),
            {"0": {"data": {"version": 1,
                            "inotable": {"projected_free": [
                                {"start": initial_range_start + 101,
                                 "len": initial_range_len - 101}],
                                "free": [
                                    {"start": initial_range_start + 101,
                                     "len": initial_range_len - 101}]}},
                   "result": 0}}
        )
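        # (take_inos marks every ino up to the given value as consumed, so an
        # MDS rebuilt from this table will never re-issue inode numbers that
        # may still be referenced by files in the backing store.)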

    @for_teuthology  # Hack: "for_teuthology" because .sh doesn't work outside teuth
    def test_journal_smoke(self):
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "fs/misc/trivial_sync.sh"],
            },
            'timeout': '3h'
        })

        for mount in self.mounts:
            mount.umount_wait()

        self.fs.mds_stop()
        self.fs.mds_fail()

        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "suites/cephfs_journal_tool_smoke.sh"],
            },
            'timeout': '3h'
        })

        self.fs.mds_restart()
        self.fs.wait_for_daemons()

        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

        # trivial sync on mount a
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "fs/misc/trivial_sync.sh"],
            },
            'timeout': '3h'
        })