"""
Test our tools for recovering the content of damaged journals
"""

import json
import logging
from textwrap import dedent
import time

from teuthology.exceptions import CommandFailedError, ConnectionLostError
from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
from tasks.workunit import task as workunit

log = logging.getLogger(__name__)


class TestJournalRepair(CephFSTestCase):
    MDSS_REQUIRED = 2

    def test_inject_to_empty(self):
        """
        That when some dentries are in the journal but nothing is in
        the backing store, we correctly populate the backing store
        from the journalled dentries.
        """

        # Inject metadata operations
        self.mount_a.run_shell(["touch", "rootfile"])
        self.mount_a.run_shell(["mkdir", "subdir"])
        self.mount_a.run_shell(["touch", "subdir/subdirfile"])
        # There are several different paths for handling hardlinks, depending
        # on whether an existing dentry (being overwritten) is also a hardlink
        self.mount_a.run_shell(["mkdir", "linkdir"])

        # Test inode -> remote transition for a dentry
        self.mount_a.run_shell(["touch", "linkdir/link0"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link0"])
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link0"])

        # Test nothing -> remote transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link1"])

        # Test remote -> inode transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link2"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link2"])
        self.mount_a.run_shell(["touch", "linkdir/link2"])

        # Test remote -> diff remote transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link3"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link3"])
        self.mount_a.run_shell(["ln", "rootfile", "linkdir/link3"])

        # Test an empty directory
        self.mount_a.run_shell(["mkdir", "subdir/subsubdir"])
        self.mount_a.run_shell(["sync"])
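        # The sync is there to make sure everything above has reached the MDS
        # (and therefore its journal, which is what we are about to recover
        # from) before we note inode numbers and stop the daemons.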

        # Before we unmount, make a note of the inode numbers; later we will
        # check that they match what we recover from the journal
        rootfile_ino = self.mount_a.path_to_ino("rootfile")
        subdir_ino = self.mount_a.path_to_ino("subdir")
        linkdir_ino = self.mount_a.path_to_ino("linkdir")
        subdirfile_ino = self.mount_a.path_to_ino("subdir/subdirfile")
        subsubdir_ino = self.mount_a.path_to_ino("subdir/subsubdir")

        self.mount_a.umount_wait()

        # Stop the MDS
        self.fs.mds_stop()
        self.fs.mds_fail()

        # Now, the journal should contain the operations, but the backing
        # store shouldn't
        with self.assertRaises(ObjectNotFound):
            self.fs.list_dirfrag(subdir_ino)
        self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])

        # Execute the dentry recovery; this should populate the backing store
        self.fs.journal_tool(['event', 'recover_dentries', 'list'])
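        # (recover_dentries walks the journal's metadata events and writes the
        # dentries/inodes they describe straight into the dirfrag objects in
        # the metadata pool, which is what repopulates the backing store that
        # the assertions below inspect.  The helper wraps the
        # cephfs-journal-tool CLI, i.e. "event recover_dentries list".)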

        # Dentries in ROOT_INO are present
        self.assertEqual(sorted(self.fs.list_dirfrag(ROOT_INO)), sorted(['rootfile_head', 'subdir_head', 'linkdir_head']))
        self.assertEqual(self.fs.list_dirfrag(subdir_ino), ['subdirfile_head', 'subsubdir_head'])
        self.assertEqual(sorted(self.fs.list_dirfrag(linkdir_ino)),
                         sorted(['link0_head', 'link1_head', 'link2_head', 'link3_head']))

        # Now check the MDS can read what we wrote: truncate the journal
        # and start the mds.
        self.fs.journal_tool(['journal', 'reset'])
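        # ("journal reset" discards the journal contents, which were already
        # applied above by recover_dentries, and writes out a fresh empty
        # journal so the MDS can start cleanly without replaying old events.)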
        self.fs.mds_fail_restart()
        self.fs.wait_for_daemons()

        # List files
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

        # First ls -R to populate MDCache, such that hardlinks will
        # resolve properly (recover_dentries does not create backtraces,
        # so ordinarily hardlinks to inodes that happen not to have backtraces
        # will be invisible in readdir).
        # FIXME: hook in forward scrub here to regenerate backtraces
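        # (A backtrace is the ino-to-path hint Ceph stores as an xattr on an
        # inode's first object; it is what lets the MDS look an inode up by
        # number when resolving a hard link to an inode it has not cached.)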
        proc = self.mount_a.run_shell(['ls', '-R'])
        self.mount_a.umount_wait()  # remount to clear client cache before our second ls
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

        proc = self.mount_a.run_shell(['ls', '-R'])
        self.assertEqual(proc.stdout.getvalue().strip(),
                         dedent("""
                             .:
                             linkdir
                             rootfile
                             subdir

                             ./linkdir:
                             link0
                             link1
                             link2
                             link3

                             ./subdir:
                             subdirfile
                             subsubdir

                             ./subdir/subsubdir:
                             """).strip())

        # Check the correct inos were preserved by path
        self.assertEqual(rootfile_ino, self.mount_a.path_to_ino("rootfile"))
        self.assertEqual(subdir_ino, self.mount_a.path_to_ino("subdir"))
        self.assertEqual(subdirfile_ino, self.mount_a.path_to_ino("subdir/subdirfile"))
        self.assertEqual(subsubdir_ino, self.mount_a.path_to_ino("subdir/subsubdir"))

        # Check that the hard link handling came out correctly
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link0"), subdirfile_ino)
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link1"), subdirfile_ino)
        self.assertNotEqual(self.mount_a.path_to_ino("linkdir/link2"), subdirfile_ino)
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link3"), rootfile_ino)

        # Create a new file, ensure it is not issued the same ino as one of the
        # recovered ones
        self.mount_a.run_shell(["touch", "afterwards"])
        new_ino = self.mount_a.path_to_ino("afterwards")
        self.assertNotIn(new_ino, [rootfile_ino, subdir_ino, subdirfile_ino])

        # Check that we can do metadata ops in the recovered directory
        self.mount_a.run_shell(["touch", "subdir/subsubdir/subsubdirfile"])

    @for_teuthology  # 308s
    def test_reset(self):
        """
        That after forcibly modifying the backing store, we can get back into
        a good state by resetting the MDSMap.

        The scenario is that we have two active MDSs and we lose the journals.
        Once we have completely lost confidence in the integrity of the
        metadata, we want to return the system to a single-MDS state and then
        scrub to recover what we can.
        """

        # Set max_mds to 2
        self.fs.set_allow_multimds(True)
        self.fs.set_max_mds(2)

        # See that we have two active MDSs
        self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
                              reject_fn=lambda v: v > 2 or v < 1)
        active_mds_names = self.fs.get_active_names()

        # Switch off any unneeded MDS daemons
        for unneeded_mds in set(self.mds_cluster.mds_ids) - set(active_mds_names):
            self.mds_cluster.mds_stop(unneeded_mds)
            self.mds_cluster.mds_fail(unneeded_mds)

        # Create a dir on each rank
        self.mount_a.run_shell(["mkdir", "alpha"])
        self.mount_a.run_shell(["mkdir", "bravo"])
        self.mount_a.setfattr("alpha/", "ceph.dir.pin", "0")
        self.mount_a.setfattr("bravo/", "ceph.dir.pin", "1")

        def subtrees_assigned():
            got_subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=active_mds_names[0])

            for s in got_subtrees:
                if s['dir']['path'] == '/bravo':
                    if s['auth_first'] == 1:
                        return True
                    else:
                        # Should not happen
                        raise RuntimeError("/bravo is subtree but not rank 1!")

            return False

        # Ensure the pinning has taken effect and the /bravo dir is now
        # migrated to rank 1.
        self.wait_until_true(subtrees_assigned, 30)

        # Do some IO (this should be split across ranks according to
        # the rank-pinned dirs)
        self.mount_a.create_n_files("alpha/file", 1000)
        self.mount_a.create_n_files("bravo/file", 1000)

        # Flush the journals so that we have some backing store data
        # belonging to one MDS, and some to the other MDS.
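        # (The "flush journal" admin socket command writes everything in an
        # MDS's journal back into the metadata pool and trims the journal, so
        # each rank now owns real backing-store objects of its own.)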
        for mds_name in active_mds_names:
            self.fs.mds_asok(["flush", "journal"], mds_name)

        # Stop (hard) the second MDS daemon
        self.fs.mds_stop(active_mds_names[1])

        # Wipe out the tables for MDS rank 1 so that it is broken and can't
        # start: this is the simulated failure from which we will demonstrate
        # that the disaster recovery tools can get us back.
        self.fs.erase_metadata_objects(prefix="mds1_")
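        # (The "mds1_" prefix covers rank 1's per-rank table objects in the
        # metadata pool, e.g. mds1_inotable and mds1_sessionmap; with those
        # gone the rank cannot come up again.)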

        # Try to access files from the client
        blocked_ls = self.mount_a.run_shell(["ls", "-R"], wait=False)

        # Check that this "ls -R" blocked rather than completing: this indicates
        # it got stuck trying to access subtrees which were on the now-dead MDS.
        log.info("Sleeping to check ls is blocked...")
        time.sleep(60)
        self.assertFalse(blocked_ls.finished)

        # This mount is now useless because it will depend on MDS rank 1, and MDS rank 1
        # is not coming back. Kill it.
        log.info("Killing mount, it's blocked on the MDS we killed")
        self.mount_a.kill()
        self.mount_a.kill_cleanup()
        try:
            # Now that the mount is dead, the ls -R should error out.
            blocked_ls.wait()
        except (CommandFailedError, ConnectionLostError):
            # The ConnectionLostError case is for the kernel client, where
            # killing the mount also means killing the node.
            pass

        # See that the second MDS will crash when it starts and tries to
        # acquire rank 1
        damaged_id = active_mds_names[1]
        self.fs.mds_restart(damaged_id)

        # The daemon taking the damaged rank should begin starting up, then
        # drop back into standby after asking the mon to mark the rank
        # damaged.
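        # (An MDS that finds its on-disk metadata unreadable while taking a
        # rank reports this to the mons, which record the rank in the MDSMap's
        # "damaged" set; the rank then stays out of service until it is
        # repaired and explicitly marked as such.)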
        def is_marked_damaged():
            mds_map = self.fs.get_mds_map()
            return 1 in mds_map['damaged']

        self.wait_until_true(is_marked_damaged, 60)

        def get_state():
            info = self.mds_cluster.get_mds_info(damaged_id)
            return info['state'] if info is not None else None

        self.wait_until_equal(
            get_state,
            "up:standby",
            timeout=60)

        self.fs.mds_stop(damaged_id)
        self.fs.mds_fail(damaged_id)

        # Now give up and go through a disaster recovery procedure
        self.fs.mds_stop(active_mds_names[0])
        self.fs.mds_fail(active_mds_names[0])
        # Invoke recover_dentries quietly, because otherwise the log spews millions of lines
        self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=0, quiet=True)
        self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=1, quiet=True)
        self.fs.table_tool(["0", "reset", "session"])
        self.fs.journal_tool(["journal", "reset"], rank=0)
        self.fs.erase_mds_objects(1)
        self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
                                            '--yes-i-really-mean-it')
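        # ("fs reset" rewrites the MDSMap so the filesystem has a single
        # active rank (rank 0) again, discarding the record of rank 1;
        # together with the recover_dentries / table reset / journal reset
        # steps above, this is the disaster-recovery path back to one MDS.)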

        # Bring an MDS back online, mount a client, and see that we can walk the full
        # filesystem tree again
        self.fs.mds_fail_restart(active_mds_names[0])
        self.wait_until_equal(lambda: self.fs.get_active_names(), [active_mds_names[0]], 30,
                              reject_fn=lambda v: len(v) > 1)
        self.mount_a.mount()
        self.mount_a.run_shell(["ls", "-R"], wait=True)

    def test_table_tool(self):
        active_mdss = self.fs.get_active_names()
        self.assertEqual(len(active_mdss), 1)
        mds_name = active_mdss[0]

        self.mount_a.run_shell(["touch", "foo"])
        self.fs.mds_asok(["flush", "journal"], mds_name)

        log.info(self.fs.table_tool(["all", "show", "inode"]))
        log.info(self.fs.table_tool(["all", "show", "snap"]))
        log.info(self.fs.table_tool(["all", "show", "session"]))

        # Inode table should always be the same because initial state
        # and choice of inode are deterministic.
        # Should see one inode consumed
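        # (1099511627776 is 2**40, the start of the ino range handed out to
        # clients.  The free range below begins 1001 entries later: presumably
        # the one ino consumed by "foo" plus the batch of 1000 the MDS
        # preallocates for the client session (mds_client_prealloc_inos).
        # 2**40 + 1001 == 1099511628777 and 2**40 - 1001 == 1099511626775.)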
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {
                "data": {
                    "version": 2,
                    "inotable": {
                        "projected_free": [
                            {"start": 1099511628777,
                             "len": 1099511626775}],
                        "free": [
                            {"start": 1099511628777,
                             "len": 1099511626775}]}},
                "result": 0}}
        )

        # Should see one session
        session_data = json.loads(self.fs.table_tool(
            ["all", "show", "session"]))
        self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 1)
        self.assertEqual(session_data["0"]["result"], 0)

        # Should see no snaps
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "snap"])),
            {"version": 0,
             "snapserver": {"last_snap": 1,
                            "pending_noop": [],
                            "snaps": [],
                            "need_to_purge": {},
                            "pending_update": [],
                            "pending_destroy": []},
             "result": 0}
        )

        # Reset everything
        for table in ["session", "inode", "snap"]:
            self.fs.table_tool(["all", "reset", table])

        log.info(self.fs.table_tool(["all", "show", "inode"]))
        log.info(self.fs.table_tool(["all", "show", "snap"]))
        log.info(self.fs.table_tool(["all", "show", "session"]))

        # Should see 0 sessions
        session_data = json.loads(self.fs.table_tool(
            ["all", "show", "session"]))
        self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 0)
        self.assertEqual(session_data["0"]["result"], 0)

        # Should see entire inode range now marked free
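        # (i.e. the free range is back to start 2**40 with length 2**40,
        # covering the whole client ino range again.)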
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {"data": {"version": 1,
                            "inotable": {"projected_free": [
                                {"start": 1099511627776,
                                 "len": 1099511627776}],
                                "free": [
                                    {"start": 1099511627776,
                                     "len": 1099511627776}]}},
                   "result": 0}}
        )

        # Should see no snaps
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "snap"])),
            {"version": 1,
             "snapserver": {"last_snap": 1,
                            "pending_noop": [],
                            "snaps": [],
                            "need_to_purge": {},
                            "pending_update": [],
                            "pending_destroy": []},
             "result": 0}
        )

    def test_table_tool_take_inos(self):
        initial_range_start = 1099511627776
        initial_range_len = 1099511627776
        # Initially a completely clear range
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {"data": {"version": 0,
                            "inotable": {"projected_free": [
                                {"start": initial_range_start,
                                 "len": initial_range_len}],
                                "free": [
                                    {"start": initial_range_start,
                                     "len": initial_range_len}]}},
                   "result": 0}}
        )

        # Remove some
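        # (take_inos marks every ino up to and including the given value as
        # used, so after taking initial_range_start + 100 the free range is
        # expected to start at +101 and be 101 entries shorter.)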
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "take_inos", "{0}".format(initial_range_start + 100)])),
            {"0": {"data": {"version": 1,
                            "inotable": {"projected_free": [
                                {"start": initial_range_start + 101,
                                 "len": initial_range_len - 101}],
                                "free": [
                                    {"start": initial_range_start + 101,
                                     "len": initial_range_len - 101}]}},
                   "result": 0}}
        )

    @for_teuthology  # Hack: "for_teuthology" because .sh doesn't work outside teuth
    def test_journal_smoke(self):
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "fs/misc/trivial_sync.sh"],
            },
            "timeout": "1h"
        })

        for mount in self.mounts:
            mount.umount_wait()

        self.fs.mds_stop()
        self.fs.mds_fail()

        # journal tool smoke
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "suites/cephfs_journal_tool_smoke.sh"],
            },
            "timeout": "1h"
        })

        self.fs.mds_restart()
        self.fs.wait_for_daemons()

        self.mount_a.mount()

        # Trivial sync on mount_a
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "fs/misc/trivial_sync.sh"],
            },
            "timeout": "1h"
        })