2"""
3Test our tools for recovering the content of damaged journals
4"""
5
6import json
7import logging
8from textwrap import dedent
9import time
10
11from teuthology.exceptions import CommandFailedError, ConnectionLostError
12from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
13from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
14from tasks.workunit import task as workunit
15
16log = logging.getLogger(__name__)


class TestJournalRepair(CephFSTestCase):
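    # Two MDS daemons are needed because test_reset runs the filesystem
    # with two active ranks (max_mds = 2)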
    MDSS_REQUIRED = 2

    def test_inject_to_empty(self):
        """
        That when some dentries are in the journal but nothing is in
        the backing store, we correctly populate the backing store
        from the journalled dentries.
        """

        # Inject metadata operations
        self.mount_a.run_shell(["touch", "rootfile"])
        self.mount_a.run_shell(["mkdir", "subdir"])
        self.mount_a.run_shell(["touch", "subdir/subdirfile"])
        # There are several different paths for handling hardlinks, depending
        # on whether an existing dentry (being overwritten) is also a hardlink
        self.mount_a.run_shell(["mkdir", "linkdir"])

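        # (a "remote" dentry is how CephFS represents an extra hard link: it
        # records just the ino of a target whose primary dentry lives elsewhere)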
        # Test inode -> remote transition for a dentry
        self.mount_a.run_shell(["touch", "linkdir/link0"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link0"])
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link0"])

        # Test nothing -> remote transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link1"])

        # Test remote -> inode transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link2"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link2"])
        self.mount_a.run_shell(["touch", "linkdir/link2"])

        # Test remote -> a different remote transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link3"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link3"])
        self.mount_a.run_shell(["ln", "rootfile", "linkdir/link3"])

        # Test an empty directory
        self.mount_a.run_shell(["mkdir", "subdir/subsubdir"])
        self.mount_a.run_shell(["sync"])

        # Before we unmount, make a note of the inode numbers; later we will
        # check that they match what we recover from the journal
        rootfile_ino = self.mount_a.path_to_ino("rootfile")
        subdir_ino = self.mount_a.path_to_ino("subdir")
        linkdir_ino = self.mount_a.path_to_ino("linkdir")
        subdirfile_ino = self.mount_a.path_to_ino("subdir/subdirfile")
        subsubdir_ino = self.mount_a.path_to_ino("subdir/subsubdir")

        self.mount_a.umount_wait()

        # Stop the MDS
        self.fs.mds_stop()
        self.fs.mds_fail()

        # Now, the journal should contain the operations, but the backing
        # store shouldn't
        with self.assertRaises(ObjectNotFound):
            self.fs.list_dirfrag(subdir_ino)
        self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])

        # Execute the dentry recovery; this should populate the backing store
        self.fs.journal_tool(['event', 'recover_dentries', 'list'])

        # Dentries in ROOT_INO are present
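        # ("_head" in the dirfrag keys below denotes the live, non-snapshotted
        # version of a dentry)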
        self.assertEqual(sorted(self.fs.list_dirfrag(ROOT_INO)), sorted(['rootfile_head', 'subdir_head', 'linkdir_head']))
        self.assertEqual(self.fs.list_dirfrag(subdir_ino), ['subdirfile_head', 'subsubdir_head'])
        self.assertEqual(sorted(self.fs.list_dirfrag(linkdir_ino)),
                         sorted(['link0_head', 'link1_head', 'link2_head', 'link3_head']))

        # Now check the MDS can read what we wrote: truncate the journal
        # and start the MDS.
        self.fs.journal_tool(['journal', 'reset'])
        self.fs.mds_fail_restart()
        self.fs.wait_for_daemons()

        # List files
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

        # First ls -R to populate MDCache, such that hardlinks will
        # resolve properly (recover_dentries does not create backtraces,
        # so ordinarily hardlinks to inodes that happen not to have backtraces
        # will be invisible in readdir).
        # FIXME: hook in forward scrub here to regenerate backtraces
        self.mount_a.run_shell(['ls', '-R'])
        self.mount_a.umount_wait()  # remount to clear client cache before our second ls
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

        proc = self.mount_a.run_shell(['ls', '-R'])
        self.assertEqual(proc.stdout.getvalue().strip(),
                         dedent("""
                                .:
                                linkdir
                                rootfile
                                subdir

                                ./linkdir:
                                link0
                                link1
                                link2
                                link3

                                ./subdir:
                                subdirfile
                                subsubdir

                                ./subdir/subsubdir:
                                """).strip())

        # Check the correct inos were preserved by path
        self.assertEqual(rootfile_ino, self.mount_a.path_to_ino("rootfile"))
        self.assertEqual(subdir_ino, self.mount_a.path_to_ino("subdir"))
        self.assertEqual(subdirfile_ino, self.mount_a.path_to_ino("subdir/subdirfile"))
        self.assertEqual(subsubdir_ino, self.mount_a.path_to_ino("subdir/subsubdir"))

        # Check that the hard link handling came out correctly
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link0"), subdirfile_ino)
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link1"), subdirfile_ino)
        self.assertNotEqual(self.mount_a.path_to_ino("linkdir/link2"), subdirfile_ino)
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link3"), rootfile_ino)

        # Create a new file, ensure it is not issued the same ino as one of the
        # recovered ones
        self.mount_a.run_shell(["touch", "afterwards"])
        new_ino = self.mount_a.path_to_ino("afterwards")
        self.assertNotIn(new_ino, [rootfile_ino, subdir_ino, linkdir_ino,
                                   subdirfile_ino, subsubdir_ino])

        # Check that we can do metadata ops in the recovered directory
        self.mount_a.run_shell(["touch", "subdir/subsubdir/subsubdirfile"])

    @for_teuthology  # 308s
    def test_reset(self):
        """
        That after forcibly modifying the backing store, we can get back into
        a good state by resetting the MDSMap.

        The scenario is that we have two active MDSs, and we lose the journals. Once
        we have completely lost confidence in the integrity of the metadata, we want to
        return the system to a single-MDS state to go into a scrub to recover what we
        can.
        """

        # Set max_mds to 2
        self.fs.set_max_mds(2)

        # See that we have two active MDSs
        self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
                              reject_fn=lambda v: v > 2 or v < 1)
        active_mds_names = self.fs.get_active_names()

        # Switch off any unneeded MDS daemons
        for unneeded_mds in set(self.mds_cluster.mds_ids) - set(active_mds_names):
            self.mds_cluster.mds_stop(unneeded_mds)
            self.mds_cluster.mds_fail(unneeded_mds)

        # Create a dir on each rank
        self.mount_a.run_shell(["mkdir", "alpha"])
        self.mount_a.run_shell(["mkdir", "bravo"])
        self.mount_a.setfattr("alpha/", "ceph.dir.pin", "0")
        self.mount_a.setfattr("bravo/", "ceph.dir.pin", "1")
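        # (the ceph.dir.pin vxattr pins a directory's subtree to the given
        # MDS rank: /alpha should stay on rank 0 and /bravo move to rank 1)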

        def subtrees_assigned():
            got_subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=active_mds_names[0])

            for s in got_subtrees:
                if s['dir']['path'] == '/bravo':
                    if s['auth_first'] == 1:
                        return True
                    else:
                        # Should not happen
                        raise RuntimeError("/bravo is a subtree but not on rank 1!")

            return False

        # Ensure the pinning has taken effect and the /bravo dir is now
        # migrated to rank 1.
        self.wait_until_true(subtrees_assigned, 30)

        # Do some IO (this should be split across ranks according to
        # the rank-pinned dirs)
        self.mount_a.create_n_files("alpha/file", 1000)
        self.mount_a.create_n_files("bravo/file", 1000)

        # Flush the journals so that we have some backing store data
        # belonging to one MDS, and some to the other MDS.
        for mds_name in active_mds_names:
            self.fs.mds_asok(["flush", "journal"], mds_name)

        # Stop (hard) the second MDS daemon
        self.fs.mds_stop(active_mds_names[1])

        # Wipe out the tables for MDS rank 1 so that it is broken and can't start
        # (this is the simulated failure that we will use to demonstrate that the
        # disaster recovery tools can get us back to a working state)
        self.fs.erase_metadata_objects(prefix="mds1_")
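        # (rank 1's table objects in the metadata pool all carry the "mds1_"
        # prefix, e.g. mds1_inotable and mds1_sessionmap)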

        # Try to access files from the client
        blocked_ls = self.mount_a.run_shell(["ls", "-R"], wait=False)

        # Check that this "ls -R" blocked rather than completing: indicates
        # it got stuck trying to access subtrees which were on the now-dead MDS.
        log.info("Sleeping to check ls is blocked...")
        time.sleep(60)
        self.assertFalse(blocked_ls.finished)

        # This mount is now useless because it will depend on MDS rank 1, and MDS rank 1
        # is not coming back. Kill it.
        log.info("Killing mount, it's blocked on the MDS we killed")
        self.mount_a.kill()
        self.mount_a.kill_cleanup()
        try:
            # Now that the mount is dead, the ls -R should error out.
            blocked_ls.wait()
        except (CommandFailedError, ConnectionLostError):
            # The ConnectionLostError case is for kernel client, where
            # killing the mount also means killing the node.
            pass

        # See that the second MDS will crash when it starts and tries to
        # acquire rank 1
        damaged_id = active_mds_names[1]
        self.fs.mds_restart(damaged_id)

        # The daemon taking the damaged rank should begin starting up, then
        # drop back into standby after asking the mon to mark the rank
        # damaged.
        def is_marked_damaged():
            mds_map = self.fs.get_mds_map()
            return 1 in mds_map['damaged']

        self.wait_until_true(is_marked_damaged, 60)

        def get_state():
            info = self.mds_cluster.get_mds_info(damaged_id)
            return info['state'] if info is not None else None

        self.wait_until_equal(
            get_state,
            "up:standby",
            timeout=60)

        self.fs.mds_stop(damaged_id)
        self.fs.mds_fail(damaged_id)

        # Now give up and go through a disaster recovery procedure
        self.fs.mds_stop(active_mds_names[0])
        self.fs.mds_fail(active_mds_names[0])
        # Invoke recover_dentries quietly, because otherwise the log spews millions of lines
        self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=0, quiet=True)
        self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=1, quiet=True)
        self.fs.table_tool(["0", "reset", "session"])
        self.fs.journal_tool(["journal", "reset"], rank=0)
        self.fs.erase_mds_objects(1)
        self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
                                            '--yes-i-really-mean-it')
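        # ("fs reset" rewrites the MDSMap back to a fresh single-rank state,
        # leaving the data and metadata pools in place)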

        # Bring an MDS back online, mount a client, and see that we can walk the full
        # filesystem tree again
        self.fs.mds_fail_restart(active_mds_names[0])
        self.wait_until_equal(lambda: self.fs.get_active_names(), [active_mds_names[0]], 30,
                              reject_fn=lambda v: len(v) > 1)
        self.mount_a.mount()
        self.mount_a.run_shell(["ls", "-R"], wait=True)

    def test_table_tool(self):
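        """
        That the table_tool can dump the inode, snap and session tables of a
        live filesystem, and reset them back to a clean state.
        """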
        active_mdss = self.fs.get_active_names()
        self.assertEqual(len(active_mdss), 1)
        mds_name = active_mdss[0]

        self.mount_a.run_shell(["touch", "foo"])
        self.fs.mds_asok(["flush", "journal"], mds_name)

        log.info(self.fs.table_tool(["all", "show", "inode"]))
        log.info(self.fs.table_tool(["all", "show", "snap"]))
        log.info(self.fs.table_tool(["all", "show", "session"]))

        # Inode table should always be the same because initial state
        # and choice of inode are deterministic.
        # Should see one inode consumed
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {
                "data": {
                    "version": 2,
                    "inotable": {
                        "projected_free": [
                            {"start": 1099511628777,
                             "len": 1099511626775}],
                        "free": [
                            {"start": 1099511628777,
                             "len": 1099511626775}]}},
                "result": 0}}
        )
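        # (the free range starts at 2**40 + 1001: one ino went to "foo" and,
        # assuming the default mds_client_prealloc_inos of 1000, the rest were
        # preallocated to the client session)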

        # Should see one session
        session_data = json.loads(self.fs.table_tool(
            ["all", "show", "session"]))
        self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 1)
        self.assertEqual(session_data["0"]["result"], 0)

        # Should see no snaps
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "snap"])),
            {"version": 0,
             "snapserver": {"last_snap": 1,
                            "pending_noop": [],
                            "snaps": [],
                            "need_to_purge": {},
                            "pending_update": [],
                            "pending_destroy": []},
             "result": 0}
        )

        # Reset everything
        for table in ["session", "inode", "snap"]:
            self.fs.table_tool(["all", "reset", table])

        log.info(self.fs.table_tool(["all", "show", "inode"]))
        log.info(self.fs.table_tool(["all", "show", "snap"]))
        log.info(self.fs.table_tool(["all", "show", "session"]))

        # Should see 0 sessions
        session_data = json.loads(self.fs.table_tool(
            ["all", "show", "session"]))
        self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 0)
        self.assertEqual(session_data["0"]["result"], 0)

        # Should see entire inode range now marked free
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {"data": {"version": 1,
                            "inotable": {"projected_free": [
                                {"start": 1099511627776,
                                 "len": 1099511627776}],
                                "free": [
                                    {"start": 1099511627776,
                                     "len": 1099511627776}]}},
                   "result": 0}}
        )

        # Should see no snaps
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "snap"])),
            {"version": 1,
             "snapserver": {"last_snap": 1,
                            "pending_noop": [],
                            "snaps": [],
                            "need_to_purge": {},
                            "pending_update": [],
                            "pending_destroy": []},
             "result": 0}
        )

    def test_table_tool_take_inos(self):
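        """
        That take_inos marks all inos up to and including the given maximum
        as in use, removing them from the free range.
        """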
        initial_range_start = 1099511627776
        initial_range_len = 1099511627776
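        # (1099511627776 is 2**40: the inotable's free range starts there,
        # lower ino numbers being reserved for the MDS's own use)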
        # Initially a completely clear range
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {"data": {"version": 0,
                            "inotable": {"projected_free": [
                                {"start": initial_range_start,
                                 "len": initial_range_len}],
                                "free": [
                                    {"start": initial_range_start,
                                     "len": initial_range_len}]}},
                   "result": 0}}
        )

        # Remove some
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "take_inos", "{0}".format(initial_range_start + 100)])),
            {"0": {"data": {"version": 1,
                            "inotable": {"projected_free": [
                                {"start": initial_range_start + 101,
                                 "len": initial_range_len - 101}],
                                "free": [
                                    {"start": initial_range_start + 101,
                                     "len": initial_range_len - 101}]}},
                   "result": 0}}
        )
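        # (take_inos consumed inos initial_range_start through
        # initial_range_start + 100 inclusive, i.e. 101 of them, so the free
        # range shrinks by 101)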

    @for_teuthology  # Hack: "for_teuthology" because .sh doesn't work outside teuth
    def test_journal_smoke(self):
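        """
        That the journal tool's smoke-test workunit passes against a
        filesystem whose journal contains some metadata, and that the
        filesystem still works afterwards.
        """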
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "fs/misc/trivial_sync.sh"],
            },
            "timeout": "1h"
        })

        for mount in self.mounts:
            mount.umount_wait()

        self.fs.mds_stop()
        self.fs.mds_fail()

        # journal tool smoke
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "suites/cephfs_journal_tool_smoke.sh"],
            },
            "timeout": "1h"
        })

        self.fs.mds_restart()
        self.fs.wait_for_daemons()

        self.mount_a.mount()

        # trivial sync on mount a
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "fs/misc/trivial_sync.sh"],
            },
            "timeout": "1h"
        })