2 """
3 Test our tools for recovering the content of damaged journals
4 """
5
6 import json
7 import logging
8 from textwrap import dedent
9 import time
10
11 from teuthology.exceptions import CommandFailedError, ConnectionLostError
12 from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
13 from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
14 from tasks.workunit import task as workunit
15
16 log = logging.getLogger(__name__)


class TestJournalRepair(CephFSTestCase):
    MDSS_REQUIRED = 2

    def test_inject_to_empty(self):
23 """
24 That when some dentries in the journal but nothing is in
25 the backing store, we correctly populate the backing store
26 from the journalled dentries.
27 """

        # Inject metadata operations
        self.mount_a.run_shell(["touch", "rootfile"])
        self.mount_a.run_shell(["mkdir", "subdir"])
        self.mount_a.run_shell(["touch", "subdir/subdirfile"])
        # There are several different paths for handling hardlinks, depending
        # on whether an existing dentry (being overwritten) is also a hardlink
        self.mount_a.run_shell(["mkdir", "linkdir"])

        # Test inode -> remote transition for a dentry
        self.mount_a.run_shell(["touch", "linkdir/link0"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link0"])
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link0"])

        # Test nothing -> remote transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link1"])

        # Test remote -> inode transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link2"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link2"])
        self.mount_a.run_shell(["touch", "linkdir/link2"])

        # Test remote -> diff remote transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link3"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link3"])
        self.mount_a.run_shell(["ln", "rootfile", "linkdir/link3"])

        # Test an empty directory
        self.mount_a.run_shell(["mkdir", "subdir/subsubdir"])
        self.mount_a.run_shell(["sync"])
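        # (The sync above is presumably to make sure all of the operations have
        # reached the MDS before we record inode numbers and unmount.)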

        # Before we unmount, make a note of the inode numbers; later we will
        # check that they match what we recover from the journal
        rootfile_ino = self.mount_a.path_to_ino("rootfile")
        subdir_ino = self.mount_a.path_to_ino("subdir")
        linkdir_ino = self.mount_a.path_to_ino("linkdir")
        subdirfile_ino = self.mount_a.path_to_ino("subdir/subdirfile")
        subsubdir_ino = self.mount_a.path_to_ino("subdir/subsubdir")

        self.mount_a.umount_wait()

        # Stop the MDS
        self.fs.mds_stop()
        self.fs.mds_fail()

        # Now, the journal should contain the operations, but the backing
        # store shouldn't
        with self.assertRaises(ObjectNotFound):
            self.fs.list_dirfrag(subdir_ino)
        self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
        # Execute the dentry recovery; this should populate the backing store
        self.fs.journal_tool(['event', 'recover_dentries', 'list'])

        # Dentries in ROOT_INO are present
        self.assertEqual(sorted(self.fs.list_dirfrag(ROOT_INO)), sorted(['rootfile_head', 'subdir_head', 'linkdir_head']))
        self.assertEqual(self.fs.list_dirfrag(subdir_ino), ['subdirfile_head', 'subsubdir_head'])
        self.assertEqual(sorted(self.fs.list_dirfrag(linkdir_ino)),
                         sorted(['link0_head', 'link1_head', 'link2_head', 'link3_head']))

        # Now check the MDS can read what we wrote: truncate the journal
        # and start the mds.
        self.fs.journal_tool(['journal', 'reset'])
        self.fs.mds_fail_restart()
        self.fs.wait_for_daemons()

        # List files
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

        # First ls -R to populate MDCache, such that hardlinks will
        # resolve properly (recover_dentries does not create backtraces,
        # so ordinarily hardlinks to inodes that happen not to have backtraces
        # will be invisible in readdir).
        # FIXME: hook in forward scrub here to regenerate backtraces
        self.mount_a.run_shell(['ls', '-R'])
        self.mount_a.umount_wait()  # remount to clear client cache before our second ls
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

        proc = self.mount_a.run_shell(['ls', '-R'])
        self.assertEqual(proc.stdout.getvalue().strip(),
                         dedent("""
                             .:
                             linkdir
                             rootfile
                             subdir

                             ./linkdir:
                             link0
                             link1
                             link2
                             link3

                             ./subdir:
                             subdirfile
                             subsubdir

                             ./subdir/subsubdir:
                             """).strip())

        # Check the correct inos were preserved by path
        self.assertEqual(rootfile_ino, self.mount_a.path_to_ino("rootfile"))
        self.assertEqual(subdir_ino, self.mount_a.path_to_ino("subdir"))
        self.assertEqual(subdirfile_ino, self.mount_a.path_to_ino("subdir/subdirfile"))
        self.assertEqual(subsubdir_ino, self.mount_a.path_to_ino("subdir/subsubdir"))

        # Check that the hard link handling came out correctly
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link0"), subdirfile_ino)
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link1"), subdirfile_ino)
        self.assertNotEqual(self.mount_a.path_to_ino("linkdir/link2"), subdirfile_ino)
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link3"), rootfile_ino)

        # Create a new file, and ensure it is not issued the same ino as one
        # of the recovered ones
        self.mount_a.run_shell(["touch", "afterwards"])
        new_ino = self.mount_a.path_to_ino("afterwards")
        self.assertNotIn(new_ino, [rootfile_ino, subdir_ino, subdirfile_ino])

        # Check that we can do metadata ops in the recovered directory
        self.mount_a.run_shell(["touch", "subdir/subsubdir/subsubdirfile"])

    @for_teuthology  # 308s
    def test_reset(self):
        """
        That after forcibly modifying the backing store, we can get back into
        a good state by resetting the MDSMap.

        The scenario is that we have two active MDSs, and we lose the journals.  Once
        we have completely lost confidence in the integrity of the metadata, we want to
        return the system to a single-MDS state and then scrub to recover what we
        can.
        """

        # Set max_mds to 2
        self.fs.set_allow_multimds(True)
        self.fs.set_max_mds(2)

        # See that we have two active MDSs
        self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
                              reject_fn=lambda v: v > 2 or v < 1)
        active_mds_names = self.fs.get_active_names()

        # Switch off any unneeded MDS daemons
        for unneeded_mds in set(self.mds_cluster.mds_ids) - set(active_mds_names):
            self.mds_cluster.mds_stop(unneeded_mds)
            self.mds_cluster.mds_fail(unneeded_mds)

        # Do a bunch of I/O such that at least some will hit the second MDS: create
        # lots of directories so that the balancer should find it easy to make a decision
        # to allocate some of them to the second MDS.
        spammers = []
        for n in range(0, 16):
            dir_name = "spam_{0}".format(n)
            spammers.append(self.mount_a.spam_dir_background(dir_name))

        def subtrees_assigned():
            got_subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=active_mds_names[0])
            rank_1_count = len([s for s in got_subtrees if s['auth_first'] == 1])

            # Greater than 1, because there is typically 1 for ~mds1, and once it
            # has been assigned something in addition to that it means it has been
            # assigned a "real" subtree.
            return rank_1_count > 1

        # We are waiting for the MDS to respond to hot directories, which
        # is not guaranteed to happen at a particular time, so use a lengthy timeout here.
        self.wait_until_true(subtrees_assigned, 600)

        # Flush the journals so that we have some backing store data
        # belonging to one MDS, and some to the other MDS.
        for mds_name in active_mds_names:
            self.fs.mds_asok(["flush", "journal"], mds_name)

        # Stop (hard) the second MDS daemon
        self.fs.mds_stop(active_mds_names[1])

        # Wipe out the tables for MDS rank 1 so that it is broken and can't start
        # (this is the simulated failure that we will demonstrate the disaster
        # recovery tools can recover from)
        self.fs.erase_metadata_objects(prefix="mds1_")

        # Try to access files from the client
        blocked_ls = self.mount_a.run_shell(["ls", "-R"], wait=False)

        # Check that this "ls -R" blocked rather than completing: this indicates
        # it got stuck trying to access subtrees which were on the now-dead MDS.
        log.info("Sleeping to check ls is blocked...")
        time.sleep(60)
        self.assertFalse(blocked_ls.finished)

        # This mount is now useless because it will depend on MDS rank 1, and MDS rank 1
        # is not coming back.  Kill it.
        log.info("Killing mount, it's blocked on the MDS we killed")
        self.mount_a.kill()
        self.mount_a.kill_cleanup()
        try:
            # Now that the mount is dead, the ls -R should error out.
            blocked_ls.wait()
        except (CommandFailedError, ConnectionLostError):
            # The ConnectionLostError case is for the kernel client, where
            # killing the mount also means killing the node.
            pass

        log.info("Terminating spammer processes...")
        for spammer_proc in spammers:
            spammer_proc.stdin.close()
            try:
                spammer_proc.wait()
            except (CommandFailedError, ConnectionLostError):
                # The ConnectionLostError case is for the kernel client, where
                # killing the mount also means killing the node.
                pass

        # See that the second MDS will crash when it starts and tries to
        # acquire rank 1
        damaged_id = active_mds_names[1]
        self.fs.mds_restart(damaged_id)

        # The daemon taking the damaged rank should begin to start up, then
        # go back into standby after asking the mon to mark the rank
        # damaged.
        def is_marked_damaged():
            mds_map = self.fs.get_mds_map()
            return 1 in mds_map['damaged']

        self.wait_until_true(is_marked_damaged, 60)

        def get_state():
            info = self.mds_cluster.get_mds_info(damaged_id)
            return info['state'] if info is not None else None

        self.wait_until_equal(
            get_state,
            "up:standby",
            timeout=60)

        self.fs.mds_stop(damaged_id)
        self.fs.mds_fail(damaged_id)

        # Now give up and go through a disaster recovery procedure
        self.fs.mds_stop(active_mds_names[0])
        self.fs.mds_fail(active_mds_names[0])
        # Invoke recover_dentries quietly, because otherwise the log spews millions of lines
        self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=0, quiet=True)
        self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=1, quiet=True)
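        # After salvaging dentries from both ranks' journals, the session table
        # and the journals themselves can no longer be trusted, so wipe them and
        # drop rank 1 entirely before resetting the filesystem map (roughly the
        # documented CephFS disaster recovery sequence).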
        self.fs.table_tool(["0", "reset", "session"])
        self.fs.journal_tool(["journal", "reset"], rank=0)
        self.fs.erase_mds_objects(1)
        self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
                                            '--yes-i-really-mean-it')

        # Bring an MDS back online, mount a client, and see that we can walk the full
        # filesystem tree again
        self.fs.mds_fail_restart(active_mds_names[0])
        self.wait_until_equal(lambda: self.fs.get_active_names(), [active_mds_names[0]], 30,
                              reject_fn=lambda v: len(v) > 1)
        self.mount_a.mount()
        self.mount_a.run_shell(["ls", "-R"], wait=True)

    def test_table_tool(self):
        active_mdss = self.fs.get_active_names()
        self.assertEqual(len(active_mdss), 1)
        mds_name = active_mdss[0]

        self.mount_a.run_shell(["touch", "foo"])
        self.fs.mds_asok(["flush", "journal"], mds_name)

        log.info(self.fs.table_tool(["all", "show", "inode"]))
        log.info(self.fs.table_tool(["all", "show", "snap"]))
        log.info(self.fs.table_tool(["all", "show", "session"]))

        # Inode table should always be the same because initial state
        # and choice of inode are deterministic.
        # Should see one inode consumed
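        # (Note: the free range below starts 1001 inos past 2**40: presumably the
        # one ino consumed by "foo" plus the inos preallocated for the client session.)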
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {
                "data": {
                    "version": 2,
                    "inotable": {
                        "projected_free": [
                            {"start": 1099511628777,
                             "len": 1099511626775}],
                        "free": [
                            {"start": 1099511628777,
                             "len": 1099511626775}]}},
                "result": 0}}
        )

        # Should see one session
        session_data = json.loads(self.fs.table_tool(
            ["all", "show", "session"]))
        self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 1)
        self.assertEqual(session_data["0"]["result"], 0)

        # Should see no snaps
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "snap"])),
            {"version": 0,
             "snapserver": {"last_snap": 1,
                            "pending_noop": [],
                            "snaps": [],
                            "need_to_purge": {},
                            "pending_update": [],
                            "pending_destroy": []},
             "result": 0}
        )

        # Reset everything
        for table in ["session", "inode", "snap"]:
            self.fs.table_tool(["all", "reset", table])

        log.info(self.fs.table_tool(["all", "show", "inode"]))
        log.info(self.fs.table_tool(["all", "show", "snap"]))
        log.info(self.fs.table_tool(["all", "show", "session"]))

        # Should see 0 sessions
        session_data = json.loads(self.fs.table_tool(
            ["all", "show", "session"]))
        self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 0)
        self.assertEqual(session_data["0"]["result"], 0)

        # Should see entire inode range now marked free
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {"data": {"version": 1,
                            "inotable": {"projected_free": [
                                {"start": 1099511627776,
                                 "len": 1099511627776}],
                                "free": [
                                    {"start": 1099511627776,
                                     "len": 1099511627776}]}},
                   "result": 0}}
        )

        # Should see no snaps
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "snap"])),
            {"version": 1,
             "snapserver": {"last_snap": 1,
                            "pending_noop": [],
                            "snaps": [],
                            "need_to_purge": {},
                            "pending_update": [],
                            "pending_destroy": []},
             "result": 0}
        )

    def test_table_tool_take_inos(self):
        initial_range_start = 1099511627776
        initial_range_len = 1099511627776
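        # 1099511627776 == 2**40: per the assertions below, the inode table's
        # allocatable range starts there and initially spans another 2**40 inos.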
        # Initially a completely clear range
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {"data": {"version": 0,
                            "inotable": {"projected_free": [
                                {"start": initial_range_start,
                                 "len": initial_range_len}],
                                "free": [
                                    {"start": initial_range_start,
                                     "len": initial_range_len}]}},
                   "result": 0}}
        )

        # Consume part of the range: take_inos marks everything up to and
        # including the given ino as used
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "take_inos", "{0}".format(initial_range_start + 100)])),
            {"0": {"data": {"version": 1,
                            "inotable": {"projected_free": [
                                {"start": initial_range_start + 101,
                                 "len": initial_range_len - 101}],
                                "free": [
                                    {"start": initial_range_start + 101,
                                     "len": initial_range_len - 101}]}},
                   "result": 0}}
        )

    @for_teuthology  # Hack: "for_teuthology" because .sh doesn't work outside teuth
    def test_journal_smoke(self):
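        # Run some client I/O first so that the journal has content for the
        # smoke-test script to operate on.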
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "fs/misc/trivial_sync.sh"],
            },
            "timeout": "1h"
        })

        for mount in self.mounts:
            mount.umount_wait()

        self.fs.mds_stop()
        self.fs.mds_fail()

        # journal tool smoke
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "suites/cephfs_journal_tool_smoke.sh"],
            },
            "timeout": "1h"
        })

        self.fs.mds_restart()
        self.fs.wait_for_daemons()

        self.mount_a.mount()

        # trivial sync on mount a
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "fs/misc/trivial_sync.sh"],
            },
            "timeout": "1h"
        })