"""
Test our tools for recovering the content of damaged journals
"""

import json
import logging
from textwrap import dedent
import time

from teuthology.exceptions import CommandFailedError, ConnectionLostError
from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
from tasks.workunit import task as workunit

log = logging.getLogger(__name__)


class TestJournalRepair(CephFSTestCase):
    MDSS_REQUIRED = 2

    def test_inject_to_empty(self):
23 """
24 That when some dentries in the journal but nothing is in
25 the backing store, we correctly populate the backing store
26 from the journalled dentries.
27 """

        # Inject metadata operations
        self.mount_a.run_shell(["touch", "rootfile"])
        self.mount_a.run_shell(["mkdir", "subdir"])
        self.mount_a.run_shell(["touch", "subdir/subdirfile"])
        # There are several different paths for handling hardlinks, depending
        # on whether an existing dentry (being overwritten) is also a hardlink
        self.mount_a.run_shell(["mkdir", "linkdir"])

        # Test inode -> remote transition for a dentry
        self.mount_a.run_shell(["touch", "linkdir/link0"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link0"])
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link0"])

        # Test nothing -> remote transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link1"])

        # Test remote -> inode transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link2"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link2"])
        self.mount_a.run_shell(["touch", "linkdir/link2"])

        # Test remote -> diff remote transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link3"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link3"])
        self.mount_a.run_shell(["ln", "rootfile", "linkdir/link3"])

        # Test an empty directory
        self.mount_a.run_shell(["mkdir", "subdir/subsubdir"])
        self.mount_a.run_shell(["sync"])
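        # (Presumably the explicit sync is what makes this setup meaningful:
        # it pushes the client's dirty metadata to the MDS so that everything
        # above is journalled, without requiring the MDS to have flushed the
        # journal down to the backing store yet.)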

        # Before we unmount, make a note of the inode numbers, later we will
        # check that they match what we recover from the journal
        rootfile_ino = self.mount_a.path_to_ino("rootfile")
        subdir_ino = self.mount_a.path_to_ino("subdir")
        linkdir_ino = self.mount_a.path_to_ino("linkdir")
        subdirfile_ino = self.mount_a.path_to_ino("subdir/subdirfile")
        subsubdir_ino = self.mount_a.path_to_ino("subdir/subsubdir")

        self.mount_a.umount_wait()

        # Stop the MDS
        self.fs.mds_stop()
        self.fs.mds_fail()

        # Now, the journal should contain the operations, but the backing
        # store shouldn't
        with self.assertRaises(ObjectNotFound):
            self.fs.list_dirfrag(subdir_ino)
        self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
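        # (list_dirfrag reads the directory's dirfrag object in the metadata
        # pool, where dentries are stored as omap entries named "<name>_head",
        # so ObjectNotFound here means the new directory was never written to
        # the backing store.)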

        # Execute the dentry recovery, this should populate the backing store
        self.fs.journal_tool(['event', 'recover_dentries', 'list'], 0)
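        # (journal_tool() here wraps the cephfs-journal-tool CLI; "event
        # recover_dentries" scans the rank's journal and writes any dentries
        # and inodes it finds back into the dirfrag objects, which is what
        # repopulates the backing store. The trailing 0 selects rank 0.)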

        # Dentries in ROOT_INO are present
        self.assertEqual(sorted(self.fs.list_dirfrag(ROOT_INO)), sorted(['rootfile_head', 'subdir_head', 'linkdir_head']))
        self.assertEqual(self.fs.list_dirfrag(subdir_ino), ['subdirfile_head', 'subsubdir_head'])
        self.assertEqual(sorted(self.fs.list_dirfrag(linkdir_ino)),
                         sorted(['link0_head', 'link1_head', 'link2_head', 'link3_head']))

        # Now check the MDS can read what we wrote: truncate the journal
        # and start the mds.
        self.fs.journal_tool(['journal', 'reset'], 0)
        self.fs.mds_fail_restart()
        self.fs.wait_for_daemons()

        # List files
        self.mount_a.mount_wait()

        # First ls -R to populate MDCache, such that hardlinks will
        # resolve properly (recover_dentries does not create backtraces,
        # so ordinarily hardlinks to inodes that happen not to have backtraces
        # will be invisible in readdir).
        # FIXME: hook in forward scrub here to regenerate backtraces
        proc = self.mount_a.run_shell(['ls', '-R'])
        self.mount_a.umount_wait()  # remount to clear client cache before our second ls
        self.mount_a.mount_wait()

        proc = self.mount_a.run_shell(['ls', '-R'])
        self.assertEqual(proc.stdout.getvalue().strip(),
                         dedent("""
                         .:
                         linkdir
                         rootfile
                         subdir

                         ./linkdir:
                         link0
                         link1
                         link2
                         link3

                         ./subdir:
                         subdirfile
                         subsubdir

                         ./subdir/subsubdir:
                         """).strip())

        # Check the correct inos were preserved by path
        self.assertEqual(rootfile_ino, self.mount_a.path_to_ino("rootfile"))
        self.assertEqual(subdir_ino, self.mount_a.path_to_ino("subdir"))
        self.assertEqual(subdirfile_ino, self.mount_a.path_to_ino("subdir/subdirfile"))
        self.assertEqual(subsubdir_ino, self.mount_a.path_to_ino("subdir/subsubdir"))

        # Check that the hard link handling came out correctly
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link0"), subdirfile_ino)
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link1"), subdirfile_ino)
        self.assertNotEqual(self.mount_a.path_to_ino("linkdir/link2"), subdirfile_ino)
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link3"), rootfile_ino)

        # Create a new file, ensure it is not issued the same ino as one of the
        # recovered ones
        self.mount_a.run_shell(["touch", "afterwards"])
        new_ino = self.mount_a.path_to_ino("afterwards")
        self.assertNotIn(new_ino, [rootfile_ino, subdir_ino, subdirfile_ino])

        # Check that we can do metadata ops in the recovered directory
        self.mount_a.run_shell(["touch", "subdir/subsubdir/subsubdirfile"])

    @for_teuthology  # 308s
    def test_reset(self):
        """
        That after forcibly modifying the backing store, we can get back into
        a good state by resetting the MDSMap.

        The scenario is that we have two active MDSs and we lose the journals.
        Once we have completely lost confidence in the integrity of the
        metadata, we return the system to a single-MDS state and then scrub it
        to recover what we can.
        """

        # Set max_mds to 2
        self.fs.set_max_mds(2)

        # See that we have two active MDSs
        self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
                              reject_fn=lambda v: v > 2 or v < 1)
        active_mds_names = self.fs.get_active_names()

        # Switch off any unneeded MDS daemons
        for unneeded_mds in set(self.mds_cluster.mds_ids) - set(active_mds_names):
            self.mds_cluster.mds_stop(unneeded_mds)
            self.mds_cluster.mds_fail(unneeded_mds)

        # Create a dir on each rank
        self.mount_a.run_shell(["mkdir", "alpha"])
        self.mount_a.run_shell(["mkdir", "bravo"])
        self.mount_a.setfattr("alpha/", "ceph.dir.pin", "0")
        self.mount_a.setfattr("bravo/", "ceph.dir.pin", "1")
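        # (ceph.dir.pin is the export-pin xattr: setting it to an MDS rank asks
        # the cluster to migrate authority for that subtree to the given rank,
        # so alpha should end up on rank 0 and bravo on rank 1.)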

        def subtrees_assigned():
            got_subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=active_mds_names[0])

            for s in got_subtrees:
                if s['dir']['path'] == '/bravo':
                    if s['auth_first'] == 1:
                        return True
                    else:
                        # Should not happen
                        raise RuntimeError("/bravo is subtree but not rank 1!")

            return False

        # Ensure the pinning has taken effect and the /bravo dir is now
        # migrated to rank 1.
        self.wait_until_true(subtrees_assigned, 30)

        # Do some IO (this should be split across ranks according to
        # the rank-pinned dirs)
        self.mount_a.create_n_files("alpha/file", 1000)
        self.mount_a.create_n_files("bravo/file", 1000)

        # Flush the journals so that we have some backing store data
        # belonging to one MDS, and some to the other MDS.
        for mds_name in active_mds_names:
            self.fs.mds_asok(["flush", "journal"], mds_name)
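        # (The "flush journal" admin-socket command makes each rank write its
        # journalled metadata down to the backing objects and trim the journal,
        # so at this point both ranks own real state in the metadata pool.)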

        # Stop (hard) the second MDS daemon
        self.fs.mds_stop(active_mds_names[1])

        # Wipe out the tables for MDS rank 1 so that it is broken and can't start
        # (this is the simulated failure from which we will demonstrate that
        # the disaster recovery tools can get us back)
        self.fs.erase_metadata_objects(prefix="mds1_")
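        # (Rank 1's table objects -- e.g. mds1_inotable and mds1_sessionmap --
        # live in the metadata pool under the "mds1_" name prefix, so deleting
        # everything with that prefix leaves rank 1 unable to load its tables
        # when it tries to start.)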

        # Try to access files from the client
        blocked_ls = self.mount_a.run_shell(["ls", "-R"], wait=False)

        # Check that this "ls -R" blocked rather than completing: indicates
        # it got stuck trying to access subtrees which were on the now-dead MDS.
        log.info("Sleeping to check ls is blocked...")
        time.sleep(60)
        self.assertFalse(blocked_ls.finished)

        # This mount is now useless because it will depend on MDS rank 1, and MDS rank 1
        # is not coming back. Kill it.
        log.info("Killing mount, it's blocked on the MDS we killed")
        self.mount_a.kill()
        self.mount_a.kill_cleanup()
        try:
            # Now that the mount is dead, the ls -R should error out.
            blocked_ls.wait()
        except (CommandFailedError, ConnectionLostError):
            # The ConnectionLostError case is for kernel client, where
            # killing the mount also means killing the node.
            pass

        # See that the second MDS will crash when it starts and tries to
        # acquire rank 1
        damaged_id = active_mds_names[1]
        self.fs.mds_restart(damaged_id)

        # The daemon taking the damaged rank should begin to start up, then
        # drop back into standby after asking the mon to mark the rank
        # damaged.
        def is_marked_damaged():
            mds_map = self.fs.get_mds_map()
            return 1 in mds_map['damaged']

        self.wait_until_true(is_marked_damaged, 60)

        def get_state():
            info = self.mds_cluster.get_mds_info(damaged_id)
            return info['state'] if info is not None else None

        self.wait_until_equal(
            get_state,
            "up:standby",
            timeout=60)

        self.fs.mds_stop(damaged_id)
        self.fs.mds_fail(damaged_id)

        # Now give up and go through a disaster recovery procedure
        self.fs.mds_stop(active_mds_names[0])
        self.fs.mds_fail(active_mds_names[0])
        # Invoke recover_dentries quietly, because otherwise log spews millions of lines
        self.fs.journal_tool(["event", "recover_dentries", "summary"], 0, quiet=True)
        self.fs.journal_tool(["event", "recover_dentries", "summary"], 1, quiet=True)
        self.fs.table_tool(["0", "reset", "session"])
        self.fs.journal_tool(["journal", "reset"], 0)
        self.fs.erase_mds_objects(1)
        self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
                                            '--yes-i-really-mean-it')
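        # (This mirrors the documented CephFS disaster recovery sequence:
        # recover dentries from both ranks' journals, reset the session table
        # and the rank 0 journal, discard rank 1's objects, and finally run
        # "ceph fs reset", which collapses the MDS map back to a single rank
        # without touching the data or metadata pools.)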

        # Bring an MDS back online, mount a client, and see that we can walk the full
        # filesystem tree again
        self.fs.mds_fail_restart(active_mds_names[0])
        self.wait_until_equal(lambda: self.fs.get_active_names(), [active_mds_names[0]], 30,
                              reject_fn=lambda v: len(v) > 1)
        self.mount_a.mount_wait()
        self.mount_a.run_shell(["ls", "-R"], wait=True)

    def test_table_tool(self):
        active_mdss = self.fs.get_active_names()
        self.assertEqual(len(active_mdss), 1)
        mds_name = active_mdss[0]

        self.mount_a.run_shell(["touch", "foo"])
        self.fs.mds_asok(["flush", "journal"], mds_name)

        log.info(self.fs.table_tool(["all", "show", "inode"]))
        log.info(self.fs.table_tool(["all", "show", "snap"]))
        log.info(self.fs.table_tool(["all", "show", "session"]))

        # Inode table should always be the same because initial state
        # and choice of inode are deterministic.
        # Should see one inode consumed
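        # (1099511627776 is 2^40, the start of the client-usable inode range;
        # the free range below begins 1001 inodes later, which is presumably
        # the one inode used for "foo" plus the session's preallocation chunk,
        # mds_client_prealloc_inos, 1000 by default.)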
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {
                "data": {
                    "version": 2,
                    "inotable": {
                        "projected_free": [
                            {"start": 1099511628777,
                             "len": 1099511626775}],
                        "free": [
                            {"start": 1099511628777,
                             "len": 1099511626775}]}},
                "result": 0}}
        )

        # Should see one session
        session_data = json.loads(self.fs.table_tool(
            ["all", "show", "session"]))
        self.assertEqual(len(session_data["0"]["data"]["sessions"]), 1)
        self.assertEqual(session_data["0"]["result"], 0)

        # Should see no snaps
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "snap"])),
            {"version": 1,
             "snapserver": {"last_snap": 1,
                            "last_created": 1,
                            "last_destroyed": 1,
                            "pending_noop": [],
                            "snaps": [],
                            "need_to_purge": {},
                            "pending_update": [],
                            "pending_destroy": []},
             "result": 0}
        )

        # Reset everything
        for table in ["session", "inode", "snap"]:
            self.fs.table_tool(["all", "reset", table])
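        # (table_tool wraps the cephfs-table-tool CLI; "all reset <table>"
        # reinitializes the named table for every rank, which is why the
        # session table is empty and the whole 2^40-wide inode range shows up
        # as free again in the checks below.)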

        log.info(self.fs.table_tool(["all", "show", "inode"]))
        log.info(self.fs.table_tool(["all", "show", "snap"]))
        log.info(self.fs.table_tool(["all", "show", "session"]))

        # Should see 0 sessions
        session_data = json.loads(self.fs.table_tool(
            ["all", "show", "session"]))
        self.assertEqual(len(session_data["0"]["data"]["sessions"]), 0)
        self.assertEqual(session_data["0"]["result"], 0)

        # Should see entire inode range now marked free
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {"data": {"version": 1,
                            "inotable": {"projected_free": [
                                {"start": 1099511627776,
                                 "len": 1099511627776}],
                                "free": [
                                    {"start": 1099511627776,
                                     "len": 1099511627776}]}},
                   "result": 0}}
        )

        # Should see no snaps
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "snap"])),
            {"version": 1,
             "snapserver": {"last_snap": 1,
                            "last_created": 1,
                            "last_destroyed": 1,
                            "pending_noop": [],
                            "snaps": [],
                            "need_to_purge": {},
                            "pending_update": [],
                            "pending_destroy": []},
             "result": 0}
        )

    def test_table_tool_take_inos(self):
        initial_range_start = 1099511627776
        initial_range_len = 1099511627776
        # Initially a completely clear range
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {"data": {"version": 0,
                            "inotable": {"projected_free": [
                                {"start": initial_range_start,
                                 "len": initial_range_len}],
                                "free": [
                                    {"start": initial_range_start,
                                     "len": initial_range_len}]}},
                   "result": 0}}
        )

        # Remove some
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "take_inos", "{0}".format(initial_range_start + 100)])),
            {"0": {"data": {"version": 1,
                            "inotable": {"projected_free": [
                                {"start": initial_range_start + 101,
                                 "len": initial_range_len - 101}],
                                "free": [
                                    {"start": initial_range_start + 101,
                                     "len": initial_range_len - 101}]}},
                   "result": 0}}
        )
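        # (take_inos marks every inode number up to and including the given
        # value as used, which is why the free range shrinks by 101 here:
        # initial_range_start itself plus the next 100 inodes.)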

    @for_teuthology  # Hack: "for_teuthology" because .sh doesn't work outside teuth
    def test_journal_smoke(self):
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "fs/misc/trivial_sync.sh"],
            },
            "timeout": "1h"
        })
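        # (The workunit task runs the named script from qa/workunits on the
        # client mount; trivial_sync.sh does a small amount of metadata I/O
        # and a sync, so there is something in the journal for the tool to
        # exercise.)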

        for mount in self.mounts:
            mount.umount_wait()

        self.fs.mds_stop()
        self.fs.mds_fail()

        # journal tool smoke
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "suites/cephfs_journal_tool_smoke.sh"],
            },
            "timeout": "1h"
        })

        self.fs.mds_restart()
        self.fs.wait_for_daemons()

        self.mount_a.mount_wait()

        # trivial sync workunit on mount a
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "fs/misc/trivial_sync.sh"],
            },
            "timeout": "1h"
        })