"""
Test our tools for recovering the content of damaged journals
"""

import json
import logging
from textwrap import dedent
import time

from teuthology.exceptions import CommandFailedError, ConnectionLostError
from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
from tasks.workunit import task as workunit

log = logging.getLogger(__name__)


class TestJournalRepair(CephFSTestCase):
    MDSS_REQUIRED = 2

    def test_inject_to_empty(self):
23 """
24 That when some dentries in the journal but nothing is in
25 the backing store, we correctly populate the backing store
26 from the journalled dentries.
27 """

        # Inject metadata operations
        self.mount_a.run_shell(["touch", "rootfile"])
        self.mount_a.run_shell(["mkdir", "subdir"])
        self.mount_a.run_shell(["touch", "subdir/subdirfile"])
        # There are several different paths for handling hardlinks, depending
        # on whether an existing dentry (being overwritten) is also a hardlink
        self.mount_a.run_shell(["mkdir", "linkdir"])

        # Test inode -> remote transition for a dentry
        self.mount_a.run_shell(["touch", "linkdir/link0"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link0"])
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link0"])

        # Test nothing -> remote transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link1"])

        # Test remote -> inode transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link2"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link2"])
        self.mount_a.run_shell(["touch", "linkdir/link2"])

        # Test remote -> diff remote transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link3"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link3"])
        self.mount_a.run_shell(["ln", "rootfile", "linkdir/link3"])

        # Test an empty directory
        self.mount_a.run_shell(["mkdir", "subdir/subsubdir"])
        self.mount_a.run_shell(["sync"])
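        # (Presumably the explicit sync is what makes this setup meaningful:
        # it pushes the client's dirty metadata to the MDS so that everything
        # above is journalled, without requiring the MDS to have flushed the
        # journal down to the backing store yet.)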

        # Before we unmount, make a note of the inode numbers, later we will
        # check that they match what we recover from the journal
        rootfile_ino = self.mount_a.path_to_ino("rootfile")
        subdir_ino = self.mount_a.path_to_ino("subdir")
        linkdir_ino = self.mount_a.path_to_ino("linkdir")
        subdirfile_ino = self.mount_a.path_to_ino("subdir/subdirfile")
        subsubdir_ino = self.mount_a.path_to_ino("subdir/subsubdir")

        self.mount_a.umount_wait()

        # Stop the MDS
        self.fs.mds_stop()
        self.fs.mds_fail()

        # Now, the journal should contain the operations, but the backing
        # store shouldn't
        with self.assertRaises(ObjectNotFound):
            self.fs.list_dirfrag(subdir_ino)
        self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
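        # (list_dirfrag reads the directory's dirfrag object in the metadata
        # pool, where dentries are stored as omap entries named "<name>_head",
        # so ObjectNotFound here means the new directory was never written to
        # the backing store.)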

        # Execute the dentry recovery, this should populate the backing store
        self.fs.journal_tool(['event', 'recover_dentries', 'list'], 0)
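        # (journal_tool() here wraps the cephfs-journal-tool CLI; "event
        # recover_dentries" scans the rank's journal and writes any dentries
        # and inodes it finds back into the dirfrag objects, which is what
        # repopulates the backing store. The trailing 0 selects rank 0.)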

        # Dentries in ROOT_INO are present
        self.assertEqual(sorted(self.fs.list_dirfrag(ROOT_INO)), sorted(['rootfile_head', 'subdir_head', 'linkdir_head']))
        self.assertEqual(self.fs.list_dirfrag(subdir_ino), ['subdirfile_head', 'subsubdir_head'])
        self.assertEqual(sorted(self.fs.list_dirfrag(linkdir_ino)),
                         sorted(['link0_head', 'link1_head', 'link2_head', 'link3_head']))

        # Now check the MDS can read what we wrote: truncate the journal
        # and start the mds.
        self.fs.journal_tool(['journal', 'reset'], 0)
        self.fs.mds_fail_restart()
        self.fs.wait_for_daemons()

        # List files
        self.mount_a.mount_wait()

        # First ls -R to populate MDCache, such that hardlinks will
        # resolve properly (recover_dentries does not create backtraces,
        # so ordinarily hardlinks to inodes that happen not to have backtraces
        # will be invisible in readdir).
        # FIXME: hook in forward scrub here to regenerate backtraces
        proc = self.mount_a.run_shell(['ls', '-R'])
        self.mount_a.umount_wait()  # remount to clear client cache before our second ls
        self.mount_a.mount_wait()

        proc = self.mount_a.run_shell(['ls', '-R'])
        self.assertEqual(proc.stdout.getvalue().strip(),
                         dedent("""
                         .:
                         linkdir
                         rootfile
                         subdir

                         ./linkdir:
                         link0
                         link1
                         link2
                         link3

                         ./subdir:
                         subdirfile
                         subsubdir

                         ./subdir/subsubdir:
                         """).strip())

        # Check the correct inos were preserved by path
        self.assertEqual(rootfile_ino, self.mount_a.path_to_ino("rootfile"))
        self.assertEqual(subdir_ino, self.mount_a.path_to_ino("subdir"))
        self.assertEqual(subdirfile_ino, self.mount_a.path_to_ino("subdir/subdirfile"))
        self.assertEqual(subsubdir_ino, self.mount_a.path_to_ino("subdir/subsubdir"))

        # Check that the hard link handling came out correctly
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link0"), subdirfile_ino)
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link1"), subdirfile_ino)
        self.assertNotEqual(self.mount_a.path_to_ino("linkdir/link2"), subdirfile_ino)
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link3"), rootfile_ino)

        # Create a new file, ensure it is not issued the same ino as one of the
        # recovered ones
        self.mount_a.run_shell(["touch", "afterwards"])
        new_ino = self.mount_a.path_to_ino("afterwards")
        self.assertNotIn(new_ino, [rootfile_ino, subdir_ino, subdirfile_ino])

        # Check that we can do metadata ops in the recovered directory
        self.mount_a.run_shell(["touch", "subdir/subsubdir/subsubdirfile"])

    @for_teuthology  # 308s
    def test_reset(self):
        """
        That after forcibly modifying the backing store, we can get back into
        a good state by resetting the MDSMap.

        The scenario is that we have two active MDSs and we lose the journals.
        Once we have completely lost confidence in the integrity of the
        metadata, we return the system to a single-MDS state and then scrub it
        to recover what we can.
        """

        # Set max_mds to 2
        self.fs.set_max_mds(2)

        # See that we have two active MDSs
        self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
                              reject_fn=lambda v: v > 2 or v < 1)
        active_mds_names = self.fs.get_active_names()

        # Switch off any unneeded MDS daemons
        for unneeded_mds in set(self.mds_cluster.mds_ids) - set(active_mds_names):
            self.mds_cluster.mds_stop(unneeded_mds)
            self.mds_cluster.mds_fail(unneeded_mds)

        # Create a dir on each rank
        self.mount_a.run_shell(["mkdir", "alpha"])
        self.mount_a.run_shell(["mkdir", "bravo"])
        self.mount_a.setfattr("alpha/", "ceph.dir.pin", "0")
        self.mount_a.setfattr("bravo/", "ceph.dir.pin", "1")
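        # (ceph.dir.pin is the export-pin xattr: setting it to an MDS rank asks
        # the cluster to migrate authority for that subtree to the given rank,
        # so alpha should end up on rank 0 and bravo on rank 1.)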

        def subtrees_assigned():
            got_subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=active_mds_names[0])

            for s in got_subtrees:
                if s['dir']['path'] == '/bravo':
                    if s['auth_first'] == 1:
                        return True
                    else:
                        # Should not happen
                        raise RuntimeError("/bravo is subtree but not rank 1!")

            return False

        # Ensure the pinning has taken effect and the /bravo dir is now
        # migrated to rank 1.
        self.wait_until_true(subtrees_assigned, 30)

        # Do some IO (this should be split across ranks according to
        # the rank-pinned dirs)
        self.mount_a.create_n_files("alpha/file", 1000)
        self.mount_a.create_n_files("bravo/file", 1000)

        # Flush the journals so that we have some backing store data
        # belonging to one MDS, and some to the other MDS.
        for mds_name in active_mds_names:
            self.fs.mds_asok(["flush", "journal"], mds_name)
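        # (The "flush journal" admin-socket command makes each rank write its
        # journalled metadata down to the backing objects and trim the journal,
        # so at this point both ranks own real state in the metadata pool.)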

        # Stop (hard) the second MDS daemon
        self.fs.mds_stop(active_mds_names[1])

        # Wipe out the tables for MDS rank 1 so that it is broken and can't start
        # (this is the simulated failure from which we will demonstrate that
        # the disaster recovery tools can get us back)
        self.fs.erase_metadata_objects(prefix="mds1_")
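        # (Rank 1's table objects -- e.g. mds1_inotable and mds1_sessionmap --
        # live in the metadata pool under the "mds1_" name prefix, so deleting
        # everything with that prefix leaves rank 1 unable to load its tables
        # when it tries to start.)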

        # Try to access files from the client
        blocked_ls = self.mount_a.run_shell(["ls", "-R"], wait=False)

        # Check that this "ls -R" blocked rather than completing: indicates
        # it got stuck trying to access subtrees which were on the now-dead MDS.
        log.info("Sleeping to check ls is blocked...")
        time.sleep(60)
        self.assertFalse(blocked_ls.finished)

        # This mount is now useless because it will depend on MDS rank 1, and MDS rank 1
        # is not coming back. Kill it.
        log.info("Killing mount, it's blocked on the MDS we killed")
        self.mount_a.kill()
        self.mount_a.kill_cleanup()
        try:
            # Now that the mount is dead, the ls -R should error out.
            blocked_ls.wait()
        except (CommandFailedError, ConnectionLostError):
            # The ConnectionLostError case is for kernel client, where
            # killing the mount also means killing the node.
            pass

        # See that the second MDS will crash when it starts and tries to
        # acquire rank 1
        damaged_id = active_mds_names[1]
        self.fs.mds_restart(damaged_id)

        # The daemon taking the damaged rank should begin to start up, then
        # drop back into standby after asking the mon to mark the rank
        # damaged.
        def is_marked_damaged():
            mds_map = self.fs.get_mds_map()
            return 1 in mds_map['damaged']

        self.wait_until_true(is_marked_damaged, 60)

        def get_state():
            info = self.mds_cluster.get_mds_info(damaged_id)
            return info['state'] if info is not None else None

        self.wait_until_equal(
            get_state,
            "up:standby",
            timeout=60)

        self.fs.mds_stop(damaged_id)
        self.fs.mds_fail(damaged_id)

        # Now give up and go through a disaster recovery procedure
        self.fs.mds_stop(active_mds_names[0])
        self.fs.mds_fail(active_mds_names[0])
        # Invoke recover_dentries quietly, because otherwise log spews millions of lines
        self.fs.journal_tool(["event", "recover_dentries", "summary"], 0, quiet=True)
        self.fs.journal_tool(["event", "recover_dentries", "summary"], 1, quiet=True)
        self.fs.table_tool(["0", "reset", "session"])
        self.fs.journal_tool(["journal", "reset"], 0)
        self.fs.erase_mds_objects(1)
        self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
                                            '--yes-i-really-mean-it')
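        # (This mirrors the documented CephFS disaster recovery sequence:
        # recover dentries from both ranks' journals, reset the session table
        # and the rank 0 journal, discard rank 1's objects, and finally run
        # "ceph fs reset", which collapses the MDS map back to a single rank
        # without touching the data or metadata pools.)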

        # Bring an MDS back online, mount a client, and see that we can walk the full
        # filesystem tree again
        self.fs.mds_fail_restart(active_mds_names[0])
        self.wait_until_equal(lambda: self.fs.get_active_names(), [active_mds_names[0]], 30,
                              reject_fn=lambda v: len(v) > 1)
        self.mount_a.mount_wait()
        self.mount_a.run_shell(["ls", "-R"], wait=True)

    def test_table_tool(self):
        active_mdss = self.fs.get_active_names()
        self.assertEqual(len(active_mdss), 1)
        mds_name = active_mdss[0]

        self.mount_a.run_shell(["touch", "foo"])
        self.fs.mds_asok(["flush", "journal"], mds_name)

        log.info(self.fs.table_tool(["all", "show", "inode"]))
        log.info(self.fs.table_tool(["all", "show", "snap"]))
        log.info(self.fs.table_tool(["all", "show", "session"]))

        # Inode table should always be the same because initial state
        # and choice of inode are deterministic.
        # Should see one inode consumed
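        # (1099511627776 is 2^40, the start of the client-usable inode range;
        # the free range below begins 1001 inodes later, which is presumably
        # the one inode used for "foo" plus the session's preallocation chunk,
        # mds_client_prealloc_inos, 1000 by default.)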
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {
                "data": {
                    "version": 2,
                    "inotable": {
                        "projected_free": [
                            {"start": 1099511628777,
                             "len": 1099511626775}],
                        "free": [
                            {"start": 1099511628777,
                             "len": 1099511626775}]}},
                "result": 0}}
        )

        # Should see one session
        session_data = json.loads(self.fs.table_tool(
            ["all", "show", "session"]))
        self.assertEqual(len(session_data["0"]["data"]["sessions"]), 1)
        self.assertEqual(session_data["0"]["result"], 0)

        # Should see no snaps
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "snap"])),
            {"version": 1,
             "snapserver": {"last_snap": 1,
                            "last_created": 1,
                            "last_destroyed": 1,
                            "pending_noop": [],
                            "snaps": [],
                            "need_to_purge": {},
                            "pending_update": [],
                            "pending_destroy": []},
             "result": 0}
        )

        # Reset everything
        for table in ["session", "inode", "snap"]:
            self.fs.table_tool(["all", "reset", table])
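        # (table_tool wraps the cephfs-table-tool CLI; "all reset <table>"
        # reinitializes the named table for every rank, which is why the
        # session table is empty and the whole 2^40-wide inode range shows up
        # as free again in the checks below.)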

        log.info(self.fs.table_tool(["all", "show", "inode"]))
        log.info(self.fs.table_tool(["all", "show", "snap"]))
        log.info(self.fs.table_tool(["all", "show", "session"]))

        # Should see 0 sessions
        session_data = json.loads(self.fs.table_tool(
            ["all", "show", "session"]))
        self.assertEqual(len(session_data["0"]["data"]["sessions"]), 0)
        self.assertEqual(session_data["0"]["result"], 0)

        # Should see entire inode range now marked free
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {"data": {"version": 1,
                            "inotable": {"projected_free": [
                                {"start": 1099511627776,
                                 "len": 1099511627776}],
                                "free": [
                                    {"start": 1099511627776,
                                     "len": 1099511627776}]}},
                   "result": 0}}
        )

        # Should see no snaps
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "snap"])),
            {"version": 1,
             "snapserver": {"last_snap": 1,
                            "last_created": 1,
                            "last_destroyed": 1,
                            "pending_noop": [],
                            "snaps": [],
                            "need_to_purge": {},
                            "pending_update": [],
                            "pending_destroy": []},
             "result": 0}
        )

    def test_table_tool_take_inos(self):
        initial_range_start = 1099511627776
        initial_range_len = 1099511627776
        # Initially a completely clear range
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {"data": {"version": 0,
                            "inotable": {"projected_free": [
                                {"start": initial_range_start,
                                 "len": initial_range_len}],
                                "free": [
                                    {"start": initial_range_start,
                                     "len": initial_range_len}]}},
                   "result": 0}}
        )

        # Remove some
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "take_inos", "{0}".format(initial_range_start + 100)])),
            {"0": {"data": {"version": 1,
                            "inotable": {"projected_free": [
                                {"start": initial_range_start + 101,
                                 "len": initial_range_len - 101}],
                                "free": [
                                    {"start": initial_range_start + 101,
                                     "len": initial_range_len - 101}]}},
                   "result": 0}}
        )
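        # (take_inos marks every inode number up to and including the given
        # value as used, which is why the free range shrinks by 101 here:
        # initial_range_start itself plus the next 100 inodes.)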

    @for_teuthology  # Hack: "for_teuthology" because .sh doesn't work outside teuth
    def test_journal_smoke(self):
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "fs/misc/trivial_sync.sh"],
            },
            "timeout": "1h"
        })
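        # (The workunit task runs the named script from qa/workunits on the
        # client mount; trivial_sync.sh does a small amount of metadata I/O
        # and a sync, so there is something in the journal for the tool to
        # exercise.)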

        for mount in self.mounts:
            mount.umount_wait()

        self.fs.mds_stop()
        self.fs.mds_fail()

        # journal tool smoke
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "suites/cephfs_journal_tool_smoke.sh"],
            },
            "timeout": "1h"
        })

        self.fs.mds_restart()
        self.fs.wait_for_daemons()

        self.mount_a.mount_wait()

        # trivial sync workunit on mount a
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "fs/misc/trivial_sync.sh"],
            },
            "timeout": "1h"
        })