2 """
3 Test our tools for recovering the content of damaged journals
4 """
5
6 import json
7 import logging
8 from textwrap import dedent
9 import time
10
11 from teuthology.exceptions import CommandFailedError, ConnectionLostError
12 from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
13 from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
14 from tasks.workunit import task as workunit
15
16 log = logging.getLogger(__name__)


class TestJournalRepair(CephFSTestCase):
    MDSS_REQUIRED = 2

    def test_inject_to_empty(self):
23 """
24 That when some dentries in the journal but nothing is in
25 the backing store, we correctly populate the backing store
26 from the journalled dentries.
27 """

        # Inject metadata operations
        self.mount_a.run_shell(["touch", "rootfile"])
        self.mount_a.run_shell(["mkdir", "subdir"])
        self.mount_a.run_shell(["touch", "subdir/subdirfile"])
        # There are several different paths for handling hardlinks, depending
        # on whether an existing dentry (being overwritten) is also a hardlink
        self.mount_a.run_shell(["mkdir", "linkdir"])

        # Test inode -> remote transition for a dentry
        self.mount_a.run_shell(["touch", "linkdir/link0"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link0"])
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link0"])

        # Test nothing -> remote transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link1"])

        # Test remote -> inode transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link2"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link2"])
        self.mount_a.run_shell(["touch", "linkdir/link2"])

        # Test remote -> diff remote transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link3"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link3"])
        self.mount_a.run_shell(["ln", "rootfile", "linkdir/link3"])

        # Test an empty directory
        self.mount_a.run_shell(["mkdir", "subdir/subsubdir"])
        self.mount_a.run_shell(["sync"])
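        # (The sync above is presumably to make sure all of the operations have
        # reached the MDS before we record inode numbers and unmount.)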

        # Before we unmount, make a note of the inode numbers; later we will
        # check that they match what we recover from the journal
        rootfile_ino = self.mount_a.path_to_ino("rootfile")
        subdir_ino = self.mount_a.path_to_ino("subdir")
        linkdir_ino = self.mount_a.path_to_ino("linkdir")
        subdirfile_ino = self.mount_a.path_to_ino("subdir/subdirfile")
        subsubdir_ino = self.mount_a.path_to_ino("subdir/subsubdir")

        self.mount_a.umount_wait()

        # Stop the MDS
        self.fs.mds_stop()
        self.fs.mds_fail()

        # Now, the journal should contain the operations, but the backing
        # store shouldn't
        with self.assertRaises(ObjectNotFound):
            self.fs.list_dirfrag(subdir_ino)
        self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
        # Execute the dentry recovery; this should populate the backing store
        self.fs.journal_tool(['event', 'recover_dentries', 'list'])

        # Dentries in ROOT_INO are present
        self.assertEqual(sorted(self.fs.list_dirfrag(ROOT_INO)), sorted(['rootfile_head', 'subdir_head', 'linkdir_head']))
        self.assertEqual(self.fs.list_dirfrag(subdir_ino), ['subdirfile_head', 'subsubdir_head'])
        self.assertEqual(sorted(self.fs.list_dirfrag(linkdir_ino)),
                         sorted(['link0_head', 'link1_head', 'link2_head', 'link3_head']))

        # Now check the MDS can read what we wrote: truncate the journal
        # and start the mds.
        self.fs.journal_tool(['journal', 'reset'])
        self.fs.mds_fail_restart()
        self.fs.wait_for_daemons()

        # List files
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

        # First ls -R to populate MDCache, such that hardlinks will
        # resolve properly (recover_dentries does not create backtraces,
        # so ordinarily hardlinks to inodes that happen not to have backtraces
        # will be invisible in readdir).
        # FIXME: hook in forward scrub here to regenerate backtraces
        self.mount_a.run_shell(['ls', '-R'])
        self.mount_a.umount_wait()  # remount to clear client cache before our second ls
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

        proc = self.mount_a.run_shell(['ls', '-R'])
        self.assertEqual(proc.stdout.getvalue().strip(),
                         dedent("""
                             .:
                             linkdir
                             rootfile
                             subdir

                             ./linkdir:
                             link0
                             link1
                             link2
                             link3

                             ./subdir:
                             subdirfile
                             subsubdir

                             ./subdir/subsubdir:
                             """).strip())

        # Check the correct inos were preserved by path
        self.assertEqual(rootfile_ino, self.mount_a.path_to_ino("rootfile"))
        self.assertEqual(subdir_ino, self.mount_a.path_to_ino("subdir"))
        self.assertEqual(subdirfile_ino, self.mount_a.path_to_ino("subdir/subdirfile"))
        self.assertEqual(subsubdir_ino, self.mount_a.path_to_ino("subdir/subsubdir"))

        # Check that the hard link handling came out correctly
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link0"), subdirfile_ino)
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link1"), subdirfile_ino)
        self.assertNotEqual(self.mount_a.path_to_ino("linkdir/link2"), subdirfile_ino)
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link3"), rootfile_ino)

        # Create a new file, and ensure it is not issued the same ino as one
        # of the recovered ones
        self.mount_a.run_shell(["touch", "afterwards"])
        new_ino = self.mount_a.path_to_ino("afterwards")
        self.assertNotIn(new_ino, [rootfile_ino, subdir_ino, subdirfile_ino])

        # Check that we can do metadata ops in the recovered directory
        self.mount_a.run_shell(["touch", "subdir/subsubdir/subsubdirfile"])

    @for_teuthology  # 308s
    def test_reset(self):
        """
        That after forcibly modifying the backing store, we can get back into
        a good state by resetting the MDSMap.

        The scenario is that we have two active MDSs, and we lose the journals.  Once
        we have completely lost confidence in the integrity of the metadata, we want to
        return the system to a single-MDS state and then scrub to recover what we
        can.
        """

        # Set max_mds to 2
        self.fs.set_allow_multimds(True)
        self.fs.set_max_mds(2)

        # See that we have two active MDSs
        self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
                              reject_fn=lambda v: v > 2 or v < 1)
        active_mds_names = self.fs.get_active_names()

        # Switch off any unneeded MDS daemons
        for unneeded_mds in set(self.mds_cluster.mds_ids) - set(active_mds_names):
            self.mds_cluster.mds_stop(unneeded_mds)
            self.mds_cluster.mds_fail(unneeded_mds)

        # Do a bunch of I/O such that at least some will hit the second MDS: create
        # lots of directories so that the balancer should find it easy to make a decision
        # to allocate some of them to the second MDS.
        spammers = []
        for n in range(0, 16):
            dir_name = "spam_{0}".format(n)
            spammers.append(self.mount_a.spam_dir_background(dir_name))

        def subtrees_assigned():
            got_subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=active_mds_names[0])
            rank_1_count = len([s for s in got_subtrees if s['auth_first'] == 1])

            # Greater than 1, because there is typically 1 for ~mds1, and once it
            # has been assigned something in addition to that it means it has been
            # assigned a "real" subtree.
            return rank_1_count > 1

        # We are waiting for the MDS to respond to hot directories, which
        # is not guaranteed to happen at a particular time, so use a lengthy timeout here.
        self.wait_until_true(subtrees_assigned, 600)

        # Flush the journals so that we have some backing store data
        # belonging to one MDS, and some to the other MDS.
        for mds_name in active_mds_names:
            self.fs.mds_asok(["flush", "journal"], mds_name)

        # Stop (hard) the second MDS daemon
        self.fs.mds_stop(active_mds_names[1])

        # Wipe out the tables for MDS rank 1 so that it is broken and can't start
        # (this is the simulated failure that we will demonstrate the disaster
        # recovery tools can recover from)
        self.fs.erase_metadata_objects(prefix="mds1_")

        # Try to access files from the client
        blocked_ls = self.mount_a.run_shell(["ls", "-R"], wait=False)

        # Check that this "ls -R" blocked rather than completing: this indicates
        # it got stuck trying to access subtrees which were on the now-dead MDS.
        log.info("Sleeping to check ls is blocked...")
        time.sleep(60)
        self.assertFalse(blocked_ls.finished)

        # This mount is now useless because it will depend on MDS rank 1, and MDS rank 1
        # is not coming back.  Kill it.
        log.info("Killing mount, it's blocked on the MDS we killed")
        self.mount_a.kill()
        self.mount_a.kill_cleanup()
        try:
            # Now that the mount is dead, the ls -R should error out.
            blocked_ls.wait()
        except (CommandFailedError, ConnectionLostError):
            # The ConnectionLostError case is for the kernel client, where
            # killing the mount also means killing the node.
            pass

        log.info("Terminating spammer processes...")
        for spammer_proc in spammers:
            spammer_proc.stdin.close()
            try:
                spammer_proc.wait()
            except (CommandFailedError, ConnectionLostError):
                # The ConnectionLostError case is for the kernel client, where
                # killing the mount also means killing the node.
                pass

        # See that the second MDS will crash when it starts and tries to
        # acquire rank 1
        damaged_id = active_mds_names[1]
        self.fs.mds_restart(damaged_id)

        # The daemon taking the damaged rank should begin to start up, then
        # go back into standby after asking the mon to mark the rank
        # damaged.
        def is_marked_damaged():
            mds_map = self.fs.get_mds_map()
            return 1 in mds_map['damaged']

        self.wait_until_true(is_marked_damaged, 60)

        def get_state():
            info = self.mds_cluster.get_mds_info(damaged_id)
            return info['state'] if info is not None else None

        self.wait_until_equal(
            get_state,
            "up:standby",
            timeout=60)

        self.fs.mds_stop(damaged_id)
        self.fs.mds_fail(damaged_id)

        # Now give up and go through a disaster recovery procedure
        self.fs.mds_stop(active_mds_names[0])
        self.fs.mds_fail(active_mds_names[0])
        # Invoke recover_dentries quietly, because otherwise the log spews millions of lines
        self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=0, quiet=True)
        self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=1, quiet=True)
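        # After salvaging dentries from both ranks' journals, the session table
        # and the journals themselves can no longer be trusted, so wipe them and
        # drop rank 1 entirely before resetting the filesystem map (roughly the
        # documented CephFS disaster recovery sequence).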
        self.fs.table_tool(["0", "reset", "session"])
        self.fs.journal_tool(["journal", "reset"], rank=0)
        self.fs.erase_mds_objects(1)
        self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
                                            '--yes-i-really-mean-it')

        # Bring an MDS back online, mount a client, and see that we can walk the full
        # filesystem tree again
        self.fs.mds_fail_restart(active_mds_names[0])
        self.wait_until_equal(lambda: self.fs.get_active_names(), [active_mds_names[0]], 30,
                              reject_fn=lambda v: len(v) > 1)
        self.mount_a.mount()
        self.mount_a.run_shell(["ls", "-R"], wait=True)

    def test_table_tool(self):
        active_mdss = self.fs.get_active_names()
        self.assertEqual(len(active_mdss), 1)
        mds_name = active_mdss[0]

        self.mount_a.run_shell(["touch", "foo"])
        self.fs.mds_asok(["flush", "journal"], mds_name)

        log.info(self.fs.table_tool(["all", "show", "inode"]))
        log.info(self.fs.table_tool(["all", "show", "snap"]))
        log.info(self.fs.table_tool(["all", "show", "session"]))

        # Inode table should always be the same because initial state
        # and choice of inode are deterministic.
        # Should see one inode consumed
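        # (Note: the free range below starts 1001 inos past 2**40: presumably the
        # one ino consumed by "foo" plus the inos preallocated for the client session.)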
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {
                "data": {
                    "version": 2,
                    "inotable": {
                        "projected_free": [
                            {"start": 1099511628777,
                             "len": 1099511626775}],
                        "free": [
                            {"start": 1099511628777,
                             "len": 1099511626775}]}},
                "result": 0}}
        )

        # Should see one session
        session_data = json.loads(self.fs.table_tool(
            ["all", "show", "session"]))
        self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 1)
        self.assertEqual(session_data["0"]["result"], 0)

        # Should see no snaps
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "snap"])),
            {"version": 0,
             "snapserver": {"last_snap": 1,
                            "pending_noop": [],
                            "snaps": [],
                            "need_to_purge": {},
                            "pending_update": [],
                            "pending_destroy": []},
             "result": 0}
        )

        # Reset everything
        for table in ["session", "inode", "snap"]:
            self.fs.table_tool(["all", "reset", table])

        log.info(self.fs.table_tool(["all", "show", "inode"]))
        log.info(self.fs.table_tool(["all", "show", "snap"]))
        log.info(self.fs.table_tool(["all", "show", "session"]))

        # Should see 0 sessions
        session_data = json.loads(self.fs.table_tool(
            ["all", "show", "session"]))
        self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 0)
        self.assertEqual(session_data["0"]["result"], 0)

        # Should see entire inode range now marked free
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {"data": {"version": 1,
                            "inotable": {"projected_free": [
                                {"start": 1099511627776,
                                 "len": 1099511627776}],
                                "free": [
                                    {"start": 1099511627776,
                                     "len": 1099511627776}]}},
                   "result": 0}}
        )

        # Should see no snaps
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "snap"])),
            {"version": 1,
             "snapserver": {"last_snap": 1,
                            "pending_noop": [],
                            "snaps": [],
                            "need_to_purge": {},
                            "pending_update": [],
                            "pending_destroy": []},
             "result": 0}
        )

    def test_table_tool_take_inos(self):
        initial_range_start = 1099511627776
        initial_range_len = 1099511627776
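        # 1099511627776 == 2**40: per the assertions below, the inode table's
        # allocatable range starts there and initially spans another 2**40 inos.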
        # Initially a completely clear range
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {"data": {"version": 0,
                            "inotable": {"projected_free": [
                                {"start": initial_range_start,
                                 "len": initial_range_len}],
                                "free": [
                                    {"start": initial_range_start,
                                     "len": initial_range_len}]}},
                   "result": 0}}
        )

        # Consume part of the range: take_inos marks everything up to and
        # including the given ino as used
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "take_inos", "{0}".format(initial_range_start + 100)])),
            {"0": {"data": {"version": 1,
                            "inotable": {"projected_free": [
                                {"start": initial_range_start + 101,
                                 "len": initial_range_len - 101}],
                                "free": [
                                    {"start": initial_range_start + 101,
                                     "len": initial_range_len - 101}]}},
                   "result": 0}}
        )

    @for_teuthology  # Hack: "for_teuthology" because .sh doesn't work outside teuth
    def test_journal_smoke(self):
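        # Run some client I/O first so that the journal has content for the
        # smoke-test script to operate on.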
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "fs/misc/trivial_sync.sh"],
            },
            "timeout": "1h"
        })

        for mount in self.mounts:
            mount.umount_wait()

        self.fs.mds_stop()
        self.fs.mds_fail()

        # journal tool smoke
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "suites/cephfs_journal_tool_smoke.sh"],
            },
            "timeout": "1h"
        })

        self.fs.mds_restart()
        self.fs.wait_for_daemons()

        self.mount_a.mount()

        # trivial sync on mount a
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "fs/misc/trivial_sync.sh"],
            },
            "timeout": "1h"
        })