import json
import logging
from unittest import case
from tasks.ceph_test_case import CephTestCase
import os
import re
from StringIO import StringIO

from tasks.cephfs.fuse_mount import FuseMount

from teuthology.orchestra import run
from teuthology.orchestra.run import CommandFailedError


log = logging.getLogger(__name__)


def for_teuthology(f):
    """
    Decorator that adds an "is_for_teuthology" attribute to the wrapped function
    """
    f.is_for_teuthology = True
    return f


def needs_trimming(f):
    """
    Mark fn as requiring a client capable of trimming its cache (for ceph-fuse
    this currently means it must be able to run as root)
    """
    f.needs_trimming = True
    return f
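
# Illustrative sketch only (nothing in this module uses it): a hypothetical
# subclass would apply these decorators to individual test methods, e.g.
#
#   class TestHypotheticalWorkload(CephFSTestCase):
#       @for_teuthology
#       def test_long_running_workload(self):
#           ...  # only wanted in full teuthology runs, not short local runs
#
#       @needs_trimming
#       def test_cache_trimming(self):
#           ...  # needs a ceph-fuse client that can trim its cache (i.e. root)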


class CephFSTestCase(CephTestCase):
    """
    Test case for Ceph FS: requires the caller to populate the Filesystem and Mounts
    into the fs, mount_a, mount_b class attributes (setting mount_b is optional).

    Handles resetting the cluster under test between tests.
    """

    # FIXME weird explicit naming
    mount_a = None
    mount_b = None

    # Declarative test requirements: subclasses should override these to indicate
    # their special needs.  If not met, tests will be skipped.
    CLIENTS_REQUIRED = 1
    MDSS_REQUIRED = 1
    REQUIRE_KCLIENT_REMOTE = False
    REQUIRE_ONE_CLIENT_REMOTE = False
    REQUIRE_MEMSTORE = False

    # Whether to create the default filesystem during setUp
    REQUIRE_FILESYSTEM = True

    # Names of MDS config settings; each is loaded (as a float) into an
    # attribute of the same name during setUp
    LOAD_SETTINGS = []
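
    # A minimal, purely illustrative sketch (class name and values are
    # hypothetical): a subclass states its requirements declaratively and
    # setUp either satisfies them or skips the test, e.g.
    #
    #   class TestHypotheticalFailover(CephFSTestCase):
    #       CLIENTS_REQUIRED = 2
    #       MDSS_REQUIRED = 2
    #       LOAD_SETTINGS = ["mds_reconnect_timeout"]
    #
    #       def test_something(self):
    #           self.mount_a.run_shell(["ls"])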

    def setUp(self):
        super(CephFSTestCase, self).setUp()

        if len(self.mds_cluster.mds_ids) < self.MDSS_REQUIRED:
            raise case.SkipTest("Only have {0} MDSs, require {1}".format(
                len(self.mds_cluster.mds_ids), self.MDSS_REQUIRED
            ))

        if len(self.mounts) < self.CLIENTS_REQUIRED:
            raise case.SkipTest("Only have {0} clients, require {1}".format(
                len(self.mounts), self.CLIENTS_REQUIRED
            ))

        if self.REQUIRE_KCLIENT_REMOTE:
            if not isinstance(self.mounts[0], FuseMount) or not isinstance(self.mounts[1], FuseMount):
                # kclient kill() power cycles nodes, so it requires the clients to
                # each be on their own node
                if self.mounts[0].client_remote.hostname == self.mounts[1].client_remote.hostname:
                    raise case.SkipTest("kclient clients must be on separate nodes")

        if self.REQUIRE_ONE_CLIENT_REMOTE:
            if self.mounts[0].client_remote.hostname in self.mds_cluster.get_mds_hostnames():
                raise case.SkipTest("Require first client to be on separate server from MDSs")

        if self.REQUIRE_MEMSTORE:
            objectstore = self.mds_cluster.get_config("osd_objectstore", "osd")
            if objectstore != "memstore":
                # You certainly *could* run this on a real OSD, but you don't want to sit
                # here for hours waiting for the test to fill up a 1TB drive!
                raise case.SkipTest("Require `memstore` OSD backend to simulate full drives")

        # Create friendly mount_a, mount_b attrs
        for i in range(0, self.CLIENTS_REQUIRED):
            setattr(self, "mount_{0}".format(chr(ord('a') + i)), self.mounts[i])

        self.mds_cluster.clear_firewall()

        # Unmount all clients: we are about to blow away the filesystem
        for mount in self.mounts:
            if mount.is_mounted():
                mount.umount_wait(force=True)

        # To avoid any issues with e.g. unlink bugs, we destroy and recreate
        # the filesystem rather than just doing an rm -rf of files
        self.mds_cluster.mds_stop()
        self.mds_cluster.mds_fail()
        self.mds_cluster.delete_all_filesystems()
        self.fs = None  # is now invalid!

        # In case the previous filesystem had filled up the RADOS cluster, wait
        # for the full flag to clear.
        osd_mon_report_interval_max = int(self.mds_cluster.get_config("osd_mon_report_interval_max", service_type='osd'))
        self.wait_until_true(lambda: not self.mds_cluster.is_full(),
                             timeout=osd_mon_report_interval_max * 5)

        # In case anything is in the OSD blacklist, clear it out.  This is to avoid
        # the OSD map changing in the background (due to blacklist expiry) while tests run.
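        # (For reference, the equivalent manual commands would be roughly
        # "ceph osd blacklist clear", or on clusters without that subcommand,
        # "ceph osd dump --format=json-pretty" followed by
        # "ceph osd blacklist rm <addr>" for each entry.)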
        try:
            self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "clear")
        except CommandFailedError:
            # Fallback for older Ceph clusters
            blacklist = json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd("osd",
                                   "dump", "--format=json-pretty"))['blacklist']
            log.info("Removing {0} blacklist entries".format(len(blacklist)))
            for addr, blacklisted_at in blacklist.items():
                self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "rm", addr)

        client_mount_ids = [m.client_id for m in self.mounts]
        # In case the test changes the IDs of clients, stash them so that we can
        # reset them in tearDown
        self._original_client_ids = client_mount_ids
        log.info(client_mount_ids)

        # In case there were any extra auth identities around from a previous
        # test, delete them
        for entry in self.auth_list():
            ent_type, ent_id = entry['entity'].split(".")
            if ent_type == "client" and ent_id not in client_mount_ids and ent_id != "admin":
                self.mds_cluster.mon_manager.raw_cluster_cmd("auth", "del", entry['entity'])

        if self.REQUIRE_FILESYSTEM:
            self.fs = self.mds_cluster.newfs(True)
            self.fs.mds_restart()

            # In case some test messed with auth caps, reset them
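            # (For reference, this is roughly equivalent to running by hand:
            #   ceph auth caps client.0 mds 'allow' mon 'allow r' osd 'allow rw pool=<data pool>'
            # where "client.0" and the data pool name are placeholders.)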
            for client_id in client_mount_ids:
                self.mds_cluster.mon_manager.raw_cluster_cmd_result(
                    'auth', 'caps', "client.{0}".format(client_id),
                    'mds', 'allow',
                    'mon', 'allow r',
                    'osd', 'allow rw pool={0}'.format(self.fs.get_data_pool_name()))

            # wait for mds restart to complete...
            self.fs.wait_for_daemons()

            # Mount the requested number of clients
            for i in range(0, self.CLIENTS_REQUIRED):
                self.mounts[i].mount()
                self.mounts[i].wait_until_mounted()

        # Load any config settings of interest
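        # (Illustrative example: with LOAD_SETTINGS = ["mds_reconnect_timeout"],
        # the loop below sets self.mds_reconnect_timeout to the value reported
        # by the first MDS's admin socket.)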
        for setting in self.LOAD_SETTINGS:
            setattr(self, setting, float(self.fs.mds_asok(
                ['config', 'get', setting], self.mds_cluster.mds_ids[0]
            )[setting]))

        self.configs_set = set()

    def tearDown(self):
        super(CephFSTestCase, self).tearDown()

        self.mds_cluster.clear_firewall()
        for m in self.mounts:
            m.teardown()

        for i, m in enumerate(self.mounts):
            m.client_id = self._original_client_ids[i]

        for subsys, key in self.configs_set:
            self.mds_cluster.clear_ceph_conf(subsys, key)

    def set_conf(self, subsys, key, value):
        self.configs_set.add((subsys, key))
        self.mds_cluster.set_ceph_conf(subsys, key, value)
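
    # Usage sketch (the subsystem/key/value here are hypothetical): a test can
    # call
    #   self.set_conf('mds', 'mds_cache_size', '1000')
    # and the (subsys, key) pair is recorded in self.configs_set so that
    # tearDown clears the setting again.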

    def auth_list(self):
        """
        Convenience wrapper on "ceph auth ls"
        """
        return json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd(
            "auth", "ls", "--format=json-pretty"
        ))['auth_dump']
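
    # For reference (shape only, values made up), each entry in the returned
    # list looks roughly like:
    #   {"entity": "client.0", "key": "AQ...==", "caps": {"mds": "allow", ...}}
    # which is why setUp splits entry['entity'] on "." to get the type and id.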

    def assert_session_count(self, expected, ls_data=None, mds_id=None):
        if ls_data is None:
            ls_data = self.fs.mds_asok(['session', 'ls'], mds_id=mds_id)

        alive_count = len([s for s in ls_data if s['state'] != 'killing'])

        self.assertEqual(expected, alive_count, "Expected {0} sessions, found {1}".format(
            expected, alive_count
        ))

    def assert_session_state(self, client_id, expected_state):
        self.assertEqual(
            self._session_by_id(
                self.fs.mds_asok(['session', 'ls'])).get(client_id, {'state': None})['state'],
            expected_state)

    def get_session_data(self, client_id):
        return self.get_session(client_id)

    def _session_list(self):
        ls_data = self.fs.mds_asok(['session', 'ls'])
        ls_data = [s for s in ls_data if s['state'] not in ['stale', 'closed']]
        return ls_data

    def get_session(self, client_id, session_ls=None):
        if session_ls is None:
            session_ls = self.fs.mds_asok(['session', 'ls'])

        return self._session_by_id(session_ls)[client_id]

    def _session_by_id(self, session_ls):
        return dict([(s['id'], s) for s in session_ls])
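
    # For reference (shape only, values made up), "session ls" returns a list
    # of dicts along the lines of:
    #   [{"id": 4235, "state": "open", "num_caps": 2, ...}, ...]
    # so e.g. self.assert_session_count(1) checks that exactly one session is
    # still alive, and self.get_session(4235)['state'] reads that session's state.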

    def wait_for_daemon_start(self, daemon_ids=None):
        """
        Wait until all the daemons appear in the FSMap, either assigned
        MDS ranks or in the list of standbys
        """
        def get_daemon_names():
            return [info['name'] for info in self.mds_cluster.status().get_all()]

        if daemon_ids is None:
            daemon_ids = self.mds_cluster.mds_ids

        try:
            self.wait_until_true(
                lambda: set(daemon_ids) & set(get_daemon_names()) == set(daemon_ids),
                timeout=30
            )
        except RuntimeError:
            log.warn("Timeout waiting for daemons {0}, while we have {1}".format(
                daemon_ids, get_daemon_names()
            ))
            raise
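
    # Typical usage sketch (illustrative): after restarting the MDS daemons a
    # test might do
    #   self.mds_cluster.mds_restart()
    #   self.wait_for_daemon_start()
    # to block until every daemon has registered with the mons again.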

    def assert_mds_crash(self, daemon_id):
        """
        Assert that a particular MDS daemon crashes (block until
        it does)
        """
        try:
            self.mds_cluster.mds_daemons[daemon_id].proc.wait()
        except CommandFailedError as e:
            log.info("MDS '{0}' crashed with status {1} as expected".format(daemon_id, e.exitstatus))
            self.mds_cluster.mds_daemons[daemon_id].proc = None

            # Go remove the coredump from the crash, otherwise teuthology.internal.coredump will
            # catch it later and treat it as a failure.
            p = self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
                "sudo", "sysctl", "-n", "kernel.core_pattern"], stdout=StringIO())
            core_pattern = p.stdout.getvalue().strip()
            if os.path.dirname(core_pattern):  # Non-default core_pattern with a directory in it
                # We have seen a core_pattern that looks like it's from teuthology's coredump
                # task, so proceed to clear out the core file
                log.info("Clearing core from pattern: {0}".format(core_pattern))

                # Determine the PID of the crashed MDS by inspecting the MDSMap; it had
                # to talk to the mons to get assigned a rank to reach the point of crashing
                addr = self.mds_cluster.mon_manager.get_mds_status(daemon_id)['addr']
                pid_str = addr.split("/")[1]
                log.info("Determined crasher PID was {0}".format(pid_str))

                # Substitute the PID into core_pattern to get a glob
                core_glob = core_pattern.replace("%p", pid_str)
                core_glob = re.sub("%[a-z]", "*", core_glob)  # Wildcard for all other % tokens
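                # (Worked example with purely hypothetical values: if
                # kernel.core_pattern were "/var/tmp/cores/core.%e.%p.%t" and
                # the crasher PID were 1234, core_glob would become
                # "/var/tmp/cores/core.*.1234.*".)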

                # Verify that we see a single coredump matching the expected pattern
                ls_proc = self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
                    "sudo", "ls", run.Raw(core_glob)
                ], stdout=StringIO())
                cores = [f for f in ls_proc.stdout.getvalue().strip().split("\n") if f]
                log.info("Enumerated cores: {0}".format(cores))
                self.assertEqual(len(cores), 1)

                log.info("Found core file {0}, deleting it".format(cores[0]))

                self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
                    "sudo", "rm", "-f", cores[0]
                ])
            else:
                log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")

        else:
            raise AssertionError("MDS daemon '{0}' did not crash as expected".format(daemon_id))