ceph/qa/tasks/cephfs/kernel_mount.py

   1 import errno
   2 import json
   3 import logging
   4 import os
   5 import re
   6
   7 from io import StringIO
   8 from textwrap import dedent
   9
  10 from teuthology.exceptions import CommandFailedError
  11 from teuthology.orchestra import run
  12 from teuthology.contextutil import MaxWhileTries
  13
  14 from tasks.cephfs.mount import CephFSMount, UMOUNT_TIMEOUT
  15
# Module-level logger, named after this module.
log = logging.getLogger(__name__)


# Name of the internal metadata directory found next to the per-client
# directories under /sys/kernel/debug/ceph/. It is not a client directory, so
# it is skipped when those directories are scanned for a client's global id.
DEBUGFS_META_DIR = 'meta'
  21
  22 class KernelMount(CephFSMount):
  23     def __init__(self, ctx, test_dir, client_id, client_remote,
  24                  client_keyring_path=None, hostfs_mntpt=None,
  25                  cephfs_name=None, cephfs_mntpt=None, brxnet=None,
  26                  client_config={}):
  27         super(KernelMount, self).__init__(ctx=ctx, test_dir=test_dir,
  28             client_id=client_id, client_remote=client_remote,
  29             client_keyring_path=client_keyring_path, hostfs_mntpt=hostfs_mntpt,
  30             cephfs_name=cephfs_name, cephfs_mntpt=cephfs_mntpt, brxnet=brxnet,
  31             client_config=client_config)
  32
  33         if client_config.get('debug', False):
  34             self.client_remote.run(args=["sudo", "bash", "-c", "echo 'module ceph +p' > /sys/kernel/debug/dynamic_debug/control"])
  35             self.client_remote.run(args=["sudo", "bash", "-c", "echo 'module libceph +p' > /sys/kernel/debug/dynamic_debug/control"])
  36
  37         self.dynamic_debug = self.client_config.get('dynamic_debug', False)
  38         self.rbytes = self.client_config.get('rbytes', False)
  39         self.snapdirname = client_config.get('snapdirname', '.snap')
  40         self.syntax_style = self.client_config.get('syntax', 'v2')
  41         self.inst = None
  42         self.addr = None
  43         self._mount_bin = ['adjust-ulimits', 'ceph-coverage', self.test_dir +\
  44                            '/archive/coverage', '/bin/mount', '-t', 'ceph']
  45
  46     def mount(self, mntopts=None, check_status=True, **kwargs):
  47         self.update_attrs(**kwargs)
  48         self.assert_and_log_minimum_mount_details()
  49
  50         self.setup_netns()
  51
  52         if not self.cephfs_mntpt:
  53             self.cephfs_mntpt = '/'
  54         if not self.cephfs_name:
  55             self.cephfs_name = 'cephfs'
  56
  57         self._create_mntpt()
  58
  59         retval = self._run_mount_cmd(mntopts, check_status)
  60         if retval:
  61             return retval
  62
  63         self._set_filemode_on_mntpt()
  64
  65         if self.dynamic_debug:
  66             kmount_count = self.ctx.get(f'kmount_count.{self.client_remote.hostname}', 0)
  67             if kmount_count == 0:
  68                 self.enable_dynamic_debug()
  69             self.ctx[f'kmount_count.{self.client_remote.hostname}'] = kmount_count + 1
  70
  71         self.gather_mount_info()
  72
  73     def gather_mount_info(self):
  74         self.id = self._get_global_id()
  75         self.get_global_inst()
  76         self.get_global_addr()
  77
  78     def _run_mount_cmd(self, mntopts, check_status):
  79         mount_cmd = self._get_mount_cmd(mntopts)
  80         mountcmd_stdout, mountcmd_stderr = StringIO(), StringIO()
  81
  82         try:
  83             self.client_remote.run(args=mount_cmd, timeout=300,
  84                                    stdout=mountcmd_stdout,
  85                                    stderr=mountcmd_stderr, omit_sudo=False)
  86         except CommandFailedError as e:
  87             log.info('mount command failed')
  88             if check_status:
  89                 raise
  90             else:
  91                 return (e, mountcmd_stdout.getvalue(),
  92                         mountcmd_stderr.getvalue())
  93         log.info('mount command passed')
  94
  95     def _make_mount_cmd_old_or_new_style(self):
  96         optd = {}
  97         mnt_stx = ''
  98
  99         self.validate_subvol_options()
 100
 101         assert(self.cephfs_mntpt)
 102         if self.syntax_style == 'v1':
 103             mnt_stx = f':{self.cephfs_mntpt}'
 104             if self.client_id:
 105                 optd['name'] = self.client_id
 106             if self.cephfs_name:
 107                 optd['mds_namespace'] = self.cephfs_name
 108         elif self.syntax_style == 'v2':
 109             mnt_stx = f'{self.client_id}@.{self.cephfs_name}={self.cephfs_mntpt}'
 110         else:
 111             assert 0, f'invalid syntax style: {self.syntax_style}'
 112         return (mnt_stx, optd)
 113
 114     def _get_mount_cmd(self, mntopts):
 115         opts = 'norequire_active_mds'
 116         if self.client_keyring_path and self.client_id:
 117             opts += ',secret=' + self.get_key_from_keyfile()
 118         if self.config_path:
 119             opts += ',conf=' + self.config_path
 120         if self.rbytes:
 121             opts += ",rbytes"
 122         else:
 123             opts += ",norbytes"
 124         if self.snapdirname != '.snap':
 125             opts += f',snapdirname={self.snapdirname}'
 126
 127         mount_cmd = ['sudo'] + self._nsenter_args
 128         stx_opt = self._make_mount_cmd_old_or_new_style()
 129         for opt_name, opt_val in stx_opt[1].items():
 130             opts += f',{opt_name}={opt_val}'
 131         if mntopts:
 132             opts += ',' + ','.join(mntopts)
 133         log.info(f'mounting using device: {stx_opt[0]}')
 134         # do not fall-back to old-style mount (catch new-style
 135         # mount syntax bugs in the kernel). exclude this config
 136         # when using v1-style syntax, since old mount helpers
 137         # (pre-quincy) would pass this option to the kernel.
 138         if self.syntax_style != 'v1':
 139             opts += ",nofallback"
 140         mount_cmd += self._mount_bin + [stx_opt[0], self.hostfs_mntpt, '-v',
 141                                         '-o', opts]
 142         return mount_cmd
 143
 144     def umount(self, force=False):
 145         if not self.is_mounted():
 146             self.cleanup()
 147             return
 148
 149         if self.is_blocked():
 150             self._run_umount_lf()
 151             self.cleanup()
 152             return
 153
 154         log.debug('Unmounting client client.{id}...'.format(id=self.client_id))
 155
 156         try:
 157             cmd=['sudo', 'umount', self.hostfs_mntpt]
 158             if force:
 159                 cmd.append('-f')
 160             self.client_remote.run(args=cmd, timeout=UMOUNT_TIMEOUT, omit_sudo=False)
 161         except Exception as e:
 162             log.debug('Killing processes on client.{id}...'.format(id=self.client_id))
 163             self.client_remote.run(
 164                 args=['sudo', run.Raw('PATH=/usr/sbin:$PATH'), 'lsof',
 165                       run.Raw(';'), 'ps', 'auxf'],
 166                 timeout=UMOUNT_TIMEOUT, omit_sudo=False)
 167             raise e
 168
 169         if self.dynamic_debug:
 170             kmount_count = self.ctx.get(f'kmount_count.{self.client_remote.hostname}')
 171             assert kmount_count
 172             if kmount_count == 1:
 173                 self.disable_dynamic_debug()
 174             self.ctx[f'kmount_count.{self.client_remote.hostname}'] = kmount_count - 1
 175
 176         self.cleanup()
 177
 178     def umount_wait(self, force=False, require_clean=False,
 179                     timeout=UMOUNT_TIMEOUT):
 180         """
 181         Unlike the fuse client, the kernel client's umount is immediate
 182         """
 183         if not self.is_mounted():
 184             self.cleanup()
 185             return
 186
 187         try:
 188             self.umount(force)
 189         except (CommandFailedError, MaxWhileTries):
 190             if not force:
 191                 raise
 192
 193             # force delete the netns and umount
 194             self._run_umount_lf()
 195             self.cleanup()
 196
 197     def wait_until_mounted(self):
 198         """
 199         Unlike the fuse client, the kernel client is up and running as soon
 200         as the initial mount() function returns.
 201         """
 202         assert self.is_mounted()
 203
 204     def teardown(self):
 205         super(KernelMount, self).teardown()
 206         if self.is_mounted():
 207             self.umount()
 208
 209     def _get_debug_dir(self):
 210         """
 211         Get the debugfs folder for this mount
 212         """
 213
 214         cluster_name = 'ceph'
 215         fsid = self.ctx.ceph[cluster_name].fsid
 216
 217         global_id = self._get_global_id()
 218
 219         return os.path.join("/sys/kernel/debug/ceph/", f"{fsid}.client{global_id}")
 220
 221     def read_debug_file(self, filename):
 222         """
 223         Read the debug file "filename", return None if the file doesn't exist.
 224         """
 225
 226         path = os.path.join(self._get_debug_dir(), filename)
 227
 228         stdout = StringIO()
 229         stderr = StringIO()
 230         try:
 231             self.run_shell_payload(f"sudo dd if={path}", timeout=(5 * 60),
 232                                    stdout=stdout, stderr=stderr)
 233             return stdout.getvalue()
 234         except CommandFailedError:
 235             if 'no such file or directory' in stderr.getvalue().lower():
 236                 return errno.ENOENT
 237             elif 'not a directory' in stderr.getvalue().lower():
 238                 return errno.ENOTDIR
 239             elif 'permission denied' in stderr.getvalue().lower():
 240                 return errno.EACCES
 241             raise
 242
 243     def _get_global_id(self):
 244         try:
 245             p = self.run_shell_payload("getfattr --only-values -n ceph.client_id .", stdout=StringIO())
 246             v = p.stdout.getvalue()
 247             prefix = "client"
 248             assert v.startswith(prefix)
 249             return int(v[len(prefix):])
 250         except CommandFailedError:
 251             # Probably this fallback can be deleted in a few releases when the kernel xattr is widely available.
 252             log.debug("Falling back to messy global_id lookup via /sys...")
 253
 254             pyscript = dedent("""
 255                 import glob
 256                 import os
 257                 import json
 258
 259                 def get_id_to_dir():
 260                     result = {}
 261                     for dir in glob.glob("/sys/kernel/debug/ceph/*"):
 262                         if os.path.basename(dir) == DEBUGFS_META_DIR:
 263                             continue
 264                         mds_sessions_lines = open(os.path.join(dir, "mds_sessions")).readlines()
 265                         global_id = mds_sessions_lines[0].split()[1].strip('"')
 266                         client_id = mds_sessions_lines[1].split()[1].strip('"')
 267                         result[client_id] = global_id
 268                     return result
 269                 print(json.dumps(get_id_to_dir()))
 270             """)
 271
 272             output = self.client_remote.sh([
 273                 'sudo', 'python3', '-c', pyscript
 274             ], timeout=(5*60))
 275             client_id_to_global_id = json.loads(output)
 276
 277             try:
 278                 return client_id_to_global_id[self.client_id]
 279             except KeyError:
 280                 log.error("Client id '{0}' debug dir not found (clients seen were: {1})".format(
 281                     self.client_id, ",".join(client_id_to_global_id.keys())
 282                 ))
 283                 raise
 284
 285     def _dynamic_debug_control(self, enable):
 286         """
 287         Write to dynamic debug control file.
 288         """
 289         if enable:
 290             fdata = "module ceph +p"
 291         else:
 292             fdata = "module ceph -p"
 293
 294         self.run_shell_payload(f"""
 295 sudo modprobe ceph
 296 echo '{fdata}' | sudo tee /sys/kernel/debug/dynamic_debug/control
 297 """)
 298
 299     def enable_dynamic_debug(self):
 300         """
 301         Enable the dynamic debug.
 302         """
 303         self._dynamic_debug_control(True)
 304
 305     def disable_dynamic_debug(self):
 306         """
 307         Disable the dynamic debug.
 308         """
 309         self._dynamic_debug_control(False)
 310
 311     def get_global_id(self):
 312         """
 313         Look up the CephFS client ID for this mount, using debugfs.
 314         """
 315
 316         assert self.is_mounted()
 317
 318         return self._get_global_id()
 319
 320     @property
 321     def _global_addr(self):
 322         if self.addr is not None:
 323             return self.addr
 324
 325         # The first line of the "status" file's output will be something
 326         # like:
 327         #   "instance: client.4297 (0)10.72.47.117:0/1148470933"
 328         # What we need here is only the string "10.72.47.117:0/1148470933"
 329         status = self.read_debug_file("status")
 330         if status is None:
 331             return None
 332
 333         instance = re.findall(r'instance:.*', status)[0]
 334         self.addr = instance.split()[2].split(')')[1]
 335         return self.addr;
 336
 337     @property
 338     def _global_inst(self):
 339         if self.inst is not None:
 340             return self.inst
 341
 342         client_gid = "client%d" % self.get_global_id()
 343         self.inst = " ".join([client_gid, self._global_addr])
 344         return self.inst
 345
 346     def get_global_inst(self):
 347         """
 348         Look up the CephFS client instance for this mount
 349         """
 350         return self._global_inst
 351
 352     def get_global_addr(self):
 353         """
 354         Look up the CephFS client addr for this mount
 355         """
 356         return self._global_addr
 357
 358     def get_osd_epoch(self):
 359         """
 360         Return 2-tuple of osd_epoch, osd_epoch_barrier
 361         """
 362         osd_map = self.read_debug_file("osdmap")
 363         assert osd_map
 364
 365         lines = osd_map.split("\n")
 366         first_line_tokens = lines[0].split()
 367         epoch, barrier = int(first_line_tokens[1]), int(first_line_tokens[3])
 368
 369         return epoch, barrier
 370
 371     def get_op_read_count(self):
 372         stdout = StringIO()
 373         stderr = StringIO()
 374         try:
 375             path = os.path.join(self._get_debug_dir(), "metrics/size")
 376             self.run_shell(f"sudo stat {path}", stdout=stdout,
 377                            stderr=stderr, cwd=None)
 378             buf = self.read_debug_file("metrics/size")
 379         except CommandFailedError:
 380             if 'no such file or directory' in stderr.getvalue().lower() \
 381                     or 'not a directory' in stderr.getvalue().lower():
 382                 try:
 383                     path = os.path.join(self._get_debug_dir(), "metrics")
 384                     self.run_shell(f"sudo stat {path}", stdout=stdout,
 385                                    stderr=stderr, cwd=None)
 386                     buf = self.read_debug_file("metrics")
 387                 except CommandFailedError:
 388                     return errno.ENOENT
 389             else:
 390                 return 0
 391         return int(re.findall(r'read.*', buf)[0].split()[1])