# ceph/qa/tasks/workunit.py -- from ceph.git (v15.2.10); gitweb scrape header removed
1 """
2 Workunit task -- Run ceph on sets of specific clients
3 """
4 import logging
5 import pipes
6 import os
7 import re
8
9 import six
10
11 from tasks.util import get_remote_for_role
12 from tasks.util.workunit import get_refspec_after_overrides
13
14 from teuthology import misc
15 from teuthology.config import config as teuth_config
16 from teuthology.orchestra.run import CommandFailedError
17 from teuthology.parallel import parallel
18 from teuthology.orchestra import run
19
20 log = logging.getLogger(__name__)
21
def task(ctx, config):
    """
    Run ceph on all workunits found under the specified path.

    For example::

        tasks:
        - ceph:
        - ceph-fuse: [client.0]
        - workunit:
            clients:
              client.0: [direct_io, xattrs.sh]
              client.1: [snaps]
            branch: foo

    You can also run a list of workunits on all clients:
        tasks:
        - ceph:
        - ceph-fuse:
        - workunit:
            tag: v0.47
            clients:
              all: [direct_io, xattrs.sh, snaps]

    If you have an "all" section it will run all the workunits
    on each client simultaneously, AFTER running any workunits specified
    for individual clients. (This prevents unintended simultaneous runs.)

    To customize tests, you can specify environment variables as a dict. You
    can also specify a time limit for each work unit (defaults to 3h):

        tasks:
        - ceph:
        - ceph-fuse:
        - workunit:
            sha1: 9b28948635b17165d17c1cf83d4a870bd138ddf6
            clients:
              all: [snaps]
            env:
              FOO: bar
              BAZ: quux
            timeout: 3h

    This task supports roles that include a ceph cluster, e.g.::

        tasks:
        - ceph:
        - workunit:
            clients:
              backup.client.0: [foo]
              client.1: [bar] # cluster is implicitly 'ceph'

    You can also specify an alternative top-level dir to 'qa/workunits', like
    'qa/standalone', with::

        tasks:
        - install:
        - workunit:
            basedir: qa/standalone
            clients:
              client.0:
                - test-ceph-helpers.sh

    :param ctx: Context
    :param config: Configuration
    """
    assert isinstance(config, dict)
    assert isinstance(config.get('clients'), dict), \
        'configuration must contain a dictionary of clients'

    refspec = get_refspec_after_overrides(config, ctx.config.get('overrides', {}))
    timeout = config.get('timeout', '3h')
    cleanup = config.get('cleanup', True)

    log.info('Pulling workunits from ref %s', refspec)

    if config.get('env') is not None:
        assert isinstance(config['env'], dict), 'env must be a dictionary'
    clients = config['clients']

    # Make a per-role scratch dir for every role except the "all"
    # pseudo-role; remember which mountpoints we had to create so they
    # can be removed again during cleanup.
    log.info('Making a separate scratch dir for every client...')
    created_mountpoint = {}
    for role in clients.keys():
        assert isinstance(role, six.string_types)
        if role == "all":
            continue
        assert 'client' in role
        created_mountpoint[role] = _make_scratch_dir(ctx, role, config.get('subdir'))

    # Run the per-role workunits concurrently.
    log.info("timeout={}".format(timeout))
    log.info("cleanup={}".format(cleanup))
    with parallel() as p:
        for role, tests in clients.items():
            if role == "all":
                continue
            p.spawn(_run_tests, ctx, refspec, role, tests,
                    config.get('env'),
                    basedir=config.get('basedir', 'qa/workunits'),
                    timeout=timeout,
                    cleanup=cleanup,
                    coverage_and_limits=not config.get('no_coverage_and_limits', None))

    if cleanup:
        # Remove the per-role scratch dirs (and any mountpoints created above).
        for role, created in created_mountpoint.items():
            _delete_dir(ctx, role, created)

    # The "all" workunits run last, on every client simultaneously.
    if 'all' in clients:
        _spawn_on_all_clients(ctx, refspec, clients["all"], config.get('env'),
                              config.get('basedir', 'qa/workunits'),
                              config.get('subdir'), timeout=timeout,
                              cleanup=cleanup)
141
142
def _client_mountpoint(ctx, cluster, id_):
    """
    Return the path where workunits for this client are expected to be
    mounted, under the test directory.

    For compatibility with tasks like ceph-fuse that are not yet
    cluster-aware, the cluster name is embedded in the directory name
    only when the cluster is not the default 'ceph'.
    """
    dir_ = ('mnt.{0}'.format(id_) if cluster == 'ceph'
            else 'mnt.{0}.{1}'.format(cluster, id_))
    return os.path.join(misc.get_testdir(ctx), dir_)
155
156
def _delete_dir(ctx, role, created_mountpoint):
    """
    Delete the scratch directory this role used inside its mount, and, if
    the mountpoint itself was artificially created by _make_scratch_dir,
    remove that directory as well.

    :param ctx: Context
    :param role: "role.#" where # is used for the role id.
    :param created_mountpoint: True if the mountpoint was created by
                               _make_scratch_dir and should be removed here.
    """
    cluster, _, id_ = misc.split_role(role)
    remote = get_remote_for_role(ctx, role)
    mnt = _client_mountpoint(ctx, cluster, id_)
    client = os.path.join(mnt, 'client.{id}'.format(id=id_))

    # Remove the directory inside the mount where the workunit ran
    remote.run(
        args=[
            'sudo',
            'rm',
            '-rf',
            '--',
            client,
        ],
    )
    log.info("Deleted dir {dir}".format(dir=client))

    # If the mount was an artificially created dir, delete that too
    if created_mountpoint:
        remote.run(
            args=[
                'rmdir',
                '--',
                mnt,
            ],
        )
        # Bug fix: log the mountpoint actually removed (mnt), not the
        # client dir that was already deleted and logged above.
        log.info("Deleted artificial mount point {dir}".format(dir=mnt))
192
193
def _make_scratch_dir(ctx, role, subdir):
    """
    Ensure the mountpoint and a scratch subdirectory exist for this role.

    :param ctx: Context
    :param role: "role.#" where # is used for the role id.
    :param subdir: use this subdir (False if not used)
    :return: True when the mountpoint directory itself had to be created.
    """
    cluster, _, id_ = misc.split_role(role)
    remote = get_remote_for_role(ctx, role)
    mnt = _client_mountpoint(ctx, cluster, id_)

    # When neither kclient nor ceph-fuse are in play, the mountpoint may
    # not exist yet: probe it with stat, and mkdir it on failure.
    try:
        remote.run(args=['stat', '--', mnt])
        log.info('Did not need to create dir {dir}'.format(dir=mnt))
        created_mountpoint = False
    except CommandFailedError:
        remote.run(args=['mkdir', '--', mnt])
        log.info('Created dir {dir}'.format(dir=mnt))
        created_mountpoint = True

    if not subdir:
        subdir = 'client.{id}'.format(id=id_)

    if created_mountpoint:
        # Fresh directory we own: a plain mkdir suffices.
        remote.run(
            args=['cd', '--', mnt, run.Raw('&&'), 'mkdir', '--', subdir],
        )
    else:
        remote.run(
            args=[
                # cd first so this will fail if the mount point does
                # not exist; pure install -d will silently do the
                # wrong thing
                'cd',
                '--',
                mnt,
                run.Raw('&&'),
                'sudo',
                'install',
                '-d',
                '-m', '0755',
                '--owner={user}'.format(user=remote.user),
                '--',
                subdir,
            ],
        )

    return created_mountpoint
266
267
def _spawn_on_all_clients(ctx, refspec, tests, env, basedir, subdir, timeout=None, cleanup=True):
    """
    Make a scratch directory for every client role in the cluster, then run
    the tests on all clients: each unit is executed on every client in
    parallel before moving on to the next unit.

    See run_tests() for parameter documentation.
    """
    is_client = misc.is_type('client')
    client_remotes = {}
    created_mountpoint = {}
    for remote, roles_for_host in ctx.cluster.remotes.items():
        for role in roles_for_host:
            if not is_client(role):
                continue
            client_remotes[role] = remote
            created_mountpoint[role] = _make_scratch_dir(ctx, role, subdir)

    for unit in tests:
        with parallel() as p:
            for role in client_remotes.keys():
                p.spawn(_run_tests, ctx, refspec, role, [unit], env,
                        basedir,
                        subdir,
                        timeout=timeout)

    # cleanup the generated client directories
    if cleanup:
        for role in client_remotes.keys():
            _delete_dir(ctx, role, created_mountpoint[role])
296
297
def _run_tests(ctx, refspec, role, tests, env, basedir,
               subdir=None, timeout=None, cleanup=True,
               coverage_and_limits=True):
    """
    Run the individual test. Create a scratch directory and then extract the
    workunits from git. Make the executables, and then run the tests.
    Clean up (remove files created) after the tests are finished.

    :param ctx: Context
    :param refspec: branch, sha1, or version tag used to identify this
                    build
    :param role: "<cluster>.client.<id>" role whose remote the tests run on.
    :param tests: specific tests specified.
    :param env: environment set in yaml file. Could be None.
    :param basedir: top-level directory inside the clone holding the
                    workunit scripts (e.g. 'qa/workunits').
    :param subdir: subdirectory set in yaml file. Could be None
    :param timeout: If present, use the 'timeout' command on the remote host
                    to limit execution time. Must be specified by a number
                    followed by 's' for seconds, 'm' for minutes, 'h' for
                    hours, or 'd' for days. If '0' or anything that evaluates
                    to False is passed, the 'timeout' command is not used.
    :param cleanup: if True, remove the scratch dir after each spec's
                    workunits finish.
    :param coverage_and_limits: if True, wrap each workunit invocation with
                    adjust-ulimits and ceph-coverage.
    """
    testdir = misc.get_testdir(ctx)
    assert isinstance(role, six.string_types)
    cluster, type_, id_ = misc.split_role(role)
    assert type_ == 'client'
    remote = get_remote_for_role(ctx, role)
    mnt = _client_mountpoint(ctx, cluster, id_)
    # subdir so we can remove and recreate this a lot without sudo
    if subdir is None:
        scratch_tmp = os.path.join(mnt, 'client.{id}'.format(id=id_), 'tmp')
    else:
        scratch_tmp = os.path.join(mnt, subdir)
    clonedir = '{tdir}/clone.{role}'.format(tdir=testdir, role=role)
    srcdir = '{cdir}/{basedir}'.format(cdir=clonedir,
                                       basedir=basedir)

    git_url = teuth_config.get_ceph_qa_suite_git_url()
    # if we are running an upgrade test, and ceph-ci does not have branches like
    # `jewel`, so should use ceph.git as an alternative.
    try:
        remote.run(logger=log.getChild(role),
                   args=refspec.clone(git_url, clonedir))
    except CommandFailedError:
        # Fall back from ceph-ci.git to ceph.git; re-raise if the URL is
        # not a ceph-ci one, since then there is no alternative to try.
        if git_url.endswith('/ceph-ci.git'):
            alt_git_url = git_url.replace('/ceph-ci.git', '/ceph.git')
        elif git_url.endswith('/ceph-ci'):
            alt_git_url = re.sub(r'/ceph-ci$', '/ceph.git', git_url)
        else:
            raise
        log.info(
            "failed to check out '%s' from %s; will also try in %s",
            refspec,
            git_url,
            alt_git_url,
        )
        remote.run(logger=log.getChild(role),
                   args=refspec.clone(alt_git_url, clonedir))
    # Build (if a Makefile exists) and dump a NUL-separated list of every
    # executable file under srcdir into workunits.list.<role> on the remote.
    remote.run(
        logger=log.getChild(role),
        args=[
            'cd', '--', srcdir,
            run.Raw('&&'),
            'if', 'test', '-e', 'Makefile', run.Raw(';'), 'then', 'make', run.Raw(';'), 'fi',
            run.Raw('&&'),
            'find', '-executable', '-type', 'f', '-printf', r'%P\0',
            run.Raw('>{tdir}/workunits.list.{role}'.format(tdir=testdir, role=role)),
        ],
    )

    # Fetch the list back and sort it so matching/execution order is stable.
    workunits_file = '{tdir}/workunits.list.{role}'.format(tdir=testdir, role=role)
    workunits = sorted(six.ensure_str(misc.get_file(remote, workunits_file)).split('\0'))
    assert workunits

    try:
        assert isinstance(tests, list)
        for spec in tests:
            log.info('Running workunits matching %s on %s...', spec, role)
            # A spec matches itself exactly, or any workunit beneath it
            # as a directory prefix (e.g. 'snaps' matches 'snaps/foo.sh').
            prefix = '{spec}/'.format(spec=spec)
            to_run = [w for w in workunits if w == spec or w.startswith(prefix)]
            if not to_run:
                raise RuntimeError('Spec did not match any workunits: {spec!r}'.format(spec=spec))
            for workunit in to_run:
                log.info('Running workunit %s...', workunit)
                # Assemble the remote command: create/enter the scratch dir,
                # then export the CEPH_* environment the workunit scripts expect.
                args = [
                    'mkdir', '-p', '--', scratch_tmp,
                    run.Raw('&&'),
                    'cd', '--', scratch_tmp,
                    run.Raw('&&'),
                    run.Raw('CEPH_CLI_TEST_DUP_COMMAND=1'),
                    run.Raw('CEPH_REF={ref}'.format(ref=refspec)),
                    run.Raw('TESTDIR="{tdir}"'.format(tdir=testdir)),
                    run.Raw('CEPH_ARGS="--cluster {0}"'.format(cluster)),
                    run.Raw('CEPH_ID="{id}"'.format(id=id_)),
                    run.Raw('PATH=$PATH:/usr/sbin'),
                    run.Raw('CEPH_BASE={dir}'.format(dir=clonedir)),
                    run.Raw('CEPH_ROOT={dir}'.format(dir=clonedir)),
                ]
                if env is not None:
                    # Shell-quote user-supplied values before splicing them
                    # into the raw command line.
                    for var, val in env.items():
                        quoted_val = pipes.quote(val)
                        env_arg = '{var}={val}'.format(var=var, val=quoted_val)
                        args.append(run.Raw(env_arg))
                if coverage_and_limits:
                    args.extend([
                        'adjust-ulimits',
                        'ceph-coverage',
                        '{tdir}/archive/coverage'.format(tdir=testdir)])
                if timeout and timeout != '0':
                    args.extend(['timeout', timeout])
                args.extend([
                    '{srcdir}/{workunit}'.format(
                        srcdir=srcdir,
                        workunit=workunit,
                    ),
                ])
                remote.run(
                    logger=log.getChild(role),
                    args=args,
                    label="workunit test {workunit}".format(workunit=workunit)
                )
                if cleanup:
                    args=['sudo', 'rm', '-rf', '--', scratch_tmp]
                    remote.run(logger=log.getChild(role), args=args, timeout=(60*60))
    finally:
        # Always remove the workunit list and the clone, even on failure.
        log.info('Stopping %s on %s...', tests, role)
        args=['sudo', 'rm', '-rf', '--', workunits_file, clonedir]
        # N.B. don't cleanup scratch_tmp! If the mount is broken then rm will hang.
        remote.run(
            logger=log.getChild(role),
            args=args,
        )