4 Handle the setup, starting, and clean-up of a Ceph cluster.
6 from cStringIO
import StringIO
18 from paramiko
import SSHException
19 from ceph_manager
import CephManager
, write_conf
20 from tasks
.cephfs
.filesystem
import Filesystem
21 from teuthology
import misc
as teuthology
22 from teuthology
import contextutil
23 from teuthology
import exceptions
24 from teuthology
.orchestra
import run
25 import ceph_client
as cclient
26 from teuthology
.orchestra
.daemon
import DaemonGroup
28 CEPH_ROLE_TYPES
= ['mon', 'mgr', 'osd', 'mds', 'rgw']
30 log
= logging
.getLogger(__name__
)
33 def generate_caps(type_
):
35 Each call will return the next capability for each system type
36 (essentially a subset of possible role values). Valid types are osd,
46 mon
='allow profile mgr',
63 for subsystem
, capability
in defaults
[type_
].items():
69 @contextlib.contextmanager
70 def ceph_log(ctx
, config
):
72 Create /var/log/ceph log directory that is open to everyone.
73 Add valgrind and profiling-logger directories.
76 :param config: Configuration
78 log
.info('Making ceph log dir writeable by non-root...')
90 log
.info('Disabling ceph logrotate...')
96 '/etc/logrotate.d/ceph',
101 log
.info('Creating extra log directories...')
106 'install', '-d', '-m0777', '--',
107 '/var/log/ceph/valgrind',
108 '/var/log/ceph/profiling-logger',
114 class Rotater(object):
115 stop_event
= gevent
.event
.Event()
117 def invoke_logrotate(self
):
118 # 1) install ceph-test.conf in /etc/logrotate.d
119 # 2) continuously loop over logrotate invocation with ceph-test.conf
120 while not self
.stop_event
.is_set():
121 self
.stop_event
.wait(timeout
=30)
125 args
=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'
130 except exceptions
.ConnectionLostError
as e
:
131 # Some tests may power off nodes during test, in which
132 # case we will see connection errors that we should ignore.
133 log
.debug("Missed logrotate, node '{0}' is offline".format(
135 except EOFError as e
:
136 # Paramiko sometimes raises this when it fails to
137 # connect to a node during open_session. As with
138 # ConnectionLostError, we ignore this because nodes
139 # are allowed to get power cycled during tests.
140 log
.debug("Missed logrotate, EOFError")
141 except SSHException
as e
:
142 log
.debug("Missed logrotate, SSHException")
143 except socket
.error
as e
:
144 if e
.errno
== errno
.EHOSTUNREACH
:
145 log
.debug("Missed logrotate, host unreachable")
150 self
.thread
= gevent
.spawn(self
.invoke_logrotate
)
153 self
.stop_event
.set()
156 def write_rotate_conf(ctx
, daemons
):
157 testdir
= teuthology
.get_testdir(ctx
)
158 rotate_conf_path
= os
.path
.join(os
.path
.dirname(__file__
), 'logrotate.conf')
159 with
file(rotate_conf_path
, 'rb') as f
:
161 for daemon
, size
in daemons
.iteritems():
162 log
.info('writing logrotate stanza for {daemon}'.format(daemon
=daemon
))
163 conf
+= f
.read().format(daemon_type
=daemon
, max_size
=size
)
166 for remote
in ctx
.cluster
.remotes
.iterkeys():
167 teuthology
.write_file(remote
=remote
,
168 path
='{tdir}/logrotate.ceph-test.conf'.format(tdir
=testdir
),
175 '{tdir}/logrotate.ceph-test.conf'.format(tdir
=testdir
),
176 '/etc/logrotate.d/ceph-test.conf',
181 '/etc/logrotate.d/ceph-test.conf',
186 '/etc/logrotate.d/ceph-test.conf'
189 remote
.chcon('/etc/logrotate.d/ceph-test.conf',
190 'system_u:object_r:etc_t:s0')
192 if ctx
.config
.get('log-rotate'):
193 daemons
= ctx
.config
.get('log-rotate')
194 log
.info('Setting up log rotation with ' + str(daemons
))
195 write_rotate_conf(ctx
, daemons
)
196 logrotater
= Rotater()
202 if ctx
.config
.get('log-rotate'):
203 log
.info('Shutting down logrotate')
206 args
=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'
209 if ctx
.archive
is not None and \
210 not (ctx
.config
.get('archive-on-error') and ctx
.summary
['success']):
212 log
.info('Compressing logs...')
235 log
.info('Archiving logs...')
236 path
= os
.path
.join(ctx
.archive
, 'remote')
238 for remote
in ctx
.cluster
.remotes
.iterkeys():
239 sub
= os
.path
.join(path
, remote
.shortname
)
241 teuthology
.pull_directory(remote
, '/var/log/ceph',
242 os
.path
.join(sub
, 'log'))
def assign_devs(roles, devs):
    """
    Pair each role with a device, one-to-one, in list order.

    Extra entries in the longer of the two lists are silently dropped
    (plain zip semantics); callers handle any leftover devices themselves.

    :param roles: List of roles
    :param devs: Corresponding list of devices.
    :returns: Dictionary of devs indexed by roles.
    """
    return {role: dev for role, dev in zip(roles, devs)}
256 @contextlib.contextmanager
257 def valgrind_post(ctx
, config
):
259 After the tests run, look through all the valgrind logs. Exceptions are raised
260 if textual errors occurred in the logs, or if valgrind exceptions were detected in
264 :param config: Configuration
269 lookup_procs
= list()
270 log
.info('Checking for errors in any valgrind logs...')
271 for remote
in ctx
.cluster
.remotes
.iterkeys():
272 # look at valgrind logs for each node
278 run
.Raw('/var/log/ceph/valgrind/*'),
279 '/dev/null', # include a second file so that we always get a filename prefix on the output
289 lookup_procs
.append((proc
, remote
))
291 valgrind_exception
= None
292 for (proc
, remote
) in lookup_procs
:
294 out
= proc
.stdout
.getvalue()
295 for line
in out
.split('\n'):
299 (file, kind
) = line
.split(':')
301 log
.error('failed to split line %s', line
)
303 log
.debug('file %s kind %s', file, kind
)
304 if (file.find('mds') >= 0) and kind
.find('Lost') > 0:
306 log
.error('saw valgrind issue %s in %s', kind
, file)
307 valgrind_exception
= Exception('saw valgrind issues')
309 if config
.get('expect_valgrind_errors'):
310 if not valgrind_exception
:
311 raise Exception('expected valgrind issues and found none')
313 if valgrind_exception
:
314 raise valgrind_exception
317 @contextlib.contextmanager
318 def crush_setup(ctx
, config
):
319 cluster_name
= config
['cluster']
320 first_mon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
321 (mon_remote
,) = ctx
.cluster
.only(first_mon
).remotes
.iterkeys()
323 profile
= config
.get('crush_tunables', 'default')
324 log
.info('Setting crush tunables to %s', profile
)
326 args
=['sudo', 'ceph', '--cluster', cluster_name
,
327 'osd', 'crush', 'tunables', profile
])
331 @contextlib.contextmanager
332 def create_rbd_pool(ctx
, config
):
333 cluster_name
= config
['cluster']
334 first_mon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
335 (mon_remote
,) = ctx
.cluster
.only(first_mon
).remotes
.iterkeys()
336 log
.info('Waiting for OSDs to come up')
337 teuthology
.wait_until_osds_up(
341 ceph_cluster
=cluster_name
,
343 if config
.get('create_rbd_pool', True):
344 log
.info('Creating RBD pool')
346 args
=['sudo', 'ceph', '--cluster', cluster_name
,
347 'osd', 'pool', 'create', 'rbd', '8'])
350 'sudo', 'ceph', '--cluster', cluster_name
,
351 'osd', 'pool', 'application', 'enable',
352 'rbd', 'rbd', '--yes-i-really-mean-it'
357 @contextlib.contextmanager
358 def cephfs_setup(ctx
, config
):
359 cluster_name
= config
['cluster']
360 testdir
= teuthology
.get_testdir(ctx
)
361 coverage_dir
= '{tdir}/archive/coverage'.format(tdir
=testdir
)
363 first_mon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
364 (mon_remote
,) = ctx
.cluster
.only(first_mon
).remotes
.iterkeys()
365 mdss
= ctx
.cluster
.only(teuthology
.is_type('mds', cluster_name
))
366 # If there are any MDSs, then create a filesystem for them to use
367 # Do this last because requires mon cluster to be up and running
369 log
.info('Setting up CephFS filesystem...')
371 fs
= Filesystem(ctx
, name
='cephfs', create
=True,
372 ec_profile
=config
.get('cephfs_ec_profile', None))
374 is_active_mds
= lambda role
: 'mds.' in role
and not role
.endswith('-s') and '-s-' not in role
375 all_roles
= [item
for remote_roles
in mdss
.remotes
.values() for item
in remote_roles
]
376 num_active
= len([r
for r
in all_roles
if is_active_mds(r
)])
378 fs
.set_max_mds(num_active
)
379 fs
.set_allow_dirfrags(True)
384 @contextlib.contextmanager
385 def cluster(ctx
, config
):
387 Handle the creation and removal of a ceph cluster.
390 Create directories needed for the cluster.
391 Create remote journals for all osds.
392 Create and set keyring.
393 Copy the monmap to the test systems.
397 Add keyring information to monmaps
401 If errors occurred, extract a failure message and store in ctx.summary.
402 Unmount all test files and temporary journaling files.
403 Save the monitor information and archive all ceph logs.
404 Cleanup the keyring setup, and remove all monitor map and data files left over.
407 :param config: Configuration
409 if ctx
.config
.get('use_existing_cluster', False) is True:
410 log
.info("'use_existing_cluster' is true; skipping cluster creation")
413 testdir
= teuthology
.get_testdir(ctx
)
414 cluster_name
= config
['cluster']
415 data_dir
= '{tdir}/{cluster}.data'.format(tdir
=testdir
, cluster
=cluster_name
)
416 log
.info('Creating ceph cluster %s...', cluster_name
)
420 'install', '-d', '-m0755', '--',
431 'install', '-d', '-m0777', '--', '/var/run/ceph',
438 remote_to_roles_to_devs
= {}
439 remote_to_roles_to_journals
= {}
440 osds
= ctx
.cluster
.only(teuthology
.is_type('osd', cluster_name
))
441 for remote
, roles_for_host
in osds
.remotes
.iteritems():
442 devs
= teuthology
.get_scratch_devices(remote
)
444 roles_to_journals
= {}
446 log
.info('fs option selected, checking for scratch devs')
447 log
.info('found devs: %s' % (str(devs
),))
448 devs_id_map
= teuthology
.get_wwn_id_map(remote
, devs
)
449 iddevs
= devs_id_map
.values()
450 roles_to_devs
= assign_devs(
451 teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
), iddevs
453 if len(roles_to_devs
) < len(iddevs
):
454 iddevs
= iddevs
[len(roles_to_devs
):]
455 devs_to_clean
[remote
] = []
457 if config
.get('block_journal'):
458 log
.info('block journal enabled')
459 roles_to_journals
= assign_devs(
460 teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
), iddevs
462 log
.info('journal map: %s', roles_to_journals
)
464 if config
.get('tmpfs_journal'):
465 log
.info('tmpfs journal enabled')
466 roles_to_journals
= {}
467 remote
.run(args
=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
468 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
):
469 tmpfs
= '/mnt/' + role
470 roles_to_journals
[role
] = tmpfs
471 remote
.run(args
=['truncate', '-s', '1500M', tmpfs
])
472 log
.info('journal map: %s', roles_to_journals
)
474 log
.info('dev map: %s' % (str(roles_to_devs
),))
475 remote_to_roles_to_devs
[remote
] = roles_to_devs
476 remote_to_roles_to_journals
[remote
] = roles_to_journals
478 log
.info('Generating config...')
479 remotes_and_roles
= ctx
.cluster
.remotes
.items()
480 roles
= [role_list
for (remote
, role_list
) in remotes_and_roles
]
481 ips
= [host
for (host
, port
) in
482 (remote
.ssh
.get_transport().getpeername() for (remote
, role_list
) in remotes_and_roles
)]
483 conf
= teuthology
.skeleton_config(ctx
, roles
=roles
, ips
=ips
, cluster
=cluster_name
)
484 for remote
, roles_to_journals
in remote_to_roles_to_journals
.iteritems():
485 for role
, journal
in roles_to_journals
.iteritems():
486 name
= teuthology
.ceph_role(role
)
489 conf
[name
]['osd journal'] = journal
490 for section
, keys
in config
['conf'].iteritems():
491 for key
, value
in keys
.iteritems():
492 log
.info("[%s] %s = %s" % (section
, key
, value
))
493 if section
not in conf
:
495 conf
[section
][key
] = value
497 if config
.get('tmpfs_journal'):
498 conf
['journal dio'] = False
500 if not hasattr(ctx
, 'ceph'):
502 ctx
.ceph
[cluster_name
] = argparse
.Namespace()
503 ctx
.ceph
[cluster_name
].conf
= conf
505 default_keyring
= '/etc/ceph/{cluster}.keyring'.format(cluster
=cluster_name
)
506 keyring_path
= config
.get('keyring_path', default_keyring
)
508 coverage_dir
= '{tdir}/archive/coverage'.format(tdir
=testdir
)
510 firstmon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
512 log
.info('Setting up %s...' % firstmon
)
513 ctx
.cluster
.only(firstmon
).run(
524 ctx
.cluster
.only(firstmon
).run(
536 ctx
.cluster
.only(firstmon
).run(
544 (mon0_remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
545 monmap_path
= '{tdir}/{cluster}.monmap'.format(tdir
=testdir
,
546 cluster
=cluster_name
)
547 fsid
= teuthology
.create_simple_monmap(
553 if not 'global' in conf
:
555 conf
['global']['fsid'] = fsid
557 default_conf_path
= '/etc/ceph/{cluster}.conf'.format(cluster
=cluster_name
)
558 conf_path
= config
.get('conf_path', default_conf_path
)
559 log
.info('Writing %s for FSID %s...' % (conf_path
, fsid
))
560 write_conf(ctx
, conf_path
, cluster_name
)
562 log
.info('Creating admin key on %s...' % firstmon
)
563 ctx
.cluster
.only(firstmon
).run(
571 '--name=client.admin',
573 '--cap', 'mon', 'allow *',
574 '--cap', 'osd', 'allow *',
575 '--cap', 'mds', 'allow *',
576 '--cap', 'mgr', 'allow *',
581 log
.info('Copying monmap to all nodes...')
582 keyring
= teuthology
.get_file(
586 monmap
= teuthology
.get_file(
591 for rem
in ctx
.cluster
.remotes
.iterkeys():
592 # copy mon key and initial monmap
593 log
.info('Sending monmap to node {remote}'.format(remote
=rem
))
594 teuthology
.sudo_write_file(
600 teuthology
.write_file(
606 log
.info('Setting up mon nodes...')
607 mons
= ctx
.cluster
.only(teuthology
.is_type('mon', cluster_name
))
609 if not config
.get('skip_mgr_daemons', False):
610 log
.info('Setting up mgr nodes...')
611 mgrs
= ctx
.cluster
.only(teuthology
.is_type('mgr', cluster_name
))
612 for remote
, roles_for_host
in mgrs
.remotes
.iteritems():
613 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'mgr',
615 _
, _
, id_
= teuthology
.split_role(role
)
616 mgr_dir
= '/var/lib/ceph/mgr/{cluster}-{id}'.format(
617 cluster
=cluster_name
,
634 '--name=mgr.{id}'.format(id=id_
),
635 mgr_dir
+ '/keyring',
639 log
.info('Setting up mds nodes...')
640 mdss
= ctx
.cluster
.only(teuthology
.is_type('mds', cluster_name
))
641 for remote
, roles_for_host
in mdss
.remotes
.iteritems():
642 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'mds',
644 _
, _
, id_
= teuthology
.split_role(role
)
645 mds_dir
= '/var/lib/ceph/mds/{cluster}-{id}'.format(
646 cluster
=cluster_name
,
663 '--name=mds.{id}'.format(id=id_
),
664 mds_dir
+ '/keyring',
668 cclient
.create_keyring(ctx
, cluster_name
)
669 log
.info('Running mkfs on osd nodes...')
671 if not hasattr(ctx
, 'disk_config'):
672 ctx
.disk_config
= argparse
.Namespace()
673 if not hasattr(ctx
.disk_config
, 'remote_to_roles_to_dev'):
674 ctx
.disk_config
.remote_to_roles_to_dev
= {}
675 if not hasattr(ctx
.disk_config
, 'remote_to_roles_to_journals'):
676 ctx
.disk_config
.remote_to_roles_to_journals
= {}
677 if not hasattr(ctx
.disk_config
, 'remote_to_roles_to_dev_mount_options'):
678 ctx
.disk_config
.remote_to_roles_to_dev_mount_options
= {}
679 if not hasattr(ctx
.disk_config
, 'remote_to_roles_to_dev_fstype'):
680 ctx
.disk_config
.remote_to_roles_to_dev_fstype
= {}
682 teuthology
.deep_merge(ctx
.disk_config
.remote_to_roles_to_dev
, remote_to_roles_to_devs
)
683 teuthology
.deep_merge(ctx
.disk_config
.remote_to_roles_to_journals
, remote_to_roles_to_journals
)
685 log
.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r
=str(ctx
.disk_config
.remote_to_roles_to_dev
)))
686 for remote
, roles_for_host
in osds
.remotes
.iteritems():
687 roles_to_devs
= remote_to_roles_to_devs
[remote
]
688 roles_to_journals
= remote_to_roles_to_journals
[remote
]
690 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
):
691 _
, _
, id_
= teuthology
.split_role(role
)
692 mnt_point
= '/var/lib/ceph/osd/{cluster}-{id}'.format(cluster
=cluster_name
, id=id_
)
700 log
.info(str(roles_to_devs
))
701 log
.info(str(roles_to_journals
))
703 if roles_to_devs
.get(role
):
704 dev
= roles_to_devs
[role
]
705 fs
= config
.get('fs')
707 mkfs_options
= config
.get('mkfs_options')
708 mount_options
= config
.get('mount_options')
710 # package = 'btrfs-tools'
711 if mount_options
is None:
712 mount_options
= ['noatime', 'user_subvol_rm_allowed']
713 if mkfs_options
is None:
714 mkfs_options
= ['-m', 'single',
718 # package = 'xfsprogs'
719 if mount_options
is None:
720 mount_options
= ['noatime']
721 if mkfs_options
is None:
722 mkfs_options
= ['-f', '-i', 'size=2048']
723 if fs
== 'ext4' or fs
== 'ext3':
724 if mount_options
is None:
725 mount_options
= ['noatime', 'user_xattr']
727 if mount_options
is None:
729 if mkfs_options
is None:
731 mkfs
= ['mkfs.%s' % fs
] + mkfs_options
732 log
.info('%s on %s on %s' % (mkfs
, dev
, remote
))
733 if package
is not None:
737 'apt-get', 'install', '-y', package
743 remote
.run(args
=['yes', run
.Raw('|')] + ['sudo'] + mkfs
+ [dev
])
744 except run
.CommandFailedError
:
745 # Newer btfs-tools doesn't prompt for overwrite, use -f
746 if '-f' not in mount_options
:
747 mkfs_options
.append('-f')
748 mkfs
= ['mkfs.%s' % fs
] + mkfs_options
749 log
.info('%s on %s on %s' % (mkfs
, dev
, remote
))
750 remote
.run(args
=['yes', run
.Raw('|')] + ['sudo'] + mkfs
+ [dev
])
752 log
.info('mount %s on %s -o %s' % (dev
, remote
,
753 ','.join(mount_options
)))
759 '-o', ','.join(mount_options
),
766 'sudo', '/sbin/restorecon', mnt_point
,
770 if not remote
in ctx
.disk_config
.remote_to_roles_to_dev_mount_options
:
771 ctx
.disk_config
.remote_to_roles_to_dev_mount_options
[remote
] = {}
772 ctx
.disk_config
.remote_to_roles_to_dev_mount_options
[remote
][role
] = mount_options
773 if not remote
in ctx
.disk_config
.remote_to_roles_to_dev_fstype
:
774 ctx
.disk_config
.remote_to_roles_to_dev_fstype
[remote
] = {}
775 ctx
.disk_config
.remote_to_roles_to_dev_fstype
[remote
][role
] = fs
776 devs_to_clean
[remote
].append(mnt_point
)
778 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
):
779 _
, _
, id_
= teuthology
.split_role(role
)
793 '--monmap', monmap_path
,
797 log
.info('Reading keys from all nodes...')
800 for remote
, roles_for_host
in ctx
.cluster
.remotes
.iteritems():
801 for type_
in ['mgr', 'mds', 'osd']:
802 if type_
== 'mgr' and config
.get('skip_mgr_daemons', False):
804 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, type_
, cluster_name
):
805 _
, _
, id_
= teuthology
.split_role(role
)
806 data
= teuthology
.get_file(
808 path
='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
811 cluster
=cluster_name
,
815 keys
.append((type_
, id_
, data
))
817 for remote
, roles_for_host
in ctx
.cluster
.remotes
.iteritems():
818 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'client', cluster_name
):
819 _
, _
, id_
= teuthology
.split_role(role
)
820 data
= teuthology
.get_file(
822 path
='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_
, cluster
=cluster_name
)
824 keys
.append(('client', id_
, data
))
827 log
.info('Adding keys to all mons...')
838 teuthology
.feed_many_stdins_and_close(keys_fp
, writes
)
840 for type_
, id_
, data
in keys
:
850 '--name={type}.{id}'.format(
854 ] + list(generate_caps(type_
)),
859 log
.info('Running mkfs on mon nodes...')
860 for remote
, roles_for_host
in mons
.remotes
.iteritems():
861 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'mon', cluster_name
):
862 _
, _
, id_
= teuthology
.split_role(role
)
868 '/var/lib/ceph/mon/{cluster}-{id}'.format(id=id_
, cluster
=cluster_name
),
878 '--cluster', cluster_name
,
881 '--monmap', monmap_path
,
882 '--keyring', keyring_path
,
900 # we need to know this below
901 ctx
.summary
['success'] = False
904 (mon0_remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
906 log
.info('Checking cluster log for badness...')
908 def first_in_ceph_log(pattern
, excludes
):
910 Find the first occurrence of the pattern specified in the Ceph log,
911 Returns None if none found.
913 :param pattern: Pattern scanned for.
914 :param excludes: Patterns to ignore.
915 :return: First line of text (or None if not found)
920 '/var/log/ceph/{cluster}.log'.format(cluster
=cluster_name
),
922 for exclude
in excludes
:
923 args
.extend([run
.Raw('|'), 'egrep', '-v', exclude
])
925 run
.Raw('|'), 'head', '-n', '1',
931 stdout
= r
.stdout
.getvalue()
936 if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
937 config
['log_whitelist']) is not None:
938 log
.warning('Found errors (ERR|WRN|SEC) in cluster log')
939 ctx
.summary
['success'] = False
940 # use the most severe problem as the failure reason
941 if 'failure_reason' not in ctx
.summary
:
942 for pattern
in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
943 match
= first_in_ceph_log(pattern
, config
['log_whitelist'])
944 if match
is not None:
945 ctx
.summary
['failure_reason'] = \
946 '"{match}" in cluster log'.format(
947 match
=match
.rstrip('\n'),
951 for remote
, dirs
in devs_to_clean
.iteritems():
953 log
.info('Unmounting %s on %s' % (dir_
, remote
))
965 except Exception as e
:
968 run
.Raw('PATH=/usr/sbin:$PATH'),
975 if config
.get('tmpfs_journal'):
976 log
.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
977 for remote
, roles_for_host
in osds
.remotes
.iteritems():
979 args
=['sudo', 'umount', '-f', '/mnt'],
983 if ctx
.archive
is not None and \
984 not (ctx
.config
.get('archive-on-error') and ctx
.summary
['success']):
986 # archive mon data, too
987 log
.info('Archiving mon data...')
988 path
= os
.path
.join(ctx
.archive
, 'data')
992 if e
.errno
== errno
.EEXIST
:
996 for remote
, roles
in mons
.remotes
.iteritems():
998 is_mon
= teuthology
.is_type('mon', cluster_name
)
1000 _
, _
, id_
= teuthology
.split_role(role
)
1001 mon_dir
= '/var/lib/ceph/mon/' + \
1002 '{0}-{1}'.format(cluster_name
, id_
)
1003 teuthology
.pull_directory_tarball(
1006 path
+ '/' + role
+ '.tgz')
1008 log
.info('Cleaning ceph cluster...')
1020 run
.Raw('{tdir}/../*.pid'.format(tdir
=testdir
)),
1027 def osd_scrub_pgs(ctx
, config
):
1029 Scrub pgs when we exit.
1031 First make sure all pgs are active and clean.
1032 Next scrub all osds.
1033 Then periodically check until all pgs have scrub time stamps that
1034 indicate the last scrub completed. Time out if no progress is made
1035 here after two minutes.
1039 cluster_name
= config
['cluster']
1040 manager
= ctx
.managers
[cluster_name
]
1042 for _
in range(0, retries
):
1043 stats
= manager
.get_pg_stats()
1044 bad
= [stat
['pgid'] for stat
in stats
if 'active+clean' not in stat
['state']]
1049 "Waiting for all PGs to be active and clean, waiting on %s" % bad
)
1052 raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
1053 check_time_now
= time
.localtime()
1055 all_roles
= teuthology
.all_roles(ctx
.cluster
)
1056 for role
in teuthology
.cluster_roles_of_type(all_roles
, 'osd', cluster_name
):
1057 log
.info("Scrubbing {osd}".format(osd
=role
))
1058 _
, _
, id_
= teuthology
.split_role(role
)
1059 # allow this to fail; in certain cases the OSD might not be up
1060 # at this point. we will catch all pgs below.
1062 manager
.raw_cluster_cmd('tell', 'osd.' + id_
, 'config', 'set',
1063 'osd_debug_deep_scrub_sleep', '0');
1064 manager
.raw_cluster_cmd('osd', 'deep-scrub', id_
)
1065 except run
.CommandFailedError
:
1071 stats
= manager
.get_pg_stats()
1072 timez
= [(stat
['pgid'],stat
['last_scrub_stamp']) for stat
in stats
]
1075 for (pgid
, tmval
) in timez
:
1076 pgtm
= time
.strptime(tmval
[0:tmval
.find('.')], '%Y-%m-%d %H:%M:%S')
1077 if pgtm
> check_time_now
:
1080 log
.info('pgid %s last_scrub_stamp %s %s <= %s', pgid
, tmval
, pgtm
, check_time_now
)
1082 if thiscnt
> prev_good
:
1087 if gap_cnt
% 6 == 0:
1088 for (pgid
, tmval
) in timez
:
1089 # re-request scrub every so often in case the earlier
1090 # request was missed. do not do it everytime because
1091 # the scrub may be in progress or not reported yet and
1092 # we will starve progress.
1093 manager
.raw_cluster_cmd('pg', 'deep-scrub', pgid
)
1094 if gap_cnt
> retries
:
1095 raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
1097 log
.info('Still waiting for all pgs to be scrubbed.')
1101 @contextlib.contextmanager
1102 def run_daemon(ctx
, config
, type_
):
1104 Run daemons for a role type. Handle the startup and termination of a daemon.
1105 On startup -- set coverages, cpu_profile, valgrind values for all remotes,
1106 and a max_mds value for one mds.
1107 On cleanup -- Stop all existing daemons of this type.
1110 :param config: Configuration
1111 :param type_: Role type
1113 cluster_name
= config
['cluster']
1114 log
.info('Starting %s daemons in cluster %s...', type_
, cluster_name
)
1115 testdir
= teuthology
.get_testdir(ctx
)
1116 daemons
= ctx
.cluster
.only(teuthology
.is_type(type_
, cluster_name
))
1118 # check whether any daemons if this type are configured
1121 coverage_dir
= '{tdir}/archive/coverage'.format(tdir
=testdir
)
1123 daemon_signal
= 'kill'
1124 if config
.get('coverage') or config
.get('valgrind') is not None:
1125 daemon_signal
= 'term'
1127 # create osds in order. (this only matters for pre-luminous, which might
1128 # be hammer, which doesn't take an id_ argument to legacy 'osd create').
1130 for remote
, roles_for_host
in daemons
.remotes
.iteritems():
1131 is_type_
= teuthology
.is_type(type_
, cluster_name
)
1132 for role
in roles_for_host
:
1133 if not is_type_(role
):
1135 _
, _
, id_
= teuthology
.split_role(role
)
1139 datadir
='/var/lib/ceph/osd/{cluster}-{id}'.format(
1140 cluster
=cluster_name
, id=id_
)
1141 osd_uuid
= teuthology
.get_file(
1143 path
=datadir
+ '/fsid',
1146 osd_uuids
[id_
] = osd_uuid
1147 for osd_id
in range(len(osd_uuids
)):
1149 osd_uuid
= osd_uuids
.get(id_
)
1153 'sudo', 'ceph', '--cluster', cluster_name
,
1154 'osd', 'new', osd_uuid
, id_
,
1158 # fallback to pre-luminous (hammer or jewel)
1161 'sudo', 'ceph', '--cluster', cluster_name
,
1162 'osd', 'create', osd_uuid
,
1165 if config
.get('add_osds_to_crush'):
1168 'sudo', 'ceph', '--cluster', cluster_name
,
1169 'osd', 'crush', 'create-or-move', 'osd.' + id_
,
1170 '1.0', 'host=localhost', 'root=default',
1174 for remote
, roles_for_host
in daemons
.remotes
.iteritems():
1175 is_type_
= teuthology
.is_type(type_
, cluster_name
)
1176 for role
in roles_for_host
:
1177 if not is_type_(role
):
1179 _
, _
, id_
= teuthology
.split_role(role
)
1190 'ceph-%s' % (type_
),
1192 '--cluster', cluster_name
,
1195 if type_
in config
.get('cpu_profile', []):
1196 profile_path
= '/var/log/ceph/profiling-logger/%s.prof' % (role
)
1197 run_cmd
.extend(['env', 'CPUPROFILE=%s' % profile_path
])
1199 if config
.get('valgrind') is not None:
1200 valgrind_args
= None
1201 if type_
in config
['valgrind']:
1202 valgrind_args
= config
['valgrind'][type_
]
1203 if role
in config
['valgrind']:
1204 valgrind_args
= config
['valgrind'][role
]
1205 run_cmd
= teuthology
.get_valgrind_args(testdir
, role
,
1209 run_cmd
.extend(run_cmd_tail
)
1211 # always register mgr; don't necessarily start
1212 ctx
.daemons
.register_daemon(
1214 cluster
=cluster_name
,
1216 logger
=log
.getChild(role
),
1220 if type_
!= 'mgr' or not config
.get('skip_mgr_daemons', False):
1221 role
= cluster_name
+ '.' + type_
1222 ctx
.daemons
.get_daemon(type_
, id_
, cluster_name
).restart()
1227 teuthology
.stop_daemons_of_type(ctx
, type_
, cluster_name
)
1230 def healthy(ctx
, config
):
1232 Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK.
1235 :param config: Configuration
1237 config
= config
if isinstance(config
, dict) else dict()
1238 cluster_name
= config
.get('cluster', 'ceph')
1239 log
.info('Waiting until %s daemons up and pgs clean...', cluster_name
)
1240 manager
= ctx
.managers
[cluster_name
]
1242 manager
.wait_for_mgr_available(timeout
=30)
1243 except (run
.CommandFailedError
, AssertionError) as e
:
1244 log
.info('ignoring mgr wait error, probably testing upgrade: %s', e
)
1246 firstmon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
1247 (mon0_remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
1248 teuthology
.wait_until_osds_up(
1250 cluster
=ctx
.cluster
,
1252 ceph_cluster
=cluster_name
,
1256 manager
.flush_all_pg_stats()
1257 except (run
.CommandFailedError
, Exception) as e
:
1258 log
.info('ignoring flush pg stats error, probably testing upgrade: %s', e
)
1259 manager
.wait_for_clean()
1261 if config
.get('wait-for-healthy', True):
1262 log
.info('Waiting until ceph cluster %s is healthy...', cluster_name
)
1263 teuthology
.wait_until_healthy(
1266 ceph_cluster
=cluster_name
,
1269 if ctx
.cluster
.only(teuthology
.is_type('mds', cluster_name
)).remotes
:
1270 # Some MDSs exist, wait for them to be healthy
1271 ceph_fs
= Filesystem(ctx
) # TODO: make Filesystem cluster-aware
1272 ceph_fs
.wait_for_daemons(timeout
=300)
1275 def wait_for_osds_up(ctx
, config
):
1277 Wait for all osd's to come up.
1280 :param config: Configuration
1282 log
.info('Waiting until ceph osds are all up...')
1283 cluster_name
= config
.get('cluster', 'ceph')
1284 firstmon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
1285 (mon0_remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
1286 teuthology
.wait_until_osds_up(
1288 cluster
=ctx
.cluster
,
1293 def wait_for_mon_quorum(ctx
, config
):
1295 Check remote ceph status until all monitors are up.
1298 :param config: Configuration
1300 if isinstance(config
, dict):
1301 mons
= config
['daemons']
1302 cluster_name
= config
.get('cluster', 'ceph')
1304 assert isinstance(config
, list)
1306 cluster_name
= 'ceph'
1307 firstmon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
1308 (remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
1309 with contextutil
.safe_while(sleep
=10, tries
=60,
1310 action
='wait for monitor quorum') as proceed
:
1319 logger
=log
.getChild('quorum_status'),
1321 j
= json
.loads(r
.stdout
.getvalue())
1322 q
= j
.get('quorum_names', [])
1323 log
.debug('Quorum: %s', q
)
1324 if sorted(q
) == sorted(mons
):
1328 def created_pool(ctx
, config
):
1330 Add new pools to the dictionary of pools that the ceph-manager
1333 for new_pool
in config
:
1334 if new_pool
not in ctx
.managers
['ceph'].pools
:
1335 ctx
.managers
['ceph'].pools
[new_pool
] = ctx
.managers
['ceph'].get_pool_property(
1339 @contextlib.contextmanager
1340 def restart(ctx
, config
):
1342 restart ceph daemons
1346 - ceph.restart: [all]
1350 - ceph.restart: [osd.0, mon.1, mds.*]
1356 daemons: [osd.0, mon.1]
1357 wait-for-healthy: false
1358 wait-for-osds-up: true
1361 :param config: Configuration
1365 elif isinstance(config
, list):
1366 config
= {'daemons': config
}
1368 daemons
= ctx
.daemons
.resolve_role_list(config
.get('daemons', None), CEPH_ROLE_TYPES
, True)
1370 for role
in daemons
:
1371 cluster
, type_
, id_
= teuthology
.split_role(role
)
1372 ctx
.daemons
.get_daemon(type_
, id_
, cluster
).restart()
1373 clusters
.add(cluster
)
1375 manager
= ctx
.managers
['ceph']
1376 for dmon
in daemons
:
1378 dm_parts
= dmon
.split('.')
1379 if dm_parts
[1].isdigit():
1380 if dm_parts
[0] == 'osd':
1381 manager
.mark_down_osd(int(dm_parts
[1]))
1383 if config
.get('wait-for-healthy', True):
1384 for cluster
in clusters
:
1385 healthy(ctx
=ctx
, config
=dict(cluster
=cluster
))
1386 if config
.get('wait-for-osds-up', False):
1387 for cluster
in clusters
:
1388 wait_for_osds_up(ctx
=ctx
, config
=dict(cluster
=cluster
))
1392 @contextlib.contextmanager
1393 def stop(ctx
, config
):
1399 - ceph.stop: [mds.*]
1402 - ceph.stop: [osd.0, osd.2]
1406 daemons: [osd.0, osd.2]
1411 elif isinstance(config
, list):
1412 config
= {'daemons': config
}
1414 daemons
= ctx
.daemons
.resolve_role_list(config
.get('daemons', None), CEPH_ROLE_TYPES
, True)
1415 for role
in daemons
:
1416 cluster
, type_
, id_
= teuthology
.split_role(role
)
1417 ctx
.daemons
.get_daemon(type_
, id_
, cluster
).stop()
@contextlib.contextmanager
def wait_for_failure(ctx, config):
    """
    Wait for a failure of a ceph daemon

    For example::
      tasks:
      - ceph.wait_for_failure: [mds.*]

      tasks:
      - ceph.wait_for_failure: [osd.0, osd.2]

      tasks:
      - ceph.wait_for_failure:
          daemons: [osd.0, osd.2]

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(
        config.get('daemons', None), CEPH_ROLE_TYPES, True)
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        try:
            ctx.daemons.get_daemon(type_, id_, cluster).wait()
        except Exception:
            # A failing wait() is the expected outcome: the daemon died.
            # Catch Exception rather than a bare except so that
            # KeyboardInterrupt / SystemExit still propagate and can
            # abort the run.
            log.info('Saw expected daemon failure. Continuing.')
        else:
            raise RuntimeError('daemon %s did not fail' % role)

    yield
def validate_config(ctx, config):
    """
    Perform some simple validation on task configuration.
    Raises exceptions.ConfigError if an error is found.
    """
    # A single host must not carry osds belonging to two different clusters.
    for remote, host_roles in ctx.cluster.remotes.items():
        prev_cluster = None
        prev_role = None
        for role in host_roles:
            role_cluster, role_type, _ = teuthology.split_role(role)
            if role_type != 'osd':
                continue
            if prev_cluster and prev_cluster != role_cluster:
                msg = "Host should not have osds (%s and %s) from multiple clusters" % (
                    prev_role, role)
                raise exceptions.ConfigError(msg)
            prev_cluster = role_cluster
            prev_role = role
1479 @contextlib.contextmanager
1480 def task(ctx
, config
):
1482 Set up and tear down a Ceph cluster.
1490 You can also specify what branch to run::
1506 sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed
1508 Or a local source dir::
1512 path: /home/sage/ceph
1514 To capture code coverage data, use::
1520 To use btrfs, ext4, or xfs on the target's scratch disks, use::
1525 mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
1526 mount_options: [nobarrier, inode64]
1528 Note, this will cause the task to check the /scratch_devs file on each node
1529 for available devices. If no such file is found, /dev/sdb will be used.
1531 To run some daemons under valgrind, include their names
1532 and the tool/args to use in a valgrind section::
1537 mds.1: --tool=memcheck
1538 osd.1: [--tool=memcheck, --leak-check=no]
1540 Those nodes which are using memcheck or valgrind will get
1541 checked for bad results.
1543 To adjust or modify config options, use::
1558 other key: other value
1563 By default, the cluster log is checked for errors and warnings,
1564 and the run marked failed if any appear. You can ignore log
1565 entries by giving a list of egrep compatible regexes, i.e.:
1569 log-whitelist: ['foo.*bar', 'bad message']
1571 To run multiple ceph clusters, use multiple ceph tasks, and roles
1572 with a cluster name prefix, e.g. cluster1.client.0. Roles with no
1573 cluster use the default cluster name, 'ceph'. OSDs from separate
1574 clusters must be on separate hosts. Clients and non-osd daemons
1575 from multiple clusters may be colocated. For each cluster, add an
1576 instance of the ceph task with the cluster name specified, e.g.::
1579 - [mon.a, osd.0, osd.1]
1580 - [backup.mon.a, backup.osd.0, backup.osd.1]
1581 - [client.0, backup.client.0]
1589 :param config: Configuration
1594 assert isinstance(config
, dict), \
1595 "task ceph only supports a dictionary for configuration"
1597 overrides
= ctx
.config
.get('overrides', {})
1598 teuthology
.deep_merge(config
, overrides
.get('ceph', {}))
1600 first_ceph_cluster
= False
1601 if not hasattr(ctx
, 'daemons'):
1602 first_ceph_cluster
= True
1603 ctx
.daemons
= DaemonGroup()
1605 testdir
= teuthology
.get_testdir(ctx
)
1606 if config
.get('coverage'):
1607 coverage_dir
= '{tdir}/archive/coverage'.format(tdir
=testdir
)
1608 log
.info('Creating coverage directory...')
1612 'install', '-d', '-m0755', '--',
1619 if 'cluster' not in config
:
1620 config
['cluster'] = 'ceph'
1622 validate_config(ctx
, config
)
1625 if first_ceph_cluster
:
1626 # these tasks handle general log setup and parsing on all hosts,
1627 # so they should only be run once
1629 lambda: ceph_log(ctx
=ctx
, config
=None),
1630 lambda: valgrind_post(ctx
=ctx
, config
=config
),
1634 lambda: cluster(ctx
=ctx
, config
=dict(
1635 conf
=config
.get('conf', {}),
1636 fs
=config
.get('fs', 'xfs'),
1637 mkfs_options
=config
.get('mkfs_options', None),
1638 mount_options
=config
.get('mount_options', None),
1639 block_journal
=config
.get('block_journal', None),
1640 tmpfs_journal
=config
.get('tmpfs_journal', None),
1641 skip_mgr_daemons
=config
.get('skip_mgr_daemons', False),
1642 log_whitelist
=config
.get('log-whitelist', []),
1643 cpu_profile
=set(config
.get('cpu_profile', []),),
1644 cluster
=config
['cluster'],
1646 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='mon'),
1647 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='mgr'),
1648 lambda: crush_setup(ctx
=ctx
, config
=config
),
1649 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='osd'),
1650 lambda: create_rbd_pool(ctx
=ctx
, config
=config
),
1651 lambda: cephfs_setup(ctx
=ctx
, config
=config
),
1652 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='mds'),
1655 with contextutil
.nested(*subtasks
):
1656 first_mon
= teuthology
.get_first_mon(ctx
, config
, config
['cluster'])
1657 (mon
,) = ctx
.cluster
.only(first_mon
).remotes
.iterkeys()
1658 if not hasattr(ctx
, 'managers'):
1660 ctx
.managers
[config
['cluster']] = CephManager(
1663 logger
=log
.getChild('ceph_manager.' + config
['cluster']),
1664 cluster
=config
['cluster'],
1668 if config
.get('wait-for-healthy', True):
1669 healthy(ctx
=ctx
, config
=dict(cluster
=config
['cluster']))
1673 if config
.get('wait-for-scrub', True):
1674 osd_scrub_pgs(ctx
, config
)
1676 # stop logging health to clog during shutdown, or else we generate
1677 # a bunch of scary messages unrelated to our actual run.
1678 firstmon
= teuthology
.get_first_mon(ctx
, config
, config
['cluster'])
1679 (mon0_remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
1684 '--cluster', config
['cluster'],
1689 '--no-mon-health-to-clog',