"""
Handle the setup, starting, and clean-up of a Ceph cluster.
"""
from cStringIO import StringIO

import argparse
import contextlib
import errno
import json
import logging
import os
import socket
import time

import gevent

from paramiko import SSHException
from ceph_manager import CephManager, write_conf
from tasks.cephfs.filesystem import Filesystem
from teuthology import misc as teuthology
from teuthology import contextutil
from teuthology import exceptions
from teuthology.orchestra import run
import ceph_client as cclient
from teuthology.orchestra.daemon import DaemonGroup

CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']

log = logging.getLogger(__name__)


def generate_caps(type_):
    """
    Each call will return the next capability for each system type
    (essentially a subset of possible role values). Valid types are osd,
    mgr, mds, and client.
    """
    defaults = dict(
        # per-type capability tables ({subsystem: capability}) go here
    )
    for subsystem, capability in defaults[type_].items():
        yield '--cap'
        yield subsystem
        yield capability
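
# A minimal usage sketch (hypothetical caps shown): generate_caps() yields
# ceph-authtool-style argument triples, so a caller can splice them straight
# into a command line:
#
#   list(generate_caps('client'))
#   # -> ['--cap', 'mon', 'allow rw', '--cap', 'osd', 'allow rwx', ...]
#
# This is how the key-registration loop in cluster() below consumes it.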


@contextlib.contextmanager
def ceph_log(ctx, config):
    """
    Create /var/log/ceph log directory that is open to everyone.
    Add valgrind and profiling-logger directories.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Making ceph log dir writeable by non-root...')
    run.wait(
        ctx.cluster.run(
            args=['sudo', 'chmod', '777', '/var/log/ceph'],
            wait=False,
        )
    )
    log.info('Disabling ceph logrotate...')
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo', 'rm', '-f', '--',
                '/etc/logrotate.d/ceph',
            ],
            wait=False,
        )
    )
    log.info('Creating extra log directories...')
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'install', '-d', '-m0777', '--',
                '/var/log/ceph/valgrind',
                '/var/log/ceph/profiling-logger',
            ],
            wait=False,
        )
    )

    class Rotater(object):
        stop_event = gevent.event.Event()

        def invoke_logrotate(self):
            # 1) install ceph-test.conf in /etc/logrotate.d
            # 2) continuously loop over logrotate invocation with ceph-test.conf
            while not self.stop_event.is_set():
                self.stop_event.wait(timeout=30)
                try:
                    run.wait(
                        ctx.cluster.run(
                            args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'],
                            wait=False,
                        )
                    )
                except exceptions.ConnectionLostError as e:
                    # Some tests may power off nodes during test, in which
                    # case we will see connection errors that we should ignore.
                    log.debug("Missed logrotate, node '{0}' is offline".format(
                        e.node))
                except EOFError:
                    # Paramiko sometimes raises this when it fails to
                    # connect to a node during open_session. As with
                    # ConnectionLostError, we ignore this because nodes
                    # are allowed to get power cycled during tests.
                    log.debug("Missed logrotate, EOFError")
                except SSHException:
                    log.debug("Missed logrotate, SSHException")
                except socket.error as e:
                    if e.errno == errno.EHOSTUNREACH:
                        log.debug("Missed logrotate, host unreachable")
                    else:
                        raise

        def begin(self):
            self.thread = gevent.spawn(self.invoke_logrotate)

        def end(self):
            self.stop_event.set()
            self.thread.get()
    def write_rotate_conf(ctx, daemons):
        testdir = teuthology.get_testdir(ctx)
        rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf')
        with file(rotate_conf_path, 'rb') as f:
            conf = ""
            for daemon, size in daemons.iteritems():
                log.info('writing logrotate stanza for {daemon}'.format(daemon=daemon))
                conf += f.read().format(daemon_type=daemon, max_size=size)
                f.seek(0, 0)

        for remote in ctx.cluster.remotes.iterkeys():
            teuthology.write_file(remote=remote,
                                  path='{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
                                  data=StringIO(conf),
                                  )
            remote.run(
                args=[
                    'sudo', 'mv',
                    '{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
                    '/etc/logrotate.d/ceph-test.conf',
                    run.Raw('&&'),
                    'sudo', 'chmod', '0644',
                    '/etc/logrotate.d/ceph-test.conf',
                    run.Raw('&&'),
                    'sudo', 'chown', 'root.root',
                    '/etc/logrotate.d/ceph-test.conf'
                ],
            )
            remote.chcon('/etc/logrotate.d/ceph-test.conf',
                         'system_u:object_r:etc_t:s0')
    if ctx.config.get('log-rotate'):
        daemons = ctx.config.get('log-rotate')
        log.info('Setting up log rotation with ' + str(daemons))
        write_rotate_conf(ctx, daemons)
        logrotater = Rotater()
        logrotater.begin()
    try:
        yield
    finally:
        if ctx.config.get('log-rotate'):
            log.info('Shutting down logrotate')
            logrotater.end()
            ctx.cluster.run(
                args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'],
            )
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo', 'find', '/var/log/ceph', '-name', '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo', 'xargs', '-0', '--no-run-if-empty', '--',
                        'gzip', '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            os.makedirs(path)
            for remote in ctx.cluster.remotes.iterkeys():
                sub = os.path.join(path, remote.shortname)
                os.makedirs(sub)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))
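
# A hedged configuration sketch: log rotation is driven by a 'log-rotate'
# mapping of daemon type to maximum size in the job YAML, e.g.
# (hypothetical values):
#
#   log-rotate:
#     ceph-osd: 10G
#     ceph-mon: 10G
#
# write_rotate_conf() turns each entry into a stanza of
# /etc/logrotate.d/ceph-test.conf, and Rotater() then invokes logrotate
# against that file roughly every 30 seconds until teardown.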


def assign_devs(roles, devs):
    """
    Create a dictionary of devs indexed by roles

    :param roles: List of roles
    :param devs: Corresponding list of devices.
    :returns: Dictionary of devs indexed by roles.
    """
    return dict(zip(roles, devs))
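
# For example:
#
#   assign_devs(['osd.0', 'osd.1'], ['/dev/sdb', '/dev/sdc'])
#   # -> {'osd.0': '/dev/sdb', 'osd.1': '/dev/sdc'}
#
# zip() stops at the shorter list, so surplus devices are simply unused.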


@contextlib.contextmanager
def valgrind_post(ctx, config):
    """
    After the tests run, look through all the valgrind logs. Exceptions are raised
    if textual errors occurred in the logs, or if valgrind exceptions were detected in
    the logs.

    :param ctx: Context
    :param config: Configuration
    """
    try:
        yield
    finally:
        lookup_procs = list()
        log.info('Checking for errors in any valgrind logs...')
        for remote in ctx.cluster.remotes.iterkeys():
            # look at valgrind logs for each node
            proc = remote.run(
                args=[
                    'sudo',
                    'zgrep',
                    '<kind>',
                    run.Raw('/var/log/ceph/valgrind/*'),
                    '/dev/null',  # include a second file so that we always get a filename prefix on the output
                    run.Raw('|'),
                    'sort',
                    run.Raw('|'),
                    'uniq',
                ],
                wait=False,
                check_status=False,
                stdout=StringIO(),
            )
            lookup_procs.append((proc, remote))

        valgrind_exception = None
        for (proc, remote) in lookup_procs:
            proc.wait()
            out = proc.stdout.getvalue()
            for line in out.split('\n'):
                if line == '':
                    continue
                try:
                    (file, kind) = line.split(':')
                except ValueError:
                    log.error('failed to split line %s', line)
                    raise
                log.debug('file %s kind %s', file, kind)
                if (file.find('mds') >= 0) and kind.find('Lost') > 0:
                    continue
                log.error('saw valgrind issue %s in %s', kind, file)
                valgrind_exception = Exception('saw valgrind issues')

        if config.get('expect_valgrind_errors'):
            if not valgrind_exception:
                raise Exception('expected valgrind issues and found none')
        else:
            if valgrind_exception:
                raise valgrind_exception
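
# The parsing loop above assumes zgrep emits '<filename>:<match>' pairs,
# one per line, e.g. (hypothetical):
#
#   /var/log/ceph/valgrind/osd.0.log.gz:<kind>Leak_DefinitelyLost</kind>
#
# The '/dev/null' second argument guarantees the filename prefix even when
# only a single log file matches.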


@contextlib.contextmanager
def crush_setup(ctx, config):
    cluster_name = config['cluster']
    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    profile = config.get('crush_tunables', 'default')
    log.info('Setting crush tunables to %s', profile)
    mon_remote.run(
        args=['sudo', 'ceph', '--cluster', cluster_name,
              'osd', 'crush', 'tunables', profile])
    yield
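
# A hedged usage sketch: the profile comes straight from the task YAML,
# e.g.
#
#   tasks:
#   - ceph:
#       crush_tunables: optimal
#
# Any profile name that 'ceph osd crush tunables' accepts will work here;
# 'default' is used when the key is absent.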


@contextlib.contextmanager
def cephfs_setup(ctx, config):
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because it requires the mon cluster to be up and running
    if mdss.remotes:
        log.info('Setting up CephFS filesystem...')

        fs = Filesystem(ctx, create='cephfs')

        is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
        all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
        num_active = len([r for r in all_roles if is_active_mds(r)])

        fs.set_allow_multimds(True)
        fs.set_max_mds(num_active)
        fs.set_allow_dirfrags(True)

    yield
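
# Note: is_active_mds() relies on the role-naming convention used by these
# suites, where standby MDS roles carry an '-s' suffix (e.g. 'mds.a-s' as
# the standby for 'mds.a'); only names without the suffix count toward
# max_mds.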


@contextlib.contextmanager
def cluster(ctx, config):
    """
    Handle the creation and removal of a ceph cluster.

    On startup:
        Create directories needed for the cluster.
        Create remote journals for all osds.
        Create and set keyring.
        Copy the monmap to the test systems.
        Setup mon nodes.
        Setup mds nodes.
        Mkfs osd nodes.
        Add keyring information to monmaps
        Mkfs mon nodes.

    On exit:
        If errors occurred, extract a failure message and store in ctx.summary.
        Unmount all test files and temporary journaling files.
        Save the monitor information and archive all ceph logs.
        Cleanup the keyring setup, and remove all monitor map and data files left over.

    :param ctx: Context
    :param config: Configuration
    """
    if ctx.config.get('use_existing_cluster', False) is True:
        log.info("'use_existing_cluster' is true; skipping cluster creation")
        yield

    testdir = teuthology.get_testdir(ctx)
    cluster_name = config['cluster']
    data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
    log.info('Creating ceph cluster %s...', cluster_name)
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                data_dir,
            ],
            wait=False,
        )
    )

    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'install', '-d', '-m0777', '--', '/var/run/ceph',
            ],
            wait=False,
        )
    )
    devs_to_clean = {}
    remote_to_roles_to_devs = {}
    remote_to_roles_to_journals = {}
    osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
    for remote, roles_for_host in osds.remotes.iteritems():
        devs = teuthology.get_scratch_devices(remote)
        roles_to_devs = {}
        roles_to_journals = {}
        if config.get('fs'):
            log.info('fs option selected, checking for scratch devs')
            log.info('found devs: %s' % (str(devs),))
            devs_id_map = teuthology.get_wwn_id_map(remote, devs)
            iddevs = devs_id_map.values()
            roles_to_devs = assign_devs(
                teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
            )
            if len(roles_to_devs) < len(iddevs):
                iddevs = iddevs[len(roles_to_devs):]
            devs_to_clean[remote] = []

        if config.get('block_journal'):
            log.info('block journal enabled')
            roles_to_journals = assign_devs(
                teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
            )
            log.info('journal map: %s', roles_to_journals)

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled')
            roles_to_journals = {}
            remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
            for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
                tmpfs = '/mnt/' + role
                roles_to_journals[role] = tmpfs
                remote.run(args=['truncate', '-s', '1500M', tmpfs])
            log.info('journal map: %s', roles_to_journals)

        log.info('dev map: %s' % (str(roles_to_devs),))
        remote_to_roles_to_devs[remote] = roles_to_devs
        remote_to_roles_to_journals[remote] = roles_to_journals
    log.info('Generating config...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [role_list for (remote, role_list) in remotes_and_roles]
    ips = [host for (host, port) in
           (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
    conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips, cluster=cluster_name)
    for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
        for role, journal in roles_to_journals.iteritems():
            name = teuthology.ceph_role(role)
            if name not in conf:
                conf[name] = {}
            conf[name]['osd journal'] = journal
    for section, keys in config['conf'].iteritems():
        for key, value in keys.iteritems():
            log.info("[%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    if config.get('tmpfs_journal'):
        conf['journal dio'] = False
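
    # At this point conf is a dict-of-dicts keyed by ini section; e.g. a
    # journal assignment above ends up as (hypothetical)
    # conf['osd.0']['osd journal'] = '/mnt/osd.0', with any per-section
    # overrides from the task's 'conf' stanza layered on top before
    # write_conf() pushes the file out below.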
    if not hasattr(ctx, 'ceph'):
        ctx.ceph = {}
    ctx.ceph[cluster_name] = argparse.Namespace()
    ctx.ceph[cluster_name].conf = conf

    default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
    keyring_path = config.get('keyring_path', default_keyring)

    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)

    log.info('Setting up %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--create-keyring',
            keyring_path,
        ],
    )
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--gen-key',
            '--name=mon.',
            keyring_path,
        ],
    )
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'chmod',
            '0644',
            keyring_path,
        ],
    )
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
                                                   cluster=cluster_name)
    fsid = teuthology.create_simple_monmap(
        ctx,
        remote=mon0_remote,
        conf=conf,
        path=monmap_path,
    )
    if not 'global' in conf:
        conf['global'] = {}
    conf['global']['fsid'] = fsid

    default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
    conf_path = config.get('conf_path', default_conf_path)
    log.info('Writing %s for FSID %s...' % (conf_path, fsid))
    write_conf(ctx, conf_path, cluster_name)

    log.info('Creating admin key on %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--gen-key',
            '--name=client.admin',
            '--cap', 'mon', 'allow *',
            '--cap', 'osd', 'allow *',
            '--cap', 'mds', 'allow *',
            '--cap', 'mgr', 'allow *',
            keyring_path,
        ],
    )
    log.info('Copying monmap to all nodes...')
    keyring = teuthology.get_file(
        remote=mon0_remote,
        path=keyring_path,
    )
    monmap = teuthology.get_file(
        remote=mon0_remote,
        path=monmap_path,
    )

    for rem in ctx.cluster.remotes.iterkeys():
        # copy mon key and initial monmap
        log.info('Sending monmap to node {remote}'.format(remote=rem))
        teuthology.sudo_write_file(
            remote=rem,
            path=keyring_path,
            data=keyring,
            perms='0644',
        )
        teuthology.write_file(
            remote=rem,
            path=monmap_path,
            data=monmap,
        )
    log.info('Setting up mon nodes...')
    mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))
    osdmap_path = '{tdir}/{cluster}.osdmap'.format(tdir=testdir,
                                                   cluster=cluster_name)
    run.wait(
        mons.run(
            args=[
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'osdmaptool',
                '-c', conf_path,
                '--clobber',
                '--createsimple', '{num:d}'.format(
                    num=teuthology.num_instances_of_type(ctx.cluster, 'osd',
                                                         cluster_name),
                ),
                osdmap_path,
            ],
            wait=False,
        ),
    )
    if not config.get('skip_mgr_daemons', False):
        log.info('Setting up mgr nodes...')
        mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name))
        for remote, roles_for_host in mgrs.remotes.iteritems():
            for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr',
                                                         cluster_name):
                _, _, id_ = teuthology.split_role(role)
                mgr_dir = '/var/lib/ceph/mgr/{cluster}-{id}'.format(
                    cluster=cluster_name,
                    id=id_,
                )
                remote.run(
                    args=[
                        'sudo',
                        'mkdir', '-p', mgr_dir,
                        run.Raw('&&'),
                        'sudo',
                        'adjust-ulimits',
                        'ceph-coverage',
                        coverage_dir,
                        'ceph-authtool',
                        '--create-keyring',
                        '--gen-key',
                        '--name=mgr.{id}'.format(id=id_),
                        mgr_dir + '/keyring',
                    ],
                )
    log.info('Setting up mds nodes...')
    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
    for remote, roles_for_host in mdss.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mds_dir = '/var/lib/ceph/mds/{cluster}-{id}'.format(
                cluster=cluster_name,
                id=id_,
            )
            remote.run(
                args=[
                    'sudo',
                    'mkdir', '-p', mds_dir,
                    run.Raw('&&'),
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    '--name=mds.{id}'.format(id=id_),
                    mds_dir + '/keyring',
                ],
            )

    cclient.create_keyring(ctx, cluster_name)
    log.info('Running mkfs on osd nodes...')
    if not hasattr(ctx, 'disk_config'):
        ctx.disk_config = argparse.Namespace()
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
        ctx.disk_config.remote_to_roles_to_dev = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_journals'):
        ctx.disk_config.remote_to_roles_to_journals = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
        ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
        ctx.disk_config.remote_to_roles_to_dev_fstype = {}

    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)
    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals, remote_to_roles_to_journals)

    log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
    for remote, roles_for_host in osds.remotes.iteritems():
        roles_to_devs = remote_to_roles_to_devs[remote]
        roles_to_journals = remote_to_roles_to_journals[remote]

        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mnt_point = '/var/lib/ceph/osd/{cluster}-{id}'.format(cluster=cluster_name, id=id_)
            remote.run(
                args=[
                    'sudo',
                    'mkdir', '-p', mnt_point,
                ],
            )
            log.info(str(roles_to_journals))
            if roles_to_devs.get(role):
                dev = roles_to_devs[role]
                fs = config.get('fs')
                package = None
                mkfs_options = config.get('mkfs_options')
                mount_options = config.get('mount_options')
                if fs == 'btrfs':
                    # package = 'btrfs-tools'
                    if mount_options is None:
                        mount_options = ['noatime', 'user_subvol_rm_allowed']
                    if mkfs_options is None:
                        mkfs_options = ['-m', 'single']
                if fs == 'xfs':
                    # package = 'xfsprogs'
                    if mount_options is None:
                        mount_options = ['noatime']
                    if mkfs_options is None:
                        mkfs_options = ['-f', '-i', 'size=2048']
                if fs == 'ext4' or fs == 'ext3':
                    if mount_options is None:
                        mount_options = ['noatime', 'user_xattr']

                if mount_options is None:
                    mount_options = []
                if mkfs_options is None:
                    mkfs_options = []
                mkfs = ['mkfs.%s' % fs] + mkfs_options
                log.info('%s on %s on %s' % (mkfs, dev, remote))
                if package is not None:
                    remote.run(
                        args=[
                            'sudo',
                            'apt-get', 'install', '-y', package,
                        ],
                        stdout=StringIO(),
                    )

                try:
                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
                except run.CommandFailedError:
                    # Newer btrfs-tools doesn't prompt for overwrite, use -f
                    if '-f' not in mkfs_options:
                        mkfs_options.append('-f')
                        mkfs = ['mkfs.%s' % fs] + mkfs_options
                        log.info('%s on %s on %s' % (mkfs, dev, remote))
                        remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])

                log.info('mount %s on %s -o %s' % (dev, remote,
                                                   ','.join(mount_options)))
                remote.run(
                    args=[
                        'sudo',
                        'mount',
                        '-t', fs,
                        '-o', ','.join(mount_options),
                        dev,
                        mnt_point,
                    ],
                )
                remote.run(
                    args=[
                        'sudo', '/sbin/restorecon', mnt_point,
                    ],
                    check_status=False,
                )
                if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
                    ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
                if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
                    ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
                devs_to_clean[remote].append(mnt_point)

        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
            _, _, id_ = teuthology.split_role(role)
            remote.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-osd',
                    '--cluster', cluster_name,
                    '--mkfs',
                    '--mkkey',
                    '-i', id_,
                    '--monmap', monmap_path,
                ],
            )
    log.info('Reading keys from all nodes...')
    keys_fp = StringIO()
    keys = []
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['mgr', 'mds', 'osd']:
            if type_ == 'mgr' and config.get('skip_mgr_daemons', False):
                continue
            for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
                _, _, id_ = teuthology.split_role(role)
                data = teuthology.get_file(
                    remote=remote,
                    path='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
                        type=type_,
                        id=id_,
                        cluster=cluster_name,
                    ),
                    sudo=True,
                )
                keys.append((type_, id_, data))
                keys_fp.write(data)
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', cluster_name):
            _, _, id_ = teuthology.split_role(role)
            data = teuthology.get_file(
                remote=remote,
                path='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_, cluster=cluster_name),
            )
            keys.append(('client', id_, data))
            keys_fp.write(data)
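
    # 'keys' now holds (type, id, keyring-text) tuples for every mgr, mds,
    # osd, and client role; the next block replays them into the mon
    # keyring, first in bulk via tee and then entity-by-entity so each one
    # gets the caps produced by generate_caps().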
    log.info('Adding keys to all mons...')
    writes = mons.run(
        args=[
            'sudo', 'tee', '-a',
            keyring_path,
        ],
        stdin=run.PIPE,
        wait=False,
        stdout=StringIO(),
    )
    keys_fp.seek(0)
    teuthology.feed_many_stdins_and_close(keys_fp, writes)
    run.wait(writes)
    for type_, id_, data in keys:
        run.wait(
            mons.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    keyring_path,
                    '--name={type}.{id}'.format(
                        type=type_,
                        id=id_,
                    ),
                ] + list(generate_caps(type_)),
                wait=False,
            ),
        )

    log.info('Running mkfs on mon nodes...')
    for remote, roles_for_host in mons.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon', cluster_name):
            _, _, id_ = teuthology.split_role(role)
            remote.run(
                args=[
                    'sudo',
                    'mkdir', '-p',
                    '/var/lib/ceph/mon/{cluster}-{id}'.format(id=id_, cluster=cluster_name),
                ],
            )
            remote.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-mon',
                    '--cluster', cluster_name,
                    '--mkfs',
                    '-i', id_,
                    '--monmap', monmap_path,
                    '--osdmap', osdmap_path,
                    '--keyring', keyring_path,
                ],
            )
    run.wait(
        mons.run(
            args=['rm', '--', monmap_path, osdmap_path],
            wait=False,
        ),
    )

    try:
        yield
    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise
    finally:
        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()

        log.info('Checking cluster log for badness...')

        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurrence of the pattern specified in the Ceph log,
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = [
                'sudo',
                'egrep', pattern,
                '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
            ]
            for exclude in excludes:
                args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                run.Raw('|'), 'head', '-n', '1',
            ])
            r = mon0_remote.run(
                stdout=StringIO(),
                args=args,
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
                             config['log_whitelist']) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log_whitelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
                            )
                        break

        for remote, dirs in devs_to_clean.iteritems():
            for dir_ in dirs:
                log.info('Unmounting %s on %s' % (dir_, remote))
                try:
                    remote.run(
                        args=[
                            'sync',
                            run.Raw('&&'),
                            'sudo',
                            'umount',
                            '-f',
                            dir_,
                        ],
                    )
                except Exception as e:
                    remote.run(args=[
                        'sudo',
                        run.Raw('PATH=/usr/sbin:$PATH'),
                        'lsof',
                        run.Raw(';'),
                        'ps', 'auxf',
                    ])
                    raise e

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
            for remote, roles_for_host in osds.remotes.iteritems():
                remote.run(
                    args=['sudo', 'umount', '-f', '/mnt'],
                    check_status=False,
                )

        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            try:
                os.makedirs(path)
            except OSError as e:
                if e.errno == errno.EEXIST:
                    pass
                else:
                    raise
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    is_mon = teuthology.is_type('mon', cluster_name)
                    if is_mon(role):
                        _, _, id_ = teuthology.split_role(role)
                        mon_dir = '/var/lib/ceph/mon/' + \
                                  '{0}-{1}'.format(cluster_name, id_)
                        teuthology.pull_directory_tarball(
                            remote,
                            mon_dir,
                            path + '/' + role + '.tgz')

        log.info('Cleaning ceph cluster...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'rm', '-rf', '--',
                    conf_path,
                    keyring_path,
                    data_dir,
                    monmap_path,
                    osdmap_path,
                    run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
                ],
                wait=False,
            ),
        )
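
# first_in_ceph_log() above builds a shell pipeline roughly equivalent to
# (hypothetical whitelist shown):
#
#   sudo egrep '\[ERR\]|\[WRN\]|\[SEC\]' /var/log/ceph/ceph.log \
#       | egrep -v 'wrongly marked me down' | head -n 1
#
# An empty result means no match; anything else is the first offending line
# and becomes the failure_reason.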


def osd_scrub_pgs(ctx, config):
    """
    Scrub pgs when we exit.

    First make sure all pgs are active and clean.
    Next scrub all osds.
    Then periodically check until all pgs have scrub time stamps that
    indicate the last scrub completed.  Time out if no progress is made
    here after two minutes.
    """
    retries = 12
    delays = 10
    cluster_name = config['cluster']
    manager = ctx.managers[cluster_name]
    all_clean = False
    for _ in range(0, retries):
        stats = manager.get_pg_stats()
        bad = [stat['pgid'] for stat in stats if 'active+clean' not in stat['state']]
        if not bad:
            all_clean = True
            break
        log.info(
            "Waiting for all osds to be active and clean, waiting on %s" % bad)
        time.sleep(delays)
    if not all_clean:
        raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
    check_time_now = time.localtime()
    time.sleep(1)
    all_roles = teuthology.all_roles(ctx.cluster)
    for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name):
        log.info("Scrubbing {osd}".format(osd=role))
        _, _, id_ = teuthology.split_role(role)
        # allow this to fail; in certain cases the OSD might not be up
        # at this point.  we will catch all pgs below.
        try:
            manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
        except run.CommandFailedError:
            pass
    prev_good = 0
    gap_cnt = 0
    loop = True
    while loop:
        stats = manager.get_pg_stats()
        timez = [(stat['pgid'], stat['last_scrub_stamp']) for stat in stats]
        loop = False
        thiscnt = 0
        for (pgid, tmval) in timez:
            pgtm = time.strptime(tmval[0:tmval.find('.')], '%Y-%m-%d %H:%M:%S')
            if pgtm > check_time_now:
                thiscnt += 1
            else:
                log.info('pgid %s last_scrub_stamp %s %s <= %s', pgid, tmval, pgtm, check_time_now)
                loop = True
        if thiscnt > prev_good:
            prev_good = thiscnt
            gap_cnt = 0
        else:
            gap_cnt += 1
            if gap_cnt % 6 == 0:
                for (pgid, tmval) in timez:
                    # re-request scrub every so often in case the earlier
                    # request was missed.  do not do it every time because
                    # the scrub may be in progress or not reported yet and
                    # we will starve progress.
                    manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
            if gap_cnt > retries:
                raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
        if loop:
            log.info('Still waiting for all pgs to be scrubbed.')
            time.sleep(delays)
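
# last_scrub_stamp values look like '2017-01-31 14:20:03.123456'
# (a hypothetical example); the fractional seconds are sliced off before
# time.strptime() so each stamp compares cleanly against check_time_now.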


@contextlib.contextmanager
def run_daemon(ctx, config, type_):
    """
    Run daemons for a role type.  Handle the startup and termination of a daemon.
    On startup -- set coverages, cpu_profile, valgrind values for all remotes,
    and a max_mds value for one mds.
    On cleanup -- Stop all existing daemons of this type.

    :param ctx: Context
    :param config: Configuration
    :param type_: Role type
    """
    cluster_name = config['cluster']
    log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
    testdir = teuthology.get_testdir(ctx)
    daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name))

    # check whether any daemons of this type are configured
    if daemons is None:
        return
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    daemon_signal = 'kill'
    if config.get('coverage') or config.get('valgrind') is not None:
        daemon_signal = 'term'

    for remote, roles_for_host in daemons.remotes.iteritems():
        is_type_ = teuthology.is_type(type_, cluster_name)
        for role in roles_for_host:
            if not is_type_(role):
                continue
            _, _, id_ = teuthology.split_role(role)

            run_cmd = [
                'sudo',
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'daemon-helper',
                daemon_signal,
            ]
            run_cmd_tail = [
                'ceph-%s' % (type_),
                '-f',
                '--cluster', cluster_name,
                '-i', id_,
            ]

            if type_ in config.get('cpu_profile', []):
                profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role)
                run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])

            if config.get('valgrind') is not None:
                valgrind_args = None
                if type_ in config['valgrind']:
                    valgrind_args = config['valgrind'][type_]
                if role in config['valgrind']:
                    valgrind_args = config['valgrind'][role]
                run_cmd = teuthology.get_valgrind_args(testdir, role,
                                                       run_cmd,
                                                       valgrind_args)

            run_cmd.extend(run_cmd_tail)

            # always register mgr; don't necessarily start
            ctx.daemons.register_daemon(
                remote, type_, id_,
                cluster=cluster_name,
                args=run_cmd,
                logger=log.getChild(role),
                stdin=run.PIPE,
                wait=False,
            )
            if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
                role = cluster_name + '.' + type_
                ctx.daemons.get_daemon(type_, id_, cluster_name).restart()

    try:
        yield
    finally:
        teuthology.stop_daemons_of_type(ctx, type_, cluster_name)
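
# For reference, the assembled command for an osd typically looks like
# (hypothetical paths):
#
#   sudo adjust-ulimits ceph-coverage <coverage_dir> daemon-helper kill \
#       ceph-osd -f --cluster ceph -i 0
#
# with the CPUPROFILE env wrapper and valgrind arguments spliced in ahead
# of run_cmd_tail when those options are configured.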


def healthy(ctx, config):
    """
    Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK.

    :param ctx: Context
    :param config: Configuration
    """
    config = config if isinstance(config, dict) else dict()
    cluster_name = config.get('cluster', 'ceph')
    log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )
    teuthology.wait_until_healthy(
        ctx,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )

    if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
        # Some MDSs exist, wait for them to be healthy
        ceph_fs = Filesystem(ctx)  # TODO: make Filesystem cluster-aware
        ceph_fs.wait_for_daemons(timeout=300)


def wait_for_osds_up(ctx, config):
    """
    Wait for all osd's to come up.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Waiting until ceph osds are all up...')
    cluster_name = config.get('cluster', 'ceph')
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon0_remote,
    )


def wait_for_mon_quorum(ctx, config):
    """
    Check remote ceph status until all monitors are up.

    :param ctx: Context
    :param config: Configuration
    """
    if isinstance(config, dict):
        mons = config['daemons']
        cluster_name = config.get('cluster', 'ceph')
    else:
        assert isinstance(config, list)
        mons = config
        cluster_name = 'ceph'
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (remote,) = ctx.cluster.only(firstmon).remotes.keys()
    with contextutil.safe_while(sleep=10, tries=60,
                                action='wait for monitor quorum') as proceed:
        while proceed():
            r = remote.run(
                args=[
                    'sudo',
                    'ceph',
                    'quorum_status',
                ],
                stdout=StringIO(),
                logger=log.getChild('quorum_status'),
            )
            j = json.loads(r.stdout.getvalue())
            q = j.get('quorum_names', [])
            log.debug('Quorum: %s', q)
            if sorted(q) == sorted(mons):
                break
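
# A hedged usage sketch, assuming the usual task syntax:
#
#   tasks:
#   - ceph.wait_for_mon_quorum: [a, b, c]
#
# or, as a dictionary with an explicit cluster:
#
#   tasks:
#   - ceph.wait_for_mon_quorum:
#       daemons: [a, b, c]
#       cluster: backup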


def created_pool(ctx, config):
    """
    Add new pools to the dictionary of pools that the ceph-manager
    knows about.
    """
    for new_pool in config:
        if new_pool not in ctx.managers['ceph'].pools:
            ctx.managers['ceph'].pools[new_pool] = ctx.managers['ceph'].get_pool_property(
                new_pool, 'pg_num')
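
# A hedged usage sketch: a suite that creates pools outside ceph_manager
# can register them afterwards so pool bookkeeping stays accurate, e.g.
#
#   tasks:
#   - ceph.created_pool: [mypool]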


@contextlib.contextmanager
def restart(ctx, config):
    """
    restart ceph daemons

    For example::

      tasks:
      - ceph.restart: [all]

    For example::

      tasks:
      - ceph.restart: [osd.0, mon.1, mds.*]

    or::

      tasks:
      - ceph.restart:
          daemons: [osd.0, mon.1]
          wait-for-healthy: false
          wait-for-osds-up: true

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    clusters = set()
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).restart()
        clusters.add(cluster)

    manager = ctx.managers['ceph']
    for dmon in daemons:
        if '.' in dmon:
            dm_parts = dmon.split('.')
            if dm_parts[1].isdigit():
                if dm_parts[0] == 'osd':
                    manager.mark_down_osd(int(dm_parts[1]))

    if config.get('wait-for-healthy', True):
        for cluster in clusters:
            healthy(ctx=ctx, config=dict(cluster=cluster))
    if config.get('wait-for-osds-up', False):
        for cluster in clusters:
            wait_for_osds_up(ctx=ctx, config=dict(cluster=cluster))
    yield


@contextlib.contextmanager
def stop(ctx, config):
    """
    Stop ceph daemons

    For example::

      tasks:
      - ceph.stop: [mds.*]

      tasks:
      - ceph.stop: [osd.0, osd.2]

      tasks:
      - ceph.stop:
          daemons: [osd.0, osd.2]

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).stop()

    yield


@contextlib.contextmanager
def wait_for_failure(ctx, config):
    """
    Wait for a failure of a ceph daemon

    For example::

      tasks:
      - ceph.wait_for_failure: [mds.*]

      tasks:
      - ceph.wait_for_failure: [osd.0, osd.2]

      tasks:
      - ceph.wait_for_failure:
          daemons: [osd.0, osd.2]

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        try:
            ctx.daemons.get_daemon(type_, id_, cluster).wait()
        except Exception:
            log.info('Saw expected daemon failure.  Continuing.')
            ctx.daemons.get_daemon(type_, id_, cluster).stop()
        else:
            raise RuntimeError('daemon %s did not fail' % role)

    yield


def validate_config(ctx, config):
    """
    Perform some simple validation on task configuration.
    Raises exceptions.ConfigError if an error is found.
    """
    # check for osds from multiple clusters on the same host
    for remote, roles_for_host in ctx.cluster.remotes.items():
        last_cluster = None
        last_role = None
        for role in roles_for_host:
            role_cluster, role_type, _ = teuthology.split_role(role)
            if role_type != 'osd':
                continue
            if last_cluster and last_cluster != role_cluster:
                msg = "Host should not have osds (%s and %s) from multiple clusters" % (
                    last_role, role)
                raise exceptions.ConfigError(msg)
            last_cluster = role_cluster
            last_role = role
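
# For example, a hypothetical layout like
#
#   roles:
#   - [mon.a, osd.0, backup.osd.1]
#
# fails validation, because one host would carry osds from both the
# default cluster and the 'backup' cluster.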


@contextlib.contextmanager
def task(ctx, config):
    """
    Set up and tear down a Ceph cluster.

    For example::

        tasks:
        - ceph:

    You can also specify what branch to run::

        tasks:
        - ceph:
            branch: foo

    Or a sha1::

        tasks:
        - ceph:
            sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed

    Or a local source dir::

        tasks:
        - ceph:
            path: /home/sage/ceph

    To capture code coverage data, use::

        tasks:
        - ceph:
            coverage: true

    To use btrfs, ext4, or xfs on the target's scratch disks, use::

        tasks:
        - ceph:
            fs: xfs
            mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
            mount_options: [nobarrier, inode64]

    Note, this will cause the task to check the /scratch_devs file on each node
    for available devices.  If no such file is found, /dev/sdb will be used.

    To run some daemons under valgrind, include their names
    and the tool/args to use in a valgrind section::

        tasks:
        - ceph:
            valgrind:
              mds.1: --tool=memcheck
              osd.1: [--tool=memcheck, --leak-check=no]

    Those nodes which are using memcheck or valgrind will get
    checked for bad results.

    To adjust or modify config options, use::

        tasks:
        - ceph:
            conf:
              section:
                key: value

    For example::

        tasks:
        - ceph:
            conf:
              mds.0:
                some option: value
                other key: other value

    By default, the cluster log is checked for errors and warnings,
    and the run marked failed if any appear. You can ignore log
    entries by giving a list of egrep compatible regexes, i.e.:

        tasks:
        - ceph:
            log-whitelist: ['foo.*bar', 'bad message']

    To run multiple ceph clusters, use multiple ceph tasks, and roles
    with a cluster name prefix, e.g. cluster1.client.0. Roles with no
    cluster use the default cluster name, 'ceph'. OSDs from separate
    clusters must be on separate hosts. Clients and non-osd daemons
    from multiple clusters may be colocated. For each cluster, add an
    instance of the ceph task with the cluster name specified, e.g.::

        roles:
        - [mon.a, osd.0, osd.1]
        - [backup.mon.a, backup.osd.0, backup.osd.1]
        - [client.0, backup.client.0]
        tasks:
        - ceph:
            cluster: ceph
        - ceph:
            cluster: backup

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        "task ceph only supports a dictionary for configuration"

    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph', {}))

    first_ceph_cluster = False
    if not hasattr(ctx, 'daemons'):
        first_ceph_cluster = True
        ctx.daemons = DaemonGroup()

    testdir = teuthology.get_testdir(ctx)
    if config.get('coverage'):
        coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
        log.info('Creating coverage directory...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'install', '-d', '-m0755', '--',
                    coverage_dir,
                ],
                wait=False,
            )
        )

    if 'cluster' not in config:
        config['cluster'] = 'ceph'

    validate_config(ctx, config)

    subtasks = []
    if first_ceph_cluster:
        # these tasks handle general log setup and parsing on all hosts,
        # so they should only be run once
        subtasks = [
            lambda: ceph_log(ctx=ctx, config=None),
            lambda: valgrind_post(ctx=ctx, config=config),
        ]

    subtasks += [
        lambda: cluster(ctx=ctx, config=dict(
            conf=config.get('conf', {}),
            fs=config.get('fs', 'xfs'),
            mkfs_options=config.get('mkfs_options', None),
            mount_options=config.get('mount_options', None),
            block_journal=config.get('block_journal', None),
            tmpfs_journal=config.get('tmpfs_journal', None),
            skip_mgr_daemons=config.get('skip_mgr_daemons', False),
            log_whitelist=config.get('log-whitelist', []),
            cpu_profile=set(config.get('cpu_profile', []),),
            cluster=config['cluster'],
        )),
        lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
        lambda: run_daemon(ctx=ctx, config=config, type_='mgr'),
        lambda: crush_setup(ctx=ctx, config=config),
        lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
        lambda: cephfs_setup(ctx=ctx, config=config),
        lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
    ]

    with contextutil.nested(*subtasks):
        first_mon = teuthology.get_first_mon(ctx, config, config['cluster'])
        (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
        if not hasattr(ctx, 'managers'):
            ctx.managers = {}
        ctx.managers[config['cluster']] = CephManager(
            mon,
            ctx=ctx,
            logger=log.getChild('ceph_manager.' + config['cluster']),
            cluster=config['cluster'],
        )

        try:
            if config.get('wait-for-healthy', True):
                healthy(ctx=ctx, config=dict(cluster=config['cluster']))

            yield
        finally:
            if config.get('wait-for-scrub', True):
                osd_scrub_pgs(ctx, config)