ceph/qa/tasks/ceph.py

   1 """
   2 Ceph cluster task.
   3
   4 Handle the setup, starting, and clean-up of a Ceph cluster.
   5 """
   6 from cStringIO import StringIO
   7
   8 import argparse
   9 import contextlib
  10 import errno
  11 import logging
  12 import os
  13 import json
  14 import time
  15 import gevent
  16 import socket
  17
  18 from paramiko import SSHException
  19 from ceph_manager import CephManager, write_conf
  20 from tasks.cephfs.filesystem import Filesystem
  21 from teuthology import misc as teuthology
  22 from teuthology import contextutil
  23 from teuthology import exceptions
  24 from teuthology.orchestra import run
  25 import ceph_client as cclient
  26 from teuthology.orchestra.daemon import DaemonGroup
  27
  28 CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']
  29
  30 log = logging.getLogger(__name__)
  31
  32
  33 def generate_caps(type_):
  34     """
  35     Each call will return the next capability for each system type
  36     (essentially a subset of possible role values).  Valid types are osd,
  37     mds and client.
  38     """
  39     defaults = dict(
  40         osd=dict(
  41             mon='allow *',
  42             mgr='allow *',
  43             osd='allow *',
  44         ),
  45         mgr=dict(
  46             mon='allow *',
  47         ),
  48         mds=dict(
  49             mon='allow *',
  50             mgr='allow *',
  51             osd='allow *',
  52             mds='allow',
  53         ),
  54         client=dict(
  55             mon='allow rw',
  56             mgr='allow r',
  57             osd='allow rwx',
  58             mds='allow',
  59         ),
  60     )
  61     for subsystem, capability in defaults[type_].items():
  62         yield '--cap'
  63         yield subsystem
  64         yield capability
  65
  66
  67 @contextlib.contextmanager
  68 def ceph_log(ctx, config):
  69     """
  70     Create /var/log/ceph log directory that is open to everyone.
  71     Add valgrind and profiling-logger directories.
  72
  73     :param ctx: Context
  74     :param config: Configuration
  75     """
  76     log.info('Making ceph log dir writeable by non-root...')
  77     run.wait(
  78         ctx.cluster.run(
  79             args=[
  80                 'sudo',
  81                 'chmod',
  82                 '777',
  83                 '/var/log/ceph',
  84             ],
  85             wait=False,
  86         )
  87     )
  88     log.info('Disabling ceph logrotate...')
  89     run.wait(
  90         ctx.cluster.run(
  91             args=[
  92                 'sudo',
  93                 'rm', '-f', '--',
  94                 '/etc/logrotate.d/ceph',
  95             ],
  96             wait=False,
  97         )
  98     )
  99     log.info('Creating extra log directories...')
 100     run.wait(
 101         ctx.cluster.run(
 102             args=[
 103                 'sudo',
 104                 'install', '-d', '-m0777', '--',
 105                 '/var/log/ceph/valgrind',
 106                 '/var/log/ceph/profiling-logger',
 107             ],
 108             wait=False,
 109         )
 110     )
 111
 112     class Rotater(object):
 113         stop_event = gevent.event.Event()
 114
 115         def invoke_logrotate(self):
 116             # 1) install ceph-test.conf in /etc/logrotate.d
 117             # 2) continuously loop over logrotate invocation with ceph-test.conf
 118             while not self.stop_event.is_set():
 119                 self.stop_event.wait(timeout=30)
 120                 try:
 121                     run.wait(
 122                         ctx.cluster.run(
 123                             args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'
 124                                   ],
 125                             wait=False,
 126                         )
 127                     )
 128                 except exceptions.ConnectionLostError as e:
 129                     # Some tests may power off nodes during test, in which
 130                     # case we will see connection errors that we should ignore.
 131                     log.debug("Missed logrotate, node '{0}' is offline".format(
 132                         e.node))
 133                 except EOFError as e:
 134                     # Paramiko sometimes raises this when it fails to
 135                     # connect to a node during open_session.  As with
 136                     # ConnectionLostError, we ignore this because nodes
 137                     # are allowed to get power cycled during tests.
 138                     log.debug("Missed logrotate, EOFError")
 139                 except SSHException as e:
 140                     log.debug("Missed logrotate, SSHException")
 141                 except socket.error as e:
 142                     if e.errno == errno.EHOSTUNREACH:
 143                         log.debug("Missed logrotate, host unreachable")
 144                     else:
 145                         raise
 146
 147         def begin(self):
 148             self.thread = gevent.spawn(self.invoke_logrotate)
 149
 150         def end(self):
 151             self.stop_event.set()
 152             self.thread.get()
 153
 154     def write_rotate_conf(ctx, daemons):
 155         testdir = teuthology.get_testdir(ctx)
 156         rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf')
 157         with file(rotate_conf_path, 'rb') as f:
 158             conf = ""
 159             for daemon, size in daemons.iteritems():
 160                 log.info('writing logrotate stanza for {daemon}'.format(daemon=daemon))
 161                 conf += f.read().format(daemon_type=daemon, max_size=size)
 162                 f.seek(0, 0)
 163
 164             for remote in ctx.cluster.remotes.iterkeys():
 165                 teuthology.write_file(remote=remote,
 166                                       path='{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
 167                                       data=StringIO(conf)
 168                                       )
 169                 remote.run(
 170                     args=[
 171                         'sudo',
 172                         'mv',
 173                         '{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
 174                         '/etc/logrotate.d/ceph-test.conf',
 175                         run.Raw('&&'),
 176                         'sudo',
 177                         'chmod',
 178                         '0644',
 179                         '/etc/logrotate.d/ceph-test.conf',
 180                         run.Raw('&&'),
 181                         'sudo',
 182                         'chown',
 183                         'root.root',
 184                         '/etc/logrotate.d/ceph-test.conf'
 185                     ]
 186                 )
 187                 remote.chcon('/etc/logrotate.d/ceph-test.conf',
 188                              'system_u:object_r:etc_t:s0')
 189
 190     if ctx.config.get('log-rotate'):
 191         daemons = ctx.config.get('log-rotate')
 192         log.info('Setting up log rotation with ' + str(daemons))
 193         write_rotate_conf(ctx, daemons)
 194         logrotater = Rotater()
 195         logrotater.begin()
 196     try:
 197         yield
 198
 199     finally:
 200         if ctx.config.get('log-rotate'):
 201             log.info('Shutting down logrotate')
 202             logrotater.end()
 203             ctx.cluster.run(
 204                 args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'
 205                       ]
 206             )
 207         if ctx.archive is not None and \
 208                 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
 209             # and logs
 210             log.info('Compressing logs...')
 211             run.wait(
 212                 ctx.cluster.run(
 213                     args=[
 214                         'sudo',
 215                         'find',
 216                         '/var/log/ceph',
 217                         '-name',
 218                         '*.log',
 219                         '-print0',
 220                         run.Raw('|'),
 221                         'sudo',
 222                         'xargs',
 223                         '-0',
 224                         '--no-run-if-empty',
 225                         '--',
 226                         'gzip',
 227                         '--',
 228                     ],
 229                     wait=False,
 230                 ),
 231             )
 232
 233             log.info('Archiving logs...')
 234             path = os.path.join(ctx.archive, 'remote')
 235             os.makedirs(path)
 236             for remote in ctx.cluster.remotes.iterkeys():
 237                 sub = os.path.join(path, remote.shortname)
 238                 os.makedirs(sub)
 239                 teuthology.pull_directory(remote, '/var/log/ceph',
 240                                           os.path.join(sub, 'log'))
 241
 242
 243 def assign_devs(roles, devs):
 244     """
 245     Create a dictionary of devs indexed by roles
 246
 247     :param roles: List of roles
 248     :param devs: Corresponding list of devices.
 249     :returns: Dictionary of devs indexed by roles.
 250     """
 251     return dict(zip(roles, devs))
 252
 253
 254 @contextlib.contextmanager
 255 def valgrind_post(ctx, config):
 256     """
 257     After the tests run, look throught all the valgrind logs.  Exceptions are raised
 258     if textual errors occured in the logs, or if valgrind exceptions were detected in
 259     the logs.
 260
 261     :param ctx: Context
 262     :param config: Configuration
 263     """
 264     try:
 265         yield
 266     finally:
 267         lookup_procs = list()
 268         log.info('Checking for errors in any valgrind logs...')
 269         for remote in ctx.cluster.remotes.iterkeys():
 270             # look at valgrind logs for each node
 271             proc = remote.run(
 272                 args=[
 273                     'sudo',
 274                     'zgrep',
 275                     '<kind>',
 276                     run.Raw('/var/log/ceph/valgrind/*'),
 277                     '/dev/null',  # include a second file so that we always get a filename prefix on the output
 278                     run.Raw('|'),
 279                     'sort',
 280                     run.Raw('|'),
 281                     'uniq',
 282                 ],
 283                 wait=False,
 284                 check_status=False,
 285                 stdout=StringIO(),
 286             )
 287             lookup_procs.append((proc, remote))
 288
 289         valgrind_exception = None
 290         for (proc, remote) in lookup_procs:
 291             proc.wait()
 292             out = proc.stdout.getvalue()
 293             for line in out.split('\n'):
 294                 if line == '':
 295                     continue
 296                 try:
 297                     (file, kind) = line.split(':')
 298                 except Exception:
 299                     log.error('failed to split line %s', line)
 300                     raise
 301                 log.debug('file %s kind %s', file, kind)
 302                 if (file.find('mds') >= 0) and kind.find('Lost') > 0:
 303                     continue
 304                 log.error('saw valgrind issue %s in %s', kind, file)
 305                 valgrind_exception = Exception('saw valgrind issues')
 306
 307         if config.get('expect_valgrind_errors'):
 308             if not valgrind_exception:
 309                 raise Exception('expected valgrind issues and found none')
 310         else:
 311             if valgrind_exception:
 312                 raise valgrind_exception
 313
 314
 315 @contextlib.contextmanager
 316 def crush_setup(ctx, config):
 317     cluster_name = config['cluster']
 318     first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
 319     (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
 320
 321     profile = config.get('crush_tunables', 'default')
 322     log.info('Setting crush tunables to %s', profile)
 323     mon_remote.run(
 324         args=['sudo', 'ceph', '--cluster', cluster_name,
 325               'osd', 'crush', 'tunables', profile])
 326     yield
 327
 328
 329 @contextlib.contextmanager
 330 def create_rbd_pool(ctx, config):
 331     cluster_name = config['cluster']
 332     first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
 333     (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
 334     log.info('Waiting for OSDs to come up')
 335     teuthology.wait_until_osds_up(
 336         ctx,
 337         cluster=ctx.cluster,
 338         remote=mon_remote,
 339         ceph_cluster=cluster_name,
 340     )
 341     log.info('Creating RBD pool')
 342     mon_remote.run(
 343         args=['sudo', 'ceph', '--cluster', cluster_name,
 344               'osd', 'pool', 'create', 'rbd', '8'])
 345     yield
 346
 347 @contextlib.contextmanager
 348 def cephfs_setup(ctx, config):
 349     cluster_name = config['cluster']
 350     testdir = teuthology.get_testdir(ctx)
 351     coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
 352
 353     first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
 354     (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
 355     mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
 356     # If there are any MDSs, then create a filesystem for them to use
 357     # Do this last because requires mon cluster to be up and running
 358     if mdss.remotes:
 359         log.info('Setting up CephFS filesystem...')
 360
 361         fs = Filesystem(ctx, create='cephfs')
 362
 363         is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
 364         all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
 365         num_active = len([r for r in all_roles if is_active_mds(r)])
 366
 367         fs.set_max_mds(num_active)
 368         fs.set_allow_dirfrags(True)
 369
 370     yield
 371
 372
 373 @contextlib.contextmanager
 374 def cluster(ctx, config):
 375     """
 376     Handle the creation and removal of a ceph cluster.
 377
 378     On startup:
 379         Create directories needed for the cluster.
 380         Create remote journals for all osds.
 381         Create and set keyring.
 382         Copy the monmap to tht test systems.
 383         Setup mon nodes.
 384         Setup mds nodes.
 385         Mkfs osd nodes.
 386         Add keyring information to monmaps
 387         Mkfs mon nodes.
 388
 389     On exit:
 390         If errors occured, extract a failure message and store in ctx.summary.
 391         Unmount all test files and temporary journaling files.
 392         Save the monitor information and archive all ceph logs.
 393         Cleanup the keyring setup, and remove all monitor map and data files left over.
 394
 395     :param ctx: Context
 396     :param config: Configuration
 397     """
 398     if ctx.config.get('use_existing_cluster', False) is True:
 399         log.info("'use_existing_cluster' is true; skipping cluster creation")
 400         yield
 401
 402     testdir = teuthology.get_testdir(ctx)
 403     cluster_name = config['cluster']
 404     data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
 405     log.info('Creating ceph cluster %s...', cluster_name)
 406     run.wait(
 407         ctx.cluster.run(
 408             args=[
 409                 'install', '-d', '-m0755', '--',
 410                 data_dir,
 411             ],
 412             wait=False,
 413         )
 414     )
 415
 416     run.wait(
 417         ctx.cluster.run(
 418             args=[
 419                 'sudo',
 420                 'install', '-d', '-m0777', '--', '/var/run/ceph',
 421             ],
 422             wait=False,
 423         )
 424     )
 425
 426     devs_to_clean = {}
 427     remote_to_roles_to_devs = {}
 428     remote_to_roles_to_journals = {}
 429     osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
 430     for remote, roles_for_host in osds.remotes.iteritems():
 431         devs = teuthology.get_scratch_devices(remote)
 432         roles_to_devs = {}
 433         roles_to_journals = {}
 434         if config.get('fs'):
 435             log.info('fs option selected, checking for scratch devs')
 436             log.info('found devs: %s' % (str(devs),))
 437             devs_id_map = teuthology.get_wwn_id_map(remote, devs)
 438             iddevs = devs_id_map.values()
 439             roles_to_devs = assign_devs(
 440                 teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
 441             )
 442             if len(roles_to_devs) < len(iddevs):
 443                 iddevs = iddevs[len(roles_to_devs):]
 444             devs_to_clean[remote] = []
 445
 446         if config.get('block_journal'):
 447             log.info('block journal enabled')
 448             roles_to_journals = assign_devs(
 449                 teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
 450             )
 451             log.info('journal map: %s', roles_to_journals)
 452
 453         if config.get('tmpfs_journal'):
 454             log.info('tmpfs journal enabled')
 455             roles_to_journals = {}
 456             remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
 457             for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
 458                 tmpfs = '/mnt/' + role
 459                 roles_to_journals[role] = tmpfs
 460                 remote.run(args=['truncate', '-s', '1500M', tmpfs])
 461             log.info('journal map: %s', roles_to_journals)
 462
 463         log.info('dev map: %s' % (str(roles_to_devs),))
 464         remote_to_roles_to_devs[remote] = roles_to_devs
 465         remote_to_roles_to_journals[remote] = roles_to_journals
 466
 467     log.info('Generating config...')
 468     remotes_and_roles = ctx.cluster.remotes.items()
 469     roles = [role_list for (remote, role_list) in remotes_and_roles]
 470     ips = [host for (host, port) in
 471            (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
 472     conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips, cluster=cluster_name)
 473     for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
 474         for role, journal in roles_to_journals.iteritems():
 475             name = teuthology.ceph_role(role)
 476             if name not in conf:
 477                 conf[name] = {}
 478             conf[name]['osd journal'] = journal
 479     for section, keys in config['conf'].iteritems():
 480         for key, value in keys.iteritems():
 481             log.info("[%s] %s = %s" % (section, key, value))
 482             if section not in conf:
 483                 conf[section] = {}
 484             conf[section][key] = value
 485
 486     if config.get('tmpfs_journal'):
 487         conf['journal dio'] = False
 488
 489     if not hasattr(ctx, 'ceph'):
 490         ctx.ceph = {}
 491     ctx.ceph[cluster_name] = argparse.Namespace()
 492     ctx.ceph[cluster_name].conf = conf
 493
 494     default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
 495     keyring_path = config.get('keyring_path', default_keyring)
 496
 497     coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
 498
 499     firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
 500
 501     log.info('Setting up %s...' % firstmon)
 502     ctx.cluster.only(firstmon).run(
 503         args=[
 504             'sudo',
 505             'adjust-ulimits',
 506             'ceph-coverage',
 507             coverage_dir,
 508             'ceph-authtool',
 509             '--create-keyring',
 510             keyring_path,
 511         ],
 512     )
 513     ctx.cluster.only(firstmon).run(
 514         args=[
 515             'sudo',
 516             'adjust-ulimits',
 517             'ceph-coverage',
 518             coverage_dir,
 519             'ceph-authtool',
 520             '--gen-key',
 521             '--name=mon.',
 522             keyring_path,
 523         ],
 524     )
 525     ctx.cluster.only(firstmon).run(
 526         args=[
 527             'sudo',
 528             'chmod',
 529             '0644',
 530             keyring_path,
 531         ],
 532     )
 533     (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
 534     monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
 535                                                    cluster=cluster_name)
 536     fsid = teuthology.create_simple_monmap(
 537         ctx,
 538         remote=mon0_remote,
 539         conf=conf,
 540         path=monmap_path,
 541     )
 542     if not 'global' in conf:
 543         conf['global'] = {}
 544     conf['global']['fsid'] = fsid
 545
 546     default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
 547     conf_path = config.get('conf_path', default_conf_path)
 548     log.info('Writing %s for FSID %s...' % (conf_path, fsid))
 549     write_conf(ctx, conf_path, cluster_name)
 550
 551     log.info('Creating admin key on %s...' % firstmon)
 552     ctx.cluster.only(firstmon).run(
 553         args=[
 554             'sudo',
 555             'adjust-ulimits',
 556             'ceph-coverage',
 557             coverage_dir,
 558             'ceph-authtool',
 559             '--gen-key',
 560             '--name=client.admin',
 561             '--set-uid=0',
 562             '--cap', 'mon', 'allow *',
 563             '--cap', 'osd', 'allow *',
 564             '--cap', 'mds', 'allow *',
 565             '--cap', 'mgr', 'allow *',
 566             keyring_path,
 567         ],
 568     )
 569
 570     log.info('Copying monmap to all nodes...')
 571     keyring = teuthology.get_file(
 572         remote=mon0_remote,
 573         path=keyring_path,
 574     )
 575     monmap = teuthology.get_file(
 576         remote=mon0_remote,
 577         path=monmap_path,
 578     )
 579
 580     for rem in ctx.cluster.remotes.iterkeys():
 581         # copy mon key and initial monmap
 582         log.info('Sending monmap to node {remote}'.format(remote=rem))
 583         teuthology.sudo_write_file(
 584             remote=rem,
 585             path=keyring_path,
 586             data=keyring,
 587             perms='0644'
 588         )
 589         teuthology.write_file(
 590             remote=rem,
 591             path=monmap_path,
 592             data=monmap,
 593         )
 594
 595     log.info('Setting up mon nodes...')
 596     mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))
 597
 598     if not config.get('skip_mgr_daemons', False):
 599         log.info('Setting up mgr nodes...')
 600         mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name))
 601         for remote, roles_for_host in mgrs.remotes.iteritems():
 602             for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr',
 603                                                          cluster_name):
 604                 _, _, id_ = teuthology.split_role(role)
 605                 mgr_dir = '/var/lib/ceph/mgr/{cluster}-{id}'.format(
 606                     cluster=cluster_name,
 607                     id=id_,
 608                 )
 609                 remote.run(
 610                     args=[
 611                         'sudo',
 612                         'mkdir',
 613                         '-p',
 614                         mgr_dir,
 615                         run.Raw('&&'),
 616                         'sudo',
 617                         'adjust-ulimits',
 618                         'ceph-coverage',
 619                         coverage_dir,
 620                         'ceph-authtool',
 621                         '--create-keyring',
 622                         '--gen-key',
 623                         '--name=mgr.{id}'.format(id=id_),
 624                         mgr_dir + '/keyring',
 625                     ],
 626                 )
 627
 628     log.info('Setting up mds nodes...')
 629     mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
 630     for remote, roles_for_host in mdss.remotes.iteritems():
 631         for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
 632                                                      cluster_name):
 633             _, _, id_ = teuthology.split_role(role)
 634             mds_dir = '/var/lib/ceph/mds/{cluster}-{id}'.format(
 635                 cluster=cluster_name,
 636                 id=id_,
 637             )
 638             remote.run(
 639                 args=[
 640                     'sudo',
 641                     'mkdir',
 642                     '-p',
 643                     mds_dir,
 644                     run.Raw('&&'),
 645                     'sudo',
 646                     'adjust-ulimits',
 647                     'ceph-coverage',
 648                     coverage_dir,
 649                     'ceph-authtool',
 650                     '--create-keyring',
 651                     '--gen-key',
 652                     '--name=mds.{id}'.format(id=id_),
 653                     mds_dir + '/keyring',
 654                 ],
 655             )
 656
 657     cclient.create_keyring(ctx, cluster_name)
 658     log.info('Running mkfs on osd nodes...')
 659
 660     if not hasattr(ctx, 'disk_config'):
 661         ctx.disk_config = argparse.Namespace()
 662     if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
 663         ctx.disk_config.remote_to_roles_to_dev = {}
 664     if not hasattr(ctx.disk_config, 'remote_to_roles_to_journals'):
 665         ctx.disk_config.remote_to_roles_to_journals = {}
 666     if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
 667         ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
 668     if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
 669         ctx.disk_config.remote_to_roles_to_dev_fstype = {}
 670
 671     teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)
 672     teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals, remote_to_roles_to_journals)
 673
 674     log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
 675     for remote, roles_for_host in osds.remotes.iteritems():
 676         roles_to_devs = remote_to_roles_to_devs[remote]
 677         roles_to_journals = remote_to_roles_to_journals[remote]
 678
 679         for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
 680             _, _, id_ = teuthology.split_role(role)
 681             mnt_point = '/var/lib/ceph/osd/{cluster}-{id}'.format(cluster=cluster_name, id=id_)
 682             remote.run(
 683                 args=[
 684                     'sudo',
 685                     'mkdir',
 686                     '-p',
 687                     mnt_point,
 688                 ])
 689             log.info(str(roles_to_journals))
 690             log.info(role)
 691             if roles_to_devs.get(role):
 692                 dev = roles_to_devs[role]
 693                 fs = config.get('fs')
 694                 package = None
 695                 mkfs_options = config.get('mkfs_options')
 696                 mount_options = config.get('mount_options')
 697                 if fs == 'btrfs':
 698                     # package = 'btrfs-tools'
 699                     if mount_options is None:
 700                         mount_options = ['noatime', 'user_subvol_rm_allowed']
 701                     if mkfs_options is None:
 702                         mkfs_options = ['-m', 'single',
 703                                         '-l', '32768',
 704                                         '-n', '32768']
 705                 if fs == 'xfs':
 706                     # package = 'xfsprogs'
 707                     if mount_options is None:
 708                         mount_options = ['noatime']
 709                     if mkfs_options is None:
 710                         mkfs_options = ['-f', '-i', 'size=2048']
 711                 if fs == 'ext4' or fs == 'ext3':
 712                     if mount_options is None:
 713                         mount_options = ['noatime', 'user_xattr']
 714
 715                 if mount_options is None:
 716                     mount_options = []
 717                 if mkfs_options is None:
 718                     mkfs_options = []
 719                 mkfs = ['mkfs.%s' % fs] + mkfs_options
 720                 log.info('%s on %s on %s' % (mkfs, dev, remote))
 721                 if package is not None:
 722                     remote.run(
 723                         args=[
 724                             'sudo',
 725                             'apt-get', 'install', '-y', package
 726                         ],
 727                         stdout=StringIO(),
 728                     )
 729
 730                 try:
 731                     remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
 732                 except run.CommandFailedError:
 733                     # Newer btfs-tools doesn't prompt for overwrite, use -f
 734                     if '-f' not in mount_options:
 735                         mkfs_options.append('-f')
 736                         mkfs = ['mkfs.%s' % fs] + mkfs_options
 737                         log.info('%s on %s on %s' % (mkfs, dev, remote))
 738                     remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
 739
 740                 log.info('mount %s on %s -o %s' % (dev, remote,
 741                                                    ','.join(mount_options)))
 742                 remote.run(
 743                     args=[
 744                         'sudo',
 745                         'mount',
 746                         '-t', fs,
 747                         '-o', ','.join(mount_options),
 748                         dev,
 749                         mnt_point,
 750                     ]
 751                 )
 752                 remote.run(
 753                     args=[
 754                         'sudo', '/sbin/restorecon', mnt_point,
 755                     ],
 756                     check_status=False,
 757                 )
 758                 if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
 759                     ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
 760                 ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
 761                 if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
 762                     ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
 763                 ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
 764                 devs_to_clean[remote].append(mnt_point)
 765
 766         for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
 767             _, _, id_ = teuthology.split_role(role)
 768             remote.run(
 769                 args=[
 770                     'sudo',
 771                     'MALLOC_CHECK_=3',
 772                     'adjust-ulimits',
 773                     'ceph-coverage',
 774                     coverage_dir,
 775                     'ceph-osd',
 776                     '--cluster',
 777                     cluster_name,
 778                     '--mkfs',
 779                     '--mkkey',
 780                     '-i', id_,
 781                     '--monmap', monmap_path,
 782                 ],
 783             )
 784
 785     log.info('Reading keys from all nodes...')
 786     keys_fp = StringIO()
 787     keys = []
 788     for remote, roles_for_host in ctx.cluster.remotes.iteritems():
 789         for type_ in ['mgr',  'mds', 'osd']:
 790             if type_ == 'mgr' and config.get('skip_mgr_daemons', False):
 791                 continue
 792             for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
 793                 _, _, id_ = teuthology.split_role(role)
 794                 data = teuthology.get_file(
 795                     remote=remote,
 796                     path='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
 797                         type=type_,
 798                         id=id_,
 799                         cluster=cluster_name,
 800                     ),
 801                     sudo=True,
 802                 )
 803                 keys.append((type_, id_, data))
 804                 keys_fp.write(data)
 805     for remote, roles_for_host in ctx.cluster.remotes.iteritems():
 806         for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', cluster_name):
 807             _, _, id_ = teuthology.split_role(role)
 808             data = teuthology.get_file(
 809                 remote=remote,
 810                 path='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_, cluster=cluster_name)
 811             )
 812             keys.append(('client', id_, data))
 813             keys_fp.write(data)
 814
 815     log.info('Adding keys to all mons...')
 816     writes = mons.run(
 817         args=[
 818             'sudo', 'tee', '-a',
 819             keyring_path,
 820         ],
 821         stdin=run.PIPE,
 822         wait=False,
 823         stdout=StringIO(),
 824     )
 825     keys_fp.seek(0)
 826     teuthology.feed_many_stdins_and_close(keys_fp, writes)
 827     run.wait(writes)
 828     for type_, id_, data in keys:
 829         run.wait(
 830             mons.run(
 831                 args=[
 832                          'sudo',
 833                          'adjust-ulimits',
 834                          'ceph-coverage',
 835                          coverage_dir,
 836                          'ceph-authtool',
 837                          keyring_path,
 838                          '--name={type}.{id}'.format(
 839                              type=type_,
 840                              id=id_,
 841                          ),
 842                      ] + list(generate_caps(type_)),
 843                 wait=False,
 844             ),
 845         )
 846
 847     log.info('Running mkfs on mon nodes...')
 848     for remote, roles_for_host in mons.remotes.iteritems():
 849         for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon', cluster_name):
 850             _, _, id_ = teuthology.split_role(role)
 851             remote.run(
 852                 args=[
 853                     'sudo',
 854                     'mkdir',
 855                     '-p',
 856                     '/var/lib/ceph/mon/{cluster}-{id}'.format(id=id_, cluster=cluster_name),
 857                 ],
 858             )
 859             remote.run(
 860                 args=[
 861                     'sudo',
 862                     'adjust-ulimits',
 863                     'ceph-coverage',
 864                     coverage_dir,
 865                     'ceph-mon',
 866                     '--cluster', cluster_name,
 867                     '--mkfs',
 868                     '-i', id_,
 869                     '--monmap', monmap_path,
 870                     '--keyring', keyring_path,
 871                 ],
 872             )
 873
 874     run.wait(
 875         mons.run(
 876             args=[
 877                 'rm',
 878                 '--',
 879                 monmap_path,
 880             ],
 881             wait=False,
 882         ),
 883     )
 884
 885     try:
 886         yield
 887     except Exception:
 888         # we need to know this below
 889         ctx.summary['success'] = False
 890         raise
 891     finally:
 892         (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
 893
 894         log.info('Checking cluster log for badness...')
 895
 896         def first_in_ceph_log(pattern, excludes):
 897             """
 898             Find the first occurence of the pattern specified in the Ceph log,
 899             Returns None if none found.
 900
 901             :param pattern: Pattern scanned for.
 902             :param excludes: Patterns to ignore.
 903             :return: First line of text (or None if not found)
 904             """
 905             args = [
 906                 'sudo',
 907                 'egrep', pattern,
 908                 '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
 909             ]
 910             for exclude in excludes:
 911                 args.extend([run.Raw('|'), 'egrep', '-v', exclude])
 912             args.extend([
 913                 run.Raw('|'), 'head', '-n', '1',
 914             ])
 915             r = mon0_remote.run(
 916                 stdout=StringIO(),
 917                 args=args,
 918             )
 919             stdout = r.stdout.getvalue()
 920             if stdout != '':
 921                 return stdout
 922             return None
 923
 924         if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
 925                              config['log_whitelist']) is not None:
 926             log.warning('Found errors (ERR|WRN|SEC) in cluster log')
 927             ctx.summary['success'] = False
 928             # use the most severe problem as the failure reason
 929             if 'failure_reason' not in ctx.summary:
 930                 for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
 931                     match = first_in_ceph_log(pattern, config['log_whitelist'])
 932                     if match is not None:
 933                         ctx.summary['failure_reason'] = \
 934                             '"{match}" in cluster log'.format(
 935                                 match=match.rstrip('\n'),
 936                             )
 937                         break
 938
 939         for remote, dirs in devs_to_clean.iteritems():
 940             for dir_ in dirs:
 941                 log.info('Unmounting %s on %s' % (dir_, remote))
 942                 try:
 943                     remote.run(
 944                         args=[
 945                             'sync',
 946                             run.Raw('&&'),
 947                             'sudo',
 948                             'umount',
 949                             '-f',
 950                             dir_
 951                         ]
 952                     )
 953                 except Exception as e:
 954                     remote.run(args=[
 955                         'sudo',
 956                         run.Raw('PATH=/usr/sbin:$PATH'),
 957                         'lsof',
 958                         run.Raw(';'),
 959                         'ps', 'auxf',
 960                     ])
 961                     raise e
 962
 963         if config.get('tmpfs_journal'):
 964             log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
 965             for remote, roles_for_host in osds.remotes.iteritems():
 966                 remote.run(
 967                     args=['sudo', 'umount', '-f', '/mnt'],
 968                     check_status=False,
 969                 )
 970
 971         if ctx.archive is not None and \
 972                 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
 973
 974             # archive mon data, too
 975             log.info('Archiving mon data...')
 976             path = os.path.join(ctx.archive, 'data')
 977             try:
 978                 os.makedirs(path)
 979             except OSError as e:
 980                 if e.errno == errno.EEXIST:
 981                     pass
 982                 else:
 983                     raise
 984             for remote, roles in mons.remotes.iteritems():
 985                 for role in roles:
 986                     is_mon = teuthology.is_type('mon', cluster_name)
 987                     if is_mon(role):
 988                         _, _, id_ = teuthology.split_role(role)
 989                         mon_dir = '/var/lib/ceph/mon/' + \
 990                                   '{0}-{1}'.format(cluster_name, id_)
 991                         teuthology.pull_directory_tarball(
 992                             remote,
 993                             mon_dir,
 994                             path + '/' + role + '.tgz')
 995
 996         log.info('Cleaning ceph cluster...')
 997         run.wait(
 998             ctx.cluster.run(
 999                 args=[
1000                     'sudo',
1001                     'rm',
1002                     '-rf',
1003                     '--',
1004                     conf_path,
1005                     keyring_path,
1006                     data_dir,
1007                     monmap_path,
1008                     run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
1009                 ],
1010                 wait=False,
1011             ),
1012         )
1013
1014
1015 def osd_scrub_pgs(ctx, config):
1016     """
1017     Scrub pgs when we exit.
1018
1019     First make sure all pgs are active and clean.
1020     Next scrub all osds.
1021     Then periodically check until all pgs have scrub time stamps that
1022     indicate the last scrub completed.  Time out if no progess is made
1023     here after two minutes.
1024     """
1025     retries = 20
1026     delays = 10
1027     cluster_name = config['cluster']
1028     manager = ctx.managers[cluster_name]
1029     all_clean = False
1030     for _ in range(0, retries):
1031         stats = manager.get_pg_stats()
1032         bad = [stat['pgid'] for stat in stats if 'active+clean' not in stat['state']]
1033         if not bad:
1034             all_clean = True
1035             break
1036         log.info(
1037             "Waiting for all PGs to be active and clean, waiting on %s" % bad)
1038         time.sleep(delays)
1039     if not all_clean:
1040         raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
1041     check_time_now = time.localtime()
1042     time.sleep(1)
1043     all_roles = teuthology.all_roles(ctx.cluster)
1044     for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name):
1045         log.info("Scrubbing {osd}".format(osd=role))
1046         _, _, id_ = teuthology.split_role(role)
1047         # allow this to fail; in certain cases the OSD might not be up
1048         # at this point.  we will catch all pgs below.
1049         try:
1050             manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
1051         except run.CommandFailedError:
1052             pass
1053     prev_good = 0
1054     gap_cnt = 0
1055     loop = True
1056     while loop:
1057         stats = manager.get_pg_stats()
1058         timez = [(stat['pgid'],stat['last_scrub_stamp']) for stat in stats]
1059         loop = False
1060         thiscnt = 0
1061         for (pgid, tmval) in timez:
1062             pgtm = time.strptime(tmval[0:tmval.find('.')], '%Y-%m-%d %H:%M:%S')
1063             if pgtm > check_time_now:
1064                 thiscnt += 1
1065             else:
1066                 log.info('pgid %s last_scrub_stamp %s %s <= %s', pgid, tmval, pgtm, check_time_now)
1067                 loop = True
1068         if thiscnt > prev_good:
1069             prev_good = thiscnt
1070             gap_cnt = 0
1071         else:
1072             gap_cnt += 1
1073             if gap_cnt % 6 == 0:
1074                 for (pgid, tmval) in timez:
1075                     # re-request scrub every so often in case the earlier
1076                     # request was missed.  do not do it everytime because
1077                     # the scrub may be in progress or not reported yet and
1078                     # we will starve progress.
1079                     manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
1080             if gap_cnt > retries:
1081                 raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
1082         if loop:
1083             log.info('Still waiting for all pgs to be scrubbed.')
1084             time.sleep(delays)
1085
1086
1087 @contextlib.contextmanager
1088 def run_daemon(ctx, config, type_):
1089     """
1090     Run daemons for a role type.  Handle the startup and termination of a a daemon.
1091     On startup -- set coverages, cpu_profile, valgrind values for all remotes,
1092     and a max_mds value for one mds.
1093     On cleanup -- Stop all existing daemons of this type.
1094
1095     :param ctx: Context
1096     :param config: Configuration
1097     :paran type_: Role type
1098     """
1099     cluster_name = config['cluster']
1100     log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
1101     testdir = teuthology.get_testdir(ctx)
1102     daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name))
1103
1104     # check whether any daemons if this type are configured
1105     if daemons is None:
1106         return
1107     coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
1108
1109     daemon_signal = 'kill'
1110     if config.get('coverage') or config.get('valgrind') is not None:
1111         daemon_signal = 'term'
1112
1113     for remote, roles_for_host in daemons.remotes.iteritems():
1114         is_type_ = teuthology.is_type(type_, cluster_name)
1115         for role in roles_for_host:
1116             if not is_type_(role):
1117                 continue
1118             _, _, id_ = teuthology.split_role(role)
1119
1120             if type_ == 'osd':
1121                 datadir='/var/lib/ceph/osd/{cluster}-{id}'.format(
1122                     cluster=cluster_name, id=id_)
1123                 osd_uuid = teuthology.get_file(
1124                     remote=remote,
1125                     path=datadir + '/fsid',
1126                     sudo=True,
1127                 ).strip()
1128                 try:
1129                     remote.run(
1130                         args=[
1131                             'sudo', 'ceph', '--cluster', cluster_name,
1132                             'osd', 'new', osd_uuid, id_,
1133                         ]
1134                     )
1135                 except:
1136                     # fallback to pre-luminous (hammer or jewel)
1137                     remote.run(
1138                         args=[
1139                             'sudo', 'ceph', '--cluster', cluster_name,
1140                             'osd', 'create', osd_uuid,
1141                         ]
1142                     )
1143                 if config.get('add_osds_to_crush'):
1144                     remote.run(
1145                         args=[
1146                             'sudo', 'ceph', '--cluster', cluster_name,
1147                             'osd', 'crush', 'create-or-move', 'osd.' + id_,
1148                             '1.0', 'host=localhost', 'root=default',
1149                         ]
1150                     )
1151
1152             run_cmd = [
1153                 'sudo',
1154                 'adjust-ulimits',
1155                 'ceph-coverage',
1156                 coverage_dir,
1157                 'daemon-helper',
1158                 daemon_signal,
1159             ]
1160             run_cmd_tail = [
1161                 'ceph-%s' % (type_),
1162                 '-f',
1163                 '--cluster', cluster_name,
1164                 '-i', id_]
1165
1166             if type_ in config.get('cpu_profile', []):
1167                 profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role)
1168                 run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])
1169
1170             if config.get('valgrind') is not None:
1171                 valgrind_args = None
1172                 if type_ in config['valgrind']:
1173                     valgrind_args = config['valgrind'][type_]
1174                 if role in config['valgrind']:
1175                     valgrind_args = config['valgrind'][role]
1176                 run_cmd = teuthology.get_valgrind_args(testdir, role,
1177                                                        run_cmd,
1178                                                        valgrind_args)
1179
1180             run_cmd.extend(run_cmd_tail)
1181
1182             # always register mgr; don't necessarily start
1183             ctx.daemons.register_daemon(
1184                 remote, type_, id_,
1185                 cluster=cluster_name,
1186                 args=run_cmd,
1187                 logger=log.getChild(role),
1188                 stdin=run.PIPE,
1189                 wait=False
1190             )
1191             if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
1192                 role = cluster_name + '.' + type_
1193                 ctx.daemons.get_daemon(type_, id_, cluster_name).restart()
1194
1195     try:
1196         yield
1197     finally:
1198         teuthology.stop_daemons_of_type(ctx, type_, cluster_name)
1199
1200
1201 def healthy(ctx, config):
1202     """
1203     Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK.
1204
1205     :param ctx: Context
1206     :param config: Configuration
1207     """
1208     config = config if isinstance(config, dict) else dict()
1209     cluster_name = config.get('cluster', 'ceph')
1210     log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
1211     firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1212     (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1213     teuthology.wait_until_osds_up(
1214         ctx,
1215         cluster=ctx.cluster,
1216         remote=mon0_remote,
1217         ceph_cluster=cluster_name,
1218     )
1219     teuthology.wait_until_healthy(
1220         ctx,
1221         remote=mon0_remote,
1222         ceph_cluster=cluster_name,
1223     )
1224
1225     if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
1226         # Some MDSs exist, wait for them to be healthy
1227         ceph_fs = Filesystem(ctx) # TODO: make Filesystem cluster-aware
1228         ceph_fs.wait_for_daemons(timeout=300)
1229
1230
1231 def wait_for_osds_up(ctx, config):
1232     """
1233     Wait for all osd's to come up.
1234
1235     :param ctx: Context
1236     :param config: Configuration
1237     """
1238     log.info('Waiting until ceph osds are all up...')
1239     cluster_name = config.get('cluster', 'ceph')
1240     firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1241     (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1242     teuthology.wait_until_osds_up(
1243         ctx,
1244         cluster=ctx.cluster,
1245         remote=mon0_remote
1246     )
1247
1248
1249 def wait_for_mon_quorum(ctx, config):
1250     """
1251     Check renote ceph status until all monitors are up.
1252
1253     :param ctx: Context
1254     :param config: Configuration
1255     """
1256     if isinstance(config, dict):
1257         mons = config['daemons']
1258         cluster_name = config.get('cluster', 'ceph')
1259     else:
1260         assert isinstance(config, list)
1261         mons = config
1262         cluster_name = 'ceph'
1263     firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1264     (remote,) = ctx.cluster.only(firstmon).remotes.keys()
1265     with contextutil.safe_while(sleep=10, tries=60,
1266                                 action='wait for monitor quorum') as proceed:
1267         while proceed():
1268             r = remote.run(
1269                 args=[
1270                     'sudo',
1271                     'ceph',
1272                     'quorum_status',
1273                 ],
1274                 stdout=StringIO(),
1275                 logger=log.getChild('quorum_status'),
1276             )
1277             j = json.loads(r.stdout.getvalue())
1278             q = j.get('quorum_names', [])
1279             log.debug('Quorum: %s', q)
1280             if sorted(q) == sorted(mons):
1281                 break
1282
1283
1284 def created_pool(ctx, config):
1285     """
1286     Add new pools to the dictionary of pools that the ceph-manager
1287     knows about.
1288     """
1289     for new_pool in config:
1290         if new_pool not in ctx.managers['ceph'].pools:
1291             ctx.managers['ceph'].pools[new_pool] = ctx.managers['ceph'].get_pool_property(
1292                 new_pool, 'pg_num')
1293
1294
1295 @contextlib.contextmanager
1296 def restart(ctx, config):
1297     """
1298    restart ceph daemons
1299
1300    For example::
1301       tasks:
1302       - ceph.restart: [all]
1303
1304    For example::
1305       tasks:
1306       - ceph.restart: [osd.0, mon.1, mds.*]
1307
1308    or::
1309
1310       tasks:
1311       - ceph.restart:
1312           daemons: [osd.0, mon.1]
1313           wait-for-healthy: false
1314           wait-for-osds-up: true
1315
1316     :param ctx: Context
1317     :param config: Configuration
1318     """
1319     if config is None:
1320         config = {}
1321     elif isinstance(config, list):
1322         config = {'daemons': config}
1323
1324     daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1325     clusters = set()
1326     for role in daemons:
1327         cluster, type_, id_ = teuthology.split_role(role)
1328         ctx.daemons.get_daemon(type_, id_, cluster).restart()
1329         clusters.add(cluster)
1330
1331     manager = ctx.managers['ceph']
1332     for dmon in daemons:
1333         if '.' in dmon:
1334             dm_parts = dmon.split('.')
1335             if dm_parts[1].isdigit():
1336                 if dm_parts[0] == 'osd':
1337                     manager.mark_down_osd(int(dm_parts[1]))
1338
1339     if config.get('wait-for-healthy', True):
1340         for cluster in clusters:
1341             healthy(ctx=ctx, config=dict(cluster=cluster))
1342     if config.get('wait-for-osds-up', False):
1343         for cluster in clusters:
1344             wait_for_osds_up(ctx=ctx, config=dict(cluster=cluster))
1345     yield
1346
1347
1348 @contextlib.contextmanager
1349 def stop(ctx, config):
1350     """
1351     Stop ceph daemons
1352
1353     For example::
1354       tasks:
1355       - ceph.stop: [mds.*]
1356
1357       tasks:
1358       - ceph.stop: [osd.0, osd.2]
1359
1360       tasks:
1361       - ceph.stop:
1362           daemons: [osd.0, osd.2]
1363
1364     """
1365     if config is None:
1366         config = {}
1367     elif isinstance(config, list):
1368         config = {'daemons': config}
1369
1370     daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1371     for role in daemons:
1372         cluster, type_, id_ = teuthology.split_role(role)
1373         ctx.daemons.get_daemon(type_, id_, cluster).stop()
1374
1375     yield
1376
1377
1378 @contextlib.contextmanager
1379 def wait_for_failure(ctx, config):
1380     """
1381     Wait for a failure of a ceph daemon
1382
1383     For example::
1384       tasks:
1385       - ceph.wait_for_failure: [mds.*]
1386
1387       tasks:
1388       - ceph.wait_for_failure: [osd.0, osd.2]
1389
1390       tasks:
1391       - ceph.wait_for_failure:
1392           daemons: [osd.0, osd.2]
1393
1394     """
1395     if config is None:
1396         config = {}
1397     elif isinstance(config, list):
1398         config = {'daemons': config}
1399
1400     daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1401     for role in daemons:
1402         cluster, type_, id_ = teuthology.split_role(role)
1403         try:
1404             ctx.daemons.get_daemon(type_, id_, cluster).wait()
1405         except:
1406             log.info('Saw expected daemon failure.  Continuing.')
1407             pass
1408         else:
1409             raise RuntimeError('daemon %s did not fail' % role)
1410
1411     yield
1412
1413
1414 def validate_config(ctx, config):
1415     """
1416     Perform some simple validation on task configuration.
1417     Raises exceptions.ConfigError if an error is found.
1418     """
1419     # check for osds from multiple clusters on the same host
1420     for remote, roles_for_host in ctx.cluster.remotes.items():
1421         last_cluster = None
1422         last_role = None
1423         for role in roles_for_host:
1424             role_cluster, role_type, _ = teuthology.split_role(role)
1425             if role_type != 'osd':
1426                 continue
1427             if last_cluster and last_cluster != role_cluster:
1428                 msg = "Host should not have osds (%s and %s) from multiple clusters" % (
1429                     last_role, role)
1430                 raise exceptions.ConfigError(msg)
1431             last_cluster = role_cluster
1432             last_role = role
1433
1434
1435 @contextlib.contextmanager
1436 def task(ctx, config):
1437     """
1438     Set up and tear down a Ceph cluster.
1439
1440     For example::
1441
1442         tasks:
1443         - ceph:
1444         - interactive:
1445
1446     You can also specify what branch to run::
1447
1448         tasks:
1449         - ceph:
1450             branch: foo
1451
1452     Or a tag::
1453
1454         tasks:
1455         - ceph:
1456             tag: v0.42.13
1457
1458     Or a sha1::
1459
1460         tasks:
1461         - ceph:
1462             sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed
1463
1464     Or a local source dir::
1465
1466         tasks:
1467         - ceph:
1468             path: /home/sage/ceph
1469
1470     To capture code coverage data, use::
1471
1472         tasks:
1473         - ceph:
1474             coverage: true
1475
1476     To use btrfs, ext4, or xfs on the target's scratch disks, use::
1477
1478         tasks:
1479         - ceph:
1480             fs: xfs
1481             mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
1482             mount_options: [nobarrier, inode64]
1483
1484     Note, this will cause the task to check the /scratch_devs file on each node
1485     for available devices.  If no such file is found, /dev/sdb will be used.
1486
1487     To run some daemons under valgrind, include their names
1488     and the tool/args to use in a valgrind section::
1489
1490         tasks:
1491         - ceph:
1492           valgrind:
1493             mds.1: --tool=memcheck
1494             osd.1: [--tool=memcheck, --leak-check=no]
1495
1496     Those nodes which are using memcheck or valgrind will get
1497     checked for bad results.
1498
1499     To adjust or modify config options, use::
1500
1501         tasks:
1502         - ceph:
1503             conf:
1504               section:
1505                 key: value
1506
1507     For example::
1508
1509         tasks:
1510         - ceph:
1511             conf:
1512               mds.0:
1513                 some option: value
1514                 other key: other value
1515               client.0:
1516                 debug client: 10
1517                 debug ms: 1
1518
1519     By default, the cluster log is checked for errors and warnings,
1520     and the run marked failed if any appear. You can ignore log
1521     entries by giving a list of egrep compatible regexes, i.e.:
1522
1523         tasks:
1524         - ceph:
1525             log-whitelist: ['foo.*bar', 'bad message']
1526
1527     To run multiple ceph clusters, use multiple ceph tasks, and roles
1528     with a cluster name prefix, e.g. cluster1.client.0. Roles with no
1529     cluster use the default cluster name, 'ceph'. OSDs from separate
1530     clusters must be on separate hosts. Clients and non-osd daemons
1531     from multiple clusters may be colocated. For each cluster, add an
1532     instance of the ceph task with the cluster name specified, e.g.::
1533
1534         roles:
1535         - [mon.a, osd.0, osd.1]
1536         - [backup.mon.a, backup.osd.0, backup.osd.1]
1537         - [client.0, backup.client.0]
1538         tasks:
1539         - ceph:
1540             cluster: ceph
1541         - ceph:
1542             cluster: backup
1543
1544     :param ctx: Context
1545     :param config: Configuration
1546
1547     """
1548     if config is None:
1549         config = {}
1550     assert isinstance(config, dict), \
1551         "task ceph only supports a dictionary for configuration"
1552
1553     overrides = ctx.config.get('overrides', {})
1554     teuthology.deep_merge(config, overrides.get('ceph', {}))
1555
1556     first_ceph_cluster = False
1557     if not hasattr(ctx, 'daemons'):
1558         first_ceph_cluster = True
1559         ctx.daemons = DaemonGroup()
1560
1561     testdir = teuthology.get_testdir(ctx)
1562     if config.get('coverage'):
1563         coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
1564         log.info('Creating coverage directory...')
1565         run.wait(
1566             ctx.cluster.run(
1567                 args=[
1568                     'install', '-d', '-m0755', '--',
1569                     coverage_dir,
1570                 ],
1571                 wait=False,
1572             )
1573         )
1574
1575     if 'cluster' not in config:
1576         config['cluster'] = 'ceph'
1577
1578     validate_config(ctx, config)
1579
1580     subtasks = []
1581     if first_ceph_cluster:
1582         # these tasks handle general log setup and parsing on all hosts,
1583         # so they should only be run once
1584         subtasks = [
1585             lambda: ceph_log(ctx=ctx, config=None),
1586             lambda: valgrind_post(ctx=ctx, config=config),
1587         ]
1588
1589     subtasks += [
1590         lambda: cluster(ctx=ctx, config=dict(
1591             conf=config.get('conf', {}),
1592             fs=config.get('fs', 'xfs'),
1593             mkfs_options=config.get('mkfs_options', None),
1594             mount_options=config.get('mount_options', None),
1595             block_journal=config.get('block_journal', None),
1596             tmpfs_journal=config.get('tmpfs_journal', None),
1597             skip_mgr_daemons=config.get('skip_mgr_daemons', False),
1598             log_whitelist=config.get('log-whitelist', []),
1599             cpu_profile=set(config.get('cpu_profile', []),),
1600             cluster=config['cluster'],
1601         )),
1602         lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
1603         lambda: run_daemon(ctx=ctx, config=config, type_='mgr'),
1604         lambda: crush_setup(ctx=ctx, config=config),
1605         lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
1606         lambda: create_rbd_pool(ctx=ctx, config=config),
1607         lambda: cephfs_setup(ctx=ctx, config=config),
1608         lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
1609     ]
1610
1611     with contextutil.nested(*subtasks):
1612         first_mon = teuthology.get_first_mon(ctx, config, config['cluster'])
1613         (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
1614         if not hasattr(ctx, 'managers'):
1615             ctx.managers = {}
1616         ctx.managers[config['cluster']] = CephManager(
1617             mon,
1618             ctx=ctx,
1619             logger=log.getChild('ceph_manager.' + config['cluster']),
1620             cluster=config['cluster'],
1621         )
1622
1623         try:
1624             if config.get('wait-for-healthy', True):
1625                 healthy(ctx=ctx, config=dict(cluster=config['cluster']))
1626
1627             yield
1628         finally:
1629             if config.get('wait-for-scrub', True):
1630                 osd_scrub_pgs(ctx, config)
1631
1632             # stop logging health to clog during shutdown, or else we generate
1633             # a bunch of scary messages unrelated to our actual run.
1634             firstmon = teuthology.get_first_mon(ctx, config, config['cluster'])
1635             (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1636             mon0_remote.run(
1637                 args=[
1638                     'sudo',
1639                     'ceph',
1640                     '--cluster', config['cluster'],
1641                     'tell',
1642                     'mon.*',
1643                     'injectargs',
1644                     '--',
1645                     '--no-mon-health-to-clog',
1646                 ]
1647             )