Handle the setup, starting, and clean-up of a Ceph cluster.

from cStringIO import StringIO

from paramiko import SSHException
from ceph_manager import CephManager, write_conf
from tasks.cephfs.filesystem import Filesystem
from teuthology import misc as teuthology
from teuthology import contextutil
from teuthology import exceptions
from teuthology.orchestra import run
import ceph_client as cclient
from teuthology.orchestra.daemon import DaemonGroup

CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']

log = logging.getLogger(__name__)
def generate_caps(type_):
    Each call will return the next capability for each system type
    (essentially a subset of possible role values). Valid types are osd,

    for subsystem, capability in defaults[type_].items():
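        # Assumed behavior (the defaults table and yield statements are elided
        # above): each (subsystem, capability) pair appears to be emitted as a
        # '--cap', subsystem, capability argument triple, matching how
        # list(generate_caps(type_)) is spliced into the key-creation command below.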
@contextlib.contextmanager
def ceph_log(ctx, config):
    Create /var/log/ceph log directory that is open to everyone.
    Add valgrind and profiling-logger directories.

    :param config: Configuration

    log.info('Making ceph log dir writeable by non-root...')

    log.info('Disabling ceph logrotate...')
            '/etc/logrotate.d/ceph',

    log.info('Creating extra log directories...')
            'install', '-d', '-m0777', '--',
            '/var/log/ceph/valgrind',
            '/var/log/ceph/profiling-logger',
    class Rotater(object):
        stop_event = gevent.event.Event()

        def invoke_logrotate(self):
            # 1) install ceph-test.conf in /etc/logrotate.d
            # 2) continuously loop over logrotate invocation with ceph-test.conf
            while not self.stop_event.is_set():
                self.stop_event.wait(timeout=30)
                        args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'
                except exceptions.ConnectionLostError as e:
                    # Some tests may power off nodes during test, in which
                    # case we will see connection errors that we should ignore.
                    log.debug("Missed logrotate, node '{0}' is offline".format(
                except EOFError as e:
                    # Paramiko sometimes raises this when it fails to
                    # connect to a node during open_session. As with
                    # ConnectionLostError, we ignore this because nodes
                    # are allowed to get power cycled during tests.
                    log.debug("Missed logrotate, EOFError")
                except SSHException as e:
                    log.debug("Missed logrotate, SSHException")
                except socket.error as e:
                    if e.errno == errno.EHOSTUNREACH:
                        log.debug("Missed logrotate, host unreachable")

            self.thread = gevent.spawn(self.invoke_logrotate)

            self.stop_event.set()
    def write_rotate_conf(ctx, daemons):
        testdir = teuthology.get_testdir(ctx)
        rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf')
        with file(rotate_conf_path, 'rb') as f:
            for daemon, size in daemons.iteritems():
                log.info('writing logrotate stanza for {daemon}'.format(daemon=daemon))
                conf += f.read().format(daemon_type=daemon, max_size=size)

        for remote in ctx.cluster.remotes.iterkeys():
            teuthology.write_file(remote=remote,
                                  path='{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
                    '{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
                    '/etc/logrotate.d/ceph-test.conf',
                    '/etc/logrotate.d/ceph-test.conf',
                    '/etc/logrotate.d/ceph-test.conf'
            remote.chcon('/etc/logrotate.d/ceph-test.conf',
                         'system_u:object_r:etc_t:s0')
    if ctx.config.get('log-rotate'):
        daemons = ctx.config.get('log-rotate')
        log.info('Setting up log rotation with ' + str(daemons))
        write_rotate_conf(ctx, daemons)
        logrotater = Rotater()

        if ctx.config.get('log-rotate'):
            log.info('Shutting down logrotate')
                args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            log.info('Compressing logs...')

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            for remote in ctx.cluster.remotes.iterkeys():
                sub = os.path.join(path, remote.shortname)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))
def assign_devs(roles, devs):
    Create a dictionary of devs indexed by roles

    :param roles: List of roles
    :param devs: Corresponding list of devices.
    :returns: Dictionary of devs indexed by roles.
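    For example (illustrative values only)::

        assign_devs(['osd.0', 'osd.1'], ['/dev/sdb', '/dev/sdc'])
        # -> {'osd.0': '/dev/sdb', 'osd.1': '/dev/sdc'}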
    return dict(zip(roles, devs))
@contextlib.contextmanager
def valgrind_post(ctx, config):
    After the tests run, look through all the valgrind logs. Exceptions are raised
    if textual errors occurred in the logs, or if valgrind exceptions were detected in

    :param config: Configuration

        lookup_procs = list()
        log.info('Checking for errors in any valgrind logs...')
        for remote in ctx.cluster.remotes.iterkeys():
            # look at valgrind logs for each node
                    run.Raw('/var/log/ceph/valgrind/*'),
                    '/dev/null',  # include a second file so that we always get a filename prefix on the output
            lookup_procs.append((proc, remote))

        valgrind_exception = None
        for (proc, remote) in lookup_procs:
            out = proc.stdout.getvalue()
            for line in out.split('\n'):
                    (file, kind) = line.split(':')
                    log.error('failed to split line %s', line)
                log.debug('file %s kind %s', file, kind)
                if (file.find('mds') >= 0) and kind.find('Lost') > 0:
                    log.error('saw valgrind issue %s in %s', kind, file)
                    valgrind_exception = Exception('saw valgrind issues')

        if config.get('expect_valgrind_errors'):
            if not valgrind_exception:
                raise Exception('expected valgrind issues and found none')
        if valgrind_exception:
            raise valgrind_exception
@contextlib.contextmanager
def crush_setup(ctx, config):
    cluster_name = config['cluster']
    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    profile = config.get('crush_tunables', 'default')
    log.info('Setting crush tunables to %s', profile)
        args=['sudo', 'ceph', '--cluster', cluster_name,
              'osd', 'crush', 'tunables', profile])
@contextlib.contextmanager
def create_rbd_pool(ctx, config):
    cluster_name = config['cluster']
    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    log.info('Waiting for OSDs to come up')
    teuthology.wait_until_osds_up(
        ceph_cluster=cluster_name,
    log.info('Creating RBD pool')
        args=['sudo', 'ceph', '--cluster', cluster_name,
              'osd', 'pool', 'create', 'rbd', '8'])
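    # 'rbd' is the pool name and '8' the pg_num; the small fixed pg count is
    # assumed to be intentional for test-sized clusters.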
@contextlib.contextmanager
def cephfs_setup(ctx, config):
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because it requires the mon cluster to be up and running
        log.info('Setting up CephFS filesystem...')

        fs = Filesystem(ctx, create='cephfs')

        is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
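        # Illustrative reading of the check above: a plain 'mds.a' role counts as
        # active, while names ending in '-s' or containing '-s-' (assumed standby
        # naming) are excluded.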
        all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
        num_active = len([r for r in all_roles if is_active_mds(r)])

        fs.set_max_mds(num_active)
        fs.set_allow_dirfrags(True)
@contextlib.contextmanager
def cluster(ctx, config):
    Handle the creation and removal of a ceph cluster.

    Create directories needed for the cluster.
    Create remote journals for all osds.
    Create and set keyring.
    Copy the monmap to the test systems.
    Add keyring information to monmaps

    If errors occurred, extract a failure message and store in ctx.summary.
    Unmount all test files and temporary journaling files.
    Save the monitor information and archive all ceph logs.
    Clean up the keyring setup, and remove all monitor map and data files left over.

    :param config: Configuration
    if ctx.config.get('use_existing_cluster', False) is True:
        log.info("'use_existing_cluster' is true; skipping cluster creation")

    testdir = teuthology.get_testdir(ctx)
    cluster_name = config['cluster']
    data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
    log.info('Creating ceph cluster %s...', cluster_name)
            'install', '-d', '-m0755', '--',
            'install', '-d', '-m0777', '--', '/var/run/ceph',
    remote_to_roles_to_devs = {}
    remote_to_roles_to_journals = {}
    osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
    for remote, roles_for_host in osds.remotes.iteritems():
        devs = teuthology.get_scratch_devices(remote)
        roles_to_journals = {}
            log.info('fs option selected, checking for scratch devs')
            log.info('found devs: %s' % (str(devs),))
            devs_id_map = teuthology.get_wwn_id_map(remote, devs)
            iddevs = devs_id_map.values()
            roles_to_devs = assign_devs(
                teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
            if len(roles_to_devs) < len(iddevs):
                iddevs = iddevs[len(roles_to_devs):]
            devs_to_clean[remote] = []

        if config.get('block_journal'):
            log.info('block journal enabled')
            roles_to_journals = assign_devs(
                teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
            log.info('journal map: %s', roles_to_journals)

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled')
            roles_to_journals = {}
            remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
            for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
                tmpfs = '/mnt/' + role
                roles_to_journals[role] = tmpfs
                remote.run(args=['truncate', '-s', '1500M', tmpfs])
            log.info('journal map: %s', roles_to_journals)

        log.info('dev map: %s' % (str(roles_to_devs),))
        remote_to_roles_to_devs[remote] = roles_to_devs
        remote_to_roles_to_journals[remote] = roles_to_journals
    log.info('Generating config...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [role_list for (remote, role_list) in remotes_and_roles]
    ips = [host for (host, port) in
           (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
    conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips, cluster=cluster_name)
    for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
        for role, journal in roles_to_journals.iteritems():
            name = teuthology.ceph_role(role)
            conf[name]['osd journal'] = journal
    for section, keys in config['conf'].iteritems():
        for key, value in keys.iteritems():
            log.info("[%s] %s = %s" % (section, key, value))
            if section not in conf:
            conf[section][key] = value

    if config.get('tmpfs_journal'):
        conf['journal dio'] = False

    if not hasattr(ctx, 'ceph'):
    ctx.ceph[cluster_name] = argparse.Namespace()
    ctx.ceph[cluster_name].conf = conf
    default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
    keyring_path = config.get('keyring_path', default_keyring)

    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)

    log.info('Setting up %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
    ctx.cluster.only(firstmon).run(
    ctx.cluster.only(firstmon).run(
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
                                                   cluster=cluster_name)
    fsid = teuthology.create_simple_monmap(
    if not 'global' in conf:
    conf['global']['fsid'] = fsid

    default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
    conf_path = config.get('conf_path', default_conf_path)
    log.info('Writing %s for FSID %s...' % (conf_path, fsid))
    write_conf(ctx, conf_path, cluster_name)

    log.info('Creating admin key on %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
            '--name=client.admin',
            '--cap', 'mon', 'allow *',
            '--cap', 'osd', 'allow *',
            '--cap', 'mds', 'allow *',
            '--cap', 'mgr', 'allow *',
    log.info('Copying monmap to all nodes...')
    keyring = teuthology.get_file(
    monmap = teuthology.get_file(

    for rem in ctx.cluster.remotes.iterkeys():
        # copy mon key and initial monmap
        log.info('Sending monmap to node {remote}'.format(remote=rem))
        teuthology.sudo_write_file(
        teuthology.write_file(

    log.info('Setting up mon nodes...')
    mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))

    if not config.get('skip_mgr_daemons', False):
        log.info('Setting up mgr nodes...')
        mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name))
        for remote, roles_for_host in mgrs.remotes.iteritems():
            for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr',
                _, _, id_ = teuthology.split_role(role)
                mgr_dir = '/var/lib/ceph/mgr/{cluster}-{id}'.format(
                    cluster=cluster_name,
                        '--name=mgr.{id}'.format(id=id_),
                        mgr_dir + '/keyring',

    log.info('Setting up mds nodes...')
    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
    for remote, roles_for_host in mdss.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
            _, _, id_ = teuthology.split_role(role)
            mds_dir = '/var/lib/ceph/mds/{cluster}-{id}'.format(
                cluster=cluster_name,
                    '--name=mds.{id}'.format(id=id_),
                    mds_dir + '/keyring',

    cclient.create_keyring(ctx, cluster_name)
    log.info('Running mkfs on osd nodes...')

    if not hasattr(ctx, 'disk_config'):
        ctx.disk_config = argparse.Namespace()
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
        ctx.disk_config.remote_to_roles_to_dev = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_journals'):
        ctx.disk_config.remote_to_roles_to_journals = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
        ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
        ctx.disk_config.remote_to_roles_to_dev_fstype = {}

    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)
    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals, remote_to_roles_to_journals)

    log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
    for remote, roles_for_host in osds.remotes.iteritems():
        roles_to_devs = remote_to_roles_to_devs[remote]
        roles_to_journals = remote_to_roles_to_journals[remote]

        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mnt_point = '/var/lib/ceph/osd/{cluster}-{id}'.format(cluster=cluster_name, id=id_)
            log.info(str(roles_to_journals))

            if roles_to_devs.get(role):
                dev = roles_to_devs[role]
                fs = config.get('fs')

                mkfs_options = config.get('mkfs_options')
                mount_options = config.get('mount_options')

                    # package = 'btrfs-tools'
                    if mount_options is None:
                        mount_options = ['noatime', 'user_subvol_rm_allowed']
                    if mkfs_options is None:
                        mkfs_options = ['-m', 'single',

                    # package = 'xfsprogs'
                    if mount_options is None:
                        mount_options = ['noatime']
                    if mkfs_options is None:
                        mkfs_options = ['-f', '-i', 'size=2048']
                if fs == 'ext4' or fs == 'ext3':
                    if mount_options is None:
                        mount_options = ['noatime', 'user_xattr']

                if mount_options is None:
                if mkfs_options is None:
                mkfs = ['mkfs.%s' % fs] + mkfs_options
                log.info('%s on %s on %s' % (mkfs, dev, remote))
                if package is not None:
                            'apt-get', 'install', '-y', package

                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
                except run.CommandFailedError:
                    # Newer btrfs-tools doesn't prompt for overwrite, use -f
                    if '-f' not in mount_options:
                        mkfs_options.append('-f')
                        mkfs = ['mkfs.%s' % fs] + mkfs_options
                        log.info('%s on %s on %s' % (mkfs, dev, remote))
                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])

                log.info('mount %s on %s -o %s' % (dev, remote,
                                                   ','.join(mount_options)))
                        '-o', ','.join(mount_options),
                        'sudo', '/sbin/restorecon', mnt_point,
                if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
                    ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
                if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
                    ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
                devs_to_clean[remote].append(mnt_point)

        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
            _, _, id_ = teuthology.split_role(role)
                        '--monmap', monmap_path,
    log.info('Reading keys from all nodes...')

    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['mgr', 'mds', 'osd']:
            if type_ == 'mgr' and config.get('skip_mgr_daemons', False):
            for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
                _, _, id_ = teuthology.split_role(role)
                data = teuthology.get_file(
                    path='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
                        cluster=cluster_name,
                keys.append((type_, id_, data))

    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', cluster_name):
            _, _, id_ = teuthology.split_role(role)
            data = teuthology.get_file(
                path='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_, cluster=cluster_name)
            keys.append(('client', id_, data))

    log.info('Adding keys to all mons...')
    teuthology.feed_many_stdins_and_close(keys_fp, writes)

    for type_, id_, data in keys:
                '--name={type}.{id}'.format(
            ] + list(generate_caps(type_)),
    log.info('Running mkfs on mon nodes...')
    for remote, roles_for_host in mons.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon', cluster_name):
            _, _, id_ = teuthology.split_role(role)
                    '/var/lib/ceph/mon/{cluster}-{id}'.format(id=id_, cluster=cluster_name),
                    '--cluster', cluster_name,
                    '--monmap', monmap_path,
                    '--keyring', keyring_path,
        # we need to know this below
        ctx.summary['success'] = False

        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()

        log.info('Checking cluster log for badness...')

        def first_in_ceph_log(pattern, excludes):
            Find the first occurrence of the pattern specified in the Ceph log,
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
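            # The lookup below is assumed to boil down to a remote pipeline of
            # roughly this shape (illustrative, surrounding pieces elided):
            #   sudo egrep '<pattern>' /var/log/ceph/<cluster>.log \
            #       | egrep -v '<exclude>' ... | head -n 1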
                '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
            for exclude in excludes:
                args.extend([run.Raw('|'), 'egrep', '-v', exclude])
                run.Raw('|'), 'head', '-n', '1',
            stdout = r.stdout.getvalue()

        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
                             config['log_whitelist']) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log_whitelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
        for remote, dirs in devs_to_clean.iteritems():
                log.info('Unmounting %s on %s' % (dir_, remote))
                except Exception as e:
                        run.Raw('PATH=/usr/sbin:$PATH'),

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
            for remote, roles_for_host in osds.remotes.iteritems():
                    args=['sudo', 'umount', '-f', '/mnt'],

        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
                if e.errno == errno.EEXIST:
            for remote, roles in mons.remotes.iteritems():
                is_mon = teuthology.is_type('mon', cluster_name)
                        _, _, id_ = teuthology.split_role(role)
                        mon_dir = '/var/lib/ceph/mon/' + \
                            '{0}-{1}'.format(cluster_name, id_)
                        teuthology.pull_directory_tarball(
                            path + '/' + role + '.tgz')

        log.info('Cleaning ceph cluster...')
                    run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
def osd_scrub_pgs(ctx, config):
    Scrub pgs when we exit.

    First make sure all pgs are active and clean.
    Next scrub all osds.
    Then periodically check until all pgs have scrub time stamps that
    indicate the last scrub completed. Time out if no progress is made
    here after two minutes.

    cluster_name = config['cluster']
    manager = ctx.managers[cluster_name]
    for _ in range(0, retries):
        stats = manager.get_pg_stats()
        bad = [stat['pgid'] for stat in stats if 'active+clean' not in stat['state']]
            "Waiting for all PGs to be active and clean, waiting on %s" % bad)
        raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
    check_time_now = time.localtime()

    all_roles = teuthology.all_roles(ctx.cluster)
    for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name):
        log.info("Scrubbing {osd}".format(osd=role))
        _, _, id_ = teuthology.split_role(role)
        # allow this to fail; in certain cases the OSD might not be up
        # at this point. we will catch all pgs below.
            manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
        except run.CommandFailedError:
        stats = manager.get_pg_stats()
        timez = [(stat['pgid'], stat['last_scrub_stamp']) for stat in stats]
        for (pgid, tmval) in timez:
            pgtm = time.strptime(tmval[0:tmval.find('.')], '%Y-%m-%d %H:%M:%S')
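            # last_scrub_stamp is assumed to look like '2017-05-01 12:34:56.789012'
            # (illustrative value); the fractional seconds are stripped before parsing.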
            if pgtm > check_time_now:
                log.info('pgid %s last_scrub_stamp %s %s <= %s', pgid, tmval, pgtm, check_time_now)
        if thiscnt > prev_good:
            if gap_cnt % 6 == 0:
                for (pgid, tmval) in timez:
                    # re-request scrub every so often in case the earlier
                    # request was missed. do not do it every time because
                    # the scrub may be in progress or not reported yet and
                    # we will starve progress.
                    manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
            if gap_cnt > retries:
                raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
            log.info('Still waiting for all pgs to be scrubbed.')
@contextlib.contextmanager
def run_daemon(ctx, config, type_):
    Run daemons for a role type. Handle the startup and termination of a daemon.
    On startup -- set coverages, cpu_profile, valgrind values for all remotes,
    and a max_mds value for one mds.
    On cleanup -- Stop all existing daemons of this type.

    :param config: Configuration
    :param type_: Role type
    cluster_name = config['cluster']
    log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
    testdir = teuthology.get_testdir(ctx)
    daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name))

    # check whether any daemons of this type are configured

    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    daemon_signal = 'kill'
    if config.get('coverage') or config.get('valgrind') is not None:
        daemon_signal = 'term'

    # create osds in order. (this only matters for pre-luminous, which might
    # be hammer, which doesn't take an id_ argument to legacy 'osd create').
    for remote, roles_for_host in daemons.remotes.iteritems():
        is_type_ = teuthology.is_type(type_, cluster_name)
        for role in roles_for_host:
            if not is_type_(role):
            _, _, id_ = teuthology.split_role(role)
                datadir='/var/lib/ceph/osd/{cluster}-{id}'.format(
                    cluster=cluster_name, id=id_)
                osd_uuid = teuthology.get_file(
                    path=datadir + '/fsid',
                osd_uuids[id_] = osd_uuid
    for osd_id in range(len(osd_uuids)):
        osd_uuid = osd_uuids.get(id_)
                    'sudo', 'ceph', '--cluster', cluster_name,
                    'osd', 'new', osd_uuid, id_,
            # fallback to pre-luminous (hammer or jewel)
                    'sudo', 'ceph', '--cluster', cluster_name,
                    'osd', 'create', osd_uuid,
        if config.get('add_osds_to_crush'):
                    'sudo', 'ceph', '--cluster', cluster_name,
                    'osd', 'crush', 'create-or-move', 'osd.' + id_,
                    '1.0', 'host=localhost', 'root=default',
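            # i.e. with 'add_osds_to_crush' set, each osd is assumed to be placed in
            # the CRUSH map with weight 1.0 under host=localhost / root=default.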
    for remote, roles_for_host in daemons.remotes.iteritems():
        is_type_ = teuthology.is_type(type_, cluster_name)
        for role in roles_for_host:
            if not is_type_(role):
            _, _, id_ = teuthology.split_role(role)
                'ceph-%s' % (type_),
                '--cluster', cluster_name,

            if type_ in config.get('cpu_profile', []):
                profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role)
                run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])

            if config.get('valgrind') is not None:
                valgrind_args = None
                if type_ in config['valgrind']:
                    valgrind_args = config['valgrind'][type_]
                if role in config['valgrind']:
                    valgrind_args = config['valgrind'][role]
                run_cmd = teuthology.get_valgrind_args(testdir, role,

            run_cmd.extend(run_cmd_tail)

            # always register mgr; don't necessarily start
            ctx.daemons.register_daemon(
                cluster=cluster_name,
                logger=log.getChild(role),

            if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
                role = cluster_name + '.' + type_
                ctx.daemons.get_daemon(type_, id_, cluster_name).restart()

        teuthology.stop_daemons_of_type(ctx, type_, cluster_name)
def healthy(ctx, config):
    Wait for all osds to be up, and for the ceph health monitor to return HEALTH_OK.

    :param config: Configuration

    config = config if isinstance(config, dict) else dict()
    cluster_name = config.get('cluster', 'ceph')
    log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
    manager = ctx.managers[cluster_name]
        manager.wait_for_mgr_available()
    except run.CommandFailedError:
        log.info('ignoring mgr wait error, probably testing upgrade')

    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        cluster=ctx.cluster,
        ceph_cluster=cluster_name,

        manager.flush_all_pg_stats()
    except run.CommandFailedError:
        log.info('ignoring flush pg stats error, probably testing upgrade')
    manager.wait_for_clean()

    log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
    teuthology.wait_until_healthy(
        ceph_cluster=cluster_name,

    if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
        # Some MDSs exist, wait for them to be healthy
        ceph_fs = Filesystem(ctx)  # TODO: make Filesystem cluster-aware
        ceph_fs.wait_for_daemons(timeout=300)
def wait_for_osds_up(ctx, config):
    Wait for all osds to come up.

    :param config: Configuration

    log.info('Waiting until ceph osds are all up...')
    cluster_name = config.get('cluster', 'ceph')
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        cluster=ctx.cluster,
def wait_for_mon_quorum(ctx, config):
    Check remote ceph status until all monitors are up.

    :param config: Configuration
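    An illustrative invocation (assumed form; the config may also be given as a
    plain list of mon names, as handled below)::

        tasks:
        - ceph.wait_for_mon_quorum:
            daemons: [a, b, c]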
    if isinstance(config, dict):
        mons = config['daemons']
        cluster_name = config.get('cluster', 'ceph')

        assert isinstance(config, list)
        cluster_name = 'ceph'
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (remote,) = ctx.cluster.only(firstmon).remotes.keys()
    with contextutil.safe_while(sleep=10, tries=60,
                                action='wait for monitor quorum') as proceed:
                logger=log.getChild('quorum_status'),
            j = json.loads(r.stdout.getvalue())
            q = j.get('quorum_names', [])
            log.debug('Quorum: %s', q)
            if sorted(q) == sorted(mons):
def created_pool(ctx, config):
    Add new pools to the dictionary of pools that the ceph-manager

    for new_pool in config:
        if new_pool not in ctx.managers['ceph'].pools:
            ctx.managers['ceph'].pools[new_pool] = ctx.managers['ceph'].get_pool_property(
@contextlib.contextmanager
def restart(ctx, config):
    restart ceph daemons

        - ceph.restart: [all]

        - ceph.restart: [osd.0, mon.1, mds.*]

            daemons: [osd.0, mon.1]
            wait-for-healthy: false
            wait-for-osds-up: true

    :param config: Configuration

    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)

    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).restart()
        clusters.add(cluster)

    manager = ctx.managers['ceph']
    for dmon in daemons:
        dm_parts = dmon.split('.')
        if dm_parts[1].isdigit():
            if dm_parts[0] == 'osd':
                manager.mark_down_osd(int(dm_parts[1]))

    if config.get('wait-for-healthy', True):
        for cluster in clusters:
            healthy(ctx=ctx, config=dict(cluster=cluster))
    if config.get('wait-for-osds-up', False):
        for cluster in clusters:
            wait_for_osds_up(ctx=ctx, config=dict(cluster=cluster))
@contextlib.contextmanager
def stop(ctx, config):
        - ceph.stop: [mds.*]

        - ceph.stop: [osd.0, osd.2]

            daemons: [osd.0, osd.2]

    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).stop()
@contextlib.contextmanager
def wait_for_failure(ctx, config):
    Wait for a failure of a ceph daemon

        - ceph.wait_for_failure: [mds.*]

        - ceph.wait_for_failure: [osd.0, osd.2]

        - ceph.wait_for_failure:
            daemons: [osd.0, osd.2]

    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
            ctx.daemons.get_daemon(type_, id_, cluster).wait()
            log.info('Saw expected daemon failure. Continuing.')
            raise RuntimeError('daemon %s did not fail' % role)
def validate_config(ctx, config):
    Perform some simple validation on task configuration.
    Raises exceptions.ConfigError if an error is found.

    # check for osds from multiple clusters on the same host
    for remote, roles_for_host in ctx.cluster.remotes.items():
        for role in roles_for_host:
            role_cluster, role_type, _ = teuthology.split_role(role)
            if role_type != 'osd':
            if last_cluster and last_cluster != role_cluster:
                msg = "Host should not have osds (%s and %s) from multiple clusters" % (
                raise exceptions.ConfigError(msg)
            last_cluster = role_cluster
@contextlib.contextmanager
def task(ctx, config):
    Set up and tear down a Ceph cluster.

    You can also specify what branch to run::

            sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed

    Or a local source dir::

            path: /home/sage/ceph

    To capture code coverage data, use::

    To use btrfs, ext4, or xfs on the target's scratch disks, use::

            mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
            mount_options: [nobarrier, inode64]

    Note, this will cause the task to check the /scratch_devs file on each node
    for available devices. If no such file is found, /dev/sdb will be used.

    To run some daemons under valgrind, include their names
    and the tool/args to use in a valgrind section::

            mds.1: --tool=memcheck
            osd.1: [--tool=memcheck, --leak-check=no]

    Those nodes which are using memcheck or valgrind will get
    checked for bad results.

    To adjust or modify config options, use::

            other key: other value

    By default, the cluster log is checked for errors and warnings,
    and the run marked failed if any appear. You can ignore log
    entries by giving a list of egrep compatible regexes, i.e.:

        log-whitelist: ['foo.*bar', 'bad message']

    To run multiple ceph clusters, use multiple ceph tasks, and roles
    with a cluster name prefix, e.g. cluster1.client.0. Roles with no
    cluster use the default cluster name, 'ceph'. OSDs from separate
    clusters must be on separate hosts. Clients and non-osd daemons
    from multiple clusters may be colocated. For each cluster, add an
    instance of the ceph task with the cluster name specified, e.g.::

        - [mon.a, osd.0, osd.1]
        - [backup.mon.a, backup.osd.0, backup.osd.1]
        - [client.0, backup.client.0]
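    The matching tasks section would then add one ceph task per cluster,
    along these lines (illustrative)::

        tasks:
        - ceph:
            cluster: ceph
        - ceph:
            cluster: backup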
    :param config: Configuration
    assert isinstance(config, dict), \
        "task ceph only supports a dictionary for configuration"

    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph', {}))

    first_ceph_cluster = False
    if not hasattr(ctx, 'daemons'):
        first_ceph_cluster = True
        ctx.daemons = DaemonGroup()

    testdir = teuthology.get_testdir(ctx)
    if config.get('coverage'):
        coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
        log.info('Creating coverage directory...')
                'install', '-d', '-m0755', '--',

    if 'cluster' not in config:
        config['cluster'] = 'ceph'

    validate_config(ctx, config)
    if first_ceph_cluster:
        # these tasks handle general log setup and parsing on all hosts,
        # so they should only be run once
            lambda: ceph_log(ctx=ctx, config=None),
            lambda: valgrind_post(ctx=ctx, config=config),

        lambda: cluster(ctx=ctx, config=dict(
            conf=config.get('conf', {}),
            fs=config.get('fs', 'xfs'),
            mkfs_options=config.get('mkfs_options', None),
            mount_options=config.get('mount_options', None),
            block_journal=config.get('block_journal', None),
            tmpfs_journal=config.get('tmpfs_journal', None),
            skip_mgr_daemons=config.get('skip_mgr_daemons', False),
            log_whitelist=config.get('log-whitelist', []),
            cpu_profile=set(config.get('cpu_profile', []),),
            cluster=config['cluster'],
        lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
        lambda: run_daemon(ctx=ctx, config=config, type_='mgr'),
        lambda: crush_setup(ctx=ctx, config=config),
        lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
        lambda: create_rbd_pool(ctx=ctx, config=config),
        lambda: cephfs_setup(ctx=ctx, config=config),
        lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
    with contextutil.nested(*subtasks):
        first_mon = teuthology.get_first_mon(ctx, config, config['cluster'])
        (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
        if not hasattr(ctx, 'managers'):
        ctx.managers[config['cluster']] = CephManager(
            logger=log.getChild('ceph_manager.' + config['cluster']),
            cluster=config['cluster'],

        if config.get('wait-for-healthy', True):
            healthy(ctx=ctx, config=dict(cluster=config['cluster']))

        if config.get('wait-for-scrub', True):
            osd_scrub_pgs(ctx, config)

        # stop logging health to clog during shutdown, or else we generate
        # a bunch of scary messages unrelated to our actual run.
        firstmon = teuthology.get_first_mon(ctx, config, config['cluster'])
        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
                '--cluster', config['cluster'],
                '--no-mon-health-to-clog',