4 Handle the setup, starting, and clean-up of a Ceph cluster.
6 from cStringIO
import StringIO
18 from paramiko
import SSHException
19 from ceph_manager
import CephManager
, write_conf
20 from tasks
.cephfs
.filesystem
import Filesystem
21 from teuthology
import misc
as teuthology
22 from teuthology
import contextutil
23 from teuthology
import exceptions
24 from teuthology
.orchestra
import run
25 import ceph_client
as cclient
26 from teuthology
.orchestra
.daemon
import DaemonGroup
28 CEPH_ROLE_TYPES
= ['mon', 'mgr', 'osd', 'mds', 'rgw']
30 log
= logging
.getLogger(__name__
)
33 def generate_caps(type_
):
35 Each call will return the next capability for each system type
36 (essentially a subset of possible role values). Valid types are osd,
61 for subsystem
, capability
in defaults
[type_
].items():
67 @contextlib.contextmanager
68 def ceph_log(ctx
, config
):
70 Create /var/log/ceph log directory that is open to everyone.
71 Add valgrind and profiling-logger directories.
74 :param config: Configuration
76 log
.info('Making ceph log dir writeable by non-root...')
88 log
.info('Disabling ceph logrotate...')
94 '/etc/logrotate.d/ceph',
99 log
.info('Creating extra log directories...')
104 'install', '-d', '-m0777', '--',
105 '/var/log/ceph/valgrind',
106 '/var/log/ceph/profiling-logger',
112 class Rotater(object):
113 stop_event
= gevent
.event
.Event()
115 def invoke_logrotate(self
):
116 # 1) install ceph-test.conf in /etc/logrotate.d
117 # 2) continuously loop over logrotate invocation with ceph-test.conf
118 while not self
.stop_event
.is_set():
119 self
.stop_event
.wait(timeout
=30)
123 args
=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'
128 except exceptions
.ConnectionLostError
as e
:
129 # Some tests may power off nodes during test, in which
130 # case we will see connection errors that we should ignore.
131 log
.debug("Missed logrotate, node '{0}' is offline".format(
133 except EOFError as e
:
134 # Paramiko sometimes raises this when it fails to
135 # connect to a node during open_session. As with
136 # ConnectionLostError, we ignore this because nodes
137 # are allowed to get power cycled during tests.
138 log
.debug("Missed logrotate, EOFError")
139 except SSHException
as e
:
140 log
.debug("Missed logrotate, SSHException")
141 except socket
.error
as e
:
142 if e
.errno
== errno
.EHOSTUNREACH
:
143 log
.debug("Missed logrotate, host unreachable")
148 self
.thread
= gevent
.spawn(self
.invoke_logrotate
)
151 self
.stop_event
.set()
154 def write_rotate_conf(ctx
, daemons
):
155 testdir
= teuthology
.get_testdir(ctx
)
156 rotate_conf_path
= os
.path
.join(os
.path
.dirname(__file__
), 'logrotate.conf')
157 with
file(rotate_conf_path
, 'rb') as f
:
159 for daemon
, size
in daemons
.iteritems():
160 log
.info('writing logrotate stanza for {daemon}'.format(daemon
=daemon
))
161 conf
+= f
.read().format(daemon_type
=daemon
, max_size
=size
)
164 for remote
in ctx
.cluster
.remotes
.iterkeys():
165 teuthology
.write_file(remote
=remote
,
166 path
='{tdir}/logrotate.ceph-test.conf'.format(tdir
=testdir
),
173 '{tdir}/logrotate.ceph-test.conf'.format(tdir
=testdir
),
174 '/etc/logrotate.d/ceph-test.conf',
179 '/etc/logrotate.d/ceph-test.conf',
184 '/etc/logrotate.d/ceph-test.conf'
187 remote
.chcon('/etc/logrotate.d/ceph-test.conf',
188 'system_u:object_r:etc_t:s0')
190 if ctx
.config
.get('log-rotate'):
191 daemons
= ctx
.config
.get('log-rotate')
192 log
.info('Setting up log rotation with ' + str(daemons
))
193 write_rotate_conf(ctx
, daemons
)
194 logrotater
= Rotater()
200 if ctx
.config
.get('log-rotate'):
201 log
.info('Shutting down logrotate')
204 args
=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'
207 if ctx
.archive
is not None and \
208 not (ctx
.config
.get('archive-on-error') and ctx
.summary
['success']):
210 log
.info('Compressing logs...')
233 log
.info('Archiving logs...')
234 path
= os
.path
.join(ctx
.archive
, 'remote')
236 for remote
in ctx
.cluster
.remotes
.iterkeys():
237 sub
= os
.path
.join(path
, remote
.shortname
)
239 teuthology
.pull_directory(remote
, '/var/log/ceph',
240 os
.path
.join(sub
, 'log'))
def assign_devs(roles, devs):
    """
    Create a dictionary of devs indexed by roles

    :param roles: List of roles
    :param devs: Corresponding list of devices.
    :returns: Dictionary of devs indexed by roles.
    """
    # zip truncates to the shorter list, so extra devices are simply unused
    return dict(zip(roles, devs))
254 @contextlib.contextmanager
255 def valgrind_post(ctx
, config
):
257 After the tests run, look through all the valgrind logs. Exceptions are raised
258 if textual errors occurred in the logs, or if valgrind exceptions were detected in
262 :param config: Configuration
267 lookup_procs
= list()
268 log
.info('Checking for errors in any valgrind logs...')
269 for remote
in ctx
.cluster
.remotes
.iterkeys():
270 # look at valgrind logs for each node
276 run
.Raw('/var/log/ceph/valgrind/*'),
277 '/dev/null', # include a second file so that we always get a filename prefix on the output
287 lookup_procs
.append((proc
, remote
))
289 valgrind_exception
= None
290 for (proc
, remote
) in lookup_procs
:
292 out
= proc
.stdout
.getvalue()
293 for line
in out
.split('\n'):
297 (file, kind
) = line
.split(':')
299 log
.error('failed to split line %s', line
)
301 log
.debug('file %s kind %s', file, kind
)
302 if (file.find('mds') >= 0) and kind
.find('Lost') > 0:
304 log
.error('saw valgrind issue %s in %s', kind
, file)
305 valgrind_exception
= Exception('saw valgrind issues')
307 if config
.get('expect_valgrind_errors'):
308 if not valgrind_exception
:
309 raise Exception('expected valgrind issues and found none')
311 if valgrind_exception
:
312 raise valgrind_exception
@contextlib.contextmanager
def crush_setup(ctx, config):
    """
    Apply the configured crush tunables profile via the first monitor.

    :param ctx: Context
    :param config: Configuration; reads 'cluster' and the optional
                   'crush_tunables' profile name (defaults to 'default').
    """
    cluster_name = config['cluster']
    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    profile = config.get('crush_tunables', 'default')
    log.info('Setting crush tunables to %s', profile)
    mon_remote.run(
        args=['sudo', 'ceph', '--cluster', cluster_name,
              'osd', 'crush', 'tunables', profile])
    yield
@contextlib.contextmanager
def create_rbd_pool(ctx, config):
    """
    Once all OSDs are up, create the 'rbd' pool on the first monitor and
    enable the rbd application on it.

    :param ctx: Context
    :param config: Configuration; reads 'cluster'.
    """
    cluster_name = config['cluster']
    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    log.info('Waiting for OSDs to come up')
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon_remote,
        ceph_cluster=cluster_name,
    )
    log.info('Creating RBD pool')
    mon_remote.run(
        args=['sudo', 'ceph', '--cluster', cluster_name,
              'osd', 'pool', 'create', 'rbd', '8'])
    # NOTE(review): 'osd pool application enable' does not exist on older
    # releases, so this is best-effort (check_status=False) — confirm against
    # upstream before relying on it.
    mon_remote.run(
        args=[
            'sudo', 'ceph', '--cluster', cluster_name,
            'osd', 'pool', 'application', 'enable',
            'rbd', 'rbd', '--yes-i-really-mean-it'
        ],
        check_status=False)
    yield
354 @contextlib.contextmanager
355 def cephfs_setup(ctx
, config
):
356 cluster_name
= config
['cluster']
357 testdir
= teuthology
.get_testdir(ctx
)
358 coverage_dir
= '{tdir}/archive/coverage'.format(tdir
=testdir
)
360 first_mon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
361 (mon_remote
,) = ctx
.cluster
.only(first_mon
).remotes
.iterkeys()
362 mdss
= ctx
.cluster
.only(teuthology
.is_type('mds', cluster_name
))
363 # If there are any MDSs, then create a filesystem for them to use
364 # Do this last because requires mon cluster to be up and running
366 log
.info('Setting up CephFS filesystem...')
368 fs
= Filesystem(ctx
, name
='cephfs', create
=True)
370 is_active_mds
= lambda role
: 'mds.' in role
and not role
.endswith('-s') and '-s-' not in role
371 all_roles
= [item
for remote_roles
in mdss
.remotes
.values() for item
in remote_roles
]
372 num_active
= len([r
for r
in all_roles
if is_active_mds(r
)])
374 fs
.set_max_mds(num_active
)
375 fs
.set_allow_dirfrags(True)
380 @contextlib.contextmanager
381 def cluster(ctx
, config
):
383 Handle the creation and removal of a ceph cluster.
386 Create directories needed for the cluster.
387 Create remote journals for all osds.
388 Create and set keyring.
389 Copy the monmap to the test systems.
393 Add keyring information to monmaps
397 If errors occured, extract a failure message and store in ctx.summary.
398 Unmount all test files and temporary journaling files.
399 Save the monitor information and archive all ceph logs.
400 Cleanup the keyring setup, and remove all monitor map and data files left over.
403 :param config: Configuration
405 if ctx
.config
.get('use_existing_cluster', False) is True:
406 log
.info("'use_existing_cluster' is true; skipping cluster creation")
409 testdir
= teuthology
.get_testdir(ctx
)
410 cluster_name
= config
['cluster']
411 data_dir
= '{tdir}/{cluster}.data'.format(tdir
=testdir
, cluster
=cluster_name
)
412 log
.info('Creating ceph cluster %s...', cluster_name
)
416 'install', '-d', '-m0755', '--',
427 'install', '-d', '-m0777', '--', '/var/run/ceph',
434 remote_to_roles_to_devs
= {}
435 remote_to_roles_to_journals
= {}
436 osds
= ctx
.cluster
.only(teuthology
.is_type('osd', cluster_name
))
437 for remote
, roles_for_host
in osds
.remotes
.iteritems():
438 devs
= teuthology
.get_scratch_devices(remote
)
440 roles_to_journals
= {}
442 log
.info('fs option selected, checking for scratch devs')
443 log
.info('found devs: %s' % (str(devs
),))
444 devs_id_map
= teuthology
.get_wwn_id_map(remote
, devs
)
445 iddevs
= devs_id_map
.values()
446 roles_to_devs
= assign_devs(
447 teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
), iddevs
449 if len(roles_to_devs
) < len(iddevs
):
450 iddevs
= iddevs
[len(roles_to_devs
):]
451 devs_to_clean
[remote
] = []
453 if config
.get('block_journal'):
454 log
.info('block journal enabled')
455 roles_to_journals
= assign_devs(
456 teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
), iddevs
458 log
.info('journal map: %s', roles_to_journals
)
460 if config
.get('tmpfs_journal'):
461 log
.info('tmpfs journal enabled')
462 roles_to_journals
= {}
463 remote
.run(args
=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
464 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
):
465 tmpfs
= '/mnt/' + role
466 roles_to_journals
[role
] = tmpfs
467 remote
.run(args
=['truncate', '-s', '1500M', tmpfs
])
468 log
.info('journal map: %s', roles_to_journals
)
470 log
.info('dev map: %s' % (str(roles_to_devs
),))
471 remote_to_roles_to_devs
[remote
] = roles_to_devs
472 remote_to_roles_to_journals
[remote
] = roles_to_journals
474 log
.info('Generating config...')
475 remotes_and_roles
= ctx
.cluster
.remotes
.items()
476 roles
= [role_list
for (remote
, role_list
) in remotes_and_roles
]
477 ips
= [host
for (host
, port
) in
478 (remote
.ssh
.get_transport().getpeername() for (remote
, role_list
) in remotes_and_roles
)]
479 conf
= teuthology
.skeleton_config(ctx
, roles
=roles
, ips
=ips
, cluster
=cluster_name
)
480 for remote
, roles_to_journals
in remote_to_roles_to_journals
.iteritems():
481 for role
, journal
in roles_to_journals
.iteritems():
482 name
= teuthology
.ceph_role(role
)
485 conf
[name
]['osd journal'] = journal
486 for section
, keys
in config
['conf'].iteritems():
487 for key
, value
in keys
.iteritems():
488 log
.info("[%s] %s = %s" % (section
, key
, value
))
489 if section
not in conf
:
491 conf
[section
][key
] = value
493 if config
.get('tmpfs_journal'):
494 conf
['journal dio'] = False
496 if not hasattr(ctx
, 'ceph'):
498 ctx
.ceph
[cluster_name
] = argparse
.Namespace()
499 ctx
.ceph
[cluster_name
].conf
= conf
501 default_keyring
= '/etc/ceph/{cluster}.keyring'.format(cluster
=cluster_name
)
502 keyring_path
= config
.get('keyring_path', default_keyring
)
504 coverage_dir
= '{tdir}/archive/coverage'.format(tdir
=testdir
)
506 firstmon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
508 log
.info('Setting up %s...' % firstmon
)
509 ctx
.cluster
.only(firstmon
).run(
520 ctx
.cluster
.only(firstmon
).run(
532 ctx
.cluster
.only(firstmon
).run(
540 (mon0_remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
541 monmap_path
= '{tdir}/{cluster}.monmap'.format(tdir
=testdir
,
542 cluster
=cluster_name
)
543 fsid
= teuthology
.create_simple_monmap(
549 if not 'global' in conf
:
551 conf
['global']['fsid'] = fsid
553 default_conf_path
= '/etc/ceph/{cluster}.conf'.format(cluster
=cluster_name
)
554 conf_path
= config
.get('conf_path', default_conf_path
)
555 log
.info('Writing %s for FSID %s...' % (conf_path
, fsid
))
556 write_conf(ctx
, conf_path
, cluster_name
)
558 log
.info('Creating admin key on %s...' % firstmon
)
559 ctx
.cluster
.only(firstmon
).run(
567 '--name=client.admin',
569 '--cap', 'mon', 'allow *',
570 '--cap', 'osd', 'allow *',
571 '--cap', 'mds', 'allow *',
572 '--cap', 'mgr', 'allow *',
577 log
.info('Copying monmap to all nodes...')
578 keyring
= teuthology
.get_file(
582 monmap
= teuthology
.get_file(
587 for rem
in ctx
.cluster
.remotes
.iterkeys():
588 # copy mon key and initial monmap
589 log
.info('Sending monmap to node {remote}'.format(remote
=rem
))
590 teuthology
.sudo_write_file(
596 teuthology
.write_file(
602 log
.info('Setting up mon nodes...')
603 mons
= ctx
.cluster
.only(teuthology
.is_type('mon', cluster_name
))
605 if not config
.get('skip_mgr_daemons', False):
606 log
.info('Setting up mgr nodes...')
607 mgrs
= ctx
.cluster
.only(teuthology
.is_type('mgr', cluster_name
))
608 for remote
, roles_for_host
in mgrs
.remotes
.iteritems():
609 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'mgr',
611 _
, _
, id_
= teuthology
.split_role(role
)
612 mgr_dir
= '/var/lib/ceph/mgr/{cluster}-{id}'.format(
613 cluster
=cluster_name
,
630 '--name=mgr.{id}'.format(id=id_
),
631 mgr_dir
+ '/keyring',
635 log
.info('Setting up mds nodes...')
636 mdss
= ctx
.cluster
.only(teuthology
.is_type('mds', cluster_name
))
637 for remote
, roles_for_host
in mdss
.remotes
.iteritems():
638 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'mds',
640 _
, _
, id_
= teuthology
.split_role(role
)
641 mds_dir
= '/var/lib/ceph/mds/{cluster}-{id}'.format(
642 cluster
=cluster_name
,
659 '--name=mds.{id}'.format(id=id_
),
660 mds_dir
+ '/keyring',
664 cclient
.create_keyring(ctx
, cluster_name
)
665 log
.info('Running mkfs on osd nodes...')
667 if not hasattr(ctx
, 'disk_config'):
668 ctx
.disk_config
= argparse
.Namespace()
669 if not hasattr(ctx
.disk_config
, 'remote_to_roles_to_dev'):
670 ctx
.disk_config
.remote_to_roles_to_dev
= {}
671 if not hasattr(ctx
.disk_config
, 'remote_to_roles_to_journals'):
672 ctx
.disk_config
.remote_to_roles_to_journals
= {}
673 if not hasattr(ctx
.disk_config
, 'remote_to_roles_to_dev_mount_options'):
674 ctx
.disk_config
.remote_to_roles_to_dev_mount_options
= {}
675 if not hasattr(ctx
.disk_config
, 'remote_to_roles_to_dev_fstype'):
676 ctx
.disk_config
.remote_to_roles_to_dev_fstype
= {}
678 teuthology
.deep_merge(ctx
.disk_config
.remote_to_roles_to_dev
, remote_to_roles_to_devs
)
679 teuthology
.deep_merge(ctx
.disk_config
.remote_to_roles_to_journals
, remote_to_roles_to_journals
)
681 log
.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r
=str(ctx
.disk_config
.remote_to_roles_to_dev
)))
682 for remote
, roles_for_host
in osds
.remotes
.iteritems():
683 roles_to_devs
= remote_to_roles_to_devs
[remote
]
684 roles_to_journals
= remote_to_roles_to_journals
[remote
]
686 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
):
687 _
, _
, id_
= teuthology
.split_role(role
)
688 mnt_point
= '/var/lib/ceph/osd/{cluster}-{id}'.format(cluster
=cluster_name
, id=id_
)
696 log
.info(str(roles_to_devs
))
697 log
.info(str(roles_to_journals
))
699 if roles_to_devs
.get(role
):
700 dev
= roles_to_devs
[role
]
701 fs
= config
.get('fs')
703 mkfs_options
= config
.get('mkfs_options')
704 mount_options
= config
.get('mount_options')
706 # package = 'btrfs-tools'
707 if mount_options
is None:
708 mount_options
= ['noatime', 'user_subvol_rm_allowed']
709 if mkfs_options
is None:
710 mkfs_options
= ['-m', 'single',
714 # package = 'xfsprogs'
715 if mount_options
is None:
716 mount_options
= ['noatime']
717 if mkfs_options
is None:
718 mkfs_options
= ['-f', '-i', 'size=2048']
719 if fs
== 'ext4' or fs
== 'ext3':
720 if mount_options
is None:
721 mount_options
= ['noatime', 'user_xattr']
723 if mount_options
is None:
725 if mkfs_options
is None:
727 mkfs
= ['mkfs.%s' % fs
] + mkfs_options
728 log
.info('%s on %s on %s' % (mkfs
, dev
, remote
))
729 if package
is not None:
733 'apt-get', 'install', '-y', package
739 remote
.run(args
=['yes', run
.Raw('|')] + ['sudo'] + mkfs
+ [dev
])
740 except run
.CommandFailedError
:
741 # Newer btfs-tools doesn't prompt for overwrite, use -f
742 if '-f' not in mount_options
:
743 mkfs_options
.append('-f')
744 mkfs
= ['mkfs.%s' % fs
] + mkfs_options
745 log
.info('%s on %s on %s' % (mkfs
, dev
, remote
))
746 remote
.run(args
=['yes', run
.Raw('|')] + ['sudo'] + mkfs
+ [dev
])
748 log
.info('mount %s on %s -o %s' % (dev
, remote
,
749 ','.join(mount_options
)))
755 '-o', ','.join(mount_options
),
762 'sudo', '/sbin/restorecon', mnt_point
,
766 if not remote
in ctx
.disk_config
.remote_to_roles_to_dev_mount_options
:
767 ctx
.disk_config
.remote_to_roles_to_dev_mount_options
[remote
] = {}
768 ctx
.disk_config
.remote_to_roles_to_dev_mount_options
[remote
][role
] = mount_options
769 if not remote
in ctx
.disk_config
.remote_to_roles_to_dev_fstype
:
770 ctx
.disk_config
.remote_to_roles_to_dev_fstype
[remote
] = {}
771 ctx
.disk_config
.remote_to_roles_to_dev_fstype
[remote
][role
] = fs
772 devs_to_clean
[remote
].append(mnt_point
)
774 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
):
775 _
, _
, id_
= teuthology
.split_role(role
)
789 '--monmap', monmap_path
,
793 log
.info('Reading keys from all nodes...')
796 for remote
, roles_for_host
in ctx
.cluster
.remotes
.iteritems():
797 for type_
in ['mgr', 'mds', 'osd']:
798 if type_
== 'mgr' and config
.get('skip_mgr_daemons', False):
800 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, type_
, cluster_name
):
801 _
, _
, id_
= teuthology
.split_role(role
)
802 data
= teuthology
.get_file(
804 path
='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
807 cluster
=cluster_name
,
811 keys
.append((type_
, id_
, data
))
813 for remote
, roles_for_host
in ctx
.cluster
.remotes
.iteritems():
814 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'client', cluster_name
):
815 _
, _
, id_
= teuthology
.split_role(role
)
816 data
= teuthology
.get_file(
818 path
='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_
, cluster
=cluster_name
)
820 keys
.append(('client', id_
, data
))
823 log
.info('Adding keys to all mons...')
834 teuthology
.feed_many_stdins_and_close(keys_fp
, writes
)
836 for type_
, id_
, data
in keys
:
846 '--name={type}.{id}'.format(
850 ] + list(generate_caps(type_
)),
855 log
.info('Running mkfs on mon nodes...')
856 for remote
, roles_for_host
in mons
.remotes
.iteritems():
857 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'mon', cluster_name
):
858 _
, _
, id_
= teuthology
.split_role(role
)
864 '/var/lib/ceph/mon/{cluster}-{id}'.format(id=id_
, cluster
=cluster_name
),
874 '--cluster', cluster_name
,
877 '--monmap', monmap_path
,
878 '--keyring', keyring_path
,
896 # we need to know this below
897 ctx
.summary
['success'] = False
900 (mon0_remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
902 log
.info('Checking cluster log for badness...')
904 def first_in_ceph_log(pattern
, excludes
):
906 Find the first occurrence of the pattern specified in the Ceph log,
907 Returns None if none found.
909 :param pattern: Pattern scanned for.
910 :param excludes: Patterns to ignore.
911 :return: First line of text (or None if not found)
916 '/var/log/ceph/{cluster}.log'.format(cluster
=cluster_name
),
918 for exclude
in excludes
:
919 args
.extend([run
.Raw('|'), 'egrep', '-v', exclude
])
921 run
.Raw('|'), 'head', '-n', '1',
927 stdout
= r
.stdout
.getvalue()
932 if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
933 config
['log_whitelist']) is not None:
934 log
.warning('Found errors (ERR|WRN|SEC) in cluster log')
935 ctx
.summary
['success'] = False
936 # use the most severe problem as the failure reason
937 if 'failure_reason' not in ctx
.summary
:
938 for pattern
in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
939 match
= first_in_ceph_log(pattern
, config
['log_whitelist'])
940 if match
is not None:
941 ctx
.summary
['failure_reason'] = \
942 '"{match}" in cluster log'.format(
943 match
=match
.rstrip('\n'),
947 for remote
, dirs
in devs_to_clean
.iteritems():
949 log
.info('Unmounting %s on %s' % (dir_
, remote
))
961 except Exception as e
:
964 run
.Raw('PATH=/usr/sbin:$PATH'),
971 if config
.get('tmpfs_journal'):
972 log
.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
973 for remote
, roles_for_host
in osds
.remotes
.iteritems():
975 args
=['sudo', 'umount', '-f', '/mnt'],
979 if ctx
.archive
is not None and \
980 not (ctx
.config
.get('archive-on-error') and ctx
.summary
['success']):
982 # archive mon data, too
983 log
.info('Archiving mon data...')
984 path
= os
.path
.join(ctx
.archive
, 'data')
988 if e
.errno
== errno
.EEXIST
:
992 for remote
, roles
in mons
.remotes
.iteritems():
994 is_mon
= teuthology
.is_type('mon', cluster_name
)
996 _
, _
, id_
= teuthology
.split_role(role
)
997 mon_dir
= '/var/lib/ceph/mon/' + \
998 '{0}-{1}'.format(cluster_name
, id_
)
999 teuthology
.pull_directory_tarball(
1002 path
+ '/' + role
+ '.tgz')
1004 log
.info('Cleaning ceph cluster...')
1016 run
.Raw('{tdir}/../*.pid'.format(tdir
=testdir
)),
1023 def osd_scrub_pgs(ctx
, config
):
1025 Scrub pgs when we exit.
1027 First make sure all pgs are active and clean.
1028 Next scrub all osds.
1029 Then periodically check until all pgs have scrub time stamps that
1030 indicate the last scrub completed. Time out if no progress is made
1031 here after two minutes.
1035 cluster_name
= config
['cluster']
1036 manager
= ctx
.managers
[cluster_name
]
1038 for _
in range(0, retries
):
1039 stats
= manager
.get_pg_stats()
1040 bad
= [stat
['pgid'] for stat
in stats
if 'active+clean' not in stat
['state']]
1045 "Waiting for all PGs to be active and clean, waiting on %s" % bad
)
1048 raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
1049 check_time_now
= time
.localtime()
1051 all_roles
= teuthology
.all_roles(ctx
.cluster
)
1052 for role
in teuthology
.cluster_roles_of_type(all_roles
, 'osd', cluster_name
):
1053 log
.info("Scrubbing {osd}".format(osd
=role
))
1054 _
, _
, id_
= teuthology
.split_role(role
)
1055 # allow this to fail; in certain cases the OSD might not be up
1056 # at this point. we will catch all pgs below.
1058 manager
.raw_cluster_cmd('osd', 'deep-scrub', id_
)
1059 except run
.CommandFailedError
:
1065 stats
= manager
.get_pg_stats()
1066 timez
= [(stat
['pgid'],stat
['last_scrub_stamp']) for stat
in stats
]
1069 for (pgid
, tmval
) in timez
:
1070 pgtm
= time
.strptime(tmval
[0:tmval
.find('.')], '%Y-%m-%d %H:%M:%S')
1071 if pgtm
> check_time_now
:
1074 log
.info('pgid %s last_scrub_stamp %s %s <= %s', pgid
, tmval
, pgtm
, check_time_now
)
1076 if thiscnt
> prev_good
:
1081 if gap_cnt
% 6 == 0:
1082 for (pgid
, tmval
) in timez
:
1083 # re-request scrub every so often in case the earlier
1084 # request was missed. do not do it everytime because
1085 # the scrub may be in progress or not reported yet and
1086 # we will starve progress.
1087 manager
.raw_cluster_cmd('pg', 'deep-scrub', pgid
)
1088 if gap_cnt
> retries
:
1089 raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
1091 log
.info('Still waiting for all pgs to be scrubbed.')
1095 @contextlib.contextmanager
1096 def run_daemon(ctx
, config
, type_
):
1098 Run daemons for a role type. Handle the startup and termination of a daemon.
1099 On startup -- set coverages, cpu_profile, valgrind values for all remotes,
1100 and a max_mds value for one mds.
1101 On cleanup -- Stop all existing daemons of this type.
1104 :param config: Configuration
1105 :param type_: Role type
1107 cluster_name
= config
['cluster']
1108 log
.info('Starting %s daemons in cluster %s...', type_
, cluster_name
)
1109 testdir
= teuthology
.get_testdir(ctx
)
1110 daemons
= ctx
.cluster
.only(teuthology
.is_type(type_
, cluster_name
))
1112 # check whether any daemons if this type are configured
1115 coverage_dir
= '{tdir}/archive/coverage'.format(tdir
=testdir
)
1117 daemon_signal
= 'kill'
1118 if config
.get('coverage') or config
.get('valgrind') is not None:
1119 daemon_signal
= 'term'
1121 # create osds in order. (this only matters for pre-luminous, which might
1122 # be hammer, which doesn't take an id_ argument to legacy 'osd create').
1124 for remote
, roles_for_host
in daemons
.remotes
.iteritems():
1125 is_type_
= teuthology
.is_type(type_
, cluster_name
)
1126 for role
in roles_for_host
:
1127 if not is_type_(role
):
1129 _
, _
, id_
= teuthology
.split_role(role
)
1133 datadir
='/var/lib/ceph/osd/{cluster}-{id}'.format(
1134 cluster
=cluster_name
, id=id_
)
1135 osd_uuid
= teuthology
.get_file(
1137 path
=datadir
+ '/fsid',
1140 osd_uuids
[id_
] = osd_uuid
1141 for osd_id
in range(len(osd_uuids
)):
1143 osd_uuid
= osd_uuids
.get(id_
)
1147 'sudo', 'ceph', '--cluster', cluster_name
,
1148 'osd', 'new', osd_uuid
, id_
,
1152 # fallback to pre-luminous (hammer or jewel)
1155 'sudo', 'ceph', '--cluster', cluster_name
,
1156 'osd', 'create', osd_uuid
,
1159 if config
.get('add_osds_to_crush'):
1162 'sudo', 'ceph', '--cluster', cluster_name
,
1163 'osd', 'crush', 'create-or-move', 'osd.' + id_
,
1164 '1.0', 'host=localhost', 'root=default',
1168 for remote
, roles_for_host
in daemons
.remotes
.iteritems():
1169 is_type_
= teuthology
.is_type(type_
, cluster_name
)
1170 for role
in roles_for_host
:
1171 if not is_type_(role
):
1173 _
, _
, id_
= teuthology
.split_role(role
)
1184 'ceph-%s' % (type_
),
1186 '--cluster', cluster_name
,
1189 if type_
in config
.get('cpu_profile', []):
1190 profile_path
= '/var/log/ceph/profiling-logger/%s.prof' % (role
)
1191 run_cmd
.extend(['env', 'CPUPROFILE=%s' % profile_path
])
1193 if config
.get('valgrind') is not None:
1194 valgrind_args
= None
1195 if type_
in config
['valgrind']:
1196 valgrind_args
= config
['valgrind'][type_
]
1197 if role
in config
['valgrind']:
1198 valgrind_args
= config
['valgrind'][role
]
1199 run_cmd
= teuthology
.get_valgrind_args(testdir
, role
,
1203 run_cmd
.extend(run_cmd_tail
)
1205 # always register mgr; don't necessarily start
1206 ctx
.daemons
.register_daemon(
1208 cluster
=cluster_name
,
1210 logger
=log
.getChild(role
),
1214 if type_
!= 'mgr' or not config
.get('skip_mgr_daemons', False):
1215 role
= cluster_name
+ '.' + type_
1216 ctx
.daemons
.get_daemon(type_
, id_
, cluster_name
).restart()
1221 teuthology
.stop_daemons_of_type(ctx
, type_
, cluster_name
)
def healthy(ctx, config):
    """
    Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK.

    :param ctx: Context
    :param config: Configuration
    """
    config = config if isinstance(config, dict) else dict()
    cluster_name = config.get('cluster', 'ceph')
    log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
    manager = ctx.managers[cluster_name]
    try:
        manager.wait_for_mgr_available(timeout=30)
    except (run.CommandFailedError, AssertionError) as e:
        # a mixed-version cluster may predate ceph-mgr; best-effort only
        log.info('ignoring mgr wait error, probably testing upgrade: %s', e)

    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )

    try:
        manager.flush_all_pg_stats()
    except (run.CommandFailedError, Exception) as e:
        # older daemons may not support this command; best-effort only
        log.info('ignoring flush pg stats error, probably testing upgrade: %s', e)
    manager.wait_for_clean()

    log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
    teuthology.wait_until_healthy(
        ctx,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )

    if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
        # Some MDSs exist, wait for them to be healthy
        ceph_fs = Filesystem(ctx)  # TODO: make Filesystem cluster-aware
        ceph_fs.wait_for_daemons(timeout=300)
def wait_for_osds_up(ctx, config):
    """
    Wait for all osd's to come up.

    :param ctx: Context
    :param config: Configuration; reads the optional 'cluster' name
                   (defaults to 'ceph').
    """
    log.info('Waiting until ceph osds are all up...')
    cluster_name = config.get('cluster', 'ceph')
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )
1286 def wait_for_mon_quorum(ctx
, config
):
1288 Check remote ceph status until all monitors are up.
1291 :param config: Configuration
1293 if isinstance(config
, dict):
1294 mons
= config
['daemons']
1295 cluster_name
= config
.get('cluster', 'ceph')
1297 assert isinstance(config
, list)
1299 cluster_name
= 'ceph'
1300 firstmon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
1301 (remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
1302 with contextutil
.safe_while(sleep
=10, tries
=60,
1303 action
='wait for monitor quorum') as proceed
:
1312 logger
=log
.getChild('quorum_status'),
1314 j
= json
.loads(r
.stdout
.getvalue())
1315 q
= j
.get('quorum_names', [])
1316 log
.debug('Quorum: %s', q
)
1317 if sorted(q
) == sorted(mons
):
def created_pool(ctx, config):
    """
    Add new pools to the dictionary of pools that the ceph-manager
    knows about.

    :param ctx: Context
    :param config: Iterable of pool names to register.
    """
    for new_pool in config:
        if new_pool not in ctx.managers['ceph'].pools:
            # record the pg count so ceph_manager can reason about this pool
            ctx.managers['ceph'].pools[new_pool] = ctx.managers['ceph'].get_pool_property(
                new_pool, 'pg_num')
1332 @contextlib.contextmanager
1333 def restart(ctx
, config
):
1335 restart ceph daemons
1339 - ceph.restart: [all]
1343 - ceph.restart: [osd.0, mon.1, mds.*]
1349 daemons: [osd.0, mon.1]
1350 wait-for-healthy: false
1351 wait-for-osds-up: true
1354 :param config: Configuration
1358 elif isinstance(config
, list):
1359 config
= {'daemons': config
}
1361 daemons
= ctx
.daemons
.resolve_role_list(config
.get('daemons', None), CEPH_ROLE_TYPES
, True)
1363 for role
in daemons
:
1364 cluster
, type_
, id_
= teuthology
.split_role(role
)
1365 ctx
.daemons
.get_daemon(type_
, id_
, cluster
).restart()
1366 clusters
.add(cluster
)
1368 manager
= ctx
.managers
['ceph']
1369 for dmon
in daemons
:
1371 dm_parts
= dmon
.split('.')
1372 if dm_parts
[1].isdigit():
1373 if dm_parts
[0] == 'osd':
1374 manager
.mark_down_osd(int(dm_parts
[1]))
1376 if config
.get('wait-for-healthy', True):
1377 for cluster
in clusters
:
1378 healthy(ctx
=ctx
, config
=dict(cluster
=cluster
))
1379 if config
.get('wait-for-osds-up', False):
1380 for cluster
in clusters
:
1381 wait_for_osds_up(ctx
=ctx
, config
=dict(cluster
=cluster
))
1385 @contextlib.contextmanager
1386 def stop(ctx
, config
):
1392 - ceph.stop: [mds.*]
1395 - ceph.stop: [osd.0, osd.2]
1399 daemons: [osd.0, osd.2]
1404 elif isinstance(config
, list):
1405 config
= {'daemons': config
}
1407 daemons
= ctx
.daemons
.resolve_role_list(config
.get('daemons', None), CEPH_ROLE_TYPES
, True)
1408 for role
in daemons
:
1409 cluster
, type_
, id_
= teuthology
.split_role(role
)
1410 ctx
.daemons
.get_daemon(type_
, id_
, cluster
).stop()
@contextlib.contextmanager
def wait_for_failure(ctx, config):
    """
    Wait for a failure of a ceph daemon

    For example::
      tasks:
      - ceph.wait_for_failure: [mds.*]

      tasks:
      - ceph.wait_for_failure: [osd.0, osd.2]

      tasks:
      - ceph.wait_for_failure:
          daemons: [osd.0, osd.2]

    :param ctx: Context
    :param config: Configuration
    :raises RuntimeError: if a listed daemon exits cleanly instead of failing
    """
    # Normalize the config: a bare list is shorthand for {'daemons': [...]}.
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(
        config.get('daemons', None), CEPH_ROLE_TYPES, True)
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        try:
            # wait() raises when the daemon exits with a failure, which is
            # exactly what this task expects to observe.
            ctx.daemons.get_daemon(type_, id_, cluster).wait()
        except Exception:
            # Narrowed from a bare 'except:' so that KeyboardInterrupt /
            # SystemExit are not mistaken for an expected daemon failure.
            log.info('Saw expected daemon failure. Continuing.')
        else:
            # wait() returned normally: the daemon exited cleanly, so the
            # expected failure never happened.
            raise RuntimeError('daemon %s did not fail' % role)

    yield
def validate_config(ctx, config):
    """
    Perform some simple validation on task configuration.
    Raises exceptions.ConfigError if an error is found.
    """
    # check for osds from multiple clusters on the same host
    for remote, roles_for_host in ctx.cluster.remotes.items():
        prev_cluster = None
        prev_role = None
        for role in roles_for_host:
            role_cluster, role_type, _ = teuthology.split_role(role)
            if role_type != 'osd':
                # Only osds are constrained to a single cluster per host.
                continue
            if prev_cluster and prev_cluster != role_cluster:
                msg = "Host should not have osds (%s and %s) from multiple clusters" % (
                    prev_role, role)
                raise exceptions.ConfigError(msg)
            prev_cluster = role_cluster
            prev_role = role
1472 @contextlib.contextmanager
1473 def task(ctx
, config
):
1475 Set up and tear down a Ceph cluster.
1483 You can also specify what branch to run::
1499 sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed
1501 Or a local source dir::
1505 path: /home/sage/ceph
1507 To capture code coverage data, use::
1513 To use btrfs, ext4, or xfs on the target's scratch disks, use::
1518 mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
1519 mount_options: [nobarrier, inode64]
1521 Note, this will cause the task to check the /scratch_devs file on each node
1522 for available devices. If no such file is found, /dev/sdb will be used.
1524 To run some daemons under valgrind, include their names
1525 and the tool/args to use in a valgrind section::
1530 mds.1: --tool=memcheck
1531 osd.1: [--tool=memcheck, --leak-check=no]
1533 Those nodes which are using memcheck or valgrind will get
1534 checked for bad results.
1536 To adjust or modify config options, use::
1551 other key: other value
1556 By default, the cluster log is checked for errors and warnings,
1557 and the run marked failed if any appear. You can ignore log
1558 entries by giving a list of egrep compatible regexes, i.e.:
1562 log-whitelist: ['foo.*bar', 'bad message']
1564 To run multiple ceph clusters, use multiple ceph tasks, and roles
1565 with a cluster name prefix, e.g. cluster1.client.0. Roles with no
1566 cluster use the default cluster name, 'ceph'. OSDs from separate
1567 clusters must be on separate hosts. Clients and non-osd daemons
1568 from multiple clusters may be colocated. For each cluster, add an
1569 instance of the ceph task with the cluster name specified, e.g.::
1572 - [mon.a, osd.0, osd.1]
1573 - [backup.mon.a, backup.osd.0, backup.osd.1]
1574 - [client.0, backup.client.0]
1582 :param config: Configuration
1587 assert isinstance(config
, dict), \
1588 "task ceph only supports a dictionary for configuration"
1590 overrides
= ctx
.config
.get('overrides', {})
1591 teuthology
.deep_merge(config
, overrides
.get('ceph', {}))
1593 first_ceph_cluster
= False
1594 if not hasattr(ctx
, 'daemons'):
1595 first_ceph_cluster
= True
1596 ctx
.daemons
= DaemonGroup()
1598 testdir
= teuthology
.get_testdir(ctx
)
1599 if config
.get('coverage'):
1600 coverage_dir
= '{tdir}/archive/coverage'.format(tdir
=testdir
)
1601 log
.info('Creating coverage directory...')
1605 'install', '-d', '-m0755', '--',
1612 if 'cluster' not in config
:
1613 config
['cluster'] = 'ceph'
1615 validate_config(ctx
, config
)
1618 if first_ceph_cluster
:
1619 # these tasks handle general log setup and parsing on all hosts,
1620 # so they should only be run once
1622 lambda: ceph_log(ctx
=ctx
, config
=None),
1623 lambda: valgrind_post(ctx
=ctx
, config
=config
),
1627 lambda: cluster(ctx
=ctx
, config
=dict(
1628 conf
=config
.get('conf', {}),
1629 fs
=config
.get('fs', 'xfs'),
1630 mkfs_options
=config
.get('mkfs_options', None),
1631 mount_options
=config
.get('mount_options', None),
1632 block_journal
=config
.get('block_journal', None),
1633 tmpfs_journal
=config
.get('tmpfs_journal', None),
1634 skip_mgr_daemons
=config
.get('skip_mgr_daemons', False),
1635 log_whitelist
=config
.get('log-whitelist', []),
1636 cpu_profile
=set(config
.get('cpu_profile', []),),
1637 cluster
=config
['cluster'],
1639 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='mon'),
1640 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='mgr'),
1641 lambda: crush_setup(ctx
=ctx
, config
=config
),
1642 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='osd'),
1643 lambda: create_rbd_pool(ctx
=ctx
, config
=config
),
1644 lambda: cephfs_setup(ctx
=ctx
, config
=config
),
1645 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='mds'),
1648 with contextutil
.nested(*subtasks
):
1649 first_mon
= teuthology
.get_first_mon(ctx
, config
, config
['cluster'])
1650 (mon
,) = ctx
.cluster
.only(first_mon
).remotes
.iterkeys()
1651 if not hasattr(ctx
, 'managers'):
1653 ctx
.managers
[config
['cluster']] = CephManager(
1656 logger
=log
.getChild('ceph_manager.' + config
['cluster']),
1657 cluster
=config
['cluster'],
1661 if config
.get('wait-for-healthy', True):
1662 healthy(ctx
=ctx
, config
=dict(cluster
=config
['cluster']))
1666 if config
.get('wait-for-scrub', True):
1667 osd_scrub_pgs(ctx
, config
)
1669 # stop logging health to clog during shutdown, or else we generate
1670 # a bunch of scary messages unrelated to our actual run.
1671 firstmon
= teuthology
.get_first_mon(ctx
, config
, config
['cluster'])
1672 (mon0_remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
1677 '--cluster', config
['cluster'],
1682 '--no-mon-health-to-clog',