ceph/qa/tasks/cephadm.py
1 """
2 Ceph cluster task, deployed via cephadm orchestrator
3 """
4 from io import BytesIO
5
6 import argparse
7 import configobj
8 import contextlib
9 import logging
10 import os
11 import json
12 import re
13 import uuid
14
15 from ceph_manager import CephManager
16 from tarfile import ReadError
17 from teuthology import misc as teuthology
18 from teuthology import contextutil
19 from teuthology.orchestra import run
20 from teuthology.orchestra.daemon import DaemonGroup
21 from teuthology.config import config as teuth_config
22
23 # these items we use from ceph.py should probably eventually move elsewhere
24 from tasks.ceph import get_mons, healthy
25
26 CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw', 'prometheus']
27
28 log = logging.getLogger(__name__)
29
30
31 def _shell(ctx, cluster_name, remote, args, extra_cephadm_args=[], **kwargs):
32 testdir = teuthology.get_testdir(ctx)
33 return remote.run(
34 args=[
35 'sudo',
36 ctx.cephadm,
37 '--image', ctx.ceph[cluster_name].image,
38 'shell',
39 '-c', '/etc/ceph/{}.conf'.format(cluster_name),
40 '-k', '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
41 '--fsid', ctx.ceph[cluster_name].fsid,
42 ] + extra_cephadm_args + [
43 '--',
44 ] + args,
45 **kwargs
46 )
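# For reference, a call such as
#   _shell(ctx, 'ceph', remote, ['ceph', 'orch', 'host', 'ls'])
# runs, roughly, the following on the remote (image, fsid and testdir shown
# here are illustrative):
#   sudo {testdir}/cephadm --image <image> shell \
#       -c /etc/ceph/ceph.conf -k /etc/ceph/ceph.client.admin.keyring \
#       --fsid <fsid> -- ceph orch host ls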
47
48 def build_initial_config(ctx, config):
49 cluster_name = config['cluster']
50
51 path = os.path.join(os.path.dirname(__file__), 'cephadm.conf')
52 conf = configobj.ConfigObj(path, file_error=True)
53
54 conf.setdefault('global', {})
55 conf['global']['fsid'] = ctx.ceph[cluster_name].fsid
56
57 # overrides
58 for section, keys in config.get('conf',{}).items():
59 for key, value in keys.items():
60 log.info(" override: [%s] %s = %s" % (section, key, value))
61 if section not in conf:
62 conf[section] = {}
63 conf[section][key] = value
64
65 return conf
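# The 'conf' overrides consumed above come straight from the task config; a
# minimal sketch of what that looks like in the job yaml (section and option
# names are just examples):
#
#   conf:
#     global:
#       osd pool default size: 2
#     osd:
#       debug osd: 20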
66
67 @contextlib.contextmanager
68 def normalize_hostnames(ctx):
69 """
70 Ensure we have short hostnames throughout, for consistency between
71 remote.shortname and socket.gethostname() in cephadm.
72 """
73 log.info('Normalizing hostnames...')
74 ctx.cluster.run(args=[
75 'sudo',
76 'hostname',
77 run.Raw('$(hostname -s)'),
78 ])
79
80 try:
81 yield
82 finally:
83 pass
84
85 @contextlib.contextmanager
86 def download_cephadm(ctx, config, ref):
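# Fetch the standalone cephadm script onto every node. A sketch of the config
# knobs handled here: 'cephadm_mode' ('root' downloads the script from the
# ceph git repo, optionally at 'cephadm_branch'; 'cephadm-package' assumes the
# distro package already provides cephadm in PATH, so nothing is downloaded).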
87 cluster_name = config['cluster']
88
89 if config.get('cephadm_mode') != 'cephadm-package':
90 ref = config.get('cephadm_branch', ref)
91 git_url = teuth_config.get_ceph_git_url()
92 log.info('Downloading cephadm (repo %s ref %s)...' % (git_url, ref))
93 if git_url.startswith('https://github.com/'):
94 # git archive doesn't like https:// URLs, which we use with github.
95 rest = git_url.split('https://github.com/', 1)[1]
96 rest = re.sub(r'\.git/?$', '', rest).strip() # no .git suffix
97 ctx.cluster.run(
98 args=[
99 'curl', '--silent',
100 'https://raw.githubusercontent.com/' + rest + '/' + ref + '/src/cephadm/cephadm',
101 run.Raw('>'),
102 ctx.cephadm,
103 run.Raw('&&'),
104 'ls', '-l',
105 ctx.cephadm,
106 ],
107 )
108 else:
109 ctx.cluster.run(
110 args=[
111 'git', 'archive',
112 '--remote=' + git_url,
113 ref,
114 'src/cephadm/cephadm',
115 run.Raw('|'),
116 'tar', '-xO', 'src/cephadm/cephadm',
117 run.Raw('>'),
118 ctx.cephadm,
119 ],
120 )
121 # sanity-check the resulting file and set executable bit
122 cephadm_file_size = '$(stat -c%s {})'.format(ctx.cephadm)
123 ctx.cluster.run(
124 args=[
125 'test', '-s', ctx.cephadm,
126 run.Raw('&&'),
127 'test', run.Raw(cephadm_file_size), "-gt", run.Raw('1000'),
128 run.Raw('&&'),
129 'chmod', '+x', ctx.cephadm,
130 ],
131 )
132
133 try:
134 yield
135 finally:
136 log.info('Removing cluster...')
137 ctx.cluster.run(args=[
138 'sudo',
139 ctx.cephadm,
140 'rm-cluster',
141 '--fsid', ctx.ceph[cluster_name].fsid,
142 '--force',
143 ])
144
145 if config.get('cephadm_mode') == 'root':
146 log.info('Removing cephadm ...')
147 ctx.cluster.run(
148 args=[
149 'rm',
150 '-rf',
151 ctx.cephadm,
152 ],
153 )
154
155 @contextlib.contextmanager
156 def ceph_log(ctx, config):
157 cluster_name = config['cluster']
158 fsid = ctx.ceph[cluster_name].fsid
159
160 try:
161 yield
162
163 except Exception:
164 # we need to know this below
165 ctx.summary['success'] = False
166 raise
167
168 finally:
169 log.info('Checking cluster log for badness...')
170 def first_in_ceph_log(pattern, excludes):
171 """
172 Find the first occurrence of the pattern specified in the Ceph log.
173 Returns None if none is found.
174
175 :param pattern: Pattern scanned for.
176 :param excludes: Patterns to ignore.
177 :return: First line of text (or None if not found)
178 """
179 args = [
180 'sudo',
181 'egrep', pattern,
182 '/var/log/ceph/{fsid}/ceph.log'.format(
183 fsid=fsid),
184 ]
185 if excludes:
186 for exclude in excludes:
187 args.extend([run.Raw('|'), 'egrep', '-v', exclude])
188 args.extend([
189 run.Raw('|'), 'head', '-n', '1',
190 ])
191 r = ctx.ceph[cluster_name].bootstrap_remote.run(
192 stdout=BytesIO(),
193 args=args,
194 )
195 stdout = r.stdout.getvalue()
196 if stdout != '':
197 return stdout
198 return None
199
200 if first_in_ceph_log(r'\[ERR\]|\[WRN\]|\[SEC\]',
201 config.get('log-whitelist')) is not None:
202 log.warning('Found errors (ERR|WRN|SEC) in cluster log')
203 ctx.summary['success'] = False
204 # use the most severe problem as the failure reason
205 if 'failure_reason' not in ctx.summary:
206 for pattern in [r'\[SEC\]', r'\[ERR\]', r'\[WRN\]']:
207 match = first_in_ceph_log(pattern, config.get('log-whitelist'))
208 if match is not None:
209 ctx.summary['failure_reason'] = \
210 '"{match}" in cluster log'.format(
211 match=match.rstrip('\n'),
212 )
213 break
214
215 if ctx.archive is not None and \
216 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
217 # compress and archive the logs
218 log.info('Compressing logs...')
219 run.wait(
220 ctx.cluster.run(
221 args=[
222 'sudo',
223 'find',
224 '/var/log/ceph', # all logs, not just for the cluster
225 '-name',
226 '*.log',
227 '-print0',
228 run.Raw('|'),
229 'sudo',
230 'xargs',
231 '-0',
232 '--no-run-if-empty',
233 '--',
234 'gzip',
235 '--',
236 ],
237 wait=False,
238 ),
239 )
240
241 log.info('Archiving logs...')
242 path = os.path.join(ctx.archive, 'remote')
243 try:
244 os.makedirs(path)
245 except OSError:
246 pass
247 for remote in ctx.cluster.remotes.keys():
248 sub = os.path.join(path, remote.name)
249 try:
250 os.makedirs(sub)
251 except OSError:
252 pass
253 teuthology.pull_directory(remote, '/var/log/ceph', # everything
254 os.path.join(sub, 'log'))
255
256 @contextlib.contextmanager
257 def ceph_crash(ctx, config):
258 """
259 Gather crash dumps from /var/lib/ceph/$fsid/crash
260 """
261 cluster_name = config['cluster']
262 fsid = ctx.ceph[cluster_name].fsid
263
264 try:
265 yield
266
267 finally:
268 if ctx.archive is not None:
269 log.info('Archiving crash dumps...')
270 path = os.path.join(ctx.archive, 'remote')
271 try:
272 os.makedirs(path)
273 except OSError:
274 pass
275 for remote in ctx.cluster.remotes.keys():
276 sub = os.path.join(path, remote.name)
277 try:
278 os.makedirs(sub)
279 except OSError:
280 pass
281 try:
282 teuthology.pull_directory(remote,
283 '/var/lib/ceph/%s/crash' % fsid,
284 os.path.join(sub, 'crash'))
285 except ReadError:
286 pass
287
288 @contextlib.contextmanager
289 def ceph_bootstrap(ctx, config):
290 cluster_name = config['cluster']
291 testdir = teuthology.get_testdir(ctx)
292 fsid = ctx.ceph[cluster_name].fsid
293
294 mons = ctx.ceph[cluster_name].mons
295 first_mon_role = sorted(mons.keys())[0]
296 _, _, first_mon = teuthology.split_role(first_mon_role)
297 (bootstrap_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
298 log.info('First mon is mon.%s on %s' % (first_mon,
299 bootstrap_remote.shortname))
300 ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
301 ctx.ceph[cluster_name].first_mon = first_mon
302
303 others = ctx.cluster.remotes[bootstrap_remote]
304 log.info('others %s' % others)
305 mgrs = sorted([r for r in others
306 if teuthology.is_type('mgr', cluster_name)(r)])
307 if not mgrs:
308 raise RuntimeError('no mgrs on the same host as first mon %s' % first_mon)
309 _, _, first_mgr = teuthology.split_role(mgrs[0])
310 log.info('First mgr is %s' % (first_mgr))
311 ctx.ceph[cluster_name].first_mgr = first_mgr
312
313 ctx.cluster.run(args=[
314 'sudo', 'mkdir', '-p', '/etc/ceph',
315 ])
316 ctx.cluster.run(args=[
317 'sudo', 'chmod', '777', '/etc/ceph',
318 ])
319 try:
320 # write seed config
321 log.info('Writing seed config...')
322 conf_fp = BytesIO()
323 seed_config = build_initial_config(ctx, config)
324 seed_config.write(conf_fp)
325 teuthology.write_file(
326 remote=bootstrap_remote,
327 path='{}/seed.{}.conf'.format(testdir, cluster_name),
328 data=conf_fp.getvalue())
329 log.debug('Final config:\n' + conf_fp.getvalue())
330 ctx.ceph[cluster_name].conf = seed_config
331
332 # register initial daemons
333 ctx.daemons.register_daemon(
334 bootstrap_remote, 'mon', first_mon,
335 cluster=cluster_name,
336 fsid=fsid,
337 logger=log.getChild('mon.' + first_mon),
338 wait=False,
339 started=True,
340 )
341 ctx.daemons.register_daemon(
342 bootstrap_remote, 'mgr', first_mgr,
343 cluster=cluster_name,
344 fsid=fsid,
345 logger=log.getChild('mgr.' + first_mgr),
346 wait=False,
347 started=True,
348 )
349
350 # bootstrap
351 log.info('Bootstrapping...')
352 cmd = [
353 'sudo',
354 ctx.cephadm,
355 '--image', ctx.ceph[cluster_name].image,
356 '-v',
357 'bootstrap',
358 '--fsid', fsid,
359 '--mon-id', first_mon,
360 '--mgr-id', first_mgr,
361 '--orphan-initial-daemons', # we will do it explicitly!
362 '--skip-monitoring-stack', # we'll provision these explicitly
363 '--config', '{}/seed.{}.conf'.format(testdir, cluster_name),
364 '--output-config', '/etc/ceph/{}.conf'.format(cluster_name),
365 '--output-keyring',
366 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
367 '--output-pub-ssh-key', '{}/{}.pub'.format(testdir, cluster_name),
368 ]
369 if mons[first_mon_role].startswith('['):
370 cmd += ['--mon-addrv', mons[first_mon_role]]
371 else:
372 cmd += ['--mon-ip', mons[first_mon_role]]
373 if config.get('skip_dashboard'):
374 cmd += ['--skip-dashboard']
375 # bootstrap makes the keyring root 0600, so +r it for our purposes
376 cmd += [
377 run.Raw('&&'),
378 'sudo', 'chmod', '+r',
379 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
380 ]
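# At this point cmd amounts, roughly, to a single invocation like the
# following (values in angle brackets are illustrative):
#   sudo {testdir}/cephadm --image <image> -v bootstrap \
#       --fsid <fsid> --mon-id <first_mon> --mgr-id <first_mgr> \
#       --orphan-initial-daemons --skip-monitoring-stack \
#       --config {testdir}/seed.<cluster>.conf \
#       --output-config /etc/ceph/<cluster>.conf \
#       --output-keyring /etc/ceph/<cluster>.client.admin.keyring \
#       --output-pub-ssh-key {testdir}/<cluster>.pub \
#       --mon-ip <addr>              # or --mon-addrv for an addrvec
# plus an optional --skip-dashboard, followed (via &&) by a chmod +r of the
# admin keyring so the test user can read it.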
381 bootstrap_remote.run(args=cmd)
382
383 # fetch keys and configs
384 log.info('Fetching config...')
385 ctx.ceph[cluster_name].config_file = teuthology.get_file(
386 remote=bootstrap_remote,
387 path='/etc/ceph/{}.conf'.format(cluster_name))
388 log.info('Fetching client.admin keyring...')
389 ctx.ceph[cluster_name].admin_keyring = teuthology.get_file(
390 remote=bootstrap_remote,
391 path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name))
392 log.info('Fetching mon keyring...')
393 ctx.ceph[cluster_name].mon_keyring = teuthology.get_file(
394 remote=bootstrap_remote,
395 path='/var/lib/ceph/%s/mon.%s/keyring' % (fsid, first_mon),
396 sudo=True)
397
398 # fetch ssh key, distribute to additional nodes
399 log.info('Fetching pub ssh key...')
400 ssh_pub_key = teuthology.get_file(
401 remote=bootstrap_remote,
402 path='{}/{}.pub'.format(testdir, cluster_name)
403 ).strip()
404
405 log.info('Installing pub ssh key for root users...')
406 ctx.cluster.run(args=[
407 'sudo', 'install', '-d', '-m', '0700', '/root/.ssh',
408 run.Raw('&&'),
409 'echo', ssh_pub_key,
410 run.Raw('|'),
411 'sudo', 'tee', '-a', '/root/.ssh/authorized_keys',
412 run.Raw('&&'),
413 'sudo', 'chmod', '0600', '/root/.ssh/authorized_keys',
414 ])
415
416 # set options
417 _shell(ctx, cluster_name, bootstrap_remote,
418 ['ceph', 'config', 'set', 'mgr', 'mgr/cephadm/allow_ptrace', 'true'])
419
420 # add other hosts
421 for remote in ctx.cluster.remotes.keys():
422 if remote == bootstrap_remote:
423 continue
424 log.info('Writing conf and keyring to %s' % remote.shortname)
425 teuthology.write_file(
426 remote=remote,
427 path='/etc/ceph/{}.conf'.format(cluster_name),
428 data=ctx.ceph[cluster_name].config_file)
429 teuthology.write_file(
430 remote=remote,
431 path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
432 data=ctx.ceph[cluster_name].admin_keyring)
433
434 log.info('Adding host %s to orchestrator...' % remote.shortname)
435 _shell(ctx, cluster_name, remote, [
436 'ceph', 'orch', 'host', 'add',
437 remote.shortname
438 ])
439 r = _shell(ctx, cluster_name, remote,
440 ['ceph', 'orch', 'host', 'ls', '--format=json'],
441 stdout=BytesIO())
442 hosts = [node['hostname'] for node in json.loads(r.stdout.getvalue())]
443 assert remote.shortname in hosts
444
445 yield
446
447 finally:
448 log.info('Cleaning up testdir ceph.* files...')
449 ctx.cluster.run(args=[
450 'rm', '-f',
451 '{}/seed.{}.conf'.format(testdir, cluster_name),
452 '{}/{}.pub'.format(testdir, cluster_name),
453 ])
454
455 log.info('Stopping all daemons...')
456
457 # this doesn't block until they are all stopped...
458 #ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
459
460 # so, stop them individually
461 for role in ctx.daemons.resolve_role_list(None, CEPH_ROLE_TYPES):
462 cluster, type_, id_ = teuthology.split_role(role)
463 ctx.daemons.get_daemon(type_, id_, cluster).stop()
464
465 # clean up /etc/ceph
466 ctx.cluster.run(args=[
467 'sudo', 'rm', '-f',
468 '/etc/ceph/{}.conf'.format(cluster_name),
469 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
470 ])
471
472 @contextlib.contextmanager
473 def ceph_mons(ctx, config):
474 """
475 Deploy any additional mons
476 """
477 cluster_name = config['cluster']
478 fsid = ctx.ceph[cluster_name].fsid
479 num_mons = 1
480
481 try:
482 for remote, roles in ctx.cluster.remotes.items():
483 for mon in [r for r in roles
484 if teuthology.is_type('mon', cluster_name)(r)]:
485 c_, _, id_ = teuthology.split_role(mon)
486 if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
487 continue
488 log.info('Adding %s on %s' % (mon, remote.shortname))
489 num_mons += 1
490 _shell(ctx, cluster_name, remote, [
491 'ceph', 'orch', 'daemon', 'add', 'mon',
492 remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_,
493 ])
494 ctx.daemons.register_daemon(
495 remote, 'mon', id_,
496 cluster=cluster_name,
497 fsid=fsid,
498 logger=log.getChild(mon),
499 wait=False,
500 started=True,
501 )
502
503 with contextutil.safe_while(sleep=1, tries=180) as proceed:
504 while proceed():
505 log.info('Waiting for %d mons in monmap...' % (num_mons))
506 r = _shell(
507 ctx=ctx,
508 cluster_name=cluster_name,
509 remote=remote,
510 args=[
511 'ceph', 'mon', 'dump', '-f', 'json',
512 ],
513 stdout=BytesIO(),
514 )
515 j = json.loads(r.stdout.getvalue())
516 if len(j['mons']) == num_mons:
517 break
518
519 # refresh ceph.conf files for all mons + first mgr
520 for remote, roles in ctx.cluster.remotes.items():
521 for mon in [r for r in roles
522 if teuthology.is_type('mon', cluster_name)(r)]:
523 c_, _, id_ = teuthology.split_role(mon)
524 _shell(ctx, cluster_name, remote, [
525 'ceph', 'orch', 'daemon', 'reconfig',
526 'mon.' + id_,
527 ])
528 _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote, [
529 'ceph', 'orch', 'daemon', 'reconfig',
530 'mgr.' + ctx.ceph[cluster_name].first_mgr,
531 ])
532
533 yield
534
535 finally:
536 pass
537
538 @contextlib.contextmanager
539 def ceph_mgrs(ctx, config):
540 """
541 Deploy any additional mgrs
542 """
543 cluster_name = config['cluster']
544 fsid = ctx.ceph[cluster_name].fsid
545
546 try:
547 nodes = []
548 daemons = {}
549 for remote, roles in ctx.cluster.remotes.items():
550 for mgr in [r for r in roles
551 if teuthology.is_type('mgr', cluster_name)(r)]:
552 c_, _, id_ = teuthology.split_role(mgr)
553 if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mgr:
554 continue
555 log.info('Adding %s on %s' % (mgr, remote.shortname))
556 nodes.append(remote.shortname + '=' + id_)
557 daemons[mgr] = (remote, id_)
558 if nodes:
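# The placement spec below has the form '<count>;host1=id1;host2=id2;...'.
# The +1 accounts for the first mgr, which bootstrap already deployed and
# which the loop above skipped.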
559 _shell(ctx, cluster_name, remote, [
560 'ceph', 'orch', 'apply', 'mgr',
561 str(len(nodes) + 1) + ';' + ';'.join(nodes)]
562 )
563 for mgr, i in daemons.items():
564 remote, id_ = i
565 ctx.daemons.register_daemon(
566 remote, 'mgr', id_,
567 cluster=cluster_name,
568 fsid=fsid,
569 logger=log.getChild(mgr),
570 wait=False,
571 started=True,
572 )
573
574 yield
575
576 finally:
577 pass
578
579 @contextlib.contextmanager
580 def ceph_osds(ctx, config):
581 """
582 Deploy OSDs
583 """
584 cluster_name = config['cluster']
585 fsid = ctx.ceph[cluster_name].fsid
586 try:
587 log.info('Deploying OSDs...')
588
589 # provision OSDs in numeric order
590 id_to_remote = {}
591 devs_by_remote = {}
592 for remote, roles in ctx.cluster.remotes.items():
593 devs_by_remote[remote] = teuthology.get_scratch_devices(remote)
594 for osd in [r for r in roles
595 if teuthology.is_type('osd', cluster_name)(r)]:
596 _, _, id_ = teuthology.split_role(osd)
597 id_to_remote[int(id_)] = (osd, remote)
598
599 cur = 0
600 for osd_id in sorted(id_to_remote.keys()):
601 osd, remote = id_to_remote[osd_id]
602 _, _, id_ = teuthology.split_role(osd)
603 assert int(id_) == cur
604 devs = devs_by_remote[remote]
605 assert devs ## FIXME ##
606 dev = devs.pop()
607 short_dev = dev.replace('/dev/', '')
608 log.info('Deploying %s on %s with %s...' % (
609 osd, remote.shortname, dev))
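# zap the device first so leftover LVM/partition metadata from a previous
# run does not keep 'ceph orch daemon add osd' from consuming it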
610 _shell(ctx, cluster_name, remote, [
611 'ceph-volume', 'lvm', 'zap', dev])
612 _shell(ctx, cluster_name, remote, [
613 'ceph', 'orch', 'daemon', 'add', 'osd',
614 remote.shortname + ':' + short_dev
615 ])
616 ctx.daemons.register_daemon(
617 remote, 'osd', id_,
618 cluster=cluster_name,
619 fsid=fsid,
620 logger=log.getChild(osd),
621 wait=False,
622 started=True,
623 )
624 cur += 1
625
626 yield
627 finally:
628 pass
629
630 @contextlib.contextmanager
631 def ceph_mdss(ctx, config):
632 """
633 Deploy MDSs
634 """
635 cluster_name = config['cluster']
636 fsid = ctx.ceph[cluster_name].fsid
637
638 nodes = []
639 daemons = {}
640 for remote, roles in ctx.cluster.remotes.items():
641 for role in [r for r in roles
642 if teuthology.is_type('mds', cluster_name)(r)]:
643 c_, _, id_ = teuthology.split_role(role)
644 log.info('Adding %s on %s' % (role, remote.shortname))
645 nodes.append(remote.shortname + '=' + id_)
646 daemons[role] = (remote, id_)
647 if nodes:
648 _shell(ctx, cluster_name, remote, [
649 'ceph', 'orch', 'apply', 'mds',
650 'all',
651 str(len(nodes)) + ';' + ';'.join(nodes)]
652 )
653 for role, i in daemons.items():
654 remote, id_ = i
655 ctx.daemons.register_daemon(
656 remote, 'mds', id_,
657 cluster=cluster_name,
658 fsid=fsid,
659 logger=log.getChild(role),
660 wait=False,
661 started=True,
662 )
663
664 yield
665
666 @contextlib.contextmanager
667 def ceph_monitoring(daemon_type, ctx, config):
668 """
669 Deploy prometheus, node-exporter, etc.
670 """
671 cluster_name = config['cluster']
672 fsid = ctx.ceph[cluster_name].fsid
673
674 nodes = []
675 daemons = {}
676 for remote, roles in ctx.cluster.remotes.items():
677 for role in [r for r in roles
678 if teuthology.is_type(daemon_type, cluster_name)(r)]:
679 c_, _, id_ = teuthology.split_role(role)
680 log.info('Adding %s on %s' % (role, remote.shortname))
681 nodes.append(remote.shortname + '=' + id_)
682 daemons[role] = (remote, id_)
683 if nodes:
684 _shell(ctx, cluster_name, remote, [
685 'ceph', 'orch', 'apply', daemon_type,
686 str(len(nodes)) + ';' + ';'.join(nodes)]
687 )
688 for role, i in daemons.items():
689 remote, id_ = i
690 ctx.daemons.register_daemon(
691 remote, daemon_type, id_,
692 cluster=cluster_name,
693 fsid=fsid,
694 logger=log.getChild(role),
695 wait=False,
696 started=True,
697 )
698
699 yield
700
701 @contextlib.contextmanager
702 def ceph_rgw(ctx, config):
703 """
704 Deploy rgw
705 """
706 cluster_name = config['cluster']
707 fsid = ctx.ceph[cluster_name].fsid
708
709 nodes = {}
710 daemons = {}
711 for remote, roles in ctx.cluster.remotes.items():
712 for role in [r for r in roles
713 if teuthology.is_type('rgw', cluster_name)(r)]:
714 c_, _, id_ = teuthology.split_role(role)
715 log.info('Adding %s on %s' % (role, remote.shortname))
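# rgw role ids are expected to look like '<realm>.<zone>.<n>'; the first two
# dot-separated components pick the realm/zone pair passed to 'orch apply rgw'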
716 realmzone = '.'.join(id_.split('.')[0:2])
717 if realmzone not in nodes:
718 nodes[realmzone] = []
719 nodes[realmzone].append(remote.shortname + '=' + id_)
720 daemons[role] = (remote, id_)
721 for realmzone, placement in nodes.items():
722 (realm, zone) = realmzone.split('.', 1)
723 _shell(ctx, cluster_name, remote, [
724 'ceph', 'orch', 'apply', 'rgw',
725 realm, zone,
726 str(len(placement)) + ';' + ';'.join(placement)]
727 )
728 for role, i in daemons.items():
729 remote, id_ = i
730 ctx.daemons.register_daemon(
731 remote, 'rgw', id_,
732 cluster=cluster_name,
733 fsid=fsid,
734 logger=log.getChild(role),
735 wait=False,
736 started=True,
737 )
738
739 yield
740
741 @contextlib.contextmanager
742 def ceph_clients(ctx, config):
743 cluster_name = config['cluster']
744 testdir = teuthology.get_testdir(ctx)
745
746 log.info('Setting up client nodes...')
747 clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
748 testdir = teuthology.get_testdir(ctx)
749 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
750 for remote, roles_for_host in clients.remotes.items():
751 for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
752 cluster_name):
753 name = teuthology.ceph_role(role)
754 client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name,
755 name)
756 r = _shell(
757 ctx=ctx,
758 cluster_name=cluster_name,
759 remote=remote,
760 args=[
761 'ceph', 'auth',
762 'get-or-create', name,
763 'mon', 'allow *',
764 'osd', 'allow *',
765 'mds', 'allow *',
766 'mgr', 'allow *',
767 ],
768 stdout=BytesIO(),
769 )
770 keyring = r.stdout.getvalue()
771 teuthology.sudo_write_file(
772 remote=remote,
773 path=client_keyring,
774 data=keyring,
775 perms='0644'
776 )
777 yield
778
779 @contextlib.contextmanager
780 def ceph_initial():
781 try:
782 yield
783 finally:
784 log.info('Teardown complete')
785
786 ## public methods
787 @contextlib.contextmanager
788 def stop(ctx, config):
789 """
790 Stop ceph daemons
791
792 For example::
793 tasks:
794 - ceph.stop: [mds.*]
795
796 tasks:
797 - ceph.stop: [osd.0, osd.2]
798
799 tasks:
800 - ceph.stop:
801 daemons: [osd.0, osd.2]
802
803 """
804 if config is None:
805 config = {}
806 elif isinstance(config, list):
807 config = {'daemons': config}
808
809 daemons = ctx.daemons.resolve_role_list(
810 config.get('daemons', None), CEPH_ROLE_TYPES, True)
811 clusters = set()
812
813 for role in daemons:
814 cluster, type_, id_ = teuthology.split_role(role)
815 ctx.daemons.get_daemon(type_, id_, cluster).stop()
816 clusters.add(cluster)
817
818 # for cluster in clusters:
819 # ctx.ceph[cluster].watchdog.stop()
820 # ctx.ceph[cluster].watchdog.join()
821
822 yield
823
824 def shell(ctx, config):
825 """
826 Execute (shell) commands
827 """
828 cluster_name = config.get('cluster', 'ceph')
829
830 env = []
831 if 'env' in config:
832 for k in config['env']:
833 env.extend(['-e', k + '=' + ctx.config.get(k, '')])
834 del config['env']
835
836 if 'all' in config and len(config) == 1:
837 a = config['all']
838 roles = teuthology.all_roles(ctx.cluster)
839 config = dict((id_, a) for id_ in roles)
840
841 for role, ls in config.items():
842 (remote,) = ctx.cluster.only(role).remotes.keys()
843 log.info('Running commands on role %s host %s', role, remote.name)
844 for c in ls:
845 _shell(ctx, cluster_name, remote,
846 ['bash', '-c', c],
847 extra_cephadm_args=env)
848
849 @contextlib.contextmanager
850 def tweaked_option(ctx, config):
851 """
852 Set an option, and then restore it to its original value.
853
854 Note: due to the way tasks are executed/nested, it is not recommended to use
855 this method as a standalone task; otherwise it will likely restore the
856 tweaked option only at the /end/ of the 'tasks' block.
857 """
858 saved_options = {}
859 # we can complicate this when necessary
860 options = ['mon-health-to-clog']
861 type_, id_ = 'mon', '*'
862 cluster = config.get('cluster', 'ceph')
863 manager = ctx.managers[cluster]
864 if id_ == '*':
865 get_from = next(teuthology.all_roles_of_type(ctx.cluster, type_))
866 else:
867 get_from = id_
868 for option in options:
869 if option not in config:
870 continue
871 value = 'true' if config[option] else 'false'
872 option = option.replace('-', '_')
873 old_value = manager.get_config(type_, get_from, option)
874 if value != old_value:
875 saved_options[option] = old_value
876 manager.inject_args(type_, id_, option, value)
877 yield
878 for option, value in saved_options.items():
879 manager.inject_args(type_, id_, option, value)
880
881 @contextlib.contextmanager
882 def restart(ctx, config):
883 """
884 restart ceph daemons
885
886 For example::
887 tasks:
888 - ceph.restart: [all]
889
890 For example::
891 tasks:
892 - ceph.restart: [osd.0, mon.1, mds.*]
893
894 or::
895
896 tasks:
897 - ceph.restart:
898 daemons: [osd.0, mon.1]
899 wait-for-healthy: false
900 wait-for-osds-up: true
901
902 :param ctx: Context
903 :param config: Configuration
904 """
905 if config is None:
906 config = {}
907 elif isinstance(config, list):
908 config = {'daemons': config}
909
910 daemons = ctx.daemons.resolve_role_list(
911 config.get('daemons', None), CEPH_ROLE_TYPES, True)
912 clusters = set()
913
914 log.info('daemons %s' % daemons)
915 with tweaked_option(ctx, config):
916 for role in daemons:
917 cluster, type_, id_ = teuthology.split_role(role)
918 d = ctx.daemons.get_daemon(type_, id_, cluster)
919 assert d, 'daemon %s does not exist' % role
920 d.stop()
921 if type_ == 'osd':
922 ctx.managers[cluster].mark_down_osd(id_)
923 d.restart()
924 clusters.add(cluster)
925
926 if config.get('wait-for-healthy', True):
927 for cluster in clusters:
928 healthy(ctx=ctx, config=dict(cluster=cluster))
929 if config.get('wait-for-osds-up', False):
930 for cluster in clusters:
931 ctx.managers[cluster].wait_for_all_osds_up()
932 yield
933
934 @contextlib.contextmanager
935 def crush_setup(ctx, config):
936 cluster_name = config['cluster']
937 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
938 (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
939
940 profile = config.get('crush_tunables', 'default')
941 log.info('Setting crush tunables to %s', profile)
942 _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
943 args=['ceph', 'osd', 'crush', 'tunables', profile])
944 yield
945
946 @contextlib.contextmanager
947 def task(ctx, config):
948 if config is None:
949 config = {}
950
951 assert isinstance(config, dict), \
952 "task only supports a dictionary for configuration"
953
954 overrides = ctx.config.get('overrides', {})
955 teuthology.deep_merge(config, overrides.get('ceph', {}))
956 log.info('Config: ' + str(config))
957
958 testdir = teuthology.get_testdir(ctx)
959
960 # set up cluster context
961 first_ceph_cluster = False
962 if not hasattr(ctx, 'daemons'):
963 first_ceph_cluster = True
964 if not hasattr(ctx, 'ceph'):
965 ctx.ceph = {}
966 ctx.managers = {}
967 if 'cluster' not in config:
968 config['cluster'] = 'ceph'
969 cluster_name = config['cluster']
970 ctx.ceph[cluster_name] = argparse.Namespace()
971
972 ctx.ceph[cluster_name].thrashers = []
973 # fixme: setup watchdog, ala ceph.py
974
975 # cephadm mode?
976 if 'cephadm_mode' not in config:
977 config['cephadm_mode'] = 'root'
978 assert config['cephadm_mode'] in ['root', 'cephadm-package']
979 if config['cephadm_mode'] == 'root':
980 ctx.cephadm = testdir + '/cephadm'
981 else:
982 ctx.cephadm = 'cephadm' # in the path
983
984 if first_ceph_cluster:
985 # FIXME: this is global for all clusters
986 ctx.daemons = DaemonGroup(
987 use_cephadm=ctx.cephadm)
988
989 # image
990 ctx.ceph[cluster_name].image = config.get('image')
991 ref = None
992 if not ctx.ceph[cluster_name].image:
993 sha1 = config.get('sha1')
994 if sha1:
995 ctx.ceph[cluster_name].image = 'quay.io/ceph-ci/ceph:%s' % sha1
996 ref = sha1
997 else:
998 # hmm, fall back to branch?
999 branch = config.get('branch', 'master')
1000 ref = branch
1001 ctx.ceph[cluster_name].image = 'quay.io/ceph-ci/ceph:%s' % branch
1002 log.info('Cluster image is %s' % ctx.ceph[cluster_name].image)
1003
1004 # uuid
1005 fsid = str(uuid.uuid1())
1006 log.info('Cluster fsid is %s' % fsid)
1007 ctx.ceph[cluster_name].fsid = fsid
1008
1009 # mon ips
1010 log.info('Choosing monitor IPs and ports...')
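# use the address teuthology's ssh connection sees for each remote (the
# transport peer address) as that host's mon IP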
1011 remotes_and_roles = ctx.cluster.remotes.items()
1012 roles = [role_list for (remote, role_list) in remotes_and_roles]
1013 ips = [host for (host, port) in
1014 (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
1015 ctx.ceph[cluster_name].mons = get_mons(
1016 roles, ips, cluster_name,
1017 mon_bind_msgr2=config.get('mon_bind_msgr2', True),
1018 mon_bind_addrvec=config.get('mon_bind_addrvec', True),
1019 )
1020 log.info('Monitor IPs: %s' % ctx.ceph[cluster_name].mons)
1021
1022 with contextutil.nested(
1023 lambda: ceph_initial(),
1024 lambda: normalize_hostnames(ctx=ctx),
1025 lambda: download_cephadm(ctx=ctx, config=config, ref=ref),
1026 lambda: ceph_log(ctx=ctx, config=config),
1027 lambda: ceph_crash(ctx=ctx, config=config),
1028 lambda: ceph_bootstrap(ctx=ctx, config=config),
1029 lambda: crush_setup(ctx=ctx, config=config),
1030 lambda: ceph_mons(ctx=ctx, config=config),
1031 lambda: ceph_mgrs(ctx=ctx, config=config),
1032 lambda: ceph_osds(ctx=ctx, config=config),
1033 lambda: ceph_mdss(ctx=ctx, config=config),
1034 lambda: ceph_rgw(ctx=ctx, config=config),
1035 lambda: ceph_monitoring('prometheus', ctx=ctx, config=config),
1036 lambda: ceph_monitoring('node-exporter', ctx=ctx, config=config),
1037 lambda: ceph_monitoring('alertmanager', ctx=ctx, config=config),
1038 lambda: ceph_monitoring('grafana', ctx=ctx, config=config),
1039 lambda: ceph_clients(ctx=ctx, config=config),
1040 ):
1041 ctx.managers[cluster_name] = CephManager(
1042 ctx.ceph[cluster_name].bootstrap_remote,
1043 ctx=ctx,
1044 logger=log.getChild('ceph_manager.' + cluster_name),
1045 cluster=cluster_name,
1046 cephadm=True,
1047 )
1048
1049 try:
1050 if config.get('wait-for-healthy', True):
1051 healthy(ctx=ctx, config=config)
1052
1053 log.info('Setup complete, yielding')
1054 yield
1055
1056 finally:
1057 log.info('Teardown begin')