1 """
2 Ceph cluster task, deployed via cephadm orchestrator
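
An illustrative teuthology fragment (values are placeholders; the keys shown
are ones this task actually reads)::

    tasks:
    - cephadm:
        cephadm_mode: root
        image: quay.io/ceph-ci/ceph:octopus
        conf:
          osd:
            debug osd: 20
        log-whitelist:
        - overall HEALTH_
        skip_dashboard: true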
3 """
4 from io import BytesIO
5
6 import argparse
7 import configobj
8 import contextlib
9 import logging
10 import os
11 import json
12 import re
13 import uuid
14
15 from ceph_manager import CephManager
16 from tarfile import ReadError
17 from teuthology import misc as teuthology
18 from teuthology import contextutil
19 from teuthology.orchestra import run
20 from teuthology.orchestra.daemon import DaemonGroup
21 from teuthology.config import config as teuth_config
22
23 # these items we use from ceph.py should probably eventually move elsewhere
24 from tasks.ceph import get_mons, healthy
25
26 CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw', 'prometheus']
27
28 log = logging.getLogger(__name__)
29
30
def _shell(ctx, cluster_name, remote, args, extra_cephadm_args=[], **kwargs):
    """
    Run a command on the given remote inside a 'cephadm shell' container.
    """
    testdir = teuthology.get_testdir(ctx)
    return remote.run(
        args=[
            'sudo',
            ctx.cephadm,
            '--image', ctx.ceph[cluster_name].image,
            'shell',
            '-c', '/etc/ceph/{}.conf'.format(cluster_name),
            '-k', '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
            '--fsid', ctx.ceph[cluster_name].fsid,
        ] + extra_cephadm_args + [
            '--',
        ] + args,
        **kwargs
    )

def build_initial_config(ctx, config):
    cluster_name = config['cluster']

    path = os.path.join(os.path.dirname(__file__), 'cephadm.conf')
    conf = configobj.ConfigObj(path, file_error=True)

    conf.setdefault('global', {})
    conf['global']['fsid'] = ctx.ceph[cluster_name].fsid

    # overrides
    for section, keys in config.get('conf', {}).items():
        for key, value in keys.items():
            log.info(" override: [%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    return conf

@contextlib.contextmanager
def normalize_hostnames(ctx):
    """
    Ensure we have short hostnames throughout, for consistency between
    remote.shortname and socket.gethostname() in cephadm.
    """
    log.info('Normalizing hostnames...')
    ctx.cluster.run(args=[
        'sudo',
        'hostname',
        run.Raw('$(hostname -s)'),
    ])

    try:
        yield
    finally:
        pass

@contextlib.contextmanager
def download_cephadm(ctx, config, ref):
    cluster_name = config['cluster']

    if config.get('cephadm_mode') != 'cephadm-package':
        ref = config.get('cephadm_branch', ref)
        git_url = teuth_config.get_ceph_git_url()
        log.info('Downloading cephadm (repo %s ref %s)...' % (git_url, ref))
        if git_url.startswith('https://github.com/'):
            # git archive doesn't like https:// URLs, which we use with github.
            rest = git_url.split('https://github.com/', 1)[1]
            rest = re.sub(r'\.git/?$', '', rest).strip()  # no .git suffix
            ctx.cluster.run(
                args=[
                    'curl', '--silent',
                    'https://raw.githubusercontent.com/' + rest + '/' + ref + '/src/cephadm/cephadm',
                    run.Raw('>'),
                    ctx.cephadm,
                    run.Raw('&&'),
                    'ls', '-l',
                    ctx.cephadm,
                ],
            )
        else:
            ctx.cluster.run(
                args=[
                    'git', 'archive',
                    '--remote=' + git_url,
                    ref,
                    'src/cephadm/cephadm',
                    run.Raw('|'),
                    'tar', '-xO', 'src/cephadm/cephadm',
                    run.Raw('>'),
                    ctx.cephadm,
                ],
            )
        # sanity-check the resulting file and set executable bit
        cephadm_file_size = '$(stat -c%s {})'.format(ctx.cephadm)
        ctx.cluster.run(
            args=[
                'test', '-s', ctx.cephadm,
                run.Raw('&&'),
                'test', run.Raw(cephadm_file_size), '-gt', run.Raw('1000'),
                run.Raw('&&'),
                'chmod', '+x', ctx.cephadm,
            ],
        )

    try:
        yield
    finally:
        log.info('Removing cluster...')
        ctx.cluster.run(args=[
            'sudo',
            ctx.cephadm,
            'rm-cluster',
            '--fsid', ctx.ceph[cluster_name].fsid,
            '--force',
        ])

        if config.get('cephadm_mode') == 'root':
            log.info('Removing cephadm ...')
            ctx.cluster.run(
                args=[
                    'rm',
                    '-rf',
                    ctx.cephadm,
                ],
            )

@contextlib.contextmanager
def ceph_log(ctx, config):
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        yield

    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise

    finally:
        log.info('Checking cluster log for badness...')
        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurrence of the pattern specified in the Ceph log.
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = [
                'sudo',
                'egrep', pattern,
                '/var/log/ceph/{fsid}/ceph.log'.format(
                    fsid=fsid),
            ]
            if excludes:
                for exclude in excludes:
                    args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                run.Raw('|'), 'head', '-n', '1',
            ])
            r = ctx.ceph[cluster_name].bootstrap_remote.run(
                stdout=BytesIO(),
                args=args,
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        if first_in_ceph_log(r'\[ERR\]|\[WRN\]|\[SEC\]',
                             config.get('log-whitelist')) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in [r'\[SEC\]', r'\[ERR\]', r'\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log-whitelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
                            )
                        break

        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            # compress and archive the logs
            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph',  # all logs, not just for the cluster
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            try:
                os.makedirs(path)
            except OSError:
                pass
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.name)
                try:
                    os.makedirs(sub)
                except OSError:
                    pass
                teuthology.pull_directory(remote, '/var/log/ceph',  # everything
                                          os.path.join(sub, 'log'))

@contextlib.contextmanager
def ceph_crash(ctx, config):
    """
    Gather crash dumps from /var/lib/ceph/$fsid/crash
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        yield

    finally:
        if ctx.archive is not None:
            log.info('Archiving crash dumps...')
            path = os.path.join(ctx.archive, 'remote')
            try:
                os.makedirs(path)
            except OSError:
                pass
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.name)
                try:
                    os.makedirs(sub)
                except OSError:
                    pass
                try:
                    teuthology.pull_directory(remote,
                                              '/var/lib/ceph/%s/crash' % fsid,
                                              os.path.join(sub, 'crash'))
                except ReadError:
                    pass

@contextlib.contextmanager
def ceph_bootstrap(ctx, config):
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)
    fsid = ctx.ceph[cluster_name].fsid

    bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
    first_mon = ctx.ceph[cluster_name].first_mon
    first_mon_role = ctx.ceph[cluster_name].first_mon_role
    mons = ctx.ceph[cluster_name].mons

    ctx.cluster.run(args=[
        'sudo', 'mkdir', '-p', '/etc/ceph',
    ])
    ctx.cluster.run(args=[
        'sudo', 'chmod', '777', '/etc/ceph',
    ])
    try:
        # write seed config
        log.info('Writing seed config...')
        conf_fp = BytesIO()
        seed_config = build_initial_config(ctx, config)
        seed_config.write(conf_fp)
        teuthology.write_file(
            remote=bootstrap_remote,
            path='{}/seed.{}.conf'.format(testdir, cluster_name),
            data=conf_fp.getvalue())
        log.debug('Final config:\n' + conf_fp.getvalue())
        ctx.ceph[cluster_name].conf = seed_config

        # register initial daemons
        ctx.daemons.register_daemon(
            bootstrap_remote, 'mon', first_mon,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild('mon.' + first_mon),
            wait=False,
            started=True,
        )
        if not ctx.ceph[cluster_name].roleless:
            first_mgr = ctx.ceph[cluster_name].first_mgr
            ctx.daemons.register_daemon(
                bootstrap_remote, 'mgr', first_mgr,
                cluster=cluster_name,
                fsid=fsid,
                logger=log.getChild('mgr.' + first_mgr),
                wait=False,
                started=True,
            )

        # bootstrap
        log.info('Bootstrapping...')
        cmd = [
            'sudo',
            ctx.cephadm,
            '--image', ctx.ceph[cluster_name].image,
            '-v',
            'bootstrap',
            '--fsid', fsid,
            '--config', '{}/seed.{}.conf'.format(testdir, cluster_name),
            '--output-config', '/etc/ceph/{}.conf'.format(cluster_name),
            '--output-keyring',
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
            '--output-pub-ssh-key', '{}/{}.pub'.format(testdir, cluster_name),
        ]
        if not ctx.ceph[cluster_name].roleless:
            cmd += [
                '--mon-id', first_mon,
                '--mgr-id', first_mgr,
                '--orphan-initial-daemons',  # we will do it explicitly!
                '--skip-monitoring-stack',   # we'll provision these explicitly
            ]
        if mons[first_mon_role].startswith('['):
            cmd += ['--mon-addrv', mons[first_mon_role]]
        else:
            cmd += ['--mon-ip', mons[first_mon_role]]
        if config.get('skip_dashboard'):
            cmd += ['--skip-dashboard']
        # bootstrap makes the keyring root 0600, so +r it for our purposes
        cmd += [
            run.Raw('&&'),
            'sudo', 'chmod', '+r',
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
        ]
        bootstrap_remote.run(args=cmd)

        # fetch keys and configs
        log.info('Fetching config...')
        ctx.ceph[cluster_name].config_file = teuthology.get_file(
            remote=bootstrap_remote,
            path='/etc/ceph/{}.conf'.format(cluster_name))
        log.info('Fetching client.admin keyring...')
        ctx.ceph[cluster_name].admin_keyring = teuthology.get_file(
            remote=bootstrap_remote,
            path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name))
        log.info('Fetching mon keyring...')
        ctx.ceph[cluster_name].mon_keyring = teuthology.get_file(
            remote=bootstrap_remote,
            path='/var/lib/ceph/%s/mon.%s/keyring' % (fsid, first_mon),
            sudo=True)

        # fetch ssh key, distribute to additional nodes
        log.info('Fetching pub ssh key...')
        ssh_pub_key = teuthology.get_file(
            remote=bootstrap_remote,
            path='{}/{}.pub'.format(testdir, cluster_name)
        ).strip()

        log.info('Installing pub ssh key for root users...')
        ctx.cluster.run(args=[
            'sudo', 'install', '-d', '-m', '0700', '/root/.ssh',
            run.Raw('&&'),
            'echo', ssh_pub_key,
            run.Raw('|'),
            'sudo', 'tee', '-a', '/root/.ssh/authorized_keys',
            run.Raw('&&'),
            'sudo', 'chmod', '0600', '/root/.ssh/authorized_keys',
        ])

        # set options
        _shell(ctx, cluster_name, bootstrap_remote,
               ['ceph', 'config', 'set', 'mgr', 'mgr/cephadm/allow_ptrace', 'true'])

        # add other hosts
        for remote in ctx.cluster.remotes.keys():
            if remote == bootstrap_remote:
                continue
            log.info('Writing (initial) conf and keyring to %s' % remote.shortname)
            teuthology.write_file(
                remote=remote,
                path='/etc/ceph/{}.conf'.format(cluster_name),
                data=ctx.ceph[cluster_name].config_file)
            teuthology.write_file(
                remote=remote,
                path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
                data=ctx.ceph[cluster_name].admin_keyring)

            log.info('Adding host %s to orchestrator...' % remote.shortname)
            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'host', 'add',
                remote.shortname
            ])
            r = _shell(ctx, cluster_name, remote,
                       ['ceph', 'orch', 'host', 'ls', '--format=json'],
                       stdout=BytesIO())
            hosts = [node['hostname'] for node in json.loads(r.stdout.getvalue())]
            assert remote.shortname in hosts

        yield

    finally:
        log.info('Cleaning up testdir ceph.* files...')
        ctx.cluster.run(args=[
            'rm', '-f',
            '{}/seed.{}.conf'.format(testdir, cluster_name),
            '{}/{}.pub'.format(testdir, cluster_name),
        ])

        log.info('Stopping all daemons...')

        # this doesn't block until they are all stopped...
        # ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])

        # so, stop them individually
        for role in ctx.daemons.resolve_role_list(None, CEPH_ROLE_TYPES):
            cluster, type_, id_ = teuthology.split_role(role)
            ctx.daemons.get_daemon(type_, id_, cluster).stop()

        # clean up /etc/ceph
        ctx.cluster.run(args=[
            'sudo', 'rm', '-f',
            '/etc/ceph/{}.conf'.format(cluster_name),
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
        ])

@contextlib.contextmanager
def ceph_mons(ctx, config):
    """
    Deploy any additional mons
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid
    num_mons = 1

    try:
        for remote, roles in ctx.cluster.remotes.items():
            for mon in [r for r in roles
                        if teuthology.is_type('mon', cluster_name)(r)]:
                c_, _, id_ = teuthology.split_role(mon)
                if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
                    continue
                log.info('Adding %s on %s' % (mon, remote.shortname))
                num_mons += 1
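                # pass '<host>:<addr-or-addrvec>=<name>' to 'ceph orch daemon
                # add mon', reusing the address task() already chose for this role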
                _shell(ctx, cluster_name, remote, [
                    'ceph', 'orch', 'daemon', 'add', 'mon',
                    remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_,
                ])
                ctx.daemons.register_daemon(
                    remote, 'mon', id_,
                    cluster=cluster_name,
                    fsid=fsid,
                    logger=log.getChild(mon),
                    wait=False,
                    started=True,
                )

        with contextutil.safe_while(sleep=1, tries=180) as proceed:
            while proceed():
                log.info('Waiting for %d mons in monmap...' % (num_mons))
                r = _shell(
                    ctx=ctx,
                    cluster_name=cluster_name,
                    remote=remote,
                    args=[
                        'ceph', 'mon', 'dump', '-f', 'json',
                    ],
                    stdout=BytesIO(),
                )
                j = json.loads(r.stdout.getvalue())
                if len(j['mons']) == num_mons:
                    break

        # refresh our (final) ceph.conf file
        log.info('Generating final ceph.conf file...')
        r = _shell(
            ctx=ctx,
            cluster_name=cluster_name,
            remote=remote,
            args=[
                'ceph', 'config', 'generate-minimal-conf',
            ],
            stdout=BytesIO(),
        )
        ctx.ceph[cluster_name].config_file = r.stdout.getvalue()

        yield

    finally:
        pass

@contextlib.contextmanager
def ceph_mgrs(ctx, config):
    """
    Deploy any additional mgrs
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        nodes = []
        daemons = {}
        for remote, roles in ctx.cluster.remotes.items():
            for mgr in [r for r in roles
                        if teuthology.is_type('mgr', cluster_name)(r)]:
                c_, _, id_ = teuthology.split_role(mgr)
                if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mgr:
                    continue
                log.info('Adding %s on %s' % (mgr, remote.shortname))
                nodes.append(remote.shortname + '=' + id_)
                daemons[mgr] = (remote, id_)
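        # 'ceph orch apply mgr' is given a placement spec of the form
        # '<count>;<host1>=<name1>;<host2>=<name2>...'; the count is
        # len(nodes) + 1 so the bootstrap mgr (skipped above) is included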
        if nodes:
            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'apply', 'mgr',
                str(len(nodes) + 1) + ';' + ';'.join(nodes)]
            )
        for mgr, i in daemons.items():
            remote, id_ = i
            ctx.daemons.register_daemon(
                remote, 'mgr', id_,
                cluster=cluster_name,
                fsid=fsid,
                logger=log.getChild(mgr),
                wait=False,
                started=True,
            )

        yield

    finally:
        pass

@contextlib.contextmanager
def ceph_osds(ctx, config):
    """
    Deploy OSDs
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        log.info('Deploying OSDs...')

        # provision OSDs in numeric order
        id_to_remote = {}
        devs_by_remote = {}
        for remote, roles in ctx.cluster.remotes.items():
            devs_by_remote[remote] = teuthology.get_scratch_devices(remote)
            for osd in [r for r in roles
                        if teuthology.is_type('osd', cluster_name)(r)]:
                _, _, id_ = teuthology.split_role(osd)
                id_to_remote[int(id_)] = (osd, remote)

        cur = 0
        for osd_id in sorted(id_to_remote.keys()):
            osd, remote = id_to_remote[osd_id]
            _, _, id_ = teuthology.split_role(osd)
            assert int(id_) == cur
            devs = devs_by_remote[remote]
            assert devs  ## FIXME ##
            dev = devs.pop()
            short_dev = dev.replace('/dev/', '')
            log.info('Deploying %s on %s with %s...' % (
                osd, remote.shortname, dev))
            _shell(ctx, cluster_name, remote, [
                'ceph-volume', 'lvm', 'zap', dev])
            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'daemon', 'add', 'osd',
                remote.shortname + ':' + short_dev
            ])
            ctx.daemons.register_daemon(
                remote, 'osd', id_,
                cluster=cluster_name,
                fsid=fsid,
                logger=log.getChild(osd),
                wait=False,
                started=True,
            )
            cur += 1

        yield
    finally:
        pass

@contextlib.contextmanager
def ceph_mdss(ctx, config):
    """
    Deploy MDSs
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = []
    daemons = {}
    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type('mds', cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
            nodes.append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)
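    # 'all' below is the service name argument to 'ceph orch apply mds'; the
    # final argument is a placement spec of the form '<count>;<host1>=<id1>;...'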
    if nodes:
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', 'mds',
            'all',
            str(len(nodes)) + ';' + ';'.join(nodes)]
        )
    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, 'mds', id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield

@contextlib.contextmanager
def ceph_monitoring(daemon_type, ctx, config):
    """
    Deploy prometheus, node-exporter, etc.
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = []
    daemons = {}
    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type(daemon_type, cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
            nodes.append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)
    if nodes:
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', daemon_type,
            str(len(nodes)) + ';' + ';'.join(nodes)]
        )
    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, daemon_type, id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield

@contextlib.contextmanager
def ceph_rgw(ctx, config):
    """
    Deploy rgw
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = {}
    daemons = {}
    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type('rgw', cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
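            # rgw role ids are expected to look like '<realm>.<zone>[.<suffix>]';
            # group daemons by realm/zone so each pair gets one 'ceph orch apply rgw'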
            realmzone = '.'.join(id_.split('.')[0:2])
            if realmzone not in nodes:
                nodes[realmzone] = []
            nodes[realmzone].append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)
    for realmzone, nodes in nodes.items():
        (realm, zone) = realmzone.split('.', 1)
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', 'rgw',
            realm, zone,
            str(len(nodes)) + ';' + ';'.join(nodes)]
        )
    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, 'rgw', id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield

@contextlib.contextmanager
def ceph_clients(ctx, config):
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)

    log.info('Setting up client nodes...')
    clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
    for remote, roles_for_host in clients.remotes.items():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
                                                     cluster_name):
            name = teuthology.ceph_role(role)
            client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name,
                                                                name)
            r = _shell(
                ctx=ctx,
                cluster_name=cluster_name,
                remote=remote,
                args=[
                    'ceph', 'auth',
                    'get-or-create', name,
                    'mon', 'allow *',
                    'osd', 'allow *',
                    'mds', 'allow *',
                    'mgr', 'allow *',
                ],
                stdout=BytesIO(),
            )
            keyring = r.stdout.getvalue()
            teuthology.sudo_write_file(
                remote=remote,
                path=client_keyring,
                data=keyring,
                perms='0644'
            )
    yield

@contextlib.contextmanager
def ceph_initial():
    try:
        yield
    finally:
        log.info('Teardown complete')


## public methods
@contextlib.contextmanager
def stop(ctx, config):
    """
    Stop ceph daemons

    For example::
      tasks:
      - ceph.stop: [mds.*]

      tasks:
      - ceph.stop: [osd.0, osd.2]

      tasks:
      - ceph.stop:
          daemons: [osd.0, osd.2]

    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(
        config.get('daemons', None), CEPH_ROLE_TYPES, True)
    clusters = set()

    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).stop()
        clusters.add(cluster)

    # for cluster in clusters:
    #    ctx.ceph[cluster].watchdog.stop()
    #    ctx.ceph[cluster].watchdog.join()

    yield


def shell(ctx, config):
    """
    Execute (shell) commands
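
    For example (illustrative only; each mapping key is a role and its list
    entries are commands run via 'cephadm shell' on the host carrying that
    role; 'env' names job-config keys that are forwarded to cephadm as
    '-e NAME=VALUE' arguments -- 'sha1' is just an example of such a key)::

      tasks:
      - cephadm.shell:
          env: [sha1]
          mon.a:
          - ceph orch ls
          - ceph orch host ls

    A key of 'all' (and nothing else) expands the command list to every role
    in the cluster.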
818 """
819 cluster_name = config.get('cluster', 'ceph')
820
821 env = []
822 if 'env' in config:
823 for k in config['env']:
824 env.extend(['-e', k + '=' + ctx.config.get(k, '')])
825 del config['env']
826
827 if 'all' in config and len(config) == 1:
828 a = config['all']
829 roles = teuthology.all_roles(ctx.cluster)
830 config = dict((id_, a) for id_ in roles)
831
832 for role, ls in config.items():
833 (remote,) = ctx.cluster.only(role).remotes.keys()
834 log.info('Running commands on role %s host %s', role, remote.name)
835 for c in ls:
836 _shell(ctx, cluster_name, remote,
837 ['bash', '-c', c],
838 extra_cephadm_args=env)
839
840 @contextlib.contextmanager
841 def tweaked_option(ctx, config):
842 """
843 set an option, and then restore it with its original value
844
845 Note, due to the way how tasks are executed/nested, it's not suggested to
846 use this method as a standalone task. otherwise, it's likely that it will
847 restore the tweaked option at the /end/ of 'tasks' block.
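
    For example, as part of a restart (illustrative; 'mon-health-to-clog' is
    currently the only option handled here)::

      tasks:
      - ceph.restart:
          daemons: [mon.a]
          mon-health-to-clog: false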
848 """
849 saved_options = {}
850 # we can complicate this when necessary
851 options = ['mon-health-to-clog']
852 type_, id_ = 'mon', '*'
853 cluster = config.get('cluster', 'ceph')
854 manager = ctx.managers[cluster]
855 if id_ == '*':
856 get_from = next(teuthology.all_roles_of_type(ctx.cluster, type_))
857 else:
858 get_from = id_
859 for option in options:
860 if option not in config:
861 continue
862 value = 'true' if config[option] else 'false'
863 option = option.replace('-', '_')
864 old_value = manager.get_config(type_, get_from, option)
865 if value != old_value:
866 saved_options[option] = old_value
867 manager.inject_args(type_, id_, option, value)
868 yield
869 for option, value in saved_options.items():
870 manager.inject_args(type_, id_, option, value)
871
872 @contextlib.contextmanager
873 def restart(ctx, config):
874 """
875 restart ceph daemons
876
877 For example::
878 tasks:
879 - ceph.restart: [all]
880
881 For example::
882 tasks:
883 - ceph.restart: [osd.0, mon.1, mds.*]
884
885 or::
886
887 tasks:
888 - ceph.restart:
889 daemons: [osd.0, mon.1]
890 wait-for-healthy: false
891 wait-for-osds-up: true
892
893 :param ctx: Context
894 :param config: Configuration
895 """
896 if config is None:
897 config = {}
898 elif isinstance(config, list):
899 config = {'daemons': config}
900
901 daemons = ctx.daemons.resolve_role_list(
902 config.get('daemons', None), CEPH_ROLE_TYPES, True)
903 clusters = set()
904
905 log.info('daemons %s' % daemons)
906 with tweaked_option(ctx, config):
907 for role in daemons:
908 cluster, type_, id_ = teuthology.split_role(role)
909 d = ctx.daemons.get_daemon(type_, id_, cluster)
910 assert d, 'daemon %s does not exist' % role
911 d.stop()
912 if type_ == 'osd':
913 ctx.managers[cluster].mark_down_osd(id_)
914 d.restart()
915 clusters.add(cluster)
916
917 if config.get('wait-for-healthy', True):
918 for cluster in clusters:
919 healthy(ctx=ctx, config=dict(cluster=cluster))
920 if config.get('wait-for-osds-up', False):
921 for cluster in clusters:
922 ctx.managers[cluster].wait_for_all_osds_up()
923 yield
924
@contextlib.contextmanager
def distribute_config_and_admin_keyring(ctx, config):
    """
    Distribute a sufficient config and keyring for clients
    """
    cluster_name = config['cluster']
    log.info('Distributing (final) config and client.admin keyring...')
    for remote, roles in ctx.cluster.remotes.items():
        teuthology.sudo_write_file(
            remote=remote,
            path='/etc/ceph/{}.conf'.format(cluster_name),
            data=ctx.ceph[cluster_name].config_file)
        teuthology.sudo_write_file(
            remote=remote,
            path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
            data=ctx.ceph[cluster_name].admin_keyring)
    try:
        yield
    finally:
        ctx.cluster.run(args=[
            'sudo', 'rm', '-f',
            '/etc/ceph/{}.conf'.format(cluster_name),
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
        ])

@contextlib.contextmanager
def crush_setup(ctx, config):
    cluster_name = config['cluster']

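    # the task config may select a tunables profile via 'crush_tunables'
    # (e.g. 'jewel'); 'default' is used otherwise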
    profile = config.get('crush_tunables', 'default')
    log.info('Setting crush tunables to %s', profile)
    _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
           args=['ceph', 'osd', 'crush', 'tunables', profile])
    yield

@contextlib.contextmanager
def task(ctx, config):
    if config is None:
        config = {}

    assert isinstance(config, dict), \
        "task only supports a dictionary for configuration"

    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph', {}))
    log.info('Config: ' + str(config))

    testdir = teuthology.get_testdir(ctx)

    # set up cluster context
    first_ceph_cluster = False
    if not hasattr(ctx, 'daemons'):
        first_ceph_cluster = True
    if not hasattr(ctx, 'ceph'):
        ctx.ceph = {}
        ctx.managers = {}
    if 'cluster' not in config:
        config['cluster'] = 'ceph'
    cluster_name = config['cluster']
    ctx.ceph[cluster_name] = argparse.Namespace()

    ctx.ceph[cluster_name].thrashers = []
    # fixme: setup watchdog, ala ceph.py

    ctx.ceph[cluster_name].roleless = False  # see below

    # cephadm mode?
    if 'cephadm_mode' not in config:
        config['cephadm_mode'] = 'root'
    assert config['cephadm_mode'] in ['root', 'cephadm-package']
    if config['cephadm_mode'] == 'root':
        ctx.cephadm = testdir + '/cephadm'
    else:
        ctx.cephadm = 'cephadm'  # in the path

    if first_ceph_cluster:
        # FIXME: this is global for all clusters
        ctx.daemons = DaemonGroup(
            use_cephadm=ctx.cephadm)

    # image
    ctx.ceph[cluster_name].image = config.get('image')
    ref = None
    if not ctx.ceph[cluster_name].image:
        sha1 = config.get('sha1')
        if sha1:
            ctx.ceph[cluster_name].image = 'quay.io/ceph-ci/ceph:%s' % sha1
            ref = sha1
        else:
            # hmm, fall back to branch?
            branch = config.get('branch', 'master')
            ref = branch
            ctx.ceph[cluster_name].image = 'quay.io/ceph-ci/ceph:%s' % branch
    log.info('Cluster image is %s' % ctx.ceph[cluster_name].image)

    # uuid
    fsid = str(uuid.uuid1())
    log.info('Cluster fsid is %s' % fsid)
    ctx.ceph[cluster_name].fsid = fsid

    # mon ips
    log.info('Choosing monitor IPs and ports...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [role_list for (remote, role_list) in remotes_and_roles]
    ips = [host for (host, port) in
           (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]

    if config.get('roleless', False):
        # mons will be named after hosts
        n = len(roles)
        roles = []
        first_mon = None
        for remote, _ in remotes_and_roles:
            roles.append(['mon.' + remote.shortname])
            if not first_mon:
                first_mon = remote.shortname
                bootstrap_remote = remote
        log.info('No roles; fabricating mons %s' % roles)

    ctx.ceph[cluster_name].mons = get_mons(
        roles, ips, cluster_name,
        mon_bind_msgr2=config.get('mon_bind_msgr2', True),
        mon_bind_addrvec=config.get('mon_bind_addrvec', True),
    )
    log.info('Monitor IPs: %s' % ctx.ceph[cluster_name].mons)

    if config.get('roleless', False):
        ctx.ceph[cluster_name].roleless = True
        ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
        ctx.ceph[cluster_name].first_mon = first_mon
        ctx.ceph[cluster_name].first_mon_role = 'mon.' + first_mon
    else:
        first_mon_role = sorted(ctx.ceph[cluster_name].mons.keys())[0]
        _, _, first_mon = teuthology.split_role(first_mon_role)
        (bootstrap_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
        log.info('First mon is mon.%s on %s' % (first_mon,
                                                bootstrap_remote.shortname))
        ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
        ctx.ceph[cluster_name].first_mon = first_mon
        ctx.ceph[cluster_name].first_mon_role = first_mon_role

        others = ctx.cluster.remotes[bootstrap_remote]
        mgrs = sorted([r for r in others
                       if teuthology.is_type('mgr', cluster_name)(r)])
        if not mgrs:
            raise RuntimeError('no mgrs on the same host as first mon %s' % first_mon)
        _, _, first_mgr = teuthology.split_role(mgrs[0])
        log.info('First mgr is %s' % (first_mgr))
        ctx.ceph[cluster_name].first_mgr = first_mgr


    with contextutil.nested(
            lambda: ceph_initial(),
            lambda: normalize_hostnames(ctx=ctx),
            lambda: download_cephadm(ctx=ctx, config=config, ref=ref),
            lambda: ceph_log(ctx=ctx, config=config),
            lambda: ceph_crash(ctx=ctx, config=config),
            lambda: ceph_bootstrap(ctx=ctx, config=config),
            lambda: crush_setup(ctx=ctx, config=config),
            lambda: ceph_mons(ctx=ctx, config=config),
            lambda: distribute_config_and_admin_keyring(ctx=ctx, config=config),
            lambda: ceph_mgrs(ctx=ctx, config=config),
            lambda: ceph_osds(ctx=ctx, config=config),
            lambda: ceph_mdss(ctx=ctx, config=config),
            lambda: ceph_rgw(ctx=ctx, config=config),
            lambda: ceph_monitoring('prometheus', ctx=ctx, config=config),
            lambda: ceph_monitoring('node-exporter', ctx=ctx, config=config),
            lambda: ceph_monitoring('alertmanager', ctx=ctx, config=config),
            lambda: ceph_monitoring('grafana', ctx=ctx, config=config),
            lambda: ceph_clients(ctx=ctx, config=config),
    ):
        ctx.managers[cluster_name] = CephManager(
            ctx.ceph[cluster_name].bootstrap_remote,
            ctx=ctx,
            logger=log.getChild('ceph_manager.' + cluster_name),
            cluster=cluster_name,
            cephadm=True,
        )

        try:
            if config.get('wait-for-healthy', True):
                healthy(ctx=ctx, config=config)

            log.info('Setup complete, yielding')
            yield

        finally:
            log.info('Teardown begin')