ceph/qa/tasks/cephadm.py

   1 """
   2 Ceph cluster task, deployed via cephadm orchestrator
   3 """
   4 import argparse
   5 import configobj
   6 import contextlib
   7 import logging
   8 import os
   9 import json
  10 import re
  11 import uuid
  12 import yaml
  13
  14 from copy import deepcopy
  15 from io import BytesIO, StringIO
  16 from tarfile import ReadError
  17 from tasks.ceph_manager import CephManager
  18 from teuthology import misc as teuthology
  19 from teuthology import contextutil
  20 from teuthology.orchestra import run
  21 from teuthology.orchestra.daemon import DaemonGroup
  22 from teuthology.config import config as teuth_config
  23 from textwrap import dedent
  24 from tasks.cephfs.filesystem import MDSCluster, Filesystem
  25
  26 # these items we use from ceph.py should probably eventually move elsewhere
  27 from tasks.ceph import get_mons, healthy
  28 from tasks.vip import subst_vip
  29
  30 CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw', 'prometheus']
  31
  32 log = logging.getLogger(__name__)
  33
  34
  35 def _shell(ctx, cluster_name, remote, args, extra_cephadm_args=[], **kwargs):
  36     teuthology.get_testdir(ctx)
  37     return remote.run(
  38         args=[
  39             'sudo',
  40             ctx.cephadm,
  41             '--image', ctx.ceph[cluster_name].image,
  42             'shell',
  43             '-c', '/etc/ceph/{}.conf'.format(cluster_name),
  44             '-k', '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
  45             '--fsid', ctx.ceph[cluster_name].fsid,
  46             ] + extra_cephadm_args + [
  47             '--',
  48             ] + args,
  49         **kwargs
  50     )
  51
  52
  53 def build_initial_config(ctx, config):
  54     cluster_name = config['cluster']
  55
  56     path = os.path.join(os.path.dirname(__file__), 'cephadm.conf')
  57     conf = configobj.ConfigObj(path, file_error=True)
  58
  59     conf.setdefault('global', {})
  60     conf['global']['fsid'] = ctx.ceph[cluster_name].fsid
  61
  62     # overrides
  63     for section, keys in config.get('conf',{}).items():
  64         for key, value in keys.items():
  65             log.info(" override: [%s] %s = %s" % (section, key, value))
  66             if section not in conf:
  67                 conf[section] = {}
  68             conf[section][key] = value
  69
  70     return conf
  71
  72
  73 def distribute_iscsi_gateway_cfg(ctx, conf_data):
  74     """
  75     Distribute common gateway config to get the IPs.
  76     These will help in iscsi clients with finding trusted_ip_list.
  77     """
  78     log.info('Distributing iscsi-gateway.cfg...')
  79     for remote, roles in ctx.cluster.remotes.items():
  80         remote.write_file(
  81             path='/etc/ceph/iscsi-gateway.cfg',
  82             data=conf_data,
  83             sudo=True)
  84
  85 def update_archive_setting(ctx, key, value):
  86     """
  87     Add logs directory to job's info log file
  88     """
  89     if ctx.archive is None:
  90         return
  91     with open(os.path.join(ctx.archive, 'info.yaml'), 'r+') as info_file:
  92         info_yaml = yaml.safe_load(info_file)
  93         info_file.seek(0)
  94         if 'archive' in info_yaml:
  95             info_yaml['archive'][key] = value
  96         else:
  97             info_yaml['archive'] = {key: value}
  98         yaml.safe_dump(info_yaml, info_file, default_flow_style=False)
  99
 100
 101 @contextlib.contextmanager
 102 def normalize_hostnames(ctx):
 103     """
 104     Ensure we have short hostnames throughout, for consistency between
 105     remote.shortname and socket.gethostname() in cephadm.
 106     """
 107     log.info('Normalizing hostnames...')
 108     ctx.cluster.run(args=[
 109         'sudo',
 110         'hostname',
 111         run.Raw('$(hostname -s)'),
 112     ])
 113
 114     try:
 115         yield
 116     finally:
 117         pass
 118
 119
 120 @contextlib.contextmanager
 121 def download_cephadm(ctx, config, ref):
 122     cluster_name = config['cluster']
 123
 124     if config.get('cephadm_mode') != 'cephadm-package':
 125         ref = config.get('cephadm_branch', ref)
 126         git_url = config.get('cephadm_git_url', teuth_config.get_ceph_git_url())
 127         log.info('Downloading cephadm (repo %s ref %s)...' % (git_url, ref))
 128         if ctx.config.get('redhat'):
 129             log.info("Install cephadm using RPM")
 130             # cephadm already installed from redhat.install task
 131             ctx.cluster.run(
 132                 args=[
 133                     'cp',
 134                     run.Raw('$(which cephadm)'),
 135                     ctx.cephadm,
 136                     run.Raw('&&'),
 137                     'ls', '-l',
 138                     ctx.cephadm,
 139                 ]
 140             )
 141         elif git_url.startswith('https://github.com/'):
 142             # git archive doesn't like https:// URLs, which we use with github.
 143             rest = git_url.split('https://github.com/', 1)[1]
 144             rest = re.sub(r'\.git/?$', '', rest).strip() # no .git suffix
 145             ctx.cluster.run(
 146                 args=[
 147                     'curl', '--silent',
 148                     'https://raw.githubusercontent.com/' + rest + '/' + ref + '/src/cephadm/cephadm',
 149                     run.Raw('>'),
 150                     ctx.cephadm,
 151                     run.Raw('&&'),
 152                     'ls', '-l',
 153                     ctx.cephadm,
 154                 ],
 155             )
 156         else:
 157             ctx.cluster.run(
 158                 args=[
 159                     'git', 'archive',
 160                     '--remote=' + git_url,
 161                     ref,
 162                     'src/cephadm/cephadm',
 163                     run.Raw('|'),
 164                     'tar', '-xO', 'src/cephadm/cephadm',
 165                     run.Raw('>'),
 166                     ctx.cephadm,
 167                 ],
 168             )
 169         # sanity-check the resulting file and set executable bit
 170         cephadm_file_size = '$(stat -c%s {})'.format(ctx.cephadm)
 171         ctx.cluster.run(
 172             args=[
 173                 'test', '-s', ctx.cephadm,
 174                 run.Raw('&&'),
 175                 'test', run.Raw(cephadm_file_size), "-gt", run.Raw('1000'),
 176                 run.Raw('&&'),
 177                 'chmod', '+x', ctx.cephadm,
 178             ],
 179         )
 180
 181     try:
 182         yield
 183     finally:
 184         log.info('Removing cluster...')
 185         ctx.cluster.run(args=[
 186             'sudo',
 187             ctx.cephadm,
 188             'rm-cluster',
 189             '--fsid', ctx.ceph[cluster_name].fsid,
 190             '--force',
 191         ])
 192
 193         if config.get('cephadm_mode') == 'root':
 194             log.info('Removing cephadm ...')
 195             ctx.cluster.run(
 196                 args=[
 197                     'rm',
 198                     '-rf',
 199                     ctx.cephadm,
 200                 ],
 201             )
 202
 203
 204 @contextlib.contextmanager
 205 def ceph_log(ctx, config):
 206     cluster_name = config['cluster']
 207     fsid = ctx.ceph[cluster_name].fsid
 208
 209     update_archive_setting(ctx, 'log', '/var/log/ceph')
 210
 211
 212     try:
 213         yield
 214
 215     except Exception:
 216         # we need to know this below
 217         ctx.summary['success'] = False
 218         raise
 219
 220     finally:
 221         log.info('Checking cluster log for badness...')
 222         def first_in_ceph_log(pattern, excludes):
 223             """
 224             Find the first occurrence of the pattern specified in the Ceph log,
 225             Returns None if none found.
 226
 227             :param pattern: Pattern scanned for.
 228             :param excludes: Patterns to ignore.
 229             :return: First line of text (or None if not found)
 230             """
 231             args = [
 232                 'sudo',
 233                 'egrep', pattern,
 234                 '/var/log/ceph/{fsid}/ceph.log'.format(
 235                     fsid=fsid),
 236             ]
 237             if excludes:
 238                 for exclude in excludes:
 239                     args.extend([run.Raw('|'), 'egrep', '-v', exclude])
 240             args.extend([
 241                 run.Raw('|'), 'head', '-n', '1',
 242             ])
 243             r = ctx.ceph[cluster_name].bootstrap_remote.run(
 244                 stdout=StringIO(),
 245                 args=args,
 246             )
 247             stdout = r.stdout.getvalue()
 248             if stdout != '':
 249                 return stdout
 250             return None
 251
 252         if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
 253                              config.get('log-ignorelist')) is not None:
 254             log.warning('Found errors (ERR|WRN|SEC) in cluster log')
 255             ctx.summary['success'] = False
 256             # use the most severe problem as the failure reason
 257             if 'failure_reason' not in ctx.summary:
 258                 for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
 259                     match = first_in_ceph_log(pattern, config['log-ignorelist'])
 260                     if match is not None:
 261                         ctx.summary['failure_reason'] = \
 262                             '"{match}" in cluster log'.format(
 263                                 match=match.rstrip('\n'),
 264                             )
 265                         break
 266
 267         if ctx.archive is not None and \
 268                 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
 269             # and logs
 270             log.info('Compressing logs...')
 271             run.wait(
 272                 ctx.cluster.run(
 273                     args=[
 274                         'sudo',
 275                         'find',
 276                         '/var/log/ceph',   # all logs, not just for the cluster
 277                         '/var/log/rbd-target-api', # ceph-iscsi
 278                         '-name',
 279                         '*.log',
 280                         '-print0',
 281                         run.Raw('|'),
 282                         'sudo',
 283                         'xargs',
 284                         '-0',
 285                         '--no-run-if-empty',
 286                         '--',
 287                         'gzip',
 288                         '--',
 289                     ],
 290                     wait=False,
 291                 ),
 292             )
 293
 294             log.info('Archiving logs...')
 295             path = os.path.join(ctx.archive, 'remote')
 296             try:
 297                 os.makedirs(path)
 298             except OSError:
 299                 pass
 300             for remote in ctx.cluster.remotes.keys():
 301                 sub = os.path.join(path, remote.shortname)
 302                 try:
 303                     os.makedirs(sub)
 304                 except OSError:
 305                     pass
 306                 try:
 307                     teuthology.pull_directory(remote, '/var/log/ceph',  # everything
 308                                               os.path.join(sub, 'log'))
 309                 except ReadError:
 310                     pass
 311
 312
 313 @contextlib.contextmanager
 314 def ceph_crash(ctx, config):
 315     """
 316     Gather crash dumps from /var/lib/ceph/$fsid/crash
 317     """
 318     cluster_name = config['cluster']
 319     fsid = ctx.ceph[cluster_name].fsid
 320
 321     update_archive_setting(ctx, 'crash', '/var/lib/ceph/crash')
 322
 323     try:
 324         yield
 325
 326     finally:
 327         if ctx.archive is not None:
 328             log.info('Archiving crash dumps...')
 329             path = os.path.join(ctx.archive, 'remote')
 330             try:
 331                 os.makedirs(path)
 332             except OSError:
 333                 pass
 334             for remote in ctx.cluster.remotes.keys():
 335                 sub = os.path.join(path, remote.shortname)
 336                 try:
 337                     os.makedirs(sub)
 338                 except OSError:
 339                     pass
 340                 try:
 341                     teuthology.pull_directory(remote,
 342                                               '/var/lib/ceph/%s/crash' % fsid,
 343                                               os.path.join(sub, 'crash'))
 344                 except ReadError:
 345                     pass
 346
 347
 348 @contextlib.contextmanager
 349 def pull_image(ctx, config):
 350     cluster_name = config['cluster']
 351     log.info(f'Pulling image {ctx.ceph[cluster_name].image} on all hosts...')
 352     run.wait(
 353         ctx.cluster.run(
 354             args=[
 355                 'sudo',
 356                 ctx.cephadm,
 357                 '--image', ctx.ceph[cluster_name].image,
 358                 'pull',
 359             ],
 360             wait=False,
 361         )
 362     )
 363
 364     try:
 365         yield
 366     finally:
 367         pass
 368
 369
 370 @contextlib.contextmanager
 371 def ceph_bootstrap(ctx, config):
 372     """
 373     Bootstrap ceph cluster.
 374
 375     :param ctx: the argparse.Namespace object
 376     :param config: the config dict
 377     """
 378     cluster_name = config['cluster']
 379     testdir = teuthology.get_testdir(ctx)
 380     fsid = ctx.ceph[cluster_name].fsid
 381
 382     bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
 383     first_mon = ctx.ceph[cluster_name].first_mon
 384     first_mon_role = ctx.ceph[cluster_name].first_mon_role
 385     mons = ctx.ceph[cluster_name].mons
 386
 387     ctx.cluster.run(args=[
 388         'sudo', 'mkdir', '-p', '/etc/ceph',
 389         ]);
 390     ctx.cluster.run(args=[
 391         'sudo', 'chmod', '777', '/etc/ceph',
 392         ]);
 393     try:
 394         # write seed config
 395         log.info('Writing seed config...')
 396         conf_fp = BytesIO()
 397         seed_config = build_initial_config(ctx, config)
 398         seed_config.write(conf_fp)
 399         bootstrap_remote.write_file(
 400             path='{}/seed.{}.conf'.format(testdir, cluster_name),
 401             data=conf_fp.getvalue())
 402         log.debug('Final config:\n' + conf_fp.getvalue().decode())
 403         ctx.ceph[cluster_name].conf = seed_config
 404
 405         # register initial daemons
 406         ctx.daemons.register_daemon(
 407             bootstrap_remote, 'mon', first_mon,
 408             cluster=cluster_name,
 409             fsid=fsid,
 410             logger=log.getChild('mon.' + first_mon),
 411             wait=False,
 412             started=True,
 413         )
 414         if not ctx.ceph[cluster_name].roleless:
 415             first_mgr = ctx.ceph[cluster_name].first_mgr
 416             ctx.daemons.register_daemon(
 417                 bootstrap_remote, 'mgr', first_mgr,
 418                 cluster=cluster_name,
 419                 fsid=fsid,
 420                 logger=log.getChild('mgr.' + first_mgr),
 421                 wait=False,
 422                 started=True,
 423             )
 424
 425         # bootstrap
 426         log.info('Bootstrapping...')
 427         cmd = [
 428             'sudo',
 429             ctx.cephadm,
 430             '--image', ctx.ceph[cluster_name].image,
 431             '-v',
 432             'bootstrap',
 433             '--fsid', fsid,
 434             '--config', '{}/seed.{}.conf'.format(testdir, cluster_name),
 435             '--output-config', '/etc/ceph/{}.conf'.format(cluster_name),
 436             '--output-keyring',
 437             '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
 438             '--output-pub-ssh-key', '{}/{}.pub'.format(testdir, cluster_name),
 439         ]
 440
 441         if config.get('registry-login'):
 442             registry = config['registry-login']
 443             cmd += [
 444                 "--registry-url", registry['url'],
 445                 "--registry-username", registry['username'],
 446                 "--registry-password", registry['password'],
 447             ]
 448
 449         if not ctx.ceph[cluster_name].roleless:
 450             cmd += [
 451                 '--mon-id', first_mon,
 452                 '--mgr-id', first_mgr,
 453                 '--orphan-initial-daemons',   # we will do it explicitly!
 454                 '--skip-monitoring-stack',    # we'll provision these explicitly
 455             ]
 456
 457         if mons[first_mon_role].startswith('['):
 458             cmd += ['--mon-addrv', mons[first_mon_role]]
 459         else:
 460             cmd += ['--mon-ip', mons[first_mon_role]]
 461         if config.get('skip_dashboard'):
 462             cmd += ['--skip-dashboard']
 463         if config.get('skip_monitoring_stack'):
 464             cmd += ['--skip-monitoring-stack']
 465         if config.get('single_host_defaults'):
 466             cmd += ['--single-host-defaults']
 467         if not config.get('avoid_pacific_features', False):
 468             cmd += ['--skip-admin-label']
 469         # bootstrap makes the keyring root 0600, so +r it for our purposes
 470         cmd += [
 471             run.Raw('&&'),
 472             'sudo', 'chmod', '+r',
 473             '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
 474         ]
 475         bootstrap_remote.run(args=cmd)
 476
 477         # fetch keys and configs
 478         log.info('Fetching config...')
 479         ctx.ceph[cluster_name].config_file = \
 480             bootstrap_remote.read_file(f'/etc/ceph/{cluster_name}.conf')
 481         log.info('Fetching client.admin keyring...')
 482         ctx.ceph[cluster_name].admin_keyring = \
 483             bootstrap_remote.read_file(f'/etc/ceph/{cluster_name}.client.admin.keyring')
 484         log.info('Fetching mon keyring...')
 485         ctx.ceph[cluster_name].mon_keyring = \
 486             bootstrap_remote.read_file(f'/var/lib/ceph/{fsid}/mon.{first_mon}/keyring', sudo=True)
 487
 488         # fetch ssh key, distribute to additional nodes
 489         log.info('Fetching pub ssh key...')
 490         ssh_pub_key = bootstrap_remote.read_file(
 491             f'{testdir}/{cluster_name}.pub').decode('ascii').strip()
 492
 493         log.info('Installing pub ssh key for root users...')
 494         ctx.cluster.run(args=[
 495             'sudo', 'install', '-d', '-m', '0700', '/root/.ssh',
 496             run.Raw('&&'),
 497             'echo', ssh_pub_key,
 498             run.Raw('|'),
 499             'sudo', 'tee', '-a', '/root/.ssh/authorized_keys',
 500             run.Raw('&&'),
 501             'sudo', 'chmod', '0600', '/root/.ssh/authorized_keys',
 502         ])
 503
 504         # set options
 505         if config.get('allow_ptrace', True):
 506             _shell(ctx, cluster_name, bootstrap_remote,
 507                    ['ceph', 'config', 'set', 'mgr', 'mgr/cephadm/allow_ptrace', 'true'])
 508
 509         if not config.get('avoid_pacific_features', False):
 510             log.info('Distributing conf and client.admin keyring to all hosts + 0755')
 511             _shell(ctx, cluster_name, bootstrap_remote,
 512                    ['ceph', 'orch', 'client-keyring', 'set', 'client.admin',
 513                     '*', '--mode', '0755'],
 514                    check_status=False)
 515
 516         # add other hosts
 517         for remote in ctx.cluster.remotes.keys():
 518             if remote == bootstrap_remote:
 519                 continue
 520
 521             # note: this may be redundant (see above), but it avoids
 522             # us having to wait for cephadm to do it.
 523             log.info('Writing (initial) conf and keyring to %s' % remote.shortname)
 524             remote.write_file(
 525                 path='/etc/ceph/{}.conf'.format(cluster_name),
 526                 data=ctx.ceph[cluster_name].config_file)
 527             remote.write_file(
 528                 path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
 529                 data=ctx.ceph[cluster_name].admin_keyring)
 530
 531             log.info('Adding host %s to orchestrator...' % remote.shortname)
 532             _shell(ctx, cluster_name, bootstrap_remote, [
 533                 'ceph', 'orch', 'host', 'add',
 534                 remote.shortname
 535             ])
 536             r = _shell(ctx, cluster_name, bootstrap_remote,
 537                        ['ceph', 'orch', 'host', 'ls', '--format=json'],
 538                        stdout=StringIO())
 539             hosts = [node['hostname'] for node in json.loads(r.stdout.getvalue())]
 540             assert remote.shortname in hosts
 541
 542         yield
 543
 544     finally:
 545         log.info('Cleaning up testdir ceph.* files...')
 546         ctx.cluster.run(args=[
 547             'rm', '-f',
 548             '{}/seed.{}.conf'.format(testdir, cluster_name),
 549             '{}/{}.pub'.format(testdir, cluster_name),
 550         ])
 551
 552         log.info('Stopping all daemons...')
 553
 554         # this doesn't block until they are all stopped...
 555         #ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
 556
 557         # stop the daemons we know
 558         for role in ctx.daemons.resolve_role_list(None, CEPH_ROLE_TYPES, True):
 559             cluster, type_, id_ = teuthology.split_role(role)
 560             try:
 561                 ctx.daemons.get_daemon(type_, id_, cluster).stop()
 562             except Exception:
 563                 log.exception(f'Failed to stop "{role}"')
 564                 raise
 565
 566         # tear down anything left (but leave the logs behind)
 567         ctx.cluster.run(
 568             args=[
 569                 'sudo',
 570                 ctx.cephadm,
 571                 'rm-cluster',
 572                 '--fsid', fsid,
 573                 '--force',
 574                 '--keep-logs',
 575             ],
 576             check_status=False,  # may fail if upgrading from old cephadm
 577         )
 578
 579         # clean up /etc/ceph
 580         ctx.cluster.run(args=[
 581             'sudo', 'rm', '-f',
 582             '/etc/ceph/{}.conf'.format(cluster_name),
 583             '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
 584         ])
 585
 586
 587 @contextlib.contextmanager
 588 def ceph_mons(ctx, config):
 589     """
 590     Deploy any additional mons
 591     """
 592     cluster_name = config['cluster']
 593     fsid = ctx.ceph[cluster_name].fsid
 594
 595     try:
 596         daemons = {}
 597         if config.get('add_mons_via_daemon_add'):
 598             # This is the old way of adding mons that works with the (early) octopus
 599             # cephadm scheduler.
 600             num_mons = 1
 601             for remote, roles in ctx.cluster.remotes.items():
 602                 for mon in [r for r in roles
 603                             if teuthology.is_type('mon', cluster_name)(r)]:
 604                     c_, _, id_ = teuthology.split_role(mon)
 605                     if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
 606                         continue
 607                     log.info('Adding %s on %s' % (mon, remote.shortname))
 608                     num_mons += 1
 609                     _shell(ctx, cluster_name, remote, [
 610                         'ceph', 'orch', 'daemon', 'add', 'mon',
 611                         remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_,
 612                     ])
 613                     ctx.daemons.register_daemon(
 614                         remote, 'mon', id_,
 615                         cluster=cluster_name,
 616                         fsid=fsid,
 617                         logger=log.getChild(mon),
 618                         wait=False,
 619                         started=True,
 620                     )
 621                     daemons[mon] = (remote, id_)
 622
 623                     with contextutil.safe_while(sleep=1, tries=180) as proceed:
 624                         while proceed():
 625                             log.info('Waiting for %d mons in monmap...' % (num_mons))
 626                             r = _shell(
 627                                 ctx=ctx,
 628                                 cluster_name=cluster_name,
 629                                 remote=remote,
 630                                 args=[
 631                                     'ceph', 'mon', 'dump', '-f', 'json',
 632                                 ],
 633                                 stdout=StringIO(),
 634                             )
 635                             j = json.loads(r.stdout.getvalue())
 636                             if len(j['mons']) == num_mons:
 637                                 break
 638         else:
 639             nodes = []
 640             for remote, roles in ctx.cluster.remotes.items():
 641                 for mon in [r for r in roles
 642                             if teuthology.is_type('mon', cluster_name)(r)]:
 643                     c_, _, id_ = teuthology.split_role(mon)
 644                     log.info('Adding %s on %s' % (mon, remote.shortname))
 645                     nodes.append(remote.shortname
 646                                  + ':' + ctx.ceph[cluster_name].mons[mon]
 647                                  + '=' + id_)
 648                     if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
 649                         continue
 650                     daemons[mon] = (remote, id_)
 651
 652             _shell(ctx, cluster_name, remote, [
 653                 'ceph', 'orch', 'apply', 'mon',
 654                 str(len(nodes)) + ';' + ';'.join(nodes)]
 655                    )
 656             for mgr, i in daemons.items():
 657                 remote, id_ = i
 658                 ctx.daemons.register_daemon(
 659                     remote, 'mon', id_,
 660                     cluster=cluster_name,
 661                     fsid=fsid,
 662                     logger=log.getChild(mon),
 663                     wait=False,
 664                     started=True,
 665                 )
 666
 667             with contextutil.safe_while(sleep=1, tries=180) as proceed:
 668                 while proceed():
 669                     log.info('Waiting for %d mons in monmap...' % (len(nodes)))
 670                     r = _shell(
 671                         ctx=ctx,
 672                         cluster_name=cluster_name,
 673                         remote=remote,
 674                         args=[
 675                             'ceph', 'mon', 'dump', '-f', 'json',
 676                         ],
 677                         stdout=StringIO(),
 678                     )
 679                     j = json.loads(r.stdout.getvalue())
 680                     if len(j['mons']) == len(nodes):
 681                         break
 682
 683         # refresh our (final) ceph.conf file
 684         bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
 685         log.info('Generating final ceph.conf file...')
 686         r = _shell(
 687             ctx=ctx,
 688             cluster_name=cluster_name,
 689             remote=bootstrap_remote,
 690             args=[
 691                 'ceph', 'config', 'generate-minimal-conf',
 692             ],
 693             stdout=StringIO(),
 694         )
 695         ctx.ceph[cluster_name].config_file = r.stdout.getvalue()
 696
 697         yield
 698
 699     finally:
 700         pass
 701
 702
 703 @contextlib.contextmanager
 704 def ceph_mgrs(ctx, config):
 705     """
 706     Deploy any additional mgrs
 707     """
 708     cluster_name = config['cluster']
 709     fsid = ctx.ceph[cluster_name].fsid
 710
 711     try:
 712         nodes = []
 713         daemons = {}
 714         for remote, roles in ctx.cluster.remotes.items():
 715             for mgr in [r for r in roles
 716                         if teuthology.is_type('mgr', cluster_name)(r)]:
 717                 c_, _, id_ = teuthology.split_role(mgr)
 718                 log.info('Adding %s on %s' % (mgr, remote.shortname))
 719                 nodes.append(remote.shortname + '=' + id_)
 720                 if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mgr:
 721                     continue
 722                 daemons[mgr] = (remote, id_)
 723         if nodes:
 724             _shell(ctx, cluster_name, remote, [
 725                 'ceph', 'orch', 'apply', 'mgr',
 726                 str(len(nodes)) + ';' + ';'.join(nodes)]
 727             )
 728         for mgr, i in daemons.items():
 729             remote, id_ = i
 730             ctx.daemons.register_daemon(
 731                 remote, 'mgr', id_,
 732                 cluster=cluster_name,
 733                 fsid=fsid,
 734                 logger=log.getChild(mgr),
 735                 wait=False,
 736                 started=True,
 737             )
 738
 739         yield
 740
 741     finally:
 742         pass
 743
 744
 745 @contextlib.contextmanager
 746 def ceph_osds(ctx, config):
 747     """
 748     Deploy OSDs
 749     """
 750     cluster_name = config['cluster']
 751     fsid = ctx.ceph[cluster_name].fsid
 752
 753     try:
 754         log.info('Deploying OSDs...')
 755
 756         # provision OSDs in numeric order
 757         id_to_remote = {}
 758         devs_by_remote = {}
 759         for remote, roles in ctx.cluster.remotes.items():
 760             devs_by_remote[remote] = teuthology.get_scratch_devices(remote)
 761             for osd in [r for r in roles
 762                         if teuthology.is_type('osd', cluster_name)(r)]:
 763                 _, _, id_ = teuthology.split_role(osd)
 764                 id_to_remote[int(id_)] = (osd, remote)
 765
 766         cur = 0
 767         for osd_id in sorted(id_to_remote.keys()):
 768             osd, remote = id_to_remote[osd_id]
 769             _, _, id_ = teuthology.split_role(osd)
 770             assert int(id_) == cur
 771             devs = devs_by_remote[remote]
 772             assert devs   ## FIXME ##
 773             dev = devs.pop()
 774             if all(_ in dev for _ in ('lv', 'vg')):
 775                 short_dev = dev.replace('/dev/', '')
 776             else:
 777                 short_dev = dev
 778             log.info('Deploying %s on %s with %s...' % (
 779                 osd, remote.shortname, dev))
 780             _shell(ctx, cluster_name, remote, [
 781                 'ceph-volume', 'lvm', 'zap', dev])
 782             _shell(ctx, cluster_name, remote, [
 783                 'ceph', 'orch', 'daemon', 'add', 'osd',
 784                 remote.shortname + ':' + short_dev
 785             ])
 786             ctx.daemons.register_daemon(
 787                 remote, 'osd', id_,
 788                 cluster=cluster_name,
 789                 fsid=fsid,
 790                 logger=log.getChild(osd),
 791                 wait=False,
 792                 started=True,
 793             )
 794             cur += 1
 795
 796         if cur == 0:
 797             _shell(ctx, cluster_name, remote, [
 798                 'ceph', 'orch', 'apply', 'osd', '--all-available-devices',
 799             ])
 800             # expect the number of scratch devs
 801             num_osds = sum(map(len, devs_by_remote.values()))
 802             assert num_osds
 803         else:
 804             # expect the number of OSDs we created
 805             num_osds = cur
 806
 807         log.info(f'Waiting for {num_osds} OSDs to come up...')
 808         with contextutil.safe_while(sleep=1, tries=120) as proceed:
 809             while proceed():
 810                 p = _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
 811                            ['ceph', 'osd', 'stat', '-f', 'json'], stdout=StringIO())
 812                 j = json.loads(p.stdout.getvalue())
 813                 if int(j.get('num_up_osds', 0)) == num_osds:
 814                     break;
 815
 816         if not hasattr(ctx, 'managers'):
 817             ctx.managers = {}
 818         ctx.managers[cluster_name] = CephManager(
 819             ctx.ceph[cluster_name].bootstrap_remote,
 820             ctx=ctx,
 821             logger=log.getChild('ceph_manager.' + cluster_name),
 822             cluster=cluster_name,
 823             cephadm=True,
 824         )
 825
 826         yield
 827     finally:
 828         pass
 829
 830
 831 @contextlib.contextmanager
 832 def ceph_mdss(ctx, config):
 833     """
 834     Deploy MDSss
 835     """
 836     cluster_name = config['cluster']
 837     fsid = ctx.ceph[cluster_name].fsid
 838
 839     nodes = []
 840     daemons = {}
 841     for remote, roles in ctx.cluster.remotes.items():
 842         for role in [r for r in roles
 843                     if teuthology.is_type('mds', cluster_name)(r)]:
 844             c_, _, id_ = teuthology.split_role(role)
 845             log.info('Adding %s on %s' % (role, remote.shortname))
 846             nodes.append(remote.shortname + '=' + id_)
 847             daemons[role] = (remote, id_)
 848     if nodes:
 849         _shell(ctx, cluster_name, remote, [
 850             'ceph', 'orch', 'apply', 'mds',
 851             'all',
 852             str(len(nodes)) + ';' + ';'.join(nodes)]
 853         )
 854     for role, i in daemons.items():
 855         remote, id_ = i
 856         ctx.daemons.register_daemon(
 857             remote, 'mds', id_,
 858             cluster=cluster_name,
 859             fsid=fsid,
 860             logger=log.getChild(role),
 861             wait=False,
 862             started=True,
 863         )
 864
 865     yield
 866
 867 @contextlib.contextmanager
 868 def cephfs_setup(ctx, config):
 869     mdss = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))
 870
 871     # If there are any MDSs, then create a filesystem for them to use
 872     # Do this last because requires mon cluster to be up and running
 873     if len(mdss) > 0:
 874         log.info('Setting up CephFS filesystem(s)...')
 875         cephfs_config = config.get('cephfs', {})
 876         fs_configs =  cephfs_config.pop('fs', [{'name': 'cephfs'}])
 877         set_allow_multifs = len(fs_configs) > 1
 878
 879         # wait for standbys to become available (slow due to valgrind, perhaps)
 880         mdsc = MDSCluster(ctx)
 881         with contextutil.safe_while(sleep=2,tries=150) as proceed:
 882             while proceed():
 883                 if len(mdsc.get_standby_daemons()) >= len(mdss):
 884                     break
 885
 886         fss = []
 887         for fs_config in fs_configs:
 888             assert isinstance(fs_config, dict)
 889             name = fs_config.pop('name')
 890             temp = deepcopy(cephfs_config)
 891             teuthology.deep_merge(temp, fs_config)
 892             fs = Filesystem(ctx, fs_config=temp, name=name, create=True)
 893             if set_allow_multifs:
 894                 fs.set_allow_multifs()
 895                 set_allow_multifs = False
 896             fss.append(fs)
 897
 898         yield
 899
 900         for fs in fss:
 901             fs.destroy()
 902     else:
 903         yield
 904
 905 @contextlib.contextmanager
 906 def ceph_monitoring(daemon_type, ctx, config):
 907     """
 908     Deploy prometheus, node-exporter, etc.
 909     """
 910     cluster_name = config['cluster']
 911     fsid = ctx.ceph[cluster_name].fsid
 912
 913     nodes = []
 914     daemons = {}
 915     for remote, roles in ctx.cluster.remotes.items():
 916         for role in [r for r in roles
 917                     if teuthology.is_type(daemon_type, cluster_name)(r)]:
 918             c_, _, id_ = teuthology.split_role(role)
 919             log.info('Adding %s on %s' % (role, remote.shortname))
 920             nodes.append(remote.shortname + '=' + id_)
 921             daemons[role] = (remote, id_)
 922     if nodes:
 923         _shell(ctx, cluster_name, remote, [
 924             'ceph', 'orch', 'apply', daemon_type,
 925             str(len(nodes)) + ';' + ';'.join(nodes)]
 926         )
 927     for role, i in daemons.items():
 928         remote, id_ = i
 929         ctx.daemons.register_daemon(
 930             remote, daemon_type, id_,
 931             cluster=cluster_name,
 932             fsid=fsid,
 933             logger=log.getChild(role),
 934             wait=False,
 935             started=True,
 936         )
 937
 938     yield
 939
 940
 941 @contextlib.contextmanager
 942 def ceph_rgw(ctx, config):
 943     """
 944     Deploy rgw
 945     """
 946     cluster_name = config['cluster']
 947     fsid = ctx.ceph[cluster_name].fsid
 948
 949     nodes = {}
 950     daemons = {}
 951     for remote, roles in ctx.cluster.remotes.items():
 952         for role in [r for r in roles
 953                     if teuthology.is_type('rgw', cluster_name)(r)]:
 954             c_, _, id_ = teuthology.split_role(role)
 955             log.info('Adding %s on %s' % (role, remote.shortname))
 956             svc = '.'.join(id_.split('.')[0:2])
 957             if svc not in nodes:
 958                 nodes[svc] = []
 959             nodes[svc].append(remote.shortname + '=' + id_)
 960             daemons[role] = (remote, id_)
 961
 962     for svc, nodes in nodes.items():
 963         _shell(ctx, cluster_name, remote, [
 964             'ceph', 'orch', 'apply', 'rgw', svc,
 965              '--placement',
 966              str(len(nodes)) + ';' + ';'.join(nodes)]
 967         )
 968     for role, i in daemons.items():
 969         remote, id_ = i
 970         ctx.daemons.register_daemon(
 971             remote, 'rgw', id_,
 972             cluster=cluster_name,
 973             fsid=fsid,
 974             logger=log.getChild(role),
 975             wait=False,
 976             started=True,
 977         )
 978
 979     yield
 980
 981
 982 @contextlib.contextmanager
 983 def ceph_iscsi(ctx, config):
 984     """
 985     Deploy iSCSIs
 986     """
 987     cluster_name = config['cluster']
 988     fsid = ctx.ceph[cluster_name].fsid
 989
 990     nodes = []
 991     daemons = {}
 992     ips = []
 993
 994     for remote, roles in ctx.cluster.remotes.items():
 995         for role in [r for r in roles
 996                      if teuthology.is_type('iscsi', cluster_name)(r)]:
 997             c_, _, id_ = teuthology.split_role(role)
 998             log.info('Adding %s on %s' % (role, remote.shortname))
 999             nodes.append(remote.shortname + '=' + id_)
1000             daemons[role] = (remote, id_)
1001             ips.append(remote.ip_address)
1002     trusted_ip_list = ','.join(ips)
1003     if nodes:
1004         poolname = 'datapool'
1005         # ceph osd pool create datapool 3 3 replicated
1006         _shell(ctx, cluster_name, remote, [
1007             'ceph', 'osd', 'pool', 'create',
1008             poolname, '3', '3', 'replicated']
1009         )
1010
1011         _shell(ctx, cluster_name, remote, [
1012             'rbd', 'pool', 'init', poolname]
1013         )
1014
1015         # ceph orch apply iscsi datapool (admin)user (admin)password
1016         _shell(ctx, cluster_name, remote, [
1017             'ceph', 'orch', 'apply', 'iscsi',
1018             poolname, 'admin', 'admin',
1019             '--trusted_ip_list', trusted_ip_list,
1020             '--placement', str(len(nodes)) + ';' + ';'.join(nodes)]
1021         )
1022
1023         # used by iscsi client to identify valid gateway ip's
1024         conf_data = dedent(f"""
1025         [config]
1026         trusted_ip_list = {trusted_ip_list}
1027         """)
1028         distribute_iscsi_gateway_cfg(ctx, conf_data)
1029
1030     for role, i in daemons.items():
1031         remote, id_ = i
1032         ctx.daemons.register_daemon(
1033             remote, 'iscsi', id_,
1034             cluster=cluster_name,
1035             fsid=fsid,
1036             logger=log.getChild(role),
1037             wait=False,
1038             started=True,
1039         )
1040
1041     yield
1042
1043
1044 @contextlib.contextmanager
1045 def ceph_clients(ctx, config):
1046     cluster_name = config['cluster']
1047
1048     log.info('Setting up client nodes...')
1049     clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
1050     for remote, roles_for_host in clients.remotes.items():
1051         for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
1052                                                      cluster_name):
1053             name = teuthology.ceph_role(role)
1054             client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name,
1055                                                                 name)
1056             r = _shell(
1057                 ctx=ctx,
1058                 cluster_name=cluster_name,
1059                 remote=remote,
1060                 args=[
1061                     'ceph', 'auth',
1062                     'get-or-create', name,
1063                     'mon', 'allow *',
1064                     'osd', 'allow *',
1065                     'mds', 'allow *',
1066                     'mgr', 'allow *',
1067                 ],
1068                 stdout=StringIO(),
1069             )
1070             keyring = r.stdout.getvalue()
1071             remote.sudo_write_file(client_keyring, keyring, mode='0644')
1072     yield
1073
1074
1075 @contextlib.contextmanager
1076 def ceph_initial():
1077     try:
1078         yield
1079     finally:
1080         log.info('Teardown complete')
1081
1082
1083 ## public methods
1084 @contextlib.contextmanager
1085 def stop(ctx, config):
1086     """
1087     Stop ceph daemons
1088
1089     For example::
1090       tasks:
1091       - ceph.stop: [mds.*]
1092
1093       tasks:
1094       - ceph.stop: [osd.0, osd.2]
1095
1096       tasks:
1097       - ceph.stop:
1098           daemons: [osd.0, osd.2]
1099
1100     """
1101     if config is None:
1102         config = {}
1103     elif isinstance(config, list):
1104         config = {'daemons': config}
1105
1106     daemons = ctx.daemons.resolve_role_list(
1107         config.get('daemons', None), CEPH_ROLE_TYPES, True)
1108     clusters = set()
1109
1110     for role in daemons:
1111         cluster, type_, id_ = teuthology.split_role(role)
1112         ctx.daemons.get_daemon(type_, id_, cluster).stop()
1113         clusters.add(cluster)
1114
1115 #    for cluster in clusters:
1116 #        ctx.ceph[cluster].watchdog.stop()
1117 #        ctx.ceph[cluster].watchdog.join()
1118
1119     yield
1120
1121
1122 def shell(ctx, config):
1123     """
1124     Execute (shell) commands
1125     """
1126     cluster_name = config.get('cluster', 'ceph')
1127
1128     args = []
1129     for k in config.pop('env', []):
1130         args.extend(['-e', k + '=' + ctx.config.get(k, '')])
1131     for k in config.pop('volumes', []):
1132         args.extend(['-v', k])
1133
1134     if 'all-roles' in config and len(config) == 1:
1135         a = config['all-roles']
1136         roles = teuthology.all_roles(ctx.cluster)
1137         config = dict((id_, a) for id_ in roles if not id_.startswith('host.'))
1138     elif 'all-hosts' in config and len(config) == 1:
1139         a = config['all-hosts']
1140         roles = teuthology.all_roles(ctx.cluster)
1141         config = dict((id_, a) for id_ in roles if id_.startswith('host.'))
1142
1143     for role, cmd in config.items():
1144         (remote,) = ctx.cluster.only(role).remotes.keys()
1145         log.info('Running commands on role %s host %s', role, remote.name)
1146         if isinstance(cmd, list):
1147             for c in cmd:
1148                 _shell(ctx, cluster_name, remote,
1149                        ['bash', '-c', subst_vip(ctx, c)],
1150                        extra_cephadm_args=args)
1151         else:
1152             assert isinstance(cmd, str)
1153             _shell(ctx, cluster_name, remote,
1154                    ['bash', '-ex', '-c', subst_vip(ctx, cmd)],
1155                    extra_cephadm_args=args)
1156
1157
1158 def apply(ctx, config):
1159     """
1160     Apply spec
1161
1162       tasks:
1163         - cephadm.apply:
1164             specs:
1165             - service_type: rgw
1166               service_id: foo
1167               spec:
1168                 rgw_frontend_port: 8000
1169             - service_type: rgw
1170               service_id: bar
1171               spec:
1172                 rgw_frontend_port: 9000
1173                 zone: bar
1174                 realm: asdf
1175
1176     """
1177     cluster_name = config.get('cluster', 'ceph')
1178
1179     specs = config.get('specs', [])
1180     y = subst_vip(ctx, yaml.dump_all(specs))
1181
1182     log.info(f'Applying spec(s):\n{y}')
1183     _shell(
1184         ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
1185         ['ceph', 'orch', 'apply', '-i', '-'],
1186         stdin=y,
1187     )
1188
1189
1190 def wait_for_service(ctx, config):
1191     """
1192     Wait for a service to be fully started
1193
1194       tasks:
1195         - cephadm.wait_for_service:
1196             service: rgw.foo
1197             timeout: 60    # defaults to 300
1198
1199     """
1200     cluster_name = config.get('cluster', 'ceph')
1201     timeout = config.get('timeout', 300)
1202     service = config.get('service')
1203     assert service
1204
1205     log.info(
1206         f'Waiting for {cluster_name} service {service} to start (timeout {timeout})...'
1207     )
1208     with contextutil.safe_while(sleep=1, tries=timeout) as proceed:
1209         while proceed():
1210             r = _shell(
1211                 ctx=ctx,
1212                 cluster_name=cluster_name,
1213                 remote=ctx.ceph[cluster_name].bootstrap_remote,
1214                 args=[
1215                     'ceph', 'orch', 'ls', '-f', 'json',
1216                 ],
1217                 stdout=StringIO(),
1218             )
1219             j = json.loads(r.stdout.getvalue())
1220             svc = None
1221             for s in j:
1222                 if s['service_name'] == service:
1223                     svc = s
1224                     break
1225             if svc:
1226                 log.info(
1227                     f"{service} has {s['status']['running']}/{s['status']['size']}"
1228                 )
1229                 if s['status']['running'] == s['status']['size']:
1230                     break
1231
1232
1233 @contextlib.contextmanager
1234 def tweaked_option(ctx, config):
1235     """
1236     set an option, and then restore it with its original value
1237
1238     Note, due to the way how tasks are executed/nested, it's not suggested to
1239     use this method as a standalone task. otherwise, it's likely that it will
1240     restore the tweaked option at the /end/ of 'tasks' block.
1241     """
1242     saved_options = {}
1243     # we can complicate this when necessary
1244     options = ['mon-health-to-clog']
1245     type_, id_ = 'mon', '*'
1246     cluster = config.get('cluster', 'ceph')
1247     manager = ctx.managers[cluster]
1248     if id_ == '*':
1249         get_from = next(teuthology.all_roles_of_type(ctx.cluster, type_))
1250     else:
1251         get_from = id_
1252     for option in options:
1253         if option not in config:
1254             continue
1255         value = 'true' if config[option] else 'false'
1256         option = option.replace('-', '_')
1257         old_value = manager.get_config(type_, get_from, option)
1258         if value != old_value:
1259             saved_options[option] = old_value
1260             manager.inject_args(type_, id_, option, value)
1261     yield
1262     for option, value in saved_options.items():
1263         manager.inject_args(type_, id_, option, value)
1264
1265
1266 @contextlib.contextmanager
1267 def restart(ctx, config):
1268     """
1269    restart ceph daemons
1270
1271    For example::
1272       tasks:
1273       - ceph.restart: [all]
1274
1275    For example::
1276       tasks:
1277       - ceph.restart: [osd.0, mon.1, mds.*]
1278
1279    or::
1280
1281       tasks:
1282       - ceph.restart:
1283           daemons: [osd.0, mon.1]
1284           wait-for-healthy: false
1285           wait-for-osds-up: true
1286
1287     :param ctx: Context
1288     :param config: Configuration
1289     """
1290     if config is None:
1291         config = {}
1292     elif isinstance(config, list):
1293         config = {'daemons': config}
1294
1295     daemons = ctx.daemons.resolve_role_list(
1296         config.get('daemons', None), CEPH_ROLE_TYPES, True)
1297     clusters = set()
1298
1299     log.info('daemons %s' % daemons)
1300     with tweaked_option(ctx, config):
1301         for role in daemons:
1302             cluster, type_, id_ = teuthology.split_role(role)
1303             d = ctx.daemons.get_daemon(type_, id_, cluster)
1304             assert d, 'daemon %s does not exist' % role
1305             d.stop()
1306             if type_ == 'osd':
1307                 ctx.managers[cluster].mark_down_osd(id_)
1308             d.restart()
1309             clusters.add(cluster)
1310
1311     if config.get('wait-for-healthy', True):
1312         for cluster in clusters:
1313             healthy(ctx=ctx, config=dict(cluster=cluster))
1314     if config.get('wait-for-osds-up', False):
1315         for cluster in clusters:
1316             ctx.managers[cluster].wait_for_all_osds_up()
1317     yield
1318
1319
1320 @contextlib.contextmanager
1321 def distribute_config_and_admin_keyring(ctx, config):
1322     """
1323     Distribute a sufficient config and keyring for clients
1324     """
1325     cluster_name = config['cluster']
1326     log.info('Distributing (final) config and client.admin keyring...')
1327     for remote, roles in ctx.cluster.remotes.items():
1328         remote.write_file(
1329             '/etc/ceph/{}.conf'.format(cluster_name),
1330             ctx.ceph[cluster_name].config_file,
1331             sudo=True)
1332         remote.write_file(
1333             path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
1334             data=ctx.ceph[cluster_name].admin_keyring,
1335             sudo=True)
1336     try:
1337         yield
1338     finally:
1339         ctx.cluster.run(args=[
1340             'sudo', 'rm', '-f',
1341             '/etc/ceph/{}.conf'.format(cluster_name),
1342             '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
1343         ])
1344
1345
1346 @contextlib.contextmanager
1347 def crush_setup(ctx, config):
1348     cluster_name = config['cluster']
1349
1350     profile = config.get('crush_tunables', 'default')
1351     log.info('Setting crush tunables to %s', profile)
1352     _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
1353         args=['ceph', 'osd', 'crush', 'tunables', profile])
1354     yield
1355
1356
1357 @contextlib.contextmanager
1358 def create_rbd_pool(ctx, config):
1359     if config.get('create_rbd_pool', False):
1360       cluster_name = config['cluster']
1361       log.info('Waiting for OSDs to come up')
1362       teuthology.wait_until_osds_up(
1363           ctx,
1364           cluster=ctx.cluster,
1365           remote=ctx.ceph[cluster_name].bootstrap_remote,
1366           ceph_cluster=cluster_name,
1367       )
1368       log.info('Creating RBD pool')
1369       _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
1370           args=['sudo', 'ceph', '--cluster', cluster_name,
1371                 'osd', 'pool', 'create', 'rbd', '8'])
1372       _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
1373           args=['sudo', 'ceph', '--cluster', cluster_name,
1374                 'osd', 'pool', 'application', 'enable',
1375                 'rbd', 'rbd', '--yes-i-really-mean-it'
1376           ])
1377     yield
1378
1379
1380 @contextlib.contextmanager
1381 def _bypass():
1382     yield
1383
1384
1385 @contextlib.contextmanager
1386 def initialize_config(ctx, config):
1387     cluster_name = config['cluster']
1388     testdir = teuthology.get_testdir(ctx)
1389
1390     ctx.ceph[cluster_name].thrashers = []
1391     # fixme: setup watchdog, ala ceph.py
1392
1393     ctx.ceph[cluster_name].roleless = False  # see below
1394
1395     first_ceph_cluster = False
1396     if not hasattr(ctx, 'daemons'):
1397         first_ceph_cluster = True
1398
1399     # cephadm mode?
1400     if 'cephadm_mode' not in config:
1401         config['cephadm_mode'] = 'root'
1402     assert config['cephadm_mode'] in ['root', 'cephadm-package']
1403     if config['cephadm_mode'] == 'root':
1404         ctx.cephadm = testdir + '/cephadm'
1405     else:
1406         ctx.cephadm = 'cephadm'  # in the path
1407
1408     if first_ceph_cluster:
1409         # FIXME: this is global for all clusters
1410         ctx.daemons = DaemonGroup(
1411             use_cephadm=ctx.cephadm)
1412
1413     # uuid
1414     fsid = str(uuid.uuid1())
1415     log.info('Cluster fsid is %s' % fsid)
1416     ctx.ceph[cluster_name].fsid = fsid
1417
1418     # mon ips
1419     log.info('Choosing monitor IPs and ports...')
1420     remotes_and_roles = ctx.cluster.remotes.items()
1421     ips = [host for (host, port) in
1422            (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
1423
1424     if config.get('roleless', False):
1425         # mons will be named after hosts
1426         first_mon = None
1427         max_mons = config.get('max_mons', 5)
1428         for remote, _ in remotes_and_roles:
1429             ctx.cluster.remotes[remote].append('mon.' + remote.shortname)
1430             if not first_mon:
1431                 first_mon = remote.shortname
1432                 bootstrap_remote = remote
1433             max_mons -= 1
1434             if not max_mons:
1435                 break
1436         log.info('No mon roles; fabricating mons')
1437
1438     roles = [role_list for (remote, role_list) in ctx.cluster.remotes.items()]
1439
1440     ctx.ceph[cluster_name].mons = get_mons(
1441         roles, ips, cluster_name,
1442         mon_bind_msgr2=config.get('mon_bind_msgr2', True),
1443         mon_bind_addrvec=config.get('mon_bind_addrvec', True),
1444     )
1445     log.info('Monitor IPs: %s' % ctx.ceph[cluster_name].mons)
1446
1447     if config.get('roleless', False):
1448         ctx.ceph[cluster_name].roleless = True
1449         ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
1450         ctx.ceph[cluster_name].first_mon = first_mon
1451         ctx.ceph[cluster_name].first_mon_role = 'mon.' + first_mon
1452     else:
1453         first_mon_role = sorted(ctx.ceph[cluster_name].mons.keys())[0]
1454         _, _, first_mon = teuthology.split_role(first_mon_role)
1455         (bootstrap_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
1456         log.info('First mon is mon.%s on %s' % (first_mon,
1457                                                 bootstrap_remote.shortname))
1458         ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
1459         ctx.ceph[cluster_name].first_mon = first_mon
1460         ctx.ceph[cluster_name].first_mon_role = first_mon_role
1461
1462         others = ctx.cluster.remotes[bootstrap_remote]
1463         mgrs = sorted([r for r in others
1464                        if teuthology.is_type('mgr', cluster_name)(r)])
1465         if not mgrs:
1466             raise RuntimeError('no mgrs on the same host as first mon %s' % first_mon)
1467         _, _, first_mgr = teuthology.split_role(mgrs[0])
1468         log.info('First mgr is %s' % (first_mgr))
1469         ctx.ceph[cluster_name].first_mgr = first_mgr
1470     yield
1471
1472
1473 @contextlib.contextmanager
1474 def task(ctx, config):
1475     """
1476     Deploy ceph cluster using cephadm
1477
1478     For example, teuthology.yaml can contain the 'defaults' section:
1479
1480         defaults:
1481           cephadm:
1482             containers:
1483               image: 'quay.io/ceph-ci/ceph'
1484
1485     Using overrides makes it possible to customize it per run.
1486     The equivalent 'overrides' section looks like:
1487
1488         overrides:
1489           cephadm:
1490             containers:
1491               image: 'quay.io/ceph-ci/ceph'
1492             registry-login:
1493               url:  registry-url
1494               username: registry-user
1495               password: registry-password
1496
1497     :param ctx: the argparse.Namespace object
1498     :param config: the config dict
1499     """
1500     if config is None:
1501         config = {}
1502
1503     assert isinstance(config, dict), \
1504         "task only supports a dictionary for configuration"
1505
1506     overrides = ctx.config.get('overrides', {})
1507     teuthology.deep_merge(config, overrides.get('ceph', {}))
1508     teuthology.deep_merge(config, overrides.get('cephadm', {}))
1509     log.info('Config: ' + str(config))
1510
1511     # set up cluster context
1512     if not hasattr(ctx, 'ceph'):
1513         ctx.ceph = {}
1514     if 'cluster' not in config:
1515         config['cluster'] = 'ceph'
1516     cluster_name = config['cluster']
1517     if cluster_name not in ctx.ceph:
1518         ctx.ceph[cluster_name] = argparse.Namespace()
1519         ctx.ceph[cluster_name].bootstrapped = False
1520
1521     # image
1522     teuth_defaults = teuth_config.get('defaults', {})
1523     cephadm_defaults = teuth_defaults.get('cephadm', {})
1524     containers_defaults = cephadm_defaults.get('containers', {})
1525     container_image_name = containers_defaults.get('image', None)
1526
1527     containers = config.get('containers', {})
1528     container_image_name = containers.get('image', container_image_name)
1529
1530     if not hasattr(ctx.ceph[cluster_name], 'image'):
1531         ctx.ceph[cluster_name].image = config.get('image')
1532     ref = None
1533     if not ctx.ceph[cluster_name].image:
1534         if not container_image_name:
1535             raise Exception("Configuration error occurred. "
1536                             "The 'image' value is undefined for 'cephadm' task. "
1537                             "Please provide corresponding options in the task's "
1538                             "config, task 'overrides', or teuthology 'defaults' "
1539                             "section.")
1540         sha1 = config.get('sha1')
1541         flavor = config.get('flavor', 'default')
1542
1543         if sha1:
1544             if flavor == "crimson":
1545                 ctx.ceph[cluster_name].image = container_image_name + ':' + sha1 + '-' + flavor
1546             else:
1547                 ctx.ceph[cluster_name].image = container_image_name + ':' + sha1
1548             ref = sha1
1549         else:
1550             # hmm, fall back to branch?
1551             branch = config.get('branch', 'master')
1552             ref = branch
1553             ctx.ceph[cluster_name].image = container_image_name + ':' + branch
1554     log.info('Cluster image is %s' % ctx.ceph[cluster_name].image)
1555
1556
1557     with contextutil.nested(
1558             #if the cluster is already bootstrapped bypass corresponding methods
1559             lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
1560                               else initialize_config(ctx=ctx, config=config),
1561             lambda: ceph_initial(),
1562             lambda: normalize_hostnames(ctx=ctx),
1563             lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
1564                               else download_cephadm(ctx=ctx, config=config, ref=ref),
1565             lambda: ceph_log(ctx=ctx, config=config),
1566             lambda: ceph_crash(ctx=ctx, config=config),
1567             lambda: pull_image(ctx=ctx, config=config),
1568             lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
1569                               else ceph_bootstrap(ctx, config),
1570             lambda: crush_setup(ctx=ctx, config=config),
1571             lambda: ceph_mons(ctx=ctx, config=config),
1572             lambda: distribute_config_and_admin_keyring(ctx=ctx, config=config),
1573             lambda: ceph_mgrs(ctx=ctx, config=config),
1574             lambda: ceph_osds(ctx=ctx, config=config),
1575             lambda: ceph_mdss(ctx=ctx, config=config),
1576             lambda: cephfs_setup(ctx=ctx, config=config),
1577             lambda: ceph_rgw(ctx=ctx, config=config),
1578             lambda: ceph_iscsi(ctx=ctx, config=config),
1579             lambda: ceph_monitoring('prometheus', ctx=ctx, config=config),
1580             lambda: ceph_monitoring('node-exporter', ctx=ctx, config=config),
1581             lambda: ceph_monitoring('alertmanager', ctx=ctx, config=config),
1582             lambda: ceph_monitoring('grafana', ctx=ctx, config=config),
1583             lambda: ceph_clients(ctx=ctx, config=config),
1584             lambda: create_rbd_pool(ctx=ctx, config=config),
1585     ):
1586         try:
1587             if config.get('wait-for-healthy', True):
1588                 healthy(ctx=ctx, config=config)
1589
1590             log.info('Setup complete, yielding')
1591             yield
1592
1593         finally:
1594             log.info('Teardown begin')
1595