ceph/qa/tasks/cephadm.py
1 """
2 Ceph cluster task, deployed via cephadm orchestrator
3 """
4 import argparse
5 import configobj
6 import contextlib
7 import logging
8 import os
9 import json
10 import re
11 import uuid
12 import yaml
13
14 from copy import deepcopy
15 from io import BytesIO, StringIO
16 from tarfile import ReadError
17 from tasks.ceph_manager import CephManager
18 from teuthology import misc as teuthology
19 from teuthology import contextutil
20 from teuthology import packaging
21 from teuthology.orchestra import run
22 from teuthology.orchestra.daemon import DaemonGroup
23 from teuthology.config import config as teuth_config
24 from textwrap import dedent
25 from tasks.cephfs.filesystem import MDSCluster, Filesystem
26 from tasks.util import chacra
27
28 # these items we use from ceph.py should probably eventually move elsewhere
29 from tasks.ceph import get_mons, healthy
30 from tasks.vip import subst_vip
31
32 CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw', 'prometheus']
33
34 log = logging.getLogger(__name__)
35
36
37 def _shell(ctx, cluster_name, remote, args, extra_cephadm_args=[], **kwargs):
38 teuthology.get_testdir(ctx)
39 return remote.run(
40 args=[
41 'sudo',
42 ctx.cephadm,
43 '--image', ctx.ceph[cluster_name].image,
44 'shell',
45 '-c', '/etc/ceph/{}.conf'.format(cluster_name),
46 '-k', '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
47 '--fsid', ctx.ceph[cluster_name].fsid,
48 ] + extra_cephadm_args + [
49 '--',
50 ] + args,
51 **kwargs
52 )
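# Illustrative note (hypothetical values, not from this file): a call like
#     _shell(ctx, 'ceph', remote, ['ceph', 'orch', 'ps'])
# ends up running roughly:
#     sudo <ctx.cephadm> --image <image> shell \
#         -c /etc/ceph/ceph.conf -k /etc/ceph/ceph.client.admin.keyring \
#         --fsid <fsid> -- ceph orch ps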
53
54
55 def build_initial_config(ctx, config):
56 cluster_name = config['cluster']
57
58 path = os.path.join(os.path.dirname(__file__), 'cephadm.conf')
59 conf = configobj.ConfigObj(path, file_error=True)
60
61 conf.setdefault('global', {})
62 conf['global']['fsid'] = ctx.ceph[cluster_name].fsid
63
64 # overrides
65 for section, keys in config.get('conf',{}).items():
66 for key, value in keys.items():
67 log.info(" override: [%s] %s = %s" % (section, key, value))
68 if section not in conf:
69 conf[section] = {}
70 conf[section][key] = value
71
72 return conf
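# For illustration (the override key below is an assumption, not from this file),
# a task config such as:
#     conf:
#       global:
#         osd_pool_default_size: 1
# is merged on top of cephadm.conf and ends up as
# "[global] osd_pool_default_size = 1" in the seed config written at bootstrap.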
73
74
75 def distribute_iscsi_gateway_cfg(ctx, conf_data):
76 """
77 Distribute a common gateway config containing the gateway IPs.
78 This helps iSCSI clients find the trusted_ip_list.
79 """
80 log.info('Distributing iscsi-gateway.cfg...')
81 for remote, roles in ctx.cluster.remotes.items():
82 remote.write_file(
83 path='/etc/ceph/iscsi-gateway.cfg',
84 data=conf_data,
85 sudo=True)
86
87 def update_archive_setting(ctx, key, value):
88 """
89 Record an archive setting (e.g. the log or crash directory) in the job's info.yaml file
90 """
91 if ctx.archive is None:
92 return
93 with open(os.path.join(ctx.archive, 'info.yaml'), 'r+') as info_file:
94 info_yaml = yaml.safe_load(info_file)
95 info_file.seek(0)
96 if 'archive' in info_yaml:
97 info_yaml['archive'][key] = value
98 else:
99 info_yaml['archive'] = {key: value}
100 yaml.safe_dump(info_yaml, info_file, default_flow_style=False)
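# For example, update_archive_setting(ctx, 'log', '/var/log/ceph') leaves an
# entry roughly like the following in the job's info.yaml (illustrative):
#     archive:
#       log: /var/log/ceph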
101
102
103 @contextlib.contextmanager
104 def normalize_hostnames(ctx):
105 """
106 Ensure we have short hostnames throughout, for consistency between
107 remote.shortname and socket.gethostname() in cephadm.
108 """
109 log.info('Normalizing hostnames...')
110 cluster = ctx.cluster.filter(lambda r: '.' in r.hostname)
111 cluster.run(args=[
112 'sudo',
113 'hostname',
114 run.Raw('$(hostname -s)'),
115 ])
116
117 try:
118 yield
119 finally:
120 pass
121
122
123 @contextlib.contextmanager
124 def download_cephadm(ctx, config, ref):
125 cluster_name = config['cluster']
126
127 if config.get('cephadm_mode') != 'cephadm-package':
128 if ctx.config.get('redhat'):
129 _fetch_cephadm_from_rpm(ctx)
130 # TODO: come up with a sensible way to detect if we need an "old, uncompiled"
131 # cephadm
132 elif 'cephadm_git_url' in config and 'cephadm_branch' in config:
133 _fetch_cephadm_from_github(ctx, config, ref)
134 else:
135 _fetch_cephadm_from_chachra(ctx, config, cluster_name)
136
137 try:
138 yield
139 finally:
140 _rm_cluster(ctx, cluster_name)
141 if config.get('cephadm_mode') == 'root':
142 _rm_cephadm(ctx)
143
144
145 def _fetch_cephadm_from_rpm(ctx):
146 log.info("Copying cephadm installed from an RPM package")
147 # cephadm already installed from redhat.install task
148 ctx.cluster.run(
149 args=[
150 'cp',
151 run.Raw('$(which cephadm)'),
152 ctx.cephadm,
153 run.Raw('&&'),
154 'ls', '-l',
155 ctx.cephadm,
156 ]
157 )
158
159
160 def _fetch_cephadm_from_github(ctx, config, ref):
161 ref = config.get('cephadm_branch', ref)
162 git_url = config.get('cephadm_git_url', teuth_config.get_ceph_git_url())
163 log.info('Downloading cephadm (repo %s ref %s)...' % (git_url, ref))
164 if git_url.startswith('https://github.com/'):
165 # git archive doesn't like https:// URLs, which we use with github.
166 rest = git_url.split('https://github.com/', 1)[1]
167 rest = re.sub(r'\.git/?$', '', rest).strip() # no .git suffix
168 ctx.cluster.run(
169 args=[
170 'curl', '--silent',
171 'https://raw.githubusercontent.com/' + rest + '/' + ref + '/src/cephadm/cephadm',
172 run.Raw('>'),
173 ctx.cephadm,
174 run.Raw('&&'),
175 'ls', '-l',
176 ctx.cephadm,
177 ],
178 )
179 else:
180 ctx.cluster.run(
181 args=[
182 'git', 'clone', git_url, 'testrepo',
183 run.Raw('&&'),
184 'cd', 'testrepo',
185 run.Raw('&&'),
186 'git', 'show', f'{ref}:src/cephadm/cephadm',
187 run.Raw('>'),
188 ctx.cephadm,
189 run.Raw('&&'),
190 'ls', '-l', ctx.cephadm,
191 ],
192 )
193 # sanity-check the resulting file and set executable bit
194 cephadm_file_size = '$(stat -c%s {})'.format(ctx.cephadm)
195 ctx.cluster.run(
196 args=[
197 'test', '-s', ctx.cephadm,
198 run.Raw('&&'),
199 'test', run.Raw(cephadm_file_size), "-gt", run.Raw('1000'),
200 run.Raw('&&'),
201 'chmod', '+x', ctx.cephadm,
202 ],
203 )
204
205
206 def _fetch_cephadm_from_chachra(ctx, config, cluster_name):
207 log.info('Downloading "compiled" cephadm from chacra')
208 bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
209 bp = packaging.get_builder_project()(
210 config.get('project', 'ceph'),
211 config,
212 ctx=ctx,
213 remote=bootstrap_remote,
214 )
215 log.info('builder_project result: %s' % (bp._result.json()))
216
217 flavor = config.get('flavor', 'default')
218 branch = config.get('branch')
219 sha1 = config.get('sha1')
220
221 # pull the cephadm binary from chacra
222 url = chacra.get_binary_url(
223 'cephadm',
224 project=bp.project,
225 distro=bp.distro.split('/')[0],
226 release=bp.distro.split('/')[1],
227 arch=bp.arch,
228 flavor=flavor,
229 branch=branch,
230 sha1=sha1,
231 )
232 log.info("Discovered cachra url: %s", url)
233 ctx.cluster.run(
234 args=[
235 'curl', '--silent', '-L', url,
236 run.Raw('>'),
237 ctx.cephadm,
238 run.Raw('&&'),
239 'ls', '-l',
240 ctx.cephadm,
241 ],
242 )
243
244 # sanity-check the resulting file and set executable bit
245 cephadm_file_size = '$(stat -c%s {})'.format(ctx.cephadm)
246 ctx.cluster.run(
247 args=[
248 'test', '-s', ctx.cephadm,
249 run.Raw('&&'),
250 'test', run.Raw(cephadm_file_size), "-gt", run.Raw('1000'),
251 run.Raw('&&'),
252 'chmod', '+x', ctx.cephadm,
253 ],
254 )
255
256
257 def _rm_cluster(ctx, cluster_name):
258 log.info('Removing cluster...')
259 ctx.cluster.run(args=[
260 'sudo',
261 ctx.cephadm,
262 'rm-cluster',
263 '--fsid', ctx.ceph[cluster_name].fsid,
264 '--force',
265 ])
266
267
268 def _rm_cephadm(ctx):
269 log.info('Removing cephadm ...')
270 ctx.cluster.run(
271 args=[
272 'rm',
273 '-rf',
274 ctx.cephadm,
275 ],
276 )
277
278
279 @contextlib.contextmanager
280 def ceph_log(ctx, config):
281 cluster_name = config['cluster']
282 fsid = ctx.ceph[cluster_name].fsid
283
284 update_archive_setting(ctx, 'log', '/var/log/ceph')
285
286
287 try:
288 yield
289
290 except Exception:
291 # we need to know this below
292 ctx.summary['success'] = False
293 raise
294
295 finally:
296 log.info('Checking cluster log for badness...')
297 def first_in_ceph_log(pattern, excludes):
298 """
299 Find the first occurrence of the specified pattern in the Ceph log.
300 Returns None if none is found.
301
302 :param pattern: Pattern scanned for.
303 :param excludes: Patterns to ignore.
304 :return: First line of text (or None if not found)
305 """
306 args = [
307 'sudo',
308 'egrep', pattern,
309 '/var/log/ceph/{fsid}/ceph.log'.format(
310 fsid=fsid),
311 ]
312 if excludes:
313 for exclude in excludes:
314 args.extend([run.Raw('|'), 'egrep', '-v', exclude])
315 args.extend([
316 run.Raw('|'), 'head', '-n', '1',
317 ])
318 r = ctx.ceph[cluster_name].bootstrap_remote.run(
319 stdout=StringIO(),
320 args=args,
321 )
322 stdout = r.stdout.getvalue()
323 if stdout != '':
324 return stdout
325 return None
326
327 if first_in_ceph_log(r'\[ERR\]|\[WRN\]|\[SEC\]',
328 config.get('log-ignorelist')) is not None:
329 log.warning('Found errors (ERR|WRN|SEC) in cluster log')
330 ctx.summary['success'] = False
331 # use the most severe problem as the failure reason
332 if 'failure_reason' not in ctx.summary:
333 for pattern in [r'\[SEC\]', r'\[ERR\]', r'\[WRN\]']:
334 match = first_in_ceph_log(pattern, config.get('log-ignorelist'))
335 if match is not None:
336 ctx.summary['failure_reason'] = \
337 '"{match}" in cluster log'.format(
338 match=match.rstrip('\n'),
339 )
340 break
341
342 if ctx.archive is not None and \
343 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
344 # and logs
345 log.info('Compressing logs...')
346 run.wait(
347 ctx.cluster.run(
348 args=[
349 'sudo',
350 'find',
351 '/var/log/ceph', # all logs, not just for the cluster
352 '/var/log/rbd-target-api', # ceph-iscsi
353 '-name',
354 '*.log',
355 '-print0',
356 run.Raw('|'),
357 'sudo',
358 'xargs',
359 '-0',
360 '--no-run-if-empty',
361 '--',
362 'gzip',
363 '--',
364 ],
365 wait=False,
366 ),
367 )
368
369 log.info('Archiving logs...')
370 path = os.path.join(ctx.archive, 'remote')
371 try:
372 os.makedirs(path)
373 except OSError:
374 pass
375 for remote in ctx.cluster.remotes.keys():
376 sub = os.path.join(path, remote.shortname)
377 try:
378 os.makedirs(sub)
379 except OSError:
380 pass
381 try:
382 teuthology.pull_directory(remote, '/var/log/ceph', # everything
383 os.path.join(sub, 'log'))
384 except ReadError:
385 pass
386
387
388 @contextlib.contextmanager
389 def ceph_crash(ctx, config):
390 """
391 Gather crash dumps from /var/lib/ceph/$fsid/crash
392 """
393 cluster_name = config['cluster']
394 fsid = ctx.ceph[cluster_name].fsid
395
396 update_archive_setting(ctx, 'crash', '/var/lib/ceph/crash')
397
398 try:
399 yield
400
401 finally:
402 if ctx.archive is not None:
403 log.info('Archiving crash dumps...')
404 path = os.path.join(ctx.archive, 'remote')
405 try:
406 os.makedirs(path)
407 except OSError:
408 pass
409 for remote in ctx.cluster.remotes.keys():
410 sub = os.path.join(path, remote.shortname)
411 try:
412 os.makedirs(sub)
413 except OSError:
414 pass
415 try:
416 teuthology.pull_directory(remote,
417 '/var/lib/ceph/%s/crash' % fsid,
418 os.path.join(sub, 'crash'))
419 except ReadError:
420 pass
421
422
423 @contextlib.contextmanager
424 def pull_image(ctx, config):
425 cluster_name = config['cluster']
426 log.info(f'Pulling image {ctx.ceph[cluster_name].image} on all hosts...')
427 run.wait(
428 ctx.cluster.run(
429 args=[
430 'sudo',
431 ctx.cephadm,
432 '--image', ctx.ceph[cluster_name].image,
433 'pull',
434 ],
435 wait=False,
436 )
437 )
438
439 try:
440 yield
441 finally:
442 pass
443
444
445 @contextlib.contextmanager
446 def ceph_bootstrap(ctx, config):
447 """
448 Bootstrap ceph cluster.
449
450 :param ctx: the argparse.Namespace object
451 :param config: the config dict
452 """
453 cluster_name = config['cluster']
454 testdir = teuthology.get_testdir(ctx)
455 fsid = ctx.ceph[cluster_name].fsid
456
457 bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
458 first_mon = ctx.ceph[cluster_name].first_mon
459 first_mon_role = ctx.ceph[cluster_name].first_mon_role
460 mons = ctx.ceph[cluster_name].mons
461
462 ctx.cluster.run(args=[
463 'sudo', 'mkdir', '-p', '/etc/ceph',
464 ])
465 ctx.cluster.run(args=[
466 'sudo', 'chmod', '777', '/etc/ceph',
467 ])
468 try:
469 # write seed config
470 log.info('Writing seed config...')
471 conf_fp = BytesIO()
472 seed_config = build_initial_config(ctx, config)
473 seed_config.write(conf_fp)
474 bootstrap_remote.write_file(
475 path='{}/seed.{}.conf'.format(testdir, cluster_name),
476 data=conf_fp.getvalue())
477 log.debug('Final config:\n' + conf_fp.getvalue().decode())
478 ctx.ceph[cluster_name].conf = seed_config
479
480 # register initial daemons
481 ctx.daemons.register_daemon(
482 bootstrap_remote, 'mon', first_mon,
483 cluster=cluster_name,
484 fsid=fsid,
485 logger=log.getChild('mon.' + first_mon),
486 wait=False,
487 started=True,
488 )
489 if not ctx.ceph[cluster_name].roleless:
490 first_mgr = ctx.ceph[cluster_name].first_mgr
491 ctx.daemons.register_daemon(
492 bootstrap_remote, 'mgr', first_mgr,
493 cluster=cluster_name,
494 fsid=fsid,
495 logger=log.getChild('mgr.' + first_mgr),
496 wait=False,
497 started=True,
498 )
499
500 # bootstrap
501 log.info('Bootstrapping...')
502 cmd = [
503 'sudo',
504 ctx.cephadm,
505 '--image', ctx.ceph[cluster_name].image,
506 '-v',
507 'bootstrap',
508 '--fsid', fsid,
509 '--config', '{}/seed.{}.conf'.format(testdir, cluster_name),
510 '--output-config', '/etc/ceph/{}.conf'.format(cluster_name),
511 '--output-keyring',
512 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
513 '--output-pub-ssh-key', '{}/{}.pub'.format(testdir, cluster_name),
514 ]
515 if config.get("no_cgroups_split") is True:
516 cmd.insert(cmd.index("bootstrap"), "--no-cgroups-split")
517
518 if config.get('registry-login'):
519 registry = config['registry-login']
520 cmd += [
521 "--registry-url", registry['url'],
522 "--registry-username", registry['username'],
523 "--registry-password", registry['password'],
524 ]
525
526 if not ctx.ceph[cluster_name].roleless:
527 cmd += [
528 '--mon-id', first_mon,
529 '--mgr-id', first_mgr,
530 '--orphan-initial-daemons', # we will do it explicitly!
531 '--skip-monitoring-stack', # we'll provision these explicitly
532 ]
533
534 if mons[first_mon_role].startswith('['):
535 cmd += ['--mon-addrv', mons[first_mon_role]]
536 else:
537 cmd += ['--mon-ip', mons[first_mon_role]]
538 if config.get('skip_dashboard'):
539 cmd += ['--skip-dashboard']
540 if config.get('skip_monitoring_stack'):
541 cmd += ['--skip-monitoring-stack']
542 if config.get('single_host_defaults'):
543 cmd += ['--single-host-defaults']
544 if not config.get('avoid_pacific_features', False):
545 cmd += ['--skip-admin-label']
546 # bootstrap makes the keyring root 0600, so +r it for our purposes
547 cmd += [
548 run.Raw('&&'),
549 'sudo', 'chmod', '+r',
550 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
551 ]
552 bootstrap_remote.run(args=cmd)
553
554 # fetch keys and configs
555 log.info('Fetching config...')
556 ctx.ceph[cluster_name].config_file = \
557 bootstrap_remote.read_file(f'/etc/ceph/{cluster_name}.conf')
558 log.info('Fetching client.admin keyring...')
559 ctx.ceph[cluster_name].admin_keyring = \
560 bootstrap_remote.read_file(f'/etc/ceph/{cluster_name}.client.admin.keyring')
561 log.info('Fetching mon keyring...')
562 ctx.ceph[cluster_name].mon_keyring = \
563 bootstrap_remote.read_file(f'/var/lib/ceph/{fsid}/mon.{first_mon}/keyring', sudo=True)
564
565 # fetch ssh key, distribute to additional nodes
566 log.info('Fetching pub ssh key...')
567 ssh_pub_key = bootstrap_remote.read_file(
568 f'{testdir}/{cluster_name}.pub').decode('ascii').strip()
569
570 log.info('Installing pub ssh key for root users...')
571 ctx.cluster.run(args=[
572 'sudo', 'install', '-d', '-m', '0700', '/root/.ssh',
573 run.Raw('&&'),
574 'echo', ssh_pub_key,
575 run.Raw('|'),
576 'sudo', 'tee', '-a', '/root/.ssh/authorized_keys',
577 run.Raw('&&'),
578 'sudo', 'chmod', '0600', '/root/.ssh/authorized_keys',
579 ])
580
581 # set options
582 if config.get('allow_ptrace', True):
583 _shell(ctx, cluster_name, bootstrap_remote,
584 ['ceph', 'config', 'set', 'mgr', 'mgr/cephadm/allow_ptrace', 'true'])
585
586 if not config.get('avoid_pacific_features', False):
587 log.info('Distributing conf and client.admin keyring to all hosts + 0755')
588 _shell(ctx, cluster_name, bootstrap_remote,
589 ['ceph', 'orch', 'client-keyring', 'set', 'client.admin',
590 '*', '--mode', '0755'],
591 check_status=False)
592
593 # add other hosts
594 for remote in ctx.cluster.remotes.keys():
595 if remote == bootstrap_remote:
596 continue
597
598 # note: this may be redundant (see above), but it avoids
599 # us having to wait for cephadm to do it.
600 log.info('Writing (initial) conf and keyring to %s' % remote.shortname)
601 remote.write_file(
602 path='/etc/ceph/{}.conf'.format(cluster_name),
603 data=ctx.ceph[cluster_name].config_file)
604 remote.write_file(
605 path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
606 data=ctx.ceph[cluster_name].admin_keyring)
607
608 log.info('Adding host %s to orchestrator...' % remote.shortname)
609 _shell(ctx, cluster_name, bootstrap_remote, [
610 'ceph', 'orch', 'host', 'add',
611 remote.shortname
612 ])
613 r = _shell(ctx, cluster_name, bootstrap_remote,
614 ['ceph', 'orch', 'host', 'ls', '--format=json'],
615 stdout=StringIO())
616 hosts = [node['hostname'] for node in json.loads(r.stdout.getvalue())]
617 assert remote.shortname in hosts
618
619 yield
620
621 finally:
622 log.info('Cleaning up testdir ceph.* files...')
623 ctx.cluster.run(args=[
624 'rm', '-f',
625 '{}/seed.{}.conf'.format(testdir, cluster_name),
626 '{}/{}.pub'.format(testdir, cluster_name),
627 ])
628
629 log.info('Stopping all daemons...')
630
631 # this doesn't block until they are all stopped...
632 #ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
633
634 # stop the daemons we know
635 for role in ctx.daemons.resolve_role_list(None, CEPH_ROLE_TYPES, True):
636 cluster, type_, id_ = teuthology.split_role(role)
637 try:
638 ctx.daemons.get_daemon(type_, id_, cluster).stop()
639 except Exception:
640 log.exception(f'Failed to stop "{role}"')
641 raise
642
643 # tear down anything left (but leave the logs behind)
644 ctx.cluster.run(
645 args=[
646 'sudo',
647 ctx.cephadm,
648 'rm-cluster',
649 '--fsid', fsid,
650 '--force',
651 '--keep-logs',
652 ],
653 check_status=False, # may fail if upgrading from old cephadm
654 )
655
656 # clean up /etc/ceph
657 ctx.cluster.run(args=[
658 'sudo', 'rm', '-f',
659 '/etc/ceph/{}.conf'.format(cluster_name),
660 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
661 ])
662
663
664 @contextlib.contextmanager
665 def ceph_mons(ctx, config):
666 """
667 Deploy any additional mons
668 """
669 cluster_name = config['cluster']
670 fsid = ctx.ceph[cluster_name].fsid
671
672 try:
673 daemons = {}
674 if config.get('add_mons_via_daemon_add'):
675 # This is the old way of adding mons that works with the (early) octopus
676 # cephadm scheduler.
677 num_mons = 1
678 for remote, roles in ctx.cluster.remotes.items():
679 for mon in [r for r in roles
680 if teuthology.is_type('mon', cluster_name)(r)]:
681 c_, _, id_ = teuthology.split_role(mon)
682 if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
683 continue
684 log.info('Adding %s on %s' % (mon, remote.shortname))
685 num_mons += 1
686 _shell(ctx, cluster_name, remote, [
687 'ceph', 'orch', 'daemon', 'add', 'mon',
688 remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_,
689 ])
690 ctx.daemons.register_daemon(
691 remote, 'mon', id_,
692 cluster=cluster_name,
693 fsid=fsid,
694 logger=log.getChild(mon),
695 wait=False,
696 started=True,
697 )
698 daemons[mon] = (remote, id_)
699
700 with contextutil.safe_while(sleep=1, tries=180) as proceed:
701 while proceed():
702 log.info('Waiting for %d mons in monmap...' % (num_mons))
703 r = _shell(
704 ctx=ctx,
705 cluster_name=cluster_name,
706 remote=remote,
707 args=[
708 'ceph', 'mon', 'dump', '-f', 'json',
709 ],
710 stdout=StringIO(),
711 )
712 j = json.loads(r.stdout.getvalue())
713 if len(j['mons']) == num_mons:
714 break
715 else:
716 nodes = []
717 for remote, roles in ctx.cluster.remotes.items():
718 for mon in [r for r in roles
719 if teuthology.is_type('mon', cluster_name)(r)]:
720 c_, _, id_ = teuthology.split_role(mon)
721 log.info('Adding %s on %s' % (mon, remote.shortname))
722 nodes.append(remote.shortname
723 + ':' + ctx.ceph[cluster_name].mons[mon]
724 + '=' + id_)
725 if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
726 continue
727 daemons[mon] = (remote, id_)
728
729 _shell(ctx, cluster_name, remote, [
730 'ceph', 'orch', 'apply', 'mon',
731 str(len(nodes)) + ';' + ';'.join(nodes)]
732 )
733 for mon, i in daemons.items():
734 remote, id_ = i
735 ctx.daemons.register_daemon(
736 remote, 'mon', id_,
737 cluster=cluster_name,
738 fsid=fsid,
739 logger=log.getChild(mon),
740 wait=False,
741 started=True,
742 )
743
744 with contextutil.safe_while(sleep=1, tries=180) as proceed:
745 while proceed():
746 log.info('Waiting for %d mons in monmap...' % (len(nodes)))
747 r = _shell(
748 ctx=ctx,
749 cluster_name=cluster_name,
750 remote=remote,
751 args=[
752 'ceph', 'mon', 'dump', '-f', 'json',
753 ],
754 stdout=StringIO(),
755 )
756 j = json.loads(r.stdout.getvalue())
757 if len(j['mons']) == len(nodes):
758 break
759
760 # refresh our (final) ceph.conf file
761 bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
762 log.info('Generating final ceph.conf file...')
763 r = _shell(
764 ctx=ctx,
765 cluster_name=cluster_name,
766 remote=bootstrap_remote,
767 args=[
768 'ceph', 'config', 'generate-minimal-conf',
769 ],
770 stdout=StringIO(),
771 )
772 ctx.ceph[cluster_name].config_file = r.stdout.getvalue()
773
774 yield
775
776 finally:
777 pass
778
779
780 @contextlib.contextmanager
781 def ceph_mgrs(ctx, config):
782 """
783 Deploy any additional mgrs
784 """
785 cluster_name = config['cluster']
786 fsid = ctx.ceph[cluster_name].fsid
787
788 try:
789 nodes = []
790 daemons = {}
791 for remote, roles in ctx.cluster.remotes.items():
792 for mgr in [r for r in roles
793 if teuthology.is_type('mgr', cluster_name)(r)]:
794 c_, _, id_ = teuthology.split_role(mgr)
795 log.info('Adding %s on %s' % (mgr, remote.shortname))
796 nodes.append(remote.shortname + '=' + id_)
797 if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mgr:
798 continue
799 daemons[mgr] = (remote, id_)
800 if nodes:
801 _shell(ctx, cluster_name, remote, [
802 'ceph', 'orch', 'apply', 'mgr',
803 str(len(nodes)) + ';' + ';'.join(nodes)]
804 )
805 for mgr, i in daemons.items():
806 remote, id_ = i
807 ctx.daemons.register_daemon(
808 remote, 'mgr', id_,
809 cluster=cluster_name,
810 fsid=fsid,
811 logger=log.getChild(mgr),
812 wait=False,
813 started=True,
814 )
815
816 yield
817
818 finally:
819 pass
820
821
822 @contextlib.contextmanager
823 def ceph_osds(ctx, config):
824 """
825 Deploy OSDs
826 """
827 cluster_name = config['cluster']
828 fsid = ctx.ceph[cluster_name].fsid
829
830 try:
831 log.info('Deploying OSDs...')
832
833 # provision OSDs in numeric order
834 id_to_remote = {}
835 devs_by_remote = {}
836 for remote, roles in ctx.cluster.remotes.items():
837 devs_by_remote[remote] = teuthology.get_scratch_devices(remote)
838 for osd in [r for r in roles
839 if teuthology.is_type('osd', cluster_name)(r)]:
840 _, _, id_ = teuthology.split_role(osd)
841 id_to_remote[int(id_)] = (osd, remote)
842
843 cur = 0
844 for osd_id in sorted(id_to_remote.keys()):
845 osd, remote = id_to_remote[osd_id]
846 _, _, id_ = teuthology.split_role(osd)
847 assert int(id_) == cur
848 devs = devs_by_remote[remote]
849 assert devs ## FIXME ##
850 dev = devs.pop()
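# descriptive note: when the scratch device looks like an LVM path (its name
# contains both 'vg' and 'lv'), the '/dev/' prefix is dropped, presumably so
# the device is handed to 'ceph orch daemon add osd' in '<vg>/<lv>' notation;
# plain block devices are passed through unchanged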
851 if all(_ in dev for _ in ('lv', 'vg')):
852 short_dev = dev.replace('/dev/', '')
853 else:
854 short_dev = dev
855 log.info('Deploying %s on %s with %s...' % (
856 osd, remote.shortname, dev))
857 _shell(ctx, cluster_name, remote, [
858 'ceph-volume', 'lvm', 'zap', dev])
859 add_osd_args = ['ceph', 'orch', 'daemon', 'add', 'osd',
860 remote.shortname + ':' + short_dev]
861 osd_method = config.get('osd_method')
862 if osd_method:
863 add_osd_args.append(osd_method)
864 _shell(ctx, cluster_name, remote, add_osd_args)
865 ctx.daemons.register_daemon(
866 remote, 'osd', id_,
867 cluster=cluster_name,
868 fsid=fsid,
869 logger=log.getChild(osd),
870 wait=False,
871 started=True,
872 )
873 cur += 1
874
875 if cur == 0:
876 _shell(ctx, cluster_name, remote, [
877 'ceph', 'orch', 'apply', 'osd', '--all-available-devices',
878 ])
879 # expect the number of scratch devs
880 num_osds = sum(map(len, devs_by_remote.values()))
881 assert num_osds
882 else:
883 # expect the number of OSDs we created
884 num_osds = cur
885
886 log.info(f'Waiting for {num_osds} OSDs to come up...')
887 with contextutil.safe_while(sleep=1, tries=120) as proceed:
888 while proceed():
889 p = _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
890 ['ceph', 'osd', 'stat', '-f', 'json'], stdout=StringIO())
891 j = json.loads(p.stdout.getvalue())
892 if int(j.get('num_up_osds', 0)) == num_osds:
893 break
894
895 if not hasattr(ctx, 'managers'):
896 ctx.managers = {}
897 ctx.managers[cluster_name] = CephManager(
898 ctx.ceph[cluster_name].bootstrap_remote,
899 ctx=ctx,
900 logger=log.getChild('ceph_manager.' + cluster_name),
901 cluster=cluster_name,
902 cephadm=True,
903 )
904
905 yield
906 finally:
907 pass
908
909
910 @contextlib.contextmanager
911 def ceph_mdss(ctx, config):
912 """
913 Deploy MDSs
914 """
915 cluster_name = config['cluster']
916 fsid = ctx.ceph[cluster_name].fsid
917
918 nodes = []
919 daemons = {}
920 for remote, roles in ctx.cluster.remotes.items():
921 for role in [r for r in roles
922 if teuthology.is_type('mds', cluster_name)(r)]:
923 c_, _, id_ = teuthology.split_role(role)
924 log.info('Adding %s on %s' % (role, remote.shortname))
925 nodes.append(remote.shortname + '=' + id_)
926 daemons[role] = (remote, id_)
927 if nodes:
928 _shell(ctx, cluster_name, remote, [
929 'ceph', 'orch', 'apply', 'mds',
930 'all',
931 str(len(nodes)) + ';' + ';'.join(nodes)]
932 )
933 for role, i in daemons.items():
934 remote, id_ = i
935 ctx.daemons.register_daemon(
936 remote, 'mds', id_,
937 cluster=cluster_name,
938 fsid=fsid,
939 logger=log.getChild(role),
940 wait=False,
941 started=True,
942 )
943
944 yield
945
946 @contextlib.contextmanager
947 def cephfs_setup(ctx, config):
948 mdss = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))
949
950 # If there are any MDSs, then create a filesystem for them to use
951 # Do this last because it requires the mon cluster to be up and running
952 if len(mdss) > 0:
953 log.info('Setting up CephFS filesystem(s)...')
954 cephfs_config = config.get('cephfs', {})
955 fs_configs = cephfs_config.pop('fs', [{'name': 'cephfs'}])
956 set_allow_multifs = len(fs_configs) > 1
957
958 # wait for standbys to become available (slow due to valgrind, perhaps)
959 mdsc = MDSCluster(ctx)
960 with contextutil.safe_while(sleep=2,tries=150) as proceed:
961 while proceed():
962 if len(mdsc.get_standby_daemons()) >= len(mdss):
963 break
964
965 fss = []
966 for fs_config in fs_configs:
967 assert isinstance(fs_config, dict)
968 name = fs_config.pop('name')
969 temp = deepcopy(cephfs_config)
970 teuthology.deep_merge(temp, fs_config)
971 subvols = config.get('subvols', None)
972 if subvols:
973 teuthology.deep_merge(temp, {'subvols': subvols})
974 fs = Filesystem(ctx, fs_config=temp, name=name, create=True)
975 if set_allow_multifs:
976 fs.set_allow_multifs()
977 set_allow_multifs = False
978 fss.append(fs)
979
980 yield
981
982 for fs in fss:
983 fs.destroy()
984 else:
985 yield
986
987 @contextlib.contextmanager
988 def ceph_monitoring(daemon_type, ctx, config):
989 """
990 Deploy prometheus, node-exporter, etc.
991 """
992 cluster_name = config['cluster']
993 fsid = ctx.ceph[cluster_name].fsid
994
995 nodes = []
996 daemons = {}
997 for remote, roles in ctx.cluster.remotes.items():
998 for role in [r for r in roles
999 if teuthology.is_type(daemon_type, cluster_name)(r)]:
1000 c_, _, id_ = teuthology.split_role(role)
1001 log.info('Adding %s on %s' % (role, remote.shortname))
1002 nodes.append(remote.shortname + '=' + id_)
1003 daemons[role] = (remote, id_)
1004 if nodes:
1005 _shell(ctx, cluster_name, remote, [
1006 'ceph', 'orch', 'apply', daemon_type,
1007 str(len(nodes)) + ';' + ';'.join(nodes)]
1008 )
1009 for role, i in daemons.items():
1010 remote, id_ = i
1011 ctx.daemons.register_daemon(
1012 remote, daemon_type, id_,
1013 cluster=cluster_name,
1014 fsid=fsid,
1015 logger=log.getChild(role),
1016 wait=False,
1017 started=True,
1018 )
1019
1020 yield
1021
1022
1023 @contextlib.contextmanager
1024 def ceph_rgw(ctx, config):
1025 """
1026 Deploy rgw
1027 """
1028 cluster_name = config['cluster']
1029 fsid = ctx.ceph[cluster_name].fsid
1030
1031 nodes = {}
1032 daemons = {}
1033 for remote, roles in ctx.cluster.remotes.items():
1034 for role in [r for r in roles
1035 if teuthology.is_type('rgw', cluster_name)(r)]:
1036 c_, _, id_ = teuthology.split_role(role)
1037 log.info('Adding %s on %s' % (role, remote.shortname))
1038 svc = '.'.join(id_.split('.')[0:2])
1039 if svc not in nodes:
1040 nodes[svc] = []
1041 nodes[svc].append(remote.shortname + '=' + id_)
1042 daemons[role] = (remote, id_)
1043
1044 for svc, nodes in nodes.items():
1045 _shell(ctx, cluster_name, remote, [
1046 'ceph', 'orch', 'apply', 'rgw', svc,
1047 '--placement',
1048 str(len(nodes)) + ';' + ';'.join(nodes)]
1049 )
1050 for role, i in daemons.items():
1051 remote, id_ = i
1052 ctx.daemons.register_daemon(
1053 remote, 'rgw', id_,
1054 cluster=cluster_name,
1055 fsid=fsid,
1056 logger=log.getChild(role),
1057 wait=False,
1058 started=True,
1059 )
1060
1061 yield
1062
1063
1064 @contextlib.contextmanager
1065 def ceph_iscsi(ctx, config):
1066 """
1067 Deploy iSCSI gateways
1068 """
1069 cluster_name = config['cluster']
1070 fsid = ctx.ceph[cluster_name].fsid
1071
1072 nodes = []
1073 daemons = {}
1074 ips = []
1075
1076 for remote, roles in ctx.cluster.remotes.items():
1077 for role in [r for r in roles
1078 if teuthology.is_type('iscsi', cluster_name)(r)]:
1079 c_, _, id_ = teuthology.split_role(role)
1080 log.info('Adding %s on %s' % (role, remote.shortname))
1081 nodes.append(remote.shortname + '=' + id_)
1082 daemons[role] = (remote, id_)
1083 ips.append(remote.ip_address)
1084 trusted_ip_list = ','.join(ips)
1085 if nodes:
1086 poolname = 'datapool'
1087 # ceph osd pool create datapool 3 3 replicated
1088 _shell(ctx, cluster_name, remote, [
1089 'ceph', 'osd', 'pool', 'create',
1090 poolname, '3', '3', 'replicated']
1091 )
1092
1093 _shell(ctx, cluster_name, remote, [
1094 'rbd', 'pool', 'init', poolname]
1095 )
1096
1097 # ceph orch apply iscsi datapool (admin)user (admin)password
1098 _shell(ctx, cluster_name, remote, [
1099 'ceph', 'orch', 'apply', 'iscsi',
1100 poolname, 'admin', 'admin',
1101 '--trusted_ip_list', trusted_ip_list,
1102 '--placement', str(len(nodes)) + ';' + ';'.join(nodes)]
1103 )
1104
1105 # used by iSCSI clients to identify valid gateway IPs
1106 conf_data = dedent(f"""
1107 [config]
1108 trusted_ip_list = {trusted_ip_list}
1109 """)
1110 distribute_iscsi_gateway_cfg(ctx, conf_data)
1111
1112 for role, i in daemons.items():
1113 remote, id_ = i
1114 ctx.daemons.register_daemon(
1115 remote, 'iscsi', id_,
1116 cluster=cluster_name,
1117 fsid=fsid,
1118 logger=log.getChild(role),
1119 wait=False,
1120 started=True,
1121 )
1122
1123 yield
1124
1125
1126 @contextlib.contextmanager
1127 def ceph_clients(ctx, config):
1128 cluster_name = config['cluster']
1129
1130 log.info('Setting up client nodes...')
1131 clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
1132 for remote, roles_for_host in clients.remotes.items():
1133 for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
1134 cluster_name):
1135 name = teuthology.ceph_role(role)
1136 client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name,
1137 name)
1138 r = _shell(
1139 ctx=ctx,
1140 cluster_name=cluster_name,
1141 remote=remote,
1142 args=[
1143 'ceph', 'auth',
1144 'get-or-create', name,
1145 'mon', 'allow *',
1146 'osd', 'allow *',
1147 'mds', 'allow *',
1148 'mgr', 'allow *',
1149 ],
1150 stdout=StringIO(),
1151 )
1152 keyring = r.stdout.getvalue()
1153 remote.sudo_write_file(client_keyring, keyring, mode='0644')
1154 yield
1155
1156
1157 @contextlib.contextmanager
1158 def ceph_initial():
1159 try:
1160 yield
1161 finally:
1162 log.info('Teardown complete')
1163
1164
1165 ## public methods
1166 @contextlib.contextmanager
1167 def stop(ctx, config):
1168 """
1169 Stop ceph daemons
1170
1171 For example::
1172 tasks:
1173 - ceph.stop: [mds.*]
1174
1175 tasks:
1176 - ceph.stop: [osd.0, osd.2]
1177
1178 tasks:
1179 - ceph.stop:
1180 daemons: [osd.0, osd.2]
1181
1182 """
1183 if config is None:
1184 config = {}
1185 elif isinstance(config, list):
1186 config = {'daemons': config}
1187
1188 daemons = ctx.daemons.resolve_role_list(
1189 config.get('daemons', None), CEPH_ROLE_TYPES, True)
1190 clusters = set()
1191
1192 for role in daemons:
1193 cluster, type_, id_ = teuthology.split_role(role)
1194 ctx.daemons.get_daemon(type_, id_, cluster).stop()
1195 clusters.add(cluster)
1196
1197 # for cluster in clusters:
1198 # ctx.ceph[cluster].watchdog.stop()
1199 # ctx.ceph[cluster].watchdog.join()
1200
1201 yield
1202
1203
1204 def shell(ctx, config):
1205 """
1206 Execute (shell) commands
1207 """
1208 cluster_name = config.get('cluster', 'ceph')
1209
1210 args = []
1211 for k in config.pop('env', []):
1212 args.extend(['-e', k + '=' + ctx.config.get(k, '')])
1213 for k in config.pop('volumes', []):
1214 args.extend(['-v', k])
1215
1216 if 'all-roles' in config and len(config) == 1:
1217 a = config['all-roles']
1218 roles = teuthology.all_roles(ctx.cluster)
1219 config = dict((id_, a) for id_ in roles if not id_.startswith('host.'))
1220 elif 'all-hosts' in config and len(config) == 1:
1221 a = config['all-hosts']
1222 roles = teuthology.all_roles(ctx.cluster)
1223 config = dict((id_, a) for id_ in roles if id_.startswith('host.'))
1224
1225 for role, cmd in config.items():
1226 (remote,) = ctx.cluster.only(role).remotes.keys()
1227 log.info('Running commands on role %s host %s', role, remote.name)
1228 if isinstance(cmd, list):
1229 for c in cmd:
1230 _shell(ctx, cluster_name, remote,
1231 ['bash', '-c', subst_vip(ctx, c)],
1232 extra_cephadm_args=args)
1233 else:
1234 assert isinstance(cmd, str)
1235 _shell(ctx, cluster_name, remote,
1236 ['bash', '-ex', '-c', subst_vip(ctx, cmd)],
1237 extra_cephadm_args=args)
1238
1239
1240 def apply(ctx, config):
1241 """
1242 Apply spec
1243
1244 tasks:
1245 - cephadm.apply:
1246 specs:
1247 - service_type: rgw
1248 service_id: foo
1249 spec:
1250 rgw_frontend_port: 8000
1251 - service_type: rgw
1252 service_id: bar
1253 spec:
1254 rgw_frontend_port: 9000
1255 zone: bar
1256 realm: asdf
1257
1258 """
1259 cluster_name = config.get('cluster', 'ceph')
1260
1261 specs = config.get('specs', [])
1262 y = subst_vip(ctx, yaml.dump_all(specs))
1263
1264 log.info(f'Applying spec(s):\n{y}')
1265 _shell(
1266 ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
1267 ['ceph', 'orch', 'apply', '-i', '-'],
1268 stdin=y,
1269 )
1270
1271
1272 def wait_for_service(ctx, config):
1273 """
1274 Wait for a service to be fully started
1275
1276 tasks:
1277 - cephadm.wait_for_service:
1278 service: rgw.foo
1279 timeout: 60 # defaults to 300
1280
1281 """
1282 cluster_name = config.get('cluster', 'ceph')
1283 timeout = config.get('timeout', 300)
1284 service = config.get('service')
1285 assert service
1286
1287 log.info(
1288 f'Waiting for {cluster_name} service {service} to start (timeout {timeout})...'
1289 )
1290 with contextutil.safe_while(sleep=1, tries=timeout) as proceed:
1291 while proceed():
1292 r = _shell(
1293 ctx=ctx,
1294 cluster_name=cluster_name,
1295 remote=ctx.ceph[cluster_name].bootstrap_remote,
1296 args=[
1297 'ceph', 'orch', 'ls', '-f', 'json',
1298 ],
1299 stdout=StringIO(),
1300 )
1301 j = json.loads(r.stdout.getvalue())
1302 svc = None
1303 for s in j:
1304 if s['service_name'] == service:
1305 svc = s
1306 break
1307 if svc:
1308 log.info(
1309 f"{service} has {s['status']['running']}/{s['status']['size']}"
1310 )
1311 if s['status']['running'] == s['status']['size']:
1312 break
1313
1314
1315 @contextlib.contextmanager
1316 def tweaked_option(ctx, config):
1317 """
1318 Set an option, and then restore it to its original value.
1319
1320 Note: due to the way tasks are executed/nested, it is not recommended to
1321 use this method as a standalone task; otherwise it will likely restore
1322 the tweaked option only at the /end/ of the 'tasks' block.
1323 """
1324 saved_options = {}
1325 # we can complicate this when necessary
1326 options = ['mon-health-to-clog']
1327 type_, id_ = 'mon', '*'
1328 cluster = config.get('cluster', 'ceph')
1329 manager = ctx.managers[cluster]
1330 if id_ == '*':
1331 get_from = next(teuthology.all_roles_of_type(ctx.cluster, type_))
1332 else:
1333 get_from = id_
1334 for option in options:
1335 if option not in config:
1336 continue
1337 value = 'true' if config[option] else 'false'
1338 option = option.replace('-', '_')
1339 old_value = manager.get_config(type_, get_from, option)
1340 if value != old_value:
1341 saved_options[option] = old_value
1342 manager.inject_args(type_, id_, option, value)
1343 yield
1344 for option, value in saved_options.items():
1345 manager.inject_args(type_, id_, option, value)
1346
1347
1348 @contextlib.contextmanager
1349 def restart(ctx, config):
1350 """
1351 restart ceph daemons
1352
1353 For example::
1354 tasks:
1355 - ceph.restart: [all]
1356
1357 For example::
1358 tasks:
1359 - ceph.restart: [osd.0, mon.1, mds.*]
1360
1361 or::
1362
1363 tasks:
1364 - ceph.restart:
1365 daemons: [osd.0, mon.1]
1366 wait-for-healthy: false
1367 wait-for-osds-up: true
1368
1369 :param ctx: Context
1370 :param config: Configuration
1371 """
1372 if config is None:
1373 config = {}
1374 elif isinstance(config, list):
1375 config = {'daemons': config}
1376
1377 daemons = ctx.daemons.resolve_role_list(
1378 config.get('daemons', None), CEPH_ROLE_TYPES, True)
1379 clusters = set()
1380
1381 log.info('daemons %s' % daemons)
1382 with tweaked_option(ctx, config):
1383 for role in daemons:
1384 cluster, type_, id_ = teuthology.split_role(role)
1385 d = ctx.daemons.get_daemon(type_, id_, cluster)
1386 assert d, 'daemon %s does not exist' % role
1387 d.stop()
1388 if type_ == 'osd':
1389 ctx.managers[cluster].mark_down_osd(id_)
1390 d.restart()
1391 clusters.add(cluster)
1392
1393 if config.get('wait-for-healthy', True):
1394 for cluster in clusters:
1395 healthy(ctx=ctx, config=dict(cluster=cluster))
1396 if config.get('wait-for-osds-up', False):
1397 for cluster in clusters:
1398 ctx.managers[cluster].wait_for_all_osds_up()
1399 yield
1400
1401
1402 @contextlib.contextmanager
1403 def distribute_config_and_admin_keyring(ctx, config):
1404 """
1405 Distribute a sufficient config and keyring for clients
1406 """
1407 cluster_name = config['cluster']
1408 log.info('Distributing (final) config and client.admin keyring...')
1409 for remote, roles in ctx.cluster.remotes.items():
1410 remote.write_file(
1411 '/etc/ceph/{}.conf'.format(cluster_name),
1412 ctx.ceph[cluster_name].config_file,
1413 sudo=True)
1414 remote.write_file(
1415 path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
1416 data=ctx.ceph[cluster_name].admin_keyring,
1417 sudo=True)
1418 try:
1419 yield
1420 finally:
1421 ctx.cluster.run(args=[
1422 'sudo', 'rm', '-f',
1423 '/etc/ceph/{}.conf'.format(cluster_name),
1424 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
1425 ])
1426
1427
1428 @contextlib.contextmanager
1429 def crush_setup(ctx, config):
1430 cluster_name = config['cluster']
1431
1432 profile = config.get('crush_tunables', 'default')
1433 log.info('Setting crush tunables to %s', profile)
1434 _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
1435 args=['ceph', 'osd', 'crush', 'tunables', profile])
1436 yield
1437
1438
1439 @contextlib.contextmanager
1440 def create_rbd_pool(ctx, config):
1441 if config.get('create_rbd_pool', False):
1442 cluster_name = config['cluster']
1443 log.info('Waiting for OSDs to come up')
1444 teuthology.wait_until_osds_up(
1445 ctx,
1446 cluster=ctx.cluster,
1447 remote=ctx.ceph[cluster_name].bootstrap_remote,
1448 ceph_cluster=cluster_name,
1449 )
1450 log.info('Creating RBD pool')
1451 _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
1452 args=['sudo', 'ceph', '--cluster', cluster_name,
1453 'osd', 'pool', 'create', 'rbd', '8'])
1454 _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
1455 args=['sudo', 'ceph', '--cluster', cluster_name,
1456 'osd', 'pool', 'application', 'enable',
1457 'rbd', 'rbd', '--yes-i-really-mean-it'
1458 ])
1459 yield
1460
1461
1462 @contextlib.contextmanager
1463 def _bypass():
1464 yield
1465
1466
1467 @contextlib.contextmanager
1468 def initialize_config(ctx, config):
1469 cluster_name = config['cluster']
1470 testdir = teuthology.get_testdir(ctx)
1471
1472 ctx.ceph[cluster_name].thrashers = []
1473 # fixme: setup watchdog, ala ceph.py
1474
1475 ctx.ceph[cluster_name].roleless = False # see below
1476
1477 first_ceph_cluster = False
1478 if not hasattr(ctx, 'daemons'):
1479 first_ceph_cluster = True
1480
1481 # cephadm mode?
1482 if 'cephadm_mode' not in config:
1483 config['cephadm_mode'] = 'root'
1484 assert config['cephadm_mode'] in ['root', 'cephadm-package']
1485 if config['cephadm_mode'] == 'root':
1486 ctx.cephadm = testdir + '/cephadm'
1487 else:
1488 ctx.cephadm = 'cephadm' # in the path
1489
1490 if first_ceph_cluster:
1491 # FIXME: this is global for all clusters
1492 ctx.daemons = DaemonGroup(
1493 use_cephadm=ctx.cephadm)
1494
1495 # uuid
1496 fsid = str(uuid.uuid1())
1497 log.info('Cluster fsid is %s' % fsid)
1498 ctx.ceph[cluster_name].fsid = fsid
1499
1500 # mon ips
1501 log.info('Choosing monitor IPs and ports...')
1502 remotes_and_roles = ctx.cluster.remotes.items()
1503 ips = [host for (host, port) in
1504 (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
1505
1506 if config.get('roleless', False):
1507 # mons will be named after hosts
1508 first_mon = None
1509 max_mons = config.get('max_mons', 5)
1510 for remote, _ in remotes_and_roles:
1511 ctx.cluster.remotes[remote].append('mon.' + remote.shortname)
1512 if not first_mon:
1513 first_mon = remote.shortname
1514 bootstrap_remote = remote
1515 max_mons -= 1
1516 if not max_mons:
1517 break
1518 log.info('No mon roles; fabricating mons')
1519
1520 roles = [role_list for (remote, role_list) in ctx.cluster.remotes.items()]
1521
1522 ctx.ceph[cluster_name].mons = get_mons(
1523 roles, ips, cluster_name,
1524 mon_bind_msgr2=config.get('mon_bind_msgr2', True),
1525 mon_bind_addrvec=config.get('mon_bind_addrvec', True),
1526 )
1527 log.info('Monitor IPs: %s' % ctx.ceph[cluster_name].mons)
1528
1529 if config.get('roleless', False):
1530 ctx.ceph[cluster_name].roleless = True
1531 ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
1532 ctx.ceph[cluster_name].first_mon = first_mon
1533 ctx.ceph[cluster_name].first_mon_role = 'mon.' + first_mon
1534 else:
1535 first_mon_role = sorted(ctx.ceph[cluster_name].mons.keys())[0]
1536 _, _, first_mon = teuthology.split_role(first_mon_role)
1537 (bootstrap_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
1538 log.info('First mon is mon.%s on %s' % (first_mon,
1539 bootstrap_remote.shortname))
1540 ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
1541 ctx.ceph[cluster_name].first_mon = first_mon
1542 ctx.ceph[cluster_name].first_mon_role = first_mon_role
1543
1544 others = ctx.cluster.remotes[bootstrap_remote]
1545 mgrs = sorted([r for r in others
1546 if teuthology.is_type('mgr', cluster_name)(r)])
1547 if not mgrs:
1548 raise RuntimeError('no mgrs on the same host as first mon %s' % first_mon)
1549 _, _, first_mgr = teuthology.split_role(mgrs[0])
1550 log.info('First mgr is %s' % (first_mgr))
1551 ctx.ceph[cluster_name].first_mgr = first_mgr
1552 yield
1553
1554
1555 @contextlib.contextmanager
1556 def task(ctx, config):
1557 """
1558 Deploy ceph cluster using cephadm
1559
1560 For example, teuthology.yaml can contain the 'defaults' section:
1561
1562 defaults:
1563 cephadm:
1564 containers:
1565 image: 'quay.io/ceph-ci/ceph'
1566
1567 Using overrides makes it possible to customize it per run.
1568 The equivalent 'overrides' section looks like:
1569
1570 overrides:
1571 cephadm:
1572 containers:
1573 image: 'quay.io/ceph-ci/ceph'
1574 registry-login:
1575 url: registry-url
1576 username: registry-user
1577 password: registry-password
1578
1579 :param ctx: the argparse.Namespace object
1580 :param config: the config dict
1581 """
1582 if config is None:
1583 config = {}
1584
1585 assert isinstance(config, dict), \
1586 "task only supports a dictionary for configuration"
1587
1588 overrides = ctx.config.get('overrides', {})
1589 teuthology.deep_merge(config, overrides.get('ceph', {}))
1590 teuthology.deep_merge(config, overrides.get('cephadm', {}))
1591 log.info('Config: ' + str(config))
1592
1593 # set up cluster context
1594 if not hasattr(ctx, 'ceph'):
1595 ctx.ceph = {}
1596 if 'cluster' not in config:
1597 config['cluster'] = 'ceph'
1598 cluster_name = config['cluster']
1599 if cluster_name not in ctx.ceph:
1600 ctx.ceph[cluster_name] = argparse.Namespace()
1601 ctx.ceph[cluster_name].bootstrapped = False
1602
1603 # image
1604 teuth_defaults = teuth_config.get('defaults', {})
1605 cephadm_defaults = teuth_defaults.get('cephadm', {})
1606 containers_defaults = cephadm_defaults.get('containers', {})
1607 container_image_name = containers_defaults.get('image', None)
1608
1609 containers = config.get('containers', {})
1610 container_image_name = containers.get('image', container_image_name)
1611
1612 if not hasattr(ctx.ceph[cluster_name], 'image'):
1613 ctx.ceph[cluster_name].image = config.get('image')
1614 ref = ctx.config.get("branch", "main")
1615 if not ctx.ceph[cluster_name].image:
1616 if not container_image_name:
1617 raise Exception("Configuration error occurred. "
1618 "The 'image' value is undefined for 'cephadm' task. "
1619 "Please provide corresponding options in the task's "
1620 "config, task 'overrides', or teuthology 'defaults' "
1621 "section.")
1622 sha1 = config.get('sha1')
1623 flavor = config.get('flavor', 'default')
1624
1625 if sha1:
1626 if flavor == "crimson":
1627 ctx.ceph[cluster_name].image = container_image_name + ':' + sha1 + '-' + flavor
1628 else:
1629 ctx.ceph[cluster_name].image = container_image_name + ':' + sha1
1630 ref = sha1
1631 else:
1632 # fall back to using the branch value
1633 ctx.ceph[cluster_name].image = container_image_name + ':' + ref
1634 log.info('Cluster image is %s' % ctx.ceph[cluster_name].image)
1635
1636
1637 with contextutil.nested(
1638 # if the cluster is already bootstrapped, bypass the corresponding methods
1639 lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
1640 else initialize_config(ctx=ctx, config=config),
1641 lambda: ceph_initial(),
1642 lambda: normalize_hostnames(ctx=ctx),
1643 lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
1644 else download_cephadm(ctx=ctx, config=config, ref=ref),
1645 lambda: ceph_log(ctx=ctx, config=config),
1646 lambda: ceph_crash(ctx=ctx, config=config),
1647 lambda: pull_image(ctx=ctx, config=config),
1648 lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
1649 else ceph_bootstrap(ctx, config),
1650 lambda: crush_setup(ctx=ctx, config=config),
1651 lambda: ceph_mons(ctx=ctx, config=config),
1652 lambda: distribute_config_and_admin_keyring(ctx=ctx, config=config),
1653 lambda: ceph_mgrs(ctx=ctx, config=config),
1654 lambda: ceph_osds(ctx=ctx, config=config),
1655 lambda: ceph_mdss(ctx=ctx, config=config),
1656 lambda: cephfs_setup(ctx=ctx, config=config),
1657 lambda: ceph_rgw(ctx=ctx, config=config),
1658 lambda: ceph_iscsi(ctx=ctx, config=config),
1659 lambda: ceph_monitoring('prometheus', ctx=ctx, config=config),
1660 lambda: ceph_monitoring('node-exporter', ctx=ctx, config=config),
1661 lambda: ceph_monitoring('alertmanager', ctx=ctx, config=config),
1662 lambda: ceph_monitoring('grafana', ctx=ctx, config=config),
1663 lambda: ceph_clients(ctx=ctx, config=config),
1664 lambda: create_rbd_pool(ctx=ctx, config=config),
1665 ):
1666 try:
1667 if config.get('wait-for-healthy', True):
1668 healthy(ctx=ctx, config=config)
1669
1670 log.info('Setup complete, yielding')
1671 yield
1672
1673 finally:
1674 log.info('Teardown begin')
1675