# ceph/qa/tasks/cephadm.py
"""
Ceph cluster task, deployed via cephadm orchestrator
"""
import argparse
import configobj
import contextlib
import json
import logging
import os
import re
import uuid
import yaml

from io import BytesIO, StringIO
from tarfile import ReadError
from tasks.ceph_manager import CephManager
from teuthology import misc as teuthology
from teuthology import contextutil
from teuthology.orchestra import run
from teuthology.orchestra.daemon import DaemonGroup
from teuthology.config import config as teuth_config

# these items we use from ceph.py should probably eventually move elsewhere
from tasks.ceph import get_mons, healthy
from tasks.vip import subst_vip
CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw', 'prometheus']

log = logging.getLogger(__name__)
def _shell(ctx, cluster_name, remote, args, extra_cephadm_args=[], **kwargs):
    teuthology.get_testdir(ctx)
    return remote.run(
        args=[
            'sudo',
            ctx.cephadm,
            '--image', ctx.ceph[cluster_name].image,
            'shell',
            '-c', '/etc/ceph/{}.conf'.format(cluster_name),
            '-k', '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
            '--fsid', ctx.ceph[cluster_name].fsid,
        ] + extra_cephadm_args + [
            '--',
        ] + args,
        **kwargs
    )
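# Illustrative usage only (not part of the original module): _shell() wraps
# "sudo cephadm shell" so callers can run ceph commands inside the container
# with the cluster's conf and keyring, e.g.
#
#   r = _shell(ctx, 'ceph', ctx.ceph['ceph'].bootstrap_remote,
#              ['ceph', 'orch', 'host', 'ls', '--format=json'],
#              stdout=StringIO())
#   hosts = json.loads(r.stdout.getvalue())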
def build_initial_config(ctx, config):
    cluster_name = config['cluster']

    path = os.path.join(os.path.dirname(__file__), 'cephadm.conf')
    conf = configobj.ConfigObj(path, file_error=True)

    conf.setdefault('global', {})
    conf['global']['fsid'] = ctx.ceph[cluster_name].fsid

    # overrides
    for section, keys in config.get('conf', {}).items():
        for key, value in keys.items():
            log.info(" override: [%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    return conf
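# Illustrative only: build_initial_config() merges 'conf' overrides from the
# task config into the seed cephadm.conf, so a job YAML might carry e.g.
#
#   conf:
#     global:
#       osd pool default size: 2
#
# (the section/option above are examples, not defaults required by this task).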
def update_archive_setting(ctx, key, value):
    """
    Add logs directory to job's info log file
    """
    if ctx.archive is None:
        return
    with open(os.path.join(ctx.archive, 'info.yaml'), 'r+') as info_file:
        info_yaml = yaml.safe_load(info_file)
        info_file.seek(0)
        if 'archive' in info_yaml:
            info_yaml['archive'][key] = value
        else:
            info_yaml['archive'] = {key: value}
        yaml.safe_dump(info_yaml, info_file, default_flow_style=False)
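# Illustrative only: a call such as update_archive_setting(ctx, 'log',
# '/var/log/ceph') leaves an info.yaml entry along the lines of
#
#   archive:
#     log: /var/log/ceph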
@contextlib.contextmanager
def normalize_hostnames(ctx):
    """
    Ensure we have short hostnames throughout, for consistency between
    remote.shortname and socket.gethostname() in cephadm.
    """
    log.info('Normalizing hostnames...')
    ctx.cluster.run(args=[
        'sudo',
        'hostname',
        run.Raw('$(hostname -s)'),
    ])

    try:
        yield
    finally:
        pass
@contextlib.contextmanager
def download_cephadm(ctx, config, ref):
    cluster_name = config['cluster']

    if config.get('cephadm_mode') != 'cephadm-package':
        ref = config.get('cephadm_branch', ref)
        git_url = config.get('cephadm_git_url', teuth_config.get_ceph_git_url())
        log.info('Downloading cephadm (repo %s ref %s)...' % (git_url, ref))
        if ctx.config.get('redhat'):
            log.info("Install cephadm using RPM")
            # cephadm already installed from redhat.install task
            ctx.cluster.run(args=[
                'cp', run.Raw('$(which cephadm)'), ctx.cephadm,
                run.Raw('&&'),
                'ls', '-l', ctx.cephadm,
            ])
        elif git_url.startswith('https://github.com/'):
            # git archive doesn't like https:// URLs, which we use with github.
            rest = git_url.split('https://github.com/', 1)[1]
            rest = re.sub(r'\.git/?$', '', rest).strip()  # no .git suffix
            ctx.cluster.run(args=[
                'curl', '--silent',
                'https://raw.githubusercontent.com/' + rest + '/' + ref + '/src/cephadm/cephadm',
                run.Raw('>'),
                ctx.cephadm,
                run.Raw('&&'),
                'ls', '-l', ctx.cephadm,
            ])
        else:
            ctx.cluster.run(args=[
                'git', 'archive',
                '--remote=' + git_url,
                ref,
                'src/cephadm/cephadm',
                run.Raw('|'),
                'tar', '-xO', 'src/cephadm/cephadm',
                run.Raw('>'),
                ctx.cephadm,
            ])
        # sanity-check the resulting file and set executable bit
        cephadm_file_size = '$(stat -c%s {})'.format(ctx.cephadm)
        ctx.cluster.run(args=[
            'test', '-s', ctx.cephadm,
            run.Raw('&&'),
            'test', run.Raw(cephadm_file_size), "-gt", run.Raw('1000'),
            run.Raw('&&'),
            'chmod', '+x', ctx.cephadm,
        ])

    try:
        yield
    finally:
        log.info('Removing cluster...')
        ctx.cluster.run(args=[
            'sudo',
            ctx.cephadm,
            'rm-cluster',
            '--fsid', ctx.ceph[cluster_name].fsid,
            '--force',
        ])

        if config.get('cephadm_mode') == 'root':
            log.info('Removing cephadm ...')
            ctx.cluster.run(args=[
                'rm', '-rf', ctx.cephadm,
            ])
@contextlib.contextmanager
def ceph_log(ctx, config):
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    update_archive_setting(ctx, 'log', '/var/log/ceph')

    try:
        yield

    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise

    finally:
        log.info('Checking cluster log for badness...')

        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurrence of the pattern specified in the Ceph log,
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = [
                'sudo', 'egrep', pattern,
                '/var/log/ceph/{fsid}/ceph.log'.format(
                    fsid=fsid),
            ]
            if excludes:
                for exclude in excludes:
                    args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                run.Raw('|'), 'head', '-n', '1',
            ])
            r = ctx.ceph[cluster_name].bootstrap_remote.run(
                stdout=StringIO(),
                args=args,
            )
            stdout = r.stdout.getvalue()
            if stdout:
                return stdout
            return None

        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
                             config.get('log-ignorelist')) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log-ignorelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
                            )
                        break

        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo', 'find',
                        '/var/log/ceph',            # all logs, not just for the cluster
                        '/var/log/rbd-target-api',  # ceph-iscsi
                        '-name', '*.log', '-print0',
                        run.Raw('|'),
                        'sudo', 'xargs', '-0', '--no-run-if-empty', '--', 'gzip', '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            try:
                os.makedirs(path)
            except OSError:
                pass
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.name)
                try:
                    os.makedirs(sub)
                except OSError:
                    pass
                try:
                    teuthology.pull_directory(remote, '/var/log/ceph',  # everything
                                              os.path.join(sub, 'log'))
                except ReadError:
                    pass
@contextlib.contextmanager
def ceph_crash(ctx, config):
    """
    Gather crash dumps from /var/lib/ceph/$fsid/crash
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    update_archive_setting(ctx, 'crash', '/var/lib/ceph/crash')

    try:
        yield

    finally:
        if ctx.archive is not None:
            log.info('Archiving crash dumps...')
            path = os.path.join(ctx.archive, 'remote')
            try:
                os.makedirs(path)
            except OSError:
                pass
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.name)
                try:
                    os.makedirs(sub)
                except OSError:
                    pass
                try:
                    teuthology.pull_directory(remote,
                                              '/var/lib/ceph/%s/crash' % fsid,
                                              os.path.join(sub, 'crash'))
                except ReadError:
                    pass
@contextlib.contextmanager
def ceph_bootstrap(ctx, config):
    """
    Bootstrap ceph cluster.

    :param ctx: the argparse.Namespace object
    :param config: the config dict
    """
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)
    fsid = ctx.ceph[cluster_name].fsid

    bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
    first_mon = ctx.ceph[cluster_name].first_mon
    first_mon_role = ctx.ceph[cluster_name].first_mon_role
    mons = ctx.ceph[cluster_name].mons

    ctx.cluster.run(args=[
        'sudo', 'mkdir', '-p', '/etc/ceph',
    ])
    ctx.cluster.run(args=[
        'sudo', 'chmod', '777', '/etc/ceph',
    ])
    try:
        # write seed config
        log.info('Writing seed config...')
        conf_fp = BytesIO()
        seed_config = build_initial_config(ctx, config)
        seed_config.write(conf_fp)
        bootstrap_remote.write_file(
            path='{}/seed.{}.conf'.format(testdir, cluster_name),
            data=conf_fp.getvalue())
        log.debug('Final config:\n' + conf_fp.getvalue().decode())
        ctx.ceph[cluster_name].conf = seed_config

        # register initial daemons
        ctx.daemons.register_daemon(
            bootstrap_remote, 'mon', first_mon,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild('mon.' + first_mon),
            wait=False,
            started=True,
        )
        if not ctx.ceph[cluster_name].roleless:
            first_mgr = ctx.ceph[cluster_name].first_mgr
            ctx.daemons.register_daemon(
                bootstrap_remote, 'mgr', first_mgr,
                cluster=cluster_name,
                fsid=fsid,
                logger=log.getChild('mgr.' + first_mgr),
                wait=False,
                started=True,
            )

        # bootstrap
        log.info('Bootstrapping...')
        cmd = [
            'sudo',
            ctx.cephadm,
            '--image', ctx.ceph[cluster_name].image,
            '-v',
            'bootstrap',
            '--fsid', fsid,
            '--config', '{}/seed.{}.conf'.format(testdir, cluster_name),
            '--output-config', '/etc/ceph/{}.conf'.format(cluster_name),
            '--output-keyring',
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
            '--output-pub-ssh-key', '{}/{}.pub'.format(testdir, cluster_name),
        ]

        if config.get('registry-login'):
            registry = config['registry-login']
            cmd += [
                "--registry-url", registry['url'],
                "--registry-username", registry['username'],
                "--registry-password", registry['password'],
            ]

        if not ctx.ceph[cluster_name].roleless:
            cmd += [
                '--mon-id', first_mon,
                '--mgr-id', first_mgr,
                '--orphan-initial-daemons',   # we will do it explicitly!
                '--skip-monitoring-stack',    # we'll provision these explicitly
            ]

        if mons[first_mon_role].startswith('['):
            cmd += ['--mon-addrv', mons[first_mon_role]]
        else:
            cmd += ['--mon-ip', mons[first_mon_role]]
        if config.get('skip_dashboard'):
            cmd += ['--skip-dashboard']
        if config.get('skip_monitoring_stack'):
            cmd += ['--skip-monitoring-stack']
        if config.get('single_host_defaults'):
            cmd += ['--single-host-defaults']
        if not config.get('avoid_pacific_features', False):
            cmd += ['--skip-admin-label']
        # bootstrap makes the keyring root 0600, so +r it for our purposes
        cmd += [
            run.Raw('&&'),
            'sudo', 'chmod', '+r',
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
        ]
        bootstrap_remote.run(args=cmd)

        # fetch keys and configs
        log.info('Fetching config...')
        ctx.ceph[cluster_name].config_file = \
            bootstrap_remote.read_file(f'/etc/ceph/{cluster_name}.conf')
        log.info('Fetching client.admin keyring...')
        ctx.ceph[cluster_name].admin_keyring = \
            bootstrap_remote.read_file(f'/etc/ceph/{cluster_name}.client.admin.keyring')
        log.info('Fetching mon keyring...')
        ctx.ceph[cluster_name].mon_keyring = \
            bootstrap_remote.read_file(f'/var/lib/ceph/{fsid}/mon.{first_mon}/keyring', sudo=True)

        # fetch ssh key, distribute to additional nodes
        log.info('Fetching pub ssh key...')
        ssh_pub_key = bootstrap_remote.read_file(
            f'{testdir}/{cluster_name}.pub').decode('ascii').strip()

        log.info('Installing pub ssh key for root users...')
        ctx.cluster.run(args=[
            'sudo', 'install', '-d', '-m', '0700', '/root/.ssh',
            run.Raw('&&'),
            'echo', ssh_pub_key,
            run.Raw('|'),
            'sudo', 'tee', '-a', '/root/.ssh/authorized_keys',
            run.Raw('&&'),
            'sudo', 'chmod', '0600', '/root/.ssh/authorized_keys',
        ])

        # set options
        if config.get('allow_ptrace', True):
            _shell(ctx, cluster_name, bootstrap_remote,
                   ['ceph', 'config', 'set', 'mgr', 'mgr/cephadm/allow_ptrace', 'true'])

        if not config.get('avoid_pacific_features', False):
            log.info('Distributing conf and client.admin keyring to all hosts + 0755')
            _shell(ctx, cluster_name, bootstrap_remote,
                   ['ceph', 'orch', 'client-keyring', 'set', 'client.admin',
                    '*', '--mode', '0755'],
                   check_status=False)

        # add other hosts
        for remote in ctx.cluster.remotes.keys():
            if remote == bootstrap_remote:
                continue

            # note: this may be redundant (see above), but it avoids
            # us having to wait for cephadm to do it.
            log.info('Writing (initial) conf and keyring to %s' % remote.shortname)
            remote.write_file(
                path='/etc/ceph/{}.conf'.format(cluster_name),
                data=ctx.ceph[cluster_name].config_file)
            remote.write_file(
                path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
                data=ctx.ceph[cluster_name].admin_keyring)

            log.info('Adding host %s to orchestrator...' % remote.shortname)
            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'host', 'add',
                remote.shortname,
            ])
            r = _shell(ctx, cluster_name, remote,
                       ['ceph', 'orch', 'host', 'ls', '--format=json'],
                       stdout=StringIO())
            hosts = [node['hostname'] for node in json.loads(r.stdout.getvalue())]
            assert remote.shortname in hosts

        yield

    finally:
        log.info('Cleaning up testdir ceph.* files...')
        ctx.cluster.run(args=[
            'rm', '-f',
            '{}/seed.{}.conf'.format(testdir, cluster_name),
            '{}/{}.pub'.format(testdir, cluster_name),
        ])

        log.info('Stopping all daemons...')

        # this doesn't block until they are all stopped...
        #ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])

        # stop the daemons we know
        for role in ctx.daemons.resolve_role_list(None, CEPH_ROLE_TYPES, True):
            cluster, type_, id_ = teuthology.split_role(role)
            try:
                ctx.daemons.get_daemon(type_, id_, cluster).stop()
            except Exception:
                log.exception(f'Failed to stop "{role}"')
                raise

        # tear down anything left (but leave the logs behind)
        ctx.cluster.run(
            args=[
                'sudo',
                ctx.cephadm,
                'rm-cluster',
                '--fsid', fsid,
                '--force',
            ],
            check_status=False,  # may fail if upgrading from old cephadm
        )

        # clean up /etc/ceph
        ctx.cluster.run(args=[
            'sudo', 'rm', '-f',
            '/etc/ceph/{}.conf'.format(cluster_name),
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
        ])
@contextlib.contextmanager
def ceph_mons(ctx, config):
    """
    Deploy any additional mons
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        daemons = {}
        if config.get('add_mons_via_daemon_add'):
            # This is the old way of adding mons that works with the (early) octopus
            # cephadm scheduler.
            num_mons = 1
            for remote, roles in ctx.cluster.remotes.items():
                for mon in [r for r in roles
                            if teuthology.is_type('mon', cluster_name)(r)]:
                    c_, _, id_ = teuthology.split_role(mon)
                    if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
                        continue
                    log.info('Adding %s on %s' % (mon, remote.shortname))
                    num_mons += 1
                    _shell(ctx, cluster_name, remote, [
                        'ceph', 'orch', 'daemon', 'add', 'mon',
                        remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_,
                    ])
                    ctx.daemons.register_daemon(
                        remote, 'mon', id_,
                        cluster=cluster_name,
                        fsid=fsid,
                        logger=log.getChild(mon),
                        wait=False,
                        started=True,
                    )
                    daemons[mon] = (remote, id_)

                    with contextutil.safe_while(sleep=1, tries=180) as proceed:
                        while proceed():
                            log.info('Waiting for %d mons in monmap...' % (num_mons))
                            r = _shell(
                                ctx=ctx,
                                cluster_name=cluster_name,
                                remote=remote,
                                args=[
                                    'ceph', 'mon', 'dump', '-f', 'json',
                                ],
                                stdout=StringIO(),
                            )
                            j = json.loads(r.stdout.getvalue())
                            if len(j['mons']) == num_mons:
                                break
        else:
            nodes = []
            for remote, roles in ctx.cluster.remotes.items():
                for mon in [r for r in roles
                            if teuthology.is_type('mon', cluster_name)(r)]:
                    c_, _, id_ = teuthology.split_role(mon)
                    log.info('Adding %s on %s' % (mon, remote.shortname))
                    nodes.append(remote.shortname
                                 + ':' + ctx.ceph[cluster_name].mons[mon]
                                 + '=' + id_)
                    if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
                        continue
                    daemons[mon] = (remote, id_)

            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'apply', 'mon',
                str(len(nodes)) + ';' + ';'.join(nodes)]
            )
            for mon, i in daemons.items():
                remote, id_ = i
                ctx.daemons.register_daemon(
                    remote, 'mon', id_,
                    cluster=cluster_name,
                    fsid=fsid,
                    logger=log.getChild(mon),
                    wait=False,
                    started=True,
                )

            with contextutil.safe_while(sleep=1, tries=180) as proceed:
                while proceed():
                    log.info('Waiting for %d mons in monmap...' % (len(nodes)))
                    r = _shell(
                        ctx=ctx,
                        cluster_name=cluster_name,
                        remote=remote,
                        args=[
                            'ceph', 'mon', 'dump', '-f', 'json',
                        ],
                        stdout=StringIO(),
                    )
                    j = json.loads(r.stdout.getvalue())
                    if len(j['mons']) == len(nodes):
                        break

        # refresh our (final) ceph.conf file
        bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
        log.info('Generating final ceph.conf file...')
        r = _shell(
            ctx=ctx,
            cluster_name=cluster_name,
            remote=bootstrap_remote,
            args=[
                'ceph', 'config', 'generate-minimal-conf',
            ],
            stdout=StringIO(),
        )
        ctx.ceph[cluster_name].config_file = r.stdout.getvalue()

        yield

    finally:
        pass
@contextlib.contextmanager
def ceph_mgrs(ctx, config):
    """
    Deploy any additional mgrs
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        nodes = []
        daemons = {}
        for remote, roles in ctx.cluster.remotes.items():
            for mgr in [r for r in roles
                        if teuthology.is_type('mgr', cluster_name)(r)]:
                c_, _, id_ = teuthology.split_role(mgr)
                log.info('Adding %s on %s' % (mgr, remote.shortname))
                nodes.append(remote.shortname + '=' + id_)
                if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mgr:
                    continue
                daemons[mgr] = (remote, id_)
        if nodes:
            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'apply', 'mgr',
                str(len(nodes)) + ';' + ';'.join(nodes)]
            )
        for mgr, i in daemons.items():
            remote, id_ = i
            ctx.daemons.register_daemon(
                remote, 'mgr', id_,
                cluster=cluster_name,
                fsid=fsid,
                logger=log.getChild(mgr),
                wait=False,
                started=True,
            )

        yield

    finally:
        pass
@contextlib.contextmanager
def ceph_osds(ctx, config):
    """
    Deploy OSDs
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        log.info('Deploying OSDs...')

        # provision OSDs in numeric order
        id_to_remote = {}
        devs_by_remote = {}
        for remote, roles in ctx.cluster.remotes.items():
            devs_by_remote[remote] = teuthology.get_scratch_devices(remote)
            for osd in [r for r in roles
                        if teuthology.is_type('osd', cluster_name)(r)]:
                _, _, id_ = teuthology.split_role(osd)
                id_to_remote[int(id_)] = (osd, remote)

        cur = 0
        for osd_id in sorted(id_to_remote.keys()):
            osd, remote = id_to_remote[osd_id]
            _, _, id_ = teuthology.split_role(osd)
            assert int(id_) == cur
            devs = devs_by_remote[remote]
            assert devs   ## FIXME ##
            dev = devs.pop()
            if all(_ in dev for _ in ('lv', 'vg')):
                short_dev = dev.replace('/dev/', '')
            else:
                short_dev = dev
            log.info('Deploying %s on %s with %s...' % (
                osd, remote.shortname, dev))
            _shell(ctx, cluster_name, remote, [
                'ceph-volume', 'lvm', 'zap', dev])
            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'daemon', 'add', 'osd',
                remote.shortname + ':' + short_dev
            ])
            ctx.daemons.register_daemon(
                remote, 'osd', id_,
                cluster=cluster_name,
                fsid=fsid,
                logger=log.getChild(osd),
                wait=False,
                started=True,
            )
            cur += 1

        yield

    finally:
        pass
@contextlib.contextmanager
def ceph_mdss(ctx, config):
    """
    Deploy MDSs
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = []
    daemons = {}
    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type('mds', cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
            nodes.append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)
    if nodes:
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', 'mds',
            'all',
            str(len(nodes)) + ';' + ';'.join(nodes)]
        )
    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, 'mds', id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield
@contextlib.contextmanager
def ceph_monitoring(daemon_type, ctx, config):
    """
    Deploy prometheus, node-exporter, etc.
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = []
    daemons = {}
    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type(daemon_type, cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
            nodes.append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)
    if nodes:
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', daemon_type,
            str(len(nodes)) + ';' + ';'.join(nodes)]
        )
    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, daemon_type, id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield
@contextlib.contextmanager
def ceph_rgw(ctx, config):
    """
    Deploy rgw
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = {}
    daemons = {}
    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type('rgw', cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
            svc = '.'.join(id_.split('.')[0:2])
            if svc not in nodes:
                nodes[svc] = []
            nodes[svc].append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)

    for svc, nodes in nodes.items():
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', 'rgw', svc,
            '--placement',
            str(len(nodes)) + ';' + ';'.join(nodes)]
        )
    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, 'rgw', id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield
@contextlib.contextmanager
def ceph_iscsi(ctx, config):
    """
    Deploy iSCSI gateways
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = []
    daemons = {}
    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type('iscsi', cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
            nodes.append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)

    if nodes:
        poolname = 'iscsi'
        # ceph osd pool create iscsi 3 3 replicated
        _shell(ctx, cluster_name, remote, [
            'ceph', 'osd', 'pool', 'create',
            poolname, '3', '3', 'replicated']
        )

        _shell(ctx, cluster_name, remote, [
            'ceph', 'osd', 'pool', 'application', 'enable',
            poolname, 'rbd']
        )

        # ceph orch apply iscsi iscsi user password
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', 'iscsi',
            poolname, 'user', 'password',
            '--placement', str(len(nodes)) + ';' + ';'.join(nodes)]
        )
    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, 'iscsi', id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield
@contextlib.contextmanager
def ceph_clients(ctx, config):
    cluster_name = config['cluster']

    log.info('Setting up client nodes...')
    clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
    for remote, roles_for_host in clients.remotes.items():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
                                                     cluster_name):
            name = teuthology.ceph_role(role)
            client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name,
                                                                name)
            r = _shell(
                ctx=ctx,
                cluster_name=cluster_name,
                remote=remote,
                args=[
                    'ceph', 'auth',
                    'get-or-create', name,
                    'mon', 'allow *',
                    'osd', 'allow *',
                    'mds', 'allow *',
                    'mgr', 'allow *',
                ],
                stdout=StringIO(),
            )
            keyring = r.stdout.getvalue()
            remote.sudo_write_file(client_keyring, keyring, mode='0644')

    yield
@contextlib.contextmanager
def ceph_initial():
    try:
        yield
    finally:
        log.info('Teardown complete')
@contextlib.contextmanager
def stop(ctx, config):
    """
    Stop ceph daemons

    For example::
      tasks:
      - ceph.stop: [mds.*]

      tasks:
      - ceph.stop: [osd.0, osd.2]

      tasks:
      - ceph.stop:
          daemons: [osd.0, osd.2]

    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(
        config.get('daemons', None), CEPH_ROLE_TYPES, True)
    clusters = set()

    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).stop()
        clusters.add(cluster)

    # for cluster in clusters:
    #     ctx.ceph[cluster].watchdog.stop()
    #     ctx.ceph[cluster].watchdog.join()

    yield
def shell(ctx, config):
    """
    Execute (shell) commands
    """
    cluster_name = config.get('cluster', 'ceph')

    args = []
    for k in config.pop('env', []):
        args.extend(['-e', k + '=' + ctx.config.get(k, '')])
    for k in config.pop('volumes', []):
        args.extend(['-v', k])

    if 'all-roles' in config and len(config) == 1:
        a = config['all-roles']
        roles = teuthology.all_roles(ctx.cluster)
        config = dict((id_, a) for id_ in roles if not id_.startswith('host.'))
    elif 'all-hosts' in config and len(config) == 1:
        a = config['all-hosts']
        roles = teuthology.all_roles(ctx.cluster)
        config = dict((id_, a) for id_ in roles if id_.startswith('host.'))

    for role, cmd in config.items():
        (remote,) = ctx.cluster.only(role).remotes.keys()
        log.info('Running commands on role %s host %s', role, remote.name)
        if isinstance(cmd, list):
            for c in cmd:
                _shell(ctx, cluster_name, remote,
                       ['bash', '-c', subst_vip(ctx, c)],
                       extra_cephadm_args=args)
        else:
            assert isinstance(cmd, str)
            _shell(ctx, cluster_name, remote,
                   ['bash', '-ex', '-c', subst_vip(ctx, cmd)],
                   extra_cephadm_args=args)
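# Illustrative only: a typical use of this task in a job YAML, based on the
# config keys handled above (env/volumes plus per-role command lists); the
# role and commands below are examples, not values required by the task:
#
#   tasks:
#   - cephadm.shell:
#       env: [SOME_VAR]
#       mon.a:
#       - ceph osd pool ls
#       - ceph -s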
def apply(ctx, config):
    """
    Apply spec

      tasks:
      - cephadm.apply:
          specs:
          - service_type: rgw
            service_id: foo
            spec:
              rgw_frontend_port: 8000
          - service_type: rgw
            service_id: bar
            spec:
              rgw_frontend_port: 9000
    """
    cluster_name = config.get('cluster', 'ceph')

    specs = config.get('specs', [])
    y = subst_vip(ctx, yaml.dump_all(specs))

    log.info(f'Applying spec(s):\n{y}')
    _shell(
        ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
        ['ceph', 'orch', 'apply', '-i', '-'],
        stdin=y,
    )
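# Illustrative only: the call above is equivalent to running, inside the
# cephadm shell on the bootstrap host,
#
#   ceph orch apply -i - <<EOF
#   <output of yaml.dump_all(specs)>
#   EOF
#
# so a single task invocation can apply several service specs at once.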
def wait_for_service(ctx, config):
    """
    Wait for a service to be fully started

      tasks:
      - cephadm.wait_for_service:
          service: rgw.foo
          timeout: 60    # defaults to 300
    """
    cluster_name = config.get('cluster', 'ceph')
    timeout = config.get('timeout', 300)
    service = config.get('service')
    assert service

    log.info(
        f'Waiting for {cluster_name} service {service} to start (timeout {timeout})...'
    )
    with contextutil.safe_while(sleep=1, tries=timeout) as proceed:
        while proceed():
            r = _shell(
                ctx=ctx,
                cluster_name=cluster_name,
                remote=ctx.ceph[cluster_name].bootstrap_remote,
                args=[
                    'ceph', 'orch', 'ls', '-f', 'json',
                ],
                stdout=StringIO(),
            )
            j = json.loads(r.stdout.getvalue())
            svc = None
            for s in j:
                if s['service_name'] == service:
                    svc = s
                    break
            if svc:
                log.info(
                    f"{service} has {s['status']['running']}/{s['status']['size']}"
                )
                if s['status']['running'] == s['status']['size']:
                    break
@contextlib.contextmanager
def tweaked_option(ctx, config):
    """
    set an option, and then restore it with its original value

    Note, due to the way how tasks are executed/nested, it's not suggested to
    use this method as a standalone task. otherwise, it's likely that it will
    restore the tweaked option at the /end/ of 'tasks' block.
    """
    saved_options = {}
    # we can complicate this when necessary
    options = ['mon-health-to-clog']
    type_, id_ = 'mon', '*'
    cluster = config.get('cluster', 'ceph')
    manager = ctx.managers[cluster]
    get_from = next(teuthology.all_roles_of_type(ctx.cluster, type_))

    for option in options:
        if option not in config:
            continue
        value = 'true' if config[option] else 'false'
        option = option.replace('-', '_')
        old_value = manager.get_config(type_, get_from, option)
        if value != old_value:
            saved_options[option] = old_value
            manager.inject_args(type_, id_, option, value)

    yield

    for option, value in saved_options.items():
        manager.inject_args(type_, id_, option, value)
@contextlib.contextmanager
def restart(ctx, config):
    """
    restart ceph daemons

    For example::
      tasks:
      - ceph.restart: [all]

    For example::
      tasks:
      - ceph.restart: [osd.0, mon.1, mds.*]

    or::

      tasks:
      - ceph.restart:
          daemons: [osd.0, mon.1]
          wait-for-healthy: false
          wait-for-osds-up: true

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(
        config.get('daemons', None), CEPH_ROLE_TYPES, True)
    clusters = set()

    log.info('daemons %s' % daemons)
    with tweaked_option(ctx, config):
        for role in daemons:
            cluster, type_, id_ = teuthology.split_role(role)
            d = ctx.daemons.get_daemon(type_, id_, cluster)
            assert d, 'daemon %s does not exist' % role
            d.stop()
            if type_ == 'osd':
                ctx.managers[cluster].mark_down_osd(id_)
            d.restart()
            clusters.add(cluster)

    if config.get('wait-for-healthy', True):
        for cluster in clusters:
            healthy(ctx=ctx, config=dict(cluster=cluster))
    if config.get('wait-for-osds-up', False):
        for cluster in clusters:
            ctx.managers[cluster].wait_for_all_osds_up()

    yield
@contextlib.contextmanager
def distribute_config_and_admin_keyring(ctx, config):
    """
    Distribute a sufficient config and keyring for clients
    """
    cluster_name = config['cluster']
    log.info('Distributing (final) config and client.admin keyring...')
    for remote, roles in ctx.cluster.remotes.items():
        remote.write_file(
            '/etc/ceph/{}.conf'.format(cluster_name),
            ctx.ceph[cluster_name].config_file,
            sudo=True)
        remote.write_file(
            path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
            data=ctx.ceph[cluster_name].admin_keyring,
            sudo=True)
    try:
        yield
    finally:
        ctx.cluster.run(args=[
            'sudo', 'rm', '-f',
            '/etc/ceph/{}.conf'.format(cluster_name),
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
        ])
@contextlib.contextmanager
def crush_setup(ctx, config):
    cluster_name = config['cluster']

    profile = config.get('crush_tunables', 'default')
    log.info('Setting crush tunables to %s', profile)
    _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
           args=['ceph', 'osd', 'crush', 'tunables', profile])
    yield
@contextlib.contextmanager
def create_rbd_pool(ctx, config):
    if config.get('create_rbd_pool', False):
        cluster_name = config['cluster']
        log.info('Waiting for OSDs to come up')
        teuthology.wait_until_osds_up(
            ctx=ctx,
            cluster=ctx.cluster,
            remote=ctx.ceph[cluster_name].bootstrap_remote,
            ceph_cluster=cluster_name,
        )
        log.info('Creating RBD pool')
        _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
               args=['sudo', 'ceph', '--cluster', cluster_name,
                     'osd', 'pool', 'create', 'rbd', '8'])
        _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
               args=['sudo', 'ceph', '--cluster', cluster_name,
                     'osd', 'pool', 'application', 'enable',
                     'rbd', 'rbd', '--yes-i-really-mean-it'
                     ])
    yield
@contextlib.contextmanager
def _bypass():
    yield
@contextlib.contextmanager
def initialize_config(ctx, config):
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)

    ctx.ceph[cluster_name].thrashers = []
    # fixme: setup watchdog, ala ceph.py

    ctx.ceph[cluster_name].roleless = False  # see below

    first_ceph_cluster = False
    if not hasattr(ctx, 'daemons'):
        first_ceph_cluster = True

    # cephadm mode?
    if 'cephadm_mode' not in config:
        config['cephadm_mode'] = 'root'
    assert config['cephadm_mode'] in ['root', 'cephadm-package']
    if config['cephadm_mode'] == 'root':
        ctx.cephadm = testdir + '/cephadm'
    else:
        ctx.cephadm = 'cephadm'  # in the path

    if first_ceph_cluster:
        # FIXME: this is global for all clusters
        ctx.daemons = DaemonGroup(
            use_cephadm=ctx.cephadm)

    # uuid
    fsid = str(uuid.uuid1())
    log.info('Cluster fsid is %s' % fsid)
    ctx.ceph[cluster_name].fsid = fsid

    # mon ips
    log.info('Choosing monitor IPs and ports...')
    remotes_and_roles = ctx.cluster.remotes.items()
    ips = [host for (host, port) in
           (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]

    if config.get('roleless', False):
        # mons will be named after hosts
        first_mon = None
        for remote, _ in remotes_and_roles:
            ctx.cluster.remotes[remote].append('mon.' + remote.shortname)
            if not first_mon:
                first_mon = remote.shortname
                bootstrap_remote = remote
        log.info('No mon roles; fabricating mons')

    roles = [role_list for (remote, role_list) in ctx.cluster.remotes.items()]

    ctx.ceph[cluster_name].mons = get_mons(
        roles, ips, cluster_name,
        mon_bind_msgr2=config.get('mon_bind_msgr2', True),
        mon_bind_addrvec=config.get('mon_bind_addrvec', True),
    )
    log.info('Monitor IPs: %s' % ctx.ceph[cluster_name].mons)

    if config.get('roleless', False):
        ctx.ceph[cluster_name].roleless = True
        ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
        ctx.ceph[cluster_name].first_mon = first_mon
        ctx.ceph[cluster_name].first_mon_role = 'mon.' + first_mon
    else:
        first_mon_role = sorted(ctx.ceph[cluster_name].mons.keys())[0]
        _, _, first_mon = teuthology.split_role(first_mon_role)
        (bootstrap_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
        log.info('First mon is mon.%s on %s' % (first_mon,
                                                bootstrap_remote.shortname))
        ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
        ctx.ceph[cluster_name].first_mon = first_mon
        ctx.ceph[cluster_name].first_mon_role = first_mon_role

        others = ctx.cluster.remotes[bootstrap_remote]
        mgrs = sorted([r for r in others
                       if teuthology.is_type('mgr', cluster_name)(r)])
        if not mgrs:
            raise RuntimeError('no mgrs on the same host as first mon %s' % first_mon)
        _, _, first_mgr = teuthology.split_role(mgrs[0])
        log.info('First mgr is %s' % (first_mgr))
        ctx.ceph[cluster_name].first_mgr = first_mgr

    yield
@contextlib.contextmanager
def task(ctx, config):
    """
    Deploy ceph cluster using cephadm

    For example, teuthology.yaml can contain the 'defaults' section:

        defaults:
          cephadm:
            containers:
              image: 'quay.io/ceph-ci/ceph'

    Using overrides makes it possible to customize it per run.
    The equivalent 'overrides' section looks like:

        overrides:
          cephadm:
            containers:
              image: 'quay.io/ceph-ci/ceph'
            registry-login:
              url: registry-url
              username: registry-user
              password: registry-password

    :param ctx: the argparse.Namespace object
    :param config: the config dict
    """
    if config is None:
        config = {}

    assert isinstance(config, dict), \
        "task only supports a dictionary for configuration"

    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph', {}))
    teuthology.deep_merge(config, overrides.get('cephadm', {}))
    log.info('Config: ' + str(config))

    # set up cluster context
    if not hasattr(ctx, 'ceph'):
        ctx.ceph = {}
    if 'cluster' not in config:
        config['cluster'] = 'ceph'
    cluster_name = config['cluster']
    if cluster_name not in ctx.ceph:
        ctx.ceph[cluster_name] = argparse.Namespace()
        ctx.ceph[cluster_name].bootstrapped = False

    # image
    teuth_defaults = teuth_config.get('defaults', {})
    cephadm_defaults = teuth_defaults.get('cephadm', {})
    containers_defaults = cephadm_defaults.get('containers', {})
    container_image_name = containers_defaults.get('image', None)

    containers = config.get('containers', {})
    container_image_name = containers.get('image', container_image_name)

    if not hasattr(ctx.ceph[cluster_name], 'image'):
        ctx.ceph[cluster_name].image = config.get('image')
    ref = None
    if not ctx.ceph[cluster_name].image:
        if not container_image_name:
            raise Exception("Configuration error occurred. "
                            "The 'image' value is undefined for 'cephadm' task. "
                            "Please provide corresponding options in the task's "
                            "config, task 'overrides', or teuthology 'defaults' "
                            "section.")
        sha1 = config.get('sha1')
        flavor = config.get('flavor', 'default')

        if sha1:
            if flavor == "crimson":
                ctx.ceph[cluster_name].image = container_image_name + ':' + sha1 + '-' + flavor
            else:
                ctx.ceph[cluster_name].image = container_image_name + ':' + sha1
            ref = sha1
        else:
            # hmm, fall back to branch?
            branch = config.get('branch', 'master')
            ref = branch
            ctx.ceph[cluster_name].image = container_image_name + ':' + branch
    log.info('Cluster image is %s' % ctx.ceph[cluster_name].image)


    with contextutil.nested(
            #if the cluster is already bootstrapped bypass corresponding methods
            lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
                              else initialize_config(ctx=ctx, config=config),
            lambda: ceph_initial(),
            lambda: normalize_hostnames(ctx=ctx),
            lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
                              else download_cephadm(ctx=ctx, config=config, ref=ref),
            lambda: ceph_log(ctx=ctx, config=config),
            lambda: ceph_crash(ctx=ctx, config=config),
            lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
                              else ceph_bootstrap(ctx, config),
            lambda: crush_setup(ctx=ctx, config=config),
            lambda: ceph_mons(ctx=ctx, config=config),
            lambda: distribute_config_and_admin_keyring(ctx=ctx, config=config),
            lambda: ceph_mgrs(ctx=ctx, config=config),
            lambda: ceph_osds(ctx=ctx, config=config),
            lambda: ceph_mdss(ctx=ctx, config=config),
            lambda: ceph_rgw(ctx=ctx, config=config),
            lambda: ceph_iscsi(ctx=ctx, config=config),
            lambda: ceph_monitoring('prometheus', ctx=ctx, config=config),
            lambda: ceph_monitoring('node-exporter', ctx=ctx, config=config),
            lambda: ceph_monitoring('alertmanager', ctx=ctx, config=config),
            lambda: ceph_monitoring('grafana', ctx=ctx, config=config),
            lambda: ceph_clients(ctx=ctx, config=config),
            lambda: create_rbd_pool(ctx=ctx, config=config),
    ):
        if not hasattr(ctx, 'managers'):
            ctx.managers = {}
        ctx.managers[cluster_name] = CephManager(
            ctx.ceph[cluster_name].bootstrap_remote,
            ctx=ctx,
            logger=log.getChild('ceph_manager.' + cluster_name),
            cluster=cluster_name,
            cephadm=True,
        )

        try:
            if config.get('wait-for-healthy', True):
                healthy(ctx=ctx, config=config)

            log.info('Setup complete, yielding')
            yield

        finally:
            log.info('Teardown begin')