1 """
2 Ceph cluster task, deployed via cephadm orchestrator
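A minimal usage sketch (illustrative only -- the image tag is an example;
see task() below for more configuration examples)::

tasks:
- cephadm:
image: quay.io/ceph-ci/ceph:<branch-or-sha1>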
3 """
4 import argparse
5 import configobj
6 import contextlib
7 import errno
8 import logging
9 import os
10 import json
11 import re
12 import uuid
13
14 import six
15 import toml
16 from io import BytesIO
17 from six import StringIO
18 from tarfile import ReadError
19 from tasks.ceph_manager import CephManager
20 from teuthology import misc as teuthology
21 from teuthology import contextutil
22 from teuthology.orchestra import run
23 from teuthology.orchestra.daemon import DaemonGroup
24 from teuthology.config import config as teuth_config
25
26 # these items we use from ceph.py should probably eventually move elsewhere
27 from tasks.ceph import get_mons, healthy
28
29 CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw', 'prometheus']
30
31 log = logging.getLogger(__name__)
32
33
34 def _shell(ctx, cluster_name, remote, args, extra_cephadm_args=[], **kwargs):
35 testdir = teuthology.get_testdir(ctx)
36 return remote.run(
37 args=[
38 'sudo',
39 ctx.cephadm,
40 '--image', ctx.ceph[cluster_name].image,
41 'shell',
42 '-c', '/etc/ceph/{}.conf'.format(cluster_name),
43 '-k', '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
44 '--fsid', ctx.ceph[cluster_name].fsid,
45 ] + extra_cephadm_args + [
46 '--',
47 ] + args,
48 **kwargs
49 )
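# For illustration only: with the default cluster name, the command assembled
# above expands to roughly
#   sudo <ctx.cephadm> --image <image> shell \
#     -c /etc/ceph/ceph.conf -k /etc/ceph/ceph.client.admin.keyring \
#     --fsid <fsid> [extra cephadm args] -- <args...>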
50
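# build_initial_config() folds per-section overrides from the task config into
# the seed cephadm.conf.  The overrides follow the usual teuthology shape, e.g.
# (option names below are illustrative, not required by this task):
#
#   conf:
#     global:
#       debug ms: 1
#     mon:
#       debug mon: 20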
51 def build_initial_config(ctx, config):
52 cluster_name = config['cluster']
53
54 path = os.path.join(os.path.dirname(__file__), 'cephadm.conf')
55 conf = configobj.ConfigObj(path, file_error=True)
56
57 conf.setdefault('global', {})
58 conf['global']['fsid'] = ctx.ceph[cluster_name].fsid
59
60 # overrides
61 for section, keys in config.get('conf',{}).items():
62 for key, value in keys.items():
63 log.info(" override: [%s] %s = %s" % (section, key, value))
64 if section not in conf:
65 conf[section] = {}
66 conf[section][key] = value
67
68 return conf
69
70 @contextlib.contextmanager
71 def normalize_hostnames(ctx):
72 """
73 Ensure we have short hostnames throughout, for consistency between
74 remote.shortname and socket.gethostname() in cephadm.
75 """
76 log.info('Normalizing hostnames...')
77 ctx.cluster.run(args=[
78 'sudo',
79 'hostname',
80 run.Raw('$(hostname -s)'),
81 ])
82
83 try:
84 yield
85 finally:
86 pass
87
88 @contextlib.contextmanager
89 def download_cephadm(ctx, config, ref):
90 cluster_name = config['cluster']
91
92 if config.get('cephadm_mode') != 'cephadm-package':
93 ref = config.get('cephadm_branch', ref)
94 git_url = teuth_config.get_ceph_git_url()
95 log.info('Downloading cephadm (repo %s ref %s)...' % (git_url, ref))
96 if git_url.startswith('https://github.com/'):
97 # git archive doesn't like https:// URLs, which we use with github.
98 rest = git_url.split('https://github.com/', 1)[1]
99 rest = re.sub(r'\.git/?$', '', rest).strip() # no .git suffix
100 ctx.cluster.run(
101 args=[
102 'curl', '--silent',
103 'https://raw.githubusercontent.com/' + rest + '/' + ref + '/src/cephadm/cephadm',
104 run.Raw('>'),
105 ctx.cephadm,
106 run.Raw('&&'),
107 'ls', '-l',
108 ctx.cephadm,
109 ],
110 )
111 else:
112 ctx.cluster.run(
113 args=[
114 'git', 'archive',
115 '--remote=' + git_url,
116 ref,
117 'src/cephadm/cephadm',
118 run.Raw('|'),
119 'tar', '-xO', 'src/cephadm/cephadm',
120 run.Raw('>'),
121 ctx.cephadm,
122 ],
123 )
124 # sanity-check the resulting file and set executable bit
125 cephadm_file_size = '$(stat -c%s {})'.format(ctx.cephadm)
126 ctx.cluster.run(
127 args=[
128 'test', '-s', ctx.cephadm,
129 run.Raw('&&'),
130 'test', run.Raw(cephadm_file_size), "-gt", run.Raw('1000'),
131 run.Raw('&&'),
132 'chmod', '+x', ctx.cephadm,
133 ],
134 )
135
136 try:
137 yield
138 finally:
139 log.info('Removing cluster...')
140 ctx.cluster.run(args=[
141 'sudo',
142 ctx.cephadm,
143 'rm-cluster',
144 '--fsid', ctx.ceph[cluster_name].fsid,
145 '--force',
146 ])
147
148 if config.get('cephadm_mode') == 'root':
149 log.info('Removing cephadm ...')
150 ctx.cluster.run(
151 args=[
152 'rm',
153 '-rf',
154 ctx.cephadm,
155 ],
156 )
157
158 @contextlib.contextmanager
159 def ceph_log(ctx, config):
160 cluster_name = config['cluster']
161 fsid = ctx.ceph[cluster_name].fsid
162
163 try:
164 yield
165
166 except Exception:
167 # we need to know this below
168 ctx.summary['success'] = False
169 raise
170
171 finally:
172 log.info('Checking cluster log for badness...')
173 def first_in_ceph_log(pattern, excludes):
174 """
175 Find the first occurrence of the pattern specified in the Ceph log.
176 Returns None if none found.
177
178 :param pattern: Pattern scanned for.
179 :param excludes: Patterns to ignore.
180 :return: First line of text (or None if not found)
181 """
182 args = [
183 'sudo',
184 'egrep', pattern,
185 '/var/log/ceph/{fsid}/ceph.log'.format(
186 fsid=fsid),
187 ]
188 if excludes:
189 for exclude in excludes:
190 args.extend([run.Raw('|'), 'egrep', '-v', exclude])
191 args.extend([
192 run.Raw('|'), 'head', '-n', '1',
193 ])
194 r = ctx.ceph[cluster_name].bootstrap_remote.run(
195 stdout=StringIO(),
196 args=args,
197 )
198 stdout = r.stdout.getvalue()
199 if stdout != '':
200 return stdout
201 return None
202
203 if first_in_ceph_log(r'\[ERR\]|\[WRN\]|\[SEC\]',
204 config.get('log-whitelist')) is not None:
205 log.warning('Found errors (ERR|WRN|SEC) in cluster log')
206 ctx.summary['success'] = False
207 # use the most severe problem as the failure reason
208 if 'failure_reason' not in ctx.summary:
209 for pattern in [r'\[SEC\]', r'\[ERR\]', r'\[WRN\]']:
210 match = first_in_ceph_log(pattern, config['log-whitelist'])
211 if match is not None:
212 ctx.summary['failure_reason'] = \
213 '"{match}" in cluster log'.format(
214 match=match.rstrip('\n'),
215 )
216 break
217
218 if ctx.archive is not None and \
219 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
220 # and logs
221 log.info('Compressing logs...')
222 run.wait(
223 ctx.cluster.run(
224 args=[
225 'sudo',
226 'find',
227 '/var/log/ceph', # all logs, not just for the cluster
228 '-name',
229 '*.log',
230 '-print0',
231 run.Raw('|'),
232 'sudo',
233 'xargs',
234 '-0',
235 '--no-run-if-empty',
236 '--',
237 'gzip',
238 '--',
239 ],
240 wait=False,
241 ),
242 )
243
244 log.info('Archiving logs...')
245 path = os.path.join(ctx.archive, 'remote')
246 try:
247 os.makedirs(path)
248 except OSError:
249 pass
250 for remote in ctx.cluster.remotes.keys():
251 sub = os.path.join(path, remote.name)
252 try:
253 os.makedirs(sub)
254 except OSError:
255 pass
256 try:
257 teuthology.pull_directory(remote, '/var/log/ceph', # everything
258 os.path.join(sub, 'log'))
259 except ReadError:
260 pass
261
262 @contextlib.contextmanager
263 def ceph_crash(ctx, config):
264 """
265 Gather crash dumps from /var/lib/ceph/$fsid/crash
266 """
267 cluster_name = config['cluster']
268 fsid = ctx.ceph[cluster_name].fsid
269
270 try:
271 yield
272
273 finally:
274 if ctx.archive is not None:
275 log.info('Archiving crash dumps...')
276 path = os.path.join(ctx.archive, 'remote')
277 try:
278 os.makedirs(path)
279 except OSError:
280 pass
281 for remote in ctx.cluster.remotes.keys():
282 sub = os.path.join(path, remote.name)
283 try:
284 os.makedirs(sub)
285 except OSError:
286 pass
287 try:
288 teuthology.pull_directory(remote,
289 '/var/lib/ceph/%s/crash' % fsid,
290 os.path.join(sub, 'crash'))
291 except ReadError:
292 pass
293
294 @contextlib.contextmanager
295 def ceph_bootstrap(ctx, config, registry):
296 """
297 Bootstrap the Ceph cluster; if a registry is provided, set up the
298 containers' registry mirror before bootstrapping.
299
300 :param ctx: the argparse.Namespace object
301 :param config: the config dict
302 :param registry: URL of the containers' mirror registry
303 """
304 cluster_name = config['cluster']
305 testdir = teuthology.get_testdir(ctx)
306 fsid = ctx.ceph[cluster_name].fsid
307
308 bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
309 first_mon = ctx.ceph[cluster_name].first_mon
310 first_mon_role = ctx.ceph[cluster_name].first_mon_role
311 mons = ctx.ceph[cluster_name].mons
312
313 ctx.cluster.run(args=[
314 'sudo', 'mkdir', '-p', '/etc/ceph',
315 ])
316 ctx.cluster.run(args=[
317 'sudo', 'chmod', '777', '/etc/ceph',
318 ])
319 if registry:
320 add_mirror_to_cluster(ctx, registry)
321 try:
322 # write seed config
323 log.info('Writing seed config...')
324 conf_fp = BytesIO()
325 seed_config = build_initial_config(ctx, config)
326 seed_config.write(conf_fp)
327 teuthology.write_file(
328 remote=bootstrap_remote,
329 path='{}/seed.{}.conf'.format(testdir, cluster_name),
330 data=conf_fp.getvalue())
331 log.debug('Final config:\n' + conf_fp.getvalue().decode())
332 ctx.ceph[cluster_name].conf = seed_config
333
334 # register initial daemons
335 ctx.daemons.register_daemon(
336 bootstrap_remote, 'mon', first_mon,
337 cluster=cluster_name,
338 fsid=fsid,
339 logger=log.getChild('mon.' + first_mon),
340 wait=False,
341 started=True,
342 )
343 if not ctx.ceph[cluster_name].roleless:
344 first_mgr = ctx.ceph[cluster_name].first_mgr
345 ctx.daemons.register_daemon(
346 bootstrap_remote, 'mgr', first_mgr,
347 cluster=cluster_name,
348 fsid=fsid,
349 logger=log.getChild('mgr.' + first_mgr),
350 wait=False,
351 started=True,
352 )
353
354 # bootstrap
355 log.info('Bootstrapping...')
356 cmd = [
357 'sudo',
358 ctx.cephadm,
359 '--image', ctx.ceph[cluster_name].image,
360 '-v',
361 'bootstrap',
362 '--fsid', fsid,
363 '--config', '{}/seed.{}.conf'.format(testdir, cluster_name),
364 '--output-config', '/etc/ceph/{}.conf'.format(cluster_name),
365 '--output-keyring',
366 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
367 '--output-pub-ssh-key', '{}/{}.pub'.format(testdir, cluster_name),
368 ]
369 if not ctx.ceph[cluster_name].roleless:
370 cmd += [
371 '--mon-id', first_mon,
372 '--mgr-id', first_mgr,
373 '--orphan-initial-daemons', # we will do it explicitly!
374 '--skip-monitoring-stack', # we'll provision these explicitly
375 ]
376 if mons[first_mon_role].startswith('['):
377 cmd += ['--mon-addrv', mons[first_mon_role]]
378 else:
379 cmd += ['--mon-ip', mons[first_mon_role]]
380 if config.get('skip_dashboard'):
381 cmd += ['--skip-dashboard']
382 # bootstrap makes the keyring root 0600, so +r it for our purposes
383 cmd += [
384 run.Raw('&&'),
385 'sudo', 'chmod', '+r',
386 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
387 ]
388 bootstrap_remote.run(args=cmd)
389
390 # fetch keys and configs
391 log.info('Fetching config...')
392 ctx.ceph[cluster_name].config_file = teuthology.get_file(
393 remote=bootstrap_remote,
394 path='/etc/ceph/{}.conf'.format(cluster_name))
395 log.info('Fetching client.admin keyring...')
396 ctx.ceph[cluster_name].admin_keyring = teuthology.get_file(
397 remote=bootstrap_remote,
398 path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name))
399 log.info('Fetching mon keyring...')
400 ctx.ceph[cluster_name].mon_keyring = teuthology.get_file(
401 remote=bootstrap_remote,
402 path='/var/lib/ceph/%s/mon.%s/keyring' % (fsid, first_mon),
403 sudo=True)
404
405 # fetch ssh key, distribute to additional nodes
406 log.info('Fetching pub ssh key...')
407 ssh_pub_key = teuthology.get_file(
408 remote=bootstrap_remote,
409 path='{}/{}.pub'.format(testdir, cluster_name)
410 ).decode('ascii').strip()
411
412 log.info('Installing pub ssh key for root users...')
413 ctx.cluster.run(args=[
414 'sudo', 'install', '-d', '-m', '0700', '/root/.ssh',
415 run.Raw('&&'),
416 'echo', ssh_pub_key,
417 run.Raw('|'),
418 'sudo', 'tee', '-a', '/root/.ssh/authorized_keys',
419 run.Raw('&&'),
420 'sudo', 'chmod', '0600', '/root/.ssh/authorized_keys',
421 ])
422
423 # set options
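# allow_ptrace asks mgr/cephadm to give daemon containers SYS_PTRACE, so
# debugging tools (gdb, strace, ...) can attach to daemons during tests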
424 _shell(ctx, cluster_name, bootstrap_remote,
425 ['ceph', 'config', 'set', 'mgr', 'mgr/cephadm/allow_ptrace', 'true'])
426
427 # add other hosts
428 for remote in ctx.cluster.remotes.keys():
429 if remote == bootstrap_remote:
430 continue
431 log.info('Writing (initial) conf and keyring to %s' % remote.shortname)
432 teuthology.write_file(
433 remote=remote,
434 path='/etc/ceph/{}.conf'.format(cluster_name),
435 data=ctx.ceph[cluster_name].config_file)
436 teuthology.write_file(
437 remote=remote,
438 path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
439 data=ctx.ceph[cluster_name].admin_keyring)
440
441 log.info('Adding host %s to orchestrator...' % remote.shortname)
442 _shell(ctx, cluster_name, remote, [
443 'ceph', 'orch', 'host', 'add',
444 remote.shortname
445 ])
446 r = _shell(ctx, cluster_name, remote,
447 ['ceph', 'orch', 'host', 'ls', '--format=json'],
448 stdout=StringIO())
449 hosts = [node['hostname'] for node in json.loads(r.stdout.getvalue())]
450 assert remote.shortname in hosts
451
452 yield
453
454 finally:
455 log.info('Cleaning up testdir ceph.* files...')
456 ctx.cluster.run(args=[
457 'rm', '-f',
458 '{}/seed.{}.conf'.format(testdir, cluster_name),
459 '{}/{}.pub'.format(testdir, cluster_name),
460 ])
461
462 log.info('Stopping all daemons...')
463
464 # this doesn't block until they are all stopped...
465 #ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
466
467 # so, stop them individually
468 for role in ctx.daemons.resolve_role_list(None, CEPH_ROLE_TYPES, True):
469 cluster, type_, id_ = teuthology.split_role(role)
470 try:
471 ctx.daemons.get_daemon(type_, id_, cluster).stop()
472 except Exception:
473 log.exception('Failed to stop "{role}"'.format(role=role))
474 raise
475
476 # clean up /etc/ceph
477 ctx.cluster.run(args=[
478 'sudo', 'rm', '-f',
479 '/etc/ceph/{}.conf'.format(cluster_name),
480 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
481 ])
482
483 @contextlib.contextmanager
484 def ceph_mons(ctx, config):
485 """
486 Deploy any additional mons
487 """
488 cluster_name = config['cluster']
489 fsid = ctx.ceph[cluster_name].fsid
490 num_mons = 1
491
492 try:
493 for remote, roles in ctx.cluster.remotes.items():
494 for mon in [r for r in roles
495 if teuthology.is_type('mon', cluster_name)(r)]:
496 c_, _, id_ = teuthology.split_role(mon)
497 if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
498 continue
499 log.info('Adding %s on %s' % (mon, remote.shortname))
500 num_mons += 1
501 _shell(ctx, cluster_name, remote, [
502 'ceph', 'orch', 'daemon', 'add', 'mon',
503 remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_,
504 ])
505 ctx.daemons.register_daemon(
506 remote, 'mon', id_,
507 cluster=cluster_name,
508 fsid=fsid,
509 logger=log.getChild(mon),
510 wait=False,
511 started=True,
512 )
513
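# wait (up to ~180s: 180 tries, 1s apart) for all expected mons to appear in the monmap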
514 with contextutil.safe_while(sleep=1, tries=180) as proceed:
515 while proceed():
516 log.info('Waiting for %d mons in monmap...' % (num_mons))
517 r = _shell(
518 ctx=ctx,
519 cluster_name=cluster_name,
520 remote=remote,
521 args=[
522 'ceph', 'mon', 'dump', '-f', 'json',
523 ],
524 stdout=StringIO(),
525 )
526 j = json.loads(r.stdout.getvalue())
527 if len(j['mons']) == num_mons:
528 break
529
530 # refresh our (final) ceph.conf file
531 log.info('Generating final ceph.conf file...')
532 r = _shell(
533 ctx=ctx,
534 cluster_name=cluster_name,
535 remote=remote,
536 args=[
537 'ceph', 'config', 'generate-minimal-conf',
538 ],
539 stdout=StringIO(),
540 )
541 ctx.ceph[cluster_name].config_file = r.stdout.getvalue()
542
543 yield
544
545 finally:
546 pass
547
548 @contextlib.contextmanager
549 def ceph_mgrs(ctx, config):
550 """
551 Deploy any additional mgrs
552 """
553 cluster_name = config['cluster']
554 fsid = ctx.ceph[cluster_name].fsid
555
556 try:
557 nodes = []
558 daemons = {}
559 for remote, roles in ctx.cluster.remotes.items():
560 for mgr in [r for r in roles
561 if teuthology.is_type('mgr', cluster_name)(r)]:
562 c_, _, id_ = teuthology.split_role(mgr)
563 if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mgr:
564 continue
565 log.info('Adding %s on %s' % (mgr, remote.shortname))
566 nodes.append(remote.shortname + '=' + id_)
567 daemons[mgr] = (remote, id_)
568 if nodes:
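# the placement spec below is '<count>;<host>=<id>;...'; the count is
# len(nodes) + 1 because the bootstrap mgr is already running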
569 _shell(ctx, cluster_name, remote, [
570 'ceph', 'orch', 'apply', 'mgr',
571 str(len(nodes) + 1) + ';' + ';'.join(nodes)]
572 )
573 for mgr, i in daemons.items():
574 remote, id_ = i
575 ctx.daemons.register_daemon(
576 remote, 'mgr', id_,
577 cluster=cluster_name,
578 fsid=fsid,
579 logger=log.getChild(mgr),
580 wait=False,
581 started=True,
582 )
583
584 yield
585
586 finally:
587 pass
588
589 @contextlib.contextmanager
590 def ceph_osds(ctx, config):
591 """
592 Deploy OSDs
593 """
594 cluster_name = config['cluster']
595 fsid = ctx.ceph[cluster_name].fsid
596
597 try:
598 log.info('Deploying OSDs...')
599
600 # provision OSDs in numeric order
601 id_to_remote = {}
602 devs_by_remote = {}
603 for remote, roles in ctx.cluster.remotes.items():
604 devs_by_remote[remote] = teuthology.get_scratch_devices(remote)
605 for osd in [r for r in roles
606 if teuthology.is_type('osd', cluster_name)(r)]:
607 _, _, id_ = teuthology.split_role(osd)
608 id_to_remote[int(id_)] = (osd, remote)
609
610 cur = 0
611 for osd_id in sorted(id_to_remote.keys()):
612 osd, remote = id_to_remote[osd_id]
613 _, _, id_ = teuthology.split_role(osd)
614 assert int(id_) == cur
615 devs = devs_by_remote[remote]
616 assert devs ## FIXME ##
617 dev = devs.pop()
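# devices that look like LVM volumes ('vg'/'lv' in the name) are passed to
# the orchestrator without the /dev/ prefix; raw disks keep their full path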
618 if all(_ in dev for _ in ('lv', 'vg')):
619 short_dev = dev.replace('/dev/', '')
620 else:
621 short_dev = dev
622 log.info('Deploying %s on %s with %s...' % (
623 osd, remote.shortname, dev))
624 _shell(ctx, cluster_name, remote, [
625 'ceph-volume', 'lvm', 'zap', dev])
626 _shell(ctx, cluster_name, remote, [
627 'ceph', 'orch', 'daemon', 'add', 'osd',
628 remote.shortname + ':' + short_dev
629 ])
630 ctx.daemons.register_daemon(
631 remote, 'osd', id_,
632 cluster=cluster_name,
633 fsid=fsid,
634 logger=log.getChild(osd),
635 wait=False,
636 started=True,
637 )
638 cur += 1
639
640 yield
641 finally:
642 pass
643
644 @contextlib.contextmanager
645 def ceph_mdss(ctx, config):
646 """
647 Deploy MDSs
648 """
649 cluster_name = config['cluster']
650 fsid = ctx.ceph[cluster_name].fsid
651
652 nodes = []
653 daemons = {}
654 for remote, roles in ctx.cluster.remotes.items():
655 for role in [r for r in roles
656 if teuthology.is_type('mds', cluster_name)(r)]:
657 c_, _, id_ = teuthology.split_role(role)
658 log.info('Adding %s on %s' % (role, remote.shortname))
659 nodes.append(remote.shortname + '=' + id_)
660 daemons[role] = (remote, id_)
661 if nodes:
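# 'all' is the filesystem/service name handed to 'ceph orch apply mds',
# followed by a '<count>;<host>=<id>;...' placement spec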
662 _shell(ctx, cluster_name, remote, [
663 'ceph', 'orch', 'apply', 'mds',
664 'all',
665 str(len(nodes)) + ';' + ';'.join(nodes)]
666 )
667 for role, i in daemons.items():
668 remote, id_ = i
669 ctx.daemons.register_daemon(
670 remote, 'mds', id_,
671 cluster=cluster_name,
672 fsid=fsid,
673 logger=log.getChild(role),
674 wait=False,
675 started=True,
676 )
677
678 yield
679
680 @contextlib.contextmanager
681 def ceph_monitoring(daemon_type, ctx, config):
682 """
683 Deploy prometheus, node-exporter, etc.
684 """
685 cluster_name = config['cluster']
686 fsid = ctx.ceph[cluster_name].fsid
687
688 nodes = []
689 daemons = {}
690 for remote, roles in ctx.cluster.remotes.items():
691 for role in [r for r in roles
692 if teuthology.is_type(daemon_type, cluster_name)(r)]:
693 c_, _, id_ = teuthology.split_role(role)
694 log.info('Adding %s on %s' % (role, remote.shortname))
695 nodes.append(remote.shortname + '=' + id_)
696 daemons[role] = (remote, id_)
697 if nodes:
698 _shell(ctx, cluster_name, remote, [
699 'ceph', 'orch', 'apply', daemon_type,
700 str(len(nodes)) + ';' + ';'.join(nodes)]
701 )
702 for role, i in daemons.items():
703 remote, id_ = i
704 ctx.daemons.register_daemon(
705 remote, daemon_type, id_,
706 cluster=cluster_name,
707 fsid=fsid,
708 logger=log.getChild(role),
709 wait=False,
710 started=True,
711 )
712
713 yield
714
715 @contextlib.contextmanager
716 def ceph_rgw(ctx, config):
717 """
718 Deploy rgw
719 """
720 cluster_name = config['cluster']
721 fsid = ctx.ceph[cluster_name].fsid
722
723 nodes = {}
724 daemons = {}
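# rgw daemon ids are expected to encode '<realm>.<zone>' in their first two
# dot-separated components (e.g. a role named rgw.myrealm.myzone.a -- an
# illustrative name); daemons sharing a realm.zone pair are applied together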
725 for remote, roles in ctx.cluster.remotes.items():
726 for role in [r for r in roles
727 if teuthology.is_type('rgw', cluster_name)(r)]:
728 c_, _, id_ = teuthology.split_role(role)
729 log.info('Adding %s on %s' % (role, remote.shortname))
730 realmzone = '.'.join(id_.split('.')[0:2])
731 if realmzone not in nodes:
732 nodes[realmzone] = []
733 nodes[realmzone].append(remote.shortname + '=' + id_)
734 daemons[role] = (remote, id_)
735
736 for realmzone in nodes.keys():
737 (realm, zone) = realmzone.split('.', 1)
738
739 # TODO: those should be moved to mgr/cephadm
740 _shell(ctx, cluster_name, remote,
741 ['radosgw-admin', 'realm', 'create', '--rgw-realm', realm, '--default']
742 )
743 _shell(ctx, cluster_name, remote,
744 ['radosgw-admin', 'zonegroup', 'create', '--rgw-zonegroup=default', '--master', '--default']
745 )
746 _shell(ctx, cluster_name, remote,
747 ['radosgw-admin', 'zone', 'create', '--rgw-zonegroup=default', '--rgw-zone', zone, '--master', '--default']
748 )
749
750 for realmzone, placements in nodes.items():
751 (realm, zone) = realmzone.split('.', 1)
752 _shell(ctx, cluster_name, remote, [
753 'ceph', 'orch', 'apply', 'rgw', realm, zone,
754 '--placement',
755 str(len(placements)) + ';' + ';'.join(placements)]
756 )
757 for role, i in daemons.items():
758 remote, id_ = i
759 ctx.daemons.register_daemon(
760 remote, 'rgw', id_,
761 cluster=cluster_name,
762 fsid=fsid,
763 logger=log.getChild(role),
764 wait=False,
765 started=True,
766 )
767
768 yield
769
770 @contextlib.contextmanager
771 def ceph_clients(ctx, config):
772 cluster_name = config['cluster']
773 testdir = teuthology.get_testdir(ctx)
774
775 log.info('Setting up client nodes...')
776 clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
777 testdir = teuthology.get_testdir(ctx)
778 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
779 for remote, roles_for_host in clients.remotes.items():
780 for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
781 cluster_name):
782 name = teuthology.ceph_role(role)
783 client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name,
784 name)
785 r = _shell(
786 ctx=ctx,
787 cluster_name=cluster_name,
788 remote=remote,
789 args=[
790 'ceph', 'auth',
791 'get-or-create', name,
792 'mon', 'allow *',
793 'osd', 'allow *',
794 'mds', 'allow *',
795 'mgr', 'allow *',
796 ],
797 stdout=StringIO(),
798 )
799 keyring = r.stdout.getvalue()
800 teuthology.sudo_write_file(
801 remote=remote,
802 path=client_keyring,
803 data=keyring,
804 perms='0644'
805 )
806 yield
807
808 @contextlib.contextmanager
809 def ceph_initial():
810 try:
811 yield
812 finally:
813 log.info('Teardown complete')
814
815 ## public methods
816 @contextlib.contextmanager
817 def stop(ctx, config):
818 """
819 Stop ceph daemons
820
821 For example::
822 tasks:
823 - ceph.stop: [mds.*]
824
825 tasks:
826 - ceph.stop: [osd.0, osd.2]
827
828 tasks:
829 - ceph.stop:
830 daemons: [osd.0, osd.2]
831
832 """
833 if config is None:
834 config = {}
835 elif isinstance(config, list):
836 config = {'daemons': config}
837
838 daemons = ctx.daemons.resolve_role_list(
839 config.get('daemons', None), CEPH_ROLE_TYPES, True)
840 clusters = set()
841
842 for role in daemons:
843 cluster, type_, id_ = teuthology.split_role(role)
844 ctx.daemons.get_daemon(type_, id_, cluster).stop()
845 clusters.add(cluster)
846
847 # for cluster in clusters:
848 # ctx.ceph[cluster].watchdog.stop()
849 # ctx.ceph[cluster].watchdog.join()
850
851 yield
852
853 def shell(ctx, config):
854 """
855 Execute (shell) commands
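
For example (an illustrative sketch -- 'mon.a' stands in for any role present
in the job)::

tasks:
- cephadm.shell:
mon.a:
- ceph orch host ls
- ceph health

An optional 'env' list names teuthology config keys whose values are passed
into the shell as environment variables; an 'all' key runs the same commands
once for every role.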
856 """
857 cluster_name = config.get('cluster', 'ceph')
858
859 env = []
860 if 'env' in config:
861 for k in config['env']:
862 env.extend(['-e', k + '=' + ctx.config.get(k, '')])
863 del config['env']
864
865 if 'all' in config and len(config) == 1:
866 a = config['all']
867 roles = teuthology.all_roles(ctx.cluster)
868 config = dict((id_, a) for id_ in roles)
869
870 for role, ls in config.items():
871 (remote,) = ctx.cluster.only(role).remotes.keys()
872 log.info('Running commands on role %s host %s', role, remote.name)
873 for c in ls:
874 _shell(ctx, cluster_name, remote,
875 ['bash', '-c', c],
876 extra_cephadm_args=env)
877
878 @contextlib.contextmanager
879 def tweaked_option(ctx, config):
880 """
881 Set an option, and then restore its original value when done.
882
883 Note: due to the way tasks are executed/nested, it is not recommended to
884 use this method as a standalone task; otherwise it is likely to restore
885 the tweaked option only at the /end/ of the 'tasks' block.
886 """
887 saved_options = {}
888 # we can complicate this when necessary
889 options = ['mon-health-to-clog']
890 type_, id_ = 'mon', '*'
891 cluster = config.get('cluster', 'ceph')
892 manager = ctx.managers[cluster]
893 if id_ == '*':
894 get_from = next(teuthology.all_roles_of_type(ctx.cluster, type_))
895 else:
896 get_from = id_
897 for option in options:
898 if option not in config:
899 continue
900 value = 'true' if config[option] else 'false'
901 option = option.replace('-', '_')
902 old_value = manager.get_config(type_, get_from, option)
903 if value != old_value:
904 saved_options[option] = old_value
905 manager.inject_args(type_, id_, option, value)
906 yield
907 for option, value in saved_options.items():
908 manager.inject_args(type_, id_, option, value)
909
910 @contextlib.contextmanager
911 def restart(ctx, config):
912 """
913 restart ceph daemons
914
915 For example::
916 tasks:
917 - ceph.restart: [all]
918
919 For example::
920 tasks:
921 - ceph.restart: [osd.0, mon.1, mds.*]
922
923 or::
924
925 tasks:
926 - ceph.restart:
927 daemons: [osd.0, mon.1]
928 wait-for-healthy: false
929 wait-for-osds-up: true
930
931 :param ctx: Context
932 :param config: Configuration
933 """
934 if config is None:
935 config = {}
936 elif isinstance(config, list):
937 config = {'daemons': config}
938
939 daemons = ctx.daemons.resolve_role_list(
940 config.get('daemons', None), CEPH_ROLE_TYPES, True)
941 clusters = set()
942
943 log.info('daemons %s' % daemons)
944 with tweaked_option(ctx, config):
945 for role in daemons:
946 cluster, type_, id_ = teuthology.split_role(role)
947 d = ctx.daemons.get_daemon(type_, id_, cluster)
948 assert d, 'daemon %s does not exist' % role
949 d.stop()
950 if type_ == 'osd':
951 ctx.managers[cluster].mark_down_osd(id_)
952 d.restart()
953 clusters.add(cluster)
954
955 if config.get('wait-for-healthy', True):
956 for cluster in clusters:
957 healthy(ctx=ctx, config=dict(cluster=cluster))
958 if config.get('wait-for-osds-up', False):
959 for cluster in clusters:
960 ctx.managers[cluster].wait_for_all_osds_up()
961 yield
962
963 @contextlib.contextmanager
964 def distribute_config_and_admin_keyring(ctx, config):
965 """
966 Distribute a sufficient config and keyring for clients
967 """
968 cluster_name = config['cluster']
969 log.info('Distributing (final) config and client.admin keyring...')
970 for remote, roles in ctx.cluster.remotes.items():
971 teuthology.sudo_write_file(
972 remote=remote,
973 path='/etc/ceph/{}.conf'.format(cluster_name),
974 data=ctx.ceph[cluster_name].config_file)
975 teuthology.sudo_write_file(
976 remote=remote,
977 path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
978 data=ctx.ceph[cluster_name].admin_keyring)
979 try:
980 yield
981 finally:
982 ctx.cluster.run(args=[
983 'sudo', 'rm', '-f',
984 '/etc/ceph/{}.conf'.format(cluster_name),
985 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
986 ])
987
988 @contextlib.contextmanager
989 def crush_setup(ctx, config):
990 cluster_name = config['cluster']
991
992 profile = config.get('crush_tunables', 'default')
993 log.info('Setting crush tunables to %s', profile)
994 _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
995 args=['ceph', 'osd', 'crush', 'tunables', profile])
996 yield
997
998 @contextlib.contextmanager
999 def _bypass():
1000 yield
1001
1002 @contextlib.contextmanager
1003 def initialize_config(ctx, config):
1004 cluster_name = config['cluster']
1005 testdir = teuthology.get_testdir(ctx)
1006
1007 ctx.ceph[cluster_name].thrashers = []
1008 # fixme: setup watchdog, ala ceph.py
1009
1010 ctx.ceph[cluster_name].roleless = False # see below
1011
1012 first_ceph_cluster = False
1013 if not hasattr(ctx, 'daemons'):
1014 first_ceph_cluster = True
1015
1016 # cephadm mode?
1017 if 'cephadm_mode' not in config:
1018 config['cephadm_mode'] = 'root'
1019 assert config['cephadm_mode'] in ['root', 'cephadm-package']
1020 if config['cephadm_mode'] == 'root':
1021 ctx.cephadm = testdir + '/cephadm'
1022 else:
1023 ctx.cephadm = 'cephadm' # in the path
1024
1025 if first_ceph_cluster:
1026 # FIXME: this is global for all clusters
1027 ctx.daemons = DaemonGroup(
1028 use_cephadm=ctx.cephadm)
1029
1030 # uuid
1031 fsid = str(uuid.uuid1())
1032 log.info('Cluster fsid is %s' % fsid)
1033 ctx.ceph[cluster_name].fsid = fsid
1034
1035 # mon ips
1036 log.info('Choosing monitor IPs and ports...')
1037 remotes_and_roles = ctx.cluster.remotes.items()
1038 ips = [host for (host, port) in
1039 (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
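# each mon IP is the peer address of that node's ssh connection, as seen from
# the teuthology host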
1040
1041 if config.get('roleless', False):
1042 # mons will be named after hosts
1043 first_mon = None
1044 for remote, _ in remotes_and_roles:
1045 ctx.cluster.remotes[remote].append('mon.' + remote.shortname)
1046 if not first_mon:
1047 first_mon = remote.shortname
1048 bootstrap_remote = remote
1049 log.info('No mon roles; fabricating mons')
1050
1051 roles = [role_list for (remote, role_list) in ctx.cluster.remotes.items()]
1052
1053 ctx.ceph[cluster_name].mons = get_mons(
1054 roles, ips, cluster_name,
1055 mon_bind_msgr2=config.get('mon_bind_msgr2', True),
1056 mon_bind_addrvec=config.get('mon_bind_addrvec', True),
1057 )
1058 log.info('Monitor IPs: %s' % ctx.ceph[cluster_name].mons)
1059
1060 if config.get('roleless', False):
1061 ctx.ceph[cluster_name].roleless = True
1062 ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
1063 ctx.ceph[cluster_name].first_mon = first_mon
1064 ctx.ceph[cluster_name].first_mon_role = 'mon.' + first_mon
1065 else:
1066 first_mon_role = sorted(ctx.ceph[cluster_name].mons.keys())[0]
1067 _, _, first_mon = teuthology.split_role(first_mon_role)
1068 (bootstrap_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
1069 log.info('First mon is mon.%s on %s' % (first_mon,
1070 bootstrap_remote.shortname))
1071 ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
1072 ctx.ceph[cluster_name].first_mon = first_mon
1073 ctx.ceph[cluster_name].first_mon_role = first_mon_role
1074
1075 others = ctx.cluster.remotes[bootstrap_remote]
1076 mgrs = sorted([r for r in others
1077 if teuthology.is_type('mgr', cluster_name)(r)])
1078 if not mgrs:
1079 raise RuntimeError('no mgrs on the same host as first mon %s' % first_mon)
1080 _, _, first_mgr = teuthology.split_role(mgrs[0])
1081 log.info('First mgr is %s' % (first_mgr))
1082 ctx.ceph[cluster_name].first_mgr = first_mgr
1083 yield
1084
1085 @contextlib.contextmanager
1086 def task(ctx, config):
1087 """
1088 Deploy ceph cluster using cephadm
1089
1090 Set up the containers' registry mirrors before the bootstrap, if the
1091 corresponding config is provided in the teuthology server config yaml file.
1092
1093 For example, teuthology.yaml can contain the 'defaults' section:
1094
1095 defaults:
1096 cephadm:
1097 containers:
1098 registry_mirrors:
1099 docker.io: 'registry.mirror.example.com:5000'
1100 image: 'quay.io/ceph-ci/ceph'
1101
1102 Using overrides makes it possible to customize it per run.
1103 The equivalent 'overrides' section looks like:
1104
1105 overrides:
1106 cephadm:
1107 containers:
1108 registry_mirrors:
1109 docker.io: 'registry.mirror.example.com:5000'
1110 image: 'quay.io/ceph-ci/ceph'
1111
1112 :param ctx: the argparse.Namespace object
1113 :param config: the config dict
1114 """
1115 if config is None:
1116 config = {}
1117
1118 assert isinstance(config, dict), \
1119 "task only supports a dictionary for configuration"
1120
1121 overrides = ctx.config.get('overrides', {})
1122 teuthology.deep_merge(config, overrides.get('ceph', {}))
1123 teuthology.deep_merge(config, overrides.get('cephadm', {}))
1124 log.info('Config: ' + str(config))
1125
1126 testdir = teuthology.get_testdir(ctx)
1127
1128 # set up cluster context
1129 if not hasattr(ctx, 'ceph'):
1130 ctx.ceph = {}
1131 ctx.managers = {}
1132 if 'cluster' not in config:
1133 config['cluster'] = 'ceph'
1134 cluster_name = config['cluster']
1135 if cluster_name not in ctx.ceph:
1136 ctx.ceph[cluster_name] = argparse.Namespace()
1137 ctx.ceph[cluster_name].bootstrapped = False
1138
1139 # image
1140 teuth_defaults = teuth_config.get('defaults', {})
1141 cephadm_defaults = teuth_defaults.get('cephadm', {})
1142 containers_defaults = cephadm_defaults.get('containers', {})
1143 mirrors_defaults = containers_defaults.get('registry_mirrors', {})
1144 container_registry_mirror = mirrors_defaults.get('docker.io', None)
1145 container_image_name = containers_defaults.get('image', None)
1146
1147 containers = config.get('containers', {})
1148 mirrors = containers.get('registry_mirrors', {})
1149 container_image_name = containers.get('image', container_image_name)
1150 container_registry_mirror = mirrors.get('docker.io',
1151 container_registry_mirror)
1152
1153 if not container_image_name:
1154 raise Exception("Configuration error occurred. "
1155 "The 'image' value is undefined for 'cephadm' task. "
1156 "Please provide corresponding options in the task's "
1157 "config, task 'overrides', or teuthology 'defaults' "
1158 "section.")
1159
1160 if not hasattr(ctx.ceph[cluster_name], 'image'):
1161 ctx.ceph[cluster_name].image = config.get('image')
1162 ref = None
1163 if not ctx.ceph[cluster_name].image:
1164 sha1 = config.get('sha1')
1165 if sha1:
1166 ctx.ceph[cluster_name].image = container_image_name + ':' + sha1
1167 ref = sha1
1168 else:
1169 # hmm, fall back to branch?
1170 branch = config.get('branch', 'master')
1171 ref = branch
1172 ctx.ceph[cluster_name].image = container_image_name + ':' + branch
1173 log.info('Cluster image is %s' % ctx.ceph[cluster_name].image)
1174
1175
1176 with contextutil.nested(
1177 # if the cluster is already bootstrapped, bypass the corresponding methods
1178 lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
1179 else initialize_config(ctx=ctx, config=config),
1180 lambda: ceph_initial(),
1181 lambda: normalize_hostnames(ctx=ctx),
1182 lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
1183 else download_cephadm(ctx=ctx, config=config, ref=ref),
1184 lambda: ceph_log(ctx=ctx, config=config),
1185 lambda: ceph_crash(ctx=ctx, config=config),
1186 lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
1187 else ceph_bootstrap(ctx, config,
1188 container_registry_mirror),
1189 lambda: crush_setup(ctx=ctx, config=config),
1190 lambda: ceph_mons(ctx=ctx, config=config),
1191 lambda: distribute_config_and_admin_keyring(ctx=ctx, config=config),
1192 lambda: ceph_mgrs(ctx=ctx, config=config),
1193 lambda: ceph_osds(ctx=ctx, config=config),
1194 lambda: ceph_mdss(ctx=ctx, config=config),
1195 lambda: ceph_rgw(ctx=ctx, config=config),
1196 lambda: ceph_monitoring('prometheus', ctx=ctx, config=config),
1197 lambda: ceph_monitoring('node-exporter', ctx=ctx, config=config),
1198 lambda: ceph_monitoring('alertmanager', ctx=ctx, config=config),
1199 lambda: ceph_monitoring('grafana', ctx=ctx, config=config),
1200 lambda: ceph_clients(ctx=ctx, config=config),
1201 ):
1202 ctx.managers[cluster_name] = CephManager(
1203 ctx.ceph[cluster_name].bootstrap_remote,
1204 ctx=ctx,
1205 logger=log.getChild('ceph_manager.' + cluster_name),
1206 cluster=cluster_name,
1207 cephadm=True,
1208 )
1209
1210 try:
1211 if config.get('wait-for-healthy', True):
1212 healthy(ctx=ctx, config=config)
1213
1214 log.info('Setup complete, yielding')
1215 yield
1216
1217 finally:
1218 log.info('Teardown begin')
1219
1220
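# registries.conf comes in two flavors.  The legacy v1 format looks like
#   [registries.search]
#   registries = ['docker.io', ...]
# while v2 uses 'unqualified-search-registries' plus [[registry]] tables.
# v1 input is translated to v2 below before the mirror is attached to the
# docker.io entry.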
1221 def registries_add_mirror_to_docker_io(conf, mirror):
1222 config = toml.loads(conf)
1223 is_v1 = 'registries' in config
1224 if is_v1:
1225 search = config.get('registries', {}).get('search', {}).get('registries', [])
1226 insecure = config.get('registries', {}).get('search', {}).get('insecure', [])
1227 # (the annotated form "v2: MutableMapping[str, Any] = {...}" needs Python 3)
1228 v2 = {
1229 'unqualified-search-registries': search,
1230 'registry': [
1231 {
1232 'prefix': reg,
1233 'location': reg,
1234 'insecure': reg in insecure,
1235 'blocked': False,
1236 } for reg in search
1237 ]
1238 }
1239 else:
1240 v2 = config # type: ignore
1241 dockers = [r for r in v2['registry'] if r['prefix'] == 'docker.io']
1242 if dockers:
1243 docker = dockers[0]
1244 docker['mirror'] = [{
1245 "location": mirror,
1246 "insecure": True,
1247 }]
1248 return v2
1249
1250
1251 def add_mirror_to_cluster(ctx, mirror):
1252 log.info('Adding local image mirror %s' % mirror)
1253
1254 registries_conf = '/etc/containers/registries.conf'
1255
1256 for remote in ctx.cluster.remotes.keys():
1257 try:
1258 config = teuthology.get_file(
1259 remote=remote,
1260 path=registries_conf
1261 )
1262 new_config = toml.dumps(registries_add_mirror_to_docker_io(config.decode('utf-8'), mirror))
1263
1264 teuthology.sudo_write_file(
1265 remote=remote,
1266 path=registries_conf,
1267 data=six.ensure_str(new_config),
1268 )
1269 except IOError as e: # py3: use FileNotFoundError instead.
1270 if e.errno != errno.ENOENT:
1271 raise
1272
1273 # Docker doesn't ship a registries.conf
1274 log.info('Failed to add mirror: %s' % str(e))