# ceph/qa/tasks/cephadm.py
"""
Ceph cluster task, deployed via cephadm orchestrator
"""
import argparse
import configobj
import contextlib
import json
import logging
import os
import re
import uuid
import yaml

from copy import deepcopy
from io import BytesIO, StringIO
from tarfile import ReadError
from tasks.ceph_manager import CephManager
from teuthology import misc as teuthology
from teuthology import contextutil
from teuthology.orchestra import run
from teuthology.orchestra.daemon import DaemonGroup
from teuthology.config import config as teuth_config
from textwrap import dedent
from tasks.cephfs.filesystem import MDSCluster, Filesystem

# these items we use from ceph.py should probably eventually move elsewhere
from tasks.ceph import get_mons, healthy
from tasks.vip import subst_vip
CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw', 'prometheus']

log = logging.getLogger(__name__)

def _shell(ctx, cluster_name, remote, args, extra_cephadm_args=[], **kwargs):
    teuthology.get_testdir(ctx)
    return remote.run(
        args=[
            'sudo',
            ctx.cephadm,
            '--image', ctx.ceph[cluster_name].image,
            'shell',
            '-c', '/etc/ceph/{}.conf'.format(cluster_name),
            '-k', '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
            '--fsid', ctx.ceph[cluster_name].fsid,
        ] + extra_cephadm_args + [
            '--',
        ] + args,
        **kwargs
    )
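# Usage sketch (names and output handling are illustrative, not part of the
# task flow): _shell() is how the rest of this module drives the ceph CLI
# inside a `cephadm shell` container on a remote, e.g.
#
#   r = _shell(ctx, 'ceph', ctx.ceph['ceph'].bootstrap_remote,
#              ['ceph', 'orch', 'status'], stdout=StringIO())
#   status = r.stdout.getvalue()
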
def build_initial_config(ctx, config):
    cluster_name = config['cluster']

    path = os.path.join(os.path.dirname(__file__), 'cephadm.conf')
    conf = configobj.ConfigObj(path, file_error=True)

    conf.setdefault('global', {})
    conf['global']['fsid'] = ctx.ceph[cluster_name].fsid

    # overrides
    for section, keys in config.get('conf', {}).items():
        for key, value in keys.items():
            log.info(" override: [%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    return conf
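# For reference, the 'conf' overrides consumed above are a mapping of
# section -> {key: value}; a hypothetical task snippet (values illustrative):
#
#   conf:
#     global:
#       osd pool default size: 2
#     mon:
#       mon warn on legacy crush tunables: false
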
def distribute_iscsi_gateway_cfg(ctx, conf_data):
    """
    Distribute a common gateway config to all hosts; iscsi clients use it
    to find the trusted_ip_list.
    """
    log.info('Distributing iscsi-gateway.cfg...')
    for remote, roles in ctx.cluster.remotes.items():
        remote.write_file(
            path='/etc/ceph/iscsi-gateway.cfg',
            data=conf_data,
            sudo=True)
def update_archive_setting(ctx, key, value):
    """
    Record an archive directory (logs, crash dumps, ...) in the job's
    info.yaml file.
    """
    if ctx.archive is None:
        return
    with open(os.path.join(ctx.archive, 'info.yaml'), 'r+') as info_file:
        info_yaml = yaml.safe_load(info_file)
        info_file.seek(0)
        if 'archive' in info_yaml:
            info_yaml['archive'][key] = value
        else:
            info_yaml['archive'] = {key: value}
        yaml.safe_dump(info_yaml, info_file, default_flow_style=False)
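# Effect sketch (layout follows directly from the code above): after
# update_archive_setting(ctx, 'log', '/var/log/ceph') the job's info.yaml
# carries an entry like
#
#   archive:
#     log: /var/log/ceph
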
@contextlib.contextmanager
def normalize_hostnames(ctx):
    """
    Ensure we have short hostnames throughout, for consistency between
    remote.shortname and socket.gethostname() in cephadm.
    """
    log.info('Normalizing hostnames...')
    ctx.cluster.run(args=[
        'sudo',
        'hostname',
        run.Raw('$(hostname -s)'),
    ])

    try:
        yield
    finally:
        pass

@contextlib.contextmanager
def download_cephadm(ctx, config, ref):
    cluster_name = config['cluster']

    if config.get('cephadm_mode') != 'cephadm-package':
        ref = config.get('cephadm_branch', ref)
        git_url = config.get('cephadm_git_url', teuth_config.get_ceph_git_url())
        log.info('Downloading cephadm (repo %s ref %s)...' % (git_url, ref))
        if ctx.config.get('redhat'):
            log.info("Install cephadm using RPM")
            # cephadm already installed from redhat.install task
            ctx.cluster.run(args=[
                'cp',
                run.Raw('$(which cephadm)'),
                ctx.cephadm,
            ])
        elif git_url.startswith('https://github.com/'):
            # git archive doesn't like https:// URLs, which we use with github.
            rest = git_url.split('https://github.com/', 1)[1]
            rest = re.sub(r'\.git/?$', '', rest).strip()  # no .git suffix
            ctx.cluster.run(args=[
                'curl', '--silent',
                'https://raw.githubusercontent.com/' + rest + '/' + ref + '/src/cephadm/cephadm',
                run.Raw('>'),
                ctx.cephadm,
            ])
        else:
            ctx.cluster.run(args=[
                'git', 'archive',
                '--remote=' + git_url,
                ref,
                'src/cephadm/cephadm',
                run.Raw('|'),
                'tar', '-xO', 'src/cephadm/cephadm',
                run.Raw('>'),
                ctx.cephadm,
            ])
        # sanity-check the resulting file and set executable bit
        cephadm_file_size = '$(stat -c%s {})'.format(ctx.cephadm)
        ctx.cluster.run(args=[
            'test', '-s', ctx.cephadm,
            run.Raw('&&'),
            'test', run.Raw(cephadm_file_size), "-gt", run.Raw('1000'),
            run.Raw('&&'),
            'chmod', '+x', ctx.cephadm,
        ])

    try:
        yield
    finally:
        log.info('Removing cluster...')
        ctx.cluster.run(args=[
            'sudo',
            ctx.cephadm,
            'rm-cluster',
            '--fsid', ctx.ceph[cluster_name].fsid,
            '--force',
        ])

        if config.get('cephadm_mode') == 'root':
            log.info('Removing cephadm ...')
            ctx.cluster.run(args=[
                'rm', '-rf', ctx.cephadm,
            ])

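# The download behaviour above is steered by task config keys; a hypothetical
# example (branch and URL are illustrative):
#
#   tasks:
#   - cephadm:
#       cephadm_mode: root            # or 'cephadm-package'
#       cephadm_branch: octopus       # overrides the default ref
#       cephadm_git_url: https://github.com/ceph/ceph
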
@contextlib.contextmanager
def ceph_log(ctx, config):
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    update_archive_setting(ctx, 'log', '/var/log/ceph')

    try:
        yield

    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise

    finally:
        log.info('Checking cluster log for badness...')

        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurrence of the pattern specified in the Ceph log.
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = [
                'sudo',
                'egrep', pattern,
                '/var/log/ceph/{fsid}/ceph.log'.format(
                    fsid=fsid),
            ]
            if excludes:
                for exclude in excludes:
                    args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                run.Raw('|'), 'head', '-n', '1',
            ])
            r = ctx.ceph[cluster_name].bootstrap_remote.run(
                stdout=StringIO(),
                args=args,
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
                             config.get('log-ignorelist')) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log-ignorelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
                            )
                        break

        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph',            # all logs, not just for the cluster
                        '/var/log/rbd-target-api',  # ceph-iscsi
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            try:
                os.makedirs(path)
            except OSError:
                pass
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.shortname)
                try:
                    os.makedirs(sub)
                except OSError:
                    pass
                try:
                    teuthology.pull_directory(remote, '/var/log/ceph',  # everything
                                              os.path.join(sub, 'log'))
                except ReadError:
                    pass

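# The badness scan above honours a 'log-ignorelist' option in the task
# config; an illustrative (hypothetical) snippet:
#
#   tasks:
#   - cephadm:
#       log-ignorelist:
#         - 'overall HEALTH_'
#         - '\(OSD_DOWN\)'
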
@contextlib.contextmanager
def ceph_crash(ctx, config):
    """
    Gather crash dumps from /var/lib/ceph/$fsid/crash
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    update_archive_setting(ctx, 'crash', '/var/lib/ceph/crash')

    try:
        yield

    finally:
        if ctx.archive is not None:
            log.info('Archiving crash dumps...')
            path = os.path.join(ctx.archive, 'remote')
            try:
                os.makedirs(path)
            except OSError:
                pass
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.shortname)
                try:
                    os.makedirs(sub)
                except OSError:
                    pass
                try:
                    teuthology.pull_directory(remote,
                                              '/var/lib/ceph/%s/crash' % fsid,
                                              os.path.join(sub, 'crash'))
                except ReadError:
                    pass

@contextlib.contextmanager
def pull_image(ctx, config):
    cluster_name = config['cluster']
    log.info(f'Pulling image {ctx.ceph[cluster_name].image} on all hosts...')
    ctx.cluster.run(args=[
        'sudo',
        ctx.cephadm,
        '--image', ctx.ceph[cluster_name].image,
        'pull',
    ])

    try:
        yield
    finally:
        pass

@contextlib.contextmanager
def ceph_bootstrap(ctx, config):
    """
    Bootstrap ceph cluster.

    :param ctx: the argparse.Namespace object
    :param config: the config dict
    """
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)
    fsid = ctx.ceph[cluster_name].fsid

    bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
    first_mon = ctx.ceph[cluster_name].first_mon
    first_mon_role = ctx.ceph[cluster_name].first_mon_role
    mons = ctx.ceph[cluster_name].mons

    ctx.cluster.run(args=[
        'sudo', 'mkdir', '-p', '/etc/ceph',
    ])
    ctx.cluster.run(args=[
        'sudo', 'chmod', '777', '/etc/ceph',
    ])
    try:
        # write seed config
        log.info('Writing seed config...')
        conf_fp = BytesIO()
        seed_config = build_initial_config(ctx, config)
        seed_config.write(conf_fp)
        bootstrap_remote.write_file(
            path='{}/seed.{}.conf'.format(testdir, cluster_name),
            data=conf_fp.getvalue())
        log.debug('Final config:\n' + conf_fp.getvalue().decode())
        ctx.ceph[cluster_name].conf = seed_config
        # register initial daemons
        ctx.daemons.register_daemon(
            bootstrap_remote, 'mon', first_mon,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild('mon.' + first_mon),
            wait=False,
            started=True,
        )
        if not ctx.ceph[cluster_name].roleless:
            first_mgr = ctx.ceph[cluster_name].first_mgr
            ctx.daemons.register_daemon(
                bootstrap_remote, 'mgr', first_mgr,
                cluster=cluster_name,
                fsid=fsid,
                logger=log.getChild('mgr.' + first_mgr),
                wait=False,
                started=True,
            )
        # bootstrap
        log.info('Bootstrapping...')
        cmd = [
            'sudo',
            ctx.cephadm,
            '--image', ctx.ceph[cluster_name].image,
            'bootstrap',
            '--fsid', fsid,
            '--config', '{}/seed.{}.conf'.format(testdir, cluster_name),
            '--output-config', '/etc/ceph/{}.conf'.format(cluster_name),
            '--output-keyring',
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
            '--output-pub-ssh-key', '{}/{}.pub'.format(testdir, cluster_name),
        ]

        if config.get('registry-login'):
            registry = config['registry-login']
            cmd += [
                "--registry-url", registry['url'],
                "--registry-username", registry['username'],
                "--registry-password", registry['password'],
            ]

        if not ctx.ceph[cluster_name].roleless:
            cmd += [
                '--mon-id', first_mon,
                '--mgr-id', first_mgr,
                '--orphan-initial-daemons',   # we will do it explicitly!
                '--skip-monitoring-stack',    # we'll provision these explicitly
            ]

        if mons[first_mon_role].startswith('['):
            cmd += ['--mon-addrv', mons[first_mon_role]]
        else:
            cmd += ['--mon-ip', mons[first_mon_role]]
        if config.get('skip_dashboard'):
            cmd += ['--skip-dashboard']
        if config.get('skip_monitoring_stack'):
            cmd += ['--skip-monitoring-stack']
        if config.get('single_host_defaults'):
            cmd += ['--single-host-defaults']
        if not config.get('avoid_pacific_features', False):
            cmd += ['--skip-admin-label']
        # bootstrap makes the keyring root 0600, so +r it for our purposes
        cmd += [
            run.Raw('&&'),
            'sudo', 'chmod', '+r',
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
        ]
        bootstrap_remote.run(args=cmd)

        # fetch keys and configs
        log.info('Fetching config...')
        ctx.ceph[cluster_name].config_file = \
            bootstrap_remote.read_file(f'/etc/ceph/{cluster_name}.conf')
        log.info('Fetching client.admin keyring...')
        ctx.ceph[cluster_name].admin_keyring = \
            bootstrap_remote.read_file(f'/etc/ceph/{cluster_name}.client.admin.keyring')
        log.info('Fetching mon keyring...')
        ctx.ceph[cluster_name].mon_keyring = \
            bootstrap_remote.read_file(f'/var/lib/ceph/{fsid}/mon.{first_mon}/keyring', sudo=True)
        # fetch ssh key, distribute to additional nodes
        log.info('Fetching pub ssh key...')
        ssh_pub_key = bootstrap_remote.read_file(
            f'{testdir}/{cluster_name}.pub').decode('ascii').strip()

        log.info('Installing pub ssh key for root users...')
        ctx.cluster.run(args=[
            'sudo', 'install', '-d', '-m', '0700', '/root/.ssh',
            run.Raw('&&'),
            'echo', ssh_pub_key,
            run.Raw('|'),
            'sudo', 'tee', '-a', '/root/.ssh/authorized_keys',
            run.Raw('&&'),
            'sudo', 'chmod', '0600', '/root/.ssh/authorized_keys',
        ])

        # set options
        if config.get('allow_ptrace', True):
            _shell(ctx, cluster_name, bootstrap_remote,
                   ['ceph', 'config', 'set', 'mgr', 'mgr/cephadm/allow_ptrace', 'true'])

        if not config.get('avoid_pacific_features', False):
            log.info('Distributing conf and client.admin keyring to all hosts + 0755')
            _shell(ctx, cluster_name, bootstrap_remote,
                   ['ceph', 'orch', 'client-keyring', 'set', 'client.admin',
                    '*', '--mode', '0755'])

        # add other hosts
        for remote in ctx.cluster.remotes.keys():
            if remote == bootstrap_remote:
                continue

            # note: this may be redundant (see above), but it avoids
            # us having to wait for cephadm to do it.
            log.info('Writing (initial) conf and keyring to %s' % remote.shortname)
            remote.write_file(
                path='/etc/ceph/{}.conf'.format(cluster_name),
                data=ctx.ceph[cluster_name].config_file)
            remote.write_file(
                path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
                data=ctx.ceph[cluster_name].admin_keyring)

            log.info('Adding host %s to orchestrator...' % remote.shortname)
            _shell(ctx, cluster_name, bootstrap_remote, [
                'ceph', 'orch', 'host', 'add',
                remote.shortname
            ])
            r = _shell(ctx, cluster_name, bootstrap_remote,
                       ['ceph', 'orch', 'host', 'ls', '--format=json'],
                       stdout=StringIO())
            hosts = [node['hostname'] for node in json.loads(r.stdout.getvalue())]
            assert remote.shortname in hosts

        yield

    finally:
        log.info('Cleaning up testdir ceph.* files...')
        ctx.cluster.run(args=[
            'rm', '-f',
            '{}/seed.{}.conf'.format(testdir, cluster_name),
            '{}/{}.pub'.format(testdir, cluster_name),
        ])

        log.info('Stopping all daemons...')

        # this doesn't block until they are all stopped...
        #ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])

        # stop the daemons we know
        for role in ctx.daemons.resolve_role_list(None, CEPH_ROLE_TYPES, True):
            cluster, type_, id_ = teuthology.split_role(role)
            try:
                ctx.daemons.get_daemon(type_, id_, cluster).stop()
            except Exception:
                log.exception(f'Failed to stop "{role}"')
                raise

        # tear down anything left (but leave the logs behind)
        ctx.cluster.run(
            args=[
                'sudo',
                ctx.cephadm,
                'rm-cluster',
                '--fsid', fsid,
                '--force',
                '--keep-logs',
            ],
            check_status=False,  # may fail if upgrading from old cephadm
        )

        # clean up /etc/ceph
        ctx.cluster.run(args=[
            'sudo', 'rm', '-f',
            '/etc/ceph/{}.conf'.format(cluster_name),
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
        ])

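# For orientation, the bootstrap command assembled above renders to something
# like the following (testdir, fsid, image tag, and daemon ids are
# illustrative only):
#
#   sudo <testdir>/cephadm --image quay.io/ceph-ci/ceph:<tag> bootstrap \
#       --fsid <fsid> \
#       --config <testdir>/seed.ceph.conf \
#       --output-config /etc/ceph/ceph.conf \
#       --output-keyring /etc/ceph/ceph.client.admin.keyring \
#       --output-pub-ssh-key <testdir>/ceph.pub \
#       --mon-id a --mgr-id x --mon-ip <ip> \
#       && sudo chmod +r /etc/ceph/ceph.client.admin.keyring
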
@contextlib.contextmanager
def ceph_mons(ctx, config):
    """
    Deploy any additional mons
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        daemons = {}
        if config.get('add_mons_via_daemon_add'):
            # This is the old way of adding mons that works with the (early)
            # octopus cephadm scheduler.
            num_mons = 1
            for remote, roles in ctx.cluster.remotes.items():
                for mon in [r for r in roles
                            if teuthology.is_type('mon', cluster_name)(r)]:
                    c_, _, id_ = teuthology.split_role(mon)
                    if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
                        continue
                    log.info('Adding %s on %s' % (mon, remote.shortname))
                    num_mons += 1
                    _shell(ctx, cluster_name, remote, [
                        'ceph', 'orch', 'daemon', 'add', 'mon',
                        remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_,
                    ])
                    ctx.daemons.register_daemon(
                        remote, 'mon', id_,
                        cluster=cluster_name,
                        fsid=fsid,
                        logger=log.getChild(mon),
                        wait=False,
                        started=True,
                    )
                    daemons[mon] = (remote, id_)

                    with contextutil.safe_while(sleep=1, tries=180) as proceed:
                        while proceed():
                            log.info('Waiting for %d mons in monmap...' % (num_mons))
                            r = _shell(
                                ctx=ctx,
                                cluster_name=cluster_name,
                                remote=remote,
                                args=[
                                    'ceph', 'mon', 'dump', '-f', 'json',
                                ],
                                stdout=StringIO(),
                            )
                            j = json.loads(r.stdout.getvalue())
                            if len(j['mons']) == num_mons:
                                break
        else:
            nodes = []
            for remote, roles in ctx.cluster.remotes.items():
                for mon in [r for r in roles
                            if teuthology.is_type('mon', cluster_name)(r)]:
                    c_, _, id_ = teuthology.split_role(mon)
                    log.info('Adding %s on %s' % (mon, remote.shortname))
                    nodes.append(remote.shortname
                                 + ':' + ctx.ceph[cluster_name].mons[mon]
                                 + '=' + id_)
                    if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
                        continue
                    daemons[mon] = (remote, id_)

            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'apply', 'mon',
                str(len(nodes)) + ';' + ';'.join(nodes)]
            )
            for mon, i in daemons.items():
                remote, id_ = i
                ctx.daemons.register_daemon(
                    remote, 'mon', id_,
                    cluster=cluster_name,
                    fsid=fsid,
                    logger=log.getChild(mon),
                    wait=False,
                    started=True,
                )

            with contextutil.safe_while(sleep=1, tries=180) as proceed:
                while proceed():
                    log.info('Waiting for %d mons in monmap...' % (len(nodes)))
                    r = _shell(
                        ctx=ctx,
                        cluster_name=cluster_name,
                        remote=remote,
                        args=[
                            'ceph', 'mon', 'dump', '-f', 'json',
                        ],
                        stdout=StringIO(),
                    )
                    j = json.loads(r.stdout.getvalue())
                    if len(j['mons']) == len(nodes):
                        break

        # refresh our (final) ceph.conf file
        bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
        log.info('Generating final ceph.conf file...')
        r = _shell(
            ctx=ctx,
            cluster_name=cluster_name,
            remote=bootstrap_remote,
            args=[
                'ceph', 'config', 'generate-minimal-conf',
            ],
            stdout=StringIO(),
        )
        ctx.ceph[cluster_name].config_file = r.stdout.getvalue()

        yield

    finally:
        pass

@contextlib.contextmanager
def ceph_mgrs(ctx, config):
    """
    Deploy any additional mgrs
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        nodes = []
        daemons = {}
        for remote, roles in ctx.cluster.remotes.items():
            for mgr in [r for r in roles
                        if teuthology.is_type('mgr', cluster_name)(r)]:
                c_, _, id_ = teuthology.split_role(mgr)
                log.info('Adding %s on %s' % (mgr, remote.shortname))
                nodes.append(remote.shortname + '=' + id_)
                if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mgr:
                    continue
                daemons[mgr] = (remote, id_)
        if nodes:
            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'apply', 'mgr',
                str(len(nodes)) + ';' + ';'.join(nodes)]
            )
        for mgr, i in daemons.items():
            remote, id_ = i
            ctx.daemons.register_daemon(
                remote, 'mgr', id_,
                cluster=cluster_name,
                fsid=fsid,
                logger=log.getChild(mgr),
                wait=False,
                started=True,
            )

        yield

    finally:
        pass

@contextlib.contextmanager
def ceph_osds(ctx, config):
    """
    Deploy OSDs
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        log.info('Deploying OSDs...')

        # provision OSDs in numeric order
        id_to_remote = {}
        devs_by_remote = {}
        for remote, roles in ctx.cluster.remotes.items():
            devs_by_remote[remote] = teuthology.get_scratch_devices(remote)
            for osd in [r for r in roles
                        if teuthology.is_type('osd', cluster_name)(r)]:
                _, _, id_ = teuthology.split_role(osd)
                id_to_remote[int(id_)] = (osd, remote)

        cur = 0
        for osd_id in sorted(id_to_remote.keys()):
            osd, remote = id_to_remote[osd_id]
            _, _, id_ = teuthology.split_role(osd)
            assert int(id_) == cur
            devs = devs_by_remote[remote]
            assert devs   ## FIXME ##
            dev = devs.pop()
            if all(_ in dev for _ in ('lv', 'vg')):
                short_dev = dev.replace('/dev/', '')
            else:
                short_dev = dev
            log.info('Deploying %s on %s with %s...' % (
                osd, remote.shortname, dev))
            _shell(ctx, cluster_name, remote, [
                'ceph-volume', 'lvm', 'zap', dev])
            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'daemon', 'add', 'osd',
                remote.shortname + ':' + short_dev
            ])
            ctx.daemons.register_daemon(
                remote, 'osd', id_,
                cluster=cluster_name,
                fsid=fsid,
                logger=log.getChild(osd),
                wait=False,
                started=True,
            )
            cur += 1

        if cur == 0:
            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'apply', 'osd', '--all-available-devices',
            ])
            # expect the number of scratch devs
            num_osds = sum(map(len, devs_by_remote.values()))
            assert num_osds
        else:
            # expect the number of OSDs we created
            num_osds = cur

        log.info(f'Waiting for {num_osds} OSDs to come up...')
        with contextutil.safe_while(sleep=1, tries=120) as proceed:
            while proceed():
                p = _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
                           ['ceph', 'osd', 'stat', '-f', 'json'], stdout=StringIO())
                j = json.loads(p.stdout.getvalue())
                if int(j.get('num_up_osds', 0)) == num_osds:
                    break

        if not hasattr(ctx, 'managers'):
            ctx.managers = {}
        ctx.managers[cluster_name] = CephManager(
            ctx.ceph[cluster_name].bootstrap_remote,
            ctx=ctx,
            logger=log.getChild('ceph_manager.' + cluster_name),
            cluster=cluster_name,
            cephadm=True,
        )

        yield

    finally:
        pass

@contextlib.contextmanager
def ceph_mdss(ctx, config):
    """
    Deploy MDSs
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = []
    daemons = {}
    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type('mds', cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
            nodes.append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)
    if nodes:
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', 'mds',
            'all',
            str(len(nodes)) + ';' + ';'.join(nodes)]
        )
    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, 'mds', id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield

@contextlib.contextmanager
def cephfs_setup(ctx, config):
    mdss = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))

    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because requires mon cluster to be up and running
    if len(mdss) > 0:
        log.info('Setting up CephFS filesystem(s)...')
        cephfs_config = config.get('cephfs', {})
        fs_configs = cephfs_config.pop('fs', [{'name': 'cephfs'}])
        set_allow_multifs = len(fs_configs) > 1

        # wait for standbys to become available (slow due to valgrind, perhaps)
        mdsc = MDSCluster(ctx)
        with contextutil.safe_while(sleep=2, tries=150) as proceed:
            while proceed():
                if len(mdsc.get_standby_daemons()) >= len(mdss):
                    break

        for fs_config in fs_configs:
            assert isinstance(fs_config, dict)
            name = fs_config.pop('name')
            temp = deepcopy(cephfs_config)
            teuthology.deep_merge(temp, fs_config)
            fs = Filesystem(ctx, fs_config=temp, name=name, create=True)
            if set_allow_multifs:
                fs.set_allow_multifs()
                set_allow_multifs = False

    yield

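# Hypothetical 'cephfs' block consumed above; everything except the 'fs' list
# is merged into each filesystem's config (values illustrative):
#
#   cephfs:
#     max_mds: 2
#     fs:
#       - name: cephfs
#       - name: cephfs2
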
@contextlib.contextmanager
def ceph_monitoring(daemon_type, ctx, config):
    """
    Deploy prometheus, node-exporter, etc.
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = []
    daemons = {}
    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type(daemon_type, cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
            nodes.append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)
    if nodes:
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', daemon_type,
            str(len(nodes)) + ';' + ';'.join(nodes)]
        )
    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, daemon_type, id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield

942 def ceph_rgw(ctx
, config
):
946 cluster_name
= config
['cluster']
947 fsid
= ctx
.ceph
[cluster_name
].fsid
951 for remote
, roles
in ctx
.cluster
.remotes
.items():
952 for role
in [r
for r
in roles
953 if teuthology
.is_type('rgw', cluster_name
)(r
)]:
954 c_
, _
, id_
= teuthology
.split_role(role
)
955 log
.info('Adding %s on %s' % (role
, remote
.shortname
))
956 svc
= '.'.join(id_
.split('.')[0:2])
959 nodes
[svc
].append(remote
.shortname
+ '=' + id_
)
960 daemons
[role
] = (remote
, id_
)
962 for svc
, nodes
in nodes
.items():
963 _shell(ctx
, cluster_name
, remote
, [
964 'ceph', 'orch', 'apply', 'rgw', svc
,
966 str(len(nodes
)) + ';' + ';'.join(nodes
)]
968 for role
, i
in daemons
.items():
970 ctx
.daemons
.register_daemon(
972 cluster
=cluster_name
,
974 logger
=log
.getChild(role
),
@contextlib.contextmanager
def ceph_iscsi(ctx, config):
    """
    Deploy iSCSI gateways
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = []
    daemons = {}
    ips = []

    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type('iscsi', cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
            nodes.append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)
            ips.append(remote.ip_address)
    trusted_ip_list = ','.join(ips)
    if nodes:
        poolname = 'datapool'
        # ceph osd pool create datapool 3 3 replicated
        _shell(ctx, cluster_name, remote, [
            'ceph', 'osd', 'pool', 'create',
            poolname, '3', '3', 'replicated']
        )

        _shell(ctx, cluster_name, remote, [
            'rbd', 'pool', 'init', poolname]
        )

        # ceph orch apply iscsi datapool (admin)user (admin)password
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', 'iscsi',
            poolname, 'admin', 'admin',
            '--trusted_ip_list', trusted_ip_list,
            '--placement', str(len(nodes)) + ';' + ';'.join(nodes)]
        )

        # used by the iscsi client to identify valid gateway ip's
        conf_data = dedent(f"""
        [config]
        trusted_ip_list = {trusted_ip_list}
        """)
        distribute_iscsi_gateway_cfg(ctx, conf_data)

    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, 'iscsi', id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield

@contextlib.contextmanager
def ceph_clients(ctx, config):
    cluster_name = config['cluster']

    log.info('Setting up client nodes...')
    clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
    for remote, roles_for_host in clients.remotes.items():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
                                                     cluster_name):
            name = teuthology.ceph_role(role)
            client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name,
                                                                name)
            r = _shell(
                ctx=ctx,
                cluster_name=cluster_name,
                remote=remote,
                args=[
                    'ceph', 'auth',
                    'get-or-create', name,
                    'mon', 'allow *',
                    'osd', 'allow *',
                    'mds', 'allow *',
                    'mgr', 'allow *',
                ],
                stdout=StringIO(),
            )
            keyring = r.stdout.getvalue()
            remote.sudo_write_file(client_keyring, keyring, mode='0644')

    yield

@contextlib.contextmanager
def ceph_initial():
    try:
        yield
    finally:
        log.info('Teardown complete')

@contextlib.contextmanager
def stop(ctx, config):
    """
    Stop ceph daemons

    For example::
      tasks:
      - ceph.stop: [mds.*]

      tasks:
      - ceph.stop: [osd.0, osd.2]

      tasks:
      - ceph.stop:
          daemons: [osd.0, osd.2]

    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(
        config.get('daemons', None), CEPH_ROLE_TYPES, True)
    clusters = set()

    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).stop()
        clusters.add(cluster)

#    for cluster in clusters:
#        ctx.ceph[cluster].watchdog.stop()
#        ctx.ceph[cluster].watchdog.join()

    yield

def shell(ctx, config):
    """
    Execute (shell) commands
    """
    cluster_name = config.get('cluster', 'ceph')

    args = []
    for k in config.pop('env', []):
        args.extend(['-e', k + '=' + ctx.config.get(k, '')])
    for k in config.pop('volumes', []):
        args.extend(['-v', k])

    if 'all-roles' in config and len(config) == 1:
        a = config['all-roles']
        roles = teuthology.all_roles(ctx.cluster)
        config = dict((id_, a) for id_ in roles if not id_.startswith('host.'))
    elif 'all-hosts' in config and len(config) == 1:
        a = config['all-hosts']
        roles = teuthology.all_roles(ctx.cluster)
        config = dict((id_, a) for id_ in roles if id_.startswith('host.'))

    for role, cmd in config.items():
        (remote,) = ctx.cluster.only(role).remotes.keys()
        log.info('Running commands on role %s host %s', role, remote.name)
        if isinstance(cmd, list):
            for c in cmd:
                _shell(ctx, cluster_name, remote,
                       ['bash', '-c', subst_vip(ctx, c)],
                       extra_cephadm_args=args)
        else:
            assert isinstance(cmd, str)
            _shell(ctx, cluster_name, remote,
                   ['bash', '-ex', '-c', subst_vip(ctx, cmd)],
                   extra_cephadm_args=args)

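# Hypothetical teuthology usage of this task (role names and commands are
# illustrative); each listed command runs via `cephadm shell` on the host
# that owns the role:
#
#   tasks:
#   - cephadm.shell:
#       host.a:
#         - ceph orch status
#         - ceph orch ps
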
def apply(ctx, config):
    """
    Apply spec(s)

      tasks:
        - cephadm.apply:
            specs:
            - service_type: rgw
              service_id: foo
              spec:
                rgw_frontend_port: 8000
            - service_type: rgw
              service_id: bar
              spec:
                rgw_frontend_port: 9000
    """
    cluster_name = config.get('cluster', 'ceph')

    specs = config.get('specs', [])
    y = subst_vip(ctx, yaml.dump_all(specs))

    log.info(f'Applying spec(s):\n{y}')
    _shell(
        ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
        ['ceph', 'orch', 'apply', '-i', '-'],
        stdin=y,
    )

def wait_for_service(ctx, config):
    """
    Wait for a service to be fully started

      tasks:
        - cephadm.wait_for_service:
            service: rgw.foo
            timeout: 60    # defaults to 300
    """
    cluster_name = config.get('cluster', 'ceph')
    timeout = config.get('timeout', 300)
    service = config.get('service')

    log.info(
        f'Waiting for {cluster_name} service {service} to start (timeout {timeout})...'
    )
    with contextutil.safe_while(sleep=1, tries=timeout) as proceed:
        while proceed():
            r = _shell(
                ctx=ctx,
                cluster_name=cluster_name,
                remote=ctx.ceph[cluster_name].bootstrap_remote,
                args=[
                    'ceph', 'orch', 'ls', '-f', 'json',
                ],
                stdout=StringIO(),
            )
            j = json.loads(r.stdout.getvalue())
            for s in j:
                if s['service_name'] == service:
                    log.info(
                        f"{service} has {s['status']['running']}/{s['status']['size']}"
                    )
                    if s['status']['running'] == s['status']['size']:
                        return

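# The polling loop above inspects `ceph orch ls -f json`; a trimmed,
# illustrative record showing only the fields it consults:
#
#   [{"service_name": "rgw.foo",
#     "status": {"running": 2, "size": 2}}]
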
@contextlib.contextmanager
def tweaked_option(ctx, config):
    """
    Set an option, and then restore it with its original value.

    Note: because of the way tasks are executed/nested, this is not meant to
    be used as a standalone task; otherwise it would restore the tweaked
    option only at the end of the 'tasks' block.
    """
    saved_options = {}
    # we can complicate this when necessary
    options = ['mon-health-to-clog']
    type_, id_ = 'mon', '*'
    cluster = config.get('cluster', 'ceph')
    manager = ctx.managers[cluster]
    get_from = next(teuthology.all_roles_of_type(ctx.cluster, type_))
    for option in options:
        if option not in config:
            continue
        value = 'true' if config[option] else 'false'
        option = option.replace('-', '_')
        old_value = manager.get_config(type_, get_from, option)
        if value != old_value:
            saved_options[option] = old_value
            manager.inject_args(type_, id_, option, value)
    yield
    for option, value in saved_options.items():
        manager.inject_args(type_, id_, option, value)

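# Illustrative use via restart() below (option value hypothetical): a config
# like
#
#   - cephadm.restart:
#       daemons: [mon.a]
#       mon-health-to-clog: false
#
# injects mon_health_to_clog=false for the duration of the restart and then
# restores the previous value.
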
@contextlib.contextmanager
def restart(ctx, config):
    """
    Restart ceph daemons

    For example::
      tasks:
      - ceph.restart: [all]

    For example::
      tasks:
      - ceph.restart: [osd.0, mon.1, mds.*]

    or::

      tasks:
      - ceph.restart:
          daemons: [osd.0, mon.1]
          wait-for-healthy: false
          wait-for-osds-up: true

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(
        config.get('daemons', None), CEPH_ROLE_TYPES, True)
    clusters = set()

    log.info('daemons %s' % daemons)
    with tweaked_option(ctx, config):
        for role in daemons:
            cluster, type_, id_ = teuthology.split_role(role)
            d = ctx.daemons.get_daemon(type_, id_, cluster)
            assert d, 'daemon %s does not exist' % role
            d.stop()
            if type_ == 'osd':
                ctx.managers[cluster].mark_down_osd(id_)
            d.restart()
            clusters.add(cluster)

    if config.get('wait-for-healthy', True):
        for cluster in clusters:
            healthy(ctx=ctx, config=dict(cluster=cluster))
    if config.get('wait-for-osds-up', False):
        for cluster in clusters:
            ctx.managers[cluster].wait_for_all_osds_up()
    yield

@contextlib.contextmanager
def distribute_config_and_admin_keyring(ctx, config):
    """
    Distribute a sufficient config and keyring for clients
    """
    cluster_name = config['cluster']
    log.info('Distributing (final) config and client.admin keyring...')
    for remote, roles in ctx.cluster.remotes.items():
        remote.write_file(
            '/etc/ceph/{}.conf'.format(cluster_name),
            ctx.ceph[cluster_name].config_file,
            sudo=True)
        remote.write_file(
            path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
            data=ctx.ceph[cluster_name].admin_keyring,
            sudo=True)
    try:
        yield
    finally:
        ctx.cluster.run(args=[
            'sudo', 'rm', '-f',
            '/etc/ceph/{}.conf'.format(cluster_name),
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
        ])

@contextlib.contextmanager
def crush_setup(ctx, config):
    cluster_name = config['cluster']

    profile = config.get('crush_tunables', 'default')
    log.info('Setting crush tunables to %s', profile)
    _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
           args=['ceph', 'osd', 'crush', 'tunables', profile])

    yield

@contextlib.contextmanager
def create_rbd_pool(ctx, config):
    if config.get('create_rbd_pool', False):
        cluster_name = config['cluster']
        log.info('Waiting for OSDs to come up')
        teuthology.wait_until_osds_up(
            ctx,
            cluster=ctx.cluster,
            remote=ctx.ceph[cluster_name].bootstrap_remote,
            ceph_cluster=cluster_name,
        )
        log.info('Creating RBD pool')
        _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
               args=['sudo', 'ceph', '--cluster', cluster_name,
                     'osd', 'pool', 'create', 'rbd', '8'])
        _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
               args=['sudo', 'ceph', '--cluster', cluster_name,
                     'osd', 'pool', 'application', 'enable',
                     'rbd', 'rbd', '--yes-i-really-mean-it'
               ])

    yield

@contextlib.contextmanager
def _bypass():
    yield

@contextlib.contextmanager
def initialize_config(ctx, config):
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)

    ctx.ceph[cluster_name].thrashers = []
    # fixme: setup watchdog, ala ceph.py

    ctx.ceph[cluster_name].roleless = False  # see below

    first_ceph_cluster = False
    if not hasattr(ctx, 'daemons'):
        first_ceph_cluster = True

    # cephadm mode?
    if 'cephadm_mode' not in config:
        config['cephadm_mode'] = 'root'
    assert config['cephadm_mode'] in ['root', 'cephadm-package']
    if config['cephadm_mode'] == 'root':
        ctx.cephadm = testdir + '/cephadm'
    else:
        ctx.cephadm = 'cephadm'  # in the path

    if first_ceph_cluster:
        # FIXME: this is global for all clusters
        ctx.daemons = DaemonGroup(
            use_cephadm=ctx.cephadm)

    # uuid
    fsid = str(uuid.uuid1())
    log.info('Cluster fsid is %s' % fsid)
    ctx.ceph[cluster_name].fsid = fsid

    # mon ips
    log.info('Choosing monitor IPs and ports...')
    remotes_and_roles = ctx.cluster.remotes.items()
    ips = [host for (host, port) in
           (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]

    if config.get('roleless', False):
        # mons will be named after hosts
        first_mon = None
        max_mons = config.get('max_mons', 5)
        for remote, _ in remotes_and_roles:
            ctx.cluster.remotes[remote].append('mon.' + remote.shortname)
            if not first_mon:
                first_mon = remote.shortname
                bootstrap_remote = remote
            max_mons -= 1
            if not max_mons:
                break
        log.info('No mon roles; fabricating mons')

    roles = [role_list for (remote, role_list) in ctx.cluster.remotes.items()]

    ctx.ceph[cluster_name].mons = get_mons(
        roles, ips, cluster_name,
        mon_bind_msgr2=config.get('mon_bind_msgr2', True),
        mon_bind_addrvec=config.get('mon_bind_addrvec', True),
    )
    log.info('Monitor IPs: %s' % ctx.ceph[cluster_name].mons)

    if config.get('roleless', False):
        ctx.ceph[cluster_name].roleless = True
        ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
        ctx.ceph[cluster_name].first_mon = first_mon
        ctx.ceph[cluster_name].first_mon_role = 'mon.' + first_mon
    else:
        first_mon_role = sorted(ctx.ceph[cluster_name].mons.keys())[0]
        _, _, first_mon = teuthology.split_role(first_mon_role)
        (bootstrap_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
        log.info('First mon is mon.%s on %s' % (first_mon,
                                                bootstrap_remote.shortname))
        ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
        ctx.ceph[cluster_name].first_mon = first_mon
        ctx.ceph[cluster_name].first_mon_role = first_mon_role

        others = ctx.cluster.remotes[bootstrap_remote]
        mgrs = sorted([r for r in others
                       if teuthology.is_type('mgr', cluster_name)(r)])
        if not mgrs:
            raise RuntimeError('no mgrs on the same host as first mon %s' % first_mon)
        _, _, first_mgr = teuthology.split_role(mgrs[0])
        log.info('First mgr is %s' % (first_mgr))
        ctx.ceph[cluster_name].first_mgr = first_mgr

    yield

@contextlib.contextmanager
def task(ctx, config):
    """
    Deploy ceph cluster using cephadm

    For example, teuthology.yaml can contain the 'defaults' section:

        defaults:
          cephadm:
            containers:
              image: 'quay.io/ceph-ci/ceph'

    Using overrides makes it possible to customize it per run.
    The equivalent 'overrides' section looks like:

        overrides:
          cephadm:
            containers:
              image: 'quay.io/ceph-ci/ceph'
            registry-login:
              url: registry-url
              username: registry-user
              password: registry-password

    :param ctx: the argparse.Namespace object
    :param config: the config dict
    """
    if config is None:
        config = {}

    assert isinstance(config, dict), \
        "task only supports a dictionary for configuration"

    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph', {}))
    teuthology.deep_merge(config, overrides.get('cephadm', {}))
    log.info('Config: ' + str(config))

    # set up cluster context
    if not hasattr(ctx, 'ceph'):
        ctx.ceph = {}
    if 'cluster' not in config:
        config['cluster'] = 'ceph'
    cluster_name = config['cluster']
    if cluster_name not in ctx.ceph:
        ctx.ceph[cluster_name] = argparse.Namespace()
        ctx.ceph[cluster_name].bootstrapped = False

    # image
    teuth_defaults = teuth_config.get('defaults', {})
    cephadm_defaults = teuth_defaults.get('cephadm', {})
    containers_defaults = cephadm_defaults.get('containers', {})
    container_image_name = containers_defaults.get('image', None)

    containers = config.get('containers', {})
    container_image_name = containers.get('image', container_image_name)

    if not hasattr(ctx.ceph[cluster_name], 'image'):
        ctx.ceph[cluster_name].image = config.get('image')
    ref = None
    if not ctx.ceph[cluster_name].image:
        if not container_image_name:
            raise Exception("Configuration error occurred. "
                            "The 'image' value is undefined for 'cephadm' task. "
                            "Please provide corresponding options in the task's "
                            "config, task 'overrides', or teuthology 'defaults' "
                            "section.")
        sha1 = config.get('sha1')
        flavor = config.get('flavor', 'default')

        if sha1:
            if flavor == "crimson":
                ctx.ceph[cluster_name].image = container_image_name + ':' + sha1 + '-' + flavor
            else:
                ctx.ceph[cluster_name].image = container_image_name + ':' + sha1
            ref = sha1
        else:
            # hmm, fall back to branch?
            branch = config.get('branch', 'master')
            ref = branch
            ctx.ceph[cluster_name].image = container_image_name + ':' + branch
    log.info('Cluster image is %s' % ctx.ceph[cluster_name].image)
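    # Illustrative image resolution from the logic above (names made up):
    # with no explicit 'image', containers.image 'quay.io/ceph-ci/ceph' and
    # sha1 'abc123' yield 'quay.io/ceph-ci/ceph:abc123' (or ':abc123-crimson'
    # for the crimson flavor); without a sha1 the tag falls back to the
    # configured branch.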
    with contextutil.nested(
            # if the cluster is already bootstrapped bypass corresponding methods
            lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)
            else initialize_config(ctx=ctx, config=config),
            lambda: ceph_initial(),
            lambda: normalize_hostnames(ctx=ctx),
            lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)
            else download_cephadm(ctx=ctx, config=config, ref=ref),
            lambda: ceph_log(ctx=ctx, config=config),
            lambda: ceph_crash(ctx=ctx, config=config),
            lambda: pull_image(ctx=ctx, config=config),
            lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)
            else ceph_bootstrap(ctx, config),
            lambda: crush_setup(ctx=ctx, config=config),
            lambda: ceph_mons(ctx=ctx, config=config),
            lambda: distribute_config_and_admin_keyring(ctx=ctx, config=config),
            lambda: ceph_mgrs(ctx=ctx, config=config),
            lambda: ceph_osds(ctx=ctx, config=config),
            lambda: ceph_mdss(ctx=ctx, config=config),
            lambda: cephfs_setup(ctx=ctx, config=config),
            lambda: ceph_rgw(ctx=ctx, config=config),
            lambda: ceph_iscsi(ctx=ctx, config=config),
            lambda: ceph_monitoring('prometheus', ctx=ctx, config=config),
            lambda: ceph_monitoring('node-exporter', ctx=ctx, config=config),
            lambda: ceph_monitoring('alertmanager', ctx=ctx, config=config),
            lambda: ceph_monitoring('grafana', ctx=ctx, config=config),
            lambda: ceph_clients(ctx=ctx, config=config),
            lambda: create_rbd_pool(ctx=ctx, config=config),
    ):
        try:
            if config.get('wait-for-healthy', True):
                healthy(ctx=ctx, config=config)

            log.info('Setup complete, yielding')
            yield

        finally:
            log.info('Teardown begin')