# ceph/qa/tasks/cephadm.py
"""
Ceph cluster task, deployed via cephadm orchestrator
"""
import argparse
import configobj
import contextlib
import json
import logging
import os
import re
import uuid
import yaml

from io import BytesIO, StringIO
from tarfile import ReadError
from tasks.ceph_manager import CephManager
from teuthology import misc as teuthology
from teuthology import contextutil
from teuthology.orchestra import run
from teuthology.orchestra.daemon import DaemonGroup
from teuthology.config import config as teuth_config

# these items we use from ceph.py should probably eventually move elsewhere
from tasks.ceph import get_mons, healthy
from tasks.vip import subst_vip
CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw', 'prometheus']

log = logging.getLogger(__name__)
def _shell(ctx, cluster_name, remote, args, extra_cephadm_args=[], **kwargs):
    teuthology.get_testdir(ctx)
    return remote.run(
        args=[
            'sudo',
            ctx.cephadm,
            '--image', ctx.ceph[cluster_name].image,
            'shell',
            '-c', '/etc/ceph/{}.conf'.format(cluster_name),
            '-k', '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
            '--fsid', ctx.ceph[cluster_name].fsid,
        ] + extra_cephadm_args + [
            '--',
        ] + args,
        **kwargs
    )
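# Illustrative usage only (not part of the original module): _shell() wraps
# "sudo cephadm shell" so callers can run ceph commands inside the container
# with the cluster's conf and keyring, e.g.
#
#   r = _shell(ctx, 'ceph', ctx.ceph['ceph'].bootstrap_remote,
#              ['ceph', 'orch', 'host', 'ls', '--format=json'],
#              stdout=StringIO())
#   hosts = json.loads(r.stdout.getvalue())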
def build_initial_config(ctx, config):
    cluster_name = config['cluster']

    path = os.path.join(os.path.dirname(__file__), 'cephadm.conf')
    conf = configobj.ConfigObj(path, file_error=True)

    conf.setdefault('global', {})
    conf['global']['fsid'] = ctx.ceph[cluster_name].fsid

    # overrides
    for section, keys in config.get('conf', {}).items():
        for key, value in keys.items():
            log.info(" override: [%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    return conf
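# Illustrative only: build_initial_config() merges 'conf' overrides from the
# task config into the seed cephadm.conf, so a job YAML might carry e.g.
#
#   conf:
#     global:
#       osd pool default size: 2
#
# (the section/option above are examples, not defaults required by this task).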
def update_archive_setting(ctx, key, value):
    """
    Add logs directory to job's info log file
    """
    if ctx.archive is None:
        return
    with open(os.path.join(ctx.archive, 'info.yaml'), 'r+') as info_file:
        info_yaml = yaml.safe_load(info_file)
        info_file.seek(0)
        if 'archive' in info_yaml:
            info_yaml['archive'][key] = value
        else:
            info_yaml['archive'] = {key: value}
        yaml.safe_dump(info_yaml, info_file, default_flow_style=False)
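# Illustrative only: a call such as update_archive_setting(ctx, 'log',
# '/var/log/ceph') leaves an info.yaml entry along the lines of
#
#   archive:
#     log: /var/log/ceph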
@contextlib.contextmanager
def normalize_hostnames(ctx):
    """
    Ensure we have short hostnames throughout, for consistency between
    remote.shortname and socket.gethostname() in cephadm.
    """
    log.info('Normalizing hostnames...')
    ctx.cluster.run(args=[
        'sudo',
        'hostname',
        run.Raw('$(hostname -s)'),
    ])

    try:
        yield
    finally:
        pass
@contextlib.contextmanager
def download_cephadm(ctx, config, ref):
    cluster_name = config['cluster']

    if config.get('cephadm_mode') != 'cephadm-package':
        ref = config.get('cephadm_branch', ref)
        git_url = config.get('cephadm_git_url', teuth_config.get_ceph_git_url())
        log.info('Downloading cephadm (repo %s ref %s)...' % (git_url, ref))
        if ctx.config.get('redhat'):
            log.info("Install cephadm using RPM")
            # cephadm already installed from redhat.install task
            ctx.cluster.run(args=[
                'cp', run.Raw('$(which cephadm)'), ctx.cephadm,
                run.Raw('&&'),
                'ls', '-l', ctx.cephadm,
            ])
        elif git_url.startswith('https://github.com/'):
            # git archive doesn't like https:// URLs, which we use with github.
            rest = git_url.split('https://github.com/', 1)[1]
            rest = re.sub(r'\.git/?$', '', rest).strip()  # no .git suffix
            ctx.cluster.run(args=[
                'curl', '--silent',
                'https://raw.githubusercontent.com/' + rest + '/' + ref + '/src/cephadm/cephadm',
                run.Raw('>'),
                ctx.cephadm,
                run.Raw('&&'),
                'ls', '-l', ctx.cephadm,
            ])
        else:
            ctx.cluster.run(args=[
                'git', 'archive',
                '--remote=' + git_url,
                ref,
                'src/cephadm/cephadm',
                run.Raw('|'),
                'tar', '-xO', 'src/cephadm/cephadm',
                run.Raw('>'),
                ctx.cephadm,
            ])
        # sanity-check the resulting file and set executable bit
        cephadm_file_size = '$(stat -c%s {})'.format(ctx.cephadm)
        ctx.cluster.run(args=[
            'test', '-s', ctx.cephadm,
            run.Raw('&&'),
            'test', run.Raw(cephadm_file_size), "-gt", run.Raw('1000'),
            run.Raw('&&'),
            'chmod', '+x', ctx.cephadm,
        ])

    try:
        yield
    finally:
        log.info('Removing cluster...')
        ctx.cluster.run(args=[
            'sudo',
            ctx.cephadm,
            'rm-cluster',
            '--fsid', ctx.ceph[cluster_name].fsid,
            '--force',
        ])

        if config.get('cephadm_mode') == 'root':
            log.info('Removing cephadm ...')
            ctx.cluster.run(args=[
                'rm', '-rf', ctx.cephadm,
            ])
@contextlib.contextmanager
def ceph_log(ctx, config):
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    update_archive_setting(ctx, 'log', '/var/log/ceph')

    try:
        yield

    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise

    finally:
        log.info('Checking cluster log for badness...')

        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurrence of the pattern specified in the Ceph log,
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = [
                'sudo', 'egrep', pattern,
                '/var/log/ceph/{fsid}/ceph.log'.format(
                    fsid=fsid),
            ]
            if excludes:
                for exclude in excludes:
                    args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                run.Raw('|'), 'head', '-n', '1',
            ])
            r = ctx.ceph[cluster_name].bootstrap_remote.run(
                stdout=StringIO(),
                args=args,
            )
            stdout = r.stdout.getvalue()
            if stdout:
                return stdout
            return None

        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
                             config.get('log-ignorelist')) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log-ignorelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
                            )
                        break

        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo', 'find',
                        '/var/log/ceph',            # all logs, not just for the cluster
                        '/var/log/rbd-target-api',  # ceph-iscsi
                        '-name', '*.log', '-print0',
                        run.Raw('|'),
                        'sudo', 'xargs', '-0', '--no-run-if-empty', '--', 'gzip', '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            try:
                os.makedirs(path)
            except OSError:
                pass
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.name)
                try:
                    os.makedirs(sub)
                except OSError:
                    pass
                try:
                    teuthology.pull_directory(remote, '/var/log/ceph',  # everything
                                              os.path.join(sub, 'log'))
                except ReadError:
                    pass
@contextlib.contextmanager
def ceph_crash(ctx, config):
    """
    Gather crash dumps from /var/lib/ceph/$fsid/crash
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    update_archive_setting(ctx, 'crash', '/var/lib/ceph/crash')

    try:
        yield

    finally:
        if ctx.archive is not None:
            log.info('Archiving crash dumps...')
            path = os.path.join(ctx.archive, 'remote')
            try:
                os.makedirs(path)
            except OSError:
                pass
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.name)
                try:
                    os.makedirs(sub)
                except OSError:
                    pass
                try:
                    teuthology.pull_directory(remote,
                                              '/var/lib/ceph/%s/crash' % fsid,
                                              os.path.join(sub, 'crash'))
                except ReadError:
                    pass
@contextlib.contextmanager
def ceph_bootstrap(ctx, config):
    """
    Bootstrap ceph cluster.

    :param ctx: the argparse.Namespace object
    :param config: the config dict
    """
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)
    fsid = ctx.ceph[cluster_name].fsid

    bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
    first_mon = ctx.ceph[cluster_name].first_mon
    first_mon_role = ctx.ceph[cluster_name].first_mon_role
    mons = ctx.ceph[cluster_name].mons

    ctx.cluster.run(args=[
        'sudo', 'mkdir', '-p', '/etc/ceph',
    ])
    ctx.cluster.run(args=[
        'sudo', 'chmod', '777', '/etc/ceph',
    ])
    try:
        # write seed config
        log.info('Writing seed config...')
        conf_fp = BytesIO()
        seed_config = build_initial_config(ctx, config)
        seed_config.write(conf_fp)
        bootstrap_remote.write_file(
            path='{}/seed.{}.conf'.format(testdir, cluster_name),
            data=conf_fp.getvalue())
        log.debug('Final config:\n' + conf_fp.getvalue().decode())
        ctx.ceph[cluster_name].conf = seed_config

        # register initial daemons
        ctx.daemons.register_daemon(
            bootstrap_remote, 'mon', first_mon,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild('mon.' + first_mon),
            wait=False,
            started=True,
        )
        if not ctx.ceph[cluster_name].roleless:
            first_mgr = ctx.ceph[cluster_name].first_mgr
            ctx.daemons.register_daemon(
                bootstrap_remote, 'mgr', first_mgr,
                cluster=cluster_name,
                fsid=fsid,
                logger=log.getChild('mgr.' + first_mgr),
                wait=False,
                started=True,
            )

        # bootstrap
        log.info('Bootstrapping...')
        cmd = [
            'sudo',
            ctx.cephadm,
            '--image', ctx.ceph[cluster_name].image,
            '-v',
            'bootstrap',
            '--fsid', fsid,
            '--config', '{}/seed.{}.conf'.format(testdir, cluster_name),
            '--output-config', '/etc/ceph/{}.conf'.format(cluster_name),
            '--output-keyring',
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
            '--output-pub-ssh-key', '{}/{}.pub'.format(testdir, cluster_name),
        ]

        if config.get('registry-login'):
            registry = config['registry-login']
            cmd += [
                "--registry-url", registry['url'],
                "--registry-username", registry['username'],
                "--registry-password", registry['password'],
            ]

        if not ctx.ceph[cluster_name].roleless:
            cmd += [
                '--mon-id', first_mon,
                '--mgr-id', first_mgr,
                '--orphan-initial-daemons',   # we will do it explicitly!
                '--skip-monitoring-stack',    # we'll provision these explicitly
            ]

        if mons[first_mon_role].startswith('['):
            cmd += ['--mon-addrv', mons[first_mon_role]]
        else:
            cmd += ['--mon-ip', mons[first_mon_role]]
        if config.get('skip_dashboard'):
            cmd += ['--skip-dashboard']
        if config.get('skip_monitoring_stack'):
            cmd += ['--skip-monitoring-stack']
        if config.get('single_host_defaults'):
            cmd += ['--single-host-defaults']
        if not config.get('avoid_pacific_features', False):
            cmd += ['--skip-admin-label']
        # bootstrap makes the keyring root 0600, so +r it for our purposes
        cmd += [
            run.Raw('&&'),
            'sudo', 'chmod', '+r',
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
        ]
        bootstrap_remote.run(args=cmd)

        # fetch keys and configs
        log.info('Fetching config...')
        ctx.ceph[cluster_name].config_file = \
            bootstrap_remote.read_file(f'/etc/ceph/{cluster_name}.conf')
        log.info('Fetching client.admin keyring...')
        ctx.ceph[cluster_name].admin_keyring = \
            bootstrap_remote.read_file(f'/etc/ceph/{cluster_name}.client.admin.keyring')
        log.info('Fetching mon keyring...')
        ctx.ceph[cluster_name].mon_keyring = \
            bootstrap_remote.read_file(f'/var/lib/ceph/{fsid}/mon.{first_mon}/keyring', sudo=True)

        # fetch ssh key, distribute to additional nodes
        log.info('Fetching pub ssh key...')
        ssh_pub_key = bootstrap_remote.read_file(
            f'{testdir}/{cluster_name}.pub').decode('ascii').strip()

        log.info('Installing pub ssh key for root users...')
        ctx.cluster.run(args=[
            'sudo', 'install', '-d', '-m', '0700', '/root/.ssh',
            run.Raw('&&'),
            'echo', ssh_pub_key,
            run.Raw('|'),
            'sudo', 'tee', '-a', '/root/.ssh/authorized_keys',
            run.Raw('&&'),
            'sudo', 'chmod', '0600', '/root/.ssh/authorized_keys',
        ])

        # set options
        if config.get('allow_ptrace', True):
            _shell(ctx, cluster_name, bootstrap_remote,
                   ['ceph', 'config', 'set', 'mgr', 'mgr/cephadm/allow_ptrace', 'true'])

        if not config.get('avoid_pacific_features', False):
            log.info('Distributing conf and client.admin keyring to all hosts + 0755')
            _shell(ctx, cluster_name, bootstrap_remote,
                   ['ceph', 'orch', 'client-keyring', 'set', 'client.admin',
                    '*', '--mode', '0755'],
                   check_status=False)

        # add other hosts
        for remote in ctx.cluster.remotes.keys():
            if remote == bootstrap_remote:
                continue

            # note: this may be redundant (see above), but it avoids
            # us having to wait for cephadm to do it.
            log.info('Writing (initial) conf and keyring to %s' % remote.shortname)
            remote.write_file(
                path='/etc/ceph/{}.conf'.format(cluster_name),
                data=ctx.ceph[cluster_name].config_file)
            remote.write_file(
                path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
                data=ctx.ceph[cluster_name].admin_keyring)

            log.info('Adding host %s to orchestrator...' % remote.shortname)
            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'host', 'add',
                remote.shortname,
            ])
            r = _shell(ctx, cluster_name, remote,
                       ['ceph', 'orch', 'host', 'ls', '--format=json'],
                       stdout=StringIO())
            hosts = [node['hostname'] for node in json.loads(r.stdout.getvalue())]
            assert remote.shortname in hosts

        yield

    finally:
        log.info('Cleaning up testdir ceph.* files...')
        ctx.cluster.run(args=[
            'rm', '-f',
            '{}/seed.{}.conf'.format(testdir, cluster_name),
            '{}/{}.pub'.format(testdir, cluster_name),
        ])

        log.info('Stopping all daemons...')

        # this doesn't block until they are all stopped...
        #ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])

        # stop the daemons we know
        for role in ctx.daemons.resolve_role_list(None, CEPH_ROLE_TYPES, True):
            cluster, type_, id_ = teuthology.split_role(role)
            try:
                ctx.daemons.get_daemon(type_, id_, cluster).stop()
            except Exception:
                log.exception(f'Failed to stop "{role}"')
                raise

        # tear down anything left (but leave the logs behind)
        ctx.cluster.run(
            args=[
                'sudo',
                ctx.cephadm,
                'rm-cluster',
                '--fsid', fsid,
                '--force',
            ],
            check_status=False,  # may fail if upgrading from old cephadm
        )

        # clean up /etc/ceph
        ctx.cluster.run(args=[
            'sudo', 'rm', '-f',
            '/etc/ceph/{}.conf'.format(cluster_name),
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
        ])
@contextlib.contextmanager
def ceph_mons(ctx, config):
    """
    Deploy any additional mons
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        daemons = {}
        if config.get('add_mons_via_daemon_add'):
            # This is the old way of adding mons that works with the (early) octopus
            # cephadm scheduler.
            num_mons = 1
            for remote, roles in ctx.cluster.remotes.items():
                for mon in [r for r in roles
                            if teuthology.is_type('mon', cluster_name)(r)]:
                    c_, _, id_ = teuthology.split_role(mon)
                    if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
                        continue
                    log.info('Adding %s on %s' % (mon, remote.shortname))
                    num_mons += 1
                    _shell(ctx, cluster_name, remote, [
                        'ceph', 'orch', 'daemon', 'add', 'mon',
                        remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_,
                    ])
                    ctx.daemons.register_daemon(
                        remote, 'mon', id_,
                        cluster=cluster_name,
                        fsid=fsid,
                        logger=log.getChild(mon),
                        wait=False,
                        started=True,
                    )
                    daemons[mon] = (remote, id_)

                    with contextutil.safe_while(sleep=1, tries=180) as proceed:
                        while proceed():
                            log.info('Waiting for %d mons in monmap...' % (num_mons))
                            r = _shell(
                                ctx=ctx,
                                cluster_name=cluster_name,
                                remote=remote,
                                args=[
                                    'ceph', 'mon', 'dump', '-f', 'json',
                                ],
                                stdout=StringIO(),
                            )
                            j = json.loads(r.stdout.getvalue())
                            if len(j['mons']) == num_mons:
                                break
        else:
            nodes = []
            for remote, roles in ctx.cluster.remotes.items():
                for mon in [r for r in roles
                            if teuthology.is_type('mon', cluster_name)(r)]:
                    c_, _, id_ = teuthology.split_role(mon)
                    log.info('Adding %s on %s' % (mon, remote.shortname))
                    nodes.append(remote.shortname
                                 + ':' + ctx.ceph[cluster_name].mons[mon]
                                 + '=' + id_)
                    if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
                        continue
                    daemons[mon] = (remote, id_)

            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'apply', 'mon',
                str(len(nodes)) + ';' + ';'.join(nodes)]
            )
            for mon, i in daemons.items():
                remote, id_ = i
                ctx.daemons.register_daemon(
                    remote, 'mon', id_,
                    cluster=cluster_name,
                    fsid=fsid,
                    logger=log.getChild(mon),
                    wait=False,
                    started=True,
                )

            with contextutil.safe_while(sleep=1, tries=180) as proceed:
                while proceed():
                    log.info('Waiting for %d mons in monmap...' % (len(nodes)))
                    r = _shell(
                        ctx=ctx,
                        cluster_name=cluster_name,
                        remote=remote,
                        args=[
                            'ceph', 'mon', 'dump', '-f', 'json',
                        ],
                        stdout=StringIO(),
                    )
                    j = json.loads(r.stdout.getvalue())
                    if len(j['mons']) == len(nodes):
                        break

        # refresh our (final) ceph.conf file
        bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
        log.info('Generating final ceph.conf file...')
        r = _shell(
            ctx=ctx,
            cluster_name=cluster_name,
            remote=bootstrap_remote,
            args=[
                'ceph', 'config', 'generate-minimal-conf',
            ],
            stdout=StringIO(),
        )
        ctx.ceph[cluster_name].config_file = r.stdout.getvalue()

        yield

    finally:
        pass
@contextlib.contextmanager
def ceph_mgrs(ctx, config):
    """
    Deploy any additional mgrs
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        nodes = []
        daemons = {}
        for remote, roles in ctx.cluster.remotes.items():
            for mgr in [r for r in roles
                        if teuthology.is_type('mgr', cluster_name)(r)]:
                c_, _, id_ = teuthology.split_role(mgr)
                log.info('Adding %s on %s' % (mgr, remote.shortname))
                nodes.append(remote.shortname + '=' + id_)
                if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mgr:
                    continue
                daemons[mgr] = (remote, id_)
        if nodes:
            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'apply', 'mgr',
                str(len(nodes)) + ';' + ';'.join(nodes)]
            )
        for mgr, i in daemons.items():
            remote, id_ = i
            ctx.daemons.register_daemon(
                remote, 'mgr', id_,
                cluster=cluster_name,
                fsid=fsid,
                logger=log.getChild(mgr),
                wait=False,
                started=True,
            )

        yield

    finally:
        pass
@contextlib.contextmanager
def ceph_osds(ctx, config):
    """
    Deploy OSDs
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        log.info('Deploying OSDs...')

        # provision OSDs in numeric order
        id_to_remote = {}
        devs_by_remote = {}
        for remote, roles in ctx.cluster.remotes.items():
            devs_by_remote[remote] = teuthology.get_scratch_devices(remote)
            for osd in [r for r in roles
                        if teuthology.is_type('osd', cluster_name)(r)]:
                _, _, id_ = teuthology.split_role(osd)
                id_to_remote[int(id_)] = (osd, remote)

        cur = 0
        for osd_id in sorted(id_to_remote.keys()):
            osd, remote = id_to_remote[osd_id]
            _, _, id_ = teuthology.split_role(osd)
            assert int(id_) == cur
            devs = devs_by_remote[remote]
            assert devs   ## FIXME ##
            dev = devs.pop()
            if all(_ in dev for _ in ('lv', 'vg')):
                short_dev = dev.replace('/dev/', '')
            else:
                short_dev = dev
            log.info('Deploying %s on %s with %s...' % (
                osd, remote.shortname, dev))
            _shell(ctx, cluster_name, remote, [
                'ceph-volume', 'lvm', 'zap', dev])
            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'daemon', 'add', 'osd',
                remote.shortname + ':' + short_dev
            ])
            ctx.daemons.register_daemon(
                remote, 'osd', id_,
                cluster=cluster_name,
                fsid=fsid,
                logger=log.getChild(osd),
                wait=False,
                started=True,
            )
            cur += 1

        yield

    finally:
        pass
@contextlib.contextmanager
def ceph_mdss(ctx, config):
    """
    Deploy MDSs
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = []
    daemons = {}
    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type('mds', cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
            nodes.append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)
    if nodes:
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', 'mds',
            'all',
            str(len(nodes)) + ';' + ';'.join(nodes)]
        )
    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, 'mds', id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield
@contextlib.contextmanager
def ceph_monitoring(daemon_type, ctx, config):
    """
    Deploy prometheus, node-exporter, etc.
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = []
    daemons = {}
    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type(daemon_type, cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
            nodes.append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)
    if nodes:
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', daemon_type,
            str(len(nodes)) + ';' + ';'.join(nodes)]
        )
    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, daemon_type, id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield
@contextlib.contextmanager
def ceph_rgw(ctx, config):
    """
    Deploy rgw
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = {}
    daemons = {}
    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type('rgw', cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
            svc = '.'.join(id_.split('.')[0:2])
            if svc not in nodes:
                nodes[svc] = []
            nodes[svc].append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)

    for svc, nodes in nodes.items():
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', 'rgw', svc,
            '--placement',
            str(len(nodes)) + ';' + ';'.join(nodes)]
        )
    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, 'rgw', id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield
@contextlib.contextmanager
def ceph_iscsi(ctx, config):
    """
    Deploy iSCSI gateways
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = []
    daemons = {}
    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type('iscsi', cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
            nodes.append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)

    if nodes:
        poolname = 'iscsi'
        # ceph osd pool create iscsi 3 3 replicated
        _shell(ctx, cluster_name, remote, [
            'ceph', 'osd', 'pool', 'create',
            poolname, '3', '3', 'replicated']
        )

        _shell(ctx, cluster_name, remote, [
            'ceph', 'osd', 'pool', 'application', 'enable',
            poolname, 'rbd']
        )

        # ceph orch apply iscsi iscsi user password
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', 'iscsi',
            poolname, 'user', 'password',
            '--placement', str(len(nodes)) + ';' + ';'.join(nodes)]
        )
    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, 'iscsi', id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield
@contextlib.contextmanager
def ceph_clients(ctx, config):
    cluster_name = config['cluster']

    log.info('Setting up client nodes...')
    clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
    for remote, roles_for_host in clients.remotes.items():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
                                                     cluster_name):
            name = teuthology.ceph_role(role)
            client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name,
                                                                name)
            r = _shell(
                ctx=ctx,
                cluster_name=cluster_name,
                remote=remote,
                args=[
                    'ceph', 'auth',
                    'get-or-create', name,
                    'mon', 'allow *',
                    'osd', 'allow *',
                    'mds', 'allow *',
                    'mgr', 'allow *',
                ],
                stdout=StringIO(),
            )
            keyring = r.stdout.getvalue()
            remote.sudo_write_file(client_keyring, keyring, mode='0644')

    yield
@contextlib.contextmanager
def ceph_initial():
    try:
        yield
    finally:
        log.info('Teardown complete')
@contextlib.contextmanager
def stop(ctx, config):
    """
    Stop ceph daemons

    For example::
      tasks:
      - ceph.stop: [mds.*]

      tasks:
      - ceph.stop: [osd.0, osd.2]

      tasks:
      - ceph.stop:
          daemons: [osd.0, osd.2]

    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(
        config.get('daemons', None), CEPH_ROLE_TYPES, True)
    clusters = set()

    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).stop()
        clusters.add(cluster)

    # for cluster in clusters:
    #     ctx.ceph[cluster].watchdog.stop()
    #     ctx.ceph[cluster].watchdog.join()

    yield
def shell(ctx, config):
    """
    Execute (shell) commands
    """
    cluster_name = config.get('cluster', 'ceph')

    args = []
    for k in config.pop('env', []):
        args.extend(['-e', k + '=' + ctx.config.get(k, '')])
    for k in config.pop('volumes', []):
        args.extend(['-v', k])

    if 'all-roles' in config and len(config) == 1:
        a = config['all-roles']
        roles = teuthology.all_roles(ctx.cluster)
        config = dict((id_, a) for id_ in roles if not id_.startswith('host.'))
    elif 'all-hosts' in config and len(config) == 1:
        a = config['all-hosts']
        roles = teuthology.all_roles(ctx.cluster)
        config = dict((id_, a) for id_ in roles if id_.startswith('host.'))

    for role, cmd in config.items():
        (remote,) = ctx.cluster.only(role).remotes.keys()
        log.info('Running commands on role %s host %s', role, remote.name)
        if isinstance(cmd, list):
            for c in cmd:
                _shell(ctx, cluster_name, remote,
                       ['bash', '-c', subst_vip(ctx, c)],
                       extra_cephadm_args=args)
        else:
            assert isinstance(cmd, str)
            _shell(ctx, cluster_name, remote,
                   ['bash', '-ex', '-c', subst_vip(ctx, cmd)],
                   extra_cephadm_args=args)
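# Illustrative only: a typical use of this task in a job YAML, based on the
# config keys handled above (env/volumes plus per-role command lists); the
# role and commands below are examples, not values required by the task:
#
#   tasks:
#   - cephadm.shell:
#       env: [SOME_VAR]
#       mon.a:
#       - ceph osd pool ls
#       - ceph -s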
def apply(ctx, config):
    """
    Apply spec

      tasks:
      - cephadm.apply:
          specs:
          - service_type: rgw
            service_id: foo
            spec:
              rgw_frontend_port: 8000
          - service_type: rgw
            service_id: bar
            spec:
              rgw_frontend_port: 9000
    """
    cluster_name = config.get('cluster', 'ceph')

    specs = config.get('specs', [])
    y = subst_vip(ctx, yaml.dump_all(specs))

    log.info(f'Applying spec(s):\n{y}')
    _shell(
        ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
        ['ceph', 'orch', 'apply', '-i', '-'],
        stdin=y,
    )
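# Illustrative only: the call above is equivalent to running, inside the
# cephadm shell on the bootstrap host,
#
#   ceph orch apply -i - <<EOF
#   <output of yaml.dump_all(specs)>
#   EOF
#
# so a single task invocation can apply several service specs at once.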
def wait_for_service(ctx, config):
    """
    Wait for a service to be fully started

      tasks:
      - cephadm.wait_for_service:
          service: rgw.foo
          timeout: 60    # defaults to 300
    """
    cluster_name = config.get('cluster', 'ceph')
    timeout = config.get('timeout', 300)
    service = config.get('service')
    assert service

    log.info(
        f'Waiting for {cluster_name} service {service} to start (timeout {timeout})...'
    )
    with contextutil.safe_while(sleep=1, tries=timeout) as proceed:
        while proceed():
            r = _shell(
                ctx=ctx,
                cluster_name=cluster_name,
                remote=ctx.ceph[cluster_name].bootstrap_remote,
                args=[
                    'ceph', 'orch', 'ls', '-f', 'json',
                ],
                stdout=StringIO(),
            )
            j = json.loads(r.stdout.getvalue())
            svc = None
            for s in j:
                if s['service_name'] == service:
                    svc = s
                    break
            if svc:
                log.info(
                    f"{service} has {s['status']['running']}/{s['status']['size']}"
                )
                if s['status']['running'] == s['status']['size']:
                    break
@contextlib.contextmanager
def tweaked_option(ctx, config):
    """
    set an option, and then restore it with its original value

    Note, due to the way how tasks are executed/nested, it's not suggested to
    use this method as a standalone task. otherwise, it's likely that it will
    restore the tweaked option at the /end/ of 'tasks' block.
    """
    saved_options = {}
    # we can complicate this when necessary
    options = ['mon-health-to-clog']
    type_, id_ = 'mon', '*'
    cluster = config.get('cluster', 'ceph')
    manager = ctx.managers[cluster]
    get_from = next(teuthology.all_roles_of_type(ctx.cluster, type_))

    for option in options:
        if option not in config:
            continue
        value = 'true' if config[option] else 'false'
        option = option.replace('-', '_')
        old_value = manager.get_config(type_, get_from, option)
        if value != old_value:
            saved_options[option] = old_value
            manager.inject_args(type_, id_, option, value)

    yield

    for option, value in saved_options.items():
        manager.inject_args(type_, id_, option, value)
@contextlib.contextmanager
def restart(ctx, config):
    """
    restart ceph daemons

    For example::
      tasks:
      - ceph.restart: [all]

    For example::
      tasks:
      - ceph.restart: [osd.0, mon.1, mds.*]

    or::

      tasks:
      - ceph.restart:
          daemons: [osd.0, mon.1]
          wait-for-healthy: false
          wait-for-osds-up: true

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(
        config.get('daemons', None), CEPH_ROLE_TYPES, True)
    clusters = set()

    log.info('daemons %s' % daemons)
    with tweaked_option(ctx, config):
        for role in daemons:
            cluster, type_, id_ = teuthology.split_role(role)
            d = ctx.daemons.get_daemon(type_, id_, cluster)
            assert d, 'daemon %s does not exist' % role
            d.stop()
            if type_ == 'osd':
                ctx.managers[cluster].mark_down_osd(id_)
            d.restart()
            clusters.add(cluster)

    if config.get('wait-for-healthy', True):
        for cluster in clusters:
            healthy(ctx=ctx, config=dict(cluster=cluster))
    if config.get('wait-for-osds-up', False):
        for cluster in clusters:
            ctx.managers[cluster].wait_for_all_osds_up()

    yield
@contextlib.contextmanager
def distribute_config_and_admin_keyring(ctx, config):
    """
    Distribute a sufficient config and keyring for clients
    """
    cluster_name = config['cluster']
    log.info('Distributing (final) config and client.admin keyring...')
    for remote, roles in ctx.cluster.remotes.items():
        remote.write_file(
            '/etc/ceph/{}.conf'.format(cluster_name),
            ctx.ceph[cluster_name].config_file,
            sudo=True)
        remote.write_file(
            path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
            data=ctx.ceph[cluster_name].admin_keyring,
            sudo=True)
    try:
        yield
    finally:
        ctx.cluster.run(args=[
            'sudo', 'rm', '-f',
            '/etc/ceph/{}.conf'.format(cluster_name),
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
        ])
@contextlib.contextmanager
def crush_setup(ctx, config):
    cluster_name = config['cluster']

    profile = config.get('crush_tunables', 'default')
    log.info('Setting crush tunables to %s', profile)
    _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
           args=['ceph', 'osd', 'crush', 'tunables', profile])
    yield
@contextlib.contextmanager
def create_rbd_pool(ctx, config):
    if config.get('create_rbd_pool', False):
        cluster_name = config['cluster']
        log.info('Waiting for OSDs to come up')
        teuthology.wait_until_osds_up(
            ctx=ctx,
            cluster=ctx.cluster,
            remote=ctx.ceph[cluster_name].bootstrap_remote,
            ceph_cluster=cluster_name,
        )
        log.info('Creating RBD pool')
        _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
               args=['sudo', 'ceph', '--cluster', cluster_name,
                     'osd', 'pool', 'create', 'rbd', '8'])
        _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
               args=['sudo', 'ceph', '--cluster', cluster_name,
                     'osd', 'pool', 'application', 'enable',
                     'rbd', 'rbd', '--yes-i-really-mean-it'
                     ])
    yield
@contextlib.contextmanager
def _bypass():
    yield
@contextlib.contextmanager
def initialize_config(ctx, config):
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)

    ctx.ceph[cluster_name].thrashers = []
    # fixme: setup watchdog, ala ceph.py

    ctx.ceph[cluster_name].roleless = False  # see below

    first_ceph_cluster = False
    if not hasattr(ctx, 'daemons'):
        first_ceph_cluster = True

    # cephadm mode?
    if 'cephadm_mode' not in config:
        config['cephadm_mode'] = 'root'
    assert config['cephadm_mode'] in ['root', 'cephadm-package']
    if config['cephadm_mode'] == 'root':
        ctx.cephadm = testdir + '/cephadm'
    else:
        ctx.cephadm = 'cephadm'  # in the path

    if first_ceph_cluster:
        # FIXME: this is global for all clusters
        ctx.daemons = DaemonGroup(
            use_cephadm=ctx.cephadm)

    # uuid
    fsid = str(uuid.uuid1())
    log.info('Cluster fsid is %s' % fsid)
    ctx.ceph[cluster_name].fsid = fsid

    # mon ips
    log.info('Choosing monitor IPs and ports...')
    remotes_and_roles = ctx.cluster.remotes.items()
    ips = [host for (host, port) in
           (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]

    if config.get('roleless', False):
        # mons will be named after hosts
        first_mon = None
        for remote, _ in remotes_and_roles:
            ctx.cluster.remotes[remote].append('mon.' + remote.shortname)
            if not first_mon:
                first_mon = remote.shortname
                bootstrap_remote = remote
        log.info('No mon roles; fabricating mons')

    roles = [role_list for (remote, role_list) in ctx.cluster.remotes.items()]

    ctx.ceph[cluster_name].mons = get_mons(
        roles, ips, cluster_name,
        mon_bind_msgr2=config.get('mon_bind_msgr2', True),
        mon_bind_addrvec=config.get('mon_bind_addrvec', True),
    )
    log.info('Monitor IPs: %s' % ctx.ceph[cluster_name].mons)

    if config.get('roleless', False):
        ctx.ceph[cluster_name].roleless = True
        ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
        ctx.ceph[cluster_name].first_mon = first_mon
        ctx.ceph[cluster_name].first_mon_role = 'mon.' + first_mon
    else:
        first_mon_role = sorted(ctx.ceph[cluster_name].mons.keys())[0]
        _, _, first_mon = teuthology.split_role(first_mon_role)
        (bootstrap_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
        log.info('First mon is mon.%s on %s' % (first_mon,
                                                bootstrap_remote.shortname))
        ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
        ctx.ceph[cluster_name].first_mon = first_mon
        ctx.ceph[cluster_name].first_mon_role = first_mon_role

        others = ctx.cluster.remotes[bootstrap_remote]
        mgrs = sorted([r for r in others
                       if teuthology.is_type('mgr', cluster_name)(r)])
        if not mgrs:
            raise RuntimeError('no mgrs on the same host as first mon %s' % first_mon)
        _, _, first_mgr = teuthology.split_role(mgrs[0])
        log.info('First mgr is %s' % (first_mgr))
        ctx.ceph[cluster_name].first_mgr = first_mgr

    yield
@contextlib.contextmanager
def task(ctx, config):
    """
    Deploy ceph cluster using cephadm

    For example, teuthology.yaml can contain the 'defaults' section:

        defaults:
          cephadm:
            containers:
              image: 'quay.io/ceph-ci/ceph'

    Using overrides makes it possible to customize it per run.
    The equivalent 'overrides' section looks like:

        overrides:
          cephadm:
            containers:
              image: 'quay.io/ceph-ci/ceph'
            registry-login:
              url: registry-url
              username: registry-user
              password: registry-password

    :param ctx: the argparse.Namespace object
    :param config: the config dict
    """
    if config is None:
        config = {}

    assert isinstance(config, dict), \
        "task only supports a dictionary for configuration"

    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph', {}))
    teuthology.deep_merge(config, overrides.get('cephadm', {}))
    log.info('Config: ' + str(config))

    # set up cluster context
    if not hasattr(ctx, 'ceph'):
        ctx.ceph = {}
    if 'cluster' not in config:
        config['cluster'] = 'ceph'
    cluster_name = config['cluster']
    if cluster_name not in ctx.ceph:
        ctx.ceph[cluster_name] = argparse.Namespace()
        ctx.ceph[cluster_name].bootstrapped = False

    # image
    teuth_defaults = teuth_config.get('defaults', {})
    cephadm_defaults = teuth_defaults.get('cephadm', {})
    containers_defaults = cephadm_defaults.get('containers', {})
    container_image_name = containers_defaults.get('image', None)

    containers = config.get('containers', {})
    container_image_name = containers.get('image', container_image_name)

    if not hasattr(ctx.ceph[cluster_name], 'image'):
        ctx.ceph[cluster_name].image = config.get('image')
    ref = None
    if not ctx.ceph[cluster_name].image:
        if not container_image_name:
            raise Exception("Configuration error occurred. "
                            "The 'image' value is undefined for 'cephadm' task. "
                            "Please provide corresponding options in the task's "
                            "config, task 'overrides', or teuthology 'defaults' "
                            "section.")
        sha1 = config.get('sha1')
        flavor = config.get('flavor', 'default')

        if sha1:
            if flavor == "crimson":
                ctx.ceph[cluster_name].image = container_image_name + ':' + sha1 + '-' + flavor
            else:
                ctx.ceph[cluster_name].image = container_image_name + ':' + sha1
            ref = sha1
        else:
            # hmm, fall back to branch?
            branch = config.get('branch', 'master')
            ref = branch
            ctx.ceph[cluster_name].image = container_image_name + ':' + branch
    log.info('Cluster image is %s' % ctx.ceph[cluster_name].image)


    with contextutil.nested(
            #if the cluster is already bootstrapped bypass corresponding methods
            lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
                              else initialize_config(ctx=ctx, config=config),
            lambda: ceph_initial(),
            lambda: normalize_hostnames(ctx=ctx),
            lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
                              else download_cephadm(ctx=ctx, config=config, ref=ref),
            lambda: ceph_log(ctx=ctx, config=config),
            lambda: ceph_crash(ctx=ctx, config=config),
            lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
                              else ceph_bootstrap(ctx, config),
            lambda: crush_setup(ctx=ctx, config=config),
            lambda: ceph_mons(ctx=ctx, config=config),
            lambda: distribute_config_and_admin_keyring(ctx=ctx, config=config),
            lambda: ceph_mgrs(ctx=ctx, config=config),
            lambda: ceph_osds(ctx=ctx, config=config),
            lambda: ceph_mdss(ctx=ctx, config=config),
            lambda: ceph_rgw(ctx=ctx, config=config),
            lambda: ceph_iscsi(ctx=ctx, config=config),
            lambda: ceph_monitoring('prometheus', ctx=ctx, config=config),
            lambda: ceph_monitoring('node-exporter', ctx=ctx, config=config),
            lambda: ceph_monitoring('alertmanager', ctx=ctx, config=config),
            lambda: ceph_monitoring('grafana', ctx=ctx, config=config),
            lambda: ceph_clients(ctx=ctx, config=config),
            lambda: create_rbd_pool(ctx=ctx, config=config),
    ):
        if not hasattr(ctx, 'managers'):
            ctx.managers = {}
        ctx.managers[cluster_name] = CephManager(
            ctx.ceph[cluster_name].bootstrap_remote,
            ctx=ctx,
            logger=log.getChild('ceph_manager.' + cluster_name),
            cluster=cluster_name,
            cephadm=True,
        )

        try:
            if config.get('wait-for-healthy', True):
                healthy(ctx=ctx, config=config)

            log.info('Setup complete, yielding')
            yield

        finally:
            log.info('Teardown begin')