ceph/qa/tasks/cephadm.py
1 """
2 Ceph cluster task, deployed via cephadm orchestrator
3 """
4 import argparse
5 import configobj
6 import contextlib
7 import logging
8 import os
9 import json
10 import re
11 import uuid
12 import yaml
13
14 from copy import deepcopy
15 from io import BytesIO, StringIO
16 from tarfile import ReadError
17 from tasks.ceph_manager import CephManager
18 from teuthology import misc as teuthology
19 from teuthology import contextutil
20 from teuthology import packaging
21 from teuthology.orchestra import run
22 from teuthology.orchestra.daemon import DaemonGroup
23 from teuthology.config import config as teuth_config
24 from textwrap import dedent
25 from tasks.cephfs.filesystem import MDSCluster, Filesystem
26 from tasks.util import chacra
27
28 # these items we use from ceph.py should probably eventually move elsewhere
29 from tasks.ceph import get_mons, healthy
30 from tasks.vip import subst_vip
31
32 CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw', 'prometheus']
33
34 log = logging.getLogger(__name__)
35
36
37 def _shell(ctx, cluster_name, remote, args, extra_cephadm_args=[], **kwargs):
38 teuthology.get_testdir(ctx)
39 return remote.run(
40 args=[
41 'sudo',
42 ctx.cephadm,
43 '--image', ctx.ceph[cluster_name].image,
44 'shell',
45 '-c', '/etc/ceph/{}.conf'.format(cluster_name),
46 '-k', '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
47 '--fsid', ctx.ceph[cluster_name].fsid,
48 ] + extra_cephadm_args + [
49 '--',
50 ] + args,
51 **kwargs
52 )
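# Illustrative note (hypothetical values, not from this file): a call like
#     _shell(ctx, 'ceph', remote, ['ceph', 'orch', 'ps'])
# ends up running roughly:
#     sudo <ctx.cephadm> --image <image> shell \
#         -c /etc/ceph/ceph.conf -k /etc/ceph/ceph.client.admin.keyring \
#         --fsid <fsid> -- ceph orch ps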
53
54
55 def build_initial_config(ctx, config):
56 cluster_name = config['cluster']
57
58 path = os.path.join(os.path.dirname(__file__), 'cephadm.conf')
59 conf = configobj.ConfigObj(path, file_error=True)
60
61 conf.setdefault('global', {})
62 conf['global']['fsid'] = ctx.ceph[cluster_name].fsid
63
64 # overrides
65 for section, keys in config.get('conf',{}).items():
66 for key, value in keys.items():
67 log.info(" override: [%s] %s = %s" % (section, key, value))
68 if section not in conf:
69 conf[section] = {}
70 conf[section][key] = value
71
72 return conf
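# For illustration (the override key below is an assumption, not from this file),
# a task config such as:
#     conf:
#       global:
#         osd_pool_default_size: 1
# is merged on top of cephadm.conf and ends up as
# "[global] osd_pool_default_size = 1" in the seed config written at bootstrap.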
73
74
75 def distribute_iscsi_gateway_cfg(ctx, conf_data):
76 """
77 Distribute a common gateway config containing the gateway IPs.
78 This helps iSCSI clients find the trusted_ip_list.
79 """
80 log.info('Distributing iscsi-gateway.cfg...')
81 for remote, roles in ctx.cluster.remotes.items():
82 remote.write_file(
83 path='/etc/ceph/iscsi-gateway.cfg',
84 data=conf_data,
85 sudo=True)
86
87 def update_archive_setting(ctx, key, value):
88 """
89 Record an archive setting (e.g. the log or crash directory) in the job's info.yaml file
90 """
91 if ctx.archive is None:
92 return
93 with open(os.path.join(ctx.archive, 'info.yaml'), 'r+') as info_file:
94 info_yaml = yaml.safe_load(info_file)
95 info_file.seek(0)
96 if 'archive' in info_yaml:
97 info_yaml['archive'][key] = value
98 else:
99 info_yaml['archive'] = {key: value}
100 yaml.safe_dump(info_yaml, info_file, default_flow_style=False)
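# For example, update_archive_setting(ctx, 'log', '/var/log/ceph') leaves an
# entry roughly like the following in the job's info.yaml (illustrative):
#     archive:
#       log: /var/log/ceph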
101
102
103 @contextlib.contextmanager
104 def normalize_hostnames(ctx):
105 """
106 Ensure we have short hostnames throughout, for consistency between
107 remote.shortname and socket.gethostname() in cephadm.
108 """
109 log.info('Normalizing hostnames...')
110 cluster = ctx.cluster.filter(lambda r: '.' in r.hostname)
111 cluster.run(args=[
112 'sudo',
113 'hostname',
114 run.Raw('$(hostname -s)'),
115 ])
116
117 try:
118 yield
119 finally:
120 pass
121
122
123 @contextlib.contextmanager
124 def download_cephadm(ctx, config, ref):
125 cluster_name = config['cluster']
126
127 if config.get('cephadm_mode') != 'cephadm-package':
128 if ctx.config.get('redhat'):
129 _fetch_cephadm_from_rpm(ctx)
130 # TODO: come up with a sensible way to detect if we need an "old, uncompiled"
131 # cephadm
132 elif 'cephadm_git_url' in config and 'cephadm_branch' in config:
133 _fetch_cephadm_from_github(ctx, config, ref)
134 else:
135 _fetch_cephadm_from_chachra(ctx, config, cluster_name)
136
137 try:
138 yield
139 finally:
140 _rm_cluster(ctx, cluster_name)
141 if config.get('cephadm_mode') == 'root':
142 _rm_cephadm(ctx)
143
144
145 def _fetch_cephadm_from_rpm(ctx):
146 log.info("Copying cephadm installed from an RPM package")
147 # cephadm already installed from redhat.install task
148 ctx.cluster.run(
149 args=[
150 'cp',
151 run.Raw('$(which cephadm)'),
152 ctx.cephadm,
153 run.Raw('&&'),
154 'ls', '-l',
155 ctx.cephadm,
156 ]
157 )
158
159
160 def _fetch_cephadm_from_github(ctx, config, ref):
161 ref = config.get('cephadm_branch', ref)
162 git_url = config.get('cephadm_git_url', teuth_config.get_ceph_git_url())
163 log.info('Downloading cephadm (repo %s ref %s)...' % (git_url, ref))
164 if git_url.startswith('https://github.com/'):
165 # git archive doesn't like https:// URLs, which we use with github.
166 rest = git_url.split('https://github.com/', 1)[1]
167 rest = re.sub(r'\.git/?$', '', rest).strip() # no .git suffix
168 ctx.cluster.run(
169 args=[
170 'curl', '--silent',
171 'https://raw.githubusercontent.com/' + rest + '/' + ref + '/src/cephadm/cephadm',
172 run.Raw('>'),
173 ctx.cephadm,
174 run.Raw('&&'),
175 'ls', '-l',
176 ctx.cephadm,
177 ],
178 )
179 else:
180 ctx.cluster.run(
181 args=[
182 'git', 'clone', git_url, 'testrepo',
183 run.Raw('&&'),
184 'cd', 'testrepo',
185 run.Raw('&&'),
186 'git', 'show', f'{ref}:src/cephadm/cephadm',
187 run.Raw('>'),
188 ctx.cephadm,
189 run.Raw('&&'),
190 'ls', '-l', ctx.cephadm,
191 ],
192 )
193 # sanity-check the resulting file and set executable bit
194 cephadm_file_size = '$(stat -c%s {})'.format(ctx.cephadm)
195 ctx.cluster.run(
196 args=[
197 'test', '-s', ctx.cephadm,
198 run.Raw('&&'),
199 'test', run.Raw(cephadm_file_size), "-gt", run.Raw('1000'),
200 run.Raw('&&'),
201 'chmod', '+x', ctx.cephadm,
202 ],
203 )
204
205
206 def _fetch_cephadm_from_chachra(ctx, config, cluster_name):
207 log.info('Downloading "compiled" cephadm from chacra')
208 bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
209 bp = packaging.get_builder_project()(
210 config.get('project', 'ceph'),
211 config,
212 ctx=ctx,
213 remote=bootstrap_remote,
214 )
215 log.info('builder_project result: %s' % (bp._result.json()))
216
217 flavor = config.get('flavor', 'default')
218 branch = config.get('branch')
219 sha1 = config.get('sha1')
220
221 # pull the cephadm binary from chacra
222 url = chacra.get_binary_url(
223 'cephadm',
224 project=bp.project,
225 distro=bp.distro.split('/')[0],
226 release=bp.distro.split('/')[1],
227 arch=bp.arch,
228 flavor=flavor,
229 branch=branch,
230 sha1=sha1,
231 )
232 log.info("Discovered cachra url: %s", url)
233 ctx.cluster.run(
234 args=[
235 'curl', '--silent', '-L', url,
236 run.Raw('>'),
237 ctx.cephadm,
238 run.Raw('&&'),
239 'ls', '-l',
240 ctx.cephadm,
241 ],
242 )
243
244 # sanity-check the resulting file and set executable bit
245 cephadm_file_size = '$(stat -c%s {})'.format(ctx.cephadm)
246 ctx.cluster.run(
247 args=[
248 'test', '-s', ctx.cephadm,
249 run.Raw('&&'),
250 'test', run.Raw(cephadm_file_size), "-gt", run.Raw('1000'),
251 run.Raw('&&'),
252 'chmod', '+x', ctx.cephadm,
253 ],
254 )
255
256
257 def _rm_cluster(ctx, cluster_name):
258 log.info('Removing cluster...')
259 ctx.cluster.run(args=[
260 'sudo',
261 ctx.cephadm,
262 'rm-cluster',
263 '--fsid', ctx.ceph[cluster_name].fsid,
264 '--force',
265 ])
266
267
268 def _rm_cephadm(ctx):
269 log.info('Removing cephadm ...')
270 ctx.cluster.run(
271 args=[
272 'rm',
273 '-rf',
274 ctx.cephadm,
275 ],
276 )
277
278
279 @contextlib.contextmanager
280 def ceph_log(ctx, config):
281 cluster_name = config['cluster']
282 fsid = ctx.ceph[cluster_name].fsid
283
284 update_archive_setting(ctx, 'log', '/var/log/ceph')
285
286
287 try:
288 yield
289
290 except Exception:
291 # we need to know this below
292 ctx.summary['success'] = False
293 raise
294
295 finally:
296 log.info('Checking cluster log for badness...')
297 def first_in_ceph_log(pattern, excludes):
298 """
299 Find the first occurrence of the specified pattern in the Ceph log.
300 Returns None if none is found.
301
302 :param pattern: Pattern scanned for.
303 :param excludes: Patterns to ignore.
304 :return: First line of text (or None if not found)
305 """
306 args = [
307 'sudo',
308 'egrep', pattern,
309 '/var/log/ceph/{fsid}/ceph.log'.format(
310 fsid=fsid),
311 ]
312 if excludes:
313 for exclude in excludes:
314 args.extend([run.Raw('|'), 'egrep', '-v', exclude])
315 args.extend([
316 run.Raw('|'), 'head', '-n', '1',
317 ])
318 r = ctx.ceph[cluster_name].bootstrap_remote.run(
319 stdout=StringIO(),
320 args=args,
321 )
322 stdout = r.stdout.getvalue()
323 if stdout != '':
324 return stdout
325 return None
326
327 if first_in_ceph_log(r'\[ERR\]|\[WRN\]|\[SEC\]',
328 config.get('log-ignorelist')) is not None:
329 log.warning('Found errors (ERR|WRN|SEC) in cluster log')
330 ctx.summary['success'] = False
331 # use the most severe problem as the failure reason
332 if 'failure_reason' not in ctx.summary:
333 for pattern in [r'\[SEC\]', r'\[ERR\]', r'\[WRN\]']:
334 match = first_in_ceph_log(pattern, config.get('log-ignorelist'))
335 if match is not None:
336 ctx.summary['failure_reason'] = \
337 '"{match}" in cluster log'.format(
338 match=match.rstrip('\n'),
339 )
340 break
341
342 if ctx.archive is not None and \
343 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
344 # and logs
345 log.info('Compressing logs...')
346 run.wait(
347 ctx.cluster.run(
348 args=[
349 'sudo',
350 'find',
351 '/var/log/ceph', # all logs, not just for the cluster
352 '/var/log/rbd-target-api', # ceph-iscsi
353 '-name',
354 '*.log',
355 '-print0',
356 run.Raw('|'),
357 'sudo',
358 'xargs',
359 '-0',
360 '--no-run-if-empty',
361 '--',
362 'gzip',
363 '--',
364 ],
365 wait=False,
366 ),
367 )
368
369 log.info('Archiving logs...')
370 path = os.path.join(ctx.archive, 'remote')
371 try:
372 os.makedirs(path)
373 except OSError:
374 pass
375 for remote in ctx.cluster.remotes.keys():
376 sub = os.path.join(path, remote.shortname)
377 try:
378 os.makedirs(sub)
379 except OSError:
380 pass
381 try:
382 teuthology.pull_directory(remote, '/var/log/ceph', # everything
383 os.path.join(sub, 'log'))
384 except ReadError:
385 pass
386
387
388 @contextlib.contextmanager
389 def ceph_crash(ctx, config):
390 """
391 Gather crash dumps from /var/lib/ceph/$fsid/crash
392 """
393 cluster_name = config['cluster']
394 fsid = ctx.ceph[cluster_name].fsid
395
396 update_archive_setting(ctx, 'crash', '/var/lib/ceph/crash')
397
398 try:
399 yield
400
401 finally:
402 if ctx.archive is not None:
403 log.info('Archiving crash dumps...')
404 path = os.path.join(ctx.archive, 'remote')
405 try:
406 os.makedirs(path)
407 except OSError:
408 pass
409 for remote in ctx.cluster.remotes.keys():
410 sub = os.path.join(path, remote.shortname)
411 try:
412 os.makedirs(sub)
413 except OSError:
414 pass
415 try:
416 teuthology.pull_directory(remote,
417 '/var/lib/ceph/%s/crash' % fsid,
418 os.path.join(sub, 'crash'))
419 except ReadError:
420 pass
421
422
423 @contextlib.contextmanager
424 def pull_image(ctx, config):
425 cluster_name = config['cluster']
426 log.info(f'Pulling image {ctx.ceph[cluster_name].image} on all hosts...')
427 run.wait(
428 ctx.cluster.run(
429 args=[
430 'sudo',
431 ctx.cephadm,
432 '--image', ctx.ceph[cluster_name].image,
433 'pull',
434 ],
435 wait=False,
436 )
437 )
438
439 try:
440 yield
441 finally:
442 pass
443
444
445 @contextlib.contextmanager
446 def ceph_bootstrap(ctx, config):
447 """
448 Bootstrap ceph cluster.
449
450 :param ctx: the argparse.Namespace object
451 :param config: the config dict
452 """
453 cluster_name = config['cluster']
454 testdir = teuthology.get_testdir(ctx)
455 fsid = ctx.ceph[cluster_name].fsid
456
457 bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
458 first_mon = ctx.ceph[cluster_name].first_mon
459 first_mon_role = ctx.ceph[cluster_name].first_mon_role
460 mons = ctx.ceph[cluster_name].mons
461
462 ctx.cluster.run(args=[
463 'sudo', 'mkdir', '-p', '/etc/ceph',
464 ])
465 ctx.cluster.run(args=[
466 'sudo', 'chmod', '777', '/etc/ceph',
467 ])
468 try:
469 # write seed config
470 log.info('Writing seed config...')
471 conf_fp = BytesIO()
472 seed_config = build_initial_config(ctx, config)
473 seed_config.write(conf_fp)
474 bootstrap_remote.write_file(
475 path='{}/seed.{}.conf'.format(testdir, cluster_name),
476 data=conf_fp.getvalue())
477 log.debug('Final config:\n' + conf_fp.getvalue().decode())
478 ctx.ceph[cluster_name].conf = seed_config
479
480 # register initial daemons
481 ctx.daemons.register_daemon(
482 bootstrap_remote, 'mon', first_mon,
483 cluster=cluster_name,
484 fsid=fsid,
485 logger=log.getChild('mon.' + first_mon),
486 wait=False,
487 started=True,
488 )
489 if not ctx.ceph[cluster_name].roleless:
490 first_mgr = ctx.ceph[cluster_name].first_mgr
491 ctx.daemons.register_daemon(
492 bootstrap_remote, 'mgr', first_mgr,
493 cluster=cluster_name,
494 fsid=fsid,
495 logger=log.getChild('mgr.' + first_mgr),
496 wait=False,
497 started=True,
498 )
499
500 # bootstrap
501 log.info('Bootstrapping...')
502 cmd = [
503 'sudo',
504 ctx.cephadm,
505 '--image', ctx.ceph[cluster_name].image,
506 '-v',
507 'bootstrap',
508 '--fsid', fsid,
509 '--config', '{}/seed.{}.conf'.format(testdir, cluster_name),
510 '--output-config', '/etc/ceph/{}.conf'.format(cluster_name),
511 '--output-keyring',
512 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
513 '--output-pub-ssh-key', '{}/{}.pub'.format(testdir, cluster_name),
514 ]
515 if config.get("no_cgroups_split") is True:
516 cmd.insert(cmd.index("bootstrap"), "--no-cgroups-split")
517
518 if config.get('registry-login'):
519 registry = config['registry-login']
520 cmd += [
521 "--registry-url", registry['url'],
522 "--registry-username", registry['username'],
523 "--registry-password", registry['password'],
524 ]
525
526 if not ctx.ceph[cluster_name].roleless:
527 cmd += [
528 '--mon-id', first_mon,
529 '--mgr-id', first_mgr,
530 '--orphan-initial-daemons', # we will do it explicitly!
531 '--skip-monitoring-stack', # we'll provision these explicitly
532 ]
533
534 if mons[first_mon_role].startswith('['):
535 cmd += ['--mon-addrv', mons[first_mon_role]]
536 else:
537 cmd += ['--mon-ip', mons[first_mon_role]]
538 if config.get('skip_dashboard'):
539 cmd += ['--skip-dashboard']
540 if config.get('skip_monitoring_stack'):
541 cmd += ['--skip-monitoring-stack']
542 if config.get('single_host_defaults'):
543 cmd += ['--single-host-defaults']
544 if not config.get('avoid_pacific_features', False):
545 cmd += ['--skip-admin-label']
546 # bootstrap makes the keyring root 0600, so +r it for our purposes
547 cmd += [
548 run.Raw('&&'),
549 'sudo', 'chmod', '+r',
550 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
551 ]
552 bootstrap_remote.run(args=cmd)
553
554 # fetch keys and configs
555 log.info('Fetching config...')
556 ctx.ceph[cluster_name].config_file = \
557 bootstrap_remote.read_file(f'/etc/ceph/{cluster_name}.conf')
558 log.info('Fetching client.admin keyring...')
559 ctx.ceph[cluster_name].admin_keyring = \
560 bootstrap_remote.read_file(f'/etc/ceph/{cluster_name}.client.admin.keyring')
561 log.info('Fetching mon keyring...')
562 ctx.ceph[cluster_name].mon_keyring = \
563 bootstrap_remote.read_file(f'/var/lib/ceph/{fsid}/mon.{first_mon}/keyring', sudo=True)
564
565 # fetch ssh key, distribute to additional nodes
566 log.info('Fetching pub ssh key...')
567 ssh_pub_key = bootstrap_remote.read_file(
568 f'{testdir}/{cluster_name}.pub').decode('ascii').strip()
569
570 log.info('Installing pub ssh key for root users...')
571 ctx.cluster.run(args=[
572 'sudo', 'install', '-d', '-m', '0700', '/root/.ssh',
573 run.Raw('&&'),
574 'echo', ssh_pub_key,
575 run.Raw('|'),
576 'sudo', 'tee', '-a', '/root/.ssh/authorized_keys',
577 run.Raw('&&'),
578 'sudo', 'chmod', '0600', '/root/.ssh/authorized_keys',
579 ])
580
581 # set options
582 if config.get('allow_ptrace', True):
583 _shell(ctx, cluster_name, bootstrap_remote,
584 ['ceph', 'config', 'set', 'mgr', 'mgr/cephadm/allow_ptrace', 'true'])
585
586 if not config.get('avoid_pacific_features', False):
587 log.info('Distributing conf and client.admin keyring to all hosts + 0755')
588 _shell(ctx, cluster_name, bootstrap_remote,
589 ['ceph', 'orch', 'client-keyring', 'set', 'client.admin',
590 '*', '--mode', '0755'],
591 check_status=False)
592
593 # add other hosts
594 for remote in ctx.cluster.remotes.keys():
595 if remote == bootstrap_remote:
596 continue
597
598 # note: this may be redundant (see above), but it avoids
599 # us having to wait for cephadm to do it.
600 log.info('Writing (initial) conf and keyring to %s' % remote.shortname)
601 remote.write_file(
602 path='/etc/ceph/{}.conf'.format(cluster_name),
603 data=ctx.ceph[cluster_name].config_file)
604 remote.write_file(
605 path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
606 data=ctx.ceph[cluster_name].admin_keyring)
607
608 log.info('Adding host %s to orchestrator...' % remote.shortname)
609 _shell(ctx, cluster_name, bootstrap_remote, [
610 'ceph', 'orch', 'host', 'add',
611 remote.shortname
612 ])
613 r = _shell(ctx, cluster_name, bootstrap_remote,
614 ['ceph', 'orch', 'host', 'ls', '--format=json'],
615 stdout=StringIO())
616 hosts = [node['hostname'] for node in json.loads(r.stdout.getvalue())]
617 assert remote.shortname in hosts
618
619 yield
620
621 finally:
622 log.info('Cleaning up testdir ceph.* files...')
623 ctx.cluster.run(args=[
624 'rm', '-f',
625 '{}/seed.{}.conf'.format(testdir, cluster_name),
626 '{}/{}.pub'.format(testdir, cluster_name),
627 ])
628
629 log.info('Stopping all daemons...')
630
631 # this doesn't block until they are all stopped...
632 #ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
633
634 # stop the daemons we know
635 for role in ctx.daemons.resolve_role_list(None, CEPH_ROLE_TYPES, True):
636 cluster, type_, id_ = teuthology.split_role(role)
637 try:
638 ctx.daemons.get_daemon(type_, id_, cluster).stop()
639 except Exception:
640 log.exception(f'Failed to stop "{role}"')
641 raise
642
643 # tear down anything left (but leave the logs behind)
644 ctx.cluster.run(
645 args=[
646 'sudo',
647 ctx.cephadm,
648 'rm-cluster',
649 '--fsid', fsid,
650 '--force',
651 '--keep-logs',
652 ],
653 check_status=False, # may fail if upgrading from old cephadm
654 )
655
656 # clean up /etc/ceph
657 ctx.cluster.run(args=[
658 'sudo', 'rm', '-f',
659 '/etc/ceph/{}.conf'.format(cluster_name),
660 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
661 ])
662
663
664 @contextlib.contextmanager
665 def ceph_mons(ctx, config):
666 """
667 Deploy any additional mons
668 """
669 cluster_name = config['cluster']
670 fsid = ctx.ceph[cluster_name].fsid
671
672 try:
673 daemons = {}
674 if config.get('add_mons_via_daemon_add'):
675 # This is the old way of adding mons that works with the (early) octopus
676 # cephadm scheduler.
677 num_mons = 1
678 for remote, roles in ctx.cluster.remotes.items():
679 for mon in [r for r in roles
680 if teuthology.is_type('mon', cluster_name)(r)]:
681 c_, _, id_ = teuthology.split_role(mon)
682 if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
683 continue
684 log.info('Adding %s on %s' % (mon, remote.shortname))
685 num_mons += 1
686 _shell(ctx, cluster_name, remote, [
687 'ceph', 'orch', 'daemon', 'add', 'mon',
688 remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_,
689 ])
690 ctx.daemons.register_daemon(
691 remote, 'mon', id_,
692 cluster=cluster_name,
693 fsid=fsid,
694 logger=log.getChild(mon),
695 wait=False,
696 started=True,
697 )
698 daemons[mon] = (remote, id_)
699
700 with contextutil.safe_while(sleep=1, tries=180) as proceed:
701 while proceed():
702 log.info('Waiting for %d mons in monmap...' % (num_mons))
703 r = _shell(
704 ctx=ctx,
705 cluster_name=cluster_name,
706 remote=remote,
707 args=[
708 'ceph', 'mon', 'dump', '-f', 'json',
709 ],
710 stdout=StringIO(),
711 )
712 j = json.loads(r.stdout.getvalue())
713 if len(j['mons']) == num_mons:
714 break
715 else:
716 nodes = []
717 for remote, roles in ctx.cluster.remotes.items():
718 for mon in [r for r in roles
719 if teuthology.is_type('mon', cluster_name)(r)]:
720 c_, _, id_ = teuthology.split_role(mon)
721 log.info('Adding %s on %s' % (mon, remote.shortname))
722 nodes.append(remote.shortname
723 + ':' + ctx.ceph[cluster_name].mons[mon]
724 + '=' + id_)
725 if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
726 continue
727 daemons[mon] = (remote, id_)
728
729 _shell(ctx, cluster_name, remote, [
730 'ceph', 'orch', 'apply', 'mon',
731 str(len(nodes)) + ';' + ';'.join(nodes)]
732 )
733 for mon, i in daemons.items():
734 remote, id_ = i
735 ctx.daemons.register_daemon(
736 remote, 'mon', id_,
737 cluster=cluster_name,
738 fsid=fsid,
739 logger=log.getChild(mon),
740 wait=False,
741 started=True,
742 )
743
744 with contextutil.safe_while(sleep=1, tries=180) as proceed:
745 while proceed():
746 log.info('Waiting for %d mons in monmap...' % (len(nodes)))
747 r = _shell(
748 ctx=ctx,
749 cluster_name=cluster_name,
750 remote=remote,
751 args=[
752 'ceph', 'mon', 'dump', '-f', 'json',
753 ],
754 stdout=StringIO(),
755 )
756 j = json.loads(r.stdout.getvalue())
757 if len(j['mons']) == len(nodes):
758 break
759
760 # refresh our (final) ceph.conf file
761 bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
762 log.info('Generating final ceph.conf file...')
763 r = _shell(
764 ctx=ctx,
765 cluster_name=cluster_name,
766 remote=bootstrap_remote,
767 args=[
768 'ceph', 'config', 'generate-minimal-conf',
769 ],
770 stdout=StringIO(),
771 )
772 ctx.ceph[cluster_name].config_file = r.stdout.getvalue()
773
774 yield
775
776 finally:
777 pass
778
779
780 @contextlib.contextmanager
781 def ceph_mgrs(ctx, config):
782 """
783 Deploy any additional mgrs
784 """
785 cluster_name = config['cluster']
786 fsid = ctx.ceph[cluster_name].fsid
787
788 try:
789 nodes = []
790 daemons = {}
791 for remote, roles in ctx.cluster.remotes.items():
792 for mgr in [r for r in roles
793 if teuthology.is_type('mgr', cluster_name)(r)]:
794 c_, _, id_ = teuthology.split_role(mgr)
795 log.info('Adding %s on %s' % (mgr, remote.shortname))
796 nodes.append(remote.shortname + '=' + id_)
797 if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mgr:
798 continue
799 daemons[mgr] = (remote, id_)
800 if nodes:
801 _shell(ctx, cluster_name, remote, [
802 'ceph', 'orch', 'apply', 'mgr',
803 str(len(nodes)) + ';' + ';'.join(nodes)]
804 )
805 for mgr, i in daemons.items():
806 remote, id_ = i
807 ctx.daemons.register_daemon(
808 remote, 'mgr', id_,
809 cluster=cluster_name,
810 fsid=fsid,
811 logger=log.getChild(mgr),
812 wait=False,
813 started=True,
814 )
815
816 yield
817
818 finally:
819 pass
820
821
822 @contextlib.contextmanager
823 def ceph_osds(ctx, config):
824 """
825 Deploy OSDs
826 """
827 cluster_name = config['cluster']
828 fsid = ctx.ceph[cluster_name].fsid
829
830 try:
831 log.info('Deploying OSDs...')
832
833 # provision OSDs in numeric order
834 id_to_remote = {}
835 devs_by_remote = {}
836 for remote, roles in ctx.cluster.remotes.items():
837 devs_by_remote[remote] = teuthology.get_scratch_devices(remote)
838 for osd in [r for r in roles
839 if teuthology.is_type('osd', cluster_name)(r)]:
840 _, _, id_ = teuthology.split_role(osd)
841 id_to_remote[int(id_)] = (osd, remote)
842
843 cur = 0
844 for osd_id in sorted(id_to_remote.keys()):
845 osd, remote = id_to_remote[osd_id]
846 _, _, id_ = teuthology.split_role(osd)
847 assert int(id_) == cur
848 devs = devs_by_remote[remote]
849 assert devs ## FIXME ##
850 dev = devs.pop()
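# descriptive note: when the scratch device looks like an LVM path (its name
# contains both 'vg' and 'lv'), the '/dev/' prefix is dropped, presumably so
# the device is handed to 'ceph orch daemon add osd' in '<vg>/<lv>' notation;
# plain block devices are passed through unchanged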
851 if all(_ in dev for _ in ('lv', 'vg')):
852 short_dev = dev.replace('/dev/', '')
853 else:
854 short_dev = dev
855 log.info('Deploying %s on %s with %s...' % (
856 osd, remote.shortname, dev))
857 _shell(ctx, cluster_name, remote, [
858 'ceph-volume', 'lvm', 'zap', dev])
859 add_osd_args = ['ceph', 'orch', 'daemon', 'add', 'osd',
860 remote.shortname + ':' + short_dev]
861 osd_method = config.get('osd_method')
862 if osd_method:
863 add_osd_args.append(osd_method)
864 _shell(ctx, cluster_name, remote, add_osd_args)
865 ctx.daemons.register_daemon(
866 remote, 'osd', id_,
867 cluster=cluster_name,
868 fsid=fsid,
869 logger=log.getChild(osd),
870 wait=False,
871 started=True,
872 )
873 cur += 1
874
875 if cur == 0:
876 _shell(ctx, cluster_name, remote, [
877 'ceph', 'orch', 'apply', 'osd', '--all-available-devices',
878 ])
879 # expect the number of scratch devs
880 num_osds = sum(map(len, devs_by_remote.values()))
881 assert num_osds
882 else:
883 # expect the number of OSDs we created
884 num_osds = cur
885
886 log.info(f'Waiting for {num_osds} OSDs to come up...')
887 with contextutil.safe_while(sleep=1, tries=120) as proceed:
888 while proceed():
889 p = _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
890 ['ceph', 'osd', 'stat', '-f', 'json'], stdout=StringIO())
891 j = json.loads(p.stdout.getvalue())
892 if int(j.get('num_up_osds', 0)) == num_osds:
893 break
894
895 if not hasattr(ctx, 'managers'):
896 ctx.managers = {}
897 ctx.managers[cluster_name] = CephManager(
898 ctx.ceph[cluster_name].bootstrap_remote,
899 ctx=ctx,
900 logger=log.getChild('ceph_manager.' + cluster_name),
901 cluster=cluster_name,
902 cephadm=True,
903 )
904
905 yield
906 finally:
907 pass
908
909
910 @contextlib.contextmanager
911 def ceph_mdss(ctx, config):
912 """
913 Deploy MDSs
914 """
915 cluster_name = config['cluster']
916 fsid = ctx.ceph[cluster_name].fsid
917
918 nodes = []
919 daemons = {}
920 for remote, roles in ctx.cluster.remotes.items():
921 for role in [r for r in roles
922 if teuthology.is_type('mds', cluster_name)(r)]:
923 c_, _, id_ = teuthology.split_role(role)
924 log.info('Adding %s on %s' % (role, remote.shortname))
925 nodes.append(remote.shortname + '=' + id_)
926 daemons[role] = (remote, id_)
927 if nodes:
928 _shell(ctx, cluster_name, remote, [
929 'ceph', 'orch', 'apply', 'mds',
930 'all',
931 str(len(nodes)) + ';' + ';'.join(nodes)]
932 )
933 for role, i in daemons.items():
934 remote, id_ = i
935 ctx.daemons.register_daemon(
936 remote, 'mds', id_,
937 cluster=cluster_name,
938 fsid=fsid,
939 logger=log.getChild(role),
940 wait=False,
941 started=True,
942 )
943
944 yield
945
946 @contextlib.contextmanager
947 def cephfs_setup(ctx, config):
948 mdss = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))
949
950 # If there are any MDSs, then create a filesystem for them to use
951 # Do this last because it requires the mon cluster to be up and running
952 if len(mdss) > 0:
953 log.info('Setting up CephFS filesystem(s)...')
954 cephfs_config = config.get('cephfs', {})
955 fs_configs = cephfs_config.pop('fs', [{'name': 'cephfs'}])
956 set_allow_multifs = len(fs_configs) > 1
957
958 # wait for standbys to become available (slow due to valgrind, perhaps)
959 mdsc = MDSCluster(ctx)
960 with contextutil.safe_while(sleep=2,tries=150) as proceed:
961 while proceed():
962 if len(mdsc.get_standby_daemons()) >= len(mdss):
963 break
964
965 fss = []
966 for fs_config in fs_configs:
967 assert isinstance(fs_config, dict)
968 name = fs_config.pop('name')
969 temp = deepcopy(cephfs_config)
970 teuthology.deep_merge(temp, fs_config)
971 subvols = config.get('subvols', None)
972 if subvols:
973 teuthology.deep_merge(temp, {'subvols': subvols})
974 fs = Filesystem(ctx, fs_config=temp, name=name, create=True)
975 if set_allow_multifs:
976 fs.set_allow_multifs()
977 set_allow_multifs = False
978 fss.append(fs)
979
980 yield
981
982 for fs in fss:
983 fs.destroy()
984 else:
985 yield
986
987 @contextlib.contextmanager
988 def ceph_monitoring(daemon_type, ctx, config):
989 """
990 Deploy prometheus, node-exporter, etc.
991 """
992 cluster_name = config['cluster']
993 fsid = ctx.ceph[cluster_name].fsid
994
995 nodes = []
996 daemons = {}
997 for remote, roles in ctx.cluster.remotes.items():
998 for role in [r for r in roles
999 if teuthology.is_type(daemon_type, cluster_name)(r)]:
1000 c_, _, id_ = teuthology.split_role(role)
1001 log.info('Adding %s on %s' % (role, remote.shortname))
1002 nodes.append(remote.shortname + '=' + id_)
1003 daemons[role] = (remote, id_)
1004 if nodes:
1005 _shell(ctx, cluster_name, remote, [
1006 'ceph', 'orch', 'apply', daemon_type,
1007 str(len(nodes)) + ';' + ';'.join(nodes)]
1008 )
1009 for role, i in daemons.items():
1010 remote, id_ = i
1011 ctx.daemons.register_daemon(
1012 remote, daemon_type, id_,
1013 cluster=cluster_name,
1014 fsid=fsid,
1015 logger=log.getChild(role),
1016 wait=False,
1017 started=True,
1018 )
1019
1020 yield
1021
1022
1023 @contextlib.contextmanager
1024 def ceph_rgw(ctx, config):
1025 """
1026 Deploy rgw
1027 """
1028 cluster_name = config['cluster']
1029 fsid = ctx.ceph[cluster_name].fsid
1030
1031 nodes = {}
1032 daemons = {}
1033 for remote, roles in ctx.cluster.remotes.items():
1034 for role in [r for r in roles
1035 if teuthology.is_type('rgw', cluster_name)(r)]:
1036 c_, _, id_ = teuthology.split_role(role)
1037 log.info('Adding %s on %s' % (role, remote.shortname))
1038 svc = '.'.join(id_.split('.')[0:2])
1039 if svc not in nodes:
1040 nodes[svc] = []
1041 nodes[svc].append(remote.shortname + '=' + id_)
1042 daemons[role] = (remote, id_)
1043
1044 for svc, nodes in nodes.items():
1045 _shell(ctx, cluster_name, remote, [
1046 'ceph', 'orch', 'apply', 'rgw', svc,
1047 '--placement',
1048 str(len(nodes)) + ';' + ';'.join(nodes)]
1049 )
1050 for role, i in daemons.items():
1051 remote, id_ = i
1052 ctx.daemons.register_daemon(
1053 remote, 'rgw', id_,
1054 cluster=cluster_name,
1055 fsid=fsid,
1056 logger=log.getChild(role),
1057 wait=False,
1058 started=True,
1059 )
1060
1061 yield
1062
1063
1064 @contextlib.contextmanager
1065 def ceph_iscsi(ctx, config):
1066 """
1067 Deploy iSCSI gateways
1068 """
1069 cluster_name = config['cluster']
1070 fsid = ctx.ceph[cluster_name].fsid
1071
1072 nodes = []
1073 daemons = {}
1074 ips = []
1075
1076 for remote, roles in ctx.cluster.remotes.items():
1077 for role in [r for r in roles
1078 if teuthology.is_type('iscsi', cluster_name)(r)]:
1079 c_, _, id_ = teuthology.split_role(role)
1080 log.info('Adding %s on %s' % (role, remote.shortname))
1081 nodes.append(remote.shortname + '=' + id_)
1082 daemons[role] = (remote, id_)
1083 ips.append(remote.ip_address)
1084 trusted_ip_list = ','.join(ips)
1085 if nodes:
1086 poolname = 'datapool'
1087 # ceph osd pool create datapool 3 3 replicated
1088 _shell(ctx, cluster_name, remote, [
1089 'ceph', 'osd', 'pool', 'create',
1090 poolname, '3', '3', 'replicated']
1091 )
1092
1093 _shell(ctx, cluster_name, remote, [
1094 'rbd', 'pool', 'init', poolname]
1095 )
1096
1097 # ceph orch apply iscsi datapool (admin)user (admin)password
1098 _shell(ctx, cluster_name, remote, [
1099 'ceph', 'orch', 'apply', 'iscsi',
1100 poolname, 'admin', 'admin',
1101 '--trusted_ip_list', trusted_ip_list,
1102 '--placement', str(len(nodes)) + ';' + ';'.join(nodes)]
1103 )
1104
1105 # used by iSCSI clients to identify valid gateway IPs
1106 conf_data = dedent(f"""
1107 [config]
1108 trusted_ip_list = {trusted_ip_list}
1109 """)
1110 distribute_iscsi_gateway_cfg(ctx, conf_data)
1111
1112 for role, i in daemons.items():
1113 remote, id_ = i
1114 ctx.daemons.register_daemon(
1115 remote, 'iscsi', id_,
1116 cluster=cluster_name,
1117 fsid=fsid,
1118 logger=log.getChild(role),
1119 wait=False,
1120 started=True,
1121 )
1122
1123 yield
1124
1125
1126 @contextlib.contextmanager
1127 def ceph_clients(ctx, config):
1128 cluster_name = config['cluster']
1129
1130 log.info('Setting up client nodes...')
1131 clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
1132 for remote, roles_for_host in clients.remotes.items():
1133 for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
1134 cluster_name):
1135 name = teuthology.ceph_role(role)
1136 client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name,
1137 name)
1138 r = _shell(
1139 ctx=ctx,
1140 cluster_name=cluster_name,
1141 remote=remote,
1142 args=[
1143 'ceph', 'auth',
1144 'get-or-create', name,
1145 'mon', 'allow *',
1146 'osd', 'allow *',
1147 'mds', 'allow *',
1148 'mgr', 'allow *',
1149 ],
1150 stdout=StringIO(),
1151 )
1152 keyring = r.stdout.getvalue()
1153 remote.sudo_write_file(client_keyring, keyring, mode='0644')
1154 yield
1155
1156
1157 @contextlib.contextmanager
1158 def ceph_initial():
1159 try:
1160 yield
1161 finally:
1162 log.info('Teardown complete')
1163
1164
1165 ## public methods
1166 @contextlib.contextmanager
1167 def stop(ctx, config):
1168 """
1169 Stop ceph daemons
1170
1171 For example::
1172 tasks:
1173 - ceph.stop: [mds.*]
1174
1175 tasks:
1176 - ceph.stop: [osd.0, osd.2]
1177
1178 tasks:
1179 - ceph.stop:
1180 daemons: [osd.0, osd.2]
1181
1182 """
1183 if config is None:
1184 config = {}
1185 elif isinstance(config, list):
1186 config = {'daemons': config}
1187
1188 daemons = ctx.daemons.resolve_role_list(
1189 config.get('daemons', None), CEPH_ROLE_TYPES, True)
1190 clusters = set()
1191
1192 for role in daemons:
1193 cluster, type_, id_ = teuthology.split_role(role)
1194 ctx.daemons.get_daemon(type_, id_, cluster).stop()
1195 clusters.add(cluster)
1196
1197 # for cluster in clusters:
1198 # ctx.ceph[cluster].watchdog.stop()
1199 # ctx.ceph[cluster].watchdog.join()
1200
1201 yield
1202
1203
1204 def shell(ctx, config):
1205 """
1206 Execute (shell) commands
1207 """
1208 cluster_name = config.get('cluster', 'ceph')
1209
1210 args = []
1211 for k in config.pop('env', []):
1212 args.extend(['-e', k + '=' + ctx.config.get(k, '')])
1213 for k in config.pop('volumes', []):
1214 args.extend(['-v', k])
1215
1216 if 'all-roles' in config and len(config) == 1:
1217 a = config['all-roles']
1218 roles = teuthology.all_roles(ctx.cluster)
1219 config = dict((id_, a) for id_ in roles if not id_.startswith('host.'))
1220 elif 'all-hosts' in config and len(config) == 1:
1221 a = config['all-hosts']
1222 roles = teuthology.all_roles(ctx.cluster)
1223 config = dict((id_, a) for id_ in roles if id_.startswith('host.'))
1224
1225 for role, cmd in config.items():
1226 (remote,) = ctx.cluster.only(role).remotes.keys()
1227 log.info('Running commands on role %s host %s', role, remote.name)
1228 if isinstance(cmd, list):
1229 for c in cmd:
1230 _shell(ctx, cluster_name, remote,
1231 ['bash', '-c', subst_vip(ctx, c)],
1232 extra_cephadm_args=args)
1233 else:
1234 assert isinstance(cmd, str)
1235 _shell(ctx, cluster_name, remote,
1236 ['bash', '-ex', '-c', subst_vip(ctx, cmd)],
1237 extra_cephadm_args=args)
1238
1239
1240 def apply(ctx, config):
1241 """
1242 Apply spec
1243
1244 tasks:
1245 - cephadm.apply:
1246 specs:
1247 - service_type: rgw
1248 service_id: foo
1249 spec:
1250 rgw_frontend_port: 8000
1251 - service_type: rgw
1252 service_id: bar
1253 spec:
1254 rgw_frontend_port: 9000
1255 zone: bar
1256 realm: asdf
1257
1258 """
1259 cluster_name = config.get('cluster', 'ceph')
1260
1261 specs = config.get('specs', [])
1262 y = subst_vip(ctx, yaml.dump_all(specs))
1263
1264 log.info(f'Applying spec(s):\n{y}')
1265 _shell(
1266 ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
1267 ['ceph', 'orch', 'apply', '-i', '-'],
1268 stdin=y,
1269 )
1270
1271
1272 def wait_for_service(ctx, config):
1273 """
1274 Wait for a service to be fully started
1275
1276 tasks:
1277 - cephadm.wait_for_service:
1278 service: rgw.foo
1279 timeout: 60 # defaults to 300
1280
1281 """
1282 cluster_name = config.get('cluster', 'ceph')
1283 timeout = config.get('timeout', 300)
1284 service = config.get('service')
1285 assert service
1286
1287 log.info(
1288 f'Waiting for {cluster_name} service {service} to start (timeout {timeout})...'
1289 )
1290 with contextutil.safe_while(sleep=1, tries=timeout) as proceed:
1291 while proceed():
1292 r = _shell(
1293 ctx=ctx,
1294 cluster_name=cluster_name,
1295 remote=ctx.ceph[cluster_name].bootstrap_remote,
1296 args=[
1297 'ceph', 'orch', 'ls', '-f', 'json',
1298 ],
1299 stdout=StringIO(),
1300 )
1301 j = json.loads(r.stdout.getvalue())
1302 svc = None
1303 for s in j:
1304 if s['service_name'] == service:
1305 svc = s
1306 break
1307 if svc:
1308 log.info(
1309 f"{service} has {s['status']['running']}/{s['status']['size']}"
1310 )
1311 if s['status']['running'] == s['status']['size']:
1312 break
1313
1314
1315 @contextlib.contextmanager
1316 def tweaked_option(ctx, config):
1317 """
1318 Set an option, and then restore it to its original value.
1319
1320 Note: due to the way tasks are executed/nested, it is not recommended to
1321 use this method as a standalone task; otherwise it will likely restore
1322 the tweaked option only at the /end/ of the 'tasks' block.
1323 """
1324 saved_options = {}
1325 # we can complicate this when necessary
1326 options = ['mon-health-to-clog']
1327 type_, id_ = 'mon', '*'
1328 cluster = config.get('cluster', 'ceph')
1329 manager = ctx.managers[cluster]
1330 if id_ == '*':
1331 get_from = next(teuthology.all_roles_of_type(ctx.cluster, type_))
1332 else:
1333 get_from = id_
1334 for option in options:
1335 if option not in config:
1336 continue
1337 value = 'true' if config[option] else 'false'
1338 option = option.replace('-', '_')
1339 old_value = manager.get_config(type_, get_from, option)
1340 if value != old_value:
1341 saved_options[option] = old_value
1342 manager.inject_args(type_, id_, option, value)
1343 yield
1344 for option, value in saved_options.items():
1345 manager.inject_args(type_, id_, option, value)
1346
1347
1348 @contextlib.contextmanager
1349 def restart(ctx, config):
1350 """
1351 restart ceph daemons
1352
1353 For example::
1354 tasks:
1355 - ceph.restart: [all]
1356
1357 For example::
1358 tasks:
1359 - ceph.restart: [osd.0, mon.1, mds.*]
1360
1361 or::
1362
1363 tasks:
1364 - ceph.restart:
1365 daemons: [osd.0, mon.1]
1366 wait-for-healthy: false
1367 wait-for-osds-up: true
1368
1369 :param ctx: Context
1370 :param config: Configuration
1371 """
1372 if config is None:
1373 config = {}
1374 elif isinstance(config, list):
1375 config = {'daemons': config}
1376
1377 daemons = ctx.daemons.resolve_role_list(
1378 config.get('daemons', None), CEPH_ROLE_TYPES, True)
1379 clusters = set()
1380
1381 log.info('daemons %s' % daemons)
1382 with tweaked_option(ctx, config):
1383 for role in daemons:
1384 cluster, type_, id_ = teuthology.split_role(role)
1385 d = ctx.daemons.get_daemon(type_, id_, cluster)
1386 assert d, 'daemon %s does not exist' % role
1387 d.stop()
1388 if type_ == 'osd':
1389 ctx.managers[cluster].mark_down_osd(id_)
1390 d.restart()
1391 clusters.add(cluster)
1392
1393 if config.get('wait-for-healthy', True):
1394 for cluster in clusters:
1395 healthy(ctx=ctx, config=dict(cluster=cluster))
1396 if config.get('wait-for-osds-up', False):
1397 for cluster in clusters:
1398 ctx.managers[cluster].wait_for_all_osds_up()
1399 yield
1400
1401
1402 @contextlib.contextmanager
1403 def distribute_config_and_admin_keyring(ctx, config):
1404 """
1405 Distribute a sufficient config and keyring for clients
1406 """
1407 cluster_name = config['cluster']
1408 log.info('Distributing (final) config and client.admin keyring...')
1409 for remote, roles in ctx.cluster.remotes.items():
1410 remote.write_file(
1411 '/etc/ceph/{}.conf'.format(cluster_name),
1412 ctx.ceph[cluster_name].config_file,
1413 sudo=True)
1414 remote.write_file(
1415 path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
1416 data=ctx.ceph[cluster_name].admin_keyring,
1417 sudo=True)
1418 try:
1419 yield
1420 finally:
1421 ctx.cluster.run(args=[
1422 'sudo', 'rm', '-f',
1423 '/etc/ceph/{}.conf'.format(cluster_name),
1424 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
1425 ])
1426
1427
1428 @contextlib.contextmanager
1429 def crush_setup(ctx, config):
1430 cluster_name = config['cluster']
1431
1432 profile = config.get('crush_tunables', 'default')
1433 log.info('Setting crush tunables to %s', profile)
1434 _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
1435 args=['ceph', 'osd', 'crush', 'tunables', profile])
1436 yield
1437
1438
1439 @contextlib.contextmanager
1440 def create_rbd_pool(ctx, config):
1441 if config.get('create_rbd_pool', False):
1442 cluster_name = config['cluster']
1443 log.info('Waiting for OSDs to come up')
1444 teuthology.wait_until_osds_up(
1445 ctx,
1446 cluster=ctx.cluster,
1447 remote=ctx.ceph[cluster_name].bootstrap_remote,
1448 ceph_cluster=cluster_name,
1449 )
1450 log.info('Creating RBD pool')
1451 _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
1452 args=['sudo', 'ceph', '--cluster', cluster_name,
1453 'osd', 'pool', 'create', 'rbd', '8'])
1454 _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
1455 args=['sudo', 'ceph', '--cluster', cluster_name,
1456 'osd', 'pool', 'application', 'enable',
1457 'rbd', 'rbd', '--yes-i-really-mean-it'
1458 ])
1459 yield
1460
1461
1462 @contextlib.contextmanager
1463 def _bypass():
1464 yield
1465
1466
1467 @contextlib.contextmanager
1468 def initialize_config(ctx, config):
1469 cluster_name = config['cluster']
1470 testdir = teuthology.get_testdir(ctx)
1471
1472 ctx.ceph[cluster_name].thrashers = []
1473 # fixme: setup watchdog, ala ceph.py
1474
1475 ctx.ceph[cluster_name].roleless = False # see below
1476
1477 first_ceph_cluster = False
1478 if not hasattr(ctx, 'daemons'):
1479 first_ceph_cluster = True
1480
1481 # cephadm mode?
1482 if 'cephadm_mode' not in config:
1483 config['cephadm_mode'] = 'root'
1484 assert config['cephadm_mode'] in ['root', 'cephadm-package']
1485 if config['cephadm_mode'] == 'root':
1486 ctx.cephadm = testdir + '/cephadm'
1487 else:
1488 ctx.cephadm = 'cephadm' # in the path
1489
1490 if first_ceph_cluster:
1491 # FIXME: this is global for all clusters
1492 ctx.daemons = DaemonGroup(
1493 use_cephadm=ctx.cephadm)
1494
1495 # uuid
1496 fsid = str(uuid.uuid1())
1497 log.info('Cluster fsid is %s' % fsid)
1498 ctx.ceph[cluster_name].fsid = fsid
1499
1500 # mon ips
1501 log.info('Choosing monitor IPs and ports...')
1502 remotes_and_roles = ctx.cluster.remotes.items()
1503 ips = [host for (host, port) in
1504 (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
1505
1506 if config.get('roleless', False):
1507 # mons will be named after hosts
1508 first_mon = None
1509 max_mons = config.get('max_mons', 5)
1510 for remote, _ in remotes_and_roles:
1511 ctx.cluster.remotes[remote].append('mon.' + remote.shortname)
1512 if not first_mon:
1513 first_mon = remote.shortname
1514 bootstrap_remote = remote
1515 max_mons -= 1
1516 if not max_mons:
1517 break
1518 log.info('No mon roles; fabricating mons')
1519
1520 roles = [role_list for (remote, role_list) in ctx.cluster.remotes.items()]
1521
1522 ctx.ceph[cluster_name].mons = get_mons(
1523 roles, ips, cluster_name,
1524 mon_bind_msgr2=config.get('mon_bind_msgr2', True),
1525 mon_bind_addrvec=config.get('mon_bind_addrvec', True),
1526 )
1527 log.info('Monitor IPs: %s' % ctx.ceph[cluster_name].mons)
1528
1529 if config.get('roleless', False):
1530 ctx.ceph[cluster_name].roleless = True
1531 ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
1532 ctx.ceph[cluster_name].first_mon = first_mon
1533 ctx.ceph[cluster_name].first_mon_role = 'mon.' + first_mon
1534 else:
1535 first_mon_role = sorted(ctx.ceph[cluster_name].mons.keys())[0]
1536 _, _, first_mon = teuthology.split_role(first_mon_role)
1537 (bootstrap_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
1538 log.info('First mon is mon.%s on %s' % (first_mon,
1539 bootstrap_remote.shortname))
1540 ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
1541 ctx.ceph[cluster_name].first_mon = first_mon
1542 ctx.ceph[cluster_name].first_mon_role = first_mon_role
1543
1544 others = ctx.cluster.remotes[bootstrap_remote]
1545 mgrs = sorted([r for r in others
1546 if teuthology.is_type('mgr', cluster_name)(r)])
1547 if not mgrs:
1548 raise RuntimeError('no mgrs on the same host as first mon %s' % first_mon)
1549 _, _, first_mgr = teuthology.split_role(mgrs[0])
1550 log.info('First mgr is %s' % (first_mgr))
1551 ctx.ceph[cluster_name].first_mgr = first_mgr
1552 yield
1553
1554
1555 @contextlib.contextmanager
1556 def task(ctx, config):
1557 """
1558 Deploy ceph cluster using cephadm
1559
1560 For example, teuthology.yaml can contain the 'defaults' section:
1561
1562 defaults:
1563 cephadm:
1564 containers:
1565 image: 'quay.io/ceph-ci/ceph'
1566
1567 Using overrides makes it possible to customize it per run.
1568 The equivalent 'overrides' section looks like:
1569
1570 overrides:
1571 cephadm:
1572 containers:
1573 image: 'quay.io/ceph-ci/ceph'
1574 registry-login:
1575 url: registry-url
1576 username: registry-user
1577 password: registry-password
1578
1579 :param ctx: the argparse.Namespace object
1580 :param config: the config dict
1581 """
1582 if config is None:
1583 config = {}
1584
1585 assert isinstance(config, dict), \
1586 "task only supports a dictionary for configuration"
1587
1588 overrides = ctx.config.get('overrides', {})
1589 teuthology.deep_merge(config, overrides.get('ceph', {}))
1590 teuthology.deep_merge(config, overrides.get('cephadm', {}))
1591 log.info('Config: ' + str(config))
1592
1593 # set up cluster context
1594 if not hasattr(ctx, 'ceph'):
1595 ctx.ceph = {}
1596 if 'cluster' not in config:
1597 config['cluster'] = 'ceph'
1598 cluster_name = config['cluster']
1599 if cluster_name not in ctx.ceph:
1600 ctx.ceph[cluster_name] = argparse.Namespace()
1601 ctx.ceph[cluster_name].bootstrapped = False
1602
1603 # image
1604 teuth_defaults = teuth_config.get('defaults', {})
1605 cephadm_defaults = teuth_defaults.get('cephadm', {})
1606 containers_defaults = cephadm_defaults.get('containers', {})
1607 container_image_name = containers_defaults.get('image', None)
1608
1609 containers = config.get('containers', {})
1610 container_image_name = containers.get('image', container_image_name)
1611
1612 if not hasattr(ctx.ceph[cluster_name], 'image'):
1613 ctx.ceph[cluster_name].image = config.get('image')
1614 ref = ctx.config.get("branch", "main")
1615 if not ctx.ceph[cluster_name].image:
1616 if not container_image_name:
1617 raise Exception("Configuration error occurred. "
1618 "The 'image' value is undefined for 'cephadm' task. "
1619 "Please provide corresponding options in the task's "
1620 "config, task 'overrides', or teuthology 'defaults' "
1621 "section.")
1622 sha1 = config.get('sha1')
1623 flavor = config.get('flavor', 'default')
1624
1625 if sha1:
1626 if flavor == "crimson":
1627 ctx.ceph[cluster_name].image = container_image_name + ':' + sha1 + '-' + flavor
1628 else:
1629 ctx.ceph[cluster_name].image = container_image_name + ':' + sha1
1630 ref = sha1
1631 else:
1632 # fall back to using the branch value
1633 ctx.ceph[cluster_name].image = container_image_name + ':' + ref
1634 log.info('Cluster image is %s' % ctx.ceph[cluster_name].image)
1635
1636
1637 with contextutil.nested(
1638 # if the cluster is already bootstrapped, bypass the corresponding methods
1639 lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
1640 else initialize_config(ctx=ctx, config=config),
1641 lambda: ceph_initial(),
1642 lambda: normalize_hostnames(ctx=ctx),
1643 lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
1644 else download_cephadm(ctx=ctx, config=config, ref=ref),
1645 lambda: ceph_log(ctx=ctx, config=config),
1646 lambda: ceph_crash(ctx=ctx, config=config),
1647 lambda: pull_image(ctx=ctx, config=config),
1648 lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
1649 else ceph_bootstrap(ctx, config),
1650 lambda: crush_setup(ctx=ctx, config=config),
1651 lambda: ceph_mons(ctx=ctx, config=config),
1652 lambda: distribute_config_and_admin_keyring(ctx=ctx, config=config),
1653 lambda: ceph_mgrs(ctx=ctx, config=config),
1654 lambda: ceph_osds(ctx=ctx, config=config),
1655 lambda: ceph_mdss(ctx=ctx, config=config),
1656 lambda: cephfs_setup(ctx=ctx, config=config),
1657 lambda: ceph_rgw(ctx=ctx, config=config),
1658 lambda: ceph_iscsi(ctx=ctx, config=config),
1659 lambda: ceph_monitoring('prometheus', ctx=ctx, config=config),
1660 lambda: ceph_monitoring('node-exporter', ctx=ctx, config=config),
1661 lambda: ceph_monitoring('alertmanager', ctx=ctx, config=config),
1662 lambda: ceph_monitoring('grafana', ctx=ctx, config=config),
1663 lambda: ceph_clients(ctx=ctx, config=config),
1664 lambda: create_rbd_pool(ctx=ctx, config=config),
1665 ):
1666 try:
1667 if config.get('wait-for-healthy', True):
1668 healthy(ctx=ctx, config=config)
1669
1670 log.info('Setup complete, yielding')
1671 yield
1672
1673 finally:
1674 log.info('Teardown begin')
1675