1 """
2 Ceph cluster task, deployed via cephadm orchestrator
3 """
4 import argparse
5 import configobj
6 import contextlib
7 import logging
8 import os
9 import json
10 import re
11 import uuid
12 import yaml
13
14 from copy import deepcopy
15 from io import BytesIO, StringIO
16 from tarfile import ReadError
17 from tasks.ceph_manager import CephManager
18 from teuthology import misc as teuthology
19 from teuthology import contextutil
20 from teuthology.orchestra import run
21 from teuthology.orchestra.daemon import DaemonGroup
22 from teuthology.config import config as teuth_config
23 from textwrap import dedent
24 from tasks.cephfs.filesystem import MDSCluster, Filesystem
25
26 # these items we use from ceph.py should probably eventually move elsewhere
27 from tasks.ceph import get_mons, healthy
28 from tasks.vip import subst_vip
29
30 CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw', 'prometheus']
31
32 log = logging.getLogger(__name__)
33
34
35 def _shell(ctx, cluster_name, remote, args, extra_cephadm_args=[], **kwargs):
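"""
Run a command on the given remote inside 'cephadm shell', using the
cluster's container image, config, keyring and fsid.  Everything after
'--' is passed through to the command executed inside the container.
"""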
36 teuthology.get_testdir(ctx)
37 return remote.run(
38 args=[
39 'sudo',
40 ctx.cephadm,
41 '--image', ctx.ceph[cluster_name].image,
42 'shell',
43 '-c', '/etc/ceph/{}.conf'.format(cluster_name),
44 '-k', '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
45 '--fsid', ctx.ceph[cluster_name].fsid,
46 ] + extra_cephadm_args + [
47 '--',
48 ] + args,
49 **kwargs
50 )
51
52
53 def build_initial_config(ctx, config):
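"""
Build the seed ceph.conf used for bootstrap: load the cephadm.conf
template shipped alongside this task, set the cluster fsid, and apply
any 'conf' overrides from the task config.
"""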
54 cluster_name = config['cluster']
55
56 path = os.path.join(os.path.dirname(__file__), 'cephadm.conf')
57 conf = configobj.ConfigObj(path, file_error=True)
58
59 conf.setdefault('global', {})
60 conf['global']['fsid'] = ctx.ceph[cluster_name].fsid
61
62 # overrides
63 for section, keys in config.get('conf', {}).items():
64 for key, value in keys.items():
65 log.info(" override: [%s] %s = %s" % (section, key, value))
66 if section not in conf:
67 conf[section] = {}
68 conf[section][key] = value
69
70 return conf
71
72
73 def distribute_iscsi_gateway_cfg(ctx, conf_data):
74 """
75 Distribute the common gateway config so that the gateway IPs are known.
76 This helps iSCSI clients find the trusted_ip_list.
77 """
78 log.info('Distributing iscsi-gateway.cfg...')
79 for remote, roles in ctx.cluster.remotes.items():
80 remote.write_file(
81 path='/etc/ceph/iscsi-gateway.cfg',
82 data=conf_data,
83 sudo=True)
84
85 def update_archive_setting(ctx, key, value):
86 """
87 Record an archive setting (e.g. the log or crash directory) in the job's info.yaml
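
For example, update_archive_setting(ctx, 'log', '/var/log/ceph') results in
an info.yaml entry like ``archive: {log: /var/log/ceph}``.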
88 """
89 if ctx.archive is None:
90 return
91 with open(os.path.join(ctx.archive, 'info.yaml'), 'r+') as info_file:
92 info_yaml = yaml.safe_load(info_file)
93 info_file.seek(0)
94 if 'archive' in info_yaml:
95 info_yaml['archive'][key] = value
96 else:
97 info_yaml['archive'] = {key: value}
98 yaml.safe_dump(info_yaml, info_file, default_flow_style=False)
99
100
101 @contextlib.contextmanager
102 def normalize_hostnames(ctx):
103 """
104 Ensure we have short hostnames throughout, for consistency between
105 remote.shortname and socket.gethostname() in cephadm.
106 """
107 log.info('Normalizing hostnames...')
108 ctx.cluster.run(args=[
109 'sudo',
110 'hostname',
111 run.Raw('$(hostname -s)'),
112 ])
113
114 try:
115 yield
116 finally:
117 pass
118
119
120 @contextlib.contextmanager
121 def download_cephadm(ctx, config, ref):
122 cluster_name = config['cluster']
123
124 if config.get('cephadm_mode') != 'cephadm-package':
125 ref = config.get('cephadm_branch', ref)
126 git_url = config.get('cephadm_git_url', teuth_config.get_ceph_git_url())
127 log.info('Downloading cephadm (repo %s ref %s)...' % (git_url, ref))
128 if ctx.config.get('redhat'):
129 log.info("Install cephadm using RPM")
130 # cephadm already installed from redhat.install task
131 ctx.cluster.run(
132 args=[
133 'cp',
134 run.Raw('$(which cephadm)'),
135 ctx.cephadm,
136 run.Raw('&&'),
137 'ls', '-l',
138 ctx.cephadm,
139 ]
140 )
141 elif git_url.startswith('https://github.com/'):
142 # git archive doesn't like https:// URLs, which we use with github.
143 rest = git_url.split('https://github.com/', 1)[1]
144 rest = re.sub(r'\.git/?$', '', rest).strip() # no .git suffix
145 ctx.cluster.run(
146 args=[
147 'curl', '--silent',
148 'https://raw.githubusercontent.com/' + rest + '/' + ref + '/src/cephadm/cephadm',
149 run.Raw('>'),
150 ctx.cephadm,
151 run.Raw('&&'),
152 'ls', '-l',
153 ctx.cephadm,
154 ],
155 )
156 else:
157 ctx.cluster.run(
158 args=[
159 'git', 'archive',
160 '--remote=' + git_url,
161 ref,
162 'src/cephadm/cephadm',
163 run.Raw('|'),
164 'tar', '-xO', 'src/cephadm/cephadm',
165 run.Raw('>'),
166 ctx.cephadm,
167 ],
168 )
169 # sanity-check the resulting file and set executable bit
170 cephadm_file_size = '$(stat -c%s {})'.format(ctx.cephadm)
171 ctx.cluster.run(
172 args=[
173 'test', '-s', ctx.cephadm,
174 run.Raw('&&'),
175 'test', run.Raw(cephadm_file_size), "-gt", run.Raw('1000'),
176 run.Raw('&&'),
177 'chmod', '+x', ctx.cephadm,
178 ],
179 )
180
181 try:
182 yield
183 finally:
184 log.info('Removing cluster...')
185 ctx.cluster.run(args=[
186 'sudo',
187 ctx.cephadm,
188 'rm-cluster',
189 '--fsid', ctx.ceph[cluster_name].fsid,
190 '--force',
191 ])
192
193 if config.get('cephadm_mode') == 'root':
194 log.info('Removing cephadm ...')
195 ctx.cluster.run(
196 args=[
197 'rm',
198 '-rf',
199 ctx.cephadm,
200 ],
201 )
202
203
204 @contextlib.contextmanager
205 def ceph_log(ctx, config):
206 cluster_name = config['cluster']
207 fsid = ctx.ceph[cluster_name].fsid
208
209 update_archive_setting(ctx, 'log', '/var/log/ceph')
210
211
212 try:
213 yield
214
215 except Exception:
216 # we need to know this below
217 ctx.summary['success'] = False
218 raise
219
220 finally:
221 log.info('Checking cluster log for badness...')
222 def first_in_ceph_log(pattern, excludes):
223 """
224 Find the first occurrence of the pattern specified in the Ceph log.
225 Returns None if none found.
226
227 :param pattern: Pattern scanned for.
228 :param excludes: Patterns to ignore.
229 :return: First line of text (or None if not found)
230 """
231 args = [
232 'sudo',
233 'egrep', pattern,
234 '/var/log/ceph/{fsid}/ceph.log'.format(
235 fsid=fsid),
236 ]
237 if excludes:
238 for exclude in excludes:
239 args.extend([run.Raw('|'), 'egrep', '-v', exclude])
240 args.extend([
241 run.Raw('|'), 'head', '-n', '1',
242 ])
243 r = ctx.ceph[cluster_name].bootstrap_remote.run(
244 stdout=StringIO(),
245 args=args,
246 )
247 stdout = r.stdout.getvalue()
248 if stdout != '':
249 return stdout
250 return None
251
252 if first_in_ceph_log(r'\[ERR\]|\[WRN\]|\[SEC\]',
253 config.get('log-ignorelist')) is not None:
254 log.warning('Found errors (ERR|WRN|SEC) in cluster log')
255 ctx.summary['success'] = False
256 # use the most severe problem as the failure reason
257 if 'failure_reason' not in ctx.summary:
258 for pattern in [r'\[SEC\]', r'\[ERR\]', r'\[WRN\]']:
259 match = first_in_ceph_log(pattern, config['log-ignorelist'])
260 if match is not None:
261 ctx.summary['failure_reason'] = \
262 '"{match}" in cluster log'.format(
263 match=match.rstrip('\n'),
264 )
265 break
266
267 if ctx.archive is not None and \
268 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
269 # compress and archive the logs
270 log.info('Compressing logs...')
271 run.wait(
272 ctx.cluster.run(
273 args=[
274 'sudo',
275 'find',
276 '/var/log/ceph', # all logs, not just for the cluster
277 '/var/log/rbd-target-api', # ceph-iscsi
278 '-name',
279 '*.log',
280 '-print0',
281 run.Raw('|'),
282 'sudo',
283 'xargs',
284 '-0',
285 '--no-run-if-empty',
286 '--',
287 'gzip',
288 '--',
289 ],
290 wait=False,
291 ),
292 )
293
294 log.info('Archiving logs...')
295 path = os.path.join(ctx.archive, 'remote')
296 try:
297 os.makedirs(path)
298 except OSError:
299 pass
300 for remote in ctx.cluster.remotes.keys():
301 sub = os.path.join(path, remote.shortname)
302 try:
303 os.makedirs(sub)
304 except OSError:
305 pass
306 try:
307 teuthology.pull_directory(remote, '/var/log/ceph', # everything
308 os.path.join(sub, 'log'))
309 except ReadError:
310 pass
311
312
313 @contextlib.contextmanager
314 def ceph_crash(ctx, config):
315 """
316 Gather crash dumps from /var/lib/ceph/$fsid/crash
317 """
318 cluster_name = config['cluster']
319 fsid = ctx.ceph[cluster_name].fsid
320
321 update_archive_setting(ctx, 'crash', '/var/lib/ceph/crash')
322
323 try:
324 yield
325
326 finally:
327 if ctx.archive is not None:
328 log.info('Archiving crash dumps...')
329 path = os.path.join(ctx.archive, 'remote')
330 try:
331 os.makedirs(path)
332 except OSError:
333 pass
334 for remote in ctx.cluster.remotes.keys():
335 sub = os.path.join(path, remote.shortname)
336 try:
337 os.makedirs(sub)
338 except OSError:
339 pass
340 try:
341 teuthology.pull_directory(remote,
342 '/var/lib/ceph/%s/crash' % fsid,
343 os.path.join(sub, 'crash'))
344 except ReadError:
345 pass
346
347
348 @contextlib.contextmanager
349 def pull_image(ctx, config):
350 cluster_name = config['cluster']
351 log.info(f'Pulling image {ctx.ceph[cluster_name].image} on all hosts...')
352 run.wait(
353 ctx.cluster.run(
354 args=[
355 'sudo',
356 ctx.cephadm,
357 '--image', ctx.ceph[cluster_name].image,
358 'pull',
359 ],
360 wait=False,
361 )
362 )
363
364 try:
365 yield
366 finally:
367 pass
368
369
370 @contextlib.contextmanager
371 def ceph_bootstrap(ctx, config):
372 """
373 Bootstrap ceph cluster.
374
375 :param ctx: the argparse.Namespace object
376 :param config: the config dict
377 """
378 cluster_name = config['cluster']
379 testdir = teuthology.get_testdir(ctx)
380 fsid = ctx.ceph[cluster_name].fsid
381
382 bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
383 first_mon = ctx.ceph[cluster_name].first_mon
384 first_mon_role = ctx.ceph[cluster_name].first_mon_role
385 mons = ctx.ceph[cluster_name].mons
386
387 ctx.cluster.run(args=[
388 'sudo', 'mkdir', '-p', '/etc/ceph',
389 ])
390 ctx.cluster.run(args=[
391 'sudo', 'chmod', '777', '/etc/ceph',
392 ])
393 try:
394 # write seed config
395 log.info('Writing seed config...')
396 conf_fp = BytesIO()
397 seed_config = build_initial_config(ctx, config)
398 seed_config.write(conf_fp)
399 bootstrap_remote.write_file(
400 path='{}/seed.{}.conf'.format(testdir, cluster_name),
401 data=conf_fp.getvalue())
402 log.debug('Final config:\n' + conf_fp.getvalue().decode())
403 ctx.ceph[cluster_name].conf = seed_config
404
405 # register initial daemons
406 ctx.daemons.register_daemon(
407 bootstrap_remote, 'mon', first_mon,
408 cluster=cluster_name,
409 fsid=fsid,
410 logger=log.getChild('mon.' + first_mon),
411 wait=False,
412 started=True,
413 )
414 if not ctx.ceph[cluster_name].roleless:
415 first_mgr = ctx.ceph[cluster_name].first_mgr
416 ctx.daemons.register_daemon(
417 bootstrap_remote, 'mgr', first_mgr,
418 cluster=cluster_name,
419 fsid=fsid,
420 logger=log.getChild('mgr.' + first_mgr),
421 wait=False,
422 started=True,
423 )
424
425 # bootstrap
426 log.info('Bootstrapping...')
427 cmd = [
428 'sudo',
429 ctx.cephadm,
430 '--image', ctx.ceph[cluster_name].image,
431 '-v',
432 'bootstrap',
433 '--fsid', fsid,
434 '--config', '{}/seed.{}.conf'.format(testdir, cluster_name),
435 '--output-config', '/etc/ceph/{}.conf'.format(cluster_name),
436 '--output-keyring',
437 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
438 '--output-pub-ssh-key', '{}/{}.pub'.format(testdir, cluster_name),
439 ]
440
441 if config.get('registry-login'):
442 registry = config['registry-login']
443 cmd += [
444 "--registry-url", registry['url'],
445 "--registry-username", registry['username'],
446 "--registry-password", registry['password'],
447 ]
448
449 if not ctx.ceph[cluster_name].roleless:
450 cmd += [
451 '--mon-id', first_mon,
452 '--mgr-id', first_mgr,
453 '--orphan-initial-daemons', # we will do it explicitly!
454 '--skip-monitoring-stack', # we'll provision these explicitly
455 ]
456
457 if mons[first_mon_role].startswith('['):
458 cmd += ['--mon-addrv', mons[first_mon_role]]
459 else:
460 cmd += ['--mon-ip', mons[first_mon_role]]
461 if config.get('skip_dashboard'):
462 cmd += ['--skip-dashboard']
463 if config.get('skip_monitoring_stack'):
464 cmd += ['--skip-monitoring-stack']
465 if config.get('single_host_defaults'):
466 cmd += ['--single-host-defaults']
467 if not config.get('avoid_pacific_features', False):
468 cmd += ['--skip-admin-label']
469 # bootstrap makes the keyring root 0600, so +r it for our purposes
470 cmd += [
471 run.Raw('&&'),
472 'sudo', 'chmod', '+r',
473 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
474 ]
475 bootstrap_remote.run(args=cmd)
476
477 # fetch keys and configs
478 log.info('Fetching config...')
479 ctx.ceph[cluster_name].config_file = \
480 bootstrap_remote.read_file(f'/etc/ceph/{cluster_name}.conf')
481 log.info('Fetching client.admin keyring...')
482 ctx.ceph[cluster_name].admin_keyring = \
483 bootstrap_remote.read_file(f'/etc/ceph/{cluster_name}.client.admin.keyring')
484 log.info('Fetching mon keyring...')
485 ctx.ceph[cluster_name].mon_keyring = \
486 bootstrap_remote.read_file(f'/var/lib/ceph/{fsid}/mon.{first_mon}/keyring', sudo=True)
487
488 # fetch ssh key, distribute to additional nodes
489 log.info('Fetching pub ssh key...')
490 ssh_pub_key = bootstrap_remote.read_file(
491 f'{testdir}/{cluster_name}.pub').decode('ascii').strip()
492
493 log.info('Installing pub ssh key for root users...')
494 ctx.cluster.run(args=[
495 'sudo', 'install', '-d', '-m', '0700', '/root/.ssh',
496 run.Raw('&&'),
497 'echo', ssh_pub_key,
498 run.Raw('|'),
499 'sudo', 'tee', '-a', '/root/.ssh/authorized_keys',
500 run.Raw('&&'),
501 'sudo', 'chmod', '0600', '/root/.ssh/authorized_keys',
502 ])
503
504 # set options
505 if config.get('allow_ptrace', True):
506 _shell(ctx, cluster_name, bootstrap_remote,
507 ['ceph', 'config', 'set', 'mgr', 'mgr/cephadm/allow_ptrace', 'true'])
508
509 if not config.get('avoid_pacific_features', False):
510 log.info('Distributing conf and client.admin keyring to all hosts + 0755')
511 _shell(ctx, cluster_name, bootstrap_remote,
512 ['ceph', 'orch', 'client-keyring', 'set', 'client.admin',
513 '*', '--mode', '0755'],
514 check_status=False)
515
516 # add other hosts
517 for remote in ctx.cluster.remotes.keys():
518 if remote == bootstrap_remote:
519 continue
520
521 # note: this may be redundant (see above), but it avoids
522 # us having to wait for cephadm to do it.
523 log.info('Writing (initial) conf and keyring to %s' % remote.shortname)
524 remote.write_file(
525 path='/etc/ceph/{}.conf'.format(cluster_name),
526 data=ctx.ceph[cluster_name].config_file)
527 remote.write_file(
528 path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
529 data=ctx.ceph[cluster_name].admin_keyring)
530
531 log.info('Adding host %s to orchestrator...' % remote.shortname)
532 _shell(ctx, cluster_name, bootstrap_remote, [
533 'ceph', 'orch', 'host', 'add',
534 remote.shortname
535 ])
536 r = _shell(ctx, cluster_name, bootstrap_remote,
537 ['ceph', 'orch', 'host', 'ls', '--format=json'],
538 stdout=StringIO())
539 hosts = [node['hostname'] for node in json.loads(r.stdout.getvalue())]
540 assert remote.shortname in hosts
541
542 yield
543
544 finally:
545 log.info('Cleaning up testdir ceph.* files...')
546 ctx.cluster.run(args=[
547 'rm', '-f',
548 '{}/seed.{}.conf'.format(testdir, cluster_name),
549 '{}/{}.pub'.format(testdir, cluster_name),
550 ])
551
552 log.info('Stopping all daemons...')
553
554 # this doesn't block until they are all stopped...
555 #ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
556
557 # stop the daemons we know
558 for role in ctx.daemons.resolve_role_list(None, CEPH_ROLE_TYPES, True):
559 cluster, type_, id_ = teuthology.split_role(role)
560 try:
561 ctx.daemons.get_daemon(type_, id_, cluster).stop()
562 except Exception:
563 log.exception(f'Failed to stop "{role}"')
564 raise
565
566 # tear down anything left (but leave the logs behind)
567 ctx.cluster.run(
568 args=[
569 'sudo',
570 ctx.cephadm,
571 'rm-cluster',
572 '--fsid', fsid,
573 '--force',
574 '--keep-logs',
575 ],
576 check_status=False, # may fail if upgrading from old cephadm
577 )
578
579 # clean up /etc/ceph
580 ctx.cluster.run(args=[
581 'sudo', 'rm', '-f',
582 '/etc/ceph/{}.conf'.format(cluster_name),
583 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
584 ])
585
586
587 @contextlib.contextmanager
588 def ceph_mons(ctx, config):
589 """
590 Deploy any additional mons
591 """
592 cluster_name = config['cluster']
593 fsid = ctx.ceph[cluster_name].fsid
594
595 try:
596 daemons = {}
597 if config.get('add_mons_via_daemon_add'):
598 # This is the old way of adding mons that works with the (early) octopus
599 # cephadm scheduler.
600 num_mons = 1
601 for remote, roles in ctx.cluster.remotes.items():
602 for mon in [r for r in roles
603 if teuthology.is_type('mon', cluster_name)(r)]:
604 c_, _, id_ = teuthology.split_role(mon)
605 if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
606 continue
607 log.info('Adding %s on %s' % (mon, remote.shortname))
608 num_mons += 1
609 _shell(ctx, cluster_name, remote, [
610 'ceph', 'orch', 'daemon', 'add', 'mon',
611 remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_,
612 ])
613 ctx.daemons.register_daemon(
614 remote, 'mon', id_,
615 cluster=cluster_name,
616 fsid=fsid,
617 logger=log.getChild(mon),
618 wait=False,
619 started=True,
620 )
621 daemons[mon] = (remote, id_)
622
623 with contextutil.safe_while(sleep=1, tries=180) as proceed:
624 while proceed():
625 log.info('Waiting for %d mons in monmap...' % (num_mons))
626 r = _shell(
627 ctx=ctx,
628 cluster_name=cluster_name,
629 remote=remote,
630 args=[
631 'ceph', 'mon', 'dump', '-f', 'json',
632 ],
633 stdout=StringIO(),
634 )
635 j = json.loads(r.stdout.getvalue())
636 if len(j['mons']) == num_mons:
637 break
638 else:
639 nodes = []
640 for remote, roles in ctx.cluster.remotes.items():
641 for mon in [r for r in roles
642 if teuthology.is_type('mon', cluster_name)(r)]:
643 c_, _, id_ = teuthology.split_role(mon)
644 log.info('Adding %s on %s' % (mon, remote.shortname))
645 nodes.append(remote.shortname
646 + ':' + ctx.ceph[cluster_name].mons[mon]
647 + '=' + id_)
648 if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
649 continue
650 daemons[mon] = (remote, id_)
651
652 _shell(ctx, cluster_name, remote, [
653 'ceph', 'orch', 'apply', 'mon',
654 str(len(nodes)) + ';' + ';'.join(nodes)]
655 )
656 for mon, i in daemons.items():
657 remote, id_ = i
658 ctx.daemons.register_daemon(
659 remote, 'mon', id_,
660 cluster=cluster_name,
661 fsid=fsid,
662 logger=log.getChild(mon),
663 wait=False,
664 started=True,
665 )
666
667 with contextutil.safe_while(sleep=1, tries=180) as proceed:
668 while proceed():
669 log.info('Waiting for %d mons in monmap...' % (len(nodes)))
670 r = _shell(
671 ctx=ctx,
672 cluster_name=cluster_name,
673 remote=remote,
674 args=[
675 'ceph', 'mon', 'dump', '-f', 'json',
676 ],
677 stdout=StringIO(),
678 )
679 j = json.loads(r.stdout.getvalue())
680 if len(j['mons']) == len(nodes):
681 break
682
683 # refresh our (final) ceph.conf file
684 bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
685 log.info('Generating final ceph.conf file...')
686 r = _shell(
687 ctx=ctx,
688 cluster_name=cluster_name,
689 remote=bootstrap_remote,
690 args=[
691 'ceph', 'config', 'generate-minimal-conf',
692 ],
693 stdout=StringIO(),
694 )
695 ctx.ceph[cluster_name].config_file = r.stdout.getvalue()
696
697 yield
698
699 finally:
700 pass
701
702
703 @contextlib.contextmanager
704 def ceph_mgrs(ctx, config):
705 """
706 Deploy any additional mgrs
707 """
708 cluster_name = config['cluster']
709 fsid = ctx.ceph[cluster_name].fsid
710
711 try:
712 nodes = []
713 daemons = {}
714 for remote, roles in ctx.cluster.remotes.items():
715 for mgr in [r for r in roles
716 if teuthology.is_type('mgr', cluster_name)(r)]:
717 c_, _, id_ = teuthology.split_role(mgr)
718 log.info('Adding %s on %s' % (mgr, remote.shortname))
719 nodes.append(remote.shortname + '=' + id_)
720 if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mgr:
721 continue
722 daemons[mgr] = (remote, id_)
723 if nodes:
724 _shell(ctx, cluster_name, remote, [
725 'ceph', 'orch', 'apply', 'mgr',
726 str(len(nodes)) + ';' + ';'.join(nodes)]
727 )
728 for mgr, i in daemons.items():
729 remote, id_ = i
730 ctx.daemons.register_daemon(
731 remote, 'mgr', id_,
732 cluster=cluster_name,
733 fsid=fsid,
734 logger=log.getChild(mgr),
735 wait=False,
736 started=True,
737 )
738
739 yield
740
741 finally:
742 pass
743
744
745 @contextlib.contextmanager
746 def ceph_osds(ctx, config):
747 """
748 Deploy OSDs
749 """
750 cluster_name = config['cluster']
751 fsid = ctx.ceph[cluster_name].fsid
752
753 try:
754 log.info('Deploying OSDs...')
755
756 # provision OSDs in numeric order
757 id_to_remote = {}
758 devs_by_remote = {}
759 for remote, roles in ctx.cluster.remotes.items():
760 devs_by_remote[remote] = teuthology.get_scratch_devices(remote)
761 for osd in [r for r in roles
762 if teuthology.is_type('osd', cluster_name)(r)]:
763 _, _, id_ = teuthology.split_role(osd)
764 id_to_remote[int(id_)] = (osd, remote)
765
766 cur = 0
767 for osd_id in sorted(id_to_remote.keys()):
768 osd, remote = id_to_remote[osd_id]
769 _, _, id_ = teuthology.split_role(osd)
770 assert int(id_) == cur
771 devs = devs_by_remote[remote]
772 assert devs ## FIXME ##
773 dev = devs.pop()
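# LVM scratch devices (paths containing both 'lv' and 'vg') are passed
# to the orchestrator without the /dev/ prefix; other devices are used as-is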
774 if all(_ in dev for _ in ('lv', 'vg')):
775 short_dev = dev.replace('/dev/', '')
776 else:
777 short_dev = dev
778 log.info('Deploying %s on %s with %s...' % (
779 osd, remote.shortname, dev))
780 _shell(ctx, cluster_name, remote, [
781 'ceph-volume', 'lvm', 'zap', dev])
782 _shell(ctx, cluster_name, remote, [
783 'ceph', 'orch', 'daemon', 'add', 'osd',
784 remote.shortname + ':' + short_dev
785 ])
786 ctx.daemons.register_daemon(
787 remote, 'osd', id_,
788 cluster=cluster_name,
789 fsid=fsid,
790 logger=log.getChild(osd),
791 wait=False,
792 started=True,
793 )
794 cur += 1
795
796 if cur == 0:
797 _shell(ctx, cluster_name, remote, [
798 'ceph', 'orch', 'apply', 'osd', '--all-available-devices',
799 ])
800 # expect the number of scratch devs
801 num_osds = sum(map(len, devs_by_remote.values()))
802 assert num_osds
803 else:
804 # expect the number of OSDs we created
805 num_osds = cur
806
807 log.info(f'Waiting for {num_osds} OSDs to come up...')
808 with contextutil.safe_while(sleep=1, tries=120) as proceed:
809 while proceed():
810 p = _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
811 ['ceph', 'osd', 'stat', '-f', 'json'], stdout=StringIO())
812 j = json.loads(p.stdout.getvalue())
813 if int(j.get('num_up_osds', 0)) == num_osds:
814 break
815
816 if not hasattr(ctx, 'managers'):
817 ctx.managers = {}
818 ctx.managers[cluster_name] = CephManager(
819 ctx.ceph[cluster_name].bootstrap_remote,
820 ctx=ctx,
821 logger=log.getChild('ceph_manager.' + cluster_name),
822 cluster=cluster_name,
823 cephadm=True,
824 )
825
826 yield
827 finally:
828 pass
829
830
831 @contextlib.contextmanager
832 def ceph_mdss(ctx, config):
833 """
834 Deploy MDSs
835 """
836 cluster_name = config['cluster']
837 fsid = ctx.ceph[cluster_name].fsid
838
839 nodes = []
840 daemons = {}
841 for remote, roles in ctx.cluster.remotes.items():
842 for role in [r for r in roles
843 if teuthology.is_type('mds', cluster_name)(r)]:
844 c_, _, id_ = teuthology.split_role(role)
845 log.info('Adding %s on %s' % (role, remote.shortname))
846 nodes.append(remote.shortname + '=' + id_)
847 daemons[role] = (remote, id_)
848 if nodes:
849 _shell(ctx, cluster_name, remote, [
850 'ceph', 'orch', 'apply', 'mds',
851 'all',
852 str(len(nodes)) + ';' + ';'.join(nodes)]
853 )
854 for role, i in daemons.items():
855 remote, id_ = i
856 ctx.daemons.register_daemon(
857 remote, 'mds', id_,
858 cluster=cluster_name,
859 fsid=fsid,
860 logger=log.getChild(role),
861 wait=False,
862 started=True,
863 )
864
865 yield
866
867 @contextlib.contextmanager
868 def cephfs_setup(ctx, config):
869 mdss = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))
870
871 # If there are any MDSs, then create a filesystem for them to use
872 # Do this last because it requires the mon cluster to be up and running
873 if len(mdss) > 0:
874 log.info('Setting up CephFS filesystem(s)...')
875 cephfs_config = config.get('cephfs', {})
876 fs_configs = cephfs_config.pop('fs', [{'name': 'cephfs'}])
877 set_allow_multifs = len(fs_configs) > 1
878
879 # wait for standbys to become available (slow due to valgrind, perhaps)
880 mdsc = MDSCluster(ctx)
881 with contextutil.safe_while(sleep=2, tries=150) as proceed:
882 while proceed():
883 if len(mdsc.get_standby_daemons()) >= len(mdss):
884 break
885
886 fss = []
887 for fs_config in fs_configs:
888 assert isinstance(fs_config, dict)
889 name = fs_config.pop('name')
890 temp = deepcopy(cephfs_config)
891 teuthology.deep_merge(temp, fs_config)
892 fs = Filesystem(ctx, fs_config=temp, name=name, create=True)
893 if set_allow_multifs:
894 fs.set_allow_multifs()
895 set_allow_multifs = False
896 fss.append(fs)
897
898 yield
899
900 for fs in fss:
901 fs.destroy()
902 else:
903 yield
904
905 @contextlib.contextmanager
906 def ceph_monitoring(daemon_type, ctx, config):
907 """
908 Deploy prometheus, node-exporter, etc.
909 """
910 cluster_name = config['cluster']
911 fsid = ctx.ceph[cluster_name].fsid
912
913 nodes = []
914 daemons = {}
915 for remote, roles in ctx.cluster.remotes.items():
916 for role in [r for r in roles
917 if teuthology.is_type(daemon_type, cluster_name)(r)]:
918 c_, _, id_ = teuthology.split_role(role)
919 log.info('Adding %s on %s' % (role, remote.shortname))
920 nodes.append(remote.shortname + '=' + id_)
921 daemons[role] = (remote, id_)
922 if nodes:
923 _shell(ctx, cluster_name, remote, [
924 'ceph', 'orch', 'apply', daemon_type,
925 str(len(nodes)) + ';' + ';'.join(nodes)]
926 )
927 for role, i in daemons.items():
928 remote, id_ = i
929 ctx.daemons.register_daemon(
930 remote, daemon_type, id_,
931 cluster=cluster_name,
932 fsid=fsid,
933 logger=log.getChild(role),
934 wait=False,
935 started=True,
936 )
937
938 yield
939
940
941 @contextlib.contextmanager
942 def ceph_rgw(ctx, config):
943 """
944 Deploy rgw
945 """
946 cluster_name = config['cluster']
947 fsid = ctx.ceph[cluster_name].fsid
948
949 nodes = {}
950 daemons = {}
951 for remote, roles in ctx.cluster.remotes.items():
952 for role in [r for r in roles
953 if teuthology.is_type('rgw', cluster_name)(r)]:
954 c_, _, id_ = teuthology.split_role(role)
955 log.info('Adding %s on %s' % (role, remote.shortname))
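# group the rgw daemons into services, keyed by the first two
# dot-separated components of the role id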
956 svc = '.'.join(id_.split('.')[0:2])
957 if svc not in nodes:
958 nodes[svc] = []
959 nodes[svc].append(remote.shortname + '=' + id_)
960 daemons[role] = (remote, id_)
961
962 for svc, nodes in nodes.items():
963 _shell(ctx, cluster_name, remote, [
964 'ceph', 'orch', 'apply', 'rgw', svc,
965 '--placement',
966 str(len(nodes)) + ';' + ';'.join(nodes)]
967 )
968 for role, i in daemons.items():
969 remote, id_ = i
970 ctx.daemons.register_daemon(
971 remote, 'rgw', id_,
972 cluster=cluster_name,
973 fsid=fsid,
974 logger=log.getChild(role),
975 wait=False,
976 started=True,
977 )
978
979 yield
980
981
982 @contextlib.contextmanager
983 def ceph_iscsi(ctx, config):
984 """
985 Deploy iSCSI gateways
986 """
987 cluster_name = config['cluster']
988 fsid = ctx.ceph[cluster_name].fsid
989
990 nodes = []
991 daemons = {}
992 ips = []
993
994 for remote, roles in ctx.cluster.remotes.items():
995 for role in [r for r in roles
996 if teuthology.is_type('iscsi', cluster_name)(r)]:
997 c_, _, id_ = teuthology.split_role(role)
998 log.info('Adding %s on %s' % (role, remote.shortname))
999 nodes.append(remote.shortname + '=' + id_)
1000 daemons[role] = (remote, id_)
1001 ips.append(remote.ip_address)
1002 trusted_ip_list = ','.join(ips)
1003 if nodes:
1004 poolname = 'datapool'
1005 # ceph osd pool create datapool 3 3 replicated
1006 _shell(ctx, cluster_name, remote, [
1007 'ceph', 'osd', 'pool', 'create',
1008 poolname, '3', '3', 'replicated']
1009 )
1010
1011 _shell(ctx, cluster_name, remote, [
1012 'rbd', 'pool', 'init', poolname]
1013 )
1014
1015 # ceph orch apply iscsi datapool (admin)user (admin)password
1016 _shell(ctx, cluster_name, remote, [
1017 'ceph', 'orch', 'apply', 'iscsi',
1018 poolname, 'admin', 'admin',
1019 '--trusted_ip_list', trusted_ip_list,
1020 '--placement', str(len(nodes)) + ';' + ';'.join(nodes)]
1021 )
1022
1023 # used by iSCSI clients to identify valid gateway IPs
1024 conf_data = dedent(f"""
1025 [config]
1026 trusted_ip_list = {trusted_ip_list}
1027 """)
1028 distribute_iscsi_gateway_cfg(ctx, conf_data)
1029
1030 for role, i in daemons.items():
1031 remote, id_ = i
1032 ctx.daemons.register_daemon(
1033 remote, 'iscsi', id_,
1034 cluster=cluster_name,
1035 fsid=fsid,
1036 logger=log.getChild(role),
1037 wait=False,
1038 started=True,
1039 )
1040
1041 yield
1042
1043
1044 @contextlib.contextmanager
1045 def ceph_clients(ctx, config):
1046 cluster_name = config['cluster']
1047
1048 log.info('Setting up client nodes...')
1049 clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
1050 for remote, roles_for_host in clients.remotes.items():
1051 for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
1052 cluster_name):
1053 name = teuthology.ceph_role(role)
1054 client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name,
1055 name)
1056 r = _shell(
1057 ctx=ctx,
1058 cluster_name=cluster_name,
1059 remote=remote,
1060 args=[
1061 'ceph', 'auth',
1062 'get-or-create', name,
1063 'mon', 'allow *',
1064 'osd', 'allow *',
1065 'mds', 'allow *',
1066 'mgr', 'allow *',
1067 ],
1068 stdout=StringIO(),
1069 )
1070 keyring = r.stdout.getvalue()
1071 remote.sudo_write_file(client_keyring, keyring, mode='0644')
1072 yield
1073
1074
1075 @contextlib.contextmanager
1076 def ceph_initial():
1077 try:
1078 yield
1079 finally:
1080 log.info('Teardown complete')
1081
1082
1083 ## public methods
1084 @contextlib.contextmanager
1085 def stop(ctx, config):
1086 """
1087 Stop ceph daemons
1088
1089 For example::
1090 tasks:
1091 - ceph.stop: [mds.*]
1092
1093 tasks:
1094 - ceph.stop: [osd.0, osd.2]
1095
1096 tasks:
1097 - ceph.stop:
1098 daemons: [osd.0, osd.2]
1099
1100 """
1101 if config is None:
1102 config = {}
1103 elif isinstance(config, list):
1104 config = {'daemons': config}
1105
1106 daemons = ctx.daemons.resolve_role_list(
1107 config.get('daemons', None), CEPH_ROLE_TYPES, True)
1108 clusters = set()
1109
1110 for role in daemons:
1111 cluster, type_, id_ = teuthology.split_role(role)
1112 ctx.daemons.get_daemon(type_, id_, cluster).stop()
1113 clusters.add(cluster)
1114
1115 # for cluster in clusters:
1116 # ctx.ceph[cluster].watchdog.stop()
1117 # ctx.ceph[cluster].watchdog.join()
1118
1119 yield
1120
1121
1122 def shell(ctx, config):
1123 """
1124 Execute (shell) commands
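
Each config entry maps a role (or 'all-roles' / 'all-hosts') to a command
string or list of commands to run via 'cephadm shell' on that host; entries
under 'env' and 'volumes' are passed to cephadm as '-e'/'-v' arguments.
For example (commands shown are only illustrative)::

    tasks:
    - cephadm.shell:
        host.a:
          - ceph orch status
          - ceph orch ps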
1125 """
1126 cluster_name = config.get('cluster', 'ceph')
1127
1128 args = []
1129 for k in config.pop('env', []):
1130 args.extend(['-e', k + '=' + ctx.config.get(k, '')])
1131 for k in config.pop('volumes', []):
1132 args.extend(['-v', k])
1133
1134 if 'all-roles' in config and len(config) == 1:
1135 a = config['all-roles']
1136 roles = teuthology.all_roles(ctx.cluster)
1137 config = dict((id_, a) for id_ in roles if not id_.startswith('host.'))
1138 elif 'all-hosts' in config and len(config) == 1:
1139 a = config['all-hosts']
1140 roles = teuthology.all_roles(ctx.cluster)
1141 config = dict((id_, a) for id_ in roles if id_.startswith('host.'))
1142
1143 for role, cmd in config.items():
1144 (remote,) = ctx.cluster.only(role).remotes.keys()
1145 log.info('Running commands on role %s host %s', role, remote.name)
1146 if isinstance(cmd, list):
1147 for c in cmd:
1148 _shell(ctx, cluster_name, remote,
1149 ['bash', '-c', subst_vip(ctx, c)],
1150 extra_cephadm_args=args)
1151 else:
1152 assert isinstance(cmd, str)
1153 _shell(ctx, cluster_name, remote,
1154 ['bash', '-ex', '-c', subst_vip(ctx, cmd)],
1155 extra_cephadm_args=args)
1156
1157
1158 def apply(ctx, config):
1159 """
1160 Apply spec
1161
1162 tasks:
1163 - cephadm.apply:
1164 specs:
1165 - service_type: rgw
1166 service_id: foo
1167 spec:
1168 rgw_frontend_port: 8000
1169 - service_type: rgw
1170 service_id: bar
1171 spec:
1172 rgw_frontend_port: 9000
1173 zone: bar
1174 realm: asdf
1175
1176 """
1177 cluster_name = config.get('cluster', 'ceph')
1178
1179 specs = config.get('specs', [])
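# render the specs as a multi-document YAML stream and substitute any
# virtual IP placeholders (see tasks/vip.py) before piping it to
# 'ceph orch apply -i -'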
1180 y = subst_vip(ctx, yaml.dump_all(specs))
1181
1182 log.info(f'Applying spec(s):\n{y}')
1183 _shell(
1184 ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
1185 ['ceph', 'orch', 'apply', '-i', '-'],
1186 stdin=y,
1187 )
1188
1189
1190 def wait_for_service(ctx, config):
1191 """
1192 Wait for a service to be fully started
1193
1194 tasks:
1195 - cephadm.wait_for_service:
1196 service: rgw.foo
1197 timeout: 60 # defaults to 300
1198
1199 """
1200 cluster_name = config.get('cluster', 'ceph')
1201 timeout = config.get('timeout', 300)
1202 service = config.get('service')
1203 assert service
1204
1205 log.info(
1206 f'Waiting for {cluster_name} service {service} to start (timeout {timeout})...'
1207 )
1208 with contextutil.safe_while(sleep=1, tries=timeout) as proceed:
1209 while proceed():
1210 r = _shell(
1211 ctx=ctx,
1212 cluster_name=cluster_name,
1213 remote=ctx.ceph[cluster_name].bootstrap_remote,
1214 args=[
1215 'ceph', 'orch', 'ls', '-f', 'json',
1216 ],
1217 stdout=StringIO(),
1218 )
1219 j = json.loads(r.stdout.getvalue())
1220 svc = None
1221 for s in j:
1222 if s['service_name'] == service:
1223 svc = s
1224 break
1225 if svc:
1226 log.info(
1227 f"{service} has {s['status']['running']}/{s['status']['size']}"
1228 )
1229 if s['status']['running'] == s['status']['size']:
1230 break
1231
1232
1233 @contextlib.contextmanager
1234 def tweaked_option(ctx, config):
1235 """
1236 Set an option, and then restore it to its original value.
1237
1238 Note: due to the way tasks are executed/nested, it is not suggested to
1239 use this method as a standalone task; otherwise, it is likely that it will
1240 restore the tweaked option at the /end/ of the 'tasks' block.
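
For example (illustrative), a restart config of
{'daemons': ['mon.a'], 'mon-health-to-clog': False} would set
mon_health_to_clog to false for the duration of the restart and then
restore the previous value (if it differed).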
1241 """
1242 saved_options = {}
1243 # we can complicate this when necessary
1244 options = ['mon-health-to-clog']
1245 type_, id_ = 'mon', '*'
1246 cluster = config.get('cluster', 'ceph')
1247 manager = ctx.managers[cluster]
1248 if id_ == '*':
1249 get_from = next(teuthology.all_roles_of_type(ctx.cluster, type_))
1250 else:
1251 get_from = id_
1252 for option in options:
1253 if option not in config:
1254 continue
1255 value = 'true' if config[option] else 'false'
1256 option = option.replace('-', '_')
1257 old_value = manager.get_config(type_, get_from, option)
1258 if value != old_value:
1259 saved_options[option] = old_value
1260 manager.inject_args(type_, id_, option, value)
1261 yield
1262 for option, value in saved_options.items():
1263 manager.inject_args(type_, id_, option, value)
1264
1265
1266 @contextlib.contextmanager
1267 def restart(ctx, config):
1268 """
1269 restart ceph daemons
1270
1271 For example::
1272 tasks:
1273 - ceph.restart: [all]
1274
1275 For example::
1276 tasks:
1277 - ceph.restart: [osd.0, mon.1, mds.*]
1278
1279 or::
1280
1281 tasks:
1282 - ceph.restart:
1283 daemons: [osd.0, mon.1]
1284 wait-for-healthy: false
1285 wait-for-osds-up: true
1286
1287 :param ctx: Context
1288 :param config: Configuration
1289 """
1290 if config is None:
1291 config = {}
1292 elif isinstance(config, list):
1293 config = {'daemons': config}
1294
1295 daemons = ctx.daemons.resolve_role_list(
1296 config.get('daemons', None), CEPH_ROLE_TYPES, True)
1297 clusters = set()
1298
1299 log.info('daemons %s' % daemons)
1300 with tweaked_option(ctx, config):
1301 for role in daemons:
1302 cluster, type_, id_ = teuthology.split_role(role)
1303 d = ctx.daemons.get_daemon(type_, id_, cluster)
1304 assert d, 'daemon %s does not exist' % role
1305 d.stop()
1306 if type_ == 'osd':
1307 ctx.managers[cluster].mark_down_osd(id_)
1308 d.restart()
1309 clusters.add(cluster)
1310
1311 if config.get('wait-for-healthy', True):
1312 for cluster in clusters:
1313 healthy(ctx=ctx, config=dict(cluster=cluster))
1314 if config.get('wait-for-osds-up', False):
1315 for cluster in clusters:
1316 ctx.managers[cluster].wait_for_all_osds_up()
1317 yield
1318
1319
1320 @contextlib.contextmanager
1321 def distribute_config_and_admin_keyring(ctx, config):
1322 """
1323 Distribute a sufficient config and keyring for clients
1324 """
1325 cluster_name = config['cluster']
1326 log.info('Distributing (final) config and client.admin keyring...')
1327 for remote, roles in ctx.cluster.remotes.items():
1328 remote.write_file(
1329 '/etc/ceph/{}.conf'.format(cluster_name),
1330 ctx.ceph[cluster_name].config_file,
1331 sudo=True)
1332 remote.write_file(
1333 path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
1334 data=ctx.ceph[cluster_name].admin_keyring,
1335 sudo=True)
1336 try:
1337 yield
1338 finally:
1339 ctx.cluster.run(args=[
1340 'sudo', 'rm', '-f',
1341 '/etc/ceph/{}.conf'.format(cluster_name),
1342 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
1343 ])
1344
1345
1346 @contextlib.contextmanager
1347 def crush_setup(ctx, config):
1348 cluster_name = config['cluster']
1349
1350 profile = config.get('crush_tunables', 'default')
1351 log.info('Setting crush tunables to %s', profile)
1352 _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
1353 args=['ceph', 'osd', 'crush', 'tunables', profile])
1354 yield
1355
1356
1357 @contextlib.contextmanager
1358 def create_rbd_pool(ctx, config):
1359 if config.get('create_rbd_pool', False):
1360 cluster_name = config['cluster']
1361 log.info('Waiting for OSDs to come up')
1362 teuthology.wait_until_osds_up(
1363 ctx,
1364 cluster=ctx.cluster,
1365 remote=ctx.ceph[cluster_name].bootstrap_remote,
1366 ceph_cluster=cluster_name,
1367 )
1368 log.info('Creating RBD pool')
1369 _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
1370 args=['sudo', 'ceph', '--cluster', cluster_name,
1371 'osd', 'pool', 'create', 'rbd', '8'])
1372 _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
1373 args=['sudo', 'ceph', '--cluster', cluster_name,
1374 'osd', 'pool', 'application', 'enable',
1375 'rbd', 'rbd', '--yes-i-really-mean-it'
1376 ])
1377 yield
1378
1379
1380 @contextlib.contextmanager
1381 def _bypass():
1382 yield
1383
1384
1385 @contextlib.contextmanager
1386 def initialize_config(ctx, config):
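"""
Populate ctx.ceph[cluster_name] with the basics needed by the other
sub-tasks: cephadm mode and path, a freshly generated fsid, monitor
IPs/ports, and the bootstrap remote plus the first mon/mgr identities.
"""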
1387 cluster_name = config['cluster']
1388 testdir = teuthology.get_testdir(ctx)
1389
1390 ctx.ceph[cluster_name].thrashers = []
1391 # fixme: setup watchdog, ala ceph.py
1392
1393 ctx.ceph[cluster_name].roleless = False # see below
1394
1395 first_ceph_cluster = False
1396 if not hasattr(ctx, 'daemons'):
1397 first_ceph_cluster = True
1398
1399 # cephadm mode?
1400 if 'cephadm_mode' not in config:
1401 config['cephadm_mode'] = 'root'
1402 assert config['cephadm_mode'] in ['root', 'cephadm-package']
1403 if config['cephadm_mode'] == 'root':
1404 ctx.cephadm = testdir + '/cephadm'
1405 else:
1406 ctx.cephadm = 'cephadm' # in the path
1407
1408 if first_ceph_cluster:
1409 # FIXME: this is global for all clusters
1410 ctx.daemons = DaemonGroup(
1411 use_cephadm=ctx.cephadm)
1412
1413 # uuid
1414 fsid = str(uuid.uuid1())
1415 log.info('Cluster fsid is %s' % fsid)
1416 ctx.ceph[cluster_name].fsid = fsid
1417
1418 # mon ips
1419 log.info('Choosing monitor IPs and ports...')
1420 remotes_and_roles = ctx.cluster.remotes.items()
1421 ips = [host for (host, port) in
1422 (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
1423
1424 if config.get('roleless', False):
1425 # mons will be named after hosts
1426 first_mon = None
1427 max_mons = config.get('max_mons', 5)
1428 for remote, _ in remotes_and_roles:
1429 ctx.cluster.remotes[remote].append('mon.' + remote.shortname)
1430 if not first_mon:
1431 first_mon = remote.shortname
1432 bootstrap_remote = remote
1433 max_mons -= 1
1434 if not max_mons:
1435 break
1436 log.info('No mon roles; fabricating mons')
1437
1438 roles = [role_list for (remote, role_list) in ctx.cluster.remotes.items()]
1439
1440 ctx.ceph[cluster_name].mons = get_mons(
1441 roles, ips, cluster_name,
1442 mon_bind_msgr2=config.get('mon_bind_msgr2', True),
1443 mon_bind_addrvec=config.get('mon_bind_addrvec', True),
1444 )
1445 log.info('Monitor IPs: %s' % ctx.ceph[cluster_name].mons)
1446
1447 if config.get('roleless', False):
1448 ctx.ceph[cluster_name].roleless = True
1449 ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
1450 ctx.ceph[cluster_name].first_mon = first_mon
1451 ctx.ceph[cluster_name].first_mon_role = 'mon.' + first_mon
1452 else:
1453 first_mon_role = sorted(ctx.ceph[cluster_name].mons.keys())[0]
1454 _, _, first_mon = teuthology.split_role(first_mon_role)
1455 (bootstrap_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
1456 log.info('First mon is mon.%s on %s' % (first_mon,
1457 bootstrap_remote.shortname))
1458 ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
1459 ctx.ceph[cluster_name].first_mon = first_mon
1460 ctx.ceph[cluster_name].first_mon_role = first_mon_role
1461
1462 others = ctx.cluster.remotes[bootstrap_remote]
1463 mgrs = sorted([r for r in others
1464 if teuthology.is_type('mgr', cluster_name)(r)])
1465 if not mgrs:
1466 raise RuntimeError('no mgrs on the same host as first mon %s' % first_mon)
1467 _, _, first_mgr = teuthology.split_role(mgrs[0])
1468 log.info('First mgr is %s' % (first_mgr))
1469 ctx.ceph[cluster_name].first_mgr = first_mgr
1470 yield
1471
1472
1473 @contextlib.contextmanager
1474 def task(ctx, config):
1475 """
1476 Deploy ceph cluster using cephadm
1477
1478 For example, teuthology.yaml can contain the 'defaults' section:
1479
1480 defaults:
1481 cephadm:
1482 containers:
1483 image: 'quay.io/ceph-ci/ceph'
1484
1485 Using overrides makes it possible to customize it per run.
1486 The equivalent 'overrides' section looks like:
1487
1488 overrides:
1489 cephadm:
1490 containers:
1491 image: 'quay.io/ceph-ci/ceph'
1492 registry-login:
1493 url: registry-url
1494 username: registry-user
1495 password: registry-password
1496
1497 :param ctx: the argparse.Namespace object
1498 :param config: the config dict
1499 """
1500 if config is None:
1501 config = {}
1502
1503 assert isinstance(config, dict), \
1504 "task only supports a dictionary for configuration"
1505
1506 overrides = ctx.config.get('overrides', {})
1507 teuthology.deep_merge(config, overrides.get('ceph', {}))
1508 teuthology.deep_merge(config, overrides.get('cephadm', {}))
1509 log.info('Config: ' + str(config))
1510
1511 # set up cluster context
1512 if not hasattr(ctx, 'ceph'):
1513 ctx.ceph = {}
1514 if 'cluster' not in config:
1515 config['cluster'] = 'ceph'
1516 cluster_name = config['cluster']
1517 if cluster_name not in ctx.ceph:
1518 ctx.ceph[cluster_name] = argparse.Namespace()
1519 ctx.ceph[cluster_name].bootstrapped = False
1520
1521 # image
1522 teuth_defaults = teuth_config.get('defaults', {})
1523 cephadm_defaults = teuth_defaults.get('cephadm', {})
1524 containers_defaults = cephadm_defaults.get('containers', {})
1525 container_image_name = containers_defaults.get('image', None)
1526
1527 containers = config.get('containers', {})
1528 container_image_name = containers.get('image', container_image_name)
1529
1530 if not hasattr(ctx.ceph[cluster_name], 'image'):
1531 ctx.ceph[cluster_name].image = config.get('image')
1532 ref = None
1533 if not ctx.ceph[cluster_name].image:
1534 if not container_image_name:
1535 raise Exception("Configuration error: "
1536 "the 'image' value is undefined for the 'cephadm' task. "
1537 "Please provide corresponding options in the task's "
1538 "config, task 'overrides', or teuthology 'defaults' "
1539 "section.")
1540 sha1 = config.get('sha1')
1541 flavor = config.get('flavor', 'default')
1542
1543 if sha1:
1544 if flavor == "crimson":
1545 ctx.ceph[cluster_name].image = container_image_name + ':' + sha1 + '-' + flavor
1546 else:
1547 ctx.ceph[cluster_name].image = container_image_name + ':' + sha1
1548 ref = sha1
1549 else:
1550 # hmm, fall back to branch?
1551 branch = config.get('branch', 'master')
1552 ref = branch
1553 ctx.ceph[cluster_name].image = container_image_name + ':' + branch
1554 log.info('Cluster image is %s' % ctx.ceph[cluster_name].image)
1555
1556
1557 with contextutil.nested(
1558 # if the cluster is already bootstrapped, bypass the corresponding methods
1559 lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
1560 else initialize_config(ctx=ctx, config=config),
1561 lambda: ceph_initial(),
1562 lambda: normalize_hostnames(ctx=ctx),
1563 lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
1564 else download_cephadm(ctx=ctx, config=config, ref=ref),
1565 lambda: ceph_log(ctx=ctx, config=config),
1566 lambda: ceph_crash(ctx=ctx, config=config),
1567 lambda: pull_image(ctx=ctx, config=config),
1568 lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
1569 else ceph_bootstrap(ctx, config),
1570 lambda: crush_setup(ctx=ctx, config=config),
1571 lambda: ceph_mons(ctx=ctx, config=config),
1572 lambda: distribute_config_and_admin_keyring(ctx=ctx, config=config),
1573 lambda: ceph_mgrs(ctx=ctx, config=config),
1574 lambda: ceph_osds(ctx=ctx, config=config),
1575 lambda: ceph_mdss(ctx=ctx, config=config),
1576 lambda: cephfs_setup(ctx=ctx, config=config),
1577 lambda: ceph_rgw(ctx=ctx, config=config),
1578 lambda: ceph_iscsi(ctx=ctx, config=config),
1579 lambda: ceph_monitoring('prometheus', ctx=ctx, config=config),
1580 lambda: ceph_monitoring('node-exporter', ctx=ctx, config=config),
1581 lambda: ceph_monitoring('alertmanager', ctx=ctx, config=config),
1582 lambda: ceph_monitoring('grafana', ctx=ctx, config=config),
1583 lambda: ceph_clients(ctx=ctx, config=config),
1584 lambda: create_rbd_pool(ctx=ctx, config=config),
1585 ):
1586 try:
1587 if config.get('wait-for-healthy', True):
1588 healthy(ctx=ctx, config=config)
1589
1590 log.info('Setup complete, yielding')
1591 yield
1592
1593 finally:
1594 log.info('Teardown begin')
1595