1 """
2 Ceph cluster task, deployed via cephadm orchestrator
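A minimal usage sketch (illustrative only -- the image tag is an example;
see task() below for more configuration examples)::

tasks:
- cephadm:
image: quay.io/ceph-ci/ceph:<branch-or-sha1>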
3 """
4 import argparse
5 import configobj
6 import contextlib
7 import errno
8 import logging
9 import os
10 import json
11 import re
12 import uuid
13
14 import six
15 import toml
16 from io import BytesIO
17 from six import StringIO
18 from tarfile import ReadError
19 from tasks.ceph_manager import CephManager
20 from teuthology import misc as teuthology
21 from teuthology import contextutil
22 from teuthology.orchestra import run
23 from teuthology.orchestra.daemon import DaemonGroup
24 from teuthology.config import config as teuth_config
25
26 # these items we use from ceph.py should probably eventually move elsewhere
27 from tasks.ceph import get_mons, healthy
28
29 CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw', 'prometheus']
30
31 log = logging.getLogger(__name__)
32
33
34 def _shell(ctx, cluster_name, remote, args, extra_cephadm_args=[], **kwargs):
35 testdir = teuthology.get_testdir(ctx)
36 return remote.run(
37 args=[
38 'sudo',
39 ctx.cephadm,
40 '--image', ctx.ceph[cluster_name].image,
41 'shell',
42 '-c', '/etc/ceph/{}.conf'.format(cluster_name),
43 '-k', '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
44 '--fsid', ctx.ceph[cluster_name].fsid,
45 ] + extra_cephadm_args + [
46 '--',
47 ] + args,
48 **kwargs
49 )
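# For illustration only: with the default cluster name, the command assembled
# above expands to roughly
#   sudo <ctx.cephadm> --image <image> shell \
#     -c /etc/ceph/ceph.conf -k /etc/ceph/ceph.client.admin.keyring \
#     --fsid <fsid> [extra cephadm args] -- <args...>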
50
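# build_initial_config() folds per-section overrides from the task config into
# the seed cephadm.conf.  The overrides follow the usual teuthology shape, e.g.
# (option names below are illustrative, not required by this task):
#
#   conf:
#     global:
#       debug ms: 1
#     mon:
#       debug mon: 20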
51 def build_initial_config(ctx, config):
52 cluster_name = config['cluster']
53
54 path = os.path.join(os.path.dirname(__file__), 'cephadm.conf')
55 conf = configobj.ConfigObj(path, file_error=True)
56
57 conf.setdefault('global', {})
58 conf['global']['fsid'] = ctx.ceph[cluster_name].fsid
59
60 # overrides
61 for section, keys in config.get('conf',{}).items():
62 for key, value in keys.items():
63 log.info(" override: [%s] %s = %s" % (section, key, value))
64 if section not in conf:
65 conf[section] = {}
66 conf[section][key] = value
67
68 return conf
69
70 @contextlib.contextmanager
71 def normalize_hostnames(ctx):
72 """
73 Ensure we have short hostnames throughout, for consistency between
74 remote.shortname and socket.gethostname() in cephadm.
75 """
76 log.info('Normalizing hostnames...')
77 ctx.cluster.run(args=[
78 'sudo',
79 'hostname',
80 run.Raw('$(hostname -s)'),
81 ])
82
83 try:
84 yield
85 finally:
86 pass
87
88 @contextlib.contextmanager
89 def download_cephadm(ctx, config, ref):
90 cluster_name = config['cluster']
91
92 if config.get('cephadm_mode') != 'cephadm-package':
93 ref = config.get('cephadm_branch', ref)
94 git_url = teuth_config.get_ceph_git_url()
95 log.info('Downloading cephadm (repo %s ref %s)...' % (git_url, ref))
96 if git_url.startswith('https://github.com/'):
97 # git archive doesn't like https:// URLs, which we use with github.
98 rest = git_url.split('https://github.com/', 1)[1]
99 rest = re.sub(r'\.git/?$', '', rest).strip() # no .git suffix
100 ctx.cluster.run(
101 args=[
102 'curl', '--silent',
103 'https://raw.githubusercontent.com/' + rest + '/' + ref + '/src/cephadm/cephadm',
104 run.Raw('>'),
105 ctx.cephadm,
106 run.Raw('&&'),
107 'ls', '-l',
108 ctx.cephadm,
109 ],
110 )
111 else:
112 ctx.cluster.run(
113 args=[
114 'git', 'archive',
115 '--remote=' + git_url,
116 ref,
117 'src/cephadm/cephadm',
118 run.Raw('|'),
119 'tar', '-xO', 'src/cephadm/cephadm',
120 run.Raw('>'),
121 ctx.cephadm,
122 ],
123 )
124 # sanity-check the resulting file and set executable bit
125 cephadm_file_size = '$(stat -c%s {})'.format(ctx.cephadm)
126 ctx.cluster.run(
127 args=[
128 'test', '-s', ctx.cephadm,
129 run.Raw('&&'),
130 'test', run.Raw(cephadm_file_size), "-gt", run.Raw('1000'),
131 run.Raw('&&'),
132 'chmod', '+x', ctx.cephadm,
133 ],
134 )
135
136 try:
137 yield
138 finally:
139 log.info('Removing cluster...')
140 ctx.cluster.run(args=[
141 'sudo',
142 ctx.cephadm,
143 'rm-cluster',
144 '--fsid', ctx.ceph[cluster_name].fsid,
145 '--force',
146 ])
147
148 if config.get('cephadm_mode') == 'root':
149 log.info('Removing cephadm ...')
150 ctx.cluster.run(
151 args=[
152 'rm',
153 '-rf',
154 ctx.cephadm,
155 ],
156 )
157
158 @contextlib.contextmanager
159 def ceph_log(ctx, config):
160 cluster_name = config['cluster']
161 fsid = ctx.ceph[cluster_name].fsid
162
163 try:
164 yield
165
166 except Exception:
167 # we need to know this below
168 ctx.summary['success'] = False
169 raise
170
171 finally:
172 log.info('Checking cluster log for badness...')
173 def first_in_ceph_log(pattern, excludes):
174 """
175 Find the first occurrence of the pattern specified in the Ceph log.
176 Returns None if none found.
177
178 :param pattern: Pattern scanned for.
179 :param excludes: Patterns to ignore.
180 :return: First line of text (or None if not found)
181 """
182 args = [
183 'sudo',
184 'egrep', pattern,
185 '/var/log/ceph/{fsid}/ceph.log'.format(
186 fsid=fsid),
187 ]
188 if excludes:
189 for exclude in excludes:
190 args.extend([run.Raw('|'), 'egrep', '-v', exclude])
191 args.extend([
192 run.Raw('|'), 'head', '-n', '1',
193 ])
194 r = ctx.ceph[cluster_name].bootstrap_remote.run(
195 stdout=StringIO(),
196 args=args,
197 )
198 stdout = r.stdout.getvalue()
199 if stdout != '':
200 return stdout
201 return None
202
203 if first_in_ceph_log(r'\[ERR\]|\[WRN\]|\[SEC\]',
204 config.get('log-whitelist')) is not None:
205 log.warning('Found errors (ERR|WRN|SEC) in cluster log')
206 ctx.summary['success'] = False
207 # use the most severe problem as the failure reason
208 if 'failure_reason' not in ctx.summary:
209 for pattern in [r'\[SEC\]', r'\[ERR\]', r'\[WRN\]']:
210 match = first_in_ceph_log(pattern, config['log-whitelist'])
211 if match is not None:
212 ctx.summary['failure_reason'] = \
213 '"{match}" in cluster log'.format(
214 match=match.rstrip('\n'),
215 )
216 break
217
218 if ctx.archive is not None and \
219 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
220 # and logs
221 log.info('Compressing logs...')
222 run.wait(
223 ctx.cluster.run(
224 args=[
225 'sudo',
226 'find',
227 '/var/log/ceph', # all logs, not just for the cluster
228 '-name',
229 '*.log',
230 '-print0',
231 run.Raw('|'),
232 'sudo',
233 'xargs',
234 '-0',
235 '--no-run-if-empty',
236 '--',
237 'gzip',
238 '--',
239 ],
240 wait=False,
241 ),
242 )
243
244 log.info('Archiving logs...')
245 path = os.path.join(ctx.archive, 'remote')
246 try:
247 os.makedirs(path)
248 except OSError:
249 pass
250 for remote in ctx.cluster.remotes.keys():
251 sub = os.path.join(path, remote.name)
252 try:
253 os.makedirs(sub)
254 except OSError:
255 pass
256 try:
257 teuthology.pull_directory(remote, '/var/log/ceph', # everything
258 os.path.join(sub, 'log'))
259 except ReadError:
260 pass
261
262 @contextlib.contextmanager
263 def ceph_crash(ctx, config):
264 """
265 Gather crash dumps from /var/lib/ceph/$fsid/crash
266 """
267 cluster_name = config['cluster']
268 fsid = ctx.ceph[cluster_name].fsid
269
270 try:
271 yield
272
273 finally:
274 if ctx.archive is not None:
275 log.info('Archiving crash dumps...')
276 path = os.path.join(ctx.archive, 'remote')
277 try:
278 os.makedirs(path)
279 except OSError:
280 pass
281 for remote in ctx.cluster.remotes.keys():
282 sub = os.path.join(path, remote.name)
283 try:
284 os.makedirs(sub)
285 except OSError:
286 pass
287 try:
288 teuthology.pull_directory(remote,
289 '/var/lib/ceph/%s/crash' % fsid,
290 os.path.join(sub, 'crash'))
291 except ReadError:
292 pass
293
294 @contextlib.contextmanager
295 def ceph_bootstrap(ctx, config, registry):
296 """
297 Bootstrap the Ceph cluster; if a registry is provided, set up the
298 containers' registry mirror before bootstrapping.
299
300 :param ctx: the argparse.Namespace object
301 :param config: the config dict
302 :param registry: URL of the containers' mirror registry
303 """
304 cluster_name = config['cluster']
305 testdir = teuthology.get_testdir(ctx)
306 fsid = ctx.ceph[cluster_name].fsid
307
308 bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
309 first_mon = ctx.ceph[cluster_name].first_mon
310 first_mon_role = ctx.ceph[cluster_name].first_mon_role
311 mons = ctx.ceph[cluster_name].mons
312
313 ctx.cluster.run(args=[
314 'sudo', 'mkdir', '-p', '/etc/ceph',
315 ])
316 ctx.cluster.run(args=[
317 'sudo', 'chmod', '777', '/etc/ceph',
318 ])
319 if registry:
320 add_mirror_to_cluster(ctx, registry)
321 try:
322 # write seed config
323 log.info('Writing seed config...')
324 conf_fp = BytesIO()
325 seed_config = build_initial_config(ctx, config)
326 seed_config.write(conf_fp)
327 teuthology.write_file(
328 remote=bootstrap_remote,
329 path='{}/seed.{}.conf'.format(testdir, cluster_name),
330 data=conf_fp.getvalue())
331 log.debug('Final config:\n' + conf_fp.getvalue().decode())
332 ctx.ceph[cluster_name].conf = seed_config
333
334 # register initial daemons
335 ctx.daemons.register_daemon(
336 bootstrap_remote, 'mon', first_mon,
337 cluster=cluster_name,
338 fsid=fsid,
339 logger=log.getChild('mon.' + first_mon),
340 wait=False,
341 started=True,
342 )
343 if not ctx.ceph[cluster_name].roleless:
344 first_mgr = ctx.ceph[cluster_name].first_mgr
345 ctx.daemons.register_daemon(
346 bootstrap_remote, 'mgr', first_mgr,
347 cluster=cluster_name,
348 fsid=fsid,
349 logger=log.getChild('mgr.' + first_mgr),
350 wait=False,
351 started=True,
352 )
353
354 # bootstrap
355 log.info('Bootstrapping...')
356 cmd = [
357 'sudo',
358 ctx.cephadm,
359 '--image', ctx.ceph[cluster_name].image,
360 '-v',
361 'bootstrap',
362 '--fsid', fsid,
363 '--config', '{}/seed.{}.conf'.format(testdir, cluster_name),
364 '--output-config', '/etc/ceph/{}.conf'.format(cluster_name),
365 '--output-keyring',
366 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
367 '--output-pub-ssh-key', '{}/{}.pub'.format(testdir, cluster_name),
368 ]
369 if not ctx.ceph[cluster_name].roleless:
370 cmd += [
371 '--mon-id', first_mon,
372 '--mgr-id', first_mgr,
373 '--orphan-initial-daemons', # we will do it explicitly!
374 '--skip-monitoring-stack', # we'll provision these explicitly
375 ]
376 if mons[first_mon_role].startswith('['):
377 cmd += ['--mon-addrv', mons[first_mon_role]]
378 else:
379 cmd += ['--mon-ip', mons[first_mon_role]]
380 if config.get('skip_dashboard'):
381 cmd += ['--skip-dashboard']
382 # bootstrap makes the keyring root 0600, so +r it for our purposes
383 cmd += [
384 run.Raw('&&'),
385 'sudo', 'chmod', '+r',
386 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
387 ]
388 bootstrap_remote.run(args=cmd)
389
390 # fetch keys and configs
391 log.info('Fetching config...')
392 ctx.ceph[cluster_name].config_file = teuthology.get_file(
393 remote=bootstrap_remote,
394 path='/etc/ceph/{}.conf'.format(cluster_name))
395 log.info('Fetching client.admin keyring...')
396 ctx.ceph[cluster_name].admin_keyring = teuthology.get_file(
397 remote=bootstrap_remote,
398 path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name))
399 log.info('Fetching mon keyring...')
400 ctx.ceph[cluster_name].mon_keyring = teuthology.get_file(
401 remote=bootstrap_remote,
402 path='/var/lib/ceph/%s/mon.%s/keyring' % (fsid, first_mon),
403 sudo=True)
404
405 # fetch ssh key, distribute to additional nodes
406 log.info('Fetching pub ssh key...')
407 ssh_pub_key = teuthology.get_file(
408 remote=bootstrap_remote,
409 path='{}/{}.pub'.format(testdir, cluster_name)
410 ).decode('ascii').strip()
411
412 log.info('Installing pub ssh key for root users...')
413 ctx.cluster.run(args=[
414 'sudo', 'install', '-d', '-m', '0700', '/root/.ssh',
415 run.Raw('&&'),
416 'echo', ssh_pub_key,
417 run.Raw('|'),
418 'sudo', 'tee', '-a', '/root/.ssh/authorized_keys',
419 run.Raw('&&'),
420 'sudo', 'chmod', '0600', '/root/.ssh/authorized_keys',
421 ])
422
423 # set options
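# allow_ptrace asks mgr/cephadm to give daemon containers SYS_PTRACE, so
# debugging tools (gdb, strace, ...) can attach to daemons during tests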
424 _shell(ctx, cluster_name, bootstrap_remote,
425 ['ceph', 'config', 'set', 'mgr', 'mgr/cephadm/allow_ptrace', 'true'])
426
427 # add other hosts
428 for remote in ctx.cluster.remotes.keys():
429 if remote == bootstrap_remote:
430 continue
431 log.info('Writing (initial) conf and keyring to %s' % remote.shortname)
432 teuthology.write_file(
433 remote=remote,
434 path='/etc/ceph/{}.conf'.format(cluster_name),
435 data=ctx.ceph[cluster_name].config_file)
436 teuthology.write_file(
437 remote=remote,
438 path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
439 data=ctx.ceph[cluster_name].admin_keyring)
440
441 log.info('Adding host %s to orchestrator...' % remote.shortname)
442 _shell(ctx, cluster_name, remote, [
443 'ceph', 'orch', 'host', 'add',
444 remote.shortname
445 ])
446 r = _shell(ctx, cluster_name, remote,
447 ['ceph', 'orch', 'host', 'ls', '--format=json'],
448 stdout=StringIO())
449 hosts = [node['hostname'] for node in json.loads(r.stdout.getvalue())]
450 assert remote.shortname in hosts
451
452 yield
453
454 finally:
455 log.info('Cleaning up testdir ceph.* files...')
456 ctx.cluster.run(args=[
457 'rm', '-f',
458 '{}/seed.{}.conf'.format(testdir, cluster_name),
459 '{}/{}.pub'.format(testdir, cluster_name),
460 ])
461
462 log.info('Stopping all daemons...')
463
464 # this doesn't block until they are all stopped...
465 #ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
466
467 # so, stop them individually
468 for role in ctx.daemons.resolve_role_list(None, CEPH_ROLE_TYPES, True):
469 cluster, type_, id_ = teuthology.split_role(role)
470 try:
471 ctx.daemons.get_daemon(type_, id_, cluster).stop()
472 except Exception:
473 log.exception('Failed to stop "{role}"'.format(role=role))
474 raise
475
476 # clean up /etc/ceph
477 ctx.cluster.run(args=[
478 'sudo', 'rm', '-f',
479 '/etc/ceph/{}.conf'.format(cluster_name),
480 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
481 ])
482
483 @contextlib.contextmanager
484 def ceph_mons(ctx, config):
485 """
486 Deploy any additional mons
487 """
488 cluster_name = config['cluster']
489 fsid = ctx.ceph[cluster_name].fsid
490 num_mons = 1
491
492 try:
493 for remote, roles in ctx.cluster.remotes.items():
494 for mon in [r for r in roles
495 if teuthology.is_type('mon', cluster_name)(r)]:
496 c_, _, id_ = teuthology.split_role(mon)
497 if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
498 continue
499 log.info('Adding %s on %s' % (mon, remote.shortname))
500 num_mons += 1
501 _shell(ctx, cluster_name, remote, [
502 'ceph', 'orch', 'daemon', 'add', 'mon',
503 remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_,
504 ])
505 ctx.daemons.register_daemon(
506 remote, 'mon', id_,
507 cluster=cluster_name,
508 fsid=fsid,
509 logger=log.getChild(mon),
510 wait=False,
511 started=True,
512 )
513
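# wait (up to ~180s: 180 tries, 1s apart) for all expected mons to appear in the monmap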
514 with contextutil.safe_while(sleep=1, tries=180) as proceed:
515 while proceed():
516 log.info('Waiting for %d mons in monmap...' % (num_mons))
517 r = _shell(
518 ctx=ctx,
519 cluster_name=cluster_name,
520 remote=remote,
521 args=[
522 'ceph', 'mon', 'dump', '-f', 'json',
523 ],
524 stdout=StringIO(),
525 )
526 j = json.loads(r.stdout.getvalue())
527 if len(j['mons']) == num_mons:
528 break
529
530 # refresh our (final) ceph.conf file
531 log.info('Generating final ceph.conf file...')
532 r = _shell(
533 ctx=ctx,
534 cluster_name=cluster_name,
535 remote=remote,
536 args=[
537 'ceph', 'config', 'generate-minimal-conf',
538 ],
539 stdout=StringIO(),
540 )
541 ctx.ceph[cluster_name].config_file = r.stdout.getvalue()
542
543 yield
544
545 finally:
546 pass
547
548 @contextlib.contextmanager
549 def ceph_mgrs(ctx, config):
550 """
551 Deploy any additional mgrs
552 """
553 cluster_name = config['cluster']
554 fsid = ctx.ceph[cluster_name].fsid
555
556 try:
557 nodes = []
558 daemons = {}
559 for remote, roles in ctx.cluster.remotes.items():
560 for mgr in [r for r in roles
561 if teuthology.is_type('mgr', cluster_name)(r)]:
562 c_, _, id_ = teuthology.split_role(mgr)
563 if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mgr:
564 continue
565 log.info('Adding %s on %s' % (mgr, remote.shortname))
566 nodes.append(remote.shortname + '=' + id_)
567 daemons[mgr] = (remote, id_)
568 if nodes:
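# the placement spec below is '<count>;<host>=<id>;...'; the count is
# len(nodes) + 1 because the bootstrap mgr is already running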
569 _shell(ctx, cluster_name, remote, [
570 'ceph', 'orch', 'apply', 'mgr',
571 str(len(nodes) + 1) + ';' + ';'.join(nodes)]
572 )
573 for mgr, i in daemons.items():
574 remote, id_ = i
575 ctx.daemons.register_daemon(
576 remote, 'mgr', id_,
577 cluster=cluster_name,
578 fsid=fsid,
579 logger=log.getChild(mgr),
580 wait=False,
581 started=True,
582 )
583
584 yield
585
586 finally:
587 pass
588
589 @contextlib.contextmanager
590 def ceph_osds(ctx, config):
591 """
592 Deploy OSDs
593 """
594 cluster_name = config['cluster']
595 fsid = ctx.ceph[cluster_name].fsid
596
597 try:
598 log.info('Deploying OSDs...')
599
600 # provision OSDs in numeric order
601 id_to_remote = {}
602 devs_by_remote = {}
603 for remote, roles in ctx.cluster.remotes.items():
604 devs_by_remote[remote] = teuthology.get_scratch_devices(remote)
605 for osd in [r for r in roles
606 if teuthology.is_type('osd', cluster_name)(r)]:
607 _, _, id_ = teuthology.split_role(osd)
608 id_to_remote[int(id_)] = (osd, remote)
609
610 cur = 0
611 for osd_id in sorted(id_to_remote.keys()):
612 osd, remote = id_to_remote[osd_id]
613 _, _, id_ = teuthology.split_role(osd)
614 assert int(id_) == cur
615 devs = devs_by_remote[remote]
616 assert devs ## FIXME ##
617 dev = devs.pop()
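# devices that look like LVM volumes ('vg'/'lv' in the name) are passed to
# the orchestrator without the /dev/ prefix; raw disks keep their full path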
618 if all(_ in dev for _ in ('lv', 'vg')):
619 short_dev = dev.replace('/dev/', '')
620 else:
621 short_dev = dev
622 log.info('Deploying %s on %s with %s...' % (
623 osd, remote.shortname, dev))
624 _shell(ctx, cluster_name, remote, [
625 'ceph-volume', 'lvm', 'zap', dev])
626 _shell(ctx, cluster_name, remote, [
627 'ceph', 'orch', 'daemon', 'add', 'osd',
628 remote.shortname + ':' + short_dev
629 ])
630 ctx.daemons.register_daemon(
631 remote, 'osd', id_,
632 cluster=cluster_name,
633 fsid=fsid,
634 logger=log.getChild(osd),
635 wait=False,
636 started=True,
637 )
638 cur += 1
639
640 yield
641 finally:
642 pass
643
644 @contextlib.contextmanager
645 def ceph_mdss(ctx, config):
646 """
647 Deploy MDSs
648 """
649 cluster_name = config['cluster']
650 fsid = ctx.ceph[cluster_name].fsid
651
652 nodes = []
653 daemons = {}
654 for remote, roles in ctx.cluster.remotes.items():
655 for role in [r for r in roles
656 if teuthology.is_type('mds', cluster_name)(r)]:
657 c_, _, id_ = teuthology.split_role(role)
658 log.info('Adding %s on %s' % (role, remote.shortname))
659 nodes.append(remote.shortname + '=' + id_)
660 daemons[role] = (remote, id_)
661 if nodes:
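# 'all' is the filesystem/service name handed to 'ceph orch apply mds',
# followed by a '<count>;<host>=<id>;...' placement spec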
662 _shell(ctx, cluster_name, remote, [
663 'ceph', 'orch', 'apply', 'mds',
664 'all',
665 str(len(nodes)) + ';' + ';'.join(nodes)]
666 )
667 for role, i in daemons.items():
668 remote, id_ = i
669 ctx.daemons.register_daemon(
670 remote, 'mds', id_,
671 cluster=cluster_name,
672 fsid=fsid,
673 logger=log.getChild(role),
674 wait=False,
675 started=True,
676 )
677
678 yield
679
680 @contextlib.contextmanager
681 def ceph_monitoring(daemon_type, ctx, config):
682 """
683 Deploy prometheus, node-exporter, etc.
684 """
685 cluster_name = config['cluster']
686 fsid = ctx.ceph[cluster_name].fsid
687
688 nodes = []
689 daemons = {}
690 for remote, roles in ctx.cluster.remotes.items():
691 for role in [r for r in roles
692 if teuthology.is_type(daemon_type, cluster_name)(r)]:
693 c_, _, id_ = teuthology.split_role(role)
694 log.info('Adding %s on %s' % (role, remote.shortname))
695 nodes.append(remote.shortname + '=' + id_)
696 daemons[role] = (remote, id_)
697 if nodes:
698 _shell(ctx, cluster_name, remote, [
699 'ceph', 'orch', 'apply', daemon_type,
700 str(len(nodes)) + ';' + ';'.join(nodes)]
701 )
702 for role, i in daemons.items():
703 remote, id_ = i
704 ctx.daemons.register_daemon(
705 remote, daemon_type, id_,
706 cluster=cluster_name,
707 fsid=fsid,
708 logger=log.getChild(role),
709 wait=False,
710 started=True,
711 )
712
713 yield
714
715 @contextlib.contextmanager
716 def ceph_rgw(ctx, config):
717 """
718 Deploy rgw
719 """
720 cluster_name = config['cluster']
721 fsid = ctx.ceph[cluster_name].fsid
722
723 nodes = {}
724 daemons = {}
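# rgw daemon ids are expected to encode '<realm>.<zone>' in their first two
# dot-separated components (e.g. a role named rgw.myrealm.myzone.a -- an
# illustrative name); daemons sharing a realm.zone pair are applied together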
725 for remote, roles in ctx.cluster.remotes.items():
726 for role in [r for r in roles
727 if teuthology.is_type('rgw', cluster_name)(r)]:
728 c_, _, id_ = teuthology.split_role(role)
729 log.info('Adding %s on %s' % (role, remote.shortname))
730 realmzone = '.'.join(id_.split('.')[0:2])
731 if realmzone not in nodes:
732 nodes[realmzone] = []
733 nodes[realmzone].append(remote.shortname + '=' + id_)
734 daemons[role] = (remote, id_)
735
736 for realmzone in nodes.keys():
737 (realm, zone) = realmzone.split('.', 1)
738
739 # TODO: those should be moved to mgr/cephadm
740 _shell(ctx, cluster_name, remote,
741 ['radosgw-admin', 'realm', 'create', '--rgw-realm', realm, '--default']
742 )
743 _shell(ctx, cluster_name, remote,
744 ['radosgw-admin', 'zonegroup', 'create', '--rgw-zonegroup=default', '--master', '--default']
745 )
746 _shell(ctx, cluster_name, remote,
747 ['radosgw-admin', 'zone', 'create', '--rgw-zonegroup=default', '--rgw-zone', zone, '--master', '--default']
748 )
749
750 for realmzone, placements in nodes.items():
751 (realm, zone) = realmzone.split('.', 1)
752 _shell(ctx, cluster_name, remote, [
753 'ceph', 'orch', 'apply', 'rgw', realm, zone,
754 '--placement',
755 str(len(placements)) + ';' + ';'.join(placements)]
756 )
757 for role, i in daemons.items():
758 remote, id_ = i
759 ctx.daemons.register_daemon(
760 remote, 'rgw', id_,
761 cluster=cluster_name,
762 fsid=fsid,
763 logger=log.getChild(role),
764 wait=False,
765 started=True,
766 )
767
768 yield
769
770 @contextlib.contextmanager
771 def ceph_clients(ctx, config):
772 cluster_name = config['cluster']
773 testdir = teuthology.get_testdir(ctx)
774
775 log.info('Setting up client nodes...')
776 clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
777 testdir = teuthology.get_testdir(ctx)
778 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
779 for remote, roles_for_host in clients.remotes.items():
780 for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
781 cluster_name):
782 name = teuthology.ceph_role(role)
783 client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name,
784 name)
785 r = _shell(
786 ctx=ctx,
787 cluster_name=cluster_name,
788 remote=remote,
789 args=[
790 'ceph', 'auth',
791 'get-or-create', name,
792 'mon', 'allow *',
793 'osd', 'allow *',
794 'mds', 'allow *',
795 'mgr', 'allow *',
796 ],
797 stdout=StringIO(),
798 )
799 keyring = r.stdout.getvalue()
800 teuthology.sudo_write_file(
801 remote=remote,
802 path=client_keyring,
803 data=keyring,
804 perms='0644'
805 )
806 yield
807
808 @contextlib.contextmanager
809 def ceph_initial():
810 try:
811 yield
812 finally:
813 log.info('Teardown complete')
814
815 ## public methods
816 @contextlib.contextmanager
817 def stop(ctx, config):
818 """
819 Stop ceph daemons
820
821 For example::
822 tasks:
823 - ceph.stop: [mds.*]
824
825 tasks:
826 - ceph.stop: [osd.0, osd.2]
827
828 tasks:
829 - ceph.stop:
830 daemons: [osd.0, osd.2]
831
832 """
833 if config is None:
834 config = {}
835 elif isinstance(config, list):
836 config = {'daemons': config}
837
838 daemons = ctx.daemons.resolve_role_list(
839 config.get('daemons', None), CEPH_ROLE_TYPES, True)
840 clusters = set()
841
842 for role in daemons:
843 cluster, type_, id_ = teuthology.split_role(role)
844 ctx.daemons.get_daemon(type_, id_, cluster).stop()
845 clusters.add(cluster)
846
847 # for cluster in clusters:
848 # ctx.ceph[cluster].watchdog.stop()
849 # ctx.ceph[cluster].watchdog.join()
850
851 yield
852
853 def shell(ctx, config):
854 """
855 Execute (shell) commands
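
For example (an illustrative sketch -- 'mon.a' stands in for any role present
in the job)::

tasks:
- cephadm.shell:
mon.a:
- ceph orch host ls
- ceph health

An optional 'env' list names teuthology config keys whose values are passed
into the shell as environment variables; an 'all' key runs the same commands
once for every role.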
856 """
857 cluster_name = config.get('cluster', 'ceph')
858
859 env = []
860 if 'env' in config:
861 for k in config['env']:
862 env.extend(['-e', k + '=' + ctx.config.get(k, '')])
863 del config['env']
864
865 if 'all' in config and len(config) == 1:
866 a = config['all']
867 roles = teuthology.all_roles(ctx.cluster)
868 config = dict((id_, a) for id_ in roles)
869
870 for role, ls in config.items():
871 (remote,) = ctx.cluster.only(role).remotes.keys()
872 log.info('Running commands on role %s host %s', role, remote.name)
873 for c in ls:
874 _shell(ctx, cluster_name, remote,
875 ['bash', '-c', c],
876 extra_cephadm_args=env)
877
878 @contextlib.contextmanager
879 def tweaked_option(ctx, config):
880 """
881 Set an option, and then restore its original value when done.
882
883 Note: due to the way tasks are executed/nested, it is not recommended to
884 use this method as a standalone task; otherwise it is likely to restore
885 the tweaked option only at the /end/ of the 'tasks' block.
886 """
887 saved_options = {}
888 # we can complicate this when necessary
889 options = ['mon-health-to-clog']
890 type_, id_ = 'mon', '*'
891 cluster = config.get('cluster', 'ceph')
892 manager = ctx.managers[cluster]
893 if id_ == '*':
894 get_from = next(teuthology.all_roles_of_type(ctx.cluster, type_))
895 else:
896 get_from = id_
897 for option in options:
898 if option not in config:
899 continue
900 value = 'true' if config[option] else 'false'
901 option = option.replace('-', '_')
902 old_value = manager.get_config(type_, get_from, option)
903 if value != old_value:
904 saved_options[option] = old_value
905 manager.inject_args(type_, id_, option, value)
906 yield
907 for option, value in saved_options.items():
908 manager.inject_args(type_, id_, option, value)
909
910 @contextlib.contextmanager
911 def restart(ctx, config):
912 """
913 restart ceph daemons
914
915 For example::
916 tasks:
917 - ceph.restart: [all]
918
919 For example::
920 tasks:
921 - ceph.restart: [osd.0, mon.1, mds.*]
922
923 or::
924
925 tasks:
926 - ceph.restart:
927 daemons: [osd.0, mon.1]
928 wait-for-healthy: false
929 wait-for-osds-up: true
930
931 :param ctx: Context
932 :param config: Configuration
933 """
934 if config is None:
935 config = {}
936 elif isinstance(config, list):
937 config = {'daemons': config}
938
939 daemons = ctx.daemons.resolve_role_list(
940 config.get('daemons', None), CEPH_ROLE_TYPES, True)
941 clusters = set()
942
943 log.info('daemons %s' % daemons)
944 with tweaked_option(ctx, config):
945 for role in daemons:
946 cluster, type_, id_ = teuthology.split_role(role)
947 d = ctx.daemons.get_daemon(type_, id_, cluster)
948 assert d, 'daemon %s does not exist' % role
949 d.stop()
950 if type_ == 'osd':
951 ctx.managers[cluster].mark_down_osd(id_)
952 d.restart()
953 clusters.add(cluster)
954
955 if config.get('wait-for-healthy', True):
956 for cluster in clusters:
957 healthy(ctx=ctx, config=dict(cluster=cluster))
958 if config.get('wait-for-osds-up', False):
959 for cluster in clusters:
960 ctx.managers[cluster].wait_for_all_osds_up()
961 yield
962
963 @contextlib.contextmanager
964 def distribute_config_and_admin_keyring(ctx, config):
965 """
966 Distribute a sufficient config and keyring for clients
967 """
968 cluster_name = config['cluster']
969 log.info('Distributing (final) config and client.admin keyring...')
970 for remote, roles in ctx.cluster.remotes.items():
971 teuthology.sudo_write_file(
972 remote=remote,
973 path='/etc/ceph/{}.conf'.format(cluster_name),
974 data=ctx.ceph[cluster_name].config_file)
975 teuthology.sudo_write_file(
976 remote=remote,
977 path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
978 data=ctx.ceph[cluster_name].admin_keyring)
979 try:
980 yield
981 finally:
982 ctx.cluster.run(args=[
983 'sudo', 'rm', '-f',
984 '/etc/ceph/{}.conf'.format(cluster_name),
985 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
986 ])
987
988 @contextlib.contextmanager
989 def crush_setup(ctx, config):
990 cluster_name = config['cluster']
991
992 profile = config.get('crush_tunables', 'default')
993 log.info('Setting crush tunables to %s', profile)
994 _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
995 args=['ceph', 'osd', 'crush', 'tunables', profile])
996 yield
997
998 @contextlib.contextmanager
999 def _bypass():
1000 yield
1001
1002 @contextlib.contextmanager
1003 def initialize_config(ctx, config):
1004 cluster_name = config['cluster']
1005 testdir = teuthology.get_testdir(ctx)
1006
1007 ctx.ceph[cluster_name].thrashers = []
1008 # fixme: setup watchdog, ala ceph.py
1009
1010 ctx.ceph[cluster_name].roleless = False # see below
1011
1012 first_ceph_cluster = False
1013 if not hasattr(ctx, 'daemons'):
1014 first_ceph_cluster = True
1015
1016 # cephadm mode?
1017 if 'cephadm_mode' not in config:
1018 config['cephadm_mode'] = 'root'
1019 assert config['cephadm_mode'] in ['root', 'cephadm-package']
1020 if config['cephadm_mode'] == 'root':
1021 ctx.cephadm = testdir + '/cephadm'
1022 else:
1023 ctx.cephadm = 'cephadm' # in the path
1024
1025 if first_ceph_cluster:
1026 # FIXME: this is global for all clusters
1027 ctx.daemons = DaemonGroup(
1028 use_cephadm=ctx.cephadm)
1029
1030 # uuid
1031 fsid = str(uuid.uuid1())
1032 log.info('Cluster fsid is %s' % fsid)
1033 ctx.ceph[cluster_name].fsid = fsid
1034
1035 # mon ips
1036 log.info('Choosing monitor IPs and ports...')
1037 remotes_and_roles = ctx.cluster.remotes.items()
1038 ips = [host for (host, port) in
1039 (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
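# each mon IP is the peer address of that node's ssh connection, as seen from
# the teuthology host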
1040
1041 if config.get('roleless', False):
1042 # mons will be named after hosts
1043 first_mon = None
1044 for remote, _ in remotes_and_roles:
1045 ctx.cluster.remotes[remote].append('mon.' + remote.shortname)
1046 if not first_mon:
1047 first_mon = remote.shortname
1048 bootstrap_remote = remote
1049 log.info('No mon roles; fabricating mons')
1050
1051 roles = [role_list for (remote, role_list) in ctx.cluster.remotes.items()]
1052
1053 ctx.ceph[cluster_name].mons = get_mons(
1054 roles, ips, cluster_name,
1055 mon_bind_msgr2=config.get('mon_bind_msgr2', True),
1056 mon_bind_addrvec=config.get('mon_bind_addrvec', True),
1057 )
1058 log.info('Monitor IPs: %s' % ctx.ceph[cluster_name].mons)
1059
1060 if config.get('roleless', False):
1061 ctx.ceph[cluster_name].roleless = True
1062 ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
1063 ctx.ceph[cluster_name].first_mon = first_mon
1064 ctx.ceph[cluster_name].first_mon_role = 'mon.' + first_mon
1065 else:
1066 first_mon_role = sorted(ctx.ceph[cluster_name].mons.keys())[0]
1067 _, _, first_mon = teuthology.split_role(first_mon_role)
1068 (bootstrap_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
1069 log.info('First mon is mon.%s on %s' % (first_mon,
1070 bootstrap_remote.shortname))
1071 ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
1072 ctx.ceph[cluster_name].first_mon = first_mon
1073 ctx.ceph[cluster_name].first_mon_role = first_mon_role
1074
1075 others = ctx.cluster.remotes[bootstrap_remote]
1076 mgrs = sorted([r for r in others
1077 if teuthology.is_type('mgr', cluster_name)(r)])
1078 if not mgrs:
1079 raise RuntimeError('no mgrs on the same host as first mon %s' % first_mon)
1080 _, _, first_mgr = teuthology.split_role(mgrs[0])
1081 log.info('First mgr is %s' % (first_mgr))
1082 ctx.ceph[cluster_name].first_mgr = first_mgr
1083 yield
1084
1085 @contextlib.contextmanager
1086 def task(ctx, config):
1087 """
1088 Deploy ceph cluster using cephadm
1089
1090 Set up the containers' registry mirrors before the bootstrap, if the
1091 corresponding config is provided in the teuthology server config yaml file.
1092
1093 For example, teuthology.yaml can contain the 'defaults' section:
1094
1095 defaults:
1096 cephadm:
1097 containers:
1098 registry_mirrors:
1099 docker.io: 'registry.mirror.example.com:5000'
1100 image: 'quay.io/ceph-ci/ceph'
1101
1102 Using overrides makes it possible to customize it per run.
1103 The equivalent 'overrides' section looks like:
1104
1105 overrides:
1106 cephadm:
1107 containers:
1108 registry_mirrors:
1109 docker.io: 'registry.mirror.example.com:5000'
1110 image: 'quay.io/ceph-ci/ceph'
1111
1112 :param ctx: the argparse.Namespace object
1113 :param config: the config dict
1114 """
1115 if config is None:
1116 config = {}
1117
1118 assert isinstance(config, dict), \
1119 "task only supports a dictionary for configuration"
1120
1121 overrides = ctx.config.get('overrides', {})
1122 teuthology.deep_merge(config, overrides.get('ceph', {}))
1123 teuthology.deep_merge(config, overrides.get('cephadm', {}))
1124 log.info('Config: ' + str(config))
1125
1126 testdir = teuthology.get_testdir(ctx)
1127
1128 # set up cluster context
1129 if not hasattr(ctx, 'ceph'):
1130 ctx.ceph = {}
1131 ctx.managers = {}
1132 if 'cluster' not in config:
1133 config['cluster'] = 'ceph'
1134 cluster_name = config['cluster']
1135 if cluster_name not in ctx.ceph:
1136 ctx.ceph[cluster_name] = argparse.Namespace()
1137 ctx.ceph[cluster_name].bootstrapped = False
1138
1139 # image
1140 teuth_defaults = teuth_config.get('defaults', {})
1141 cephadm_defaults = teuth_defaults.get('cephadm', {})
1142 containers_defaults = cephadm_defaults.get('containers', {})
1143 mirrors_defaults = containers_defaults.get('registry_mirrors', {})
1144 container_registry_mirror = mirrors_defaults.get('docker.io', None)
1145 container_image_name = containers_defaults.get('image', None)
1146
1147 containers = config.get('containers', {})
1148 mirrors = containers.get('registry_mirrors', {})
1149 container_image_name = containers.get('image', container_image_name)
1150 container_registry_mirror = mirrors.get('docker.io',
1151 container_registry_mirror)
1152
1153 if not container_image_name:
1154 raise Exception("Configuration error occurred. "
1155 "The 'image' value is undefined for 'cephadm' task. "
1156 "Please provide corresponding options in the task's "
1157 "config, task 'overrides', or teuthology 'defaults' "
1158 "section.")
1159
1160 if not hasattr(ctx.ceph[cluster_name], 'image'):
1161 ctx.ceph[cluster_name].image = config.get('image')
1162 ref = None
1163 if not ctx.ceph[cluster_name].image:
1164 sha1 = config.get('sha1')
1165 if sha1:
1166 ctx.ceph[cluster_name].image = container_image_name + ':' + sha1
1167 ref = sha1
1168 else:
1169 # hmm, fall back to branch?
1170 branch = config.get('branch', 'master')
1171 ref = branch
1172 ctx.ceph[cluster_name].image = container_image_name + ':' + branch
1173 log.info('Cluster image is %s' % ctx.ceph[cluster_name].image)
1174
1175
1176 with contextutil.nested(
1177 # if the cluster is already bootstrapped, bypass the corresponding methods
1178 lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
1179 else initialize_config(ctx=ctx, config=config),
1180 lambda: ceph_initial(),
1181 lambda: normalize_hostnames(ctx=ctx),
1182 lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
1183 else download_cephadm(ctx=ctx, config=config, ref=ref),
1184 lambda: ceph_log(ctx=ctx, config=config),
1185 lambda: ceph_crash(ctx=ctx, config=config),
1186 lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
1187 else ceph_bootstrap(ctx, config,
1188 container_registry_mirror),
1189 lambda: crush_setup(ctx=ctx, config=config),
1190 lambda: ceph_mons(ctx=ctx, config=config),
1191 lambda: distribute_config_and_admin_keyring(ctx=ctx, config=config),
1192 lambda: ceph_mgrs(ctx=ctx, config=config),
1193 lambda: ceph_osds(ctx=ctx, config=config),
1194 lambda: ceph_mdss(ctx=ctx, config=config),
1195 lambda: ceph_rgw(ctx=ctx, config=config),
1196 lambda: ceph_monitoring('prometheus', ctx=ctx, config=config),
1197 lambda: ceph_monitoring('node-exporter', ctx=ctx, config=config),
1198 lambda: ceph_monitoring('alertmanager', ctx=ctx, config=config),
1199 lambda: ceph_monitoring('grafana', ctx=ctx, config=config),
1200 lambda: ceph_clients(ctx=ctx, config=config),
1201 ):
1202 ctx.managers[cluster_name] = CephManager(
1203 ctx.ceph[cluster_name].bootstrap_remote,
1204 ctx=ctx,
1205 logger=log.getChild('ceph_manager.' + cluster_name),
1206 cluster=cluster_name,
1207 cephadm=True,
1208 )
1209
1210 try:
1211 if config.get('wait-for-healthy', True):
1212 healthy(ctx=ctx, config=config)
1213
1214 log.info('Setup complete, yielding')
1215 yield
1216
1217 finally:
1218 log.info('Teardown begin')
1219
1220
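# registries.conf comes in two flavors.  The legacy v1 format looks like
#   [registries.search]
#   registries = ['docker.io', ...]
# while v2 uses 'unqualified-search-registries' plus [[registry]] tables.
# v1 input is translated to v2 below before the mirror is attached to the
# docker.io entry.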
1221 def registries_add_mirror_to_docker_io(conf, mirror):
1222 config = toml.loads(conf)
1223 is_v1 = 'registries' in config
1224 if is_v1:
1225 search = config.get('registries', {}).get('search', {}).get('registries', [])
1226 insecure = config.get('registries', {}).get('search', {}).get('insecure', [])
1227 # (the annotated form "v2: MutableMapping[str, Any] = {...}" needs Python 3)
1228 v2 = {
1229 'unqualified-search-registries': search,
1230 'registry': [
1231 {
1232 'prefix': reg,
1233 'location': reg,
1234 'insecure': reg in insecure,
1235 'blocked': False,
1236 } for reg in search
1237 ]
1238 }
1239 else:
1240 v2 = config # type: ignore
1241 dockers = [r for r in v2['registry'] if r['prefix'] == 'docker.io']
1242 if dockers:
1243 docker = dockers[0]
1244 docker['mirror'] = [{
1245 "location": mirror,
1246 "insecure": True,
1247 }]
1248 return v2
1249
1250
1251 def add_mirror_to_cluster(ctx, mirror):
1252 log.info('Adding local image mirror %s' % mirror)
1253
1254 registries_conf = '/etc/containers/registries.conf'
1255
1256 for remote in ctx.cluster.remotes.keys():
1257 try:
1258 config = teuthology.get_file(
1259 remote=remote,
1260 path=registries_conf
1261 )
1262 new_config = toml.dumps(registries_add_mirror_to_docker_io(config.decode('utf-8'), mirror))
1263
1264 teuthology.sudo_write_file(
1265 remote=remote,
1266 path=registries_conf,
1267 data=six.ensure_str(new_config),
1268 )
1269 except IOError as e: # py3: use FileNotFoundError instead.
1270 if e.errno != errno.ENOENT:
1271 raise
1272
1273 # Docker doesn't ship a registries.conf
1274 log.info('Failed to add mirror: %s' % str(e))