ceph/qa/tasks/cephadm.py
1 """
2 Ceph cluster task, deployed via cephadm orchestrator
3 """
4 import argparse
5 import configobj
6 import contextlib
7 import errno
8 import logging
9 import os
10 import json
11 import re
12 import uuid
13 import yaml
14
15 import six
16 import toml
17 from io import BytesIO
18 from six import StringIO
19 from tarfile import ReadError
20 from tasks.ceph_manager import CephManager
21 from teuthology import misc as teuthology
22 from teuthology import contextutil
23 from teuthology.orchestra import run
24 from teuthology.orchestra.daemon import DaemonGroup
25 from teuthology.config import config as teuth_config
26
27 # these items we use from ceph.py should probably eventually move elsewhere
28 from tasks.ceph import get_mons, healthy
29
30 CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw', 'prometheus']
31
32 log = logging.getLogger(__name__)
33
34
35 def _shell(ctx, cluster_name, remote, args, extra_cephadm_args=[], **kwargs):
36 testdir = teuthology.get_testdir(ctx)
37 return remote.run(
38 args=[
39 'sudo',
40 ctx.cephadm,
41 '--image', ctx.ceph[cluster_name].image,
42 'shell',
43 '-c', '/etc/ceph/{}.conf'.format(cluster_name),
44 '-k', '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
45 '--fsid', ctx.ceph[cluster_name].fsid,
46 ] + extra_cephadm_args + [
47 '--',
48 ] + args,
49 **kwargs
50 )
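# For reference, _shell() above roughly composes the following command and
# runs it on the given remote (illustrative; paths and values are placeholders,
# and the cephadm path depends on cephadm_mode):
#
#   sudo <cephadm> --image <image> shell \
#       -c /etc/ceph/<cluster>.conf -k /etc/ceph/<cluster>.client.admin.keyring \
#       --fsid <fsid> [extra cephadm args] -- <args...>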
51
52 def build_initial_config(ctx, config):
53 cluster_name = config['cluster']
54
55 path = os.path.join(os.path.dirname(__file__), 'cephadm.conf')
56 conf = configobj.ConfigObj(path, file_error=True)
57
58 conf.setdefault('global', {})
59 conf['global']['fsid'] = ctx.ceph[cluster_name].fsid
60
61 # overrides
62 for section, keys in config.get('conf', {}).items():
63 for key, value in keys.items():
64 log.info(" override: [%s] %s = %s" % (section, key, value))
65 if section not in conf:
66 conf[section] = {}
67 conf[section][key] = value
68
69 return conf
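# For reference, the 'conf' overrides merged above come from the task config
# and are shaped like the following (option names/values are illustrative):
#
#   conf:
#     global:
#       osd pool default size: 2
#     mon:
#       debug mon: 20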
70
71 @contextlib.contextmanager
72 def normalize_hostnames(ctx):
73 """
74 Ensure we have short hostnames throughout, for consistency between
75 remote.shortname and socket.gethostname() in cephadm.
76 """
77 log.info('Normalizing hostnames...')
78 ctx.cluster.run(args=[
79 'sudo',
80 'hostname',
81 run.Raw('$(hostname -s)'),
82 ])
83
84 try:
85 yield
86 finally:
87 pass
88
89 @contextlib.contextmanager
90 def download_cephadm(ctx, config, ref):
91 cluster_name = config['cluster']
92
93 if config.get('cephadm_mode') != 'cephadm-package':
94 ref = config.get('cephadm_branch', ref)
95 git_url = teuth_config.get_ceph_git_url()
96 log.info('Downloading cephadm (repo %s ref %s)...' % (git_url, ref))
97 if git_url.startswith('https://github.com/'):
98 # git archive doesn't like https:// URLs, which we use with github.
99 rest = git_url.split('https://github.com/', 1)[1]
100 rest = re.sub(r'\.git/?$', '', rest).strip() # no .git suffix
101 ctx.cluster.run(
102 args=[
103 'curl', '--silent',
104 'https://raw.githubusercontent.com/' + rest + '/' + ref + '/src/cephadm/cephadm',
105 run.Raw('>'),
106 ctx.cephadm,
107 run.Raw('&&'),
108 'ls', '-l',
109 ctx.cephadm,
110 ],
111 )
112 else:
113 ctx.cluster.run(
114 args=[
115 'git', 'archive',
116 '--remote=' + git_url,
117 ref,
118 'src/cephadm/cephadm',
119 run.Raw('|'),
120 'tar', '-xO', 'src/cephadm/cephadm',
121 run.Raw('>'),
122 ctx.cephadm,
123 ],
124 )
125 # sanity-check the resulting file and set executable bit
126 cephadm_file_size = '$(stat -c%s {})'.format(ctx.cephadm)
127 ctx.cluster.run(
128 args=[
129 'test', '-s', ctx.cephadm,
130 run.Raw('&&'),
131 'test', run.Raw(cephadm_file_size), "-gt", run.Raw('1000'),
132 run.Raw('&&'),
133 'chmod', '+x', ctx.cephadm,
134 ],
135 )
136
137 try:
138 yield
139 finally:
140 log.info('Removing cluster...')
141 ctx.cluster.run(args=[
142 'sudo',
143 ctx.cephadm,
144 'rm-cluster',
145 '--fsid', ctx.ceph[cluster_name].fsid,
146 '--force',
147 ])
148
149 if config.get('cephadm_mode') == 'root':
150 log.info('Removing cephadm ...')
151 ctx.cluster.run(
152 args=[
153 'rm',
154 '-rf',
155 ctx.cephadm,
156 ],
157 )
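# For reference, for a github ceph git url the curl above fetches a URL of the
# form https://raw.githubusercontent.com/<org>/<repo>/<ref>/src/cephadm/cephadm
# (illustrative); for other git servers, `git archive` is piped through tar to
# extract the same src/cephadm/cephadm file.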
158
159 @contextlib.contextmanager
160 def ceph_log(ctx, config):
161 cluster_name = config['cluster']
162 fsid = ctx.ceph[cluster_name].fsid
163
164 # Add logs directory to job's info log file
165 with open(os.path.join(ctx.archive, 'info.yaml'), 'r+') as info_file:
166 info_yaml = yaml.safe_load(info_file)
167 info_file.seek(0)
168 if 'archive' not in info_yaml:
169 info_yaml['archive'] = {'log': '/var/log/ceph'}
170 else:
171 info_yaml['archive']['log'] = '/var/log/ceph'
172 yaml.safe_dump(info_yaml, info_file, default_flow_style=False)
173
174 try:
175 yield
176
177 except Exception:
178 # we need to know this below
179 ctx.summary['success'] = False
180 raise
181
182 finally:
183 log.info('Checking cluster log for badness...')
184 def first_in_ceph_log(pattern, excludes):
185 """
186 Find the first occurrence of the pattern specified in the Ceph log.
187 Returns None if none found.
188
189 :param pattern: Pattern scanned for.
190 :param excludes: Patterns to ignore.
191 :return: First line of text (or None if not found)
192 """
193 args = [
194 'sudo',
195 'egrep', pattern,
196 '/var/log/ceph/{fsid}/ceph.log'.format(
197 fsid=fsid),
198 ]
199 if excludes:
200 for exclude in excludes:
201 args.extend([run.Raw('|'), 'egrep', '-v', exclude])
202 args.extend([
203 run.Raw('|'), 'head', '-n', '1',
204 ])
205 r = ctx.ceph[cluster_name].bootstrap_remote.run(
206 stdout=StringIO(),
207 args=args,
208 )
209 stdout = r.stdout.getvalue()
210 if stdout != '':
211 return stdout
212 return None
213
214 if first_in_ceph_log(r'\[ERR\]|\[WRN\]|\[SEC\]',
215 config.get('log-whitelist')) is not None:
216 log.warning('Found errors (ERR|WRN|SEC) in cluster log')
217 ctx.summary['success'] = False
218 # use the most severe problem as the failure reason
219 if 'failure_reason' not in ctx.summary:
220 for pattern in [r'\[SEC\]', r'\[ERR\]', r'\[WRN\]']:
221 match = first_in_ceph_log(pattern, config['log-whitelist'])
222 if match is not None:
223 ctx.summary['failure_reason'] = \
224 '"{match}" in cluster log'.format(
225 match=match.rstrip('\n'),
226 )
227 break
228
229 if ctx.archive is not None and \
230 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
231 # and logs
232 log.info('Compressing logs...')
233 run.wait(
234 ctx.cluster.run(
235 args=[
236 'sudo',
237 'find',
238 '/var/log/ceph', # all logs, not just for the cluster
239 '/var/log/rbd-target-api', # ceph-iscsi
240 '-name',
241 '*.log',
242 '-print0',
243 run.Raw('|'),
244 'sudo',
245 'xargs',
246 '-0',
247 '--no-run-if-empty',
248 '--',
249 'gzip',
250 '--',
251 ],
252 wait=False,
253 ),
254 )
255
256 log.info('Archiving logs...')
257 path = os.path.join(ctx.archive, 'remote')
258 try:
259 os.makedirs(path)
260 except OSError:
261 pass
262 for remote in ctx.cluster.remotes.keys():
263 sub = os.path.join(path, remote.name)
264 try:
265 os.makedirs(sub)
266 except OSError:
267 pass
268 try:
269 teuthology.pull_directory(remote, '/var/log/ceph', # everything
270 os.path.join(sub, 'log'))
271 except ReadError:
272 pass
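# For reference, 'log-whitelist' is a list of regexes that are ignored when the
# cluster log is scanned above; a typical override might look like this
# (patterns are illustrative):
#
#   overrides:
#     ceph:
#       log-whitelist:
#         - \(OSD_DOWN\)
#         - overall HEALTH_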
273
274 @contextlib.contextmanager
275 def ceph_crash(ctx, config):
276 """
277 Gather crash dumps from /var/lib/ceph/$fsid/crash
278 """
279 cluster_name = config['cluster']
280 fsid = ctx.ceph[cluster_name].fsid
281
282 # Add logs directory to job's info log file
283 with open(os.path.join(ctx.archive, 'info.yaml'), 'r+') as info_file:
284 info_yaml = yaml.safe_load(info_file)
285 info_file.seek(0)
286 if 'archive' not in info_yaml:
287 info_yaml['archive'] = {'crash': '/var/lib/ceph/%s/crash' % fsid}
288 else:
289 info_yaml['archive']['crash'] = '/var/lib/ceph/%s/crash' % fsid
290 yaml.safe_dump(info_yaml, info_file, default_flow_style=False)
291
292 try:
293 yield
294
295 finally:
296 if ctx.archive is not None:
297 log.info('Archiving crash dumps...')
298 path = os.path.join(ctx.archive, 'remote')
299 try:
300 os.makedirs(path)
301 except OSError:
302 pass
303 for remote in ctx.cluster.remotes.keys():
304 sub = os.path.join(path, remote.name)
305 try:
306 os.makedirs(sub)
307 except OSError:
308 pass
309 try:
310 teuthology.pull_directory(remote,
311 '/var/lib/ceph/%s/crash' % fsid,
312 os.path.join(sub, 'crash'))
313 except ReadError:
314 pass
315
316 @contextlib.contextmanager
317 def ceph_bootstrap(ctx, config, registry):
318 """
319 Bootstrap the ceph cluster; set up the containers' registry mirror
320 before bootstrapping if a registry is provided.
321
322 :param ctx: the argparse.Namespace object
323 :param config: the config dict
324 :param registry: url to containers' mirror registry
325 """
326 cluster_name = config['cluster']
327 testdir = teuthology.get_testdir(ctx)
328 fsid = ctx.ceph[cluster_name].fsid
329
330 bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
331 first_mon = ctx.ceph[cluster_name].first_mon
332 first_mon_role = ctx.ceph[cluster_name].first_mon_role
333 mons = ctx.ceph[cluster_name].mons
334
335 ctx.cluster.run(args=[
336 'sudo', 'mkdir', '-p', '/etc/ceph',
337 ])
338 ctx.cluster.run(args=[
339 'sudo', 'chmod', '777', '/etc/ceph',
340 ])
341 if registry:
342 add_mirror_to_cluster(ctx, registry)
343 try:
344 # write seed config
345 log.info('Writing seed config...')
346 conf_fp = BytesIO()
347 seed_config = build_initial_config(ctx, config)
348 seed_config.write(conf_fp)
349 teuthology.write_file(
350 remote=bootstrap_remote,
351 path='{}/seed.{}.conf'.format(testdir, cluster_name),
352 data=conf_fp.getvalue())
353 log.debug('Final config:\n' + conf_fp.getvalue().decode())
354 ctx.ceph[cluster_name].conf = seed_config
355
356 # register initial daemons
357 ctx.daemons.register_daemon(
358 bootstrap_remote, 'mon', first_mon,
359 cluster=cluster_name,
360 fsid=fsid,
361 logger=log.getChild('mon.' + first_mon),
362 wait=False,
363 started=True,
364 )
365 if not ctx.ceph[cluster_name].roleless:
366 first_mgr = ctx.ceph[cluster_name].first_mgr
367 ctx.daemons.register_daemon(
368 bootstrap_remote, 'mgr', first_mgr,
369 cluster=cluster_name,
370 fsid=fsid,
371 logger=log.getChild('mgr.' + first_mgr),
372 wait=False,
373 started=True,
374 )
375
376 # bootstrap
377 log.info('Bootstrapping...')
378 cmd = [
379 'sudo',
380 ctx.cephadm,
381 '--image', ctx.ceph[cluster_name].image,
382 '-v',
383 'bootstrap',
384 '--fsid', fsid,
385 '--config', '{}/seed.{}.conf'.format(testdir, cluster_name),
386 '--output-config', '/etc/ceph/{}.conf'.format(cluster_name),
387 '--output-keyring',
388 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
389 '--output-pub-ssh-key', '{}/{}.pub'.format(testdir, cluster_name),
390 ]
391 if not ctx.ceph[cluster_name].roleless:
392 cmd += [
393 '--mon-id', first_mon,
394 '--mgr-id', first_mgr,
395 '--orphan-initial-daemons', # we will do it explicitly!
396 '--skip-monitoring-stack', # we'll provision these explicitly
397 ]
398 if mons[first_mon_role].startswith('['):
399 cmd += ['--mon-addrv', mons[first_mon_role]]
400 else:
401 cmd += ['--mon-ip', mons[first_mon_role]]
402 if config.get('skip_dashboard'):
403 cmd += ['--skip-dashboard']
404 # bootstrap makes the keyring root 0600, so +r it for our purposes
405 cmd += [
406 run.Raw('&&'),
407 'sudo', 'chmod', '+r',
408 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
409 ]
410 bootstrap_remote.run(args=cmd)
411
412 # fetch keys and configs
413 log.info('Fetching config...')
414 ctx.ceph[cluster_name].config_file = teuthology.get_file(
415 remote=bootstrap_remote,
416 path='/etc/ceph/{}.conf'.format(cluster_name))
417 log.info('Fetching client.admin keyring...')
418 ctx.ceph[cluster_name].admin_keyring = teuthology.get_file(
419 remote=bootstrap_remote,
420 path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name))
421 log.info('Fetching mon keyring...')
422 ctx.ceph[cluster_name].mon_keyring = teuthology.get_file(
423 remote=bootstrap_remote,
424 path='/var/lib/ceph/%s/mon.%s/keyring' % (fsid, first_mon),
425 sudo=True)
426
427 # fetch ssh key, distribute to additional nodes
428 log.info('Fetching pub ssh key...')
429 ssh_pub_key = teuthology.get_file(
430 remote=bootstrap_remote,
431 path='{}/{}.pub'.format(testdir, cluster_name)
432 ).decode('ascii').strip()
433
434 log.info('Installing pub ssh key for root users...')
435 ctx.cluster.run(args=[
436 'sudo', 'install', '-d', '-m', '0700', '/root/.ssh',
437 run.Raw('&&'),
438 'echo', ssh_pub_key,
439 run.Raw('|'),
440 'sudo', 'tee', '-a', '/root/.ssh/authorized_keys',
441 run.Raw('&&'),
442 'sudo', 'chmod', '0600', '/root/.ssh/authorized_keys',
443 ])
444
445 # set options
446 _shell(ctx, cluster_name, bootstrap_remote,
447 ['ceph', 'config', 'set', 'mgr', 'mgr/cephadm/allow_ptrace', 'true'])
448
449 # add other hosts
450 for remote in ctx.cluster.remotes.keys():
451 if remote == bootstrap_remote:
452 continue
453 log.info('Writing (initial) conf and keyring to %s' % remote.shortname)
454 teuthology.write_file(
455 remote=remote,
456 path='/etc/ceph/{}.conf'.format(cluster_name),
457 data=ctx.ceph[cluster_name].config_file)
458 teuthology.write_file(
459 remote=remote,
460 path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
461 data=ctx.ceph[cluster_name].admin_keyring)
462
463 log.info('Adding host %s to orchestrator...' % remote.shortname)
464 _shell(ctx, cluster_name, remote, [
465 'ceph', 'orch', 'host', 'add',
466 remote.shortname
467 ])
468 r = _shell(ctx, cluster_name, remote,
469 ['ceph', 'orch', 'host', 'ls', '--format=json'],
470 stdout=StringIO())
471 hosts = [node['hostname'] for node in json.loads(r.stdout.getvalue())]
472 assert remote.shortname in hosts
473
474 yield
475
476 finally:
477 log.info('Cleaning up testdir ceph.* files...')
478 ctx.cluster.run(args=[
479 'rm', '-f',
480 '{}/seed.{}.conf'.format(testdir, cluster_name),
481 '{}/{}.pub'.format(testdir, cluster_name),
482 ])
483
484 log.info('Stopping all daemons...')
485
486 # this doesn't block until they are all stopped...
487 #ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
488
489 # so, stop them individually
490 for role in ctx.daemons.resolve_role_list(None, CEPH_ROLE_TYPES, True):
491 cluster, type_, id_ = teuthology.split_role(role)
492 try:
493 ctx.daemons.get_daemon(type_, id_, cluster).stop()
494 except Exception:
495 log.exception('Failed to stop "{role}"'.format(role=role))
496 raise
497
498 # clean up /etc/ceph
499 ctx.cluster.run(args=[
500 'sudo', 'rm', '-f',
501 '/etc/ceph/{}.conf'.format(cluster_name),
502 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
503 ])
504
505 @contextlib.contextmanager
506 def ceph_mons(ctx, config):
507 """
508 Deploy any additional mons
509 """
510 cluster_name = config['cluster']
511 fsid = ctx.ceph[cluster_name].fsid
512 num_mons = 1
513
514 try:
515 for remote, roles in ctx.cluster.remotes.items():
516 for mon in [r for r in roles
517 if teuthology.is_type('mon', cluster_name)(r)]:
518 c_, _, id_ = teuthology.split_role(mon)
519 if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
520 continue
521 log.info('Adding %s on %s' % (mon, remote.shortname))
522 num_mons += 1
523 _shell(ctx, cluster_name, remote, [
524 'ceph', 'orch', 'daemon', 'add', 'mon',
525 remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_,
526 ])
527 ctx.daemons.register_daemon(
528 remote, 'mon', id_,
529 cluster=cluster_name,
530 fsid=fsid,
531 logger=log.getChild(mon),
532 wait=False,
533 started=True,
534 )
535
536 with contextutil.safe_while(sleep=1, tries=180) as proceed:
537 while proceed():
538 log.info('Waiting for %d mons in monmap...' % (num_mons))
539 r = _shell(
540 ctx=ctx,
541 cluster_name=cluster_name,
542 remote=remote,
543 args=[
544 'ceph', 'mon', 'dump', '-f', 'json',
545 ],
546 stdout=StringIO(),
547 )
548 j = json.loads(r.stdout.getvalue())
549 if len(j['mons']) == num_mons:
550 break
551
552 # refresh our (final) ceph.conf file
553 bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
554 log.info('Generating final ceph.conf file...')
555 r = _shell(
556 ctx=ctx,
557 cluster_name=cluster_name,
558 remote=bootstrap_remote,
559 args=[
560 'ceph', 'config', 'generate-minimal-conf',
561 ],
562 stdout=StringIO(),
563 )
564 ctx.ceph[cluster_name].config_file = r.stdout.getvalue()
565
566 yield
567
568 finally:
569 pass
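# For reference, each additional mon above is added with a command of the form
#   ceph orch daemon add mon <host>:<ip-or-addrvec>=<mon-id>
# (num_mons starts at 1 for the bootstrap mon), and the loop then polls
# `ceph mon dump -f json` until the monmap contains the expected count.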
570
571 @contextlib.contextmanager
572 def ceph_mgrs(ctx, config):
573 """
574 Deploy any additional mgrs
575 """
576 cluster_name = config['cluster']
577 fsid = ctx.ceph[cluster_name].fsid
578
579 try:
580 nodes = []
581 daemons = {}
582 for remote, roles in ctx.cluster.remotes.items():
583 for mgr in [r for r in roles
584 if teuthology.is_type('mgr', cluster_name)(r)]:
585 c_, _, id_ = teuthology.split_role(mgr)
586 if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mgr:
587 continue
588 log.info('Adding %s on %s' % (mgr, remote.shortname))
589 nodes.append(remote.shortname + '=' + id_)
590 daemons[mgr] = (remote, id_)
591 if nodes:
592 _shell(ctx, cluster_name, remote, [
593 'ceph', 'orch', 'apply', 'mgr',
594 str(len(nodes) + 1) + ';' + ';'.join(nodes)]
595 )
596 for mgr, i in daemons.items():
597 remote, id_ = i
598 ctx.daemons.register_daemon(
599 remote, 'mgr', id_,
600 cluster=cluster_name,
601 fsid=fsid,
602 logger=log.getChild(mgr),
603 wait=False,
604 started=True,
605 )
606
607 yield
608
609 finally:
610 pass
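# For reference, the placement argument built above is a single string of the
# form '<count>;<host1>=<id1>;<host2>=<id2>' (hostnames illustrative), where
# the count is len(nodes) + 1 to account for the mgr created at bootstrap.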
611
612 @contextlib.contextmanager
613 def ceph_osds(ctx, config):
614 """
615 Deploy OSDs
616 """
617 cluster_name = config['cluster']
618 fsid = ctx.ceph[cluster_name].fsid
619
620 try:
621 log.info('Deploying OSDs...')
622
623 # provision OSDs in numeric order
624 id_to_remote = {}
625 devs_by_remote = {}
626 for remote, roles in ctx.cluster.remotes.items():
627 devs_by_remote[remote] = teuthology.get_scratch_devices(remote)
628 for osd in [r for r in roles
629 if teuthology.is_type('osd', cluster_name)(r)]:
630 _, _, id_ = teuthology.split_role(osd)
631 id_to_remote[int(id_)] = (osd, remote)
632
633 cur = 0
634 for osd_id in sorted(id_to_remote.keys()):
635 osd, remote = id_to_remote[osd_id]
636 _, _, id_ = teuthology.split_role(osd)
637 assert int(id_) == cur
638 devs = devs_by_remote[remote]
639 assert devs ## FIXME ##
640 dev = devs.pop()
641 if all(_ in dev for _ in ('lv', 'vg')):
642 short_dev = dev.replace('/dev/', '')
643 else:
644 short_dev = dev
645 log.info('Deploying %s on %s with %s...' % (
646 osd, remote.shortname, dev))
647 _shell(ctx, cluster_name, remote, [
648 'ceph-volume', 'lvm', 'zap', dev])
649 _shell(ctx, cluster_name, remote, [
650 'ceph', 'orch', 'daemon', 'add', 'osd',
651 remote.shortname + ':' + short_dev
652 ])
653 ctx.daemons.register_daemon(
654 remote, 'osd', id_,
655 cluster=cluster_name,
656 fsid=fsid,
657 logger=log.getChild(osd),
658 wait=False,
659 started=True,
660 )
661 cur += 1
662
663 yield
664 finally:
665 pass
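# For reference, each OSD above is created by zapping one scratch device and
# then running a command of the form
#   ceph orch daemon add osd <host>:<device>
# where <device> is the popped scratch device (with the '/dev/' prefix stripped
# for LVM-style vg/lv paths); OSD ids are expected to be numbered consecutively,
# hence the `assert int(id_) == cur` check.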
666
667 @contextlib.contextmanager
668 def ceph_mdss(ctx, config):
669 """
670 Deploy MDSs
671 """
672 cluster_name = config['cluster']
673 fsid = ctx.ceph[cluster_name].fsid
674
675 nodes = []
676 daemons = {}
677 for remote, roles in ctx.cluster.remotes.items():
678 for role in [r for r in roles
679 if teuthology.is_type('mds', cluster_name)(r)]:
680 c_, _, id_ = teuthology.split_role(role)
681 log.info('Adding %s on %s' % (role, remote.shortname))
682 nodes.append(remote.shortname + '=' + id_)
683 daemons[role] = (remote, id_)
684 if nodes:
685 _shell(ctx, cluster_name, remote, [
686 'ceph', 'orch', 'apply', 'mds',
687 'all',
688 str(len(nodes)) + ';' + ';'.join(nodes)]
689 )
690 for role, i in daemons.items():
691 remote, id_ = i
692 ctx.daemons.register_daemon(
693 remote, 'mds', id_,
694 cluster=cluster_name,
695 fsid=fsid,
696 logger=log.getChild(role),
697 wait=False,
698 started=True,
699 )
700
701 yield
702
703 @contextlib.contextmanager
704 def ceph_monitoring(daemon_type, ctx, config):
705 """
706 Deploy prometheus, node-exporter, etc.
707 """
708 cluster_name = config['cluster']
709 fsid = ctx.ceph[cluster_name].fsid
710
711 nodes = []
712 daemons = {}
713 for remote, roles in ctx.cluster.remotes.items():
714 for role in [r for r in roles
715 if teuthology.is_type(daemon_type, cluster_name)(r)]:
716 c_, _, id_ = teuthology.split_role(role)
717 log.info('Adding %s on %s' % (role, remote.shortname))
718 nodes.append(remote.shortname + '=' + id_)
719 daemons[role] = (remote, id_)
720 if nodes:
721 _shell(ctx, cluster_name, remote, [
722 'ceph', 'orch', 'apply', daemon_type,
723 str(len(nodes)) + ';' + ';'.join(nodes)]
724 )
725 for role, i in daemons.items():
726 remote, id_ = i
727 ctx.daemons.register_daemon(
728 remote, daemon_type, id_,
729 cluster=cluster_name,
730 fsid=fsid,
731 logger=log.getChild(role),
732 wait=False,
733 started=True,
734 )
735
736 yield
737
738 @contextlib.contextmanager
739 def ceph_rgw(ctx, config):
740 """
741 Deploy rgw
742 """
743 cluster_name = config['cluster']
744 fsid = ctx.ceph[cluster_name].fsid
745
746 nodes = {}
747 daemons = {}
748 for remote, roles in ctx.cluster.remotes.items():
749 for role in [r for r in roles
750 if teuthology.is_type('rgw', cluster_name)(r)]:
751 c_, _, id_ = teuthology.split_role(role)
752 log.info('Adding %s on %s' % (role, remote.shortname))
753 realmzone = '.'.join(id_.split('.')[0:2])
754 if realmzone not in nodes:
755 nodes[realmzone] = []
756 nodes[realmzone].append(remote.shortname + '=' + id_)
757 daemons[role] = (remote, id_)
758
759 for realmzone in nodes.keys():
760 (realm, zone) = realmzone.split('.', 1)
761
762 # TODO: those should be moved to mgr/cephadm
763 _shell(ctx, cluster_name, remote,
764 ['radosgw-admin', 'realm', 'create', '--rgw-realm', realm, '--default']
765 )
766 _shell(ctx, cluster_name, remote,
767 ['radosgw-admin', 'zonegroup', 'create', '--rgw-zonegroup=default', '--master', '--default']
768 )
769 _shell(ctx, cluster_name, remote,
770 ['radosgw-admin', 'zone', 'create', '--rgw-zonegroup=default', '--rgw-zone', zone, '--master', '--default']
771 )
772
773 for realmzone, rz_nodes in nodes.items():
774 (realm, zone) = realmzone.split('.', 1)
775 _shell(ctx, cluster_name, remote, [
776 'ceph', 'orch', 'apply', 'rgw', realm, zone,
777 '--placement',
778 str(len(rz_nodes)) + ';' + ';'.join(rz_nodes)]
779 )
780 for role, i in daemons.items():
781 remote, id_ = i
782 ctx.daemons.register_daemon(
783 remote, 'rgw', id_,
784 cluster=cluster_name,
785 fsid=fsid,
786 logger=log.getChild(role),
787 wait=False,
788 started=True,
789 )
790
791 yield
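# For reference, rgw role ids are expected to encode the realm and zone as their
# first two dot-separated components, e.g. a role like 'rgw.myrealm.myzone.a'
# (illustrative) yields realm 'myrealm' and zone 'myzone'; a realm and zone are
# created (in the 'default' zonegroup) for each such pair before the rgw
# daemons are applied.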
792
793
794 @contextlib.contextmanager
795 def ceph_iscsi(ctx, config):
796 """
797 Deploy iSCSI gateways
798 """
799 cluster_name = config['cluster']
800 fsid = ctx.ceph[cluster_name].fsid
801
802 nodes = []
803 daemons = {}
804 for remote, roles in ctx.cluster.remotes.items():
805 for role in [r for r in roles
806 if teuthology.is_type('iscsi', cluster_name)(r)]:
807 c_, _, id_ = teuthology.split_role(role)
808 log.info('Adding %s on %s' % (role, remote.shortname))
809 nodes.append(remote.shortname + '=' + id_)
810 daemons[role] = (remote, id_)
811 if nodes:
812 poolname = 'iscsi'
813 # ceph osd pool create iscsi 3 3 replicated
814 _shell(ctx, cluster_name, remote, [
815 'ceph', 'osd', 'pool', 'create',
816 poolname, '3', '3', 'replicated']
817 )
818
819 _shell(ctx, cluster_name, remote, [
820 'ceph', 'osd', 'pool', 'application', 'enable',
821 poolname, 'rbd']
822 )
823
824 # ceph orch apply iscsi iscsi user password
825 _shell(ctx, cluster_name, remote, [
826 'ceph', 'orch', 'apply', 'iscsi',
827 poolname, 'user', 'password',
828 '--placement', str(len(nodes)) + ';' + ';'.join(nodes)]
829 )
830 for role, i in daemons.items():
831 remote, id_ = i
832 ctx.daemons.register_daemon(
833 remote, 'iscsi', id_,
834 cluster=cluster_name,
835 fsid=fsid,
836 logger=log.getChild(role),
837 wait=False,
838 started=True,
839 )
840
841 yield
842
843 @contextlib.contextmanager
844 def ceph_clients(ctx, config):
845 cluster_name = config['cluster']
846 testdir = teuthology.get_testdir(ctx)
847
848 log.info('Setting up client nodes...')
849 clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
850 testdir = teuthology.get_testdir(ctx)
851 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
852 for remote, roles_for_host in clients.remotes.items():
853 for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
854 cluster_name):
855 name = teuthology.ceph_role(role)
856 client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name,
857 name)
858 r = _shell(
859 ctx=ctx,
860 cluster_name=cluster_name,
861 remote=remote,
862 args=[
863 'ceph', 'auth',
864 'get-or-create', name,
865 'mon', 'allow *',
866 'osd', 'allow *',
867 'mds', 'allow *',
868 'mgr', 'allow *',
869 ],
870 stdout=StringIO(),
871 )
872 keyring = r.stdout.getvalue()
873 teuthology.sudo_write_file(
874 remote=remote,
875 path=client_keyring,
876 data=keyring,
877 perms='0644'
878 )
879 yield
880
881 @contextlib.contextmanager
882 def ceph_initial():
883 try:
884 yield
885 finally:
886 log.info('Teardown complete')
887
888 ## public methods
889 @contextlib.contextmanager
890 def stop(ctx, config):
891 """
892 Stop ceph daemons
893
894 For example::
895 tasks:
896 - ceph.stop: [mds.*]
897
898 tasks:
899 - ceph.stop: [osd.0, osd.2]
900
901 tasks:
902 - ceph.stop:
903 daemons: [osd.0, osd.2]
904
905 """
906 if config is None:
907 config = {}
908 elif isinstance(config, list):
909 config = {'daemons': config}
910
911 daemons = ctx.daemons.resolve_role_list(
912 config.get('daemons', None), CEPH_ROLE_TYPES, True)
913 clusters = set()
914
915 for role in daemons:
916 cluster, type_, id_ = teuthology.split_role(role)
917 ctx.daemons.get_daemon(type_, id_, cluster).stop()
918 clusters.add(cluster)
919
920 # for cluster in clusters:
921 # ctx.ceph[cluster].watchdog.stop()
922 # ctx.ceph[cluster].watchdog.join()
923
924 yield
925
926 def shell(ctx, config):
927 """
928 Execute (shell) commands
929 """
930 cluster_name = config.get('cluster', 'ceph')
931
932 env = []
933 if 'env' in config:
934 for k in config['env']:
935 env.extend(['-e', k + '=' + ctx.config.get(k, '')])
936 del config['env']
937
938 if 'all' in config and len(config) == 1:
939 a = config['all']
940 roles = teuthology.all_roles(ctx.cluster)
941 config = dict((id_, a) for id_ in roles)
942
943 for role, ls in config.items():
944 (remote,) = ctx.cluster.only(role).remotes.keys()
945 log.info('Running commands on role %s host %s', role, remote.name)
946 for c in ls:
947 _shell(ctx, cluster_name, remote,
948 ['bash', '-c', c],
949 extra_cephadm_args=env)
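# For reference, this is driven by a task block of roughly the following shape
# (role names and commands are illustrative); 'env' lists keys to forward from
# the job config, and the special key 'all' applies the commands to every role:
#
#   tasks:
#   - cephadm.shell:
#       env: [sha1]
#       mon.a:
#         - ceph orch status
#         - ceph orch ps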
950
951 @contextlib.contextmanager
952 def tweaked_option(ctx, config):
953 """
954 set an option, and then restore it with its original value
955
956 Note: because of the way tasks are executed/nested, it is not recommended
957 to use this method as a standalone task; otherwise it is likely to restore
958 the tweaked option at the /end/ of the 'tasks' block.
959 """
960 saved_options = {}
961 # we can complicate this when necessary
962 options = ['mon-health-to-clog']
963 type_, id_ = 'mon', '*'
964 cluster = config.get('cluster', 'ceph')
965 manager = ctx.managers[cluster]
966 if id_ == '*':
967 get_from = next(teuthology.all_roles_of_type(ctx.cluster, type_))
968 else:
969 get_from = id_
970 for option in options:
971 if option not in config:
972 continue
973 value = 'true' if config[option] else 'false'
974 option = option.replace('-', '_')
975 old_value = manager.get_config(type_, get_from, option)
976 if value != old_value:
977 saved_options[option] = old_value
978 manager.inject_args(type_, id_, option, value)
979 yield
980 for option, value in saved_options.items():
981 manager.inject_args(type_, id_, option, value)
982
983 @contextlib.contextmanager
984 def restart(ctx, config):
985 """
986 restart ceph daemons
987
988 For example::
989 tasks:
990 - ceph.restart: [all]
991
992 For example::
993 tasks:
994 - ceph.restart: [osd.0, mon.1, mds.*]
995
996 or::
997
998 tasks:
999 - ceph.restart:
1000 daemons: [osd.0, mon.1]
1001 wait-for-healthy: false
1002 wait-for-osds-up: true
1003
1004 :param ctx: Context
1005 :param config: Configuration
1006 """
1007 if config is None:
1008 config = {}
1009 elif isinstance(config, list):
1010 config = {'daemons': config}
1011
1012 daemons = ctx.daemons.resolve_role_list(
1013 config.get('daemons', None), CEPH_ROLE_TYPES, True)
1014 clusters = set()
1015
1016 log.info('daemons %s' % daemons)
1017 with tweaked_option(ctx, config):
1018 for role in daemons:
1019 cluster, type_, id_ = teuthology.split_role(role)
1020 d = ctx.daemons.get_daemon(type_, id_, cluster)
1021 assert d, 'daemon %s does not exist' % role
1022 d.stop()
1023 if type_ == 'osd':
1024 ctx.managers[cluster].mark_down_osd(id_)
1025 d.restart()
1026 clusters.add(cluster)
1027
1028 if config.get('wait-for-healthy', True):
1029 for cluster in clusters:
1030 healthy(ctx=ctx, config=dict(cluster=cluster))
1031 if config.get('wait-for-osds-up', False):
1032 for cluster in clusters:
1033 ctx.managers[cluster].wait_for_all_osds_up()
1034 yield
1035
1036 @contextlib.contextmanager
1037 def distribute_config_and_admin_keyring(ctx, config):
1038 """
1039 Distribute a sufficient config and keyring for clients
1040 """
1041 cluster_name = config['cluster']
1042 log.info('Distributing (final) config and client.admin keyring...')
1043 for remote, roles in ctx.cluster.remotes.items():
1044 teuthology.sudo_write_file(
1045 remote=remote,
1046 path='/etc/ceph/{}.conf'.format(cluster_name),
1047 data=ctx.ceph[cluster_name].config_file)
1048 teuthology.sudo_write_file(
1049 remote=remote,
1050 path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
1051 data=ctx.ceph[cluster_name].admin_keyring)
1052 try:
1053 yield
1054 finally:
1055 ctx.cluster.run(args=[
1056 'sudo', 'rm', '-f',
1057 '/etc/ceph/{}.conf'.format(cluster_name),
1058 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
1059 ])
1060
1061 @contextlib.contextmanager
1062 def crush_setup(ctx, config):
1063 cluster_name = config['cluster']
1064
1065 profile = config.get('crush_tunables', 'default')
1066 log.info('Setting crush tunables to %s', profile)
1067 _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
1068 args=['ceph', 'osd', 'crush', 'tunables', profile])
1069 yield
1070
1071 @contextlib.contextmanager
1072 def _bypass():
1073 yield
1074
1075 @contextlib.contextmanager
1076 def initialize_config(ctx, config):
1077 cluster_name = config['cluster']
1078 testdir = teuthology.get_testdir(ctx)
1079
1080 ctx.ceph[cluster_name].thrashers = []
1081 # fixme: setup watchdog, ala ceph.py
1082
1083 ctx.ceph[cluster_name].roleless = False # see below
1084
1085 first_ceph_cluster = False
1086 if not hasattr(ctx, 'daemons'):
1087 first_ceph_cluster = True
1088
1089 # cephadm mode?
1090 if 'cephadm_mode' not in config:
1091 config['cephadm_mode'] = 'root'
1092 assert config['cephadm_mode'] in ['root', 'cephadm-package']
1093 if config['cephadm_mode'] == 'root':
1094 ctx.cephadm = testdir + '/cephadm'
1095 else:
1096 ctx.cephadm = 'cephadm' # in the path
1097
1098 if first_ceph_cluster:
1099 # FIXME: this is global for all clusters
1100 ctx.daemons = DaemonGroup(
1101 use_cephadm=ctx.cephadm)
1102
1103 # uuid
1104 fsid = str(uuid.uuid1())
1105 log.info('Cluster fsid is %s' % fsid)
1106 ctx.ceph[cluster_name].fsid = fsid
1107
1108 # mon ips
1109 log.info('Choosing monitor IPs and ports...')
1110 remotes_and_roles = ctx.cluster.remotes.items()
1111 ips = [host for (host, port) in
1112 (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
1113
1114 if config.get('roleless', False):
1115 # mons will be named after hosts
1116 first_mon = None
1117 for remote, _ in remotes_and_roles:
1118 ctx.cluster.remotes[remote].append('mon.' + remote.shortname)
1119 if not first_mon:
1120 first_mon = remote.shortname
1121 bootstrap_remote = remote
1122 log.info('No mon roles; fabricating mons')
1123
1124 roles = [role_list for (remote, role_list) in ctx.cluster.remotes.items()]
1125
1126 ctx.ceph[cluster_name].mons = get_mons(
1127 roles, ips, cluster_name,
1128 mon_bind_msgr2=config.get('mon_bind_msgr2', True),
1129 mon_bind_addrvec=config.get('mon_bind_addrvec', True),
1130 )
1131 log.info('Monitor IPs: %s' % ctx.ceph[cluster_name].mons)
1132
1133 if config.get('roleless', False):
1134 ctx.ceph[cluster_name].roleless = True
1135 ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
1136 ctx.ceph[cluster_name].first_mon = first_mon
1137 ctx.ceph[cluster_name].first_mon_role = 'mon.' + first_mon
1138 else:
1139 first_mon_role = sorted(ctx.ceph[cluster_name].mons.keys())[0]
1140 _, _, first_mon = teuthology.split_role(first_mon_role)
1141 (bootstrap_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
1142 log.info('First mon is mon.%s on %s' % (first_mon,
1143 bootstrap_remote.shortname))
1144 ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
1145 ctx.ceph[cluster_name].first_mon = first_mon
1146 ctx.ceph[cluster_name].first_mon_role = first_mon_role
1147
1148 others = ctx.cluster.remotes[bootstrap_remote]
1149 mgrs = sorted([r for r in others
1150 if teuthology.is_type('mgr', cluster_name)(r)])
1151 if not mgrs:
1152 raise RuntimeError('no mgrs on the same host as first mon %s' % first_mon)
1153 _, _, first_mgr = teuthology.split_role(mgrs[0])
1154 log.info('First mgr is %s' % (first_mgr))
1155 ctx.ceph[cluster_name].first_mgr = first_mgr
1156 yield
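# For reference, initialize_config() above handles (among others) these config
# keys: 'cephadm_mode' ('root' downloads a cephadm script into the testdir,
# 'cephadm-package' expects cephadm already on the PATH), 'roleless' (fabricate
# one mon per host, named after its shortname), and 'mon_bind_msgr2' /
# 'mon_bind_addrvec', which are passed through to get_mons().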
1157
1158 @contextlib.contextmanager
1159 def task(ctx, config):
1160 """
1161 Deploy ceph cluster using cephadm
1162
1163 Set up the containers' registry mirrors before the bootstrap, if the
1164 corresponding config is provided in the teuthology server config yaml file.
1165
1166 For example, teuthology.yaml can contain the 'defaults' section:
1167
1168 defaults:
1169 cephadm:
1170 containers:
1171 registry_mirrors:
1172 docker.io: 'registry.mirror.example.com:5000'
1173 image: 'quay.io/ceph-ci/ceph'
1174
1175 Using overrides makes it possible to customize these settings per run.
1176 The equivalent 'overrides' section looks like:
1177
1178 overrides:
1179 cephadm:
1180 containers:
1181 registry_mirrors:
1182 docker.io: 'registry.mirror.example.com:5000'
1183 image: 'quay.io/ceph-ci/ceph'
1184
1185 :param ctx: the argparse.Namespace object
1186 :param config: the config dict
1187 """
1188 if config is None:
1189 config = {}
1190
1191 assert isinstance(config, dict), \
1192 "task only supports a dictionary for configuration"
1193
1194 overrides = ctx.config.get('overrides', {})
1195 teuthology.deep_merge(config, overrides.get('ceph', {}))
1196 teuthology.deep_merge(config, overrides.get('cephadm', {}))
1197 log.info('Config: ' + str(config))
1198
1199 testdir = teuthology.get_testdir(ctx)
1200
1201 # set up cluster context
1202 if not hasattr(ctx, 'ceph'):
1203 ctx.ceph = {}
1204 ctx.managers = {}
1205 if 'cluster' not in config:
1206 config['cluster'] = 'ceph'
1207 cluster_name = config['cluster']
1208 if cluster_name not in ctx.ceph:
1209 ctx.ceph[cluster_name] = argparse.Namespace()
1210 ctx.ceph[cluster_name].bootstrapped = False
1211
1212 # image
1213 teuth_defaults = teuth_config.get('defaults', {})
1214 cephadm_defaults = teuth_defaults.get('cephadm', {})
1215 containers_defaults = cephadm_defaults.get('containers', {})
1216 mirrors_defaults = containers_defaults.get('registry_mirrors', {})
1217 container_registry_mirror = mirrors_defaults.get('docker.io', None)
1218 container_image_name = containers_defaults.get('image', None)
1219
1220 containers = config.get('containers', {})
1221 mirrors = containers.get('registry_mirrors', {})
1222 container_image_name = containers.get('image', container_image_name)
1223 container_registry_mirror = mirrors.get('docker.io',
1224 container_registry_mirror)
1225
1226
1227 if not hasattr(ctx.ceph[cluster_name], 'image'):
1228 ctx.ceph[cluster_name].image = config.get('image')
1229 ref = None
1230 if not ctx.ceph[cluster_name].image:
1231 if not container_image_name:
1232 raise Exception("Configuration error occurred. "
1233 "The 'image' value is undefined for 'cephadm' task. "
1234 "Please provide corresponding options in the task's "
1235 "config, task 'overrides', or teuthology 'defaults' "
1236 "section.")
1237 sha1 = config.get('sha1')
1238 flavor = config.get('flavor', 'default')
1239
1240 if sha1:
1241 if flavor == "crimson":
1242 ctx.ceph[cluster_name].image = container_image_name + ':' + sha1 + '-' + flavor
1243 else:
1244 ctx.ceph[cluster_name].image = container_image_name + ':' + sha1
1245 ref = sha1
1246 else:
1247 # hmm, fall back to branch?
1248 branch = config.get('branch', 'master')
1249 ref = branch
1250 ctx.ceph[cluster_name].image = container_image_name + ':' + branch
1251 log.info('Cluster image is %s' % ctx.ceph[cluster_name].image)
1252
1253
1254 with contextutil.nested(
1255 # if the cluster is already bootstrapped, bypass the corresponding methods
1256 lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
1257 else initialize_config(ctx=ctx, config=config),
1258 lambda: ceph_initial(),
1259 lambda: normalize_hostnames(ctx=ctx),
1260 lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
1261 else download_cephadm(ctx=ctx, config=config, ref=ref),
1262 lambda: ceph_log(ctx=ctx, config=config),
1263 lambda: ceph_crash(ctx=ctx, config=config),
1264 lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)\
1265 else ceph_bootstrap(ctx, config,
1266 container_registry_mirror),
1267 lambda: crush_setup(ctx=ctx, config=config),
1268 lambda: ceph_mons(ctx=ctx, config=config),
1269 lambda: distribute_config_and_admin_keyring(ctx=ctx, config=config),
1270 lambda: ceph_mgrs(ctx=ctx, config=config),
1271 lambda: ceph_osds(ctx=ctx, config=config),
1272 lambda: ceph_mdss(ctx=ctx, config=config),
1273 lambda: ceph_rgw(ctx=ctx, config=config),
1274 lambda: ceph_iscsi(ctx=ctx, config=config),
1275 lambda: ceph_monitoring('prometheus', ctx=ctx, config=config),
1276 lambda: ceph_monitoring('node-exporter', ctx=ctx, config=config),
1277 lambda: ceph_monitoring('alertmanager', ctx=ctx, config=config),
1278 lambda: ceph_monitoring('grafana', ctx=ctx, config=config),
1279 lambda: ceph_clients(ctx=ctx, config=config),
1280 ):
1281 ctx.managers[cluster_name] = CephManager(
1282 ctx.ceph[cluster_name].bootstrap_remote,
1283 ctx=ctx,
1284 logger=log.getChild('ceph_manager.' + cluster_name),
1285 cluster=cluster_name,
1286 cephadm=True,
1287 )
1288
1289 try:
1290 if config.get('wait-for-healthy', True):
1291 healthy(ctx=ctx, config=config)
1292
1293 log.info('Setup complete, yielding')
1294 yield
1295
1296 finally:
1297 log.info('Teardown begin')
1298
1299
1300 def registries_add_mirror_to_docker_io(conf, mirror):
1301 config = toml.loads(conf)
1302 is_v1 = 'registries' in config
1303 if is_v1:
1304 search = config.get('registries', {}).get('search', {}).get('registries', [])
1305 insecure = config.get('registries', {}).get('search', {}).get('insecure', [])
1306 # ideally `v2: MutableMapping[str, Any] = {`, but the annotation needs Python 3
1307 v2 = {
1308 'unqualified-search-registries': search,
1309 'registry': [
1310 {
1311 'prefix': reg,
1312 'location': reg,
1313 'insecure': reg in insecure,
1314 'blocked': False,
1315 } for reg in search
1316 ]
1317 }
1318 else:
1319 v2 = config # type: ignore
1320 dockers = [
1321 r for r in v2['registry'] if
1322 r.get('prefix') == 'docker.io' or r.get('location') == 'docker.io'
1323 ]
1324 if dockers:
1325 docker = dockers[0]
1326 if 'mirror' not in docker:
1327 docker['mirror'] = [{
1328 "location": mirror,
1329 "insecure": True,
1330 }]
1331 return v2
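# For reference, a v1-style registries.conf handled above looks roughly like
#
#   [registries.search]
#   registries = ['docker.io']
#
# while the v2-style structure returned here serializes (via toml.dumps) to
# roughly (mirror location illustrative):
#
#   unqualified-search-registries = ['docker.io']
#   [[registry]]
#   prefix = 'docker.io'
#   location = 'docker.io'
#   insecure = false
#   blocked = false
#   [[registry.mirror]]
#   location = 'my.mirror.example.com:5000'
#   insecure = true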
1332
1333
1334 def add_mirror_to_cluster(ctx, mirror):
1335 log.info('Adding local image mirror %s' % mirror)
1336
1337 registries_conf = '/etc/containers/registries.conf'
1338
1339 for remote in ctx.cluster.remotes.keys():
1340 try:
1341 config = teuthology.get_file(
1342 remote=remote,
1343 path=registries_conf
1344 )
1345 new_config = toml.dumps(registries_add_mirror_to_docker_io(config.decode('utf-8'), mirror))
1346
1347 teuthology.sudo_write_file(
1348 remote=remote,
1349 path=registries_conf,
1350 data=six.ensure_str(new_config),
1351 )
1352 except IOError as e: # py3: use FileNotFoundError instead.
1353 if e.errno != errno.ENOENT:
1354 raise
1355
1356 # Docker doesn't ship a registries.conf
1357 log.info('Failed to add mirror: %s' % str(e))