1 """
2 Ceph cluster task, deployed via cephadm orchestrator
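
An illustrative teuthology fragment (values are placeholders; the keys shown
are ones this task actually reads)::

    tasks:
    - cephadm:
        cephadm_mode: root
        image: quay.io/ceph-ci/ceph:octopus
        conf:
          osd:
            debug osd: 20
        log-whitelist:
        - overall HEALTH_
        skip_dashboard: true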
3 """
4 from io import BytesIO
5
6 import argparse
7 import configobj
8 import contextlib
9 import logging
10 import os
11 import json
12 import re
13 import uuid
14
15 from ceph_manager import CephManager
16 from tarfile import ReadError
17 from teuthology import misc as teuthology
18 from teuthology import contextutil
19 from teuthology.orchestra import run
20 from teuthology.orchestra.daemon import DaemonGroup
21 from teuthology.config import config as teuth_config
22
23 # these items we use from ceph.py should probably eventually move elsewhere
24 from tasks.ceph import get_mons, healthy
25
26 CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw', 'prometheus']
27
28 log = logging.getLogger(__name__)
29
30
def _shell(ctx, cluster_name, remote, args, extra_cephadm_args=[], **kwargs):
    """
    Run a command on the given remote inside a 'cephadm shell' container.
    """
    testdir = teuthology.get_testdir(ctx)
    return remote.run(
        args=[
            'sudo',
            ctx.cephadm,
            '--image', ctx.ceph[cluster_name].image,
            'shell',
            '-c', '/etc/ceph/{}.conf'.format(cluster_name),
            '-k', '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
            '--fsid', ctx.ceph[cluster_name].fsid,
        ] + extra_cephadm_args + [
            '--',
        ] + args,
        **kwargs
    )

def build_initial_config(ctx, config):
    cluster_name = config['cluster']

    path = os.path.join(os.path.dirname(__file__), 'cephadm.conf')
    conf = configobj.ConfigObj(path, file_error=True)

    conf.setdefault('global', {})
    conf['global']['fsid'] = ctx.ceph[cluster_name].fsid

    # overrides
    for section, keys in config.get('conf', {}).items():
        for key, value in keys.items():
            log.info(" override: [%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    return conf

@contextlib.contextmanager
def normalize_hostnames(ctx):
    """
    Ensure we have short hostnames throughout, for consistency between
    remote.shortname and socket.gethostname() in cephadm.
    """
    log.info('Normalizing hostnames...')
    ctx.cluster.run(args=[
        'sudo',
        'hostname',
        run.Raw('$(hostname -s)'),
    ])

    try:
        yield
    finally:
        pass

@contextlib.contextmanager
def download_cephadm(ctx, config, ref):
    cluster_name = config['cluster']

    if config.get('cephadm_mode') != 'cephadm-package':
        ref = config.get('cephadm_branch', ref)
        git_url = teuth_config.get_ceph_git_url()
        log.info('Downloading cephadm (repo %s ref %s)...' % (git_url, ref))
        if git_url.startswith('https://github.com/'):
            # git archive doesn't like https:// URLs, which we use with github.
            rest = git_url.split('https://github.com/', 1)[1]
            rest = re.sub(r'\.git/?$', '', rest).strip()  # no .git suffix
            ctx.cluster.run(
                args=[
                    'curl', '--silent',
                    'https://raw.githubusercontent.com/' + rest + '/' + ref + '/src/cephadm/cephadm',
                    run.Raw('>'),
                    ctx.cephadm,
                    run.Raw('&&'),
                    'ls', '-l',
                    ctx.cephadm,
                ],
            )
        else:
            ctx.cluster.run(
                args=[
                    'git', 'archive',
                    '--remote=' + git_url,
                    ref,
                    'src/cephadm/cephadm',
                    run.Raw('|'),
                    'tar', '-xO', 'src/cephadm/cephadm',
                    run.Raw('>'),
                    ctx.cephadm,
                ],
            )
        # sanity-check the resulting file and set executable bit
        cephadm_file_size = '$(stat -c%s {})'.format(ctx.cephadm)
        ctx.cluster.run(
            args=[
                'test', '-s', ctx.cephadm,
                run.Raw('&&'),
                'test', run.Raw(cephadm_file_size), '-gt', run.Raw('1000'),
                run.Raw('&&'),
                'chmod', '+x', ctx.cephadm,
            ],
        )

    try:
        yield
    finally:
        log.info('Removing cluster...')
        ctx.cluster.run(args=[
            'sudo',
            ctx.cephadm,
            'rm-cluster',
            '--fsid', ctx.ceph[cluster_name].fsid,
            '--force',
        ])

        if config.get('cephadm_mode') == 'root':
            log.info('Removing cephadm ...')
            ctx.cluster.run(
                args=[
                    'rm',
                    '-rf',
                    ctx.cephadm,
                ],
            )

@contextlib.contextmanager
def ceph_log(ctx, config):
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        yield

    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise

    finally:
        log.info('Checking cluster log for badness...')
        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurrence of the pattern specified in the Ceph log.
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = [
                'sudo',
                'egrep', pattern,
                '/var/log/ceph/{fsid}/ceph.log'.format(
                    fsid=fsid),
            ]
            if excludes:
                for exclude in excludes:
                    args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                run.Raw('|'), 'head', '-n', '1',
            ])
            r = ctx.ceph[cluster_name].bootstrap_remote.run(
                stdout=BytesIO(),
                args=args,
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        if first_in_ceph_log(r'\[ERR\]|\[WRN\]|\[SEC\]',
                             config.get('log-whitelist')) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in [r'\[SEC\]', r'\[ERR\]', r'\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log-whitelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
                            )
                        break

        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            # compress and archive the logs
            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph',  # all logs, not just for the cluster
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            try:
                os.makedirs(path)
            except OSError:
                pass
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.name)
                try:
                    os.makedirs(sub)
                except OSError:
                    pass
                teuthology.pull_directory(remote, '/var/log/ceph',  # everything
                                          os.path.join(sub, 'log'))

@contextlib.contextmanager
def ceph_crash(ctx, config):
    """
    Gather crash dumps from /var/lib/ceph/$fsid/crash
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        yield

    finally:
        if ctx.archive is not None:
            log.info('Archiving crash dumps...')
            path = os.path.join(ctx.archive, 'remote')
            try:
                os.makedirs(path)
            except OSError:
                pass
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.name)
                try:
                    os.makedirs(sub)
                except OSError:
                    pass
                try:
                    teuthology.pull_directory(remote,
                                              '/var/lib/ceph/%s/crash' % fsid,
                                              os.path.join(sub, 'crash'))
                except ReadError:
                    pass

@contextlib.contextmanager
def ceph_bootstrap(ctx, config):
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)
    fsid = ctx.ceph[cluster_name].fsid

    bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
    first_mon = ctx.ceph[cluster_name].first_mon
    first_mon_role = ctx.ceph[cluster_name].first_mon_role
    mons = ctx.ceph[cluster_name].mons

    ctx.cluster.run(args=[
        'sudo', 'mkdir', '-p', '/etc/ceph',
    ])
    ctx.cluster.run(args=[
        'sudo', 'chmod', '777', '/etc/ceph',
    ])
    try:
        # write seed config
        log.info('Writing seed config...')
        conf_fp = BytesIO()
        seed_config = build_initial_config(ctx, config)
        seed_config.write(conf_fp)
        teuthology.write_file(
            remote=bootstrap_remote,
            path='{}/seed.{}.conf'.format(testdir, cluster_name),
            data=conf_fp.getvalue())
        log.debug('Final config:\n' + conf_fp.getvalue())
        ctx.ceph[cluster_name].conf = seed_config

        # register initial daemons
        ctx.daemons.register_daemon(
            bootstrap_remote, 'mon', first_mon,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild('mon.' + first_mon),
            wait=False,
            started=True,
        )
        if not ctx.ceph[cluster_name].roleless:
            first_mgr = ctx.ceph[cluster_name].first_mgr
            ctx.daemons.register_daemon(
                bootstrap_remote, 'mgr', first_mgr,
                cluster=cluster_name,
                fsid=fsid,
                logger=log.getChild('mgr.' + first_mgr),
                wait=False,
                started=True,
            )

        # bootstrap
        log.info('Bootstrapping...')
        cmd = [
            'sudo',
            ctx.cephadm,
            '--image', ctx.ceph[cluster_name].image,
            '-v',
            'bootstrap',
            '--fsid', fsid,
            '--config', '{}/seed.{}.conf'.format(testdir, cluster_name),
            '--output-config', '/etc/ceph/{}.conf'.format(cluster_name),
            '--output-keyring',
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
            '--output-pub-ssh-key', '{}/{}.pub'.format(testdir, cluster_name),
        ]
        if not ctx.ceph[cluster_name].roleless:
            cmd += [
                '--mon-id', first_mon,
                '--mgr-id', first_mgr,
                '--orphan-initial-daemons',  # we will do it explicitly!
                '--skip-monitoring-stack',   # we'll provision these explicitly
            ]
        if mons[first_mon_role].startswith('['):
            cmd += ['--mon-addrv', mons[first_mon_role]]
        else:
            cmd += ['--mon-ip', mons[first_mon_role]]
        if config.get('skip_dashboard'):
            cmd += ['--skip-dashboard']
        # bootstrap makes the keyring root 0600, so +r it for our purposes
        cmd += [
            run.Raw('&&'),
            'sudo', 'chmod', '+r',
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
        ]
        bootstrap_remote.run(args=cmd)

        # fetch keys and configs
        log.info('Fetching config...')
        ctx.ceph[cluster_name].config_file = teuthology.get_file(
            remote=bootstrap_remote,
            path='/etc/ceph/{}.conf'.format(cluster_name))
        log.info('Fetching client.admin keyring...')
        ctx.ceph[cluster_name].admin_keyring = teuthology.get_file(
            remote=bootstrap_remote,
            path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name))
        log.info('Fetching mon keyring...')
        ctx.ceph[cluster_name].mon_keyring = teuthology.get_file(
            remote=bootstrap_remote,
            path='/var/lib/ceph/%s/mon.%s/keyring' % (fsid, first_mon),
            sudo=True)

        # fetch ssh key, distribute to additional nodes
        log.info('Fetching pub ssh key...')
        ssh_pub_key = teuthology.get_file(
            remote=bootstrap_remote,
            path='{}/{}.pub'.format(testdir, cluster_name)
        ).strip()

        log.info('Installing pub ssh key for root users...')
        ctx.cluster.run(args=[
            'sudo', 'install', '-d', '-m', '0700', '/root/.ssh',
            run.Raw('&&'),
            'echo', ssh_pub_key,
            run.Raw('|'),
            'sudo', 'tee', '-a', '/root/.ssh/authorized_keys',
            run.Raw('&&'),
            'sudo', 'chmod', '0600', '/root/.ssh/authorized_keys',
        ])

        # set options
        _shell(ctx, cluster_name, bootstrap_remote,
               ['ceph', 'config', 'set', 'mgr', 'mgr/cephadm/allow_ptrace', 'true'])

        # add other hosts
        for remote in ctx.cluster.remotes.keys():
            if remote == bootstrap_remote:
                continue
            log.info('Writing (initial) conf and keyring to %s' % remote.shortname)
            teuthology.write_file(
                remote=remote,
                path='/etc/ceph/{}.conf'.format(cluster_name),
                data=ctx.ceph[cluster_name].config_file)
            teuthology.write_file(
                remote=remote,
                path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
                data=ctx.ceph[cluster_name].admin_keyring)

            log.info('Adding host %s to orchestrator...' % remote.shortname)
            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'host', 'add',
                remote.shortname
            ])
            r = _shell(ctx, cluster_name, remote,
                       ['ceph', 'orch', 'host', 'ls', '--format=json'],
                       stdout=BytesIO())
            hosts = [node['hostname'] for node in json.loads(r.stdout.getvalue())]
            assert remote.shortname in hosts

        yield

    finally:
        log.info('Cleaning up testdir ceph.* files...')
        ctx.cluster.run(args=[
            'rm', '-f',
            '{}/seed.{}.conf'.format(testdir, cluster_name),
            '{}/{}.pub'.format(testdir, cluster_name),
        ])

        log.info('Stopping all daemons...')

        # this doesn't block until they are all stopped...
        # ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])

        # so, stop them individually
        for role in ctx.daemons.resolve_role_list(None, CEPH_ROLE_TYPES):
            cluster, type_, id_ = teuthology.split_role(role)
            ctx.daemons.get_daemon(type_, id_, cluster).stop()

        # clean up /etc/ceph
        ctx.cluster.run(args=[
            'sudo', 'rm', '-f',
            '/etc/ceph/{}.conf'.format(cluster_name),
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
        ])

@contextlib.contextmanager
def ceph_mons(ctx, config):
    """
    Deploy any additional mons
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid
    num_mons = 1

    try:
        for remote, roles in ctx.cluster.remotes.items():
            for mon in [r for r in roles
                        if teuthology.is_type('mon', cluster_name)(r)]:
                c_, _, id_ = teuthology.split_role(mon)
                if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
                    continue
                log.info('Adding %s on %s' % (mon, remote.shortname))
                num_mons += 1
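                # pass '<host>:<addr-or-addrvec>=<name>' to 'ceph orch daemon
                # add mon', reusing the address task() already chose for this role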
                _shell(ctx, cluster_name, remote, [
                    'ceph', 'orch', 'daemon', 'add', 'mon',
                    remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_,
                ])
                ctx.daemons.register_daemon(
                    remote, 'mon', id_,
                    cluster=cluster_name,
                    fsid=fsid,
                    logger=log.getChild(mon),
                    wait=False,
                    started=True,
                )

        with contextutil.safe_while(sleep=1, tries=180) as proceed:
            while proceed():
                log.info('Waiting for %d mons in monmap...' % (num_mons))
                r = _shell(
                    ctx=ctx,
                    cluster_name=cluster_name,
                    remote=remote,
                    args=[
                        'ceph', 'mon', 'dump', '-f', 'json',
                    ],
                    stdout=BytesIO(),
                )
                j = json.loads(r.stdout.getvalue())
                if len(j['mons']) == num_mons:
                    break

        # refresh our (final) ceph.conf file
        log.info('Generating final ceph.conf file...')
        r = _shell(
            ctx=ctx,
            cluster_name=cluster_name,
            remote=remote,
            args=[
                'ceph', 'config', 'generate-minimal-conf',
            ],
            stdout=BytesIO(),
        )
        ctx.ceph[cluster_name].config_file = r.stdout.getvalue()

        yield

    finally:
        pass

@contextlib.contextmanager
def ceph_mgrs(ctx, config):
    """
    Deploy any additional mgrs
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        nodes = []
        daemons = {}
        for remote, roles in ctx.cluster.remotes.items():
            for mgr in [r for r in roles
                        if teuthology.is_type('mgr', cluster_name)(r)]:
                c_, _, id_ = teuthology.split_role(mgr)
                if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mgr:
                    continue
                log.info('Adding %s on %s' % (mgr, remote.shortname))
                nodes.append(remote.shortname + '=' + id_)
                daemons[mgr] = (remote, id_)
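        # 'ceph orch apply mgr' is given a placement spec of the form
        # '<count>;<host1>=<name1>;<host2>=<name2>...'; the count is
        # len(nodes) + 1 so the bootstrap mgr (skipped above) is included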
        if nodes:
            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'apply', 'mgr',
                str(len(nodes) + 1) + ';' + ';'.join(nodes)]
            )
        for mgr, i in daemons.items():
            remote, id_ = i
            ctx.daemons.register_daemon(
                remote, 'mgr', id_,
                cluster=cluster_name,
                fsid=fsid,
                logger=log.getChild(mgr),
                wait=False,
                started=True,
            )

        yield

    finally:
        pass

@contextlib.contextmanager
def ceph_osds(ctx, config):
    """
    Deploy OSDs
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        log.info('Deploying OSDs...')

        # provision OSDs in numeric order
        id_to_remote = {}
        devs_by_remote = {}
        for remote, roles in ctx.cluster.remotes.items():
            devs_by_remote[remote] = teuthology.get_scratch_devices(remote)
            for osd in [r for r in roles
                        if teuthology.is_type('osd', cluster_name)(r)]:
                _, _, id_ = teuthology.split_role(osd)
                id_to_remote[int(id_)] = (osd, remote)

        cur = 0
        for osd_id in sorted(id_to_remote.keys()):
            osd, remote = id_to_remote[osd_id]
            _, _, id_ = teuthology.split_role(osd)
            assert int(id_) == cur
            devs = devs_by_remote[remote]
            assert devs  ## FIXME ##
            dev = devs.pop()
            short_dev = dev.replace('/dev/', '')
            log.info('Deploying %s on %s with %s...' % (
                osd, remote.shortname, dev))
            _shell(ctx, cluster_name, remote, [
                'ceph-volume', 'lvm', 'zap', dev])
            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'daemon', 'add', 'osd',
                remote.shortname + ':' + short_dev
            ])
            ctx.daemons.register_daemon(
                remote, 'osd', id_,
                cluster=cluster_name,
                fsid=fsid,
                logger=log.getChild(osd),
                wait=False,
                started=True,
            )
            cur += 1

        yield
    finally:
        pass

@contextlib.contextmanager
def ceph_mdss(ctx, config):
    """
    Deploy MDSs
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = []
    daemons = {}
    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type('mds', cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
            nodes.append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)
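    # 'all' below is the service name argument to 'ceph orch apply mds'; the
    # final argument is a placement spec of the form '<count>;<host1>=<id1>;...'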
    if nodes:
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', 'mds',
            'all',
            str(len(nodes)) + ';' + ';'.join(nodes)]
        )
    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, 'mds', id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield

@contextlib.contextmanager
def ceph_monitoring(daemon_type, ctx, config):
    """
    Deploy prometheus, node-exporter, etc.
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = []
    daemons = {}
    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type(daemon_type, cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
            nodes.append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)
    if nodes:
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', daemon_type,
            str(len(nodes)) + ';' + ';'.join(nodes)]
        )
    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, daemon_type, id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield

@contextlib.contextmanager
def ceph_rgw(ctx, config):
    """
    Deploy rgw
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = {}
    daemons = {}
    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type('rgw', cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
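            # rgw role ids are expected to look like '<realm>.<zone>[.<suffix>]';
            # group daemons by realm/zone so each pair gets one 'ceph orch apply rgw'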
            realmzone = '.'.join(id_.split('.')[0:2])
            if realmzone not in nodes:
                nodes[realmzone] = []
            nodes[realmzone].append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)
    for realmzone, nodes in nodes.items():
        (realm, zone) = realmzone.split('.', 1)
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', 'rgw',
            realm, zone,
            str(len(nodes)) + ';' + ';'.join(nodes)]
        )
    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, 'rgw', id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield

@contextlib.contextmanager
def ceph_clients(ctx, config):
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)

    log.info('Setting up client nodes...')
    clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
    for remote, roles_for_host in clients.remotes.items():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
                                                     cluster_name):
            name = teuthology.ceph_role(role)
            client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name,
                                                                name)
            r = _shell(
                ctx=ctx,
                cluster_name=cluster_name,
                remote=remote,
                args=[
                    'ceph', 'auth',
                    'get-or-create', name,
                    'mon', 'allow *',
                    'osd', 'allow *',
                    'mds', 'allow *',
                    'mgr', 'allow *',
                ],
                stdout=BytesIO(),
            )
            keyring = r.stdout.getvalue()
            teuthology.sudo_write_file(
                remote=remote,
                path=client_keyring,
                data=keyring,
                perms='0644'
            )
    yield

@contextlib.contextmanager
def ceph_initial():
    try:
        yield
    finally:
        log.info('Teardown complete')


## public methods
@contextlib.contextmanager
def stop(ctx, config):
    """
    Stop ceph daemons

    For example::
      tasks:
      - ceph.stop: [mds.*]

      tasks:
      - ceph.stop: [osd.0, osd.2]

      tasks:
      - ceph.stop:
          daemons: [osd.0, osd.2]

    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(
        config.get('daemons', None), CEPH_ROLE_TYPES, True)
    clusters = set()

    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).stop()
        clusters.add(cluster)

    # for cluster in clusters:
    #    ctx.ceph[cluster].watchdog.stop()
    #    ctx.ceph[cluster].watchdog.join()

    yield


def shell(ctx, config):
    """
    Execute (shell) commands
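
    For example (illustrative only; each mapping key is a role and its list
    entries are commands run via 'cephadm shell' on the host carrying that
    role; 'env' names job-config keys that are forwarded to cephadm as
    '-e NAME=VALUE' arguments -- 'sha1' is just an example of such a key)::

      tasks:
      - cephadm.shell:
          env: [sha1]
          mon.a:
          - ceph orch ls
          - ceph orch host ls

    A key of 'all' (and nothing else) expands the command list to every role
    in the cluster.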
818 """
819 cluster_name = config.get('cluster', 'ceph')
820
821 env = []
822 if 'env' in config:
823 for k in config['env']:
824 env.extend(['-e', k + '=' + ctx.config.get(k, '')])
825 del config['env']
826
827 if 'all' in config and len(config) == 1:
828 a = config['all']
829 roles = teuthology.all_roles(ctx.cluster)
830 config = dict((id_, a) for id_ in roles)
831
832 for role, ls in config.items():
833 (remote,) = ctx.cluster.only(role).remotes.keys()
834 log.info('Running commands on role %s host %s', role, remote.name)
835 for c in ls:
836 _shell(ctx, cluster_name, remote,
837 ['bash', '-c', c],
838 extra_cephadm_args=env)
839
840 @contextlib.contextmanager
841 def tweaked_option(ctx, config):
842 """
843 set an option, and then restore it with its original value
844
845 Note, due to the way how tasks are executed/nested, it's not suggested to
846 use this method as a standalone task. otherwise, it's likely that it will
847 restore the tweaked option at the /end/ of 'tasks' block.
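
    For example, as part of a restart (illustrative; 'mon-health-to-clog' is
    currently the only option handled here)::

      tasks:
      - ceph.restart:
          daemons: [mon.a]
          mon-health-to-clog: false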
848 """
849 saved_options = {}
850 # we can complicate this when necessary
851 options = ['mon-health-to-clog']
852 type_, id_ = 'mon', '*'
853 cluster = config.get('cluster', 'ceph')
854 manager = ctx.managers[cluster]
855 if id_ == '*':
856 get_from = next(teuthology.all_roles_of_type(ctx.cluster, type_))
857 else:
858 get_from = id_
859 for option in options:
860 if option not in config:
861 continue
862 value = 'true' if config[option] else 'false'
863 option = option.replace('-', '_')
864 old_value = manager.get_config(type_, get_from, option)
865 if value != old_value:
866 saved_options[option] = old_value
867 manager.inject_args(type_, id_, option, value)
868 yield
869 for option, value in saved_options.items():
870 manager.inject_args(type_, id_, option, value)
871
872 @contextlib.contextmanager
873 def restart(ctx, config):
874 """
875 restart ceph daemons
876
877 For example::
878 tasks:
879 - ceph.restart: [all]
880
881 For example::
882 tasks:
883 - ceph.restart: [osd.0, mon.1, mds.*]
884
885 or::
886
887 tasks:
888 - ceph.restart:
889 daemons: [osd.0, mon.1]
890 wait-for-healthy: false
891 wait-for-osds-up: true
892
893 :param ctx: Context
894 :param config: Configuration
895 """
896 if config is None:
897 config = {}
898 elif isinstance(config, list):
899 config = {'daemons': config}
900
901 daemons = ctx.daemons.resolve_role_list(
902 config.get('daemons', None), CEPH_ROLE_TYPES, True)
903 clusters = set()
904
905 log.info('daemons %s' % daemons)
906 with tweaked_option(ctx, config):
907 for role in daemons:
908 cluster, type_, id_ = teuthology.split_role(role)
909 d = ctx.daemons.get_daemon(type_, id_, cluster)
910 assert d, 'daemon %s does not exist' % role
911 d.stop()
912 if type_ == 'osd':
913 ctx.managers[cluster].mark_down_osd(id_)
914 d.restart()
915 clusters.add(cluster)
916
917 if config.get('wait-for-healthy', True):
918 for cluster in clusters:
919 healthy(ctx=ctx, config=dict(cluster=cluster))
920 if config.get('wait-for-osds-up', False):
921 for cluster in clusters:
922 ctx.managers[cluster].wait_for_all_osds_up()
923 yield
924
@contextlib.contextmanager
def distribute_config_and_admin_keyring(ctx, config):
    """
    Distribute a sufficient config and keyring for clients
    """
    cluster_name = config['cluster']
    log.info('Distributing (final) config and client.admin keyring...')
    for remote, roles in ctx.cluster.remotes.items():
        teuthology.sudo_write_file(
            remote=remote,
            path='/etc/ceph/{}.conf'.format(cluster_name),
            data=ctx.ceph[cluster_name].config_file)
        teuthology.sudo_write_file(
            remote=remote,
            path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
            data=ctx.ceph[cluster_name].admin_keyring)
    try:
        yield
    finally:
        ctx.cluster.run(args=[
            'sudo', 'rm', '-f',
            '/etc/ceph/{}.conf'.format(cluster_name),
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
        ])

@contextlib.contextmanager
def crush_setup(ctx, config):
    cluster_name = config['cluster']

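    # the task config may select a tunables profile via 'crush_tunables'
    # (e.g. 'jewel'); 'default' is used otherwise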
    profile = config.get('crush_tunables', 'default')
    log.info('Setting crush tunables to %s', profile)
    _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
           args=['ceph', 'osd', 'crush', 'tunables', profile])
    yield

@contextlib.contextmanager
def task(ctx, config):
    if config is None:
        config = {}

    assert isinstance(config, dict), \
        "task only supports a dictionary for configuration"

    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph', {}))
    log.info('Config: ' + str(config))

    testdir = teuthology.get_testdir(ctx)

    # set up cluster context
    first_ceph_cluster = False
    if not hasattr(ctx, 'daemons'):
        first_ceph_cluster = True
    if not hasattr(ctx, 'ceph'):
        ctx.ceph = {}
        ctx.managers = {}
    if 'cluster' not in config:
        config['cluster'] = 'ceph'
    cluster_name = config['cluster']
    ctx.ceph[cluster_name] = argparse.Namespace()

    ctx.ceph[cluster_name].thrashers = []
    # fixme: setup watchdog, ala ceph.py

    ctx.ceph[cluster_name].roleless = False  # see below

    # cephadm mode?
    if 'cephadm_mode' not in config:
        config['cephadm_mode'] = 'root'
    assert config['cephadm_mode'] in ['root', 'cephadm-package']
    if config['cephadm_mode'] == 'root':
        ctx.cephadm = testdir + '/cephadm'
    else:
        ctx.cephadm = 'cephadm'  # in the path

    if first_ceph_cluster:
        # FIXME: this is global for all clusters
        ctx.daemons = DaemonGroup(
            use_cephadm=ctx.cephadm)

    # image
    ctx.ceph[cluster_name].image = config.get('image')
    ref = None
    if not ctx.ceph[cluster_name].image:
        sha1 = config.get('sha1')
        if sha1:
            ctx.ceph[cluster_name].image = 'quay.io/ceph-ci/ceph:%s' % sha1
            ref = sha1
        else:
            # hmm, fall back to branch?
            branch = config.get('branch', 'master')
            ref = branch
            ctx.ceph[cluster_name].image = 'quay.io/ceph-ci/ceph:%s' % branch
    log.info('Cluster image is %s' % ctx.ceph[cluster_name].image)

    # uuid
    fsid = str(uuid.uuid1())
    log.info('Cluster fsid is %s' % fsid)
    ctx.ceph[cluster_name].fsid = fsid

    # mon ips
    log.info('Choosing monitor IPs and ports...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [role_list for (remote, role_list) in remotes_and_roles]
    ips = [host for (host, port) in
           (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]

    if config.get('roleless', False):
        # mons will be named after hosts
        n = len(roles)
        roles = []
        first_mon = None
        for remote, _ in remotes_and_roles:
            roles.append(['mon.' + remote.shortname])
            if not first_mon:
                first_mon = remote.shortname
                bootstrap_remote = remote
        log.info('No roles; fabricating mons %s' % roles)

    ctx.ceph[cluster_name].mons = get_mons(
        roles, ips, cluster_name,
        mon_bind_msgr2=config.get('mon_bind_msgr2', True),
        mon_bind_addrvec=config.get('mon_bind_addrvec', True),
    )
    log.info('Monitor IPs: %s' % ctx.ceph[cluster_name].mons)

    if config.get('roleless', False):
        ctx.ceph[cluster_name].roleless = True
        ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
        ctx.ceph[cluster_name].first_mon = first_mon
        ctx.ceph[cluster_name].first_mon_role = 'mon.' + first_mon
    else:
        first_mon_role = sorted(ctx.ceph[cluster_name].mons.keys())[0]
        _, _, first_mon = teuthology.split_role(first_mon_role)
        (bootstrap_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
        log.info('First mon is mon.%s on %s' % (first_mon,
                                                bootstrap_remote.shortname))
        ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
        ctx.ceph[cluster_name].first_mon = first_mon
        ctx.ceph[cluster_name].first_mon_role = first_mon_role

        others = ctx.cluster.remotes[bootstrap_remote]
        mgrs = sorted([r for r in others
                       if teuthology.is_type('mgr', cluster_name)(r)])
        if not mgrs:
            raise RuntimeError('no mgrs on the same host as first mon %s' % first_mon)
        _, _, first_mgr = teuthology.split_role(mgrs[0])
        log.info('First mgr is %s' % (first_mgr))
        ctx.ceph[cluster_name].first_mgr = first_mgr


    with contextutil.nested(
            lambda: ceph_initial(),
            lambda: normalize_hostnames(ctx=ctx),
            lambda: download_cephadm(ctx=ctx, config=config, ref=ref),
            lambda: ceph_log(ctx=ctx, config=config),
            lambda: ceph_crash(ctx=ctx, config=config),
            lambda: ceph_bootstrap(ctx=ctx, config=config),
            lambda: crush_setup(ctx=ctx, config=config),
            lambda: ceph_mons(ctx=ctx, config=config),
            lambda: distribute_config_and_admin_keyring(ctx=ctx, config=config),
            lambda: ceph_mgrs(ctx=ctx, config=config),
            lambda: ceph_osds(ctx=ctx, config=config),
            lambda: ceph_mdss(ctx=ctx, config=config),
            lambda: ceph_rgw(ctx=ctx, config=config),
            lambda: ceph_monitoring('prometheus', ctx=ctx, config=config),
            lambda: ceph_monitoring('node-exporter', ctx=ctx, config=config),
            lambda: ceph_monitoring('alertmanager', ctx=ctx, config=config),
            lambda: ceph_monitoring('grafana', ctx=ctx, config=config),
            lambda: ceph_clients(ctx=ctx, config=config),
    ):
        ctx.managers[cluster_name] = CephManager(
            ctx.ceph[cluster_name].bootstrap_remote,
            ctx=ctx,
            logger=log.getChild('ceph_manager.' + cluster_name),
            cluster=cluster_name,
            cephadm=True,
        )

        try:
            if config.get('wait-for-healthy', True):
                healthy(ctx=ctx, config=config)

            log.info('Setup complete, yielding')
            yield

        finally:
            log.info('Teardown begin')