ceph/qa/tasks/cephadm.py
1 """
2 Ceph cluster task, deployed via cephadm orchestrator
3 """
4 from io import BytesIO
5
6 import argparse
7 import configobj
8 import contextlib
9 import logging
10 import os
11 import json
12 import re
13 import uuid
14
15 from ceph_manager import CephManager
16 from tarfile import ReadError
17 from teuthology import misc as teuthology
18 from teuthology import contextutil
19 from teuthology.orchestra import run
20 from teuthology.orchestra.daemon import DaemonGroup
21 from teuthology.config import config as teuth_config
22
23 # these items we use from ceph.py should probably eventually move elsewhere
24 from tasks.ceph import get_mons, healthy
25
26 CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw', 'prometheus']
27
28 log = logging.getLogger(__name__)
29
30
31 def _shell(ctx, cluster_name, remote, args, extra_cephadm_args=[], **kwargs):
32 testdir = teuthology.get_testdir(ctx)
33 return remote.run(
34 args=[
35 'sudo',
36 ctx.cephadm,
37 '--image', ctx.ceph[cluster_name].image,
38 'shell',
39 '-c', '/etc/ceph/{}.conf'.format(cluster_name),
40 '-k', '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
41 '--fsid', ctx.ceph[cluster_name].fsid,
42 ] + extra_cephadm_args + [
43 '--',
44 ] + args,
45 **kwargs
46 )
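# For reference, a call such as
#   _shell(ctx, 'ceph', remote, ['ceph', 'orch', 'host', 'ls'])
# runs, roughly, the following on the remote (image, fsid and testdir shown
# here are illustrative):
#   sudo {testdir}/cephadm --image <image> shell \
#       -c /etc/ceph/ceph.conf -k /etc/ceph/ceph.client.admin.keyring \
#       --fsid <fsid> -- ceph orch host ls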
47
48 def build_initial_config(ctx, config):
49 cluster_name = config['cluster']
50
51 path = os.path.join(os.path.dirname(__file__), 'cephadm.conf')
52 conf = configobj.ConfigObj(path, file_error=True)
53
54 conf.setdefault('global', {})
55 conf['global']['fsid'] = ctx.ceph[cluster_name].fsid
56
57 # overrides
58 for section, keys in config.get('conf',{}).items():
59 for key, value in keys.items():
60 log.info(" override: [%s] %s = %s" % (section, key, value))
61 if section not in conf:
62 conf[section] = {}
63 conf[section][key] = value
64
65 return conf
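# The 'conf' overrides consumed above come straight from the task config; a
# minimal sketch of what that looks like in the job yaml (section and option
# names are just examples):
#
#   conf:
#     global:
#       osd pool default size: 2
#     osd:
#       debug osd: 20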
66
67 @contextlib.contextmanager
68 def normalize_hostnames(ctx):
69 """
70 Ensure we have short hostnames throughout, for consistency between
71 remote.shortname and socket.gethostname() in cephadm.
72 """
73 log.info('Normalizing hostnames...')
74 ctx.cluster.run(args=[
75 'sudo',
76 'hostname',
77 run.Raw('$(hostname -s)'),
78 ])
79
80 try:
81 yield
82 finally:
83 pass
84
85 @contextlib.contextmanager
86 def download_cephadm(ctx, config, ref):
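# Fetch the standalone cephadm script onto every node. A sketch of the config
# knobs handled here: 'cephadm_mode' ('root' downloads the script from the
# ceph git repo, optionally at 'cephadm_branch'; 'cephadm-package' assumes the
# distro package already provides cephadm in PATH, so nothing is downloaded).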
87 cluster_name = config['cluster']
88
89 if config.get('cephadm_mode') != 'cephadm-package':
90 ref = config.get('cephadm_branch', ref)
91 git_url = teuth_config.get_ceph_git_url()
92 log.info('Downloading cephadm (repo %s ref %s)...' % (git_url, ref))
93 if git_url.startswith('https://github.com/'):
94 # git archive doesn't like https:// URLs, which we use with github.
95 rest = git_url.split('https://github.com/', 1)[1]
96 rest = re.sub(r'\.git/?$', '', rest).strip() # no .git suffix
97 ctx.cluster.run(
98 args=[
99 'curl', '--silent',
100 'https://raw.githubusercontent.com/' + rest + '/' + ref + '/src/cephadm/cephadm',
101 run.Raw('>'),
102 ctx.cephadm,
103 run.Raw('&&'),
104 'ls', '-l',
105 ctx.cephadm,
106 ],
107 )
108 else:
109 ctx.cluster.run(
110 args=[
111 'git', 'archive',
112 '--remote=' + git_url,
113 ref,
114 'src/cephadm/cephadm',
115 run.Raw('|'),
116 'tar', '-xO', 'src/cephadm/cephadm',
117 run.Raw('>'),
118 ctx.cephadm,
119 ],
120 )
121 # sanity-check the resulting file and set executable bit
122 cephadm_file_size = '$(stat -c%s {})'.format(ctx.cephadm)
123 ctx.cluster.run(
124 args=[
125 'test', '-s', ctx.cephadm,
126 run.Raw('&&'),
127 'test', run.Raw(cephadm_file_size), "-gt", run.Raw('1000'),
128 run.Raw('&&'),
129 'chmod', '+x', ctx.cephadm,
130 ],
131 )
132
133 try:
134 yield
135 finally:
136 log.info('Removing cluster...')
137 ctx.cluster.run(args=[
138 'sudo',
139 ctx.cephadm,
140 'rm-cluster',
141 '--fsid', ctx.ceph[cluster_name].fsid,
142 '--force',
143 ])
144
145 if config.get('cephadm_mode') == 'root':
146 log.info('Removing cephadm ...')
147 ctx.cluster.run(
148 args=[
149 'rm',
150 '-rf',
151 ctx.cephadm,
152 ],
153 )
154
155 @contextlib.contextmanager
156 def ceph_log(ctx, config):
157 cluster_name = config['cluster']
158 fsid = ctx.ceph[cluster_name].fsid
159
160 try:
161 yield
162
163 except Exception:
164 # we need to know this below
165 ctx.summary['success'] = False
166 raise
167
168 finally:
169 log.info('Checking cluster log for badness...')
170 def first_in_ceph_log(pattern, excludes):
171 """
172 Find the first occurrence of the pattern specified in the Ceph log.
173 Returns None if none is found.
174
175 :param pattern: Pattern scanned for.
176 :param excludes: Patterns to ignore.
177 :return: First line of text (or None if not found)
178 """
179 args = [
180 'sudo',
181 'egrep', pattern,
182 '/var/log/ceph/{fsid}/ceph.log'.format(
183 fsid=fsid),
184 ]
185 if excludes:
186 for exclude in excludes:
187 args.extend([run.Raw('|'), 'egrep', '-v', exclude])
188 args.extend([
189 run.Raw('|'), 'head', '-n', '1',
190 ])
191 r = ctx.ceph[cluster_name].bootstrap_remote.run(
192 stdout=BytesIO(),
193 args=args,
194 )
195 stdout = r.stdout.getvalue()
196 if stdout != '':
197 return stdout
198 return None
199
200 if first_in_ceph_log(r'\[ERR\]|\[WRN\]|\[SEC\]',
201 config.get('log-whitelist')) is not None:
202 log.warning('Found errors (ERR|WRN|SEC) in cluster log')
203 ctx.summary['success'] = False
204 # use the most severe problem as the failure reason
205 if 'failure_reason' not in ctx.summary:
206 for pattern in [r'\[SEC\]', r'\[ERR\]', r'\[WRN\]']:
207 match = first_in_ceph_log(pattern, config.get('log-whitelist'))
208 if match is not None:
209 ctx.summary['failure_reason'] = \
210 '"{match}" in cluster log'.format(
211 match=match.rstrip('\n'),
212 )
213 break
214
215 if ctx.archive is not None and \
216 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
217 # compress and archive the logs
218 log.info('Compressing logs...')
219 run.wait(
220 ctx.cluster.run(
221 args=[
222 'sudo',
223 'find',
224 '/var/log/ceph', # all logs, not just for the cluster
225 '-name',
226 '*.log',
227 '-print0',
228 run.Raw('|'),
229 'sudo',
230 'xargs',
231 '-0',
232 '--no-run-if-empty',
233 '--',
234 'gzip',
235 '--',
236 ],
237 wait=False,
238 ),
239 )
240
241 log.info('Archiving logs...')
242 path = os.path.join(ctx.archive, 'remote')
243 try:
244 os.makedirs(path)
245 except OSError:
246 pass
247 for remote in ctx.cluster.remotes.keys():
248 sub = os.path.join(path, remote.name)
249 try:
250 os.makedirs(sub)
251 except OSError:
252 pass
253 teuthology.pull_directory(remote, '/var/log/ceph', # everything
254 os.path.join(sub, 'log'))
255
256 @contextlib.contextmanager
257 def ceph_crash(ctx, config):
258 """
259 Gather crash dumps from /var/lib/ceph/$fsid/crash
260 """
261 cluster_name = config['cluster']
262 fsid = ctx.ceph[cluster_name].fsid
263
264 try:
265 yield
266
267 finally:
268 if ctx.archive is not None:
269 log.info('Archiving crash dumps...')
270 path = os.path.join(ctx.archive, 'remote')
271 try:
272 os.makedirs(path)
273 except OSError:
274 pass
275 for remote in ctx.cluster.remotes.keys():
276 sub = os.path.join(path, remote.name)
277 try:
278 os.makedirs(sub)
279 except OSError:
280 pass
281 try:
282 teuthology.pull_directory(remote,
283 '/var/lib/ceph/%s/crash' % fsid,
284 os.path.join(sub, 'crash'))
285 except ReadError:
286 pass
287
288 @contextlib.contextmanager
289 def ceph_bootstrap(ctx, config):
290 cluster_name = config['cluster']
291 testdir = teuthology.get_testdir(ctx)
292 fsid = ctx.ceph[cluster_name].fsid
293
294 mons = ctx.ceph[cluster_name].mons
295 first_mon_role = sorted(mons.keys())[0]
296 _, _, first_mon = teuthology.split_role(first_mon_role)
297 (bootstrap_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
298 log.info('First mon is mon.%s on %s' % (first_mon,
299 bootstrap_remote.shortname))
300 ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
301 ctx.ceph[cluster_name].first_mon = first_mon
302
303 others = ctx.cluster.remotes[bootstrap_remote]
304 log.info('others %s' % others)
305 mgrs = sorted([r for r in others
306 if teuthology.is_type('mgr', cluster_name)(r)])
307 if not mgrs:
308 raise RuntimeError('no mgrs on the same host as first mon %s' % first_mon)
309 _, _, first_mgr = teuthology.split_role(mgrs[0])
310 log.info('First mgr is %s' % (first_mgr))
311 ctx.ceph[cluster_name].first_mgr = first_mgr
312
313 ctx.cluster.run(args=[
314 'sudo', 'mkdir', '-p', '/etc/ceph',
315 ])
316 ctx.cluster.run(args=[
317 'sudo', 'chmod', '777', '/etc/ceph',
318 ])
319 try:
320 # write seed config
321 log.info('Writing seed config...')
322 conf_fp = BytesIO()
323 seed_config = build_initial_config(ctx, config)
324 seed_config.write(conf_fp)
325 teuthology.write_file(
326 remote=bootstrap_remote,
327 path='{}/seed.{}.conf'.format(testdir, cluster_name),
328 data=conf_fp.getvalue())
329 log.debug('Final config:\n' + conf_fp.getvalue())
330 ctx.ceph[cluster_name].conf = seed_config
331
332 # register initial daemons
333 ctx.daemons.register_daemon(
334 bootstrap_remote, 'mon', first_mon,
335 cluster=cluster_name,
336 fsid=fsid,
337 logger=log.getChild('mon.' + first_mon),
338 wait=False,
339 started=True,
340 )
341 ctx.daemons.register_daemon(
342 bootstrap_remote, 'mgr', first_mgr,
343 cluster=cluster_name,
344 fsid=fsid,
345 logger=log.getChild('mgr.' + first_mgr),
346 wait=False,
347 started=True,
348 )
349
350 # bootstrap
351 log.info('Bootstrapping...')
352 cmd = [
353 'sudo',
354 ctx.cephadm,
355 '--image', ctx.ceph[cluster_name].image,
356 '-v',
357 'bootstrap',
358 '--fsid', fsid,
359 '--mon-id', first_mon,
360 '--mgr-id', first_mgr,
361 '--orphan-initial-daemons', # we will do it explicitly!
362 '--skip-monitoring-stack', # we'll provision these explicitly
363 '--config', '{}/seed.{}.conf'.format(testdir, cluster_name),
364 '--output-config', '/etc/ceph/{}.conf'.format(cluster_name),
365 '--output-keyring',
366 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
367 '--output-pub-ssh-key', '{}/{}.pub'.format(testdir, cluster_name),
368 ]
369 if mons[first_mon_role].startswith('['):
370 cmd += ['--mon-addrv', mons[first_mon_role]]
371 else:
372 cmd += ['--mon-ip', mons[first_mon_role]]
373 if config.get('skip_dashboard'):
374 cmd += ['--skip-dashboard']
375 # bootstrap makes the keyring root 0600, so +r it for our purposes
376 cmd += [
377 run.Raw('&&'),
378 'sudo', 'chmod', '+r',
379 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
380 ]
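# At this point cmd amounts, roughly, to a single invocation like the
# following (values in angle brackets are illustrative):
#   sudo {testdir}/cephadm --image <image> -v bootstrap \
#       --fsid <fsid> --mon-id <first_mon> --mgr-id <first_mgr> \
#       --orphan-initial-daemons --skip-monitoring-stack \
#       --config {testdir}/seed.<cluster>.conf \
#       --output-config /etc/ceph/<cluster>.conf \
#       --output-keyring /etc/ceph/<cluster>.client.admin.keyring \
#       --output-pub-ssh-key {testdir}/<cluster>.pub \
#       --mon-ip <addr>              # or --mon-addrv for an addrvec
# plus an optional --skip-dashboard, followed (via &&) by a chmod +r of the
# admin keyring so the test user can read it.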
381 bootstrap_remote.run(args=cmd)
382
383 # fetch keys and configs
384 log.info('Fetching config...')
385 ctx.ceph[cluster_name].config_file = teuthology.get_file(
386 remote=bootstrap_remote,
387 path='/etc/ceph/{}.conf'.format(cluster_name))
388 log.info('Fetching client.admin keyring...')
389 ctx.ceph[cluster_name].admin_keyring = teuthology.get_file(
390 remote=bootstrap_remote,
391 path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name))
392 log.info('Fetching mon keyring...')
393 ctx.ceph[cluster_name].mon_keyring = teuthology.get_file(
394 remote=bootstrap_remote,
395 path='/var/lib/ceph/%s/mon.%s/keyring' % (fsid, first_mon),
396 sudo=True)
397
398 # fetch ssh key, distribute to additional nodes
399 log.info('Fetching pub ssh key...')
400 ssh_pub_key = teuthology.get_file(
401 remote=bootstrap_remote,
402 path='{}/{}.pub'.format(testdir, cluster_name)
403 ).strip()
404
405 log.info('Installing pub ssh key for root users...')
406 ctx.cluster.run(args=[
407 'sudo', 'install', '-d', '-m', '0700', '/root/.ssh',
408 run.Raw('&&'),
409 'echo', ssh_pub_key,
410 run.Raw('|'),
411 'sudo', 'tee', '-a', '/root/.ssh/authorized_keys',
412 run.Raw('&&'),
413 'sudo', 'chmod', '0600', '/root/.ssh/authorized_keys',
414 ])
415
416 # set options
417 _shell(ctx, cluster_name, bootstrap_remote,
418 ['ceph', 'config', 'set', 'mgr', 'mgr/cephadm/allow_ptrace', 'true'])
419
420 # add other hosts
421 for remote in ctx.cluster.remotes.keys():
422 if remote == bootstrap_remote:
423 continue
424 log.info('Writing conf and keyring to %s' % remote.shortname)
425 teuthology.write_file(
426 remote=remote,
427 path='/etc/ceph/{}.conf'.format(cluster_name),
428 data=ctx.ceph[cluster_name].config_file)
429 teuthology.write_file(
430 remote=remote,
431 path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
432 data=ctx.ceph[cluster_name].admin_keyring)
433
434 log.info('Adding host %s to orchestrator...' % remote.shortname)
435 _shell(ctx, cluster_name, remote, [
436 'ceph', 'orch', 'host', 'add',
437 remote.shortname
438 ])
439 r = _shell(ctx, cluster_name, remote,
440 ['ceph', 'orch', 'host', 'ls', '--format=json'],
441 stdout=BytesIO())
442 hosts = [node['hostname'] for node in json.loads(r.stdout.getvalue())]
443 assert remote.shortname in hosts
444
445 yield
446
447 finally:
448 log.info('Cleaning up testdir ceph.* files...')
449 ctx.cluster.run(args=[
450 'rm', '-f',
451 '{}/seed.{}.conf'.format(testdir, cluster_name),
452 '{}/{}.pub'.format(testdir, cluster_name),
453 ])
454
455 log.info('Stopping all daemons...')
456
457 # this doesn't block until they are all stopped...
458 #ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
459
460 # so, stop them individually
461 for role in ctx.daemons.resolve_role_list(None, CEPH_ROLE_TYPES):
462 cluster, type_, id_ = teuthology.split_role(role)
463 ctx.daemons.get_daemon(type_, id_, cluster).stop()
464
465 # clean up /etc/ceph
466 ctx.cluster.run(args=[
467 'sudo', 'rm', '-f',
468 '/etc/ceph/{}.conf'.format(cluster_name),
469 '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
470 ])
471
472 @contextlib.contextmanager
473 def ceph_mons(ctx, config):
474 """
475 Deploy any additional mons
476 """
477 cluster_name = config['cluster']
478 fsid = ctx.ceph[cluster_name].fsid
479 num_mons = 1
480
481 try:
482 for remote, roles in ctx.cluster.remotes.items():
483 for mon in [r for r in roles
484 if teuthology.is_type('mon', cluster_name)(r)]:
485 c_, _, id_ = teuthology.split_role(mon)
486 if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
487 continue
488 log.info('Adding %s on %s' % (mon, remote.shortname))
489 num_mons += 1
490 _shell(ctx, cluster_name, remote, [
491 'ceph', 'orch', 'daemon', 'add', 'mon',
492 remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_,
493 ])
494 ctx.daemons.register_daemon(
495 remote, 'mon', id_,
496 cluster=cluster_name,
497 fsid=fsid,
498 logger=log.getChild(mon),
499 wait=False,
500 started=True,
501 )
502
503 with contextutil.safe_while(sleep=1, tries=180) as proceed:
504 while proceed():
505 log.info('Waiting for %d mons in monmap...' % (num_mons))
506 r = _shell(
507 ctx=ctx,
508 cluster_name=cluster_name,
509 remote=remote,
510 args=[
511 'ceph', 'mon', 'dump', '-f', 'json',
512 ],
513 stdout=BytesIO(),
514 )
515 j = json.loads(r.stdout.getvalue())
516 if len(j['mons']) == num_mons:
517 break
518
519 # refresh ceph.conf files for all mons + first mgr
520 for remote, roles in ctx.cluster.remotes.items():
521 for mon in [r for r in roles
522 if teuthology.is_type('mon', cluster_name)(r)]:
523 c_, _, id_ = teuthology.split_role(mon)
524 _shell(ctx, cluster_name, remote, [
525 'ceph', 'orch', 'daemon', 'reconfig',
526 'mon.' + id_,
527 ])
528 _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote, [
529 'ceph', 'orch', 'daemon', 'reconfig',
530 'mgr.' + ctx.ceph[cluster_name].first_mgr,
531 ])
532
533 yield
534
535 finally:
536 pass
537
538 @contextlib.contextmanager
539 def ceph_mgrs(ctx, config):
540 """
541 Deploy any additional mgrs
542 """
543 cluster_name = config['cluster']
544 fsid = ctx.ceph[cluster_name].fsid
545
546 try:
547 nodes = []
548 daemons = {}
549 for remote, roles in ctx.cluster.remotes.items():
550 for mgr in [r for r in roles
551 if teuthology.is_type('mgr', cluster_name)(r)]:
552 c_, _, id_ = teuthology.split_role(mgr)
553 if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mgr:
554 continue
555 log.info('Adding %s on %s' % (mgr, remote.shortname))
556 nodes.append(remote.shortname + '=' + id_)
557 daemons[mgr] = (remote, id_)
558 if nodes:
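# The placement spec below has the form '<count>;host1=id1;host2=id2;...'.
# The +1 accounts for the first mgr, which bootstrap already deployed and
# which the loop above skipped.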
559 _shell(ctx, cluster_name, remote, [
560 'ceph', 'orch', 'apply', 'mgr',
561 str(len(nodes) + 1) + ';' + ';'.join(nodes)]
562 )
563 for mgr, i in daemons.items():
564 remote, id_ = i
565 ctx.daemons.register_daemon(
566 remote, 'mgr', id_,
567 cluster=cluster_name,
568 fsid=fsid,
569 logger=log.getChild(mgr),
570 wait=False,
571 started=True,
572 )
573
574 yield
575
576 finally:
577 pass
578
579 @contextlib.contextmanager
580 def ceph_osds(ctx, config):
581 """
582 Deploy OSDs
583 """
584 cluster_name = config['cluster']
585 fsid = ctx.ceph[cluster_name].fsid
586 try:
587 log.info('Deploying OSDs...')
588
589 # provision OSDs in numeric order
590 id_to_remote = {}
591 devs_by_remote = {}
592 for remote, roles in ctx.cluster.remotes.items():
593 devs_by_remote[remote] = teuthology.get_scratch_devices(remote)
594 for osd in [r for r in roles
595 if teuthology.is_type('osd', cluster_name)(r)]:
596 _, _, id_ = teuthology.split_role(osd)
597 id_to_remote[int(id_)] = (osd, remote)
598
599 cur = 0
600 for osd_id in sorted(id_to_remote.keys()):
601 osd, remote = id_to_remote[osd_id]
602 _, _, id_ = teuthology.split_role(osd)
603 assert int(id_) == cur
604 devs = devs_by_remote[remote]
605 assert devs ## FIXME ##
606 dev = devs.pop()
607 short_dev = dev.replace('/dev/', '')
608 log.info('Deploying %s on %s with %s...' % (
609 osd, remote.shortname, dev))
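# zap the device first so leftover LVM/partition metadata from a previous
# run does not keep 'ceph orch daemon add osd' from consuming it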
610 _shell(ctx, cluster_name, remote, [
611 'ceph-volume', 'lvm', 'zap', dev])
612 _shell(ctx, cluster_name, remote, [
613 'ceph', 'orch', 'daemon', 'add', 'osd',
614 remote.shortname + ':' + short_dev
615 ])
616 ctx.daemons.register_daemon(
617 remote, 'osd', id_,
618 cluster=cluster_name,
619 fsid=fsid,
620 logger=log.getChild(osd),
621 wait=False,
622 started=True,
623 )
624 cur += 1
625
626 yield
627 finally:
628 pass
629
630 @contextlib.contextmanager
631 def ceph_mdss(ctx, config):
632 """
633 Deploy MDSs
634 """
635 cluster_name = config['cluster']
636 fsid = ctx.ceph[cluster_name].fsid
637
638 nodes = []
639 daemons = {}
640 for remote, roles in ctx.cluster.remotes.items():
641 for role in [r for r in roles
642 if teuthology.is_type('mds', cluster_name)(r)]:
643 c_, _, id_ = teuthology.split_role(role)
644 log.info('Adding %s on %s' % (role, remote.shortname))
645 nodes.append(remote.shortname + '=' + id_)
646 daemons[role] = (remote, id_)
647 if nodes:
648 _shell(ctx, cluster_name, remote, [
649 'ceph', 'orch', 'apply', 'mds',
650 'all',
651 str(len(nodes)) + ';' + ';'.join(nodes)]
652 )
653 for role, i in daemons.items():
654 remote, id_ = i
655 ctx.daemons.register_daemon(
656 remote, 'mds', id_,
657 cluster=cluster_name,
658 fsid=fsid,
659 logger=log.getChild(role),
660 wait=False,
661 started=True,
662 )
663
664 yield
665
666 @contextlib.contextmanager
667 def ceph_monitoring(daemon_type, ctx, config):
668 """
669 Deploy prometheus, node-exporter, etc.
670 """
671 cluster_name = config['cluster']
672 fsid = ctx.ceph[cluster_name].fsid
673
674 nodes = []
675 daemons = {}
676 for remote, roles in ctx.cluster.remotes.items():
677 for role in [r for r in roles
678 if teuthology.is_type(daemon_type, cluster_name)(r)]:
679 c_, _, id_ = teuthology.split_role(role)
680 log.info('Adding %s on %s' % (role, remote.shortname))
681 nodes.append(remote.shortname + '=' + id_)
682 daemons[role] = (remote, id_)
683 if nodes:
684 _shell(ctx, cluster_name, remote, [
685 'ceph', 'orch', 'apply', daemon_type,
686 str(len(nodes)) + ';' + ';'.join(nodes)]
687 )
688 for role, i in daemons.items():
689 remote, id_ = i
690 ctx.daemons.register_daemon(
691 remote, daemon_type, id_,
692 cluster=cluster_name,
693 fsid=fsid,
694 logger=log.getChild(role),
695 wait=False,
696 started=True,
697 )
698
699 yield
700
701 @contextlib.contextmanager
702 def ceph_rgw(ctx, config):
703 """
704 Deploy rgw
705 """
706 cluster_name = config['cluster']
707 fsid = ctx.ceph[cluster_name].fsid
708
709 nodes = {}
710 daemons = {}
711 for remote, roles in ctx.cluster.remotes.items():
712 for role in [r for r in roles
713 if teuthology.is_type('rgw', cluster_name)(r)]:
714 c_, _, id_ = teuthology.split_role(role)
715 log.info('Adding %s on %s' % (role, remote.shortname))
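# rgw role ids are expected to look like '<realm>.<zone>.<n>'; the first two
# dot-separated components pick the realm/zone pair passed to 'orch apply rgw'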
716 realmzone = '.'.join(id_.split('.')[0:2])
717 if realmzone not in nodes:
718 nodes[realmzone] = []
719 nodes[realmzone].append(remote.shortname + '=' + id_)
720 daemons[role] = (remote, id_)
721 for realmzone, placement in nodes.items():
722 (realm, zone) = realmzone.split('.', 1)
723 _shell(ctx, cluster_name, remote, [
724 'ceph', 'orch', 'apply', 'rgw',
725 realm, zone,
726 str(len(placement)) + ';' + ';'.join(placement)]
727 )
728 for role, i in daemons.items():
729 remote, id_ = i
730 ctx.daemons.register_daemon(
731 remote, 'rgw', id_,
732 cluster=cluster_name,
733 fsid=fsid,
734 logger=log.getChild(role),
735 wait=False,
736 started=True,
737 )
738
739 yield
740
741 @contextlib.contextmanager
742 def ceph_clients(ctx, config):
743 cluster_name = config['cluster']
744 testdir = teuthology.get_testdir(ctx)
745
746 log.info('Setting up client nodes...')
747 clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
748 testdir = teuthology.get_testdir(ctx)
749 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
750 for remote, roles_for_host in clients.remotes.items():
751 for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
752 cluster_name):
753 name = teuthology.ceph_role(role)
754 client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name,
755 name)
756 r = _shell(
757 ctx=ctx,
758 cluster_name=cluster_name,
759 remote=remote,
760 args=[
761 'ceph', 'auth',
762 'get-or-create', name,
763 'mon', 'allow *',
764 'osd', 'allow *',
765 'mds', 'allow *',
766 'mgr', 'allow *',
767 ],
768 stdout=BytesIO(),
769 )
770 keyring = r.stdout.getvalue()
771 teuthology.sudo_write_file(
772 remote=remote,
773 path=client_keyring,
774 data=keyring,
775 perms='0644'
776 )
777 yield
778
779 @contextlib.contextmanager
780 def ceph_initial():
781 try:
782 yield
783 finally:
784 log.info('Teardown complete')
785
786 ## public methods
787 @contextlib.contextmanager
788 def stop(ctx, config):
789 """
790 Stop ceph daemons
791
792 For example::
793 tasks:
794 - ceph.stop: [mds.*]
795
796 tasks:
797 - ceph.stop: [osd.0, osd.2]
798
799 tasks:
800 - ceph.stop:
801 daemons: [osd.0, osd.2]
802
803 """
804 if config is None:
805 config = {}
806 elif isinstance(config, list):
807 config = {'daemons': config}
808
809 daemons = ctx.daemons.resolve_role_list(
810 config.get('daemons', None), CEPH_ROLE_TYPES, True)
811 clusters = set()
812
813 for role in daemons:
814 cluster, type_, id_ = teuthology.split_role(role)
815 ctx.daemons.get_daemon(type_, id_, cluster).stop()
816 clusters.add(cluster)
817
818 # for cluster in clusters:
819 # ctx.ceph[cluster].watchdog.stop()
820 # ctx.ceph[cluster].watchdog.join()
821
822 yield
823
824 def shell(ctx, config):
825 """
826 Execute (shell) commands
827 """
828 cluster_name = config.get('cluster', 'ceph')
829
830 env = []
831 if 'env' in config:
832 for k in config['env']:
833 env.extend(['-e', k + '=' + ctx.config.get(k, '')])
834 del config['env']
835
836 if 'all' in config and len(config) == 1:
837 a = config['all']
838 roles = teuthology.all_roles(ctx.cluster)
839 config = dict((id_, a) for id_ in roles)
840
841 for role, ls in config.items():
842 (remote,) = ctx.cluster.only(role).remotes.keys()
843 log.info('Running commands on role %s host %s', role, remote.name)
844 for c in ls:
845 _shell(ctx, cluster_name, remote,
846 ['bash', '-c', c],
847 extra_cephadm_args=env)
848
849 @contextlib.contextmanager
850 def tweaked_option(ctx, config):
851 """
852 Set an option, and then restore it to its original value.
853
854 Note: due to the way tasks are executed/nested, it is not recommended to use
855 this method as a standalone task; otherwise it will likely restore the
856 tweaked option only at the /end/ of the 'tasks' block.
857 """
858 saved_options = {}
859 # we can complicate this when necessary
860 options = ['mon-health-to-clog']
861 type_, id_ = 'mon', '*'
862 cluster = config.get('cluster', 'ceph')
863 manager = ctx.managers[cluster]
864 if id_ == '*':
865 get_from = next(teuthology.all_roles_of_type(ctx.cluster, type_))
866 else:
867 get_from = id_
868 for option in options:
869 if option not in config:
870 continue
871 value = 'true' if config[option] else 'false'
872 option = option.replace('-', '_')
873 old_value = manager.get_config(type_, get_from, option)
874 if value != old_value:
875 saved_options[option] = old_value
876 manager.inject_args(type_, id_, option, value)
877 yield
878 for option, value in saved_options.items():
879 manager.inject_args(type_, id_, option, value)
880
881 @contextlib.contextmanager
882 def restart(ctx, config):
883 """
884 restart ceph daemons
885
886 For example::
887 tasks:
888 - ceph.restart: [all]
889
890 For example::
891 tasks:
892 - ceph.restart: [osd.0, mon.1, mds.*]
893
894 or::
895
896 tasks:
897 - ceph.restart:
898 daemons: [osd.0, mon.1]
899 wait-for-healthy: false
900 wait-for-osds-up: true
901
902 :param ctx: Context
903 :param config: Configuration
904 """
905 if config is None:
906 config = {}
907 elif isinstance(config, list):
908 config = {'daemons': config}
909
910 daemons = ctx.daemons.resolve_role_list(
911 config.get('daemons', None), CEPH_ROLE_TYPES, True)
912 clusters = set()
913
914 log.info('daemons %s' % daemons)
915 with tweaked_option(ctx, config):
916 for role in daemons:
917 cluster, type_, id_ = teuthology.split_role(role)
918 d = ctx.daemons.get_daemon(type_, id_, cluster)
919 assert d, 'daemon %s does not exist' % role
920 d.stop()
921 if type_ == 'osd':
922 ctx.managers[cluster].mark_down_osd(id_)
923 d.restart()
924 clusters.add(cluster)
925
926 if config.get('wait-for-healthy', True):
927 for cluster in clusters:
928 healthy(ctx=ctx, config=dict(cluster=cluster))
929 if config.get('wait-for-osds-up', False):
930 for cluster in clusters:
931 ctx.managers[cluster].wait_for_all_osds_up()
932 yield
933
934 @contextlib.contextmanager
935 def crush_setup(ctx, config):
936 cluster_name = config['cluster']
937 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
938 (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
939
940 profile = config.get('crush_tunables', 'default')
941 log.info('Setting crush tunables to %s', profile)
942 _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
943 args=['ceph', 'osd', 'crush', 'tunables', profile])
944 yield
945
946 @contextlib.contextmanager
947 def task(ctx, config):
948 if config is None:
949 config = {}
950
951 assert isinstance(config, dict), \
952 "task only supports a dictionary for configuration"
953
954 overrides = ctx.config.get('overrides', {})
955 teuthology.deep_merge(config, overrides.get('ceph', {}))
956 log.info('Config: ' + str(config))
957
958 testdir = teuthology.get_testdir(ctx)
959
960 # set up cluster context
961 first_ceph_cluster = False
962 if not hasattr(ctx, 'daemons'):
963 first_ceph_cluster = True
964 if not hasattr(ctx, 'ceph'):
965 ctx.ceph = {}
966 ctx.managers = {}
967 if 'cluster' not in config:
968 config['cluster'] = 'ceph'
969 cluster_name = config['cluster']
970 ctx.ceph[cluster_name] = argparse.Namespace()
971
972 ctx.ceph[cluster_name].thrashers = []
973 # fixme: setup watchdog, ala ceph.py
974
975 # cephadm mode?
976 if 'cephadm_mode' not in config:
977 config['cephadm_mode'] = 'root'
978 assert config['cephadm_mode'] in ['root', 'cephadm-package']
979 if config['cephadm_mode'] == 'root':
980 ctx.cephadm = testdir + '/cephadm'
981 else:
982 ctx.cephadm = 'cephadm' # in the path
983
984 if first_ceph_cluster:
985 # FIXME: this is global for all clusters
986 ctx.daemons = DaemonGroup(
987 use_cephadm=ctx.cephadm)
988
989 # image
990 ctx.ceph[cluster_name].image = config.get('image')
991 ref = None
992 if not ctx.ceph[cluster_name].image:
993 sha1 = config.get('sha1')
994 if sha1:
995 ctx.ceph[cluster_name].image = 'quay.io/ceph-ci/ceph:%s' % sha1
996 ref = sha1
997 else:
998 # hmm, fall back to branch?
999 branch = config.get('branch', 'master')
1000 ref = branch
1001 ctx.ceph[cluster_name].image = 'quay.io/ceph-ci/ceph:%s' % branch
1002 log.info('Cluster image is %s' % ctx.ceph[cluster_name].image)
1003
1004 # uuid
1005 fsid = str(uuid.uuid1())
1006 log.info('Cluster fsid is %s' % fsid)
1007 ctx.ceph[cluster_name].fsid = fsid
1008
1009 # mon ips
1010 log.info('Choosing monitor IPs and ports...')
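# use the address teuthology's ssh connection sees for each remote (the
# transport peer address) as that host's mon IP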
1011 remotes_and_roles = ctx.cluster.remotes.items()
1012 roles = [role_list for (remote, role_list) in remotes_and_roles]
1013 ips = [host for (host, port) in
1014 (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
1015 ctx.ceph[cluster_name].mons = get_mons(
1016 roles, ips, cluster_name,
1017 mon_bind_msgr2=config.get('mon_bind_msgr2', True),
1018 mon_bind_addrvec=config.get('mon_bind_addrvec', True),
1019 )
1020 log.info('Monitor IPs: %s' % ctx.ceph[cluster_name].mons)
1021
1022 with contextutil.nested(
1023 lambda: ceph_initial(),
1024 lambda: normalize_hostnames(ctx=ctx),
1025 lambda: download_cephadm(ctx=ctx, config=config, ref=ref),
1026 lambda: ceph_log(ctx=ctx, config=config),
1027 lambda: ceph_crash(ctx=ctx, config=config),
1028 lambda: ceph_bootstrap(ctx=ctx, config=config),
1029 lambda: crush_setup(ctx=ctx, config=config),
1030 lambda: ceph_mons(ctx=ctx, config=config),
1031 lambda: ceph_mgrs(ctx=ctx, config=config),
1032 lambda: ceph_osds(ctx=ctx, config=config),
1033 lambda: ceph_mdss(ctx=ctx, config=config),
1034 lambda: ceph_rgw(ctx=ctx, config=config),
1035 lambda: ceph_monitoring('prometheus', ctx=ctx, config=config),
1036 lambda: ceph_monitoring('node-exporter', ctx=ctx, config=config),
1037 lambda: ceph_monitoring('alertmanager', ctx=ctx, config=config),
1038 lambda: ceph_monitoring('grafana', ctx=ctx, config=config),
1039 lambda: ceph_clients(ctx=ctx, config=config),
1040 ):
1041 ctx.managers[cluster_name] = CephManager(
1042 ctx.ceph[cluster_name].bootstrap_remote,
1043 ctx=ctx,
1044 logger=log.getChild('ceph_manager.' + cluster_name),
1045 cluster=cluster_name,
1046 cephadm=True,
1047 )
1048
1049 try:
1050 if config.get('wait-for-healthy', True):
1051 healthy(ctx=ctx, config=config)
1052
1053 log.info('Setup complete, yielding')
1054 yield
1055
1056 finally:
1057 log.info('Teardown begin')