1"""
2Ceph cluster task, deployed via cephadm orchestrator
3"""
9f95a23c
TL
4import argparse
5import configobj
6import contextlib
7import logging
8import os
9import json
10import re
11import uuid
f91f0fd5 12import yaml
9f95a23c 13
20effc67 14from copy import deepcopy
f67539c2 15from io import BytesIO, StringIO
9f95a23c 16from tarfile import ReadError
e306af50 17from tasks.ceph_manager import CephManager
9f95a23c
TL
18from teuthology import misc as teuthology
19from teuthology import contextutil
20from teuthology.orchestra import run
21from teuthology.orchestra.daemon import DaemonGroup
22from teuthology.config import config as teuth_config
20effc67
TL
23from textwrap import dedent
24from tasks.cephfs.filesystem import MDSCluster, Filesystem
9f95a23c
TL
25
26# these items we use from ceph.py should probably eventually move elsewhere
27from tasks.ceph import get_mons, healthy
f67539c2 28from tasks.vip import subst_vip
9f95a23c
TL
29
30CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw', 'prometheus']
31
32log = logging.getLogger(__name__)
33
34
35def _shell(ctx, cluster_name, remote, args, extra_cephadm_args=[], **kwargs):
f67539c2 36 teuthology.get_testdir(ctx)
9f95a23c
TL
37 return remote.run(
38 args=[
39 'sudo',
40 ctx.cephadm,
41 '--image', ctx.ceph[cluster_name].image,
42 'shell',
43 '-c', '/etc/ceph/{}.conf'.format(cluster_name),
44 '-k', '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
45 '--fsid', ctx.ceph[cluster_name].fsid,
46 ] + extra_cephadm_args + [
47 '--',
48 ] + args,
49 **kwargs
50 )
51
b3b6e05e 52
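

# Illustrative only (not part of the upstream task): a typical call of the
# helper above looks something like
#
#     _shell(ctx, 'ceph', ctx.ceph['ceph'].bootstrap_remote,
#            ['ceph', 'orch', 'status'], stdout=StringIO())
#
# which runs `ceph orch status` inside a `cephadm shell` container on that
# remote, using the cluster's conf, admin keyring and fsid.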


def build_initial_config(ctx, config):
    cluster_name = config['cluster']

    path = os.path.join(os.path.dirname(__file__), 'cephadm.conf')
    conf = configobj.ConfigObj(path, file_error=True)

    conf.setdefault('global', {})
    conf['global']['fsid'] = ctx.ceph[cluster_name].fsid

    # overrides
    for section, keys in config.get('conf', {}).items():
        for key, value in keys.items():
            log.info(" override: [%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    return conf
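

# Illustrative only: the `conf` overrides consumed above are usually supplied
# in the job yaml, for example
#
#     tasks:
#     - cephadm:
#         conf:
#           osd:
#             osd shutdown pgref assert: true
#
# Each section/key pair is merged into the seed config before bootstrap.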


def distribute_iscsi_gateway_cfg(ctx, conf_data):
    """
    Distribute a common gateway config containing the gateway IPs.
    This helps iSCSI clients find the trusted_ip_list.
    """
    log.info('Distributing iscsi-gateway.cfg...')
    for remote, roles in ctx.cluster.remotes.items():
        remote.write_file(
            path='/etc/ceph/iscsi-gateway.cfg',
            data=conf_data,
            sudo=True)


def update_archive_setting(ctx, key, value):
    """
    Record an archive directory (e.g. the logs directory) in the job's
    info.yaml file.
    """
    if ctx.archive is None:
        return
    with open(os.path.join(ctx.archive, 'info.yaml'), 'r+') as info_file:
        info_yaml = yaml.safe_load(info_file)
        info_file.seek(0)
        if 'archive' in info_yaml:
            info_yaml['archive'][key] = value
        else:
            info_yaml['archive'] = {key: value}
        yaml.safe_dump(info_yaml, info_file, default_flow_style=False)
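

# Illustrative only: after update_archive_setting(ctx, 'log', '/var/log/ceph')
# the job's info.yaml would carry an entry roughly like
#
#     archive:
#       log: /var/log/ceph
#
# which downstream tooling can use to find directories worth archiving.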


@contextlib.contextmanager
def normalize_hostnames(ctx):
    """
    Ensure we have short hostnames throughout, for consistency between
    remote.shortname and socket.gethostname() in cephadm.
    """
    log.info('Normalizing hostnames...')
    ctx.cluster.run(args=[
        'sudo',
        'hostname',
        run.Raw('$(hostname -s)'),
    ])

    try:
        yield
    finally:
        pass


@contextlib.contextmanager
def download_cephadm(ctx, config, ref):
    cluster_name = config['cluster']

    if config.get('cephadm_mode') != 'cephadm-package':
        ref = config.get('cephadm_branch', ref)
        git_url = config.get('cephadm_git_url', teuth_config.get_ceph_git_url())
        log.info('Downloading cephadm (repo %s ref %s)...' % (git_url, ref))
        if ctx.config.get('redhat'):
            log.info("Install cephadm using RPM")
            # cephadm already installed from redhat.install task
            ctx.cluster.run(
                args=[
                    'cp',
                    run.Raw('$(which cephadm)'),
                    ctx.cephadm,
                    run.Raw('&&'),
                    'ls', '-l',
                    ctx.cephadm,
                ]
            )
        elif git_url.startswith('https://github.com/'):
            # git archive doesn't like https:// URLs, which we use with github.
            rest = git_url.split('https://github.com/', 1)[1]
            rest = re.sub(r'\.git/?$', '', rest).strip()  # no .git suffix
            ctx.cluster.run(
                args=[
                    'curl', '--silent',
                    'https://raw.githubusercontent.com/' + rest + '/' + ref + '/src/cephadm/cephadm',
                    run.Raw('>'),
                    ctx.cephadm,
                    run.Raw('&&'),
                    'ls', '-l',
                    ctx.cephadm,
                ],
            )
        else:
            ctx.cluster.run(
                args=[
                    'git', 'clone', git_url, 'testrepo',
                    run.Raw('&&'),
                    'cd', 'testrepo',
                    run.Raw('&&'),
                    'git', 'show', f'{ref}:src/cephadm/cephadm',
                    run.Raw('>'),
                    ctx.cephadm,
                    run.Raw('&&'),
                    'ls', '-l', ctx.cephadm,
                ],
            )
        # sanity-check the resulting file and set executable bit
        cephadm_file_size = '$(stat -c%s {})'.format(ctx.cephadm)
        ctx.cluster.run(
            args=[
                'test', '-s', ctx.cephadm,
                run.Raw('&&'),
                'test', run.Raw(cephadm_file_size), "-gt", run.Raw('1000'),
                run.Raw('&&'),
                'chmod', '+x', ctx.cephadm,
            ],
        )

    try:
        yield
    finally:
        log.info('Removing cluster...')
        ctx.cluster.run(args=[
            'sudo',
            ctx.cephadm,
            'rm-cluster',
            '--fsid', ctx.ceph[cluster_name].fsid,
            '--force',
        ])

        if config.get('cephadm_mode') == 'root':
            log.info('Removing cephadm ...')
            ctx.cluster.run(
                args=[
                    'rm',
                    '-rf',
                    ctx.cephadm,
                ],
            )
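

# Illustrative only: the download behaviour above can be steered from the job
# yaml, for example
#
#     tasks:
#     - cephadm:
#         cephadm_mode: root
#         cephadm_branch: quincy
#         cephadm_git_url: https://github.com/ceph/ceph
#
# With a github URL the single-file cephadm script is fetched from
# raw.githubusercontent.com; otherwise the repo is cloned and `git show`
# extracts src/cephadm/cephadm at the requested ref.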


@contextlib.contextmanager
def ceph_log(ctx, config):
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    update_archive_setting(ctx, 'log', '/var/log/ceph')

    try:
        yield

    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise

    finally:
        log.info('Checking cluster log for badness...')

        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurrence of the pattern specified in the Ceph log.
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = [
                'sudo',
                'egrep', pattern,
                '/var/log/ceph/{fsid}/ceph.log'.format(
                    fsid=fsid),
            ]
            if excludes:
                for exclude in excludes:
                    args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                run.Raw('|'), 'head', '-n', '1',
            ])
            r = ctx.ceph[cluster_name].bootstrap_remote.run(
                stdout=StringIO(),
                args=args,
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
                             config.get('log-ignorelist')) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log-ignorelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
                            )
                        break

        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            # and logs
            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph',            # all logs, not just for the cluster
                        '/var/log/rbd-target-api',  # ceph-iscsi
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            try:
                os.makedirs(path)
            except OSError:
                pass
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.shortname)
                try:
                    os.makedirs(sub)
                except OSError:
                    pass
                try:
                    teuthology.pull_directory(remote, '/var/log/ceph',  # everything
                                              os.path.join(sub, 'log'))
                except ReadError:
                    pass
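

# Illustrative only: for the default pattern and one exclude entry,
# first_in_ceph_log() above ends up running a remote pipeline roughly like
#
#   sudo egrep '\[ERR\]|\[WRN\]|\[SEC\]' /var/log/ceph/<fsid>/ceph.log \
#       | egrep -v '<exclude>' | head -n 1
#
# so the returned value (if any) is the first unfiltered ERR/WRN/SEC line.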


@contextlib.contextmanager
def ceph_crash(ctx, config):
    """
    Gather crash dumps from /var/lib/ceph/$fsid/crash
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    update_archive_setting(ctx, 'crash', '/var/lib/ceph/crash')

    try:
        yield

    finally:
        if ctx.archive is not None:
            log.info('Archiving crash dumps...')
            path = os.path.join(ctx.archive, 'remote')
            try:
                os.makedirs(path)
            except OSError:
                pass
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.shortname)
                try:
                    os.makedirs(sub)
                except OSError:
                    pass
                try:
                    teuthology.pull_directory(remote,
                                              '/var/lib/ceph/%s/crash' % fsid,
                                              os.path.join(sub, 'crash'))
                except ReadError:
                    pass


@contextlib.contextmanager
def pull_image(ctx, config):
    cluster_name = config['cluster']
    log.info(f'Pulling image {ctx.ceph[cluster_name].image} on all hosts...')
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                ctx.cephadm,
                '--image', ctx.ceph[cluster_name].image,
                'pull',
            ],
            wait=False,
        )
    )

    try:
        yield
    finally:
        pass


@contextlib.contextmanager
def ceph_bootstrap(ctx, config):
    """
    Bootstrap ceph cluster.

    :param ctx: the argparse.Namespace object
    :param config: the config dict
    """
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)
    fsid = ctx.ceph[cluster_name].fsid

    bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
    first_mon = ctx.ceph[cluster_name].first_mon
    first_mon_role = ctx.ceph[cluster_name].first_mon_role
    mons = ctx.ceph[cluster_name].mons

    ctx.cluster.run(args=[
        'sudo', 'mkdir', '-p', '/etc/ceph',
    ])
    ctx.cluster.run(args=[
        'sudo', 'chmod', '777', '/etc/ceph',
    ])
    try:
        # write seed config
        log.info('Writing seed config...')
        conf_fp = BytesIO()
        seed_config = build_initial_config(ctx, config)
        seed_config.write(conf_fp)
        bootstrap_remote.write_file(
            path='{}/seed.{}.conf'.format(testdir, cluster_name),
            data=conf_fp.getvalue())
        log.debug('Final config:\n' + conf_fp.getvalue().decode())
        ctx.ceph[cluster_name].conf = seed_config

        # register initial daemons
        ctx.daemons.register_daemon(
            bootstrap_remote, 'mon', first_mon,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild('mon.' + first_mon),
            wait=False,
            started=True,
        )
        if not ctx.ceph[cluster_name].roleless:
            first_mgr = ctx.ceph[cluster_name].first_mgr
            ctx.daemons.register_daemon(
                bootstrap_remote, 'mgr', first_mgr,
                cluster=cluster_name,
                fsid=fsid,
                logger=log.getChild('mgr.' + first_mgr),
                wait=False,
                started=True,
            )

        # bootstrap
        log.info('Bootstrapping...')
        cmd = [
            'sudo',
            ctx.cephadm,
            '--image', ctx.ceph[cluster_name].image,
            '-v',
            'bootstrap',
            '--fsid', fsid,
            '--config', '{}/seed.{}.conf'.format(testdir, cluster_name),
            '--output-config', '/etc/ceph/{}.conf'.format(cluster_name),
            '--output-keyring',
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
            '--output-pub-ssh-key', '{}/{}.pub'.format(testdir, cluster_name),
        ]

        if config.get('registry-login'):
            registry = config['registry-login']
            cmd += [
                "--registry-url", registry['url'],
                "--registry-username", registry['username'],
                "--registry-password", registry['password'],
            ]

        if not ctx.ceph[cluster_name].roleless:
            cmd += [
                '--mon-id', first_mon,
                '--mgr-id', first_mgr,
                '--orphan-initial-daemons',   # we will do it explicitly!
                '--skip-monitoring-stack',    # we'll provision these explicitly
            ]

        if mons[first_mon_role].startswith('['):
            cmd += ['--mon-addrv', mons[first_mon_role]]
        else:
            cmd += ['--mon-ip', mons[first_mon_role]]
        if config.get('skip_dashboard'):
            cmd += ['--skip-dashboard']
        if config.get('skip_monitoring_stack'):
            cmd += ['--skip-monitoring-stack']
        if config.get('single_host_defaults'):
            cmd += ['--single-host-defaults']
        if not config.get('avoid_pacific_features', False):
            cmd += ['--skip-admin-label']

        # bootstrap makes the keyring root 0600, so +r it for our purposes
        cmd += [
            run.Raw('&&'),
            'sudo', 'chmod', '+r',
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
        ]
        bootstrap_remote.run(args=cmd)

        # fetch keys and configs
        log.info('Fetching config...')
        ctx.ceph[cluster_name].config_file = \
            bootstrap_remote.read_file(f'/etc/ceph/{cluster_name}.conf')
        log.info('Fetching client.admin keyring...')
        ctx.ceph[cluster_name].admin_keyring = \
            bootstrap_remote.read_file(f'/etc/ceph/{cluster_name}.client.admin.keyring')
        log.info('Fetching mon keyring...')
        ctx.ceph[cluster_name].mon_keyring = \
            bootstrap_remote.read_file(f'/var/lib/ceph/{fsid}/mon.{first_mon}/keyring', sudo=True)

        # fetch ssh key, distribute to additional nodes
        log.info('Fetching pub ssh key...')
        ssh_pub_key = bootstrap_remote.read_file(
            f'{testdir}/{cluster_name}.pub').decode('ascii').strip()

        log.info('Installing pub ssh key for root users...')
        ctx.cluster.run(args=[
            'sudo', 'install', '-d', '-m', '0700', '/root/.ssh',
            run.Raw('&&'),
            'echo', ssh_pub_key,
            run.Raw('|'),
            'sudo', 'tee', '-a', '/root/.ssh/authorized_keys',
            run.Raw('&&'),
            'sudo', 'chmod', '0600', '/root/.ssh/authorized_keys',
        ])

        # set options
        if config.get('allow_ptrace', True):
            _shell(ctx, cluster_name, bootstrap_remote,
                   ['ceph', 'config', 'set', 'mgr', 'mgr/cephadm/allow_ptrace', 'true'])

        if not config.get('avoid_pacific_features', False):
            log.info('Distributing conf and client.admin keyring to all hosts + 0755')
            _shell(ctx, cluster_name, bootstrap_remote,
                   ['ceph', 'orch', 'client-keyring', 'set', 'client.admin',
                    '*', '--mode', '0755'],
                   check_status=False)

        # add other hosts
        for remote in ctx.cluster.remotes.keys():
            if remote == bootstrap_remote:
                continue

            # note: this may be redundant (see above), but it avoids
            # us having to wait for cephadm to do it.
            log.info('Writing (initial) conf and keyring to %s' % remote.shortname)
            remote.write_file(
                path='/etc/ceph/{}.conf'.format(cluster_name),
                data=ctx.ceph[cluster_name].config_file)
            remote.write_file(
                path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
                data=ctx.ceph[cluster_name].admin_keyring)

            log.info('Adding host %s to orchestrator...' % remote.shortname)
            _shell(ctx, cluster_name, bootstrap_remote, [
                'ceph', 'orch', 'host', 'add',
                remote.shortname
            ])
            r = _shell(ctx, cluster_name, bootstrap_remote,
                       ['ceph', 'orch', 'host', 'ls', '--format=json'],
                       stdout=StringIO())
            hosts = [node['hostname'] for node in json.loads(r.stdout.getvalue())]
            assert remote.shortname in hosts

        yield

    finally:
        log.info('Cleaning up testdir ceph.* files...')
        ctx.cluster.run(args=[
            'rm', '-f',
            '{}/seed.{}.conf'.format(testdir, cluster_name),
            '{}/{}.pub'.format(testdir, cluster_name),
        ])

        log.info('Stopping all daemons...')

        # this doesn't block until they are all stopped...
        # ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])

        # stop the daemons we know
        for role in ctx.daemons.resolve_role_list(None, CEPH_ROLE_TYPES, True):
            cluster, type_, id_ = teuthology.split_role(role)
            try:
                ctx.daemons.get_daemon(type_, id_, cluster).stop()
            except Exception:
                log.exception(f'Failed to stop "{role}"')
                raise

        # tear down anything left (but leave the logs behind)
        ctx.cluster.run(
            args=[
                'sudo',
                ctx.cephadm,
                'rm-cluster',
                '--fsid', fsid,
                '--force',
                '--keep-logs',
            ],
            check_status=False,  # may fail if upgrading from old cephadm
        )

        # clean up /etc/ceph
        ctx.cluster.run(args=[
            'sudo', 'rm', '-f',
            '/etc/ceph/{}.conf'.format(cluster_name),
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
        ])
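

# Illustrative only: with default options, the bootstrap command assembled
# above boils down to something like
#
#   sudo <testdir>/cephadm --image <image> -v bootstrap \
#       --fsid <fsid> --config <testdir>/seed.ceph.conf \
#       --output-config /etc/ceph/ceph.conf \
#       --output-keyring /etc/ceph/ceph.client.admin.keyring \
#       --output-pub-ssh-key <testdir>/ceph.pub \
#       --mon-ip <first mon ip>
#
# plus optional flags (--skip-dashboard, --single-host-defaults, registry
# credentials, ...) driven by the task config.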


@contextlib.contextmanager
def ceph_mons(ctx, config):
    """
    Deploy any additional mons
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        daemons = {}
        if config.get('add_mons_via_daemon_add'):
            # This is the old way of adding mons that works with the (early) octopus
            # cephadm scheduler.
            num_mons = 1
            for remote, roles in ctx.cluster.remotes.items():
                for mon in [r for r in roles
                            if teuthology.is_type('mon', cluster_name)(r)]:
                    c_, _, id_ = teuthology.split_role(mon)
                    if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
                        continue
                    log.info('Adding %s on %s' % (mon, remote.shortname))
                    num_mons += 1
                    _shell(ctx, cluster_name, remote, [
                        'ceph', 'orch', 'daemon', 'add', 'mon',
                        remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_,
                    ])
                    ctx.daemons.register_daemon(
                        remote, 'mon', id_,
                        cluster=cluster_name,
                        fsid=fsid,
                        logger=log.getChild(mon),
                        wait=False,
                        started=True,
                    )
                    daemons[mon] = (remote, id_)

            with contextutil.safe_while(sleep=1, tries=180) as proceed:
                while proceed():
                    log.info('Waiting for %d mons in monmap...' % (num_mons))
                    r = _shell(
                        ctx=ctx,
                        cluster_name=cluster_name,
                        remote=remote,
                        args=[
                            'ceph', 'mon', 'dump', '-f', 'json',
                        ],
                        stdout=StringIO(),
                    )
                    j = json.loads(r.stdout.getvalue())
                    if len(j['mons']) == num_mons:
                        break
        else:
            nodes = []
            for remote, roles in ctx.cluster.remotes.items():
                for mon in [r for r in roles
                            if teuthology.is_type('mon', cluster_name)(r)]:
                    c_, _, id_ = teuthology.split_role(mon)
                    log.info('Adding %s on %s' % (mon, remote.shortname))
                    nodes.append(remote.shortname
                                 + ':' + ctx.ceph[cluster_name].mons[mon]
                                 + '=' + id_)
                    if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
                        continue
                    daemons[mon] = (remote, id_)

            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'apply', 'mon',
                str(len(nodes)) + ';' + ';'.join(nodes)]
            )
            for mon, i in daemons.items():
                remote, id_ = i
                ctx.daemons.register_daemon(
                    remote, 'mon', id_,
                    cluster=cluster_name,
                    fsid=fsid,
                    logger=log.getChild(mon),
                    wait=False,
                    started=True,
                )

            with contextutil.safe_while(sleep=1, tries=180) as proceed:
                while proceed():
                    log.info('Waiting for %d mons in monmap...' % (len(nodes)))
                    r = _shell(
                        ctx=ctx,
                        cluster_name=cluster_name,
                        remote=remote,
                        args=[
                            'ceph', 'mon', 'dump', '-f', 'json',
                        ],
                        stdout=StringIO(),
                    )
                    j = json.loads(r.stdout.getvalue())
                    if len(j['mons']) == len(nodes):
                        break

        # refresh our (final) ceph.conf file
        bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
        log.info('Generating final ceph.conf file...')
        r = _shell(
            ctx=ctx,
            cluster_name=cluster_name,
            remote=bootstrap_remote,
            args=[
                'ceph', 'config', 'generate-minimal-conf',
            ],
            stdout=StringIO(),
        )
        ctx.ceph[cluster_name].config_file = r.stdout.getvalue()

        yield

    finally:
        pass
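

# Illustrative only: the placement argument built above for `ceph orch apply
# mon` has the form "<count>;<shortname>:<mon addr>=<id>;...", e.g. something
# like
#
#   ceph orch apply mon '3;host1:10.0.0.1=a;host2:10.0.0.2=b;host3:10.0.0.3=c'
#
# where the addresses come from ctx.ceph[cluster].mons (hostnames and IPs here
# are made up).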


@contextlib.contextmanager
def ceph_mgrs(ctx, config):
    """
    Deploy any additional mgrs
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        nodes = []
        daemons = {}
        for remote, roles in ctx.cluster.remotes.items():
            for mgr in [r for r in roles
                        if teuthology.is_type('mgr', cluster_name)(r)]:
                c_, _, id_ = teuthology.split_role(mgr)
                log.info('Adding %s on %s' % (mgr, remote.shortname))
                nodes.append(remote.shortname + '=' + id_)
                if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mgr:
                    continue
                daemons[mgr] = (remote, id_)
        if nodes:
            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'apply', 'mgr',
                str(len(nodes)) + ';' + ';'.join(nodes)]
            )
        for mgr, i in daemons.items():
            remote, id_ = i
            ctx.daemons.register_daemon(
                remote, 'mgr', id_,
                cluster=cluster_name,
                fsid=fsid,
                logger=log.getChild(mgr),
                wait=False,
                started=True,
            )

        yield

    finally:
        pass


@contextlib.contextmanager
def ceph_osds(ctx, config):
    """
    Deploy OSDs
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        log.info('Deploying OSDs...')

        # provision OSDs in numeric order
        id_to_remote = {}
        devs_by_remote = {}
        for remote, roles in ctx.cluster.remotes.items():
            devs_by_remote[remote] = teuthology.get_scratch_devices(remote)
            for osd in [r for r in roles
                        if teuthology.is_type('osd', cluster_name)(r)]:
                _, _, id_ = teuthology.split_role(osd)
                id_to_remote[int(id_)] = (osd, remote)

        cur = 0
        for osd_id in sorted(id_to_remote.keys()):
            osd, remote = id_to_remote[osd_id]
            _, _, id_ = teuthology.split_role(osd)
            assert int(id_) == cur
            devs = devs_by_remote[remote]
            assert devs   ## FIXME ##
            dev = devs.pop()
            if all(_ in dev for _ in ('lv', 'vg')):
                short_dev = dev.replace('/dev/', '')
            else:
                short_dev = dev
            log.info('Deploying %s on %s with %s...' % (
                osd, remote.shortname, dev))
            _shell(ctx, cluster_name, remote, [
                'ceph-volume', 'lvm', 'zap', dev])
            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'daemon', 'add', 'osd',
                remote.shortname + ':' + short_dev
            ])
            ctx.daemons.register_daemon(
                remote, 'osd', id_,
                cluster=cluster_name,
                fsid=fsid,
                logger=log.getChild(osd),
                wait=False,
                started=True,
            )
            cur += 1

        if cur == 0:
            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'apply', 'osd', '--all-available-devices',
            ])
            # expect the number of scratch devs
            num_osds = sum(map(len, devs_by_remote.values()))
            assert num_osds
        else:
            # expect the number of OSDs we created
            num_osds = cur

        log.info(f'Waiting for {num_osds} OSDs to come up...')
        with contextutil.safe_while(sleep=1, tries=120) as proceed:
            while proceed():
                p = _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
                           ['ceph', 'osd', 'stat', '-f', 'json'], stdout=StringIO())
                j = json.loads(p.stdout.getvalue())
                if int(j.get('num_up_osds', 0)) == num_osds:
                    break

        if not hasattr(ctx, 'managers'):
            ctx.managers = {}
        ctx.managers[cluster_name] = CephManager(
            ctx.ceph[cluster_name].bootstrap_remote,
            ctx=ctx,
            logger=log.getChild('ceph_manager.' + cluster_name),
            cluster=cluster_name,
            cephadm=True,
        )

        yield
    finally:
        pass
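

# Illustrative only: for osd roles with scratch devices available, the loop
# above issues commands of the form
#
#   ceph-volume lvm zap /dev/vg_nvme/lv_4
#   ceph orch daemon add osd <host>:vg_nvme/lv_4
#
# (the /dev/ prefix is stripped for lv/vg style devices); with no osd roles it
# falls back to `ceph orch apply osd --all-available-devices`. Device names
# here are made up.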


@contextlib.contextmanager
def ceph_mdss(ctx, config):
    """
    Deploy MDSs
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = []
    daemons = {}
    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type('mds', cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
            nodes.append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)
    if nodes:
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', 'mds',
            'all',
            str(len(nodes)) + ';' + ';'.join(nodes)]
        )
    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, 'mds', id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield


@contextlib.contextmanager
def cephfs_setup(ctx, config):
    mdss = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))

    # If there are any MDSs, then create a filesystem for them to use.
    # Do this last because it requires the mon cluster to be up and running.
    if len(mdss) > 0:
        log.info('Setting up CephFS filesystem(s)...')
        cephfs_config = config.get('cephfs', {})
        fs_configs = cephfs_config.pop('fs', [{'name': 'cephfs'}])
        set_allow_multifs = len(fs_configs) > 1

        # wait for standbys to become available (slow due to valgrind, perhaps)
        mdsc = MDSCluster(ctx)
        with contextutil.safe_while(sleep=2, tries=150) as proceed:
            while proceed():
                if len(mdsc.get_standby_daemons()) >= len(mdss):
                    break

        fss = []
        for fs_config in fs_configs:
            assert isinstance(fs_config, dict)
            name = fs_config.pop('name')
            temp = deepcopy(cephfs_config)
            teuthology.deep_merge(temp, fs_config)
            fs = Filesystem(ctx, fs_config=temp, name=name, create=True)
            if set_allow_multifs:
                fs.set_allow_multifs()
                set_allow_multifs = False
            fss.append(fs)

        yield

        for fs in fss:
            fs.destroy()
    else:
        yield
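

# Illustrative only: the cephfs block consumed above can be shaped in the job
# yaml, for example
#
#     tasks:
#     - cephadm:
#         cephfs:
#           fs:
#             - name: a
#             - name: b
#
# Per-fs entries are deep-merged on top of the shared cephfs settings, and
# allow_multifs is enabled automatically when more than one fs is listed.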


@contextlib.contextmanager
def ceph_monitoring(daemon_type, ctx, config):
    """
    Deploy prometheus, node-exporter, etc.
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = []
    daemons = {}
    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type(daemon_type, cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
            nodes.append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)
    if nodes:
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', daemon_type,
            str(len(nodes)) + ';' + ';'.join(nodes)]
        )
    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, daemon_type, id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield


@contextlib.contextmanager
def ceph_rgw(ctx, config):
    """
    Deploy rgw
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = {}
    daemons = {}
    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type('rgw', cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
            svc = '.'.join(id_.split('.')[0:2])
            if svc not in nodes:
                nodes[svc] = []
            nodes[svc].append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)

    for svc, nodes in nodes.items():
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', 'rgw', svc,
            '--placement',
            str(len(nodes)) + ';' + ';'.join(nodes)]
        )
    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, 'rgw', id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield


@contextlib.contextmanager
def ceph_iscsi(ctx, config):
    """
    Deploy iSCSI gateways
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = []
    daemons = {}
    ips = []

    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type('iscsi', cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
            nodes.append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)
            ips.append(remote.ip_address)
    trusted_ip_list = ','.join(ips)
    if nodes:
        poolname = 'datapool'
        # ceph osd pool create datapool 3 3 replicated
        _shell(ctx, cluster_name, remote, [
            'ceph', 'osd', 'pool', 'create',
            poolname, '3', '3', 'replicated']
        )

        _shell(ctx, cluster_name, remote, [
            'rbd', 'pool', 'init', poolname]
        )

        # ceph orch apply iscsi datapool (admin)user (admin)password
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', 'iscsi',
            poolname, 'admin', 'admin',
            '--trusted_ip_list', trusted_ip_list,
            '--placement', str(len(nodes)) + ';' + ';'.join(nodes)]
        )

        # used by the iscsi client to identify valid gateway IPs
        conf_data = dedent(f"""
        [config]
        trusted_ip_list = {trusted_ip_list}
        """)
        distribute_iscsi_gateway_cfg(ctx, conf_data)

    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, 'iscsi', id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield


@contextlib.contextmanager
def ceph_clients(ctx, config):
    cluster_name = config['cluster']

    log.info('Setting up client nodes...')
    clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
    for remote, roles_for_host in clients.remotes.items():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
                                                     cluster_name):
            name = teuthology.ceph_role(role)
            client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name,
                                                                name)
            r = _shell(
                ctx=ctx,
                cluster_name=cluster_name,
                remote=remote,
                args=[
                    'ceph', 'auth',
                    'get-or-create', name,
                    'mon', 'allow *',
                    'osd', 'allow *',
                    'mds', 'allow *',
                    'mgr', 'allow *',
                ],
                stdout=StringIO(),
            )
            keyring = r.stdout.getvalue()
            remote.sudo_write_file(client_keyring, keyring, mode='0644')
    yield


@contextlib.contextmanager
def ceph_initial():
    try:
        yield
    finally:
        log.info('Teardown complete')


## public methods
@contextlib.contextmanager
def stop(ctx, config):
    """
    Stop ceph daemons

    For example::
      tasks:
      - ceph.stop: [mds.*]

      tasks:
      - ceph.stop: [osd.0, osd.2]

      tasks:
      - ceph.stop:
          daemons: [osd.0, osd.2]

    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(
        config.get('daemons', None), CEPH_ROLE_TYPES, True)
    clusters = set()

    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).stop()
        clusters.add(cluster)

#    for cluster in clusters:
#        ctx.ceph[cluster].watchdog.stop()
#        ctx.ceph[cluster].watchdog.join()

    yield
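

# Illustrative only: the shell task below is typically driven from the job
# yaml, with commands keyed by role (or by 'all-roles'/'all-hosts'), e.g.
#
#     tasks:
#     - cephadm.shell:
#         host.a:
#           - ceph orch status
#           - ceph orch ps
#
# Each command runs via `cephadm shell` on the host carrying that role, and
# optional `env`/`volumes` lists are passed through as extra cephadm args.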
def shell(ctx, config):
    """
    Execute (shell) commands
    """
    cluster_name = config.get('cluster', 'ceph')

    args = []
    for k in config.pop('env', []):
        args.extend(['-e', k + '=' + ctx.config.get(k, '')])
    for k in config.pop('volumes', []):
        args.extend(['-v', k])

    if 'all-roles' in config and len(config) == 1:
        a = config['all-roles']
        roles = teuthology.all_roles(ctx.cluster)
        config = dict((id_, a) for id_ in roles if not id_.startswith('host.'))
    elif 'all-hosts' in config and len(config) == 1:
        a = config['all-hosts']
        roles = teuthology.all_roles(ctx.cluster)
        config = dict((id_, a) for id_ in roles if id_.startswith('host.'))

    for role, cmd in config.items():
        (remote,) = ctx.cluster.only(role).remotes.keys()
        log.info('Running commands on role %s host %s', role, remote.name)
        if isinstance(cmd, list):
            for c in cmd:
                _shell(ctx, cluster_name, remote,
                       ['bash', '-c', subst_vip(ctx, c)],
                       extra_cephadm_args=args)
        else:
            assert isinstance(cmd, str)
            _shell(ctx, cluster_name, remote,
                   ['bash', '-ex', '-c', subst_vip(ctx, cmd)],
                   extra_cephadm_args=args)


def apply(ctx, config):
    """
    Apply spec

      tasks:
      - cephadm.apply:
          specs:
          - service_type: rgw
            service_id: foo
            spec:
              rgw_frontend_port: 8000
          - service_type: rgw
            service_id: bar
            spec:
              rgw_frontend_port: 9000
              zone: bar
              realm: asdf

    """
    cluster_name = config.get('cluster', 'ceph')

    specs = config.get('specs', [])
    y = subst_vip(ctx, yaml.dump_all(specs))

    log.info(f'Applying spec(s):\n{y}')
    _shell(
        ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
        ['ceph', 'orch', 'apply', '-i', '-'],
        stdin=y,
    )


def wait_for_service(ctx, config):
    """
    Wait for a service to be fully started

      tasks:
      - cephadm.wait_for_service:
          service: rgw.foo
          timeout: 60    # defaults to 300

    """
    cluster_name = config.get('cluster', 'ceph')
    timeout = config.get('timeout', 300)
    service = config.get('service')
    assert service

    log.info(
        f'Waiting for {cluster_name} service {service} to start (timeout {timeout})...'
    )
    with contextutil.safe_while(sleep=1, tries=timeout) as proceed:
        while proceed():
            r = _shell(
                ctx=ctx,
                cluster_name=cluster_name,
                remote=ctx.ceph[cluster_name].bootstrap_remote,
                args=[
                    'ceph', 'orch', 'ls', '-f', 'json',
                ],
                stdout=StringIO(),
            )
            j = json.loads(r.stdout.getvalue())
            svc = None
            for s in j:
                if s['service_name'] == service:
                    svc = s
                    break
            if svc:
                log.info(
                    f"{service} has {s['status']['running']}/{s['status']['size']}"
                )
                if s['status']['running'] == s['status']['size']:
                    break


@contextlib.contextmanager
def tweaked_option(ctx, config):
    """
    Set an option, and then restore it with its original value.

    Note: due to the way tasks are executed/nested, it is not suggested to
    use this method as a standalone task. Otherwise, it is likely to restore
    the tweaked option at the /end/ of the 'tasks' block.
    """
    saved_options = {}
    # we can complicate this when necessary
    options = ['mon-health-to-clog']
    type_, id_ = 'mon', '*'
    cluster = config.get('cluster', 'ceph')
    manager = ctx.managers[cluster]
    if id_ == '*':
        get_from = next(teuthology.all_roles_of_type(ctx.cluster, type_))
    else:
        get_from = id_
    for option in options:
        if option not in config:
            continue
        value = 'true' if config[option] else 'false'
        option = option.replace('-', '_')
        old_value = manager.get_config(type_, get_from, option)
        if value != old_value:
            saved_options[option] = old_value
            manager.inject_args(type_, id_, option, value)
    yield
    for option, value in saved_options.items():
        manager.inject_args(type_, id_, option, value)


@contextlib.contextmanager
def restart(ctx, config):
    """
    restart ceph daemons

    For example::
      tasks:
      - ceph.restart: [all]

    For example::
      tasks:
      - ceph.restart: [osd.0, mon.1, mds.*]

    or::

      tasks:
      - ceph.restart:
          daemons: [osd.0, mon.1]
          wait-for-healthy: false
          wait-for-osds-up: true

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(
        config.get('daemons', None), CEPH_ROLE_TYPES, True)
    clusters = set()

    log.info('daemons %s' % daemons)
    with tweaked_option(ctx, config):
        for role in daemons:
            cluster, type_, id_ = teuthology.split_role(role)
            d = ctx.daemons.get_daemon(type_, id_, cluster)
            assert d, 'daemon %s does not exist' % role
            d.stop()
            if type_ == 'osd':
                ctx.managers[cluster].mark_down_osd(id_)
            d.restart()
            clusters.add(cluster)

    if config.get('wait-for-healthy', True):
        for cluster in clusters:
            healthy(ctx=ctx, config=dict(cluster=cluster))
    if config.get('wait-for-osds-up', False):
        for cluster in clusters:
            ctx.managers[cluster].wait_for_all_osds_up()
    yield


@contextlib.contextmanager
def distribute_config_and_admin_keyring(ctx, config):
    """
    Distribute a sufficient config and keyring for clients
    """
    cluster_name = config['cluster']
    log.info('Distributing (final) config and client.admin keyring...')
    for remote, roles in ctx.cluster.remotes.items():
        remote.write_file(
            '/etc/ceph/{}.conf'.format(cluster_name),
            ctx.ceph[cluster_name].config_file,
            sudo=True)
        remote.write_file(
            path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
            data=ctx.ceph[cluster_name].admin_keyring,
            sudo=True)
    try:
        yield
    finally:
        ctx.cluster.run(args=[
            'sudo', 'rm', '-f',
            '/etc/ceph/{}.conf'.format(cluster_name),
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
        ])


@contextlib.contextmanager
def crush_setup(ctx, config):
    cluster_name = config['cluster']

    profile = config.get('crush_tunables', 'default')
    log.info('Setting crush tunables to %s', profile)
    _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
           args=['ceph', 'osd', 'crush', 'tunables', profile])
    yield


@contextlib.contextmanager
def create_rbd_pool(ctx, config):
    if config.get('create_rbd_pool', False):
        cluster_name = config['cluster']
        log.info('Waiting for OSDs to come up')
        teuthology.wait_until_osds_up(
            ctx,
            cluster=ctx.cluster,
            remote=ctx.ceph[cluster_name].bootstrap_remote,
            ceph_cluster=cluster_name,
        )
        log.info('Creating RBD pool')
        _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
               args=['sudo', 'ceph', '--cluster', cluster_name,
                     'osd', 'pool', 'create', 'rbd', '8'])
        _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
               args=['sudo', 'ceph', '--cluster', cluster_name,
                     'osd', 'pool', 'application', 'enable',
                     'rbd', 'rbd', '--yes-i-really-mean-it'
                     ])
    yield


@contextlib.contextmanager
def _bypass():
    yield


@contextlib.contextmanager
def initialize_config(ctx, config):
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)

    ctx.ceph[cluster_name].thrashers = []
    # fixme: setup watchdog, ala ceph.py

    ctx.ceph[cluster_name].roleless = False  # see below

    first_ceph_cluster = False
    if not hasattr(ctx, 'daemons'):
        first_ceph_cluster = True

    # cephadm mode?
    if 'cephadm_mode' not in config:
        config['cephadm_mode'] = 'root'
    assert config['cephadm_mode'] in ['root', 'cephadm-package']
    if config['cephadm_mode'] == 'root':
        ctx.cephadm = testdir + '/cephadm'
    else:
        ctx.cephadm = 'cephadm'  # in the path

    if first_ceph_cluster:
        # FIXME: this is global for all clusters
        ctx.daemons = DaemonGroup(
            use_cephadm=ctx.cephadm)

    # uuid
    fsid = str(uuid.uuid1())
    log.info('Cluster fsid is %s' % fsid)
    ctx.ceph[cluster_name].fsid = fsid

    # mon ips
    log.info('Choosing monitor IPs and ports...')
    remotes_and_roles = ctx.cluster.remotes.items()
    ips = [host for (host, port) in
           (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]

    if config.get('roleless', False):
        # mons will be named after hosts
        first_mon = None
        max_mons = config.get('max_mons', 5)
        for remote, _ in remotes_and_roles:
            ctx.cluster.remotes[remote].append('mon.' + remote.shortname)
            if not first_mon:
                first_mon = remote.shortname
                bootstrap_remote = remote
            max_mons -= 1
            if not max_mons:
                break
        log.info('No mon roles; fabricating mons')

    roles = [role_list for (remote, role_list) in ctx.cluster.remotes.items()]

    ctx.ceph[cluster_name].mons = get_mons(
        roles, ips, cluster_name,
        mon_bind_msgr2=config.get('mon_bind_msgr2', True),
        mon_bind_addrvec=config.get('mon_bind_addrvec', True),
    )
    log.info('Monitor IPs: %s' % ctx.ceph[cluster_name].mons)

    if config.get('roleless', False):
        ctx.ceph[cluster_name].roleless = True
        ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
        ctx.ceph[cluster_name].first_mon = first_mon
        ctx.ceph[cluster_name].first_mon_role = 'mon.' + first_mon
    else:
        first_mon_role = sorted(ctx.ceph[cluster_name].mons.keys())[0]
        _, _, first_mon = teuthology.split_role(first_mon_role)
        (bootstrap_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
        log.info('First mon is mon.%s on %s' % (first_mon,
                                                bootstrap_remote.shortname))
        ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
        ctx.ceph[cluster_name].first_mon = first_mon
        ctx.ceph[cluster_name].first_mon_role = first_mon_role

        others = ctx.cluster.remotes[bootstrap_remote]
        mgrs = sorted([r for r in others
                       if teuthology.is_type('mgr', cluster_name)(r)])
        if not mgrs:
            raise RuntimeError('no mgrs on the same host as first mon %s' % first_mon)
        _, _, first_mgr = teuthology.split_role(mgrs[0])
        log.info('First mgr is %s' % (first_mgr))
        ctx.ceph[cluster_name].first_mgr = first_mgr
    yield
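

# Illustrative only: with `roleless: true` in the task config, no mon/mgr/osd
# roles need to be listed in the job; a mon role named after each host is
# fabricated above (up to max_mons, default 5) and the first host becomes the
# bootstrap node, e.g.
#
#     tasks:
#     - cephadm:
#         roleless: true
#         max_mons: 3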


@contextlib.contextmanager
def task(ctx, config):
    """
    Deploy ceph cluster using cephadm

    For example, teuthology.yaml can contain the 'defaults' section:

        defaults:
          cephadm:
            containers:
              image: 'quay.io/ceph-ci/ceph'

    Using overrides makes it possible to customize it per run.
    The equivalent 'overrides' section looks like:

        overrides:
          cephadm:
            containers:
              image: 'quay.io/ceph-ci/ceph'
            registry-login:
              url: registry-url
              username: registry-user
              password: registry-password

    :param ctx: the argparse.Namespace object
    :param config: the config dict
    """
    if config is None:
        config = {}

    assert isinstance(config, dict), \
        "task only supports a dictionary for configuration"

    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph', {}))
    teuthology.deep_merge(config, overrides.get('cephadm', {}))
    log.info('Config: ' + str(config))

    # set up cluster context
    if not hasattr(ctx, 'ceph'):
        ctx.ceph = {}
    if 'cluster' not in config:
        config['cluster'] = 'ceph'
    cluster_name = config['cluster']
    if cluster_name not in ctx.ceph:
        ctx.ceph[cluster_name] = argparse.Namespace()
        ctx.ceph[cluster_name].bootstrapped = False

    # image
    teuth_defaults = teuth_config.get('defaults', {})
    cephadm_defaults = teuth_defaults.get('cephadm', {})
    containers_defaults = cephadm_defaults.get('containers', {})
    container_image_name = containers_defaults.get('image', None)

    containers = config.get('containers', {})
    container_image_name = containers.get('image', container_image_name)

    if not hasattr(ctx.ceph[cluster_name], 'image'):
        ctx.ceph[cluster_name].image = config.get('image')
    ref = None
    if not ctx.ceph[cluster_name].image:
        if not container_image_name:
            raise Exception("Configuration error occurred. "
                            "The 'image' value is undefined for 'cephadm' task. "
                            "Please provide corresponding options in the task's "
                            "config, task 'overrides', or teuthology 'defaults' "
                            "section.")
        sha1 = config.get('sha1')
        flavor = config.get('flavor', 'default')

        if sha1:
            if flavor == "crimson":
                ctx.ceph[cluster_name].image = container_image_name + ':' + sha1 + '-' + flavor
            else:
                ctx.ceph[cluster_name].image = container_image_name + ':' + sha1
            ref = sha1
        else:
            # hmm, fall back to branch?
            branch = config.get('branch', 'master')
            ref = branch
            ctx.ceph[cluster_name].image = container_image_name + ':' + branch
    log.info('Cluster image is %s' % ctx.ceph[cluster_name].image)

    with contextutil.nested(
            # if the cluster is already bootstrapped, bypass the corresponding methods
            lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)
            else initialize_config(ctx=ctx, config=config),
            lambda: ceph_initial(),
            lambda: normalize_hostnames(ctx=ctx),
            lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)
            else download_cephadm(ctx=ctx, config=config, ref=ref),
            lambda: ceph_log(ctx=ctx, config=config),
            lambda: ceph_crash(ctx=ctx, config=config),
            lambda: pull_image(ctx=ctx, config=config),
            lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped)
            else ceph_bootstrap(ctx, config),
            lambda: crush_setup(ctx=ctx, config=config),
            lambda: ceph_mons(ctx=ctx, config=config),
            lambda: distribute_config_and_admin_keyring(ctx=ctx, config=config),
            lambda: ceph_mgrs(ctx=ctx, config=config),
            lambda: ceph_osds(ctx=ctx, config=config),
            lambda: ceph_mdss(ctx=ctx, config=config),
            lambda: cephfs_setup(ctx=ctx, config=config),
            lambda: ceph_rgw(ctx=ctx, config=config),
            lambda: ceph_iscsi(ctx=ctx, config=config),
            lambda: ceph_monitoring('prometheus', ctx=ctx, config=config),
            lambda: ceph_monitoring('node-exporter', ctx=ctx, config=config),
            lambda: ceph_monitoring('alertmanager', ctx=ctx, config=config),
            lambda: ceph_monitoring('grafana', ctx=ctx, config=config),
            lambda: ceph_clients(ctx=ctx, config=config),
            lambda: create_rbd_pool(ctx=ctx, config=config),
    ):
        try:
            if config.get('wait-for-healthy', True):
                healthy(ctx=ctx, config=config)

            log.info('Setup complete, yielding')
            yield

        finally:
            log.info('Teardown begin')