1"""
2Ceph cluster task, deployed via cephadm orchestrator
3"""
9f95a23c
TL
4import argparse
5import configobj
6import contextlib
7import logging
8import os
9import json
10import re
11import uuid
f91f0fd5 12import yaml
9f95a23c 13
20effc67 14from copy import deepcopy
f67539c2 15from io import BytesIO, StringIO
9f95a23c 16from tarfile import ReadError
e306af50 17from tasks.ceph_manager import CephManager
9f95a23c
TL
18from teuthology import misc as teuthology
19from teuthology import contextutil
1e59de90 20from teuthology import packaging
9f95a23c
TL
21from teuthology.orchestra import run
22from teuthology.orchestra.daemon import DaemonGroup
23from teuthology.config import config as teuth_config
20effc67
TL
24from textwrap import dedent
25from tasks.cephfs.filesystem import MDSCluster, Filesystem
1e59de90 26from tasks.util import chacra
9f95a23c
TL
27
28# these items we use from ceph.py should probably eventually move elsewhere
29from tasks.ceph import get_mons, healthy
f67539c2 30from tasks.vip import subst_vip
9f95a23c
TL
31
32CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw', 'prometheus']
33
34log = logging.getLogger(__name__)
35
36
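# _shell() wraps `cephadm shell` so a command runs inside the cluster's
# container image with this cluster's conf and admin keyring mounted.
# A typical call, as used elsewhere in this file (illustrative only):
#
#   r = _shell(ctx, cluster_name, bootstrap_remote,
#              ['ceph', 'orch', 'host', 'ls', '--format=json'],
#              stdout=StringIO())
#   hosts = json.loads(r.stdout.getvalue())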
def _shell(ctx, cluster_name, remote, args, extra_cephadm_args=[], **kwargs):
    teuthology.get_testdir(ctx)
    return remote.run(
        args=[
            'sudo',
            ctx.cephadm,
            '--image', ctx.ceph[cluster_name].image,
            'shell',
            '-c', '/etc/ceph/{}.conf'.format(cluster_name),
            '-k', '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
            '--fsid', ctx.ceph[cluster_name].fsid,
        ] + extra_cephadm_args + [
            '--',
        ] + args,
        **kwargs
    )

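# The seed config starts from cephadm.conf next to this file; additional ceph
# option overrides come from the task's 'conf' section, e.g. (illustrative):
#
#   tasks:
#   - cephadm:
#       conf:
#         global:
#           osd pool default size: 1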
def build_initial_config(ctx, config):
    cluster_name = config['cluster']

    path = os.path.join(os.path.dirname(__file__), 'cephadm.conf')
    conf = configobj.ConfigObj(path, file_error=True)

    conf.setdefault('global', {})
    conf['global']['fsid'] = ctx.ceph[cluster_name].fsid

    # overrides
    for section, keys in config.get('conf', {}).items():
        for key, value in keys.items():
            log.info(" override: [%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    return conf

def distribute_iscsi_gateway_cfg(ctx, conf_data):
    """
    Distribute a common gateway config to every host so that iSCSI clients
    can discover the trusted_ip_list of the gateways.
    """
    log.info('Distributing iscsi-gateway.cfg...')
    for remote, roles in ctx.cluster.remotes.items():
        remote.write_file(
            path='/etc/ceph/iscsi-gateway.cfg',
            data=conf_data,
            sudo=True)

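# Recording e.g. ('log', '/var/log/ceph') below results in an info.yaml entry
# roughly of this shape (illustrative):
#
#   archive:
#     log: /var/log/ceph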
def update_archive_setting(ctx, key, value):
    """
    Record an archive location (e.g. a logs directory) in the job's
    info.yaml so that teuthology collects it.
    """
    if ctx.archive is None:
        return
    with open(os.path.join(ctx.archive, 'info.yaml'), 'r+') as info_file:
        info_yaml = yaml.safe_load(info_file)
        info_file.seek(0)
        if 'archive' in info_yaml:
            info_yaml['archive'][key] = value
        else:
            info_yaml['archive'] = {key: value}
        yaml.safe_dump(info_yaml, info_file, default_flow_style=False)

@contextlib.contextmanager
def normalize_hostnames(ctx):
    """
    Ensure we have short hostnames throughout, for consistency between
    remote.shortname and socket.gethostname() in cephadm.
    """
    log.info('Normalizing hostnames...')
    cluster = ctx.cluster.filter(lambda r: '.' in r.hostname)
    cluster.run(args=[
        'sudo',
        'hostname',
        run.Raw('$(hostname -s)'),
    ])

    try:
        yield
    finally:
        pass

@contextlib.contextmanager
def download_cephadm(ctx, config, ref):
    cluster_name = config['cluster']

    if config.get('cephadm_mode') != 'cephadm-package':
        if ctx.config.get('redhat'):
            _fetch_cephadm_from_rpm(ctx)
        # TODO: come up with a sensible way to detect if we need an "old, uncompiled"
        # cephadm
        elif 'cephadm_git_url' in config and 'cephadm_branch' in config:
            _fetch_cephadm_from_github(ctx, config, ref)
        else:
            _fetch_cephadm_from_chachra(ctx, config, cluster_name)

    try:
        yield
    finally:
        _rm_cluster(ctx, cluster_name)
        if config.get('cephadm_mode') == 'root':
            _rm_cephadm(ctx)


def _fetch_cephadm_from_rpm(ctx):
    log.info("Copying cephadm installed from an RPM package")
    # cephadm already installed from redhat.install task
    ctx.cluster.run(
        args=[
            'cp',
            run.Raw('$(which cephadm)'),
            ctx.cephadm,
            run.Raw('&&'),
            'ls', '-l',
            ctx.cephadm,
        ]
    )

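# The git fetch path below is only taken when both keys are present in the
# task config, e.g. (illustrative):
#
#   tasks:
#   - cephadm:
#       cephadm_git_url: https://github.com/ceph/ceph
#       cephadm_branch: main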
def _fetch_cephadm_from_github(ctx, config, ref):
    ref = config.get('cephadm_branch', ref)
    git_url = config.get('cephadm_git_url', teuth_config.get_ceph_git_url())
    log.info('Downloading cephadm (repo %s ref %s)...' % (git_url, ref))
    if git_url.startswith('https://github.com/'):
        # git archive doesn't like https:// URLs, which we use with github.
        rest = git_url.split('https://github.com/', 1)[1]
        rest = re.sub(r'\.git/?$', '', rest).strip()  # no .git suffix
        ctx.cluster.run(
            args=[
                'curl', '--silent',
                'https://raw.githubusercontent.com/' + rest + '/' + ref + '/src/cephadm/cephadm',
                run.Raw('>'),
                ctx.cephadm,
                run.Raw('&&'),
                'ls', '-l',
                ctx.cephadm,
            ],
        )
    else:
        ctx.cluster.run(
            args=[
                'git', 'clone', git_url, 'testrepo',
                run.Raw('&&'),
                'cd', 'testrepo',
                run.Raw('&&'),
                'git', 'show', f'{ref}:src/cephadm/cephadm',
                run.Raw('>'),
                ctx.cephadm,
                run.Raw('&&'),
                'ls', '-l', ctx.cephadm,
            ],
        )
    # sanity-check the resulting file and set executable bit
    cephadm_file_size = '$(stat -c%s {})'.format(ctx.cephadm)
    ctx.cluster.run(
        args=[
            'test', '-s', ctx.cephadm,
            run.Raw('&&'),
            'test', run.Raw(cephadm_file_size), "-gt", run.Raw('1000'),
            run.Raw('&&'),
            'chmod', '+x', ctx.cephadm,
        ],
    )

def _fetch_cephadm_from_chachra(ctx, config, cluster_name):
    log.info('Downloading "compiled" cephadm from chacra')
    bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
    bp = packaging.get_builder_project()(
        config.get('project', 'ceph'),
        config,
        ctx=ctx,
        remote=bootstrap_remote,
    )
    log.info('builder_project result: %s' % (bp._result.json()))

    flavor = config.get('flavor', 'default')
    branch = config.get('branch')
    sha1 = config.get('sha1')

    # pull the cephadm binary from chacra
    url = chacra.get_binary_url(
        'cephadm',
        project=bp.project,
        distro=bp.distro.split('/')[0],
        release=bp.distro.split('/')[1],
        arch=bp.arch,
        flavor=flavor,
        branch=branch,
        sha1=sha1,
    )
    log.info("Discovered chacra url: %s", url)
    ctx.cluster.run(
        args=[
            'curl', '--silent', '-L', url,
            run.Raw('>'),
            ctx.cephadm,
            run.Raw('&&'),
            'ls', '-l',
            ctx.cephadm,
        ],
    )

    # sanity-check the resulting file and set executable bit
    cephadm_file_size = '$(stat -c%s {})'.format(ctx.cephadm)
    ctx.cluster.run(
        args=[
            'test', '-s', ctx.cephadm,
            run.Raw('&&'),
            'test', run.Raw(cephadm_file_size), "-gt", run.Raw('1000'),
            run.Raw('&&'),
            'chmod', '+x', ctx.cephadm,
        ],
    )

def _rm_cluster(ctx, cluster_name):
    log.info('Removing cluster...')
    ctx.cluster.run(args=[
        'sudo',
        ctx.cephadm,
        'rm-cluster',
        '--fsid', ctx.ceph[cluster_name].fsid,
        '--force',
    ])


def _rm_cephadm(ctx):
    log.info('Removing cephadm ...')
    ctx.cluster.run(
        args=[
            'rm',
            '-rf',
            ctx.cephadm,
        ],
    )

@contextlib.contextmanager
def ceph_log(ctx, config):
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    update_archive_setting(ctx, 'log', '/var/log/ceph')

    try:
        yield

    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise

    finally:
        log.info('Checking cluster log for badness...')
        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurrence of the pattern specified in the Ceph log.
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = [
                'sudo',
                'egrep', pattern,
                '/var/log/ceph/{fsid}/ceph.log'.format(
                    fsid=fsid),
            ]
            if excludes:
                for exclude in excludes:
                    args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                run.Raw('|'), 'head', '-n', '1',
            ])
            r = ctx.ceph[cluster_name].bootstrap_remote.run(
                stdout=StringIO(),
                args=args,
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        if first_in_ceph_log(r'\[ERR\]|\[WRN\]|\[SEC\]',
                             config.get('log-ignorelist')) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in [r'\[SEC\]', r'\[ERR\]', r'\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log-ignorelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
                            )
                        break

        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            # and logs
            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph',  # all logs, not just for the cluster
                        '/var/log/rbd-target-api',  # ceph-iscsi
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            try:
                os.makedirs(path)
            except OSError:
                pass
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.shortname)
                try:
                    os.makedirs(sub)
                except OSError:
                    pass
                try:
                    teuthology.pull_directory(remote, '/var/log/ceph',  # everything
                                              os.path.join(sub, 'log'))
                except ReadError:
                    pass

@contextlib.contextmanager
def ceph_crash(ctx, config):
    """
    Gather crash dumps from /var/lib/ceph/$fsid/crash
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    update_archive_setting(ctx, 'crash', '/var/lib/ceph/crash')

    try:
        yield

    finally:
        if ctx.archive is not None:
            log.info('Archiving crash dumps...')
            path = os.path.join(ctx.archive, 'remote')
            try:
                os.makedirs(path)
            except OSError:
                pass
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.shortname)
                try:
                    os.makedirs(sub)
                except OSError:
                    pass
                try:
                    teuthology.pull_directory(remote,
                                              '/var/lib/ceph/%s/crash' % fsid,
                                              os.path.join(sub, 'crash'))
                except ReadError:
                    pass

@contextlib.contextmanager
def pull_image(ctx, config):
    cluster_name = config['cluster']
    log.info(f'Pulling image {ctx.ceph[cluster_name].image} on all hosts...')
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                ctx.cephadm,
                '--image', ctx.ceph[cluster_name].image,
                'pull',
            ],
            wait=False,
        )
    )

    try:
        yield
    finally:
        pass

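# When the task config sets 'use-ca-signed-key: true', the context manager
# below runs before bootstrap and publishes the generated key paths via
# ctx.ca_signed_key_info; ceph_bootstrap() then hands the 'private-key' and
# 'ca-signed-cert' entries to `cephadm bootstrap` via --ssh-private-key and
# --ssh-signed-cert.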
@contextlib.contextmanager
def setup_ca_signed_keys(ctx, config):
    # generate our ca key
    cluster_name = config['cluster']
    bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
    bootstrap_remote.run(args=[
        'sudo', 'ssh-keygen', '-t', 'rsa', '-f', '/root/ca-key', '-N', ''
    ])

    # not using read_file here because it runs dd as a non-root
    # user and would hit permission issues
    r = bootstrap_remote.run(args=[
        'sudo', 'cat', '/root/ca-key.pub'
    ], stdout=StringIO())
    ca_key_pub_contents = r.stdout.getvalue()

    # make CA key accepted on each host
    for remote in ctx.cluster.remotes.keys():
        # write key to each host's /etc/ssh dir
        remote.run(args=[
            'sudo', 'echo', ca_key_pub_contents,
            run.Raw('|'),
            'sudo', 'tee', '-a', '/etc/ssh/ca-key.pub',
        ])
        # make sshd accept the CA signed key
        remote.run(args=[
            'sudo', 'echo', 'TrustedUserCAKeys /etc/ssh/ca-key.pub',
            run.Raw('|'),
            'sudo', 'tee', '-a', '/etc/ssh/sshd_config',
            run.Raw('&&'),
            'sudo', 'systemctl', 'restart', 'sshd',
        ])

    # generate a new key pair and sign the pub key to make a cert
    bootstrap_remote.run(args=[
        'sudo', 'ssh-keygen', '-t', 'rsa', '-f', '/root/cephadm-ssh-key', '-N', '',
        run.Raw('&&'),
        'sudo', 'ssh-keygen', '-s', '/root/ca-key', '-I', 'user_root', '-n', 'root', '-V', '+52w', '/root/cephadm-ssh-key',
    ])

    # for debugging, to make sure this setup has worked as intended
    for remote in ctx.cluster.remotes.keys():
        remote.run(args=[
            'sudo', 'cat', '/etc/ssh/ca-key.pub'
        ])
        remote.run(args=[
            'sudo', 'cat', '/etc/ssh/sshd_config',
            run.Raw('|'),
            'grep', 'TrustedUserCAKeys'
        ])
    bootstrap_remote.run(args=[
        'sudo', 'ls', '/root/'
    ])

    ctx.ca_signed_key_info = {}
    ctx.ca_signed_key_info['ca-key'] = '/root/ca-key'
    ctx.ca_signed_key_info['ca-key-pub'] = '/root/ca-key.pub'
    ctx.ca_signed_key_info['private-key'] = '/root/cephadm-ssh-key'
    ctx.ca_signed_key_info['ca-signed-cert'] = '/root/cephadm-ssh-key-cert.pub'

    try:
        yield
    finally:
        pass

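# Bootstrap behaviour is driven by the task config keys handled below, e.g.
# (illustrative subset; values are placeholders):
#
#   tasks:
#   - cephadm:
#       skip_dashboard: true
#       skip_monitoring_stack: true
#       registry-login:
#         url: registry-url
#         username: registry-user
#         password: registry-password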
@contextlib.contextmanager
def ceph_bootstrap(ctx, config):
    """
    Bootstrap ceph cluster.

    :param ctx: the argparse.Namespace object
    :param config: the config dict
    """
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)
    fsid = ctx.ceph[cluster_name].fsid

    bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
    first_mon = ctx.ceph[cluster_name].first_mon
    first_mon_role = ctx.ceph[cluster_name].first_mon_role
    mons = ctx.ceph[cluster_name].mons

    ctx.cluster.run(args=[
        'sudo', 'mkdir', '-p', '/etc/ceph',
    ])
    ctx.cluster.run(args=[
        'sudo', 'chmod', '777', '/etc/ceph',
    ])
    try:
        # write seed config
        log.info('Writing seed config...')
        conf_fp = BytesIO()
        seed_config = build_initial_config(ctx, config)
        seed_config.write(conf_fp)
        bootstrap_remote.write_file(
            path='{}/seed.{}.conf'.format(testdir, cluster_name),
            data=conf_fp.getvalue())
        log.debug('Final config:\n' + conf_fp.getvalue().decode())
        ctx.ceph[cluster_name].conf = seed_config

        # register initial daemons
        ctx.daemons.register_daemon(
            bootstrap_remote, 'mon', first_mon,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild('mon.' + first_mon),
            wait=False,
            started=True,
        )
        if not ctx.ceph[cluster_name].roleless:
            first_mgr = ctx.ceph[cluster_name].first_mgr
            ctx.daemons.register_daemon(
                bootstrap_remote, 'mgr', first_mgr,
                cluster=cluster_name,
                fsid=fsid,
                logger=log.getChild('mgr.' + first_mgr),
                wait=False,
                started=True,
            )

        # bootstrap
        log.info('Bootstrapping...')
        cmd = [
            'sudo',
            ctx.cephadm,
            '--image', ctx.ceph[cluster_name].image,
            '-v',
            'bootstrap',
            '--fsid', fsid,
            '--config', '{}/seed.{}.conf'.format(testdir, cluster_name),
            '--output-config', '/etc/ceph/{}.conf'.format(cluster_name),
            '--output-keyring',
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
        ]

        if not config.get("use-ca-signed-key", False):
            cmd += ['--output-pub-ssh-key', '{}/{}.pub'.format(testdir, cluster_name)]
        else:
            # ctx.ca_signed_key_info should have been set up in
            # setup_ca_signed_keys function which we expect to have
            # run before bootstrap if use-ca-signed-key is true
            signed_key_info = ctx.ca_signed_key_info
            cmd += [
                "--ssh-private-key", signed_key_info['private-key'],
                "--ssh-signed-cert", signed_key_info['ca-signed-cert'],
            ]

        if config.get("no_cgroups_split") is True:
            cmd.insert(cmd.index("bootstrap"), "--no-cgroups-split")

        if config.get('registry-login'):
            registry = config['registry-login']
            cmd += [
                "--registry-url", registry['url'],
                "--registry-username", registry['username'],
                "--registry-password", registry['password'],
            ]

        if not ctx.ceph[cluster_name].roleless:
            cmd += [
                '--mon-id', first_mon,
                '--mgr-id', first_mgr,
                '--orphan-initial-daemons',  # we will do it explicitly!
                '--skip-monitoring-stack',   # we'll provision these explicitly
            ]

        if mons[first_mon_role].startswith('['):
            cmd += ['--mon-addrv', mons[first_mon_role]]
        else:
            cmd += ['--mon-ip', mons[first_mon_role]]
        if config.get('skip_dashboard'):
            cmd += ['--skip-dashboard']
        if config.get('skip_monitoring_stack'):
            cmd += ['--skip-monitoring-stack']
        if config.get('single_host_defaults'):
            cmd += ['--single-host-defaults']
        if not config.get('avoid_pacific_features', False):
            cmd += ['--skip-admin-label']
        # bootstrap makes the keyring root 0600, so +r it for our purposes
        cmd += [
            run.Raw('&&'),
            'sudo', 'chmod', '+r',
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
        ]
        bootstrap_remote.run(args=cmd)

        # fetch keys and configs
        log.info('Fetching config...')
        ctx.ceph[cluster_name].config_file = \
            bootstrap_remote.read_file(f'/etc/ceph/{cluster_name}.conf')
        log.info('Fetching client.admin keyring...')
        ctx.ceph[cluster_name].admin_keyring = \
            bootstrap_remote.read_file(f'/etc/ceph/{cluster_name}.client.admin.keyring')
        log.info('Fetching mon keyring...')
        ctx.ceph[cluster_name].mon_keyring = \
            bootstrap_remote.read_file(f'/var/lib/ceph/{fsid}/mon.{first_mon}/keyring', sudo=True)

        if not config.get("use-ca-signed-key", False):
            # fetch ssh key, distribute to additional nodes
            log.info('Fetching pub ssh key...')
            ssh_pub_key = bootstrap_remote.read_file(
                f'{testdir}/{cluster_name}.pub').decode('ascii').strip()

            log.info('Installing pub ssh key for root users...')
            ctx.cluster.run(args=[
                'sudo', 'install', '-d', '-m', '0700', '/root/.ssh',
                run.Raw('&&'),
                'echo', ssh_pub_key,
                run.Raw('|'),
                'sudo', 'tee', '-a', '/root/.ssh/authorized_keys',
                run.Raw('&&'),
                'sudo', 'chmod', '0600', '/root/.ssh/authorized_keys',
            ])

        # set options
        if config.get('allow_ptrace', True):
            _shell(ctx, cluster_name, bootstrap_remote,
                   ['ceph', 'config', 'set', 'mgr', 'mgr/cephadm/allow_ptrace', 'true'])

        if not config.get('avoid_pacific_features', False):
            log.info('Distributing conf and client.admin keyring to all hosts + 0755')
            _shell(ctx, cluster_name, bootstrap_remote,
                   ['ceph', 'orch', 'client-keyring', 'set', 'client.admin',
                    '*', '--mode', '0755'],
                   check_status=False)

        # add other hosts
        for remote in ctx.cluster.remotes.keys():
            if remote == bootstrap_remote:
                continue

            # note: this may be redundant (see above), but it avoids
            # us having to wait for cephadm to do it.
            log.info('Writing (initial) conf and keyring to %s' % remote.shortname)
            remote.write_file(
                path='/etc/ceph/{}.conf'.format(cluster_name),
                data=ctx.ceph[cluster_name].config_file)
            remote.write_file(
                path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
                data=ctx.ceph[cluster_name].admin_keyring)

            log.info('Adding host %s to orchestrator...' % remote.shortname)
            _shell(ctx, cluster_name, bootstrap_remote, [
                'ceph', 'orch', 'host', 'add',
                remote.shortname
            ])
            r = _shell(ctx, cluster_name, bootstrap_remote,
                       ['ceph', 'orch', 'host', 'ls', '--format=json'],
                       stdout=StringIO())
            hosts = [node['hostname'] for node in json.loads(r.stdout.getvalue())]
            assert remote.shortname in hosts

        yield

    finally:
        log.info('Cleaning up testdir ceph.* files...')
        ctx.cluster.run(args=[
            'rm', '-f',
            '{}/seed.{}.conf'.format(testdir, cluster_name),
            '{}/{}.pub'.format(testdir, cluster_name),
        ])

        log.info('Stopping all daemons...')

        # this doesn't block until they are all stopped...
        #ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])

        # stop the daemons we know
        for role in ctx.daemons.resolve_role_list(None, CEPH_ROLE_TYPES, True):
            cluster, type_, id_ = teuthology.split_role(role)
            try:
                ctx.daemons.get_daemon(type_, id_, cluster).stop()
            except Exception:
                log.exception(f'Failed to stop "{role}"')
                raise

        # tear down anything left (but leave the logs behind)
        ctx.cluster.run(
            args=[
                'sudo',
                ctx.cephadm,
                'rm-cluster',
                '--fsid', fsid,
                '--force',
                '--keep-logs',
            ],
            check_status=False,  # may fail if upgrading from old cephadm
        )

        # clean up /etc/ceph
        ctx.cluster.run(args=[
            'sudo', 'rm', '-f',
            '/etc/ceph/{}.conf'.format(cluster_name),
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
        ])

@contextlib.contextmanager
def ceph_mons(ctx, config):
    """
    Deploy any additional mons
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        daemons = {}
        if config.get('add_mons_via_daemon_add'):
            # This is the old way of adding mons that works with the (early) octopus
            # cephadm scheduler.
            num_mons = 1
            for remote, roles in ctx.cluster.remotes.items():
                for mon in [r for r in roles
                            if teuthology.is_type('mon', cluster_name)(r)]:
                    c_, _, id_ = teuthology.split_role(mon)
                    if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
                        continue
                    log.info('Adding %s on %s' % (mon, remote.shortname))
                    num_mons += 1
                    _shell(ctx, cluster_name, remote, [
                        'ceph', 'orch', 'daemon', 'add', 'mon',
                        remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_,
                    ])
                    ctx.daemons.register_daemon(
                        remote, 'mon', id_,
                        cluster=cluster_name,
                        fsid=fsid,
                        logger=log.getChild(mon),
                        wait=False,
                        started=True,
                    )
                    daemons[mon] = (remote, id_)

            with contextutil.safe_while(sleep=1, tries=180) as proceed:
                while proceed():
                    log.info('Waiting for %d mons in monmap...' % (num_mons))
                    r = _shell(
                        ctx=ctx,
                        cluster_name=cluster_name,
                        remote=remote,
                        args=[
                            'ceph', 'mon', 'dump', '-f', 'json',
                        ],
                        stdout=StringIO(),
                    )
                    j = json.loads(r.stdout.getvalue())
                    if len(j['mons']) == num_mons:
                        break
        else:
            nodes = []
            for remote, roles in ctx.cluster.remotes.items():
                for mon in [r for r in roles
                            if teuthology.is_type('mon', cluster_name)(r)]:
                    c_, _, id_ = teuthology.split_role(mon)
                    log.info('Adding %s on %s' % (mon, remote.shortname))
                    nodes.append(remote.shortname
                                 + ':' + ctx.ceph[cluster_name].mons[mon]
                                 + '=' + id_)
                    if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
                        continue
                    daemons[mon] = (remote, id_)

            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'apply', 'mon',
                str(len(nodes)) + ';' + ';'.join(nodes)]
            )
            for mon, i in daemons.items():
                remote, id_ = i
                ctx.daemons.register_daemon(
                    remote, 'mon', id_,
                    cluster=cluster_name,
                    fsid=fsid,
                    logger=log.getChild(mon),
                    wait=False,
                    started=True,
                )

            with contextutil.safe_while(sleep=1, tries=180) as proceed:
                while proceed():
                    log.info('Waiting for %d mons in monmap...' % (len(nodes)))
                    r = _shell(
                        ctx=ctx,
                        cluster_name=cluster_name,
                        remote=remote,
                        args=[
                            'ceph', 'mon', 'dump', '-f', 'json',
                        ],
                        stdout=StringIO(),
                    )
                    j = json.loads(r.stdout.getvalue())
                    if len(j['mons']) == len(nodes):
                        break

        # refresh our (final) ceph.conf file
        bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote
        log.info('Generating final ceph.conf file...')
        r = _shell(
            ctx=ctx,
            cluster_name=cluster_name,
            remote=bootstrap_remote,
            args=[
                'ceph', 'config', 'generate-minimal-conf',
            ],
            stdout=StringIO(),
        )
        ctx.ceph[cluster_name].config_file = r.stdout.getvalue()

        yield

    finally:
        pass

@contextlib.contextmanager
def ceph_mgrs(ctx, config):
    """
    Deploy any additional mgrs
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        nodes = []
        daemons = {}
        for remote, roles in ctx.cluster.remotes.items():
            for mgr in [r for r in roles
                        if teuthology.is_type('mgr', cluster_name)(r)]:
                c_, _, id_ = teuthology.split_role(mgr)
                log.info('Adding %s on %s' % (mgr, remote.shortname))
                nodes.append(remote.shortname + '=' + id_)
                if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mgr:
                    continue
                daemons[mgr] = (remote, id_)
        if nodes:
            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'apply', 'mgr',
                str(len(nodes)) + ';' + ';'.join(nodes)]
            )
        for mgr, i in daemons.items():
            remote, id_ = i
            ctx.daemons.register_daemon(
                remote, 'mgr', id_,
                cluster=cluster_name,
                fsid=fsid,
                logger=log.getChild(mgr),
                wait=False,
                started=True,
            )

        yield

    finally:
        pass

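# OSDs below are added one scratch device at a time via
# `ceph orch daemon add osd <host>:<dev>`. An optional 'osd_method' task
# config value is appended verbatim to that command to pick the provisioning
# method (hypothetical example; check your cephadm version for valid values):
#
#   tasks:
#   - cephadm:
#       osd_method: raw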
@contextlib.contextmanager
def ceph_osds(ctx, config):
    """
    Deploy OSDs
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        log.info('Deploying OSDs...')

        # provision OSDs in numeric order
        id_to_remote = {}
        devs_by_remote = {}
        for remote, roles in ctx.cluster.remotes.items():
            devs_by_remote[remote] = teuthology.get_scratch_devices(remote)
            for osd in [r for r in roles
                        if teuthology.is_type('osd', cluster_name)(r)]:
                _, _, id_ = teuthology.split_role(osd)
                id_to_remote[int(id_)] = (osd, remote)

        cur = 0
        for osd_id in sorted(id_to_remote.keys()):
            osd, remote = id_to_remote[osd_id]
            _, _, id_ = teuthology.split_role(osd)
            assert int(id_) == cur
            devs = devs_by_remote[remote]
            assert devs   ## FIXME ##
            dev = devs.pop()
            if all(_ in dev for _ in ('lv', 'vg')):
                short_dev = dev.replace('/dev/', '')
            else:
                short_dev = dev
            log.info('Deploying %s on %s with %s...' % (
                osd, remote.shortname, dev))
            _shell(ctx, cluster_name, remote, [
                'ceph-volume', 'lvm', 'zap', dev])
            add_osd_args = ['ceph', 'orch', 'daemon', 'add', 'osd',
                            remote.shortname + ':' + short_dev]
            osd_method = config.get('osd_method')
            if osd_method:
                add_osd_args.append(osd_method)
            _shell(ctx, cluster_name, remote, add_osd_args)
            ctx.daemons.register_daemon(
                remote, 'osd', id_,
                cluster=cluster_name,
                fsid=fsid,
                logger=log.getChild(osd),
                wait=False,
                started=True,
            )
            cur += 1

        if cur == 0:
            _shell(ctx, cluster_name, remote, [
                'ceph', 'orch', 'apply', 'osd', '--all-available-devices',
            ])
            # expect the number of scratch devs
            num_osds = sum(map(len, devs_by_remote.values()))
            assert num_osds
        else:
            # expect the number of OSDs we created
            num_osds = cur

        log.info(f'Waiting for {num_osds} OSDs to come up...')
        with contextutil.safe_while(sleep=1, tries=120) as proceed:
            while proceed():
                p = _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
                           ['ceph', 'osd', 'stat', '-f', 'json'], stdout=StringIO())
                j = json.loads(p.stdout.getvalue())
                if int(j.get('num_up_osds', 0)) == num_osds:
                    break

        if not hasattr(ctx, 'managers'):
            ctx.managers = {}
        ctx.managers[cluster_name] = CephManager(
            ctx.ceph[cluster_name].bootstrap_remote,
            ctx=ctx,
            logger=log.getChild('ceph_manager.' + cluster_name),
            cluster=cluster_name,
            cephadm=True,
        )

        yield
    finally:
        pass

@contextlib.contextmanager
def ceph_mdss(ctx, config):
    """
    Deploy MDSs
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = []
    daemons = {}
    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type('mds', cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
            nodes.append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)
    if nodes:
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', 'mds',
            'all',
            str(len(nodes)) + ';' + ';'.join(nodes)]
        )
    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, 'mds', id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield

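# CephFS creation below is driven by the optional 'cephfs' section of the task
# config; the 'fs' list defaults to a single filesystem named 'cephfs', and
# more than one entry enables multi-fs support. Illustrative shape:
#
#   tasks:
#   - cephadm:
#       cephfs:
#         fs:
#         - name: a
#         - name: b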
@contextlib.contextmanager
def cephfs_setup(ctx, config):
    mdss = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))

    # If there are any MDSs, then create a filesystem for them to use.
    # Do this last because it requires the mon cluster to be up and running.
    if len(mdss) > 0:
        log.info('Setting up CephFS filesystem(s)...')
        cephfs_config = config.get('cephfs', {})
        fs_configs = cephfs_config.pop('fs', [{'name': 'cephfs'}])
        set_allow_multifs = len(fs_configs) > 1

        # wait for standbys to become available (slow due to valgrind, perhaps)
        mdsc = MDSCluster(ctx)
        with contextutil.safe_while(sleep=2, tries=150) as proceed:
            while proceed():
                if len(mdsc.get_standby_daemons()) >= len(mdss):
                    break

        fss = []
        for fs_config in fs_configs:
            assert isinstance(fs_config, dict)
            name = fs_config.pop('name')
            temp = deepcopy(cephfs_config)
            teuthology.deep_merge(temp, fs_config)
            subvols = config.get('subvols', None)
            if subvols:
                teuthology.deep_merge(temp, {'subvols': subvols})
            fs = Filesystem(ctx, fs_config=temp, name=name, create=True)
            if set_allow_multifs:
                fs.set_allow_multifs()
                set_allow_multifs = False
            fss.append(fs)

        yield

        for fs in fss:
            fs.destroy()
    else:
        yield

@contextlib.contextmanager
def ceph_monitoring(daemon_type, ctx, config):
    """
    Deploy prometheus, node-exporter, etc.
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = []
    daemons = {}
    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type(daemon_type, cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
            nodes.append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)
    if nodes:
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', daemon_type,
            str(len(nodes)) + ';' + ';'.join(nodes)]
        )
    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, daemon_type, id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield

@contextlib.contextmanager
def ceph_rgw(ctx, config):
    """
    Deploy rgw
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = {}
    daemons = {}
    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type('rgw', cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
            svc = '.'.join(id_.split('.')[0:2])
            if svc not in nodes:
                nodes[svc] = []
            nodes[svc].append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)

    for svc, nodes in nodes.items():
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', 'rgw', svc,
            '--placement',
            str(len(nodes)) + ';' + ';'.join(nodes)]
        )
    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, 'rgw', id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield

@contextlib.contextmanager
def ceph_iscsi(ctx, config):
    """
    Deploy iSCSIs
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    nodes = []
    daemons = {}
    ips = []

    for remote, roles in ctx.cluster.remotes.items():
        for role in [r for r in roles
                     if teuthology.is_type('iscsi', cluster_name)(r)]:
            c_, _, id_ = teuthology.split_role(role)
            log.info('Adding %s on %s' % (role, remote.shortname))
            nodes.append(remote.shortname + '=' + id_)
            daemons[role] = (remote, id_)
            ips.append(remote.ip_address)
    trusted_ip_list = ','.join(ips)
    if nodes:
        poolname = 'datapool'
        # ceph osd pool create datapool 3 3 replicated
        _shell(ctx, cluster_name, remote, [
            'ceph', 'osd', 'pool', 'create',
            poolname, '3', '3', 'replicated']
        )

        _shell(ctx, cluster_name, remote, [
            'rbd', 'pool', 'init', poolname]
        )

        # ceph orch apply iscsi datapool (admin)user (admin)password
        _shell(ctx, cluster_name, remote, [
            'ceph', 'orch', 'apply', 'iscsi',
            poolname, 'admin', 'admin',
            '--trusted_ip_list', trusted_ip_list,
            '--placement', str(len(nodes)) + ';' + ';'.join(nodes)]
        )

        # used by iscsi clients to identify valid gateway IPs
        conf_data = dedent(f"""
        [config]
        trusted_ip_list = {trusted_ip_list}
        """)
        distribute_iscsi_gateway_cfg(ctx, conf_data)

    for role, i in daemons.items():
        remote, id_ = i
        ctx.daemons.register_daemon(
            remote, 'iscsi', id_,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild(role),
            wait=False,
            started=True,
        )

    yield

@contextlib.contextmanager
def ceph_clients(ctx, config):
    cluster_name = config['cluster']

    log.info('Setting up client nodes...')
    clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
    for remote, roles_for_host in clients.remotes.items():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
                                                     cluster_name):
            name = teuthology.ceph_role(role)
            client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name,
                                                                name)
            r = _shell(
                ctx=ctx,
                cluster_name=cluster_name,
                remote=remote,
                args=[
                    'ceph', 'auth',
                    'get-or-create', name,
                    'mon', 'allow *',
                    'osd', 'allow *',
                    'mds', 'allow *',
                    'mgr', 'allow *',
                ],
                stdout=StringIO(),
            )
            keyring = r.stdout.getvalue()
            remote.sudo_write_file(client_keyring, keyring, mode='0644')
    yield

@contextlib.contextmanager
def ceph_initial():
    try:
        yield
    finally:
        log.info('Teardown complete')

## public methods
@contextlib.contextmanager
def stop(ctx, config):
    """
    Stop ceph daemons

    For example::
        tasks:
        - ceph.stop: [mds.*]

        tasks:
        - ceph.stop: [osd.0, osd.2]

        tasks:
        - ceph.stop:
            daemons: [osd.0, osd.2]

    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(
        config.get('daemons', None), CEPH_ROLE_TYPES, True)
    clusters = set()

    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).stop()
        clusters.add(cluster)

#    for cluster in clusters:
#        ctx.ceph[cluster].watchdog.stop()
#        ctx.ceph[cluster].watchdog.join()

    yield

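# shell() maps roles (or 'all-roles'/'all-hosts') to a command string or a
# list of commands, with optional 'env' and 'volumes' lists that are passed to
# `cephadm shell`, e.g. (illustrative):
#
#   tasks:
#   - cephadm.shell:
#       env: [sha1]
#       host.a:
#       - ceph orch status
#       - ceph orch host ls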
def shell(ctx, config):
    """
    Execute (shell) commands
    """
    cluster_name = config.get('cluster', 'ceph')

    args = []
    for k in config.pop('env', []):
        args.extend(['-e', k + '=' + ctx.config.get(k, '')])
    for k in config.pop('volumes', []):
        args.extend(['-v', k])

    if 'all-roles' in config and len(config) == 1:
        a = config['all-roles']
        roles = teuthology.all_roles(ctx.cluster)
        config = dict((id_, a) for id_ in roles if not id_.startswith('host.'))
    elif 'all-hosts' in config and len(config) == 1:
        a = config['all-hosts']
        roles = teuthology.all_roles(ctx.cluster)
        config = dict((id_, a) for id_ in roles if id_.startswith('host.'))

    for role, cmd in config.items():
        (remote,) = ctx.cluster.only(role).remotes.keys()
        log.info('Running commands on role %s host %s', role, remote.name)
        if isinstance(cmd, list):
            for c in cmd:
                _shell(ctx, cluster_name, remote,
                       ['bash', '-c', subst_vip(ctx, c)],
                       extra_cephadm_args=args)
        else:
            assert isinstance(cmd, str)
            _shell(ctx, cluster_name, remote,
                   ['bash', '-ex', '-c', subst_vip(ctx, cmd)],
                   extra_cephadm_args=args)


def apply(ctx, config):
    """
    Apply spec

      tasks:
        - cephadm.apply:
            specs:
            - service_type: rgw
              service_id: foo
              spec:
                rgw_frontend_port: 8000
            - service_type: rgw
              service_id: bar
              spec:
                rgw_frontend_port: 9000
                zone: bar
                realm: asdf

    """
    cluster_name = config.get('cluster', 'ceph')

    specs = config.get('specs', [])
    y = subst_vip(ctx, yaml.dump_all(specs))

    log.info(f'Applying spec(s):\n{y}')
    _shell(
        ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
        ['ceph', 'orch', 'apply', '-i', '-'],
        stdin=y,
    )


def wait_for_service(ctx, config):
    """
    Wait for a service to be fully started

      tasks:
        - cephadm.wait_for_service:
            service: rgw.foo
            timeout: 60    # defaults to 300

    """
    cluster_name = config.get('cluster', 'ceph')
    timeout = config.get('timeout', 300)
    service = config.get('service')
    assert service

    log.info(
        f'Waiting for {cluster_name} service {service} to start (timeout {timeout})...'
    )
    with contextutil.safe_while(sleep=1, tries=timeout) as proceed:
        while proceed():
            r = _shell(
                ctx=ctx,
                cluster_name=cluster_name,
                remote=ctx.ceph[cluster_name].bootstrap_remote,
                args=[
                    'ceph', 'orch', 'ls', '-f', 'json',
                ],
                stdout=StringIO(),
            )
            j = json.loads(r.stdout.getvalue())
            svc = None
            for s in j:
                if s['service_name'] == service:
                    svc = s
                    break
            if svc:
                log.info(
                    f"{service} has {s['status']['running']}/{s['status']['size']}"
                )
                if s['status']['running'] == s['status']['size']:
                    break

@contextlib.contextmanager
def tweaked_option(ctx, config):
    """
    Set an option, and then restore its original value.

    Note: because of the way tasks are executed/nested, it is not recommended
    to use this as a standalone task; otherwise it would likely restore the
    tweaked option only at the /end/ of the 'tasks' block.
    """
    saved_options = {}
    # we can complicate this when necessary
    options = ['mon-health-to-clog']
    type_, id_ = 'mon', '*'
    cluster = config.get('cluster', 'ceph')
    manager = ctx.managers[cluster]
    if id_ == '*':
        get_from = next(teuthology.all_roles_of_type(ctx.cluster, type_))
    else:
        get_from = id_
    for option in options:
        if option not in config:
            continue
        value = 'true' if config[option] else 'false'
        option = option.replace('-', '_')
        old_value = manager.get_config(type_, get_from, option)
        if value != old_value:
            saved_options[option] = old_value
            manager.inject_args(type_, id_, option, value)
    yield
    for option, value in saved_options.items():
        manager.inject_args(type_, id_, option, value)

@contextlib.contextmanager
def restart(ctx, config):
    """
    restart ceph daemons

    For example::
        tasks:
        - ceph.restart: [all]

    For example::
        tasks:
        - ceph.restart: [osd.0, mon.1, mds.*]

    or::

        tasks:
        - ceph.restart:
            daemons: [osd.0, mon.1]
            wait-for-healthy: false
            wait-for-osds-up: true

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(
        config.get('daemons', None), CEPH_ROLE_TYPES, True)
    clusters = set()

    log.info('daemons %s' % daemons)
    with tweaked_option(ctx, config):
        for role in daemons:
            cluster, type_, id_ = teuthology.split_role(role)
            d = ctx.daemons.get_daemon(type_, id_, cluster)
            assert d, 'daemon %s does not exist' % role
            d.stop()
            if type_ == 'osd':
                ctx.managers[cluster].mark_down_osd(id_)
            d.restart()
            clusters.add(cluster)

    if config.get('wait-for-healthy', True):
        for cluster in clusters:
            healthy(ctx=ctx, config=dict(cluster=cluster))
    if config.get('wait-for-osds-up', False):
        for cluster in clusters:
            ctx.managers[cluster].wait_for_all_osds_up()
    yield

@contextlib.contextmanager
def distribute_config_and_admin_keyring(ctx, config):
    """
    Distribute a sufficient config and keyring for clients
    """
    cluster_name = config['cluster']
    log.info('Distributing (final) config and client.admin keyring...')
    for remote, roles in ctx.cluster.remotes.items():
        remote.write_file(
            '/etc/ceph/{}.conf'.format(cluster_name),
            ctx.ceph[cluster_name].config_file,
            sudo=True)
        remote.write_file(
            path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
            data=ctx.ceph[cluster_name].admin_keyring,
            sudo=True)
    try:
        yield
    finally:
        ctx.cluster.run(args=[
            'sudo', 'rm', '-f',
            '/etc/ceph/{}.conf'.format(cluster_name),
            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
        ])

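# The tunables profile comes from the optional 'crush_tunables' task config
# key and defaults to 'default', e.g. (illustrative):
#
#   tasks:
#   - cephadm:
#       crush_tunables: jewel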
@contextlib.contextmanager
def crush_setup(ctx, config):
    cluster_name = config['cluster']

    profile = config.get('crush_tunables', 'default')
    log.info('Setting crush tunables to %s', profile)
    _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
           args=['ceph', 'osd', 'crush', 'tunables', profile])
    yield

@contextlib.contextmanager
def create_rbd_pool(ctx, config):
    if config.get('create_rbd_pool', False):
        cluster_name = config['cluster']
        log.info('Waiting for OSDs to come up')
        teuthology.wait_until_osds_up(
            ctx,
            cluster=ctx.cluster,
            remote=ctx.ceph[cluster_name].bootstrap_remote,
            ceph_cluster=cluster_name,
        )
        log.info('Creating RBD pool')
        _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
               args=['sudo', 'ceph', '--cluster', cluster_name,
                     'osd', 'pool', 'create', 'rbd', '8'])
        _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote,
               args=['sudo', 'ceph', '--cluster', cluster_name,
                     'osd', 'pool', 'application', 'enable',
                     'rbd', 'rbd', '--yes-i-really-mean-it'
                     ])
    yield

@contextlib.contextmanager
def _bypass():
    yield

@contextlib.contextmanager
def initialize_config(ctx, config):
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)

    ctx.ceph[cluster_name].thrashers = []
    # fixme: setup watchdog, ala ceph.py

    ctx.ceph[cluster_name].roleless = False  # see below

    first_ceph_cluster = False
    if not hasattr(ctx, 'daemons'):
        first_ceph_cluster = True

    # cephadm mode?
    if 'cephadm_mode' not in config:
        config['cephadm_mode'] = 'root'
    assert config['cephadm_mode'] in ['root', 'cephadm-package']
    if config['cephadm_mode'] == 'root':
        ctx.cephadm = testdir + '/cephadm'
    else:
        ctx.cephadm = 'cephadm'  # in the path

    if first_ceph_cluster:
        # FIXME: this is global for all clusters
        ctx.daemons = DaemonGroup(
            use_cephadm=ctx.cephadm)

    # uuid
    fsid = str(uuid.uuid1())
    log.info('Cluster fsid is %s' % fsid)
    ctx.ceph[cluster_name].fsid = fsid

    # mon ips
    log.info('Choosing monitor IPs and ports...')
    remotes_and_roles = ctx.cluster.remotes.items()
    ips = [host for (host, port) in
           (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]

    if config.get('roleless', False):
        # mons will be named after hosts
        first_mon = None
        max_mons = config.get('max_mons', 5)
        for remote, _ in remotes_and_roles:
            ctx.cluster.remotes[remote].append('mon.' + remote.shortname)
            if not first_mon:
                first_mon = remote.shortname
                bootstrap_remote = remote
            max_mons -= 1
            if not max_mons:
                break
        log.info('No mon roles; fabricating mons')

    roles = [role_list for (remote, role_list) in ctx.cluster.remotes.items()]

    ctx.ceph[cluster_name].mons = get_mons(
        roles, ips, cluster_name,
        mon_bind_msgr2=config.get('mon_bind_msgr2', True),
        mon_bind_addrvec=config.get('mon_bind_addrvec', True),
    )
    log.info('Monitor IPs: %s' % ctx.ceph[cluster_name].mons)

    if config.get('roleless', False):
        ctx.ceph[cluster_name].roleless = True
        ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
        ctx.ceph[cluster_name].first_mon = first_mon
        ctx.ceph[cluster_name].first_mon_role = 'mon.' + first_mon
    else:
        first_mon_role = sorted(ctx.ceph[cluster_name].mons.keys())[0]
        _, _, first_mon = teuthology.split_role(first_mon_role)
        (bootstrap_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
        log.info('First mon is mon.%s on %s' % (first_mon,
                                                bootstrap_remote.shortname))
        ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
        ctx.ceph[cluster_name].first_mon = first_mon
        ctx.ceph[cluster_name].first_mon_role = first_mon_role

        others = ctx.cluster.remotes[bootstrap_remote]
        mgrs = sorted([r for r in others
                       if teuthology.is_type('mgr', cluster_name)(r)])
        if not mgrs:
            raise RuntimeError('no mgrs on the same host as first mon %s' % first_mon)
        _, _, first_mgr = teuthology.split_role(mgrs[0])
        log.info('First mgr is %s' % (first_mgr))
        ctx.ceph[cluster_name].first_mgr = first_mgr
    yield

@contextlib.contextmanager
def task(ctx, config):
    """
    Deploy ceph cluster using cephadm

    For example, teuthology.yaml can contain the 'defaults' section:

        defaults:
          cephadm:
            containers:
              image: 'quay.io/ceph-ci/ceph'

    Using overrides makes it possible to customize it per run.
    The equivalent 'overrides' section looks like:

        overrides:
          cephadm:
            containers:
              image: 'quay.io/ceph-ci/ceph'
            registry-login:
              url: registry-url
              username: registry-user
              password: registry-password

    :param ctx: the argparse.Namespace object
    :param config: the config dict
    """
    if config is None:
        config = {}

    assert isinstance(config, dict), \
        "task only supports a dictionary for configuration"

    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph', {}))
    teuthology.deep_merge(config, overrides.get('cephadm', {}))
    log.info('Config: ' + str(config))

    # set up cluster context
    if not hasattr(ctx, 'ceph'):
        ctx.ceph = {}
    if 'cluster' not in config:
        config['cluster'] = 'ceph'
    cluster_name = config['cluster']
    if cluster_name not in ctx.ceph:
        ctx.ceph[cluster_name] = argparse.Namespace()
        ctx.ceph[cluster_name].bootstrapped = False

    # image
    teuth_defaults = teuth_config.get('defaults', {})
    cephadm_defaults = teuth_defaults.get('cephadm', {})
    containers_defaults = cephadm_defaults.get('containers', {})
    container_image_name = containers_defaults.get('image', None)

    containers = config.get('containers', {})
    container_image_name = containers.get('image', container_image_name)

    if not hasattr(ctx.ceph[cluster_name], 'image'):
        ctx.ceph[cluster_name].image = config.get('image')
    ref = ctx.config.get("branch", "main")
    if not ctx.ceph[cluster_name].image:
        if not container_image_name:
            raise Exception("Configuration error occurred. "
                            "The 'image' value is undefined for 'cephadm' task. "
                            "Please provide corresponding options in the task's "
                            "config, task 'overrides', or teuthology 'defaults' "
                            "section.")
        sha1 = config.get('sha1')
        flavor = config.get('flavor', 'default')

        if sha1:
            if flavor == "crimson":
                ctx.ceph[cluster_name].image = container_image_name + ':' + sha1 + '-' + flavor
            else:
                ctx.ceph[cluster_name].image = container_image_name + ':' + sha1
            ref = sha1
        else:
            # fall back to using the branch value
            ctx.ceph[cluster_name].image = container_image_name + ':' + ref
    log.info('Cluster image is %s' % ctx.ceph[cluster_name].image)

    with contextutil.nested(
            # if the cluster is already bootstrapped bypass corresponding methods
            lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped) \
                else initialize_config(ctx=ctx, config=config),
            lambda: ceph_initial(),
            lambda: normalize_hostnames(ctx=ctx),
            lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped) \
                else download_cephadm(ctx=ctx, config=config, ref=ref),
            lambda: ceph_log(ctx=ctx, config=config),
            lambda: ceph_crash(ctx=ctx, config=config),
            lambda: pull_image(ctx=ctx, config=config),
            lambda: _bypass() if not (config.get('use-ca-signed-key', False)) \
                else setup_ca_signed_keys(ctx, config),
            lambda: _bypass() if (ctx.ceph[cluster_name].bootstrapped) \
                else ceph_bootstrap(ctx, config),
            lambda: crush_setup(ctx=ctx, config=config),
            lambda: ceph_mons(ctx=ctx, config=config),
            lambda: distribute_config_and_admin_keyring(ctx=ctx, config=config),
            lambda: ceph_mgrs(ctx=ctx, config=config),
            lambda: ceph_osds(ctx=ctx, config=config),
            lambda: ceph_mdss(ctx=ctx, config=config),
            lambda: cephfs_setup(ctx=ctx, config=config),
            lambda: ceph_rgw(ctx=ctx, config=config),
            lambda: ceph_iscsi(ctx=ctx, config=config),
            lambda: ceph_monitoring('prometheus', ctx=ctx, config=config),
            lambda: ceph_monitoring('node-exporter', ctx=ctx, config=config),
            lambda: ceph_monitoring('alertmanager', ctx=ctx, config=config),
            lambda: ceph_monitoring('grafana', ctx=ctx, config=config),
            lambda: ceph_clients(ctx=ctx, config=config),
            lambda: create_rbd_pool(ctx=ctx, config=config),
    ):
        try:
            if config.get('wait-for-healthy', True):
                healthy(ctx=ctx, config=config)

            log.info('Setup complete, yielding')
            yield

        finally:
            log.info('Teardown begin')