]> git.proxmox.com Git - ceph.git/blame - ceph/qa/tasks/ceph.py
bump version to 18.2.2-pve1
[ceph.git] / ceph / qa / tasks / ceph.py
CommitLineData
7c673cae
FG
1"""
2Ceph cluster task.
3
4Handle the setup, starting, and clean-up of a Ceph cluster.
5"""
f67539c2 6from copy import deepcopy
9f95a23c 7from io import BytesIO
f91f0fd5 8from io import StringIO
7c673cae
FG
9
10import argparse
11fdf7f2 11import configobj
7c673cae
FG
12import contextlib
13import errno
14import logging
15import os
16import json
17import time
18import gevent
11fdf7f2 19import re
7c673cae 20import socket
f67539c2 21import yaml
7c673cae
FG
22
23from paramiko import SSHException
f67539c2 24from tasks.ceph_manager import CephManager, write_conf, get_valgrind_args
11fdf7f2 25from tarfile import ReadError
f67539c2 26from tasks.cephfs.filesystem import MDSCluster, Filesystem
7c673cae
FG
27from teuthology import misc as teuthology
28from teuthology import contextutil
29from teuthology import exceptions
30from teuthology.orchestra import run
f67539c2 31from tasks import ceph_client as cclient
7c673cae 32from teuthology.orchestra.daemon import DaemonGroup
9f95a23c 33from tasks.daemonwatchdog import DaemonWatchdog
7c673cae
FG
34
35CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']
11fdf7f2 36DATA_PATH = '/var/lib/ceph/{type_}/{cluster}-{id_}'
7c673cae
FG
37
38log = logging.getLogger(__name__)
39
40
def generate_caps(type_):
    """
    Each call will return the next capability for each system type
    (essentially a subset of possible role values). Valid types are osd,
    mds and client.
    """
    # Default cephx capabilities handed to each daemon/client type.
    caps_by_type = {
        'osd': {
            'mon': 'allow profile osd',
            'mgr': 'allow profile osd',
            'osd': 'allow *',
        },
        'mgr': {
            'mon': 'allow profile mgr',
            'osd': 'allow *',
            'mds': 'allow *',
        },
        'mds': {
            'mon': 'allow *',
            'mgr': 'allow *',
            'osd': 'allow *',
            'mds': 'allow',
        },
        'client': {
            'mon': 'allow rw',
            'mgr': 'allow r',
            'osd': 'allow rwx',
            'mds': 'allow',
        },
    }
    # Emit ceph-authtool style argument triples: --cap <subsystem> <capability>
    for subsystem, capability in caps_by_type[type_].items():
        yield from ('--cap', subsystem, capability)
75
76
f67539c2
TL
def update_archive_setting(ctx, key, value):
    """
    Record an archive location in the job's info.yaml file so the
    archiving machinery picks it up later.

    :param ctx: Context; no-op when ctx.archive is None
    :param key: name under the 'archive' mapping, e.g. 'log' or 'crash'
    :param value: remote path to archive, e.g. '/var/log/ceph'
    """
    if ctx.archive is None:
        return
    with open(os.path.join(ctx.archive, 'info.yaml'), 'r+') as info_file:
        info_yaml = yaml.safe_load(info_file)
        info_file.seek(0)
        # An empty info.yaml loads as None; start a fresh mapping then.
        if not info_yaml:
            info_yaml = {}
        if 'archive' in info_yaml:
            info_yaml['archive'][key] = value
        else:
            info_yaml['archive'] = {key: value}
        yaml.safe_dump(info_yaml, info_file, default_flow_style=False)
        # Drop any stale trailing bytes: we rewrote from offset 0, and the
        # new document could be shorter than the old one.
        info_file.truncate()
91
92
11fdf7f2
TL
@contextlib.contextmanager
def ceph_crash(ctx, config):
    """
    Gather crash dumps from /var/lib/ceph/crash
    """
    # Record the crash directory in the job's info.yaml so it is archived.
    update_archive_setting(ctx, 'crash', '/var/lib/ceph/crash')

    try:
        yield
    finally:
        if ctx.archive is not None:
            log.info('Archiving crash dumps...')
            archive_root = os.path.join(ctx.archive, 'remote')
            try:
                os.makedirs(archive_root)
            except OSError:
                # directory may already exist from an earlier task
                pass
            for remote in ctx.cluster.remotes.keys():
                dest = os.path.join(archive_root, remote.shortname)
                try:
                    os.makedirs(dest)
                except OSError:
                    pass
                try:
                    teuthology.pull_directory(remote, '/var/lib/ceph/crash',
                                              os.path.join(dest, 'crash'))
                except ReadError:
                    # nothing to pull from this node; best effort only
                    pass
124
125
7c673cae
FG
@contextlib.contextmanager
def ceph_log(ctx, config):
    """
    Create /var/log/ceph log directory that is open to everyone.
    Add valgrind and profiling-logger directories.

    Optionally (ctx.config['log-rotate']) installs a logrotate config and
    runs logrotate in a background greenlet for the duration of the task;
    on exit, compresses and pulls the logs into the job archive.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Making ceph log dir writeable by non-root...')
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'chmod',
                '777',
                '/var/log/ceph',
            ],
            wait=False,
        )
    )
    log.info('Disabling ceph logrotate...')
    # Remove the distro's logrotate config so only our test config runs.
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'rm', '-f', '--',
                '/etc/logrotate.d/ceph',
            ],
            wait=False,
        )
    )
    log.info('Creating extra log directories...')
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'install', '-d', '-m0777', '--',
                '/var/log/ceph/valgrind',
                '/var/log/ceph/profiling-logger',
            ],
            wait=False,
        )
    )

    # Add logs directory to job's info log file
    update_archive_setting(ctx, 'log', '/var/log/ceph')

    class Rotater(object):
        # Shared event used to signal the background greenlet to stop.
        stop_event = gevent.event.Event()

        def invoke_logrotate(self):
            # 1) install ceph-test.conf in /etc/logrotate.d
            # 2) continuously loop over logrotate invocation with ceph-test.conf
            while not self.stop_event.is_set():
                self.stop_event.wait(timeout=30)
                try:
                    procs = ctx.cluster.run(
                        args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'],
                        wait=False,
                        stderr=StringIO()
                    )
                    run.wait(procs)
                except exceptions.ConnectionLostError as e:
                    # Some tests may power off nodes during test, in which
                    # case we will see connection errors that we should ignore.
                    log.debug("Missed logrotate, node '{0}' is offline".format(
                        e.node))
                except EOFError:
                    # Paramiko sometimes raises this when it fails to
                    # connect to a node during open_session. As with
                    # ConnectionLostError, we ignore this because nodes
                    # are allowed to get power cycled during tests.
                    log.debug("Missed logrotate, EOFError")
                except SSHException:
                    log.debug("Missed logrotate, SSHException")
                except run.CommandFailedError as e:
                    # Tolerate logrotate's transient "renaming temp state
                    # file" failures; re-raise anything else.
                    for p in procs:
                        if p.finished and p.exitstatus != 0:
                            err = p.stderr.getvalue()
                            if 'error: error renaming temp state file' in err:
                                log.info('ignoring transient state error: %s', e)
                            else:
                                raise
                except socket.error as e:
                    if e.errno in (errno.EHOSTUNREACH, errno.ECONNRESET):
                        log.debug("Missed logrotate, host unreachable")
                    else:
                        raise

        def begin(self):
            self.thread = gevent.spawn(self.invoke_logrotate)

        def end(self):
            self.stop_event.set()
            # propagate any exception raised inside the greenlet
            self.thread.get()

    def write_rotate_conf(ctx, daemons):
        # Render one logrotate stanza per daemon type from the bundled
        # template and install it on every remote as ceph-test.conf.
        testdir = teuthology.get_testdir(ctx)
        remote_logrotate_conf = '%s/logrotate.ceph-test.conf' % testdir
        rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf')
        with open(rotate_conf_path) as f:
            conf = ""
            for daemon, size in daemons.items():
                log.info('writing logrotate stanza for {}'.format(daemon))
                conf += f.read().format(daemon_type=daemon,
                                        max_size=size)
                # rewind so the template can be re-read for the next daemon
                f.seek(0, 0)

        for remote in ctx.cluster.remotes.keys():
            remote.write_file(remote_logrotate_conf, BytesIO(conf.encode()))
            remote.sh(
                f'sudo mv {remote_logrotate_conf} /etc/logrotate.d/ceph-test.conf && '
                'sudo chmod 0644 /etc/logrotate.d/ceph-test.conf && '
                'sudo chown root.root /etc/logrotate.d/ceph-test.conf')
            remote.chcon('/etc/logrotate.d/ceph-test.conf',
                         'system_u:object_r:etc_t:s0')

    if ctx.config.get('log-rotate'):
        daemons = ctx.config.get('log-rotate')
        log.info('Setting up log rotation with ' + str(daemons))
        write_rotate_conf(ctx, daemons)
        logrotater = Rotater()
        logrotater.begin()
    try:
        yield

    finally:
        if ctx.config.get('log-rotate'):
            log.info('Shutting down logrotate')
            logrotater.end()
            ctx.cluster.sh('sudo rm /etc/logrotate.d/ceph-test.conf')
        # Skip archiving when archive-on-error is set and the run succeeded.
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            # and logs
            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph',
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            try:
                os.makedirs(path)
            except OSError:
                pass
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.shortname)
                try:
                    os.makedirs(sub)
                except OSError:
                    pass
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))
298
299
def assign_devs(roles, devs):
    """
    Pair each role with a scratch device.

    :param roles: List of roles
    :param devs: Corresponding list of devices.
    :returns: Dictionary mapping each role to its device (extra entries in
              the longer list are dropped).
    """
    return {role: dev for role, dev in zip(roles, devs)}
309
310
@contextlib.contextmanager
def valgrind_post(ctx, config):
    """
    After the tests run, look through all the valgrind logs. Exceptions are raised
    if textual errors occurred in the logs, or if valgrind exceptions were detected in
    the logs.

    :param ctx: Context
    :param config: Configuration
    """
    try:
        yield
    finally:
        lookup_procs = list()
        log.info('Checking for errors in any valgrind logs...')
        for remote in ctx.cluster.remotes.keys():
            # look at valgrind logs for each node
            proc = remote.run(
                args="sudo zgrep '<kind>' /var/log/ceph/valgrind/* "
                     # include a second file so that we always get
                     # a filename prefix on the output
                     "/dev/null | sort | uniq",
                wait=False,
                check_status=False,
                stdout=StringIO(),
            )
            lookup_procs.append((proc, remote))

        valgrind_exception = None
        for (proc, remote) in lookup_procs:
            proc.wait()
            out = proc.stdout.getvalue()
            # Each line is "<logfile>:<kind>" thanks to the /dev/null trick.
            for line in out.split('\n'):
                if line == '':
                    continue
                try:
                    (file, kind) = line.split(':')
                except Exception:
                    log.error('failed to split line %s', line)
                    raise
                log.debug('file %s kind %s', file, kind)
                # "Lost" (leak) records from the mds are tolerated.
                if (file.find('mds') >= 0) and kind.find('Lost') > 0:
                    continue
                log.error('saw valgrind issue %s in %s', kind, file)
                # Remember that we saw an issue but keep scanning so every
                # problem gets logged before we raise.
                valgrind_exception = Exception('saw valgrind issues')

        if config.get('expect_valgrind_errors'):
            if not valgrind_exception:
                raise Exception('expected valgrind issues and found none')
        else:
            if valgrind_exception:
                raise valgrind_exception
363
364
@contextlib.contextmanager
def crush_setup(ctx, config):
    """
    Apply the CRUSH tunables profile named by config['crush_tunables']
    (default 'default') through the first monitor, then yield.
    """
    cluster_name = config['cluster']
    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()

    profile = config.get('crush_tunables', 'default')
    log.info('Setting crush tunables to %s', profile)
    cmd = ['sudo', 'ceph', '--cluster', cluster_name,
           'osd', 'crush', 'tunables', profile]
    mon_remote.run(args=cmd)
    yield
377
378
1e59de90
TL
@contextlib.contextmanager
def check_enable_crimson(ctx, config):
    """
    When the job flavor selects crimson, set the cluster's allow-crimson
    flag (via the first mon) so crimson OSDs are permitted to join.
    """
    crimson = is_crimson(config)
    log.info("check_enable_crimson: {}".format(crimson))
    if crimson:
        cluster_name = config['cluster']
        first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
        (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
        log.info('check_enable_crimson: setting set-allow-crimson')
        cmd = [
            'sudo', 'ceph', '--cluster', cluster_name,
            'osd', 'set-allow-crimson', '--yes-i-really-mean-it'
        ]
        mon_remote.run(args=cmd)
    yield
395
396
a4b75251
TL
@contextlib.contextmanager
def setup_manager(ctx, config):
    """
    Build a CephManager bound to the first mon of this cluster and stash
    it in ctx.managers[<cluster>] for other tasks to use.
    """
    cluster = config['cluster']
    first_mon = teuthology.get_first_mon(ctx, config, cluster)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
    if not hasattr(ctx, 'managers'):
        ctx.managers = {}
    ctx.managers[cluster] = CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager.' + cluster),
        cluster=cluster,
    )
    yield
410
224ce89b
WB
@contextlib.contextmanager
def create_rbd_pool(ctx, config):
    """
    Wait for all OSDs to come up, then (unless create_rbd_pool is false)
    create the 'rbd' pool and enable the rbd application on it.
    """
    cluster_name = config['cluster']
    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
    log.info('Waiting for OSDs to come up')
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon_remote,
        ceph_cluster=cluster_name,
    )
    if config.get('create_rbd_pool', True):
        log.info('Creating RBD pool')
        ceph = ['sudo', 'ceph', '--cluster', cluster_name]
        mon_remote.run(args=ceph + ['osd', 'pool', 'create', 'rbd', '8'])
        # Older releases lack "osd pool application enable"; tolerate failure.
        mon_remote.run(
            args=ceph + [
                'osd', 'pool', 'application', 'enable',
                'rbd', 'rbd', '--yes-i-really-mean-it'
            ],
            check_status=False)
    yield
436
7c673cae
FG
@contextlib.contextmanager
def cephfs_setup(ctx, config):
    """
    Create CephFS filesystem(s) when the cluster has mds roles, wait for
    the mds daemons to register as standbys, and destroy the filesystems
    on the way out.

    :param ctx: Context
    :param config: Configuration (keys: 'cluster', optional 'cephfs',
                   optional 'subvols')
    """
    cluster_name = config['cluster']

    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because requires mon cluster to be up and running
    if mdss.remotes:
        log.info('Setting up CephFS filesystem(s)...')
        cephfs_config = config.get('cephfs', {})
        # 'fs' is popped out so cephfs_config holds only the shared settings
        fs_configs = cephfs_config.pop('fs', [{'name': 'cephfs'}])

        # wait for standbys to become available (slow due to valgrind, perhaps)
        mdsc = MDSCluster(ctx)
        mds_count = len(list(teuthology.all_roles_of_type(ctx.cluster, 'mds')))
        with contextutil.safe_while(sleep=2,tries=150) as proceed:
            while proceed():
                if len(mdsc.get_standby_daemons()) >= mds_count:
                    break

        fss = []
        for fs_config in fs_configs:
            assert isinstance(fs_config, dict)
            name = fs_config.pop('name')
            # per-fs settings override the shared cephfs settings
            temp = deepcopy(cephfs_config)
            teuthology.deep_merge(temp, fs_config)
            subvols = config.get('subvols', None)
            if subvols:
                teuthology.deep_merge(temp, {'subvols': subvols})
            fs = Filesystem(ctx, fs_config=temp, name=name, create=True)
            fss.append(fs)

        yield

        # teardown: remove every filesystem we created above
        for fs in fss:
            fs.destroy()
    else:
        yield
7c673cae 477
9f95a23c
TL
@contextlib.contextmanager
def watchdog_setup(ctx, config):
    """
    Start a DaemonWatchdog for this cluster and expose it (plus an empty
    thrashers list) on ctx.ceph[<cluster>].
    """
    cluster_ctx = ctx.ceph[config['cluster']]
    cluster_ctx.thrashers = []
    cluster_ctx.watchdog = DaemonWatchdog(ctx, config, cluster_ctx.thrashers)
    cluster_ctx.watchdog.start()
    yield
7c673cae 484
11fdf7f2
TL
def get_mons(roles, ips, cluster_name,
             mon_bind_msgr2=False,
             mon_bind_addrvec=False):
    """
    Get monitors and their associated addresses
    """
    mons = {}
    # Next free v1 (6789+) and v2 (3300+) port per host IP.
    v1_ports = {}
    v2_ports = {}
    is_mon = teuthology.is_type('mon', cluster_name)
    for idx, host_roles in enumerate(roles):
        for role in host_roles:
            if not is_mon(role):
                continue
            ip = ips[idx]
            if ip not in v1_ports:
                v1_ports[ip] = 6789
            else:
                v1_ports[ip] += 1
            if mon_bind_msgr2:
                if ip not in v2_ports:
                    # first mon on this host: bare IP, default ports
                    v2_ports[ip] = 3300
                    addr = f'{ip}'
                else:
                    assert mon_bind_addrvec
                    v2_ports[ip] += 1
                    addr = (f'[v2:{ip}:{v2_ports[ip]},'
                            f'v1:{ip}:{v1_ports[ip]}]')
            elif mon_bind_addrvec:
                addr = f'[v1:{ip}:{v1_ports[ip]}]'
            else:
                addr = f'{ip}:{v1_ports[ip]}'
            mons[role] = addr
    assert mons
    return mons
528
def skeleton_config(ctx, roles, ips, mons, cluster='ceph'):
    """
    Returns a ConfigObj that is prefilled with a skeleton config.

    Use conf[section][key]=value or conf.merge to change it.

    Use conf.write to write it out, override .filename first if you want.
    """
    template = os.path.join(os.path.dirname(__file__), 'ceph.conf.template')
    conf = configobj.ConfigObj(template, file_error=True)

    # One (empty) section per mon of this cluster; collect their addresses.
    mon_hosts = []
    for role, addr in mons.items():
        role_cluster, _, _ = teuthology.split_role(role)
        if role_cluster != cluster:
            continue
        conf.setdefault(teuthology.ceph_role(role), {})
        mon_hosts.append(addr)
    conf.setdefault('global', {})
    conf['global']['mon host'] = ','.join(mon_hosts)

    # set up standby mds's
    is_mds = teuthology.is_type('mds', cluster)
    for role in (r for host_roles in roles for r in host_roles):
        if is_mds(role):
            conf.setdefault(teuthology.ceph_role(role), {})
    return conf
557
def create_simple_monmap(ctx, remote, conf, mons,
                         path=None,
                         mon_bind_addrvec=False):
    """
    Writes a simple monmap based on current ceph.conf into path, or
    <testdir>/monmap by default.

    Assumes ceph_conf is up to date.

    Assumes mon sections are named "mon.*", with the dot.

    :param ctx: Context
    :param remote: Remote on which monmaptool is run
    :param conf: ConfigObj shipped to the remote as a temporary ceph.conf
    :param mons: mapping of mon role -> address
    :param path: output path for the monmap (default <testdir>/monmap)
    :param mon_bind_addrvec: use --addv / --enable-all-features for addrvecs
    :return the FSID (as a string) of the newly created monmap
    """

    addresses = list(mons.items())
    assert addresses, "There are no monitors in config!"
    log.debug('Ceph mon addresses: %s', addresses)

    try:
        log.debug('writing out conf {c}'.format(c=conf))
    except Exception:
        # Best-effort debug logging only; a bare except here would also
        # swallow SystemExit/KeyboardInterrupt, so catch Exception instead.
        log.debug('my conf logging attempt failed')
    testdir = teuthology.get_testdir(ctx)
    # Ship the conf to the remote so monmaptool can read it.
    tmp_conf_path = '{tdir}/ceph.tmp.conf'.format(tdir=testdir)
    conf_fp = BytesIO()
    conf.write(conf_fp)
    conf_fp.seek(0)
    teuthology.write_file(remote, tmp_conf_path, conf_fp)
    args = [
        'adjust-ulimits',
        'ceph-coverage',
        '{tdir}/archive/coverage'.format(tdir=testdir),
        'monmaptool',
        '-c',
        '{conf}'.format(conf=tmp_conf_path),
        '--create',
        '--clobber',
    ]
    if mon_bind_addrvec:
        args.extend(['--enable-all-features'])
    for (role, addr) in addresses:
        _, _, n = teuthology.split_role(role)
        # addrvec-style addresses (contain a comma, 'v' prefix or port
        # separator) go in via --addv; plain ip:port via --add.
        if mon_bind_addrvec and (',' in addr or 'v' in addr or ':' in addr):
            args.extend(('--addv', n, addr))
        else:
            args.extend(('--add', n, addr))
    if not path:
        path = '{tdir}/monmap'.format(tdir=testdir)
    args.extend([
        '--print',
        path
    ])

    monmap_output = remote.sh(args)
    # monmaptool prints "generated fsid <uuid>"; pull the uuid out.
    fsid = re.search("generated fsid (.+)$",
                     monmap_output, re.MULTILINE).group(1)
    teuthology.delete_file(remote, tmp_conf_path)
    return fsid
616
f67539c2 617
1e59de90
TL
def is_crimson(config):
    """Return True when the job's build flavor selects crimson."""
    flavor = config.get('flavor', 'default')
    return flavor == 'crimson'
620
621
f67539c2 622def maybe_redirect_stderr(config, type_, args, log_path):
1e59de90 623 if type_ == 'osd' and is_crimson(config):
f67539c2
TL
624 # teuthworker uses ubuntu:ubuntu to access the test nodes
625 create_log_cmd = \
626 f'sudo install -b -o ubuntu -g ubuntu /dev/null {log_path}'
627 return create_log_cmd, args + [run.Raw('2>>'), log_path]
628 else:
629 return None, args
630
631
7c673cae
FG
632@contextlib.contextmanager
633def cluster(ctx, config):
634 """
635 Handle the creation and removal of a ceph cluster.
636
637 On startup:
638 Create directories needed for the cluster.
639 Create remote journals for all osds.
640 Create and set keyring.
11fdf7f2 641 Copy the monmap to the test systems.
7c673cae
FG
642 Setup mon nodes.
643 Setup mds nodes.
644 Mkfs osd nodes.
645 Add keyring information to monmaps
646 Mkfs mon nodes.
647
648 On exit:
11fdf7f2 649 If errors occurred, extract a failure message and store in ctx.summary.
7c673cae
FG
650 Unmount all test files and temporary journaling files.
651 Save the monitor information and archive all ceph logs.
652 Cleanup the keyring setup, and remove all monitor map and data files left over.
653
654 :param ctx: Context
655 :param config: Configuration
656 """
657 if ctx.config.get('use_existing_cluster', False) is True:
658 log.info("'use_existing_cluster' is true; skipping cluster creation")
659 yield
660
661 testdir = teuthology.get_testdir(ctx)
662 cluster_name = config['cluster']
663 data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
664 log.info('Creating ceph cluster %s...', cluster_name)
11fdf7f2
TL
665 log.info('config %s', config)
666 log.info('ctx.config %s', ctx.config)
7c673cae
FG
667 run.wait(
668 ctx.cluster.run(
669 args=[
670 'install', '-d', '-m0755', '--',
671 data_dir,
672 ],
673 wait=False,
674 )
675 )
676
677 run.wait(
678 ctx.cluster.run(
679 args=[
680 'sudo',
681 'install', '-d', '-m0777', '--', '/var/run/ceph',
682 ],
683 wait=False,
684 )
685 )
686
687 devs_to_clean = {}
688 remote_to_roles_to_devs = {}
7c673cae 689 osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
9f95a23c 690 for remote, roles_for_host in osds.remotes.items():
7c673cae 691 devs = teuthology.get_scratch_devices(remote)
801d1391
TL
692 roles_to_devs = assign_devs(
693 teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), devs
694 )
695 devs_to_clean[remote] = []
696 log.info('osd dev map: {}'.format(roles_to_devs))
697 assert roles_to_devs, \
698 "remote {} has osd roles, but no osd devices were specified!".format(remote.hostname)
7c673cae 699 remote_to_roles_to_devs[remote] = roles_to_devs
801d1391
TL
700 log.info("remote_to_roles_to_devs: {}".format(remote_to_roles_to_devs))
701 for osd_role, dev_name in remote_to_roles_to_devs.items():
702 assert dev_name, "{} has no associated device!".format(osd_role)
7c673cae
FG
703
704 log.info('Generating config...')
705 remotes_and_roles = ctx.cluster.remotes.items()
706 roles = [role_list for (remote, role_list) in remotes_and_roles]
707 ips = [host for (host, port) in
708 (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
11fdf7f2
TL
709 mons = get_mons(
710 roles, ips, cluster_name,
711 mon_bind_msgr2=config.get('mon_bind_msgr2'),
712 mon_bind_addrvec=config.get('mon_bind_addrvec'),
713 )
714 conf = skeleton_config(
715 ctx, roles=roles, ips=ips, mons=mons, cluster=cluster_name,
716 )
9f95a23c
TL
717 for section, keys in config['conf'].items():
718 for key, value in keys.items():
7c673cae
FG
719 log.info("[%s] %s = %s" % (section, key, value))
720 if section not in conf:
721 conf[section] = {}
722 conf[section][key] = value
723
7c673cae
FG
724 if not hasattr(ctx, 'ceph'):
725 ctx.ceph = {}
726 ctx.ceph[cluster_name] = argparse.Namespace()
727 ctx.ceph[cluster_name].conf = conf
11fdf7f2 728 ctx.ceph[cluster_name].mons = mons
7c673cae
FG
729
730 default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
731 keyring_path = config.get('keyring_path', default_keyring)
732
733 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
734
735 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
736
737 log.info('Setting up %s...' % firstmon)
738 ctx.cluster.only(firstmon).run(
739 args=[
740 'sudo',
741 'adjust-ulimits',
742 'ceph-coverage',
743 coverage_dir,
744 'ceph-authtool',
745 '--create-keyring',
746 keyring_path,
747 ],
748 )
749 ctx.cluster.only(firstmon).run(
750 args=[
751 'sudo',
752 'adjust-ulimits',
753 'ceph-coverage',
754 coverage_dir,
755 'ceph-authtool',
756 '--gen-key',
757 '--name=mon.',
758 keyring_path,
759 ],
760 )
761 ctx.cluster.only(firstmon).run(
762 args=[
763 'sudo',
764 'chmod',
765 '0644',
766 keyring_path,
767 ],
768 )
769 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
770 monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
771 cluster=cluster_name)
11fdf7f2 772 fsid = create_simple_monmap(
7c673cae
FG
773 ctx,
774 remote=mon0_remote,
775 conf=conf,
11fdf7f2 776 mons=mons,
7c673cae 777 path=monmap_path,
11fdf7f2 778 mon_bind_addrvec=config.get('mon_bind_addrvec'),
7c673cae 779 )
f67539c2 780 ctx.ceph[cluster_name].fsid = fsid
7c673cae
FG
781 if not 'global' in conf:
782 conf['global'] = {}
783 conf['global']['fsid'] = fsid
784
785 default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
786 conf_path = config.get('conf_path', default_conf_path)
787 log.info('Writing %s for FSID %s...' % (conf_path, fsid))
788 write_conf(ctx, conf_path, cluster_name)
789
790 log.info('Creating admin key on %s...' % firstmon)
791 ctx.cluster.only(firstmon).run(
792 args=[
793 'sudo',
794 'adjust-ulimits',
795 'ceph-coverage',
796 coverage_dir,
797 'ceph-authtool',
798 '--gen-key',
799 '--name=client.admin',
7c673cae
FG
800 '--cap', 'mon', 'allow *',
801 '--cap', 'osd', 'allow *',
802 '--cap', 'mds', 'allow *',
803 '--cap', 'mgr', 'allow *',
804 keyring_path,
805 ],
806 )
807
808 log.info('Copying monmap to all nodes...')
f67539c2
TL
809 keyring = mon0_remote.read_file(keyring_path)
810 monmap = mon0_remote.read_file(monmap_path)
7c673cae 811
9f95a23c 812 for rem in ctx.cluster.remotes.keys():
7c673cae
FG
813 # copy mon key and initial monmap
814 log.info('Sending monmap to node {remote}'.format(remote=rem))
f67539c2
TL
815 rem.write_file(keyring_path, keyring, mode='0644', sudo=True)
816 rem.write_file(monmap_path, monmap)
7c673cae
FG
817
818 log.info('Setting up mon nodes...')
819 mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))
7c673cae
FG
820
821 if not config.get('skip_mgr_daemons', False):
822 log.info('Setting up mgr nodes...')
823 mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name))
9f95a23c 824 for remote, roles_for_host in mgrs.remotes.items():
7c673cae
FG
825 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr',
826 cluster_name):
827 _, _, id_ = teuthology.split_role(role)
11fdf7f2
TL
828 mgr_dir = DATA_PATH.format(
829 type_='mgr', cluster=cluster_name, id_=id_)
7c673cae
FG
830 remote.run(
831 args=[
832 'sudo',
833 'mkdir',
834 '-p',
835 mgr_dir,
836 run.Raw('&&'),
837 'sudo',
838 'adjust-ulimits',
839 'ceph-coverage',
840 coverage_dir,
841 'ceph-authtool',
842 '--create-keyring',
843 '--gen-key',
844 '--name=mgr.{id}'.format(id=id_),
845 mgr_dir + '/keyring',
846 ],
847 )
848
849 log.info('Setting up mds nodes...')
850 mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
9f95a23c 851 for remote, roles_for_host in mdss.remotes.items():
7c673cae
FG
852 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
853 cluster_name):
854 _, _, id_ = teuthology.split_role(role)
11fdf7f2
TL
855 mds_dir = DATA_PATH.format(
856 type_='mds', cluster=cluster_name, id_=id_)
7c673cae
FG
857 remote.run(
858 args=[
859 'sudo',
860 'mkdir',
861 '-p',
862 mds_dir,
863 run.Raw('&&'),
864 'sudo',
865 'adjust-ulimits',
866 'ceph-coverage',
867 coverage_dir,
868 'ceph-authtool',
869 '--create-keyring',
870 '--gen-key',
871 '--name=mds.{id}'.format(id=id_),
872 mds_dir + '/keyring',
873 ],
874 )
11fdf7f2
TL
875 remote.run(args=[
876 'sudo', 'chown', '-R', 'ceph:ceph', mds_dir
877 ])
7c673cae
FG
878
879 cclient.create_keyring(ctx, cluster_name)
880 log.info('Running mkfs on osd nodes...')
881
882 if not hasattr(ctx, 'disk_config'):
883 ctx.disk_config = argparse.Namespace()
884 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
885 ctx.disk_config.remote_to_roles_to_dev = {}
7c673cae
FG
886 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
887 ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
888 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
889 ctx.disk_config.remote_to_roles_to_dev_fstype = {}
890
891 teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)
7c673cae
FG
892
893 log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
1e59de90 894
9f95a23c 895 for remote, roles_for_host in osds.remotes.items():
7c673cae 896 roles_to_devs = remote_to_roles_to_devs[remote]
7c673cae
FG
897
898 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
899 _, _, id_ = teuthology.split_role(role)
11fdf7f2
TL
900 mnt_point = DATA_PATH.format(
901 type_='osd', cluster=cluster_name, id_=id_)
7c673cae
FG
902 remote.run(
903 args=[
904 'sudo',
905 'mkdir',
906 '-p',
907 mnt_point,
908 ])
801d1391
TL
909 log.info('roles_to_devs: {}'.format(roles_to_devs))
910 log.info('role: {}'.format(role))
7c673cae
FG
911 if roles_to_devs.get(role):
912 dev = roles_to_devs[role]
913 fs = config.get('fs')
914 package = None
915 mkfs_options = config.get('mkfs_options')
916 mount_options = config.get('mount_options')
917 if fs == 'btrfs':
918 # package = 'btrfs-tools'
919 if mount_options is None:
920 mount_options = ['noatime', 'user_subvol_rm_allowed']
921 if mkfs_options is None:
922 mkfs_options = ['-m', 'single',
923 '-l', '32768',
924 '-n', '32768']
925 if fs == 'xfs':
926 # package = 'xfsprogs'
927 if mount_options is None:
928 mount_options = ['noatime']
929 if mkfs_options is None:
930 mkfs_options = ['-f', '-i', 'size=2048']
931 if fs == 'ext4' or fs == 'ext3':
932 if mount_options is None:
933 mount_options = ['noatime', 'user_xattr']
934
935 if mount_options is None:
936 mount_options = []
937 if mkfs_options is None:
938 mkfs_options = []
939 mkfs = ['mkfs.%s' % fs] + mkfs_options
940 log.info('%s on %s on %s' % (mkfs, dev, remote))
941 if package is not None:
9f95a23c 942 remote.sh('sudo apt-get install -y %s' % package)
7c673cae
FG
943
944 try:
945 remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
946 except run.CommandFailedError:
947 # Newer btfs-tools doesn't prompt for overwrite, use -f
948 if '-f' not in mount_options:
949 mkfs_options.append('-f')
950 mkfs = ['mkfs.%s' % fs] + mkfs_options
951 log.info('%s on %s on %s' % (mkfs, dev, remote))
952 remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
953
954 log.info('mount %s on %s -o %s' % (dev, remote,
955 ','.join(mount_options)))
956 remote.run(
957 args=[
958 'sudo',
959 'mount',
960 '-t', fs,
961 '-o', ','.join(mount_options),
962 dev,
963 mnt_point,
964 ]
965 )
966 remote.run(
967 args=[
968 'sudo', '/sbin/restorecon', mnt_point,
969 ],
970 check_status=False,
971 )
972 if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
973 ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
974 ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
975 if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
976 ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
977 ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
978 devs_to_clean[remote].append(mnt_point)
979
980 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
981 _, _, id_ = teuthology.split_role(role)
11fdf7f2 982 try:
f67539c2 983 args = ['sudo',
11fdf7f2
TL
984 'MALLOC_CHECK_=3',
985 'adjust-ulimits',
f67539c2 986 'ceph-coverage', coverage_dir,
11fdf7f2
TL
987 'ceph-osd',
988 '--no-mon-config',
f67539c2 989 '--cluster', cluster_name,
11fdf7f2
TL
990 '--mkfs',
991 '--mkkey',
992 '-i', id_,
f67539c2
TL
993 '--monmap', monmap_path]
994 log_path = f'/var/log/ceph/{cluster_name}-osd.{id_}.log'
995 create_log_cmd, args = \
996 maybe_redirect_stderr(config, 'osd', args, log_path)
997 if create_log_cmd:
998 remote.sh(create_log_cmd)
999 remote.run(args=args)
11fdf7f2
TL
1000 except run.CommandFailedError:
1001 # try without --no-mon-config.. this may be an upgrade test
1002 remote.run(
1003 args=[
1004 'sudo',
1005 'MALLOC_CHECK_=3',
1006 'adjust-ulimits',
1007 'ceph-coverage',
1008 coverage_dir,
1009 'ceph-osd',
1010 '--cluster',
1011 cluster_name,
1012 '--mkfs',
1013 '--mkkey',
1014 '-i', id_,
1015 '--monmap', monmap_path,
1016 ],
1017 )
1018 mnt_point = DATA_PATH.format(
1019 type_='osd', cluster=cluster_name, id_=id_)
f91f0fd5
TL
1020 remote.run(args=[
1021 'sudo', 'chown', '-R', 'ceph:ceph', mnt_point
1022 ])
7c673cae
FG
1023
1024 log.info('Reading keys from all nodes...')
9f95a23c 1025 keys_fp = BytesIO()
7c673cae 1026 keys = []
9f95a23c 1027 for remote, roles_for_host in ctx.cluster.remotes.items():
7c673cae
FG
1028 for type_ in ['mgr', 'mds', 'osd']:
1029 if type_ == 'mgr' and config.get('skip_mgr_daemons', False):
1030 continue
1031 for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
1032 _, _, id_ = teuthology.split_role(role)
f67539c2
TL
1033 data = remote.read_file(
1034 os.path.join(
11fdf7f2
TL
1035 DATA_PATH.format(
1036 type_=type_, id_=id_, cluster=cluster_name),
1037 'keyring',
7c673cae
FG
1038 ),
1039 sudo=True,
1040 )
1041 keys.append((type_, id_, data))
1042 keys_fp.write(data)
9f95a23c 1043 for remote, roles_for_host in ctx.cluster.remotes.items():
7c673cae
FG
1044 for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', cluster_name):
1045 _, _, id_ = teuthology.split_role(role)
f67539c2
TL
1046 data = remote.read_file(
1047 '/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_, cluster=cluster_name)
7c673cae
FG
1048 )
1049 keys.append(('client', id_, data))
1050 keys_fp.write(data)
1051
1052 log.info('Adding keys to all mons...')
1053 writes = mons.run(
1054 args=[
1055 'sudo', 'tee', '-a',
1056 keyring_path,
1057 ],
1058 stdin=run.PIPE,
1059 wait=False,
9f95a23c 1060 stdout=BytesIO(),
7c673cae
FG
1061 )
1062 keys_fp.seek(0)
1063 teuthology.feed_many_stdins_and_close(keys_fp, writes)
1064 run.wait(writes)
1065 for type_, id_, data in keys:
1066 run.wait(
1067 mons.run(
1068 args=[
1069 'sudo',
1070 'adjust-ulimits',
1071 'ceph-coverage',
1072 coverage_dir,
1073 'ceph-authtool',
1074 keyring_path,
1075 '--name={type}.{id}'.format(
1076 type=type_,
1077 id=id_,
1078 ),
1079 ] + list(generate_caps(type_)),
1080 wait=False,
1081 ),
1082 )
1083
1084 log.info('Running mkfs on mon nodes...')
9f95a23c 1085 for remote, roles_for_host in mons.remotes.items():
7c673cae
FG
1086 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon', cluster_name):
1087 _, _, id_ = teuthology.split_role(role)
11fdf7f2
TL
1088 mnt_point = DATA_PATH.format(
1089 type_='mon', id_=id_, cluster=cluster_name)
7c673cae
FG
1090 remote.run(
1091 args=[
1092 'sudo',
1093 'mkdir',
1094 '-p',
11fdf7f2 1095 mnt_point,
7c673cae
FG
1096 ],
1097 )
1098 remote.run(
1099 args=[
1100 'sudo',
1101 'adjust-ulimits',
1102 'ceph-coverage',
1103 coverage_dir,
1104 'ceph-mon',
1105 '--cluster', cluster_name,
1106 '--mkfs',
1107 '-i', id_,
1108 '--monmap', monmap_path,
7c673cae
FG
1109 '--keyring', keyring_path,
1110 ],
1111 )
f91f0fd5
TL
1112 remote.run(args=[
1113 'sudo', 'chown', '-R', 'ceph:ceph', mnt_point
1114 ])
7c673cae
FG
1115
1116 run.wait(
1117 mons.run(
1118 args=[
1119 'rm',
1120 '--',
1121 monmap_path,
7c673cae
FG
1122 ],
1123 wait=False,
1124 ),
1125 )
1126
1127 try:
1128 yield
1129 except Exception:
1130 # we need to know this below
1131 ctx.summary['success'] = False
1132 raise
1133 finally:
1134 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1135
1136 log.info('Checking cluster log for badness...')
1137
1138 def first_in_ceph_log(pattern, excludes):
1139 """
11fdf7f2 1140 Find the first occurrence of the pattern specified in the Ceph log,
7c673cae
FG
1141 Returns None if none found.
1142
1143 :param pattern: Pattern scanned for.
1144 :param excludes: Patterns to ignore.
1145 :return: First line of text (or None if not found)
1146 """
1147 args = [
1148 'sudo',
1149 'egrep', pattern,
1150 '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
1151 ]
1152 for exclude in excludes:
1153 args.extend([run.Raw('|'), 'egrep', '-v', exclude])
1154 args.extend([
1155 run.Raw('|'), 'head', '-n', '1',
1156 ])
9f95a23c
TL
1157 stdout = mon0_remote.sh(args)
1158 return stdout or None
7c673cae
FG
1159
1160 if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
cd265ab1 1161 config['log_ignorelist']) is not None:
7c673cae
FG
1162 log.warning('Found errors (ERR|WRN|SEC) in cluster log')
1163 ctx.summary['success'] = False
1164 # use the most severe problem as the failure reason
1165 if 'failure_reason' not in ctx.summary:
1166 for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
cd265ab1 1167 match = first_in_ceph_log(pattern, config['log_ignorelist'])
7c673cae
FG
1168 if match is not None:
1169 ctx.summary['failure_reason'] = \
1170 '"{match}" in cluster log'.format(
1171 match=match.rstrip('\n'),
1172 )
1173 break
1174
9f95a23c 1175 for remote, dirs in devs_to_clean.items():
7c673cae
FG
1176 for dir_ in dirs:
1177 log.info('Unmounting %s on %s' % (dir_, remote))
1178 try:
1179 remote.run(
1180 args=[
1181 'sync',
1182 run.Raw('&&'),
1183 'sudo',
1184 'umount',
1185 '-f',
1186 dir_
1187 ]
1188 )
1189 except Exception as e:
1190 remote.run(args=[
1191 'sudo',
1192 run.Raw('PATH=/usr/sbin:$PATH'),
1193 'lsof',
1194 run.Raw(';'),
1195 'ps', 'auxf',
1196 ])
1197 raise e
1198
7c673cae
FG
1199 if ctx.archive is not None and \
1200 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
1201
1202 # archive mon data, too
1203 log.info('Archiving mon data...')
1204 path = os.path.join(ctx.archive, 'data')
1205 try:
1206 os.makedirs(path)
1207 except OSError as e:
1208 if e.errno == errno.EEXIST:
1209 pass
1210 else:
1211 raise
9f95a23c 1212 for remote, roles in mons.remotes.items():
7c673cae
FG
1213 for role in roles:
1214 is_mon = teuthology.is_type('mon', cluster_name)
1215 if is_mon(role):
1216 _, _, id_ = teuthology.split_role(role)
11fdf7f2
TL
1217 mon_dir = DATA_PATH.format(
1218 type_='mon', id_=id_, cluster=cluster_name)
7c673cae
FG
1219 teuthology.pull_directory_tarball(
1220 remote,
1221 mon_dir,
1222 path + '/' + role + '.tgz')
1223
1224 log.info('Cleaning ceph cluster...')
1225 run.wait(
1226 ctx.cluster.run(
1227 args=[
1228 'sudo',
1229 'rm',
1230 '-rf',
1231 '--',
1232 conf_path,
1233 keyring_path,
1234 data_dir,
1235 monmap_path,
7c673cae
FG
1236 run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
1237 ],
1238 wait=False,
1239 ),
1240 )
1241
1242
def osd_scrub_pgs(ctx, config):
    """
    Scrub pgs when we exit.

    First make sure all pgs are active and clean.
    Next scrub all osds.
    Then periodically check until all pgs have scrub time stamps that
    indicate the last scrub completed.  Time out if no progress is made
    here after two minutes.

    :param ctx: Context (provides ctx.managers and ctx.cluster)
    :param config: Configuration; must carry the 'cluster' name
    :raises RuntimeError: if pgs never go active+clean, or scrubs stall
    """
    # Up to retries polls, delays seconds apart, for both wait loops below.
    retries = 40
    delays = 20
    cluster_name = config['cluster']
    manager = ctx.managers[cluster_name]
    # Phase 1: wait until every pg is active+clean and no pool is still
    # splitting/merging; otherwise scrub stamps would keep moving under us.
    for _ in range(retries):
        stats = manager.get_pg_stats()
        unclean = [stat['pgid'] for stat in stats if 'active+clean' not in stat['state']]
        split_merge = []
        osd_dump = manager.get_osd_dump_json()
        try:
            split_merge = [i['pool_name'] for i in osd_dump['pools'] if i['pg_num'] != i['pg_num_target']]
        except KeyError:
            # we don't support pg_num_target before nautilus
            pass
        if not unclean and not split_merge:
            break
        waiting_on = []
        if unclean:
            waiting_on.append(f'{unclean} to go clean')
        if split_merge:
            waiting_on.append(f'{split_merge} to split/merge')
        waiting_on = ' and '.join(waiting_on)
        log.info('Waiting for all PGs to be active+clean and split+merged, waiting on %s', waiting_on)
        time.sleep(delays)
    else:
        # for/else: we exhausted retries without hitting the break above.
        raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
    # Any scrub stamp newer than this instant counts as "scrubbed by us".
    check_time_now = time.localtime()
    time.sleep(1)
    # Phase 2: kick off a deep scrub on every osd of this cluster.
    all_roles = teuthology.all_roles(ctx.cluster)
    for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name):
        log.info("Scrubbing {osd}".format(osd=role))
        _, _, id_ = teuthology.split_role(role)
        # allow this to fail; in certain cases the OSD might not be up
        # at this point.  we will catch all pgs below.
        try:
            manager.raw_cluster_cmd('tell', 'osd.' + id_, 'config', 'set',
                                    'osd_debug_deep_scrub_sleep', '0');
            manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
        except run.CommandFailedError:
            pass
    # Phase 3: poll until every pg's last_scrub_stamp is newer than
    # check_time_now.  prev_good/gap_cnt track progress: gap_cnt counts
    # consecutive polls with no newly-scrubbed pgs.
    prev_good = 0
    gap_cnt = 0
    loop = True
    while loop:
        stats = manager.get_pg_stats()
        timez = [(stat['pgid'],stat['last_scrub_stamp']) for stat in stats]
        loop = False
        thiscnt = 0
        re_scrub = []
        for (pgid, tmval) in timez:
            # Stamp looks like 'YYYY-MM-DD HH:MM:SS.ffffff'; drop the
            # fractional part and parse so it compares against localtime().
            t = tmval[0:tmval.find('.')].replace(' ', 'T')
            pgtm = time.strptime(t, '%Y-%m-%dT%H:%M:%S')
            if pgtm > check_time_now:
                thiscnt += 1
            else:
                log.info('pgid %s last_scrub_stamp %s %s <= %s', pgid, tmval, pgtm, check_time_now)
                loop = True
                re_scrub.append(pgid)
        if thiscnt > prev_good:
            prev_good = thiscnt
            gap_cnt = 0
        else:
            gap_cnt += 1
            if gap_cnt % 6 == 0:
                for pgid in re_scrub:
                    # re-request scrub every so often in case the earlier
                    # request was missed.  do not do it every time because
                    # the scrub may be in progress or not reported yet and
                    # we will starve progress.
                    manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
            if gap_cnt > retries:
                raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
        if loop:
            log.info('Still waiting for all pgs to be scrubbed.')
            time.sleep(delays)
1328
1329
@contextlib.contextmanager
def run_daemon(ctx, config, type_):
    """
    Run daemons for a role type.  Handle the startup and termination of a daemon.
    On startup -- set coverages, cpu_profile, valgrind values for all remotes,
    and a max_mds value for one mds.
    On cleanup -- Stop all existing daemons of this type.

    :param ctx: Context
    :param config: Configuration
    :param type_: Role type ('mon', 'mgr', 'osd', 'mds', ...)
    """
    cluster_name = config['cluster']
    log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
    testdir = teuthology.get_testdir(ctx)
    daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name))

    # check whether any daemons if this type are configured
    if daemons is None:
        return
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    # Under coverage/valgrind the daemon must exit cleanly to flush its
    # data, so stop it with SIGTERM instead of SIGKILL.
    daemon_signal = 'kill'
    if config.get('coverage') or config.get('valgrind') is not None:
        daemon_signal = 'term'

    # create osds in order. (this only matters for pre-luminous, which might
    # be jewel/hammer, which doesn't take an id_ argument to legacy 'osd create').
    osd_uuids = {}
    for remote, roles_for_host in daemons.remotes.items():
        is_type_ = teuthology.is_type(type_, cluster_name)
        for role in roles_for_host:
            if not is_type_(role):
                continue
            _, _, id_ = teuthology.split_role(role)

            if type_ == 'osd':
                datadir='/var/lib/ceph/osd/{cluster}-{id}'.format(
                    cluster=cluster_name, id=id_)
                osd_uuid = remote.read_file(
                    datadir + '/fsid', sudo=True).decode().strip()
                osd_uuids[id_] = osd_uuid
    # NOTE(review): this loop reuses `remote` left over from the loop above,
    # i.e. all 'osd new' commands run on the last host visited -- presumably
    # fine because the command talks to the cluster, not the local osd;
    # confirm before changing.
    for osd_id in range(len(osd_uuids)):
        id_ = str(osd_id)
        osd_uuid = osd_uuids.get(id_)
        try:
            remote.run(
                args=[
                    'sudo', 'ceph', '--cluster', cluster_name,
                    'osd', 'new', osd_uuid, id_,
                ]
            )
        except:
            # fallback to pre-luminous (jewel)
            remote.run(
                args=[
                    'sudo', 'ceph', '--cluster', cluster_name,
                    'osd', 'create', osd_uuid,
                ]
            )
        if config.get('add_osds_to_crush'):
            remote.run(
                args=[
                    'sudo', 'ceph', '--cluster', cluster_name,
                    'osd', 'crush', 'create-or-move', 'osd.' + id_,
                    '1.0', 'host=localhost', 'root=default',
                ]
            )

    # Build and register the daemon command line for every matching role.
    for remote, roles_for_host in daemons.remotes.items():
        is_type_ = teuthology.is_type(type_, cluster_name)
        for role in roles_for_host:
            if not is_type_(role):
                continue
            _, _, id_ = teuthology.split_role(role)

            # Wrapper prefix: ulimits, coverage collection, and
            # daemon-helper which forwards daemon_signal on teardown.
            run_cmd = [
                'sudo',
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'daemon-helper',
                daemon_signal,
            ]
            # The actual daemon invocation, appended after any
            # cpu-profile/valgrind wrappers below.
            run_cmd_tail = [
                'ceph-%s' % (type_),
                '-f',
                '--cluster', cluster_name,
                '-i', id_]

            if type_ in config.get('cpu_profile', []):
                profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role)
                run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])

            vc = config.get('valgrind')
            if vc is not None:
                # A role-specific valgrind entry overrides the type-wide one.
                valgrind_args = None
                if type_ in vc:
                    valgrind_args = vc[type_]
                if role in vc:
                    valgrind_args = vc[role]
                exit_on_first_error = vc.get('exit_on_first_error', True)
                run_cmd = get_valgrind_args(testdir, role, run_cmd, valgrind_args,
                                            exit_on_first_error=exit_on_first_error)

            run_cmd.extend(run_cmd_tail)
            log_path = f'/var/log/ceph/{cluster_name}-{type_}.{id_}.log'
            create_log_cmd, run_cmd = \
                maybe_redirect_stderr(config, type_, run_cmd, log_path)
            if create_log_cmd:
                remote.sh(create_log_cmd)
            # always register mgr; don't necessarily start
            ctx.daemons.register_daemon(
                remote, type_, id_,
                cluster=cluster_name,
                args=run_cmd,
                logger=log.getChild(role),
                stdin=run.PIPE,
                wait=False
            )
            if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
                # NOTE(review): `role` is reassigned here but not read again
                # in this iteration -- looks vestigial; confirm before removal.
                role = cluster_name + '.' + type_
                ctx.daemons.get_daemon(type_, id_, cluster_name).restart()

    # kludge: run any pre-manager commands
    if type_ == 'mon':
        for cmd in config.get('pre-mgr-commands', []):
            firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
            (remote,) = ctx.cluster.only(firstmon).remotes.keys()
            remote.run(args=cmd.split(' '))

    try:
        yield
    finally:
        # Tear down every daemon of this type when the context exits.
        teuthology.stop_daemons_of_type(ctx, type_, cluster_name)
1466
1467
def healthy(ctx, config):
    """
    Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK.

    :param ctx: Context
    :param config: Configuration (a dict; any non-dict value is treated as {})
    """
    config = config if isinstance(config, dict) else dict()
    cluster_name = config.get('cluster', 'ceph')
    log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
    manager = ctx.managers[cluster_name]
    try:
        manager.wait_for_mgr_available(timeout=30)
    except (run.CommandFailedError, AssertionError) as e:
        # A mgr may legitimately be absent/old in upgrade tests.
        log.info('ignoring mgr wait error, probably testing upgrade: %s', e)

    manager.wait_for_all_osds_up(timeout=300)

    try:
        manager.flush_all_pg_stats()
    except Exception as e:
        # fix: was 'except (run.CommandFailedError, Exception)' -- the tuple
        # was redundant since Exception already covers CommandFailedError.
        # Flushing pg stats is best-effort (may fail during upgrades).
        log.info('ignoring flush pg stats error, probably testing upgrade: %s', e)
    manager.wait_for_clean()

    if config.get('wait-for-healthy', True):
        log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
        manager.wait_until_healthy(timeout=300)

    if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
        # Some MDSs exist, wait for them to be healthy
        for fs in Filesystem.get_all_fs(ctx):
            fs.wait_for_daemons(timeout=300)
7c673cae 1500
7c673cae
FG
def wait_for_mon_quorum(ctx, config):
    """
    Poll remote ceph status until all requested monitors are in quorum.

    :param ctx: Context
    :param config: a list of monitor names, or a dict carrying a 'daemons'
                   list plus an optional 'cluster' name (default 'ceph')
    """
    if isinstance(config, dict):
        mons = config['daemons']
        cluster_name = config.get('cluster', 'ceph')
    else:
        assert isinstance(config, list)
        mons = config
        cluster_name = 'ceph'
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (remote,) = ctx.cluster.only(firstmon).remotes.keys()
    expected = sorted(mons)
    with contextutil.safe_while(sleep=10, tries=60,
                                action='wait for monitor quorum') as proceed:
        while proceed():
            # Ask the first mon's host for the current quorum membership.
            status_text = remote.sh('sudo ceph quorum_status',
                                    logger=log.getChild('quorum_status'))
            quorum = json.loads(status_text).get('quorum_names', [])
            log.debug('Quorum: %s', quorum)
            if sorted(quorum) == expected:
                break
1527
1528
def created_pool(ctx, config):
    """
    Record newly created pools in the ceph-manager's pool dictionary,
    fetching each unknown pool's pg_num from the cluster.
    """
    manager = ctx.managers['ceph']
    for new_pool in config:
        if new_pool in manager.pools:
            continue
        manager.pools[new_pool] = manager.get_pool_int_property(
            new_pool, 'pg_num')
1538
1539
@contextlib.contextmanager
def suppress_mon_health_to_clog(ctx, config):
    """
    Set mon_health_to_clog to 'false' while the nested block runs, then
    restore the option to its original (unset) value.

    Note, due to the way how tasks are executed/nested, it's not suggested to
    use this method as a standalone task. otherwise, it's likely that it will
    restore the tweaked option at the /end/ of 'tasks' block.

    :param ctx: Context
    :param config: Configuration; honors 'mon-health-to-clog' and 'cluster'
    """
    if config.get('mon-health-to-clog', 'true') == 'false':
        cluster = config.get('cluster', 'ceph')
        manager = ctx.managers[cluster]
        manager.raw_cluster_command(
            'config', 'set', 'mon', 'mon_health_to_clog', 'false'
        )
        try:
            yield
        finally:
            # fix: restore the option even when the nested tasks raise;
            # previously an exception skipped the 'config rm' and left
            # mon_health_to_clog suppressed for the rest of the run.
            manager.raw_cluster_command(
                'config', 'rm', 'mon', 'mon_health_to_clog'
            )
    else:
        yield
11fdf7f2 1561
7c673cae
FG
@contextlib.contextmanager
def restart(ctx, config):
    """
    restart ceph daemons

    For example::
       tasks:
       - ceph.restart: [all]

    For example::
       tasks:
       - ceph.restart: [osd.0, mon.1, mds.*]

    or::

       tasks:
       - ceph.restart:
           daemons: [osd.0, mon.1]
           wait-for-healthy: false
           wait-for-osds-up: true

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        # Bare list form: treat it as the daemons to restart.
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    clusters = set()

    # Quiet mon health clog chatter while daemons bounce (see
    # suppress_mon_health_to_clog for the 'mon-health-to-clog' knob).
    with suppress_mon_health_to_clog(ctx, config):
        for role in daemons:
            cluster, type_, id_ = teuthology.split_role(role)
            # Stop first; osds are also marked down so peering starts
            # promptly instead of waiting for a failure timeout.
            ctx.daemons.get_daemon(type_, id_, cluster).stop()
            if type_ == 'osd':
                ctx.managers[cluster].mark_down_osd(id_)
            ctx.daemons.get_daemon(type_, id_, cluster).restart()
            clusters.add(cluster)

    if config.get('wait-for-healthy', True):
        for cluster in clusters:
            healthy(ctx=ctx, config=dict(cluster=cluster))
    if config.get('wait-for-osds-up', False):
        for cluster in clusters:
            ctx.managers[cluster].wait_for_all_osds_up()
    if config.get('expected-failure') is not None:
        # Verify each restarted osd logged the expected failure string;
        # remote.run raises if egrep finds no match.
        log.info('Checking for expected-failure in osds logs after restart...')
        expected_fail = config.get('expected-failure')
        # NOTE(review): is_type is called without a cluster name here,
        # unlike elsewhere in this file -- presumably matches any cluster's
        # osd roles; confirm against teuthology.is_type.
        is_osd = teuthology.is_type('osd')
        for role in daemons:
            if not is_osd(role):
                continue
            (remote,) = ctx.cluster.only(role).remotes.keys()
            cluster, type_, id_ = teuthology.split_role(role)
            remote.run(
                args = ['sudo',
                        'egrep', expected_fail,
                        '/var/log/ceph/{cluster}-{type_}.{id_}.log'.format(cluster=cluster, type_=type_, id_=id_),
                ])
    yield
1624
1625
@contextlib.contextmanager
def stop(ctx, config):
    """
    Stop ceph daemons

    For example::
      tasks:
      - ceph.stop: [mds.*]

      tasks:
      - ceph.stop: [osd.0, osd.2]

      tasks:
      - ceph.stop:
          daemons: [osd.0, osd.2]

    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        # A bare list is shorthand for the 'daemons' key.
        config = {'daemons': config}

    resolved = ctx.daemons.resolve_role_list(
        config.get('daemons', None), CEPH_ROLE_TYPES, True)

    # Stop every resolved daemon, remembering which clusters were touched.
    touched_clusters = set()
    for role in resolved:
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).stop()
        touched_clusters.add(cluster)

    # Shut down the watchdog of each affected cluster so it does not
    # treat the deliberate stops as failures, and wait for it to exit.
    for cluster in touched_clusters:
        watchdog = ctx.ceph[cluster].watchdog
        watchdog.stop()
        watchdog.join()

    yield
1662
1663
@contextlib.contextmanager
def wait_for_failure(ctx, config):
    """
    Wait for a failure of a ceph daemon

    For example::
      tasks:
      - ceph.wait_for_failure: [mds.*]

      tasks:
      - ceph.wait_for_failure: [osd.0, osd.2]

      tasks:
      - ceph.wait_for_failure:
          daemons: [osd.0, osd.2]

    :param ctx: Context
    :param config: list of daemon roles, or a dict with a 'daemons' list
    :raises RuntimeError: if any named daemon exits cleanly instead of failing
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        try:
            # wait() raises when the daemon exits with an error.
            ctx.daemons.get_daemon(type_, id_, cluster).wait()
        except Exception:
            # fix: narrowed from a bare 'except:' so KeyboardInterrupt /
            # SystemExit are no longer swallowed as an "expected" daemon
            # failure; also dropped the redundant trailing 'pass'.
            log.info('Saw expected daemon failure. Continuing.')
        else:
            raise RuntimeError('daemon %s did not fail' % role)

    yield
1698
1699
def validate_config(ctx, config):
    """
    Perform some simple validation on task configuration.
    Raises exceptions.ConfigError if an error is found.
    """
    # Reject layouts that place osds from different clusters on one host;
    # osds from separate clusters must live on separate hosts.
    for remote, roles_for_host in ctx.cluster.remotes.items():
        prev_cluster = None
        prev_osd_role = None
        for role in roles_for_host:
            role_cluster, role_type, _ = teuthology.split_role(role)
            if role_type != 'osd':
                continue
            if prev_cluster and prev_cluster != role_cluster:
                msg = "Host should not have osds (%s and %s) from multiple clusters" % (
                    prev_osd_role, role)
                raise exceptions.ConfigError(msg)
            prev_cluster = role_cluster
            prev_osd_role = role
1719
1720
1721@contextlib.contextmanager
1722def task(ctx, config):
1723 """
1724 Set up and tear down a Ceph cluster.
1725
1726 For example::
1727
1728 tasks:
1729 - ceph:
1730 - interactive:
1731
1732 You can also specify what branch to run::
1733
1734 tasks:
1735 - ceph:
1736 branch: foo
1737
1738 Or a tag::
1739
1740 tasks:
1741 - ceph:
1742 tag: v0.42.13
1743
1744 Or a sha1::
1745
1746 tasks:
1747 - ceph:
1748 sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed
1749
1750 Or a local source dir::
1751
1752 tasks:
1753 - ceph:
1754 path: /home/sage/ceph
1755
1756 To capture code coverage data, use::
1757
1758 tasks:
1759 - ceph:
1760 coverage: true
1761
1762 To use btrfs, ext4, or xfs on the target's scratch disks, use::
1763
1764 tasks:
1765 - ceph:
1766 fs: xfs
1767 mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
1768 mount_options: [nobarrier, inode64]
1769
f91f0fd5
TL
1770 To change the cephfs's default max_mds (1), use::
1771
1772 tasks:
1773 - ceph:
1774 cephfs:
1775 max_mds: 2
1776
f67539c2
TL
1777 To change the max_mds of a specific filesystem, use::
1778
1779 tasks:
1780 - ceph:
1781 cephfs:
1782 max_mds: 2
1783 fs:
1784 - name: a
1785 max_mds: 3
1786 - name: b
1787
1788 In the above example, filesystem 'a' will have 'max_mds' 3,
1789 and filesystme 'b' will have 'max_mds' 2.
1790
f91f0fd5
TL
1791 To change the mdsmap's default session_timeout (60 seconds), use::
1792
1793 tasks:
1794 - ceph:
1795 cephfs:
1796 session_timeout: 300
1797
7c673cae
FG
1798 Note, this will cause the task to check the /scratch_devs file on each node
1799 for available devices. If no such file is found, /dev/sdb will be used.
1800
1801 To run some daemons under valgrind, include their names
1802 and the tool/args to use in a valgrind section::
1803
1804 tasks:
1805 - ceph:
1806 valgrind:
1807 mds.1: --tool=memcheck
1808 osd.1: [--tool=memcheck, --leak-check=no]
1809
1810 Those nodes which are using memcheck or valgrind will get
1811 checked for bad results.
1812
1813 To adjust or modify config options, use::
1814
1815 tasks:
1816 - ceph:
1817 conf:
1818 section:
1819 key: value
1820
1821 For example::
1822
1823 tasks:
1824 - ceph:
1825 conf:
1826 mds.0:
1827 some option: value
1828 other key: other value
1829 client.0:
1830 debug client: 10
1831 debug ms: 1
1832
1833 By default, the cluster log is checked for errors and warnings,
1834 and the run marked failed if any appear. You can ignore log
1835 entries by giving a list of egrep compatible regexes, i.e.:
1836
1837 tasks:
1838 - ceph:
cd265ab1 1839 log-ignorelist: ['foo.*bar', 'bad message']
7c673cae
FG
1840
1841 To run multiple ceph clusters, use multiple ceph tasks, and roles
1842 with a cluster name prefix, e.g. cluster1.client.0. Roles with no
1843 cluster use the default cluster name, 'ceph'. OSDs from separate
1844 clusters must be on separate hosts. Clients and non-osd daemons
1845 from multiple clusters may be colocated. For each cluster, add an
1846 instance of the ceph task with the cluster name specified, e.g.::
1847
1848 roles:
1849 - [mon.a, osd.0, osd.1]
1850 - [backup.mon.a, backup.osd.0, backup.osd.1]
1851 - [client.0, backup.client.0]
1852 tasks:
1853 - ceph:
1854 cluster: ceph
1855 - ceph:
1856 cluster: backup
1857
1858 :param ctx: Context
1859 :param config: Configuration
1860
1861 """
1862 if config is None:
1863 config = {}
1864 assert isinstance(config, dict), \
1865 "task ceph only supports a dictionary for configuration"
1866
1867 overrides = ctx.config.get('overrides', {})
1868 teuthology.deep_merge(config, overrides.get('ceph', {}))
1869
1870 first_ceph_cluster = False
1871 if not hasattr(ctx, 'daemons'):
1872 first_ceph_cluster = True
1873 ctx.daemons = DaemonGroup()
1874
1875 testdir = teuthology.get_testdir(ctx)
1876 if config.get('coverage'):
1877 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
1878 log.info('Creating coverage directory...')
1879 run.wait(
1880 ctx.cluster.run(
1881 args=[
1882 'install', '-d', '-m0755', '--',
1883 coverage_dir,
1884 ],
1885 wait=False,
1886 )
1887 )
1888
1889 if 'cluster' not in config:
1890 config['cluster'] = 'ceph'
1891
1892 validate_config(ctx, config)
1893
1894 subtasks = []
1895 if first_ceph_cluster:
1896 # these tasks handle general log setup and parsing on all hosts,
1897 # so they should only be run once
1898 subtasks = [
1899 lambda: ceph_log(ctx=ctx, config=None),
11fdf7f2 1900 lambda: ceph_crash(ctx=ctx, config=None),
7c673cae
FG
1901 lambda: valgrind_post(ctx=ctx, config=config),
1902 ]
1903
1904 subtasks += [
1905 lambda: cluster(ctx=ctx, config=dict(
1906 conf=config.get('conf', {}),
1907 fs=config.get('fs', 'xfs'),
1908 mkfs_options=config.get('mkfs_options', None),
1909 mount_options=config.get('mount_options', None),
7c673cae 1910 skip_mgr_daemons=config.get('skip_mgr_daemons', False),
cd265ab1 1911 log_ignorelist=config.get('log-ignorelist', []),
7c673cae
FG
1912 cpu_profile=set(config.get('cpu_profile', []),),
1913 cluster=config['cluster'],
11fdf7f2
TL
1914 mon_bind_msgr2=config.get('mon_bind_msgr2', True),
1915 mon_bind_addrvec=config.get('mon_bind_addrvec', True),
7c673cae
FG
1916 )),
1917 lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
1918 lambda: run_daemon(ctx=ctx, config=config, type_='mgr'),
1919 lambda: crush_setup(ctx=ctx, config=config),
1e59de90 1920 lambda: check_enable_crimson(ctx=ctx, config=config),
7c673cae 1921 lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
a4b75251 1922 lambda: setup_manager(ctx=ctx, config=config),
224ce89b 1923 lambda: create_rbd_pool(ctx=ctx, config=config),
7c673cae 1924 lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
f91f0fd5 1925 lambda: cephfs_setup(ctx=ctx, config=config),
9f95a23c 1926 lambda: watchdog_setup(ctx=ctx, config=config),
7c673cae
FG
1927 ]
1928
1929 with contextutil.nested(*subtasks):
7c673cae
FG
1930 try:
1931 if config.get('wait-for-healthy', True):
1932 healthy(ctx=ctx, config=dict(cluster=config['cluster']))
1933
1934 yield
1935 finally:
11fdf7f2
TL
1936 # set pg_num_targets back to actual pg_num, so we don't have to
1937 # wait for pending merges (which can take a while!)
1e59de90
TL
1938 if not config.get('skip_stop_pg_num_changes', True):
1939 ctx.managers[config['cluster']].stop_pg_num_changes()
11fdf7f2 1940
7c673cae 1941 if config.get('wait-for-scrub', True):
f67539c2
TL
1942 # wait for pgs to become active+clean in case any
1943 # recoveries were triggered since the last health check
1944 ctx.managers[config['cluster']].wait_for_clean()
7c673cae 1945 osd_scrub_pgs(ctx, config)
224ce89b
WB
1946
1947 # stop logging health to clog during shutdown, or else we generate
1948 # a bunch of scary messages unrelated to our actual run.
1949 firstmon = teuthology.get_first_mon(ctx, config, config['cluster'])
1950 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1951 mon0_remote.run(
1952 args=[
1953 'sudo',
1954 'ceph',
1955 '--cluster', config['cluster'],
9f95a23c
TL
1956 'config', 'set', 'global',
1957 'mon_health_to_clog', 'false',
1958 ],
1959 check_status=False,
224ce89b 1960 )