1"""
2Ceph cluster task.
3
4Handle the setup, starting, and clean-up of a Ceph cluster.
5"""
6from io import BytesIO
7
8import argparse
9import configobj
10import contextlib
11import errno
12import logging
13import os
14import json
15import time
16import gevent
17import re
18import six
19import socket
20
21from paramiko import SSHException
22from tasks.ceph_manager import CephManager, write_conf
23from tarfile import ReadError
24from tasks.cephfs.filesystem import Filesystem
25from teuthology import misc as teuthology
26from teuthology import contextutil
27from teuthology import exceptions
28from teuthology.orchestra import run
29import tasks.ceph_client as cclient
30from teuthology.orchestra.daemon import DaemonGroup
31from tasks.daemonwatchdog import DaemonWatchdog
32
33CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']
34DATA_PATH = '/var/lib/ceph/{type_}/{cluster}-{id_}'
35
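# For illustration (not part of the original file): DATA_PATH expands to the
# standard daemon data directory, e.g.
#   DATA_PATH.format(type_='osd', cluster='ceph', id_='0')
#   # -> '/var/lib/ceph/osd/ceph-0'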
36log = logging.getLogger(__name__)
37
38
39def generate_caps(type_):
40 """
41 Each call will return the next capability for each system type
42 (essentially a subset of possible role values). Valid types are osd,
43 mgr, mds and client.
44 """
45 defaults = dict(
46 osd=dict(
47 mon='allow *',
48 mgr='allow *',
49 osd='allow *',
50 ),
51 mgr=dict(
52 mon='allow profile mgr',
53 osd='allow *',
54 mds='allow *',
55 ),
56 mds=dict(
57 mon='allow *',
58 mgr='allow *',
59 osd='allow *',
60 mds='allow',
61 ),
62 client=dict(
63 mon='allow rw',
64 mgr='allow r',
65 osd='allow rwx',
66 mds='allow',
67 ),
68 )
69 for subsystem, capability in defaults[type_].items():
70 yield '--cap'
71 yield subsystem
72 yield capability
73
74
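# A minimal sketch of how generate_caps() is consumed when building a
# ceph-authtool command line; ordering follows the dicts above (insertion
# order on Python 3):
#
#   caps = list(generate_caps('client'))
#   # -> ['--cap', 'mon', 'allow rw', '--cap', 'mgr', 'allow r',
#   #     '--cap', 'osd', 'allow rwx', '--cap', 'mds', 'allow']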
75@contextlib.contextmanager
76def ceph_crash(ctx, config):
77 """
78 Gather crash dumps from /var/lib/ceph/crash.
79 """
80 try:
81 yield
82
83 finally:
84 if ctx.archive is not None:
85 log.info('Archiving crash dumps...')
86 path = os.path.join(ctx.archive, 'remote')
87 try:
88 os.makedirs(path)
89 except OSError:
90 pass
91 for remote in ctx.cluster.remotes.keys():
92 sub = os.path.join(path, remote.shortname)
93 try:
94 os.makedirs(sub)
95 except OSError:
96 pass
97 try:
98 teuthology.pull_directory(remote, '/var/lib/ceph/crash',
99 os.path.join(sub, 'crash'))
100 except ReadError:
101 pass
102
103
104@contextlib.contextmanager
105def ceph_log(ctx, config):
106 """
107 Create /var/log/ceph log directory that is open to everyone.
108 Add valgrind and profiling-logger directories.
109
110 :param ctx: Context
111 :param config: Configuration
112 """
113 log.info('Making ceph log dir writeable by non-root...')
114 run.wait(
115 ctx.cluster.run(
116 args=[
117 'sudo',
118 'chmod',
119 '777',
120 '/var/log/ceph',
121 ],
122 wait=False,
123 )
124 )
125 log.info('Disabling ceph logrotate...')
126 run.wait(
127 ctx.cluster.run(
128 args=[
129 'sudo',
130 'rm', '-f', '--',
131 '/etc/logrotate.d/ceph',
132 ],
133 wait=False,
134 )
135 )
136 log.info('Creating extra log directories...')
137 run.wait(
138 ctx.cluster.run(
139 args=[
140 'sudo',
141 'install', '-d', '-m0777', '--',
142 '/var/log/ceph/valgrind',
143 '/var/log/ceph/profiling-logger',
144 ],
145 wait=False,
146 )
147 )
148
149 class Rotater(object):
150 stop_event = gevent.event.Event()
151
152 def invoke_logrotate(self):
153 # 1) install ceph-test.conf in /etc/logrotate.d
154 # 2) continuously loop over logrotate invocation with ceph-test.conf
155 while not self.stop_event.is_set():
156 self.stop_event.wait(timeout=30)
157 try:
158 run.wait(
159 ctx.cluster.run(
160 args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'
161 ],
162 wait=False,
163 )
164 )
165 except exceptions.ConnectionLostError as e:
166 # Some tests may power off nodes during test, in which
167 # case we will see connection errors that we should ignore.
168 log.debug("Missed logrotate, node '{0}' is offline".format(
169 e.node))
170 except EOFError:
171 # Paramiko sometimes raises this when it fails to
172 # connect to a node during open_session. As with
173 # ConnectionLostError, we ignore this because nodes
174 # are allowed to get power cycled during tests.
175 log.debug("Missed logrotate, EOFError")
176 except SSHException:
177 log.debug("Missed logrotate, SSHException")
178 except socket.error as e:
179 if e.errno in (errno.EHOSTUNREACH, errno.ECONNRESET):
180 log.debug("Missed logrotate, host unreachable")
181 else:
182 raise
183
184 def begin(self):
185 self.thread = gevent.spawn(self.invoke_logrotate)
186
187 def end(self):
188 self.stop_event.set()
189 self.thread.get()
190
191 def write_rotate_conf(ctx, daemons):
192 testdir = teuthology.get_testdir(ctx)
193 remote_logrotate_conf = '%s/logrotate.ceph-test.conf' % testdir
194 rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf')
195 with open(rotate_conf_path, 'rb') as f:
196 conf = ""
197 for daemon, size in daemons.items():
198 log.info('writing logrotate stanza for {}'.format(daemon))
199 conf += six.ensure_str(f.read()).format(daemon_type=daemon,
200 max_size=size)
201 f.seek(0, 0)
202
203 for remote in ctx.cluster.remotes.keys():
204 teuthology.write_file(remote=remote,
205 path=remote_logrotate_conf,
206 data=BytesIO(conf.encode())
207 )
208 remote.run(
209 args=[
210 'sudo',
211 'mv',
212 remote_logrotate_conf,
213 '/etc/logrotate.d/ceph-test.conf',
214 run.Raw('&&'),
215 'sudo',
216 'chmod',
217 '0644',
218 '/etc/logrotate.d/ceph-test.conf',
219 run.Raw('&&'),
220 'sudo',
221 'chown',
222 'root.root',
223 '/etc/logrotate.d/ceph-test.conf'
224 ]
225 )
226 remote.chcon('/etc/logrotate.d/ceph-test.conf',
227 'system_u:object_r:etc_t:s0')
228
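    # A minimal sketch of the expected 'log-rotate' entry in the job yaml;
    # the daemon types and sizes below are illustrative, not prescriptive:
    #
    #   log-rotate:        # maps daemon type to a maximum log size
    #     ceph-osd: 10G
    #     ceph-mds: 10G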
229 if ctx.config.get('log-rotate'):
230 daemons = ctx.config.get('log-rotate')
231 log.info('Setting up log rotation with ' + str(daemons))
232 write_rotate_conf(ctx, daemons)
233 logrotater = Rotater()
234 logrotater.begin()
235 try:
236 yield
237
238 finally:
239 if ctx.config.get('log-rotate'):
240 log.info('Shutting down logrotate')
241 logrotater.end()
242 ctx.cluster.run(
243 args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'
244 ]
245 )
246 if ctx.archive is not None and \
247 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
248 # and logs
249 log.info('Compressing logs...')
250 run.wait(
251 ctx.cluster.run(
252 args=[
253 'sudo',
254 'find',
255 '/var/log/ceph',
256 '-name',
257 '*.log',
258 '-print0',
259 run.Raw('|'),
260 'sudo',
261 'xargs',
262 '-0',
263 '--no-run-if-empty',
264 '--',
265 'gzip',
266 '--',
267 ],
268 wait=False,
269 ),
270 )
271
272 log.info('Archiving logs...')
273 path = os.path.join(ctx.archive, 'remote')
274 try:
275 os.makedirs(path)
276 except OSError:
277 pass
278 for remote in ctx.cluster.remotes.keys():
279 sub = os.path.join(path, remote.shortname)
280 try:
281 os.makedirs(sub)
282 except OSError:
283 pass
284 teuthology.pull_directory(remote, '/var/log/ceph',
285 os.path.join(sub, 'log'))
286
287
288def assign_devs(roles, devs):
289 """
290 Create a dictionary of devs indexed by roles
291
292 :param roles: List of roles
293 :param devs: Corresponding list of devices.
294 :returns: Dictionary of devs indexed by roles.
295 """
296 return dict(zip(roles, devs))
297
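# For illustration (hypothetical role and device names):
#
#   assign_devs(['osd.0', 'osd.1'], ['/dev/vdb', '/dev/vdc'])
#   # -> {'osd.0': '/dev/vdb', 'osd.1': '/dev/vdc'}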
298
299@contextlib.contextmanager
300def valgrind_post(ctx, config):
301 """
302 After the tests run, look through all the valgrind logs. Exceptions are raised
303 if textual errors occurred in the logs, or if valgrind exceptions were detected in
304 the logs.
305
306 :param ctx: Context
307 :param config: Configuration
308 """
309 try:
310 yield
311 finally:
312 lookup_procs = list()
313 log.info('Checking for errors in any valgrind logs...')
314 for remote in ctx.cluster.remotes.keys():
315 # look at valgrind logs for each node
316 proc = remote.run(
317 args="sudo zgrep '<kind>' /var/log/ceph/valgrind/* "
318 # include a second file so that we always get
319 # a filename prefix on the output
320 "/dev/null | sort | uniq",
321 wait=False,
322 check_status=False,
323 stdout=BytesIO(),
324 )
325 lookup_procs.append((proc, remote))
326
327 valgrind_exception = None
328 for (proc, remote) in lookup_procs:
329 proc.wait()
330 out = six.ensure_str(proc.stdout.getvalue())
331 for line in out.split('\n'):
332 if line == '':
333 continue
334 try:
335 (file, kind) = line.split(':')
336 except Exception:
337 log.error('failed to split line %s', line)
338 raise
339 log.debug('file %s kind %s', file, kind)
340 if (file.find('mds') >= 0) and kind.find('Lost') > 0:
341 continue
342 log.error('saw valgrind issue %s in %s', kind, file)
343 valgrind_exception = Exception('saw valgrind issues')
344
345 if config.get('expect_valgrind_errors'):
346 if not valgrind_exception:
347 raise Exception('expected valgrind issues and found none')
348 else:
349 if valgrind_exception:
350 raise valgrind_exception
351
352
353@contextlib.contextmanager
354def crush_setup(ctx, config):
355 cluster_name = config['cluster']
356 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
357 (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
358
359 profile = config.get('crush_tunables', 'default')
360 log.info('Setting crush tunables to %s', profile)
361 mon_remote.run(
362 args=['sudo', 'ceph', '--cluster', cluster_name,
363 'osd', 'crush', 'tunables', profile])
364 yield
365
366
367@contextlib.contextmanager
368def create_rbd_pool(ctx, config):
369 cluster_name = config['cluster']
370 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
371 (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
372 log.info('Waiting for OSDs to come up')
373 teuthology.wait_until_osds_up(
374 ctx,
375 cluster=ctx.cluster,
376 remote=mon_remote,
377 ceph_cluster=cluster_name,
378 )
379 if config.get('create_rbd_pool', True):
380 log.info('Creating RBD pool')
381 mon_remote.run(
382 args=['sudo', 'ceph', '--cluster', cluster_name,
383 'osd', 'pool', 'create', 'rbd', '8'])
384 mon_remote.run(
385 args=[
386 'sudo', 'ceph', '--cluster', cluster_name,
387 'osd', 'pool', 'application', 'enable',
388 'rbd', 'rbd', '--yes-i-really-mean-it'
389 ],
390 check_status=False)
391 yield
392
393@contextlib.contextmanager
394def cephfs_setup(ctx, config):
395 cluster_name = config['cluster']
396
397 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
398 (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
399 mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
400 # If there are any MDSs, then create a filesystem for them to use
401 # Do this last because requires mon cluster to be up and running
402 if mdss.remotes:
403 log.info('Setting up CephFS filesystem...')
404
405 fs = Filesystem(ctx, name='cephfs', create=True,
406 ec_profile=config.get('cephfs_ec_profile', None))
407
408 max_mds = config.get('max_mds', 1)
409 if max_mds > 1:
410 fs.set_max_mds(max_mds)
411
412 yield
413
414@contextlib.contextmanager
415def watchdog_setup(ctx, config):
416 ctx.ceph[config['cluster']].thrashers = []
417 ctx.ceph[config['cluster']].watchdog = DaemonWatchdog(ctx, config, ctx.ceph[config['cluster']].thrashers)
418 ctx.ceph[config['cluster']].watchdog.start()
419 yield
420
421def get_mons(roles, ips, cluster_name,
422 mon_bind_msgr2=False,
423 mon_bind_addrvec=False):
424 """
425 Get monitors and their associated addresses
426 """
427 mons = {}
428 v1_ports = {}
429 v2_ports = {}
430 is_mon = teuthology.is_type('mon', cluster_name)
431 for idx, roles in enumerate(roles):
432 for role in roles:
433 if not is_mon(role):
434 continue
435 if ips[idx] not in v1_ports:
436 v1_ports[ips[idx]] = 6789
437 else:
438 v1_ports[ips[idx]] += 1
439 if mon_bind_msgr2:
440 if ips[idx] not in v2_ports:
441 v2_ports[ips[idx]] = 3300
442 addr = '{ip}'.format(ip=ips[idx])
443 else:
444 assert mon_bind_addrvec
445 v2_ports[ips[idx]] += 1
446 addr = '[v2:{ip}:{port2},v1:{ip}:{port1}]'.format(
447 ip=ips[idx],
448 port2=v2_ports[ips[idx]],
449 port1=v1_ports[ips[idx]],
450 )
451 elif mon_bind_addrvec:
452 addr = '[v1:{ip}:{port}]'.format(
453 ip=ips[idx],
454 port=v1_ports[ips[idx]],
455 )
456 else:
457 addr = '{ip}:{port}'.format(
458 ip=ips[idx],
459 port=v1_ports[ips[idx]],
460 )
461 mons[role] = addr
462 assert mons
463 return mons
464
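# For illustration, a hypothetical two-host layout (role names and addresses
# are made up). Note that the first mon on a host gets only the bare IP; a
# second mon on the same host would get an explicit addrvec such as
# '[v2:<ip>:3301,v1:<ip>:6790]'.
#
#   get_mons([['mon.a', 'osd.0'], ['mon.b', 'osd.1']],
#            ['172.21.0.10', '172.21.0.11'], 'ceph',
#            mon_bind_msgr2=True, mon_bind_addrvec=True)
#   # -> {'mon.a': '172.21.0.10', 'mon.b': '172.21.0.11'}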
465def skeleton_config(ctx, roles, ips, mons, cluster='ceph'):
466 """
467 Returns a ConfigObj that is prefilled with a skeleton config.
468
469 Use conf[section][key]=value or conf.merge to change it.
470
471 Use conf.write to write it out, override .filename first if you want.
472 """
473 path = os.path.join(os.path.dirname(__file__), 'ceph.conf.template')
474 conf = configobj.ConfigObj(path, file_error=True)
475 mon_hosts = []
476 for role, addr in mons.items():
477 mon_cluster, _, _ = teuthology.split_role(role)
478 if mon_cluster != cluster:
479 continue
480 name = teuthology.ceph_role(role)
481 conf.setdefault(name, {})
482 mon_hosts.append(addr)
483 conf.setdefault('global', {})
484 conf['global']['mon host'] = ','.join(mon_hosts)
485 # set up standby mds's
486 is_mds = teuthology.is_type('mds', cluster)
487 for roles_subset in roles:
488 for role in roles_subset:
489 if is_mds(role):
490 name = teuthology.ceph_role(role)
491 conf.setdefault(name, {})
492 return conf
493
494def create_simple_monmap(ctx, remote, conf, mons,
495 path=None,
496 mon_bind_addrvec=False):
497 """
498 Writes a simple monmap based on current ceph.conf into path, or
499 <testdir>/monmap by default.
500
501 Assumes ceph_conf is up to date.
502
503 Assumes mon sections are named "mon.*", with the dot.
504
505 :return the FSID (as a string) of the newly created monmap
506 """
507
508 addresses = list(mons.items())
509 assert addresses, "There are no monitors in config!"
510 log.debug('Ceph mon addresses: %s', addresses)
511
512 testdir = teuthology.get_testdir(ctx)
513 args = [
514 'adjust-ulimits',
515 'ceph-coverage',
516 '{tdir}/archive/coverage'.format(tdir=testdir),
517 'monmaptool',
518 '--create',
519 '--clobber',
520 ]
521 if mon_bind_addrvec:
522 args.extend(['--enable-all-features'])
523 for (role, addr) in addresses:
524 _, _, n = teuthology.split_role(role)
525 if mon_bind_addrvec and (',' in addr or 'v' in addr or ':' in addr):
526 args.extend(('--addv', n, addr))
527 else:
528 args.extend(('--add', n, addr))
529 if not path:
530 path = '{tdir}/monmap'.format(tdir=testdir)
531 args.extend([
532 '--print',
533 path
534 ])
535
536 monmap_output = remote.sh(args)
537 fsid = re.search("generated fsid (.+)$",
538 monmap_output, re.MULTILINE).group(1)
539 return fsid
540
541@contextlib.contextmanager
542def cluster(ctx, config):
543 """
544 Handle the creation and removal of a ceph cluster.
545
546 On startup:
547 Create directories needed for the cluster.
548 Create remote journals for all osds.
549 Create and set keyring.
550 Copy the monmap to the test systems.
551 Setup mon nodes.
552 Setup mds nodes.
553 Mkfs osd nodes.
554 Add keyring information to monmaps
555 Mkfs mon nodes.
556
557 On exit:
558 If errors occurred, extract a failure message and store in ctx.summary.
559 Unmount all test files and temporary journaling files.
560 Save the monitor information and archive all ceph logs.
561 Cleanup the keyring setup, and remove all monitor map and data files left over.
562
563 :param ctx: Context
564 :param config: Configuration
565 """
566 if ctx.config.get('use_existing_cluster', False) is True:
567 log.info("'use_existing_cluster' is true; skipping cluster creation")
568 yield
569
570 testdir = teuthology.get_testdir(ctx)
571 cluster_name = config['cluster']
572 data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
573 log.info('Creating ceph cluster %s...', cluster_name)
574 log.info('config %s', config)
575 log.info('ctx.config %s', ctx.config)
576 run.wait(
577 ctx.cluster.run(
578 args=[
579 'install', '-d', '-m0755', '--',
580 data_dir,
581 ],
582 wait=False,
583 )
584 )
585
586 run.wait(
587 ctx.cluster.run(
588 args=[
589 'sudo',
590 'install', '-d', '-m0777', '--', '/var/run/ceph',
591 ],
592 wait=False,
593 )
594 )
595
596 devs_to_clean = {}
597 remote_to_roles_to_devs = {}
598 osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
599 for remote, roles_for_host in osds.remotes.items():
600 devs = teuthology.get_scratch_devices(remote)
601 roles_to_devs = assign_devs(
602 teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), devs
603 )
604 devs_to_clean[remote] = []
605 log.info('osd dev map: {}'.format(roles_to_devs))
606 assert roles_to_devs, \
607 "remote {} has osd roles, but no osd devices were specified!".format(remote.hostname)
608 remote_to_roles_to_devs[remote] = roles_to_devs
609 log.info("remote_to_roles_to_devs: {}".format(remote_to_roles_to_devs))
610 for osd_role, dev_name in remote_to_roles_to_devs.items():
611 assert dev_name, "{} has no associated device!".format(osd_role)
612
613 log.info('Generating config...')
614 remotes_and_roles = ctx.cluster.remotes.items()
615 roles = [role_list for (remote, role_list) in remotes_and_roles]
616 ips = [host for (host, port) in
617 (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
618 mons = get_mons(
619 roles, ips, cluster_name,
620 mon_bind_msgr2=config.get('mon_bind_msgr2'),
621 mon_bind_addrvec=config.get('mon_bind_addrvec'),
622 )
623 conf = skeleton_config(
624 ctx, roles=roles, ips=ips, mons=mons, cluster=cluster_name,
625 )
626 for section, keys in config['conf'].items():
627 for key, value in keys.items():
628 log.info("[%s] %s = %s" % (section, key, value))
629 if section not in conf:
630 conf[section] = {}
631 conf[section][key] = value
632
633 if not hasattr(ctx, 'ceph'):
634 ctx.ceph = {}
635 ctx.ceph[cluster_name] = argparse.Namespace()
636 ctx.ceph[cluster_name].conf = conf
637 ctx.ceph[cluster_name].mons = mons
638
639 default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
640 keyring_path = config.get('keyring_path', default_keyring)
641
642 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
643
644 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
645
646 log.info('Setting up %s...' % firstmon)
647 ctx.cluster.only(firstmon).run(
648 args=[
649 'sudo',
650 'adjust-ulimits',
651 'ceph-coverage',
652 coverage_dir,
653 'ceph-authtool',
654 '--create-keyring',
655 keyring_path,
656 ],
657 )
658 ctx.cluster.only(firstmon).run(
659 args=[
660 'sudo',
661 'adjust-ulimits',
662 'ceph-coverage',
663 coverage_dir,
664 'ceph-authtool',
665 '--gen-key',
666 '--name=mon.',
667 keyring_path,
668 ],
669 )
670 ctx.cluster.only(firstmon).run(
671 args=[
672 'sudo',
673 'chmod',
674 '0644',
675 keyring_path,
676 ],
677 )
678 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
679 monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
680 cluster=cluster_name)
681 fsid = create_simple_monmap(
682 ctx,
683 remote=mon0_remote,
684 conf=conf,
685 mons=mons,
686 path=monmap_path,
687 mon_bind_addrvec=config.get('mon_bind_addrvec'),
688 )
689 if not 'global' in conf:
690 conf['global'] = {}
691 conf['global']['fsid'] = fsid
692
693 default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
694 conf_path = config.get('conf_path', default_conf_path)
695 log.info('Writing %s for FSID %s...' % (conf_path, fsid))
696 write_conf(ctx, conf_path, cluster_name)
697
698 log.info('Creating admin key on %s...' % firstmon)
699 ctx.cluster.only(firstmon).run(
700 args=[
701 'sudo',
702 'adjust-ulimits',
703 'ceph-coverage',
704 coverage_dir,
705 'ceph-authtool',
706 '--gen-key',
707 '--name=client.admin',
708 '--cap', 'mon', 'allow *',
709 '--cap', 'osd', 'allow *',
710 '--cap', 'mds', 'allow *',
711 '--cap', 'mgr', 'allow *',
712 keyring_path,
713 ],
714 )
715
716 log.info('Copying monmap to all nodes...')
717 keyring = teuthology.get_file(
718 remote=mon0_remote,
719 path=keyring_path,
720 )
721 monmap = teuthology.get_file(
722 remote=mon0_remote,
723 path=monmap_path,
724 )
725
726 for rem in ctx.cluster.remotes.keys():
727 # copy mon key and initial monmap
728 log.info('Sending monmap to node {remote}'.format(remote=rem))
729 teuthology.sudo_write_file(
730 remote=rem,
731 path=keyring_path,
732 data=keyring,
733 perms='0644'
734 )
735 teuthology.write_file(
736 remote=rem,
737 path=monmap_path,
738 data=monmap,
739 )
740
741 log.info('Setting up mon nodes...')
742 mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))
743
744 if not config.get('skip_mgr_daemons', False):
745 log.info('Setting up mgr nodes...')
746 mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name))
747 for remote, roles_for_host in mgrs.remotes.items():
748 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr',
749 cluster_name):
750 _, _, id_ = teuthology.split_role(role)
751 mgr_dir = DATA_PATH.format(
752 type_='mgr', cluster=cluster_name, id_=id_)
753 remote.run(
754 args=[
755 'sudo',
756 'mkdir',
757 '-p',
758 mgr_dir,
759 run.Raw('&&'),
760 'sudo',
761 'adjust-ulimits',
762 'ceph-coverage',
763 coverage_dir,
764 'ceph-authtool',
765 '--create-keyring',
766 '--gen-key',
767 '--name=mgr.{id}'.format(id=id_),
768 mgr_dir + '/keyring',
769 ],
770 )
771
772 log.info('Setting up mds nodes...')
773 mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
774 for remote, roles_for_host in mdss.remotes.items():
775 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
776 cluster_name):
777 _, _, id_ = teuthology.split_role(role)
778 mds_dir = DATA_PATH.format(
779 type_='mds', cluster=cluster_name, id_=id_)
780 remote.run(
781 args=[
782 'sudo',
783 'mkdir',
784 '-p',
785 mds_dir,
786 run.Raw('&&'),
787 'sudo',
788 'adjust-ulimits',
789 'ceph-coverage',
790 coverage_dir,
791 'ceph-authtool',
792 '--create-keyring',
793 '--gen-key',
794 '--name=mds.{id}'.format(id=id_),
795 mds_dir + '/keyring',
796 ],
797 )
798 remote.run(args=[
799 'sudo', 'chown', '-R', 'ceph:ceph', mds_dir
800 ])
801
802 cclient.create_keyring(ctx, cluster_name)
803 log.info('Running mkfs on osd nodes...')
804
805 if not hasattr(ctx, 'disk_config'):
806 ctx.disk_config = argparse.Namespace()
807 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
808 ctx.disk_config.remote_to_roles_to_dev = {}
809 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
810 ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
811 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
812 ctx.disk_config.remote_to_roles_to_dev_fstype = {}
813
814 teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)
815
816 log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
817 for remote, roles_for_host in osds.remotes.items():
818 roles_to_devs = remote_to_roles_to_devs[remote]
819
820 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
821 _, _, id_ = teuthology.split_role(role)
11fdf7f2
TL
822 mnt_point = DATA_PATH.format(
823 type_='osd', cluster=cluster_name, id_=id_)
824 remote.run(
825 args=[
826 'sudo',
827 'mkdir',
828 '-p',
829 mnt_point,
830 ])
831 log.info('roles_to_devs: {}'.format(roles_to_devs))
832 log.info('role: {}'.format(role))
833 if roles_to_devs.get(role):
834 dev = roles_to_devs[role]
835 fs = config.get('fs')
836 package = None
837 mkfs_options = config.get('mkfs_options')
838 mount_options = config.get('mount_options')
839 if fs == 'btrfs':
840 # package = 'btrfs-tools'
841 if mount_options is None:
842 mount_options = ['noatime', 'user_subvol_rm_allowed']
843 if mkfs_options is None:
844 mkfs_options = ['-m', 'single',
845 '-l', '32768',
846 '-n', '32768']
847 if fs == 'xfs':
848 # package = 'xfsprogs'
849 if mount_options is None:
850 mount_options = ['noatime']
851 if mkfs_options is None:
852 mkfs_options = ['-f', '-i', 'size=2048']
853 if fs == 'ext4' or fs == 'ext3':
854 if mount_options is None:
855 mount_options = ['noatime', 'user_xattr']
856
857 if mount_options is None:
858 mount_options = []
859 if mkfs_options is None:
860 mkfs_options = []
861 mkfs = ['mkfs.%s' % fs] + mkfs_options
862 log.info('%s on %s on %s' % (mkfs, dev, remote))
863 if package is not None:
864 remote.sh('sudo apt-get install -y %s' % package)
865
866 try:
867 remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
868 except run.CommandFailedError:
869 # Newer btrfs-tools doesn't prompt for overwrite, use -f
870 if '-f' not in mount_options:
871 mkfs_options.append('-f')
872 mkfs = ['mkfs.%s' % fs] + mkfs_options
873 log.info('%s on %s on %s' % (mkfs, dev, remote))
874 remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
875
876 log.info('mount %s on %s -o %s' % (dev, remote,
877 ','.join(mount_options)))
878 remote.run(
879 args=[
880 'sudo',
881 'mount',
882 '-t', fs,
883 '-o', ','.join(mount_options),
884 dev,
885 mnt_point,
886 ]
887 )
888 remote.run(
889 args=[
890 'sudo', '/sbin/restorecon', mnt_point,
891 ],
892 check_status=False,
893 )
894 if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
895 ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
896 ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
897 if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
898 ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
899 ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
900 devs_to_clean[remote].append(mnt_point)
901
902 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
903 _, _, id_ = teuthology.split_role(role)
904 try:
905 remote.run(
906 args=[
907 'sudo',
908 'MALLOC_CHECK_=3',
909 'adjust-ulimits',
910 'ceph-coverage',
911 coverage_dir,
912 'ceph-osd',
913 '--no-mon-config',
914 '--cluster',
915 cluster_name,
916 '--mkfs',
917 '--mkkey',
918 '-i', id_,
919 '--monmap', monmap_path,
920 ],
921 )
922 except run.CommandFailedError:
923 # try without --no-mon-config.. this may be an upgrade test
924 remote.run(
925 args=[
926 'sudo',
927 'MALLOC_CHECK_=3',
928 'adjust-ulimits',
929 'ceph-coverage',
930 coverage_dir,
931 'ceph-osd',
932 '--cluster',
933 cluster_name,
934 '--mkfs',
935 '--mkkey',
936 '-i', id_,
937 '--monmap', monmap_path,
938 ],
939 )
940 mnt_point = DATA_PATH.format(
941 type_='osd', cluster=cluster_name, id_=id_)
942 try:
943 remote.run(args=[
944 'sudo', 'chown', '-R', 'ceph:ceph', mnt_point
945 ])
946 except run.CommandFailedError as e:
947 # hammer does not have ceph user, so ignore this error
948 log.info('ignoring error when chown ceph:ceph, '
949 'probably installing hammer: %s', e)
950
951 log.info('Reading keys from all nodes...')
952 keys_fp = BytesIO()
953 keys = []
954 for remote, roles_for_host in ctx.cluster.remotes.items():
955 for type_ in ['mgr', 'mds', 'osd']:
956 if type_ == 'mgr' and config.get('skip_mgr_daemons', False):
957 continue
958 for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
959 _, _, id_ = teuthology.split_role(role)
960 data = teuthology.get_file(
961 remote=remote,
962 path=os.path.join(
963 DATA_PATH.format(
964 type_=type_, id_=id_, cluster=cluster_name),
965 'keyring',
966 ),
967 sudo=True,
968 )
969 keys.append((type_, id_, data))
970 keys_fp.write(data)
971 for remote, roles_for_host in ctx.cluster.remotes.items():
972 for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', cluster_name):
973 _, _, id_ = teuthology.split_role(role)
974 data = teuthology.get_file(
975 remote=remote,
976 path='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_, cluster=cluster_name)
977 )
978 keys.append(('client', id_, data))
979 keys_fp.write(data)
980
981 log.info('Adding keys to all mons...')
982 writes = mons.run(
983 args=[
984 'sudo', 'tee', '-a',
985 keyring_path,
986 ],
987 stdin=run.PIPE,
988 wait=False,
989 stdout=BytesIO(),
990 )
991 keys_fp.seek(0)
992 teuthology.feed_many_stdins_and_close(keys_fp, writes)
993 run.wait(writes)
994 for type_, id_, data in keys:
995 run.wait(
996 mons.run(
997 args=[
998 'sudo',
999 'adjust-ulimits',
1000 'ceph-coverage',
1001 coverage_dir,
1002 'ceph-authtool',
1003 keyring_path,
1004 '--name={type}.{id}'.format(
1005 type=type_,
1006 id=id_,
1007 ),
1008 ] + list(generate_caps(type_)),
1009 wait=False,
1010 ),
1011 )
1012
1013 log.info('Running mkfs on mon nodes...')
1014 for remote, roles_for_host in mons.remotes.items():
1015 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon', cluster_name):
1016 _, _, id_ = teuthology.split_role(role)
1017 mnt_point = DATA_PATH.format(
1018 type_='mon', id_=id_, cluster=cluster_name)
1019 remote.run(
1020 args=[
1021 'sudo',
1022 'mkdir',
1023 '-p',
1024 mnt_point,
1025 ],
1026 )
1027 remote.run(
1028 args=[
1029 'sudo',
1030 'adjust-ulimits',
1031 'ceph-coverage',
1032 coverage_dir,
1033 'ceph-mon',
1034 '--cluster', cluster_name,
1035 '--mkfs',
1036 '-i', id_,
1037 '--monmap', monmap_path,
1038 '--keyring', keyring_path,
1039 ],
1040 )
1041 try:
1042 remote.run(args=[
1043 'sudo', 'chown', '-R', 'ceph:ceph', mnt_point
1044 ])
1045 except run.CommandFailedError as e:
1046 # hammer does not have ceph user, so ignore this error
1047 log.info('ignoring error when chown ceph:ceph, '
1048 'probably installing hammer: %s', e)
1049
1050 run.wait(
1051 mons.run(
1052 args=[
1053 'rm',
1054 '--',
1055 monmap_path,
1056 ],
1057 wait=False,
1058 ),
1059 )
1060
1061 try:
1062 yield
1063 except Exception:
1064 # we need to know this below
1065 ctx.summary['success'] = False
1066 raise
1067 finally:
1068 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1069
1070 log.info('Checking cluster log for badness...')
1071
1072 def first_in_ceph_log(pattern, excludes):
1073 """
1074 Find the first occurrence of the pattern specified in the Ceph log.
1075 Returns None if none found.
1076
1077 :param pattern: Pattern scanned for.
1078 :param excludes: Patterns to ignore.
1079 :return: First line of text (or None if not found)
1080 """
1081 args = [
1082 'sudo',
1083 'egrep', pattern,
1084 '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
1085 ]
1086 for exclude in excludes:
1087 args.extend([run.Raw('|'), 'egrep', '-v', exclude])
1088 args.extend([
1089 run.Raw('|'), 'head', '-n', '1',
1090 ])
1091 stdout = mon0_remote.sh(args)
1092 return stdout or None
1093
1094 if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
1095 config['log_whitelist']) is not None:
1096 log.warning('Found errors (ERR|WRN|SEC) in cluster log')
1097 ctx.summary['success'] = False
1098 # use the most severe problem as the failure reason
1099 if 'failure_reason' not in ctx.summary:
1100 for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
1101 match = first_in_ceph_log(pattern, config['log_whitelist'])
1102 if match is not None:
1103 ctx.summary['failure_reason'] = \
1104 '"{match}" in cluster log'.format(
1105 match=match.rstrip('\n'),
1106 )
1107 break
1108
1109 for remote, dirs in devs_to_clean.items():
1110 for dir_ in dirs:
1111 log.info('Unmounting %s on %s' % (dir_, remote))
1112 try:
1113 remote.run(
1114 args=[
1115 'sync',
1116 run.Raw('&&'),
1117 'sudo',
1118 'umount',
1119 '-f',
1120 dir_
1121 ]
1122 )
1123 except Exception as e:
1124 remote.run(args=[
1125 'sudo',
1126 run.Raw('PATH=/usr/sbin:$PATH'),
1127 'lsof',
1128 run.Raw(';'),
1129 'ps', 'auxf',
1130 ])
1131 raise e
1132
1133 if ctx.archive is not None and \
1134 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
1135
1136 # archive mon data, too
1137 log.info('Archiving mon data...')
1138 path = os.path.join(ctx.archive, 'data')
1139 try:
1140 os.makedirs(path)
1141 except OSError as e:
1142 if e.errno == errno.EEXIST:
1143 pass
1144 else:
1145 raise
1146 for remote, roles in mons.remotes.items():
1147 for role in roles:
1148 is_mon = teuthology.is_type('mon', cluster_name)
1149 if is_mon(role):
1150 _, _, id_ = teuthology.split_role(role)
1151 mon_dir = DATA_PATH.format(
1152 type_='mon', id_=id_, cluster=cluster_name)
1153 teuthology.pull_directory_tarball(
1154 remote,
1155 mon_dir,
1156 path + '/' + role + '.tgz')
1157
1158 log.info('Cleaning ceph cluster...')
1159 run.wait(
1160 ctx.cluster.run(
1161 args=[
1162 'sudo',
1163 'rm',
1164 '-rf',
1165 '--',
1166 conf_path,
1167 keyring_path,
1168 data_dir,
1169 monmap_path,
1170 run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
1171 ],
1172 wait=False,
1173 ),
1174 )
1175
1176
1177def osd_scrub_pgs(ctx, config):
1178 """
1179 Scrub pgs when we exit.
1180
1181 First make sure all pgs are active and clean.
1182 Next scrub all osds.
1183 Then periodically check until all pgs have scrub time stamps that
1184 indicate the last scrub completed. Time out if no progress is made
1185 here after two minutes.
1186 """
1187 retries = 40
1188 delays = 20
1189 cluster_name = config['cluster']
1190 manager = ctx.managers[cluster_name]
1191 all_clean = False
1192 for _ in range(0, retries):
1193 stats = manager.get_pg_stats()
1194 unclean = [stat['pgid'] for stat in stats if 'active+clean' not in stat['state']]
1195 split_merge = []
1196 osd_dump = manager.get_osd_dump_json()
1197 try:
1198 split_merge = [i['pool_name'] for i in osd_dump['pools'] if i['pg_num'] != i['pg_num_target']]
1199 except KeyError:
1200 # we don't support pg_num_target before nautilus
1201 pass
1202 if not unclean and not split_merge:
1203 all_clean = True
1204 break
1205 log.info(
1206 "Waiting for all PGs to be active+clean and split+merged, waiting on %s to go clean and/or %s to split/merge" % (unclean, split_merge))
1207 time.sleep(delays)
1208 if not all_clean:
1209 raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
1210 check_time_now = time.localtime()
1211 time.sleep(1)
1212 all_roles = teuthology.all_roles(ctx.cluster)
1213 for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name):
1214 log.info("Scrubbing {osd}".format(osd=role))
1215 _, _, id_ = teuthology.split_role(role)
1216 # allow this to fail; in certain cases the OSD might not be up
1217 # at this point. we will catch all pgs below.
1218 try:
1219 manager.raw_cluster_cmd('tell', 'osd.' + id_, 'config', 'set',
1220 'osd_debug_deep_scrub_sleep', '0');
1221 manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
1222 except run.CommandFailedError:
1223 pass
1224 prev_good = 0
1225 gap_cnt = 0
1226 loop = True
1227 while loop:
1228 stats = manager.get_pg_stats()
1229 timez = [(stat['pgid'],stat['last_scrub_stamp']) for stat in stats]
1230 loop = False
1231 thiscnt = 0
1232 re_scrub = []
1233 for (pgid, tmval) in timez:
1234 t = tmval[0:tmval.find('.')].replace(' ', 'T')
1235 pgtm = time.strptime(t, '%Y-%m-%dT%H:%M:%S')
1236 if pgtm > check_time_now:
1237 thiscnt += 1
1238 else:
1239 log.info('pgid %s last_scrub_stamp %s %s <= %s', pgid, tmval, pgtm, check_time_now)
1240 loop = True
1241 re_scrub.append(pgid)
1242 if thiscnt > prev_good:
1243 prev_good = thiscnt
1244 gap_cnt = 0
1245 else:
1246 gap_cnt += 1
1247 if gap_cnt % 6 == 0:
1248 for pgid in re_scrub:
1249 # re-request scrub every so often in case the earlier
1250 # request was missed. do not do it every time because
1251 # the scrub may be in progress or not reported yet and
1252 # we will starve progress.
1253 manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
1254 if gap_cnt > retries:
1255 raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
1256 if loop:
1257 log.info('Still waiting for all pgs to be scrubbed.')
1258 time.sleep(delays)
1259
1260
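# For illustration, the stamp handling in the loop above (the stamp value is
# hypothetical):
#
#   tmval = '2020-04-15 12:34:56.123456'
#   t = tmval[0:tmval.find('.')].replace(' ', 'T')   # -> '2020-04-15T12:34:56'
#   pgtm = time.strptime(t, '%Y-%m-%dT%H:%M:%S')
#   # pgtm is then compared against check_time_now, captured before the
#   # deep-scrubs were requested.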
1261@contextlib.contextmanager
1262def run_daemon(ctx, config, type_):
1263 """
1264 Run daemons for a role type. Handle the startup and termination of a daemon.
1265 On startup -- set coverages, cpu_profile, valgrind values for all remotes,
1266 and a max_mds value for one mds.
1267 On cleanup -- Stop all existing daemons of this type.
1268
1269 :param ctx: Context
1270 :param config: Configuration
1271 :param type_: Role type
1272 """
1273 cluster_name = config['cluster']
1274 log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
1275 testdir = teuthology.get_testdir(ctx)
1276 daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name))
1277
1278 # check whether any daemons of this type are configured
1279 if daemons is None:
1280 return
1281 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
1282
1283 daemon_signal = 'kill'
1284 if config.get('coverage') or config.get('valgrind') is not None:
1285 daemon_signal = 'term'
1286
1287 # create osds in order. (this only matters for pre-luminous, which might
1288 # be hammer, which doesn't take an id_ argument to legacy 'osd create').
1289 osd_uuids = {}
1290 for remote, roles_for_host in daemons.remotes.items():
1291 is_type_ = teuthology.is_type(type_, cluster_name)
1292 for role in roles_for_host:
1293 if not is_type_(role):
1294 continue
1295 _, _, id_ = teuthology.split_role(role)
1296
1297
1298 if type_ == 'osd':
1299 datadir='/var/lib/ceph/osd/{cluster}-{id}'.format(
1300 cluster=cluster_name, id=id_)
1301 osd_uuid = six.ensure_str(teuthology.get_file(
1302 remote=remote,
1303 path=datadir + '/fsid',
1304 sudo=True,
1305 )).strip()
1306 osd_uuids[id_] = osd_uuid
1307 for osd_id in range(len(osd_uuids)):
1308 id_ = str(osd_id)
1309 osd_uuid = osd_uuids.get(id_)
1310 try:
1311 remote.run(
1312 args=[
1313 'sudo', 'ceph', '--cluster', cluster_name,
1314 'osd', 'new', osd_uuid, id_,
1315 ]
1316 )
1317 except:
1318 # fallback to pre-luminous (hammer or jewel)
1319 remote.run(
1320 args=[
1321 'sudo', 'ceph', '--cluster', cluster_name,
1322 'osd', 'create', osd_uuid,
1323 ]
1324 )
1325 if config.get('add_osds_to_crush'):
1326 remote.run(
1327 args=[
1328 'sudo', 'ceph', '--cluster', cluster_name,
1329 'osd', 'crush', 'create-or-move', 'osd.' + id_,
1330 '1.0', 'host=localhost', 'root=default',
1331 ]
1332 )
1333
1334 for remote, roles_for_host in daemons.remotes.items():
1335 is_type_ = teuthology.is_type(type_, cluster_name)
1336 for role in roles_for_host:
1337 if not is_type_(role):
1338 continue
1339 _, _, id_ = teuthology.split_role(role)
1340
1341 run_cmd = [
1342 'sudo',
1343 'adjust-ulimits',
1344 'ceph-coverage',
1345 coverage_dir,
1346 'daemon-helper',
1347 daemon_signal,
1348 ]
1349 run_cmd_tail = [
1350 'ceph-%s' % (type_),
1351 '-f',
1352 '--cluster', cluster_name,
1353 '-i', id_]
1354
1355 if type_ in config.get('cpu_profile', []):
1356 profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role)
1357 run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])
1358
1359 if config.get('valgrind') is not None:
1360 valgrind_args = None
1361 if type_ in config['valgrind']:
1362 valgrind_args = config['valgrind'][type_]
1363 if role in config['valgrind']:
1364 valgrind_args = config['valgrind'][role]
1365 run_cmd = teuthology.get_valgrind_args(testdir, role,
1366 run_cmd,
1367 valgrind_args)
1368
1369 run_cmd.extend(run_cmd_tail)
1370
1371 # always register mgr; don't necessarily start
1372 ctx.daemons.register_daemon(
1373 remote, type_, id_,
1374 cluster=cluster_name,
1375 args=run_cmd,
1376 logger=log.getChild(role),
1377 stdin=run.PIPE,
1378 wait=False
1379 )
1380 if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
1381 role = cluster_name + '.' + type_
1382 ctx.daemons.get_daemon(type_, id_, cluster_name).restart()
1383
1384 # kludge: run any pre-manager commands
1385 if type_ == 'mon':
1386 for cmd in config.get('pre-mgr-commands', []):
1387 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1388 (remote,) = ctx.cluster.only(firstmon).remotes.keys()
1389 remote.run(args=cmd.split(' '))
1390
1391 try:
1392 yield
1393 finally:
1394 teuthology.stop_daemons_of_type(ctx, type_, cluster_name)
1395
1396
1397def healthy(ctx, config):
1398 """
1399 Wait for all OSDs to be up, and for the ceph health check to return HEALTH_OK.
1400
1401 :param ctx: Context
1402 :param config: Configuration
1403 """
1404 config = config if isinstance(config, dict) else dict()
1405 cluster_name = config.get('cluster', 'ceph')
1406 log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
1407 manager = ctx.managers[cluster_name]
1408 try:
1409 manager.wait_for_mgr_available(timeout=30)
1410 except (run.CommandFailedError, AssertionError) as e:
1411 log.info('ignoring mgr wait error, probably testing upgrade: %s', e)
1412
1413 manager.wait_for_all_osds_up(timeout=300)
1414
1415 try:
1416 manager.flush_all_pg_stats()
1417 except (run.CommandFailedError, Exception) as e:
1418 log.info('ignoring flush pg stats error, probably testing upgrade: %s', e)
1419 manager.wait_for_clean()
1420
1421 if config.get('wait-for-healthy', True):
1422 log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
1423 manager.wait_until_healthy(timeout=300)
1424
1425 if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
1426 # Some MDSs exist, wait for them to be healthy
1427 ceph_fs = Filesystem(ctx) # TODO: make Filesystem cluster-aware
1428 ceph_fs.wait_for_daemons(timeout=300)
1429
1430
1431def wait_for_mon_quorum(ctx, config):
1432 """
1433 Check remote ceph status until all monitors are up.
1434
1435 :param ctx: Context
1436 :param config: Configuration
1437 """
1438 if isinstance(config, dict):
1439 mons = config['daemons']
1440 cluster_name = config.get('cluster', 'ceph')
1441 else:
1442 assert isinstance(config, list)
1443 mons = config
1444 cluster_name = 'ceph'
1445 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1446 (remote,) = ctx.cluster.only(firstmon).remotes.keys()
1447 with contextutil.safe_while(sleep=10, tries=60,
1448 action='wait for monitor quorum') as proceed:
1449 while proceed():
1450 quorum_status = remote.sh('sudo ceph quorum_status',
1451 logger=log.getChild('quorum_status'))
1452 j = json.loads(quorum_status)
1453 q = j.get('quorum_names', [])
1454 log.debug('Quorum: %s', q)
1455 if sorted(q) == sorted(mons):
1456 break
1457
1458
1459def created_pool(ctx, config):
1460 """
1461 Add new pools to the dictionary of pools that the ceph-manager
1462 knows about.
1463 """
1464 for new_pool in config:
1465 if new_pool not in ctx.managers['ceph'].pools:
1466 ctx.managers['ceph'].pools[new_pool] = ctx.managers['ceph'].get_pool_int_property(
1467 new_pool, 'pg_num')
1468
1469
1470@contextlib.contextmanager
1471def suppress_mon_health_to_clog(ctx, config):
1472 """
1473 Set the option, and then restore it to its original value.
1474
1475 Note: due to the way tasks are executed/nested, it is not recommended to
1476 use this method as a standalone task. Otherwise, it is likely that it will
1477 restore the tweaked option at the /end/ of the 'tasks' block.
1478 """
1479 if config.get('mon-health-to-clog', 'true') == 'false':
1480 saved_options = {}
1481 cluster = config.get('cluster', 'ceph')
1482 manager = ctx.managers[cluster]
1483 manager.raw_cluster_command(
1484 'config', 'set', 'mon', 'mon_health_to_clog', 'false'
1485 )
1486 yield
1487 manager.raw_cluster_command(
1488 'config', 'rm', 'mon', 'mon_health_to_clog'
1489 )
1490 else:
1491 yield
1492
1493@contextlib.contextmanager
1494def restart(ctx, config):
1495 """
1496 restart ceph daemons
1497
1498 For example::
1499 tasks:
1500 - ceph.restart: [all]
1501
1502 For example::
1503 tasks:
1504 - ceph.restart: [osd.0, mon.1, mds.*]
1505
1506 or::
1507
1508 tasks:
1509 - ceph.restart:
1510 daemons: [osd.0, mon.1]
1511 wait-for-healthy: false
1512 wait-for-osds-up: true
1513
1514 :param ctx: Context
1515 :param config: Configuration
1516 """
1517 if config is None:
1518 config = {}
1519 elif isinstance(config, list):
1520 config = {'daemons': config}
1521
1522 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1523 clusters = set()
1524
1525 with suppress_mon_health_to_clog(ctx, config):
1526 for role in daemons:
1527 cluster, type_, id_ = teuthology.split_role(role)
1528 ctx.daemons.get_daemon(type_, id_, cluster).stop()
1529 if type_ == 'osd':
1530 ctx.managers[cluster].mark_down_osd(id_)
1531 ctx.daemons.get_daemon(type_, id_, cluster).restart()
1532 clusters.add(cluster)
1533
1534 if config.get('wait-for-healthy', True):
1535 for cluster in clusters:
1536 healthy(ctx=ctx, config=dict(cluster=cluster))
1537 if config.get('wait-for-osds-up', False):
1538 for cluster in clusters:
1539 ctx.managers[cluster].wait_for_all_osds_up()
1540 yield
1541
1542
1543@contextlib.contextmanager
1544def stop(ctx, config):
1545 """
1546 Stop ceph daemons
1547
1548 For example::
1549 tasks:
1550 - ceph.stop: [mds.*]
1551
1552 tasks:
1553 - ceph.stop: [osd.0, osd.2]
1554
1555 tasks:
1556 - ceph.stop:
1557 daemons: [osd.0, osd.2]
1558
1559 """
1560 if config is None:
1561 config = {}
1562 elif isinstance(config, list):
1563 config = {'daemons': config}
1564
1565 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1566 clusters = set()
1567
1568 for role in daemons:
1569 cluster, type_, id_ = teuthology.split_role(role)
1570 ctx.daemons.get_daemon(type_, id_, cluster).stop()
1571 clusters.add(cluster)
1572
1573
1574 for cluster in clusters:
1575 ctx.ceph[cluster].watchdog.stop()
1576 ctx.ceph[cluster].watchdog.join()
1577
1578 yield
1579
1580
1581@contextlib.contextmanager
1582def wait_for_failure(ctx, config):
1583 """
1584 Wait for a failure of a ceph daemon
1585
1586 For example::
1587 tasks:
1588 - ceph.wait_for_failure: [mds.*]
1589
1590 tasks:
1591 - ceph.wait_for_failure: [osd.0, osd.2]
1592
1593 tasks:
1594 - ceph.wait_for_failure:
1595 daemons: [osd.0, osd.2]
1596
1597 """
1598 if config is None:
1599 config = {}
1600 elif isinstance(config, list):
1601 config = {'daemons': config}
1602
1603 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1604 for role in daemons:
1605 cluster, type_, id_ = teuthology.split_role(role)
1606 try:
1607 ctx.daemons.get_daemon(type_, id_, cluster).wait()
1608 except:
1609 log.info('Saw expected daemon failure. Continuing.')
1610 pass
1611 else:
1612 raise RuntimeError('daemon %s did not fail' % role)
1613
1614 yield
1615
1616
1617def validate_config(ctx, config):
1618 """
1619 Perform some simple validation on task configuration.
1620 Raises exceptions.ConfigError if an error is found.
1621 """
1622 # check for osds from multiple clusters on the same host
1623 for remote, roles_for_host in ctx.cluster.remotes.items():
1624 last_cluster = None
1625 last_role = None
1626 for role in roles_for_host:
1627 role_cluster, role_type, _ = teuthology.split_role(role)
1628 if role_type != 'osd':
1629 continue
1630 if last_cluster and last_cluster != role_cluster:
1631 msg = "Host should not have osds (%s and %s) from multiple clusters" % (
1632 last_role, role)
1633 raise exceptions.ConfigError(msg)
1634 last_cluster = role_cluster
1635 last_role = role
1636
1637
1638@contextlib.contextmanager
1639def task(ctx, config):
1640 """
1641 Set up and tear down a Ceph cluster.
1642
1643 For example::
1644
1645 tasks:
1646 - ceph:
1647 - interactive:
1648
1649 You can also specify what branch to run::
1650
1651 tasks:
1652 - ceph:
1653 branch: foo
1654
1655 Or a tag::
1656
1657 tasks:
1658 - ceph:
1659 tag: v0.42.13
1660
1661 Or a sha1::
1662
1663 tasks:
1664 - ceph:
1665 sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed
1666
1667 Or a local source dir::
1668
1669 tasks:
1670 - ceph:
1671 path: /home/sage/ceph
1672
1673 To capture code coverage data, use::
1674
1675 tasks:
1676 - ceph:
1677 coverage: true
1678
1679 To use btrfs, ext4, or xfs on the target's scratch disks, use::
1680
1681 tasks:
1682 - ceph:
1683 fs: xfs
1684 mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
1685 mount_options: [nobarrier, inode64]
1686
1687 Note, this will cause the task to check the /scratch_devs file on each node
1688 for available devices. If no such file is found, /dev/sdb will be used.
1689
1690 To run some daemons under valgrind, include their names
1691 and the tool/args to use in a valgrind section::
1692
1693 tasks:
1694 - ceph:
1695 valgrind:
1696 mds.1: --tool=memcheck
1697 osd.1: [--tool=memcheck, --leak-check=no]
1698
1699 Those nodes which are using memcheck or valgrind will get
1700 checked for bad results.
1701
1702 To adjust or modify config options, use::
1703
1704 tasks:
1705 - ceph:
1706 conf:
1707 section:
1708 key: value
1709
1710 For example::
1711
1712 tasks:
1713 - ceph:
1714 conf:
1715 mds.0:
1716 some option: value
1717 other key: other value
1718 client.0:
1719 debug client: 10
1720 debug ms: 1
1721
1722 By default, the cluster log is checked for errors and warnings,
1723 and the run marked failed if any appear. You can ignore log
1724 entries by giving a list of egrep compatible regexes, e.g.:
1725
1726 tasks:
1727 - ceph:
1728 log-whitelist: ['foo.*bar', 'bad message']
1729
1730 To run multiple ceph clusters, use multiple ceph tasks, and roles
1731 with a cluster name prefix, e.g. cluster1.client.0. Roles with no
1732 cluster use the default cluster name, 'ceph'. OSDs from separate
1733 clusters must be on separate hosts. Clients and non-osd daemons
1734 from multiple clusters may be colocated. For each cluster, add an
1735 instance of the ceph task with the cluster name specified, e.g.::
1736
1737 roles:
1738 - [mon.a, osd.0, osd.1]
1739 - [backup.mon.a, backup.osd.0, backup.osd.1]
1740 - [client.0, backup.client.0]
1741 tasks:
1742 - ceph:
1743 cluster: ceph
1744 - ceph:
1745 cluster: backup
1746
1747 :param ctx: Context
1748 :param config: Configuration
1749
1750 """
1751 if config is None:
1752 config = {}
1753 assert isinstance(config, dict), \
1754 "task ceph only supports a dictionary for configuration"
1755
1756 overrides = ctx.config.get('overrides', {})
1757 teuthology.deep_merge(config, overrides.get('ceph', {}))
1758
1759 first_ceph_cluster = False
1760 if not hasattr(ctx, 'daemons'):
1761 first_ceph_cluster = True
1762 ctx.daemons = DaemonGroup()
1763
1764 testdir = teuthology.get_testdir(ctx)
1765 if config.get('coverage'):
1766 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
1767 log.info('Creating coverage directory...')
1768 run.wait(
1769 ctx.cluster.run(
1770 args=[
1771 'install', '-d', '-m0755', '--',
1772 coverage_dir,
1773 ],
1774 wait=False,
1775 )
1776 )
1777
1778 if 'cluster' not in config:
1779 config['cluster'] = 'ceph'
1780
1781 validate_config(ctx, config)
1782
1783 subtasks = []
1784 if first_ceph_cluster:
1785 # these tasks handle general log setup and parsing on all hosts,
1786 # so they should only be run once
1787 subtasks = [
1788 lambda: ceph_log(ctx=ctx, config=None),
1789 lambda: ceph_crash(ctx=ctx, config=None),
1790 lambda: valgrind_post(ctx=ctx, config=config),
1791 ]
1792
1793 subtasks += [
1794 lambda: cluster(ctx=ctx, config=dict(
1795 conf=config.get('conf', {}),
1796 fs=config.get('fs', 'xfs'),
1797 mkfs_options=config.get('mkfs_options', None),
1798 mount_options=config.get('mount_options', None),
1799 skip_mgr_daemons=config.get('skip_mgr_daemons', False),
1800 log_whitelist=config.get('log-whitelist', []),
1801 cpu_profile=set(config.get('cpu_profile', []),),
1802 cluster=config['cluster'],
1803 mon_bind_msgr2=config.get('mon_bind_msgr2', True),
1804 mon_bind_addrvec=config.get('mon_bind_addrvec', True),
1805 )),
1806 lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
1807 lambda: run_daemon(ctx=ctx, config=config, type_='mgr'),
1808 lambda: crush_setup(ctx=ctx, config=config),
1809 lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
1810 lambda: create_rbd_pool(ctx=ctx, config=config),
1811 lambda: cephfs_setup(ctx=ctx, config=config),
1812 lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
1813 lambda: watchdog_setup(ctx=ctx, config=config),
1814 ]
1815
1816 with contextutil.nested(*subtasks):
1817 first_mon = teuthology.get_first_mon(ctx, config, config['cluster'])
1818 (mon,) = ctx.cluster.only(first_mon).remotes.keys()
1819 if not hasattr(ctx, 'managers'):
1820 ctx.managers = {}
1821 ctx.managers[config['cluster']] = CephManager(
1822 mon,
1823 ctx=ctx,
1824 logger=log.getChild('ceph_manager.' + config['cluster']),
1825 cluster=config['cluster'],
1826 )
1827
1828 try:
1829 if config.get('wait-for-healthy', True):
1830 healthy(ctx=ctx, config=dict(cluster=config['cluster']))
1831
1832 yield
1833 finally:
1834 # set pg_num_targets back to actual pg_num, so we don't have to
1835 # wait for pending merges (which can take a while!)
1836 ctx.managers[config['cluster']].stop_pg_num_changes()
1837
1838 if config.get('wait-for-scrub', True):
1839 osd_scrub_pgs(ctx, config)
1840
1841 # stop logging health to clog during shutdown, or else we generate
1842 # a bunch of scary messages unrelated to our actual run.
1843 firstmon = teuthology.get_first_mon(ctx, config, config['cluster'])
1844 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1845 mon0_remote.run(
1846 args=[
1847 'sudo',
1848 'ceph',
1849 '--cluster', config['cluster'],
1850 'config', 'set', 'global',
1851 'mon_health_to_clog', 'false',
1852 ],
1853 check_status=False,
1854 )