1"""
2Ceph cluster task.
3
4Handle the setup, starting, and clean-up of a Ceph cluster.
5"""
6from cStringIO import StringIO
7
8import argparse
9import contextlib
10import errno
11import logging
12import os
13import json
14import time
15import gevent
16import socket
17
18from paramiko import SSHException
19from ceph_manager import CephManager, write_conf
20from tasks.cephfs.filesystem import Filesystem
21from teuthology import misc as teuthology
22from teuthology import contextutil
23from teuthology import exceptions
24from teuthology.orchestra import run
25import ceph_client as cclient
26from teuthology.orchestra.daemon import DaemonGroup
27
28CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']
29
30log = logging.getLogger(__name__)
31
32
33def generate_caps(type_):
34 """
35 Yield the '--cap <subsystem> <capability>' arguments for the given system
36 type (essentially a subset of possible role values). Valid types are osd,
37 mgr, mds and client.
38 """
39 defaults = dict(
40 osd=dict(
41 mon='allow *',
42 mgr='allow *',
43 osd='allow *',
44 ),
45 mgr=dict(
46 mon='allow *',
47 ),
48 mds=dict(
49 mon='allow *',
50 mgr='allow *',
51 osd='allow *',
52 mds='allow',
53 ),
54 client=dict(
55 mon='allow rw',
56 mgr='allow r',
57 osd='allow rwx',
58 mds='allow',
59 ),
60 )
61 for subsystem, capability in defaults[type_].items():
62 yield '--cap'
63 yield subsystem
64 yield capability
65
66
67@contextlib.contextmanager
68def ceph_log(ctx, config):
69 """
70 Create /var/log/ceph log directory that is open to everyone.
71 Add valgrind and profiling-logger directories.
72
73 :param ctx: Context
74 :param config: Configuration
75 """
76 log.info('Making ceph log dir writeable by non-root...')
77 run.wait(
78 ctx.cluster.run(
79 args=[
80 'sudo',
81 'chmod',
82 '777',
83 '/var/log/ceph',
84 ],
85 wait=False,
86 )
87 )
88 log.info('Disabling ceph logrotate...')
89 run.wait(
90 ctx.cluster.run(
91 args=[
92 'sudo',
93 'rm', '-f', '--',
94 '/etc/logrotate.d/ceph',
95 ],
96 wait=False,
97 )
98 )
99 log.info('Creating extra log directories...')
100 run.wait(
101 ctx.cluster.run(
102 args=[
103 'sudo',
104 'install', '-d', '-m0777', '--',
105 '/var/log/ceph/valgrind',
106 '/var/log/ceph/profiling-logger',
107 ],
108 wait=False,
109 )
110 )
111
112 class Rotater(object):
113 stop_event = gevent.event.Event()
114
115 def invoke_logrotate(self):
116 # 1) install ceph-test.conf in /etc/logrotate.d
117 # 2) continuously loop over logrotate invocation with ceph-test.conf
118 while not self.stop_event.is_set():
119 self.stop_event.wait(timeout=30)
120 try:
121 run.wait(
122 ctx.cluster.run(
123 args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'
124 ],
125 wait=False,
126 )
127 )
128 except exceptions.ConnectionLostError as e:
129 # Some tests may power off nodes during test, in which
130 # case we will see connection errors that we should ignore.
131 log.debug("Missed logrotate, node '{0}' is offline".format(
132 e.node))
133 except EOFError as e:
134 # Paramiko sometimes raises this when it fails to
135 # connect to a node during open_session. As with
136 # ConnectionLostError, we ignore this because nodes
137 # are allowed to get power cycled during tests.
138 log.debug("Missed logrotate, EOFError")
139 except SSHException as e:
140 log.debug("Missed logrotate, SSHException")
141 except socket.error as e:
142 if e.errno == errno.EHOSTUNREACH:
143 log.debug("Missed logrotate, host unreachable")
144 else:
145 raise
146
147 def begin(self):
148 self.thread = gevent.spawn(self.invoke_logrotate)
149
150 def end(self):
151 self.stop_event.set()
152 self.thread.get()
153
154 def write_rotate_conf(ctx, daemons):
155 testdir = teuthology.get_testdir(ctx)
156 rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf')
157 with file(rotate_conf_path, 'rb') as f:
158 conf = ""
159 for daemon, size in daemons.iteritems():
160 log.info('writing logrotate stanza for {daemon}'.format(daemon=daemon))
161 conf += f.read().format(daemon_type=daemon, max_size=size)
162 f.seek(0, 0)
163
164 for remote in ctx.cluster.remotes.iterkeys():
165 teuthology.write_file(remote=remote,
166 path='{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
167 data=StringIO(conf)
168 )
169 remote.run(
170 args=[
171 'sudo',
172 'mv',
173 '{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
174 '/etc/logrotate.d/ceph-test.conf',
175 run.Raw('&&'),
176 'sudo',
177 'chmod',
178 '0644',
179 '/etc/logrotate.d/ceph-test.conf',
180 run.Raw('&&'),
181 'sudo',
182 'chown',
183 'root.root',
184 '/etc/logrotate.d/ceph-test.conf'
185 ]
186 )
187 remote.chcon('/etc/logrotate.d/ceph-test.conf',
188 'system_u:object_r:etc_t:s0')
189
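# The 'log-rotate' entry read below maps a daemon type to a maximum log size;
# illustrative values only, e.g.:
#
#   ctx.config['log-rotate'] == {'ceph-osd': '10G', 'ceph-mds': '10G'}
#
# write_rotate_conf() above substitutes each pair into the logrotate.conf
# template as daemon_type / max_size.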
190 if ctx.config.get('log-rotate'):
191 daemons = ctx.config.get('log-rotate')
192 log.info('Setting up log rotation with ' + str(daemons))
193 write_rotate_conf(ctx, daemons)
194 logrotater = Rotater()
195 logrotater.begin()
196 try:
197 yield
198
199 finally:
200 if ctx.config.get('log-rotate'):
201 log.info('Shutting down logrotate')
202 logrotater.end()
203 ctx.cluster.run(
204 args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'
205 ]
206 )
207 if ctx.archive is not None and \
208 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
209 # compress and archive the ceph logs as well
210 log.info('Compressing logs...')
211 run.wait(
212 ctx.cluster.run(
213 args=[
214 'sudo',
215 'find',
216 '/var/log/ceph',
217 '-name',
218 '*.log',
219 '-print0',
220 run.Raw('|'),
221 'sudo',
222 'xargs',
223 '-0',
224 '--no-run-if-empty',
225 '--',
226 'gzip',
227 '--',
228 ],
229 wait=False,
230 ),
231 )
232
233 log.info('Archiving logs...')
234 path = os.path.join(ctx.archive, 'remote')
235 os.makedirs(path)
236 for remote in ctx.cluster.remotes.iterkeys():
237 sub = os.path.join(path, remote.shortname)
238 os.makedirs(sub)
239 teuthology.pull_directory(remote, '/var/log/ceph',
240 os.path.join(sub, 'log'))
241
242
243def assign_devs(roles, devs):
244 """
245 Create a dictionary of devs indexed by roles
246
247 :param roles: List of roles
248 :param devs: Corresponding list of devices.
249 :returns: Dictionary of devs indexed by roles.
250 """
251 return dict(zip(roles, devs))
252
253
254@contextlib.contextmanager
255def valgrind_post(ctx, config):
256 """
257 After the tests run, look through all the valgrind logs. Exceptions are raised
258 if textual errors occurred in the logs, or if valgrind exceptions were detected in
259 the logs.
260
261 :param ctx: Context
262 :param config: Configuration
263 """
264 try:
265 yield
266 finally:
267 lookup_procs = list()
268 log.info('Checking for errors in any valgrind logs...')
269 for remote in ctx.cluster.remotes.iterkeys():
270 # look at valgrind logs for each node
271 proc = remote.run(
272 args=[
273 'sudo',
274 'zgrep',
275 '<kind>',
276 run.Raw('/var/log/ceph/valgrind/*'),
277 '/dev/null', # include a second file so that we always get a filename prefix on the output
278 run.Raw('|'),
279 'sort',
280 run.Raw('|'),
281 'uniq',
282 ],
283 wait=False,
284 check_status=False,
285 stdout=StringIO(),
286 )
287 lookup_procs.append((proc, remote))
288
289 valgrind_exception = None
290 for (proc, remote) in lookup_procs:
291 proc.wait()
292 out = proc.stdout.getvalue()
293 for line in out.split('\n'):
294 if line == '':
295 continue
296 try:
297 (file, kind) = line.split(':')
298 except Exception:
299 log.error('failed to split line %s', line)
300 raise
301 log.debug('file %s kind %s', file, kind)
302 if (file.find('mds') >= 0) and kind.find('Lost') > 0:
303 continue
304 log.error('saw valgrind issue %s in %s', kind, file)
305 valgrind_exception = Exception('saw valgrind issues')
306
307 if config.get('expect_valgrind_errors'):
308 if not valgrind_exception:
309 raise Exception('expected valgrind issues and found none')
310 else:
311 if valgrind_exception:
312 raise valgrind_exception
313
314
315@contextlib.contextmanager
316def crush_setup(ctx, config):
317 cluster_name = config['cluster']
318 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
319 (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
320
321 profile = config.get('crush_tunables', 'default')
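# 'crush_tunables' may name any profile accepted by 'ceph osd crush tunables'
# (e.g. legacy, bobtail, firefly, hammer, jewel, optimal, default); the task
# passes the value through unvalidated and lets the monitor reject bad input.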
322 log.info('Setting crush tunables to %s', profile)
323 mon_remote.run(
324 args=['sudo', 'ceph', '--cluster', cluster_name,
325 'osd', 'crush', 'tunables', profile])
326 yield
327
328
329@contextlib.contextmanager
330def cephfs_setup(ctx, config):
331 cluster_name = config['cluster']
332 testdir = teuthology.get_testdir(ctx)
333 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
334
335 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
336 (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
337 mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
338 # If there are any MDSs, then create a filesystem for them to use
339 # Do this last because it requires the mon cluster to be up and running
340 if mdss.remotes:
341 log.info('Setting up CephFS filesystem...')
342
343 fs = Filesystem(ctx, create='cephfs')
344
345 is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
346 all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
347 num_active = len([r for r in all_roles if is_active_mds(r)])
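# Illustrative: for roles ['mds.a', 'mds.b-s-a'] only 'mds.a' is counted as
# active; names ending in '-s' or containing '-s-' are assumed here to be
# teuthology's standby/standby-replay MDS roles, per the filter above.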
348
349 fs.set_allow_multimds(True)
350 fs.set_max_mds(num_active)
351 fs.set_allow_dirfrags(True)
352
353 yield
354
355
356@contextlib.contextmanager
357def cluster(ctx, config):
358 """
359 Handle the creation and removal of a ceph cluster.
360
361 On startup:
362 Create directories needed for the cluster.
363 Create remote journals for all osds.
364 Create and set keyring.
365 Copy the monmap to the test systems.
366 Setup mon nodes.
367 Setup mds nodes.
368 Mkfs osd nodes.
369 Add keyring information to monmaps
370 Mkfs mon nodes.
371
372 On exit:
373 If errors occurred, extract a failure message and store it in ctx.summary.
374 Unmount all test files and temporary journaling files.
375 Save the monitor information and archive all ceph logs.
376 Clean up the keyring setup, and remove all leftover monitor map and data files.
377
378 :param ctx: Context
379 :param config: Configuration
380 """
381 if ctx.config.get('use_existing_cluster', False) is True:
382 log.info("'use_existing_cluster' is true; skipping cluster creation")
383 yield
384
385 testdir = teuthology.get_testdir(ctx)
386 cluster_name = config['cluster']
387 data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
388 log.info('Creating ceph cluster %s...', cluster_name)
389 run.wait(
390 ctx.cluster.run(
391 args=[
392 'install', '-d', '-m0755', '--',
393 data_dir,
394 ],
395 wait=False,
396 )
397 )
398
399 run.wait(
400 ctx.cluster.run(
401 args=[
402 'sudo',
403 'install', '-d', '-m0777', '--', '/var/run/ceph',
404 ],
405 wait=False,
406 )
407 )
408
409 devs_to_clean = {}
410 remote_to_roles_to_devs = {}
411 remote_to_roles_to_journals = {}
412 osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
413 for remote, roles_for_host in osds.remotes.iteritems():
414 devs = teuthology.get_scratch_devices(remote)
415 roles_to_devs = {}
416 roles_to_journals = {}
417 if config.get('fs'):
418 log.info('fs option selected, checking for scratch devs')
419 log.info('found devs: %s' % (str(devs),))
420 devs_id_map = teuthology.get_wwn_id_map(remote, devs)
421 iddevs = devs_id_map.values()
422 roles_to_devs = assign_devs(
423 teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
424 )
425 if len(roles_to_devs) < len(iddevs):
426 iddevs = iddevs[len(roles_to_devs):]
427 devs_to_clean[remote] = []
428
429 if config.get('block_journal'):
430 log.info('block journal enabled')
431 roles_to_journals = assign_devs(
432 teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
433 )
434 log.info('journal map: %s', roles_to_journals)
435
436 if config.get('tmpfs_journal'):
437 log.info('tmpfs journal enabled')
438 roles_to_journals = {}
439 remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
440 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
441 tmpfs = '/mnt/' + role
442 roles_to_journals[role] = tmpfs
443 remote.run(args=['truncate', '-s', '1500M', tmpfs])
444 log.info('journal map: %s', roles_to_journals)
445
446 log.info('dev map: %s' % (str(roles_to_devs),))
447 remote_to_roles_to_devs[remote] = roles_to_devs
448 remote_to_roles_to_journals[remote] = roles_to_journals
449
450 log.info('Generating config...')
451 remotes_and_roles = ctx.cluster.remotes.items()
452 roles = [role_list for (remote, role_list) in remotes_and_roles]
453 ips = [host for (host, port) in
454 (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
455 conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips, cluster=cluster_name)
456 for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
457 for role, journal in roles_to_journals.iteritems():
458 name = teuthology.ceph_role(role)
459 if name not in conf:
460 conf[name] = {}
461 conf[name]['osd journal'] = journal
462 for section, keys in config['conf'].iteritems():
463 for key, value in keys.iteritems():
464 log.info("[%s] %s = %s" % (section, key, value))
465 if section not in conf:
466 conf[section] = {}
467 conf[section][key] = value
468
469 if config.get('tmpfs_journal'):
470 conf['journal dio'] = False
471
472 if not hasattr(ctx, 'ceph'):
473 ctx.ceph = {}
474 ctx.ceph[cluster_name] = argparse.Namespace()
475 ctx.ceph[cluster_name].conf = conf
476
477 default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
478 keyring_path = config.get('keyring_path', default_keyring)
479
480 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
481
482 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
483
484 log.info('Setting up %s...' % firstmon)
485 ctx.cluster.only(firstmon).run(
486 args=[
487 'sudo',
488 'adjust-ulimits',
489 'ceph-coverage',
490 coverage_dir,
491 'ceph-authtool',
492 '--create-keyring',
493 keyring_path,
494 ],
495 )
496 ctx.cluster.only(firstmon).run(
497 args=[
498 'sudo',
499 'adjust-ulimits',
500 'ceph-coverage',
501 coverage_dir,
502 'ceph-authtool',
503 '--gen-key',
504 '--name=mon.',
505 keyring_path,
506 ],
507 )
508 ctx.cluster.only(firstmon).run(
509 args=[
510 'sudo',
511 'chmod',
512 '0644',
513 keyring_path,
514 ],
515 )
516 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
517 monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
518 cluster=cluster_name)
519 fsid = teuthology.create_simple_monmap(
520 ctx,
521 remote=mon0_remote,
522 conf=conf,
523 path=monmap_path,
524 )
525 if not 'global' in conf:
526 conf['global'] = {}
527 conf['global']['fsid'] = fsid
528
529 default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
530 conf_path = config.get('conf_path', default_conf_path)
531 log.info('Writing %s for FSID %s...' % (conf_path, fsid))
532 write_conf(ctx, conf_path, cluster_name)
533
534 log.info('Creating admin key on %s...' % firstmon)
535 ctx.cluster.only(firstmon).run(
536 args=[
537 'sudo',
538 'adjust-ulimits',
539 'ceph-coverage',
540 coverage_dir,
541 'ceph-authtool',
542 '--gen-key',
543 '--name=client.admin',
544 '--set-uid=0',
545 '--cap', 'mon', 'allow *',
546 '--cap', 'osd', 'allow *',
547 '--cap', 'mds', 'allow *',
548 '--cap', 'mgr', 'allow *',
549 keyring_path,
550 ],
551 )
552
553 log.info('Copying monmap to all nodes...')
554 keyring = teuthology.get_file(
555 remote=mon0_remote,
556 path=keyring_path,
557 )
558 monmap = teuthology.get_file(
559 remote=mon0_remote,
560 path=monmap_path,
561 )
562
563 for rem in ctx.cluster.remotes.iterkeys():
564 # copy mon key and initial monmap
565 log.info('Sending monmap to node {remote}'.format(remote=rem))
566 teuthology.sudo_write_file(
567 remote=rem,
568 path=keyring_path,
569 data=keyring,
570 perms='0644'
571 )
572 teuthology.write_file(
573 remote=rem,
574 path=monmap_path,
575 data=monmap,
576 )
577
578 log.info('Setting up mon nodes...')
579 mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))
580 osdmap_path = '{tdir}/{cluster}.osdmap'.format(tdir=testdir,
581 cluster=cluster_name)
582 run.wait(
583 mons.run(
584 args=[
585 'adjust-ulimits',
586 'ceph-coverage',
587 coverage_dir,
588 'osdmaptool',
589 '-c', conf_path,
590 '--clobber',
591 '--createsimple', '{num:d}'.format(
592 num=teuthology.num_instances_of_type(ctx.cluster, 'osd',
593 cluster_name),
594 ),
595 osdmap_path,
596 '--pg_bits', '2',
597 '--pgp_bits', '4',
598 ],
599 wait=False,
600 ),
601 )
602
603 if not config.get('skip_mgr_daemons', False):
604 log.info('Setting up mgr nodes...')
605 mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name))
606 for remote, roles_for_host in mgrs.remotes.iteritems():
607 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr',
608 cluster_name):
609 _, _, id_ = teuthology.split_role(role)
610 mgr_dir = '/var/lib/ceph/mgr/{cluster}-{id}'.format(
611 cluster=cluster_name,
612 id=id_,
613 )
614 remote.run(
615 args=[
616 'sudo',
617 'mkdir',
618 '-p',
619 mgr_dir,
620 run.Raw('&&'),
621 'sudo',
622 'adjust-ulimits',
623 'ceph-coverage',
624 coverage_dir,
625 'ceph-authtool',
626 '--create-keyring',
627 '--gen-key',
628 '--name=mgr.{id}'.format(id=id_),
629 mgr_dir + '/keyring',
630 ],
631 )
632
633 log.info('Setting up mds nodes...')
634 mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
635 for remote, roles_for_host in mdss.remotes.iteritems():
636 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
637 cluster_name):
638 _, _, id_ = teuthology.split_role(role)
639 mds_dir = '/var/lib/ceph/mds/{cluster}-{id}'.format(
640 cluster=cluster_name,
641 id=id_,
642 )
643 remote.run(
644 args=[
645 'sudo',
646 'mkdir',
647 '-p',
648 mds_dir,
649 run.Raw('&&'),
650 'sudo',
651 'adjust-ulimits',
652 'ceph-coverage',
653 coverage_dir,
654 'ceph-authtool',
655 '--create-keyring',
656 '--gen-key',
657 '--name=mds.{id}'.format(id=id_),
658 mds_dir + '/keyring',
659 ],
660 )
661
662 cclient.create_keyring(ctx, cluster_name)
663 log.info('Running mkfs on osd nodes...')
664
665 if not hasattr(ctx, 'disk_config'):
666 ctx.disk_config = argparse.Namespace()
667 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
668 ctx.disk_config.remote_to_roles_to_dev = {}
669 if not hasattr(ctx.disk_config, 'remote_to_roles_to_journals'):
670 ctx.disk_config.remote_to_roles_to_journals = {}
671 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
672 ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
673 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
674 ctx.disk_config.remote_to_roles_to_dev_fstype = {}
675
676 teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)
677 teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals, remote_to_roles_to_journals)
678
679 log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
680 for remote, roles_for_host in osds.remotes.iteritems():
681 roles_to_devs = remote_to_roles_to_devs[remote]
682 roles_to_journals = remote_to_roles_to_journals[remote]
683
684 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
685 _, _, id_ = teuthology.split_role(role)
686 mnt_point = '/var/lib/ceph/osd/{cluster}-{id}'.format(cluster=cluster_name, id=id_)
687 remote.run(
688 args=[
689 'sudo',
690 'mkdir',
691 '-p',
692 mnt_point,
693 ])
694 log.info(str(roles_to_journals))
695 log.info(role)
696 if roles_to_devs.get(role):
697 dev = roles_to_devs[role]
698 fs = config.get('fs')
699 package = None
700 mkfs_options = config.get('mkfs_options')
701 mount_options = config.get('mount_options')
702 if fs == 'btrfs':
703 # package = 'btrfs-tools'
704 if mount_options is None:
705 mount_options = ['noatime', 'user_subvol_rm_allowed']
706 if mkfs_options is None:
707 mkfs_options = ['-m', 'single',
708 '-l', '32768',
709 '-n', '32768']
710 if fs == 'xfs':
711 # package = 'xfsprogs'
712 if mount_options is None:
713 mount_options = ['noatime']
714 if mkfs_options is None:
715 mkfs_options = ['-f', '-i', 'size=2048']
716 if fs == 'ext4' or fs == 'ext3':
717 if mount_options is None:
718 mount_options = ['noatime', 'user_xattr']
719
720 if mount_options is None:
721 mount_options = []
722 if mkfs_options is None:
723 mkfs_options = []
724 mkfs = ['mkfs.%s' % fs] + mkfs_options
725 log.info('%s on %s on %s' % (mkfs, dev, remote))
726 if package is not None:
727 remote.run(
728 args=[
729 'sudo',
730 'apt-get', 'install', '-y', package
731 ],
732 stdout=StringIO(),
733 )
734
735 try:
736 remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
737 except run.CommandFailedError:
738 # Newer btrfs-tools doesn't prompt for overwrite, use -f
739 if '-f' not in mkfs_options:
740 mkfs_options.append('-f')
741 mkfs = ['mkfs.%s' % fs] + mkfs_options
742 log.info('%s on %s on %s' % (mkfs, dev, remote))
743 remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
744
745 log.info('mount %s on %s -o %s' % (dev, remote,
746 ','.join(mount_options)))
747 remote.run(
748 args=[
749 'sudo',
750 'mount',
751 '-t', fs,
752 '-o', ','.join(mount_options),
753 dev,
754 mnt_point,
755 ]
756 )
757 remote.run(
758 args=[
759 'sudo', '/sbin/restorecon', mnt_point,
760 ],
761 check_status=False,
762 )
763 if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
764 ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
765 ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
766 if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
767 ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
768 ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
769 devs_to_clean[remote].append(mnt_point)
770
771 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
772 _, _, id_ = teuthology.split_role(role)
773 remote.run(
774 args=[
775 'sudo',
776 'MALLOC_CHECK_=3',
777 'adjust-ulimits',
778 'ceph-coverage',
779 coverage_dir,
780 'ceph-osd',
781 '--cluster',
782 cluster_name,
783 '--mkfs',
784 '--mkkey',
785 '-i', id_,
786 '--monmap', monmap_path,
787 ],
788 )
789
790 log.info('Reading keys from all nodes...')
791 keys_fp = StringIO()
792 keys = []
793 for remote, roles_for_host in ctx.cluster.remotes.iteritems():
794 for type_ in ['mgr', 'mds', 'osd']:
795 if type_ == 'mgr' and config.get('skip_mgr_daemons', False):
796 continue
797 for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
798 _, _, id_ = teuthology.split_role(role)
799 data = teuthology.get_file(
800 remote=remote,
801 path='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
802 type=type_,
803 id=id_,
804 cluster=cluster_name,
805 ),
806 sudo=True,
807 )
808 keys.append((type_, id_, data))
809 keys_fp.write(data)
810 for remote, roles_for_host in ctx.cluster.remotes.iteritems():
811 for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', cluster_name):
812 _, _, id_ = teuthology.split_role(role)
813 data = teuthology.get_file(
814 remote=remote,
815 path='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_, cluster=cluster_name)
816 )
817 keys.append(('client', id_, data))
818 keys_fp.write(data)
819
820 log.info('Adding keys to all mons...')
821 writes = mons.run(
822 args=[
823 'sudo', 'tee', '-a',
824 keyring_path,
825 ],
826 stdin=run.PIPE,
827 wait=False,
828 stdout=StringIO(),
829 )
830 keys_fp.seek(0)
831 teuthology.feed_many_stdins_and_close(keys_fp, writes)
832 run.wait(writes)
833 for type_, id_, data in keys:
834 run.wait(
835 mons.run(
836 args=[
837 'sudo',
838 'adjust-ulimits',
839 'ceph-coverage',
840 coverage_dir,
841 'ceph-authtool',
842 keyring_path,
843 '--name={type}.{id}'.format(
844 type=type_,
845 id=id_,
846 ),
847 ] + list(generate_caps(type_)),
848 wait=False,
849 ),
850 )
851
852 log.info('Running mkfs on mon nodes...')
853 for remote, roles_for_host in mons.remotes.iteritems():
854 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon', cluster_name):
855 _, _, id_ = teuthology.split_role(role)
856 remote.run(
857 args=[
858 'sudo',
859 'mkdir',
860 '-p',
861 '/var/lib/ceph/mon/{cluster}-{id}'.format(id=id_, cluster=cluster_name),
862 ],
863 )
864 remote.run(
865 args=[
866 'sudo',
867 'adjust-ulimits',
868 'ceph-coverage',
869 coverage_dir,
870 'ceph-mon',
871 '--cluster', cluster_name,
872 '--mkfs',
873 '-i', id_,
874 '--monmap', monmap_path,
875 '--osdmap', osdmap_path,
876 '--keyring', keyring_path,
877 ],
878 )
879
880 run.wait(
881 mons.run(
882 args=[
883 'rm',
884 '--',
885 monmap_path,
886 osdmap_path,
887 ],
888 wait=False,
889 ),
890 )
891
892 try:
893 yield
894 except Exception:
895 # we need to know this below
896 ctx.summary['success'] = False
897 raise
898 finally:
899 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
900
901 log.info('Checking cluster log for badness...')
902
903 def first_in_ceph_log(pattern, excludes):
904 """
905 Find the first occurrence of the specified pattern in the Ceph log.
906 Returns None if none is found.
907
908 :param pattern: Pattern scanned for.
909 :param excludes: Patterns to ignore.
910 :return: First line of text (or None if not found)
911 """
912 args = [
913 'sudo',
914 'egrep', pattern,
915 '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
916 ]
917 for exclude in excludes:
918 args.extend([run.Raw('|'), 'egrep', '-v', exclude])
919 args.extend([
920 run.Raw('|'), 'head', '-n', '1',
921 ])
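# The assembled command is a simple shell pipeline, roughly (illustrative,
# default 'ceph' cluster, one whitelist entry):
#
#   sudo egrep '<pattern>' /var/log/ceph/ceph.log | egrep -v '<exclude>' | head -n 1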
922 r = mon0_remote.run(
923 stdout=StringIO(),
924 args=args,
925 )
926 stdout = r.stdout.getvalue()
927 if stdout != '':
928 return stdout
929 return None
930
931 if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
932 config['log_whitelist']) is not None:
933 log.warning('Found errors (ERR|WRN|SEC) in cluster log')
934 ctx.summary['success'] = False
935 # use the most severe problem as the failure reason
936 if 'failure_reason' not in ctx.summary:
937 for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
938 match = first_in_ceph_log(pattern, config['log_whitelist'])
939 if match is not None:
940 ctx.summary['failure_reason'] = \
941 '"{match}" in cluster log'.format(
942 match=match.rstrip('\n'),
943 )
944 break
945
946 for remote, dirs in devs_to_clean.iteritems():
947 for dir_ in dirs:
948 log.info('Unmounting %s on %s' % (dir_, remote))
949 try:
950 remote.run(
951 args=[
952 'sync',
953 run.Raw('&&'),
954 'sudo',
955 'umount',
956 '-f',
957 dir_
958 ]
959 )
960 except Exception as e:
961 remote.run(args=[
962 'sudo',
963 run.Raw('PATH=/usr/sbin:$PATH'),
964 'lsof',
965 run.Raw(';'),
966 'ps', 'auxf',
967 ])
968 raise e
969
970 if config.get('tmpfs_journal'):
971 log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
972 for remote, roles_for_host in osds.remotes.iteritems():
973 remote.run(
974 args=['sudo', 'umount', '-f', '/mnt'],
975 check_status=False,
976 )
977
978 if ctx.archive is not None and \
979 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
980
981 # archive mon data, too
982 log.info('Archiving mon data...')
983 path = os.path.join(ctx.archive, 'data')
984 try:
985 os.makedirs(path)
986 except OSError as e:
987 if e.errno == errno.EEXIST:
988 pass
989 else:
990 raise
991 for remote, roles in mons.remotes.iteritems():
992 for role in roles:
993 is_mon = teuthology.is_type('mon', cluster_name)
994 if is_mon(role):
995 _, _, id_ = teuthology.split_role(role)
996 mon_dir = '/var/lib/ceph/mon/' + \
997 '{0}-{1}'.format(cluster_name, id_)
998 teuthology.pull_directory_tarball(
999 remote,
1000 mon_dir,
1001 path + '/' + role + '.tgz')
1002
1003 log.info('Cleaning ceph cluster...')
1004 run.wait(
1005 ctx.cluster.run(
1006 args=[
1007 'sudo',
1008 'rm',
1009 '-rf',
1010 '--',
1011 conf_path,
1012 keyring_path,
1013 data_dir,
1014 monmap_path,
1015 osdmap_path,
1016 run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
1017 ],
1018 wait=False,
1019 ),
1020 )
1021
1022
1023def osd_scrub_pgs(ctx, config):
1024 """
1025 Scrub pgs when we exit.
1026
1027 First make sure all pgs are active and clean.
1028 Next scrub all osds.
1029 Then periodically check until all pgs have scrub time stamps that
1030 indicate the last scrub completed. Time out if no progress is made
1031 here after two minutes.
1032 """
1033 retries = 12
1034 delays = 10
1035 cluster_name = config['cluster']
1036 manager = ctx.managers[cluster_name]
1037 all_clean = False
1038 for _ in range(0, retries):
1039 stats = manager.get_pg_stats()
1040 states = [stat['state'] for stat in stats]
1041 if len(set(states)) == 1 and states[0] == 'active+clean':
1042 all_clean = True
1043 break
1044 log.info("Waiting for all osds to be active and clean.")
1045 time.sleep(delays)
1046 if not all_clean:
1047 log.info("Scrubbing terminated -- not all pgs were active and clean.")
1048 return
1049 check_time_now = time.localtime()
1050 time.sleep(1)
1051 all_roles = teuthology.all_roles(ctx.cluster)
1052 for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name):
1053 log.info("Scrubbing {osd}".format(osd=role))
1054 _, _, id_ = teuthology.split_role(role)
1055 manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
1056 prev_good = 0
1057 gap_cnt = 0
1058 loop = True
1059 while loop:
1060 stats = manager.get_pg_stats()
1061 timez = [(stat['pgid'],stat['last_scrub_stamp']) for stat in stats]
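# last_scrub_stamp is assumed to look like '2017-05-05 12:34:56.789012'; the
# fractional seconds are stripped below so the stamp can be parsed with
# strptime() and compared against check_time_now.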
1062 loop = False
1063 thiscnt = 0
1064 for (pgid, tmval) in timez:
1065 pgtm = time.strptime(tmval[0:tmval.find('.')], '%Y-%m-%d %H:%M:%S')
1066 if pgtm > check_time_now:
1067 thiscnt += 1
1068 else:
1069 log.info('pgid %s last_scrub_stamp %s %s <= %s', pgid, tmval, pgtm, check_time_now)
1070 loop = True
1071 if thiscnt > prev_good:
1072 prev_good = thiscnt
1073 gap_cnt = 0
1074 else:
1075 gap_cnt += 1
1076 if gap_cnt > retries:
1077 log.info('Exiting scrub checking -- not all pgs scrubbed.')
1078 return
1079 if loop:
1080 log.info('Still waiting for all pgs to be scrubbed.')
1081 time.sleep(delays)
1082
1083
1084@contextlib.contextmanager
1085def run_daemon(ctx, config, type_):
1086 """
1087 Run daemons for a role type. Handle the startup and termination of a daemon.
1088 On startup -- set coverages, cpu_profile, valgrind values for all remotes,
1089 and a max_mds value for one mds.
1090 On cleanup -- Stop all existing daemons of this type.
1091
1092 :param ctx: Context
1093 :param config: Configuration
1094 :param type_: Role type
1095 """
1096 cluster_name = config['cluster']
1097 log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
1098 testdir = teuthology.get_testdir(ctx)
1099 daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name))
1100
1101 # check whether any daemons of this type are configured
1102 if daemons is None:
1103 return
1104 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
1105
1106 daemon_signal = 'kill'
1107 if config.get('coverage') or config.get('valgrind') is not None:
1108 daemon_signal = 'term'
1109
1110 for remote, roles_for_host in daemons.remotes.iteritems():
1111 is_type_ = teuthology.is_type(type_, cluster_name)
1112 for role in roles_for_host:
1113 if not is_type_(role):
1114 continue
1115 _, _, id_ = teuthology.split_role(role)
1116
1117 run_cmd = [
1118 'sudo',
1119 'adjust-ulimits',
1120 'ceph-coverage',
1121 coverage_dir,
1122 'daemon-helper',
1123 daemon_signal,
1124 ]
1125 run_cmd_tail = [
1126 'ceph-%s' % (type_),
1127 '-f',
1128 '--cluster', cluster_name,
1129 '-i', id_]
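# Illustrative final command for role 'osd.0' in cluster 'ceph' (before any
# cpu_profile/valgrind wrappers are spliced in below):
#
#   sudo adjust-ulimits ceph-coverage <coverage_dir> daemon-helper kill \
#       ceph-osd -f --cluster ceph -i 0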
1130
1131 if type_ in config.get('cpu_profile', []):
1132 profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role)
1133 run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])
1134
1135 if config.get('valgrind') is not None:
1136 valgrind_args = None
1137 if type_ in config['valgrind']:
1138 valgrind_args = config['valgrind'][type_]
1139 if role in config['valgrind']:
1140 valgrind_args = config['valgrind'][role]
1141 run_cmd = teuthology.get_valgrind_args(testdir, role,
1142 run_cmd,
1143 valgrind_args)
1144
1145 run_cmd.extend(run_cmd_tail)
1146
1147 # always register mgr; don't necessarily start
1148 ctx.daemons.register_daemon(
1149 remote, type_, id_,
1150 cluster=cluster_name,
1151 args=run_cmd,
1152 logger=log.getChild(role),
1153 stdin=run.PIPE,
1154 wait=False
1155 )
1156 if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
1157 role = cluster_name + '.' + type_
1158 ctx.daemons.get_daemon(type_, id_, cluster_name).restart()
1159
1160 try:
1161 yield
1162 finally:
1163 teuthology.stop_daemons_of_type(ctx, type_, cluster_name)
1164
1165
1166def healthy(ctx, config):
1167 """
1168 Wait for all osds to be up, and for ceph health to report HEALTH_OK.
1169
1170 :param ctx: Context
1171 :param config: Configuration
1172 """
1173 config = config if isinstance(config, dict) else dict()
1174 cluster_name = config.get('cluster', 'ceph')
1175 log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
1176 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1177 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1178 teuthology.wait_until_osds_up(
1179 ctx,
1180 cluster=ctx.cluster,
1181 remote=mon0_remote,
1182 ceph_cluster=cluster_name,
1183 )
1184 teuthology.wait_until_healthy(
1185 ctx,
1186 remote=mon0_remote,
1187 ceph_cluster=cluster_name,
1188 )
1189
1190 if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
1191 # Some MDSs exist, wait for them to be healthy
1192 ceph_fs = Filesystem(ctx) # TODO: make Filesystem cluster-aware
1193 ceph_fs.wait_for_daemons(timeout=300)
1194
1195
1196def wait_for_osds_up(ctx, config):
1197 """
1198 Wait for all osds to come up.
1199
1200 :param ctx: Context
1201 :param config: Configuration
1202 """
1203 log.info('Waiting until ceph osds are all up...')
1204 cluster_name = config.get('cluster', 'ceph')
1205 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1206 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1207 teuthology.wait_until_osds_up(
1208 ctx,
1209 cluster=ctx.cluster,
1210 remote=mon0_remote
1211 )
1212
1213
1214def wait_for_mon_quorum(ctx, config):
1215 """
1216 Check remote ceph status until all specified monitors are in quorum.
1217
1218 :param ctx: Context
1219 :param config: Configuration
1220 """
1221 if isinstance(config, dict):
1222 mons = config['daemons']
1223 cluster_name = config.get('cluster', 'ceph')
1224 else:
1225 assert isinstance(config, list)
1226 mons = config
1227 cluster_name = 'ceph'
1228 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1229 (remote,) = ctx.cluster.only(firstmon).remotes.keys()
1230 with contextutil.safe_while(sleep=10, tries=60,
1231 action='wait for monitor quorum') as proceed:
1232 while proceed():
1233 r = remote.run(
1234 args=[
1235 'sudo',
1236 'ceph',
1237 'quorum_status',
1238 ],
1239 stdout=StringIO(),
1240 logger=log.getChild('quorum_status'),
1241 )
1242 j = json.loads(r.stdout.getvalue())
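# quorum_status returns JSON; only 'quorum_names' is used here, e.g.
# (illustrative): {"election_epoch": 10, "quorum_names": ["a", "b", "c"], ...}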
1243 q = j.get('quorum_names', [])
1244 log.debug('Quorum: %s', q)
1245 if sorted(q) == sorted(mons):
1246 break
1247
1248
1249def created_pool(ctx, config):
1250 """
1251 Add new pools to the dictionary of pools that the ceph-manager
1252 knows about.
1253 """
1254 for new_pool in config:
1255 if new_pool not in ctx.managers['ceph'].pools:
1256 ctx.managers['ceph'].pools[new_pool] = ctx.managers['ceph'].get_pool_property(
1257 new_pool, 'pg_num')
1258
1259
1260@contextlib.contextmanager
1261def restart(ctx, config):
1262 """
1263 restart ceph daemons
1264
1265 For example::
1266 tasks:
1267 - ceph.restart: [all]
1268
1269 For example::
1270 tasks:
1271 - ceph.restart: [osd.0, mon.1, mds.*]
1272
1273 or::
1274
1275 tasks:
1276 - ceph.restart:
1277 daemons: [osd.0, mon.1]
1278 wait-for-healthy: false
1279 wait-for-osds-up: true
1280
1281 :param ctx: Context
1282 :param config: Configuration
1283 """
1284 if config is None:
1285 config = {}
1286 elif isinstance(config, list):
1287 config = {'daemons': config}
1288
1289 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1290 clusters = set()
1291 for role in daemons:
1292 cluster, type_, id_ = teuthology.split_role(role)
1293 ctx.daemons.get_daemon(type_, id_, cluster).restart()
1294 clusters.add(cluster)
1295
1296 manager = ctx.managers['ceph']
1297 for dmon in daemons:
1298 if '.' in dmon:
1299 dm_parts = dmon.split('.')
1300 if dm_parts[1].isdigit():
1301 if dm_parts[0] == 'osd':
1302 manager.mark_down_osd(int(dm_parts[1]))
1303
1304 if config.get('wait-for-healthy', True):
1305 for cluster in clusters:
1306 healthy(ctx=ctx, config=dict(cluster=cluster))
1307 if config.get('wait-for-osds-up', False):
1308 for cluster in clusters:
1309 wait_for_osds_up(ctx=ctx, config=dict(cluster=cluster))
1310 yield
1311
1312
1313@contextlib.contextmanager
1314def stop(ctx, config):
1315 """
1316 Stop ceph daemons
1317
1318 For example::
1319 tasks:
1320 - ceph.stop: [mds.*]
1321
1322 tasks:
1323 - ceph.stop: [osd.0, osd.2]
1324
1325 tasks:
1326 - ceph.stop:
1327 daemons: [osd.0, osd.2]
1328
1329 """
1330 if config is None:
1331 config = {}
1332 elif isinstance(config, list):
1333 config = {'daemons': config}
1334
1335 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1336 for role in daemons:
1337 cluster, type_, id_ = teuthology.split_role(role)
1338 ctx.daemons.get_daemon(type_, id_, cluster).stop()
1339
1340 yield
1341
1342
1343@contextlib.contextmanager
1344def wait_for_failure(ctx, config):
1345 """
1346 Wait for a failure of a ceph daemon
1347
1348 For example::
1349 tasks:
1350 - ceph.wait_for_failure: [mds.*]
1351
1352 tasks:
1353 - ceph.wait_for_failure: [osd.0, osd.2]
1354
1355 tasks:
1356 - ceph.wait_for_failure:
1357 daemons: [osd.0, osd.2]
1358
1359 """
1360 if config is None:
1361 config = {}
1362 elif isinstance(config, list):
1363 config = {'daemons': config}
1364
1365 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1366 for role in daemons:
1367 cluster, type_, id_ = teuthology.split_role(role)
1368 try:
1369 ctx.daemons.get_daemon(type_, id_, cluster).wait()
1370 except:
1371 log.info('Saw expected daemon failure. Continuing.')
1372 pass
1373 else:
1374 raise RuntimeError('daemon %s did not fail' % role)
1375
1376 yield
1377
1378
1379def validate_config(ctx, config):
1380 """
1381 Perform some simple validation on task configuration.
1382 Raises exceptions.ConfigError if an error is found.
1383 """
1384 # check for osds from multiple clusters on the same host
1385 for remote, roles_for_host in ctx.cluster.remotes.items():
1386 last_cluster = None
1387 last_role = None
1388 for role in roles_for_host:
1389 role_cluster, role_type, _ = teuthology.split_role(role)
1390 if role_type != 'osd':
1391 continue
1392 if last_cluster and last_cluster != role_cluster:
1393 msg = "Host should not have osds (%s and %s) from multiple clusters" % (
1394 last_role, role)
1395 raise exceptions.ConfigError(msg)
1396 last_cluster = role_cluster
1397 last_role = role
1398
1399
1400@contextlib.contextmanager
1401def task(ctx, config):
1402 """
1403 Set up and tear down a Ceph cluster.
1404
1405 For example::
1406
1407 tasks:
1408 - ceph:
1409 - interactive:
1410
1411 You can also specify what branch to run::
1412
1413 tasks:
1414 - ceph:
1415 branch: foo
1416
1417 Or a tag::
1418
1419 tasks:
1420 - ceph:
1421 tag: v0.42.13
1422
1423 Or a sha1::
1424
1425 tasks:
1426 - ceph:
1427 sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed
1428
1429 Or a local source dir::
1430
1431 tasks:
1432 - ceph:
1433 path: /home/sage/ceph
1434
1435 To capture code coverage data, use::
1436
1437 tasks:
1438 - ceph:
1439 coverage: true
1440
1441 To use btrfs, ext4, or xfs on the target's scratch disks, use::
1442
1443 tasks:
1444 - ceph:
1445 fs: xfs
1446 mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
1447 mount_options: [nobarrier, inode64]
1448
1449 Note, this will cause the task to check the /scratch_devs file on each node
1450 for available devices. If no such file is found, /dev/sdb will be used.
1451
1452 To run some daemons under valgrind, include their names
1453 and the tool/args to use in a valgrind section::
1454
1455 tasks:
1456 - ceph:
1457 valgrind:
1458 mds.1: --tool=memcheck
1459 osd.1: [--tool=memcheck, --leak-check=no]
1460
1461 Those nodes which are using memcheck or valgrind will get
1462 checked for bad results.
1463
1464 To adjust or modify config options, use::
1465
1466 tasks:
1467 - ceph:
1468 conf:
1469 section:
1470 key: value
1471
1472 For example::
1473
1474 tasks:
1475 - ceph:
1476 conf:
1477 mds.0:
1478 some option: value
1479 other key: other value
1480 client.0:
1481 debug client: 10
1482 debug ms: 1
1483
1484 By default, the cluster log is checked for errors and warnings,
1485 and the run marked failed if any appear. You can ignore log
1486 entries by giving a list of egrep-compatible regexes, e.g.:
1487
1488 tasks:
1489 - ceph:
1490 log-whitelist: ['foo.*bar', 'bad message']
1491
1492 To run multiple ceph clusters, use multiple ceph tasks, and roles
1493 with a cluster name prefix, e.g. cluster1.client.0. Roles with no
1494 cluster use the default cluster name, 'ceph'. OSDs from separate
1495 clusters must be on separate hosts. Clients and non-osd daemons
1496 from multiple clusters may be colocated. For each cluster, add an
1497 instance of the ceph task with the cluster name specified, e.g.::
1498
1499 roles:
1500 - [mon.a, osd.0, osd.1]
1501 - [backup.mon.a, backup.osd.0, backup.osd.1]
1502 - [client.0, backup.client.0]
1503 tasks:
1504 - ceph:
1505 cluster: ceph
1506 - ceph:
1507 cluster: backup
1508
1509 :param ctx: Context
1510 :param config: Configuration
1511
1512 """
1513 if config is None:
1514 config = {}
1515 assert isinstance(config, dict), \
1516 "task ceph only supports a dictionary for configuration"
1517
1518 overrides = ctx.config.get('overrides', {})
1519 teuthology.deep_merge(config, overrides.get('ceph', {}))
1520
1521 first_ceph_cluster = False
1522 if not hasattr(ctx, 'daemons'):
1523 first_ceph_cluster = True
1524 ctx.daemons = DaemonGroup()
1525
1526 testdir = teuthology.get_testdir(ctx)
1527 if config.get('coverage'):
1528 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
1529 log.info('Creating coverage directory...')
1530 run.wait(
1531 ctx.cluster.run(
1532 args=[
1533 'install', '-d', '-m0755', '--',
1534 coverage_dir,
1535 ],
1536 wait=False,
1537 )
1538 )
1539
1540 if 'cluster' not in config:
1541 config['cluster'] = 'ceph'
1542
1543 validate_config(ctx, config)
1544
1545 subtasks = []
1546 if first_ceph_cluster:
1547 # these tasks handle general log setup and parsing on all hosts,
1548 # so they should only be run once
1549 subtasks = [
1550 lambda: ceph_log(ctx=ctx, config=None),
1551 lambda: valgrind_post(ctx=ctx, config=config),
1552 ]
1553
1554 subtasks += [
1555 lambda: cluster(ctx=ctx, config=dict(
1556 conf=config.get('conf', {}),
1557 fs=config.get('fs', 'xfs'),
1558 mkfs_options=config.get('mkfs_options', None),
1559 mount_options=config.get('mount_options', None),
1560 block_journal=config.get('block_journal', None),
1561 tmpfs_journal=config.get('tmpfs_journal', None),
1562 skip_mgr_daemons=config.get('skip_mgr_daemons', False),
1563 log_whitelist=config.get('log-whitelist', []),
1564 cpu_profile=set(config.get('cpu_profile', []),),
1565 cluster=config['cluster'],
1566 )),
1567 lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
1568 lambda: run_daemon(ctx=ctx, config=config, type_='mgr'),
1569 lambda: crush_setup(ctx=ctx, config=config),
1570 lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
1571 lambda: cephfs_setup(ctx=ctx, config=config),
1572 lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
1573 ]
1574
1575 with contextutil.nested(*subtasks):
1576 first_mon = teuthology.get_first_mon(ctx, config, config['cluster'])
1577 (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
1578 if not hasattr(ctx, 'managers'):
1579 ctx.managers = {}
1580 ctx.managers[config['cluster']] = CephManager(
1581 mon,
1582 ctx=ctx,
1583 logger=log.getChild('ceph_manager.' + config['cluster']),
1584 cluster=config['cluster'],
1585 )
1586
1587 try:
1588 if config.get('wait-for-healthy', True):
1589 healthy(ctx=ctx, config=dict(cluster=config['cluster']))
1590
1591 yield
1592 finally:
1593 if config.get('wait-for-scrub', True):
1594 osd_scrub_pgs(ctx, config)