1 """
2 Ceph cluster task.
3
4 Handle the setup, starting, and clean-up of a Ceph cluster.
5 """
6 from cStringIO import StringIO
7
8 import argparse
9 import contextlib
10 import errno
11 import logging
12 import os
13 import json
14 import time
15 import gevent
16 import socket
17
18 from paramiko import SSHException
19 from ceph_manager import CephManager, write_conf
20 from tasks.cephfs.filesystem import Filesystem
21 from teuthology import misc as teuthology
22 from teuthology import contextutil
23 from teuthology import exceptions
24 from teuthology.orchestra import run
25 import ceph_client as cclient
26 from teuthology.orchestra.daemon import DaemonGroup
27
28 CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']
29
30 log = logging.getLogger(__name__)
31
32
33 def generate_caps(type_):
34 """
35 Each call will return the next capability for each system type
36 (essentially a subset of possible role values). Valid types are osd,
37 mgr, mds and client.
38 """
39 defaults = dict(
40 osd=dict(
41 mon='allow *',
42 mgr='allow *',
43 osd='allow *',
44 ),
45 mgr=dict(
46 mon='allow *',
47 ),
48 mds=dict(
49 mon='allow *',
50 mgr='allow *',
51 osd='allow *',
52 mds='allow',
53 ),
54 client=dict(
55 mon='allow rw',
56 mgr='allow r',
57 osd='allow rwx',
58 mds='allow',
59 ),
60 )
61 for subsystem, capability in defaults[type_].items():
62 yield '--cap'
63 yield subsystem
64 yield capability
65
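# Illustrative note (not part of the original module): generate_caps() is a
# generator yielding the ceph-authtool '--cap' arguments used further below.
# For example, given the defaults above,
#   list(generate_caps('mgr')) == ['--cap', 'mon', 'allow *']
# while 'osd', 'mds' and 'client' each yield one ('--cap', subsystem,
# capability) triple per subsystem listed for them.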
66
67 @contextlib.contextmanager
68 def ceph_log(ctx, config):
69 """
70 Create /var/log/ceph log directory that is open to everyone.
71 Add valgrind and profiling-logger directories.
72
73 :param ctx: Context
74 :param config: Configuration
75 """
76 log.info('Making ceph log dir writeable by non-root...')
77 run.wait(
78 ctx.cluster.run(
79 args=[
80 'sudo',
81 'chmod',
82 '777',
83 '/var/log/ceph',
84 ],
85 wait=False,
86 )
87 )
88 log.info('Disabling ceph logrotate...')
89 run.wait(
90 ctx.cluster.run(
91 args=[
92 'sudo',
93 'rm', '-f', '--',
94 '/etc/logrotate.d/ceph',
95 ],
96 wait=False,
97 )
98 )
99 log.info('Creating extra log directories...')
100 run.wait(
101 ctx.cluster.run(
102 args=[
103 'sudo',
104 'install', '-d', '-m0777', '--',
105 '/var/log/ceph/valgrind',
106 '/var/log/ceph/profiling-logger',
107 ],
108 wait=False,
109 )
110 )
111
112 class Rotater(object):
113 stop_event = gevent.event.Event()
114
115 def invoke_logrotate(self):
116 # 1) install ceph-test.conf in /etc/logrotate.d
117 # 2) continuously loop over logrotate invocation with ceph-test.conf
118 while not self.stop_event.is_set():
119 self.stop_event.wait(timeout=30)
120 try:
121 run.wait(
122 ctx.cluster.run(
123 args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'
124 ],
125 wait=False,
126 )
127 )
128 except exceptions.ConnectionLostError as e:
129 # Some tests may power off nodes during the test, in which
130 # case we will see connection errors that we should ignore.
131 log.debug("Missed logrotate, node '{0}' is offline".format(
132 e.node))
133 except EOFError as e:
134 # Paramiko sometimes raises this when it fails to
135 # connect to a node during open_session. As with
136 # ConnectionLostError, we ignore this because nodes
137 # are allowed to get power cycled during tests.
138 log.debug("Missed logrotate, EOFError")
139 except SSHException as e:
140 log.debug("Missed logrotate, SSHException")
141 except socket.error as e:
142 if e.errno == errno.EHOSTUNREACH:
143 log.debug("Missed logrotate, host unreachable")
144 else:
145 raise
146
147 def begin(self):
148 self.thread = gevent.spawn(self.invoke_logrotate)
149
150 def end(self):
151 self.stop_event.set()
152 self.thread.get()
153
154 def write_rotate_conf(ctx, daemons):
155 testdir = teuthology.get_testdir(ctx)
156 rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf')
157 with file(rotate_conf_path, 'rb') as f:
158 conf = ""
159 for daemon, size in daemons.iteritems():
160 log.info('writing logrotate stanza for {daemon}'.format(daemon=daemon))
161 conf += f.read().format(daemon_type=daemon, max_size=size)
162 f.seek(0, 0)
163
164 for remote in ctx.cluster.remotes.iterkeys():
165 teuthology.write_file(remote=remote,
166 path='{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
167 data=StringIO(conf)
168 )
169 remote.run(
170 args=[
171 'sudo',
172 'mv',
173 '{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
174 '/etc/logrotate.d/ceph-test.conf',
175 run.Raw('&&'),
176 'sudo',
177 'chmod',
178 '0644',
179 '/etc/logrotate.d/ceph-test.conf',
180 run.Raw('&&'),
181 'sudo',
182 'chown',
183 'root.root',
184 '/etc/logrotate.d/ceph-test.conf'
185 ]
186 )
187 remote.chcon('/etc/logrotate.d/ceph-test.conf',
188 'system_u:object_r:etc_t:s0')
189
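# Sketch of the expected job config, based on how write_rotate_conf() and
# Rotater consume it (daemon names and sizes below are illustrative, not
# taken from a real job):
#   log-rotate:
#     ceph-osd: 10G
#     ceph-mds: 10G
# i.e. a mapping of daemon type to the maximum log size substituted into
# logrotate.conf.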
190 if ctx.config.get('log-rotate'):
191 daemons = ctx.config.get('log-rotate')
192 log.info('Setting up log rotation with ' + str(daemons))
193 write_rotate_conf(ctx, daemons)
194 logrotater = Rotater()
195 logrotater.begin()
196 try:
197 yield
198
199 finally:
200 if ctx.config.get('log-rotate'):
201 log.info('Shutting down logrotate')
202 logrotater.end()
203 ctx.cluster.run(
204 args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'
205 ]
206 )
207 if ctx.archive is not None and \
208 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
209 # and logs
210 log.info('Compressing logs...')
211 run.wait(
212 ctx.cluster.run(
213 args=[
214 'sudo',
215 'find',
216 '/var/log/ceph',
217 '-name',
218 '*.log',
219 '-print0',
220 run.Raw('|'),
221 'sudo',
222 'xargs',
223 '-0',
224 '--no-run-if-empty',
225 '--',
226 'gzip',
227 '--',
228 ],
229 wait=False,
230 ),
231 )
232
233 log.info('Archiving logs...')
234 path = os.path.join(ctx.archive, 'remote')
235 os.makedirs(path)
236 for remote in ctx.cluster.remotes.iterkeys():
237 sub = os.path.join(path, remote.shortname)
238 os.makedirs(sub)
239 teuthology.pull_directory(remote, '/var/log/ceph',
240 os.path.join(sub, 'log'))
241
242
243 def assign_devs(roles, devs):
244 """
245 Create a dictionary of devs indexed by roles
246
247 :param roles: List of roles
248 :param devs: Corresponding list of devices.
249 :returns: Dictionary of devs indexed by roles.
250 """
251 return dict(zip(roles, devs))
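# Illustrative example (not in the original source): with
#   roles = ['osd.0', 'osd.1'] and devs = ['/dev/sdb', '/dev/sdc']
# assign_devs() returns {'osd.0': '/dev/sdb', 'osd.1': '/dev/sdc'}; any extra
# devices beyond the number of roles are simply dropped by zip().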
252
253
254 @contextlib.contextmanager
255 def valgrind_post(ctx, config):
256 """
257 After the tests run, look through all the valgrind logs. Exceptions are raised
258 if textual errors occurred in the logs, or if valgrind errors were detected in
259 the logs.
260
261 :param ctx: Context
262 :param config: Configuration
263 """
264 try:
265 yield
266 finally:
267 lookup_procs = list()
268 log.info('Checking for errors in any valgrind logs...')
269 for remote in ctx.cluster.remotes.iterkeys():
270 # look at valgrind logs for each node
271 proc = remote.run(
272 args=[
273 'sudo',
274 'zgrep',
275 '<kind>',
276 run.Raw('/var/log/ceph/valgrind/*'),
277 '/dev/null', # include a second file so that we always get a filename prefix on the output
278 run.Raw('|'),
279 'sort',
280 run.Raw('|'),
281 'uniq',
282 ],
283 wait=False,
284 check_status=False,
285 stdout=StringIO(),
286 )
287 lookup_procs.append((proc, remote))
288
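# Each line of the zgrep output above is expected to look roughly like
# (illustrative example, assuming valgrind was run with XML output):
#   /var/log/ceph/valgrind/osd.0.log.gz:  <kind>Leak_DefinitelyLost</kind>
# i.e. "<filename>:<kind element>", which is what the split(':') below relies
# on to separate the file name from the error kind.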
289 valgrind_exception = None
290 for (proc, remote) in lookup_procs:
291 proc.wait()
292 out = proc.stdout.getvalue()
293 for line in out.split('\n'):
294 if line == '':
295 continue
296 try:
297 (file, kind) = line.split(':')
298 except Exception:
299 log.error('failed to split line %s', line)
300 raise
301 log.debug('file %s kind %s', file, kind)
302 if (file.find('mds') >= 0) and kind.find('Lost') > 0:
303 continue
304 log.error('saw valgrind issue %s in %s', kind, file)
305 valgrind_exception = Exception('saw valgrind issues')
306
307 if config.get('expect_valgrind_errors'):
308 if not valgrind_exception:
309 raise Exception('expected valgrind issues and found none')
310 else:
311 if valgrind_exception:
312 raise valgrind_exception
313
314
315 @contextlib.contextmanager
316 def crush_setup(ctx, config):
317 cluster_name = config['cluster']
318 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
319 (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
320
321 profile = config.get('crush_tunables', 'default')
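# 'crush_tunables' comes straight from the task config and may name any
# profile accepted by 'ceph osd crush tunables', for example (illustrative):
#   crush_tunables: optimal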
322 log.info('Setting crush tunables to %s', profile)
323 mon_remote.run(
324 args=['sudo', 'ceph', '--cluster', cluster_name,
325 'osd', 'crush', 'tunables', profile])
326 yield
327
328
329 @contextlib.contextmanager
330 def cephfs_setup(ctx, config):
331 cluster_name = config['cluster']
332 testdir = teuthology.get_testdir(ctx)
333 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
334
335 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
336 (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
337 mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
338 # If there are any MDSs, then create a filesystem for them to use
339 # Do this last because it requires the mon cluster to be up and running
340 if mdss.remotes:
341 log.info('Setting up CephFS filesystem...')
342
343 fs = Filesystem(ctx, create='cephfs')
344
345 is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
346 all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
347 num_active = len([r for r in all_roles if is_active_mds(r)])
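# Illustrative example of the convention encoded in is_active_mds above:
# a role list such as ['mds.a', 'mds.b-s-a'] gives num_active == 1, since
# names ending in '-s' or containing '-s-' are treated as standby MDS roles.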
348
349 fs.set_allow_multimds(True)
350 fs.set_max_mds(num_active)
351 fs.set_allow_dirfrags(True)
352
353 yield
354
355
356 @contextlib.contextmanager
357 def cluster(ctx, config):
358 """
359 Handle the creation and removal of a ceph cluster.
360
361 On startup:
362 Create directories needed for the cluster.
363 Create remote journals for all osds.
364 Create and set keyring.
365 Copy the monmap to the test systems.
366 Setup mon nodes.
367 Setup mds nodes.
368 Mkfs osd nodes.
369 Add keyring information to monmaps
370 Mkfs mon nodes.
371
372 On exit:
373 If errors occurred, extract a failure message and store in ctx.summary.
374 Unmount all test files and temporary journaling files.
375 Save the monitor information and archive all ceph logs.
376 Cleanup the keyring setup, and remove all monitor map and data files left over.
377
378 :param ctx: Context
379 :param config: Configuration
380 """
381 if ctx.config.get('use_existing_cluster', False) is True:
382 log.info("'use_existing_cluster' is true; skipping cluster creation")
383 yield
384
385 testdir = teuthology.get_testdir(ctx)
386 cluster_name = config['cluster']
387 data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
388 log.info('Creating ceph cluster %s...', cluster_name)
389 run.wait(
390 ctx.cluster.run(
391 args=[
392 'install', '-d', '-m0755', '--',
393 data_dir,
394 ],
395 wait=False,
396 )
397 )
398
399 run.wait(
400 ctx.cluster.run(
401 args=[
402 'sudo',
403 'install', '-d', '-m0777', '--', '/var/run/ceph',
404 ],
405 wait=False,
406 )
407 )
408
409 devs_to_clean = {}
410 remote_to_roles_to_devs = {}
411 remote_to_roles_to_journals = {}
412 osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
413 for remote, roles_for_host in osds.remotes.iteritems():
414 devs = teuthology.get_scratch_devices(remote)
415 roles_to_devs = {}
416 roles_to_journals = {}
417 if config.get('fs'):
418 log.info('fs option selected, checking for scratch devs')
419 log.info('found devs: %s' % (str(devs),))
420 devs_id_map = teuthology.get_wwn_id_map(remote, devs)
421 iddevs = devs_id_map.values()
422 roles_to_devs = assign_devs(
423 teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
424 )
425 if len(roles_to_devs) < len(iddevs):
426 iddevs = iddevs[len(roles_to_devs):]
427 devs_to_clean[remote] = []
428
429 if config.get('block_journal'):
430 log.info('block journal enabled')
431 roles_to_journals = assign_devs(
432 teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
433 )
434 log.info('journal map: %s', roles_to_journals)
435
436 if config.get('tmpfs_journal'):
437 log.info('tmpfs journal enabled')
438 roles_to_journals = {}
439 remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
440 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
441 tmpfs = '/mnt/' + role
442 roles_to_journals[role] = tmpfs
443 remote.run(args=['truncate', '-s', '1500M', tmpfs])
444 log.info('journal map: %s', roles_to_journals)
445
446 log.info('dev map: %s' % (str(roles_to_devs),))
447 remote_to_roles_to_devs[remote] = roles_to_devs
448 remote_to_roles_to_journals[remote] = roles_to_journals
449
450 log.info('Generating config...')
451 remotes_and_roles = ctx.cluster.remotes.items()
452 roles = [role_list for (remote, role_list) in remotes_and_roles]
453 ips = [host for (host, port) in
454 (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
455 conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips, cluster=cluster_name)
456 for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
457 for role, journal in roles_to_journals.iteritems():
458 name = teuthology.ceph_role(role)
459 if name not in conf:
460 conf[name] = {}
461 conf[name]['osd journal'] = journal
462 for section, keys in config['conf'].iteritems():
463 for key, value in keys.iteritems():
464 log.info("[%s] %s = %s" % (section, key, value))
465 if section not in conf:
466 conf[section] = {}
467 conf[section][key] = value
468
469 if config.get('tmpfs_journal'):
470 conf['journal dio'] = False
471
472 if not hasattr(ctx, 'ceph'):
473 ctx.ceph = {}
474 ctx.ceph[cluster_name] = argparse.Namespace()
475 ctx.ceph[cluster_name].conf = conf
476
477 default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
478 keyring_path = config.get('keyring_path', default_keyring)
479
480 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
481
482 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
483
484 log.info('Setting up %s...' % firstmon)
485 ctx.cluster.only(firstmon).run(
486 args=[
487 'sudo',
488 'adjust-ulimits',
489 'ceph-coverage',
490 coverage_dir,
491 'ceph-authtool',
492 '--create-keyring',
493 keyring_path,
494 ],
495 )
496 ctx.cluster.only(firstmon).run(
497 args=[
498 'sudo',
499 'adjust-ulimits',
500 'ceph-coverage',
501 coverage_dir,
502 'ceph-authtool',
503 '--gen-key',
504 '--name=mon.',
505 keyring_path,
506 ],
507 )
508 ctx.cluster.only(firstmon).run(
509 args=[
510 'sudo',
511 'chmod',
512 '0644',
513 keyring_path,
514 ],
515 )
516 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
517 monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
518 cluster=cluster_name)
519 fsid = teuthology.create_simple_monmap(
520 ctx,
521 remote=mon0_remote,
522 conf=conf,
523 path=monmap_path,
524 )
525 if 'global' not in conf:
526 conf['global'] = {}
527 conf['global']['fsid'] = fsid
528
529 default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
530 conf_path = config.get('conf_path', default_conf_path)
531 log.info('Writing %s for FSID %s...' % (conf_path, fsid))
532 write_conf(ctx, conf_path, cluster_name)
533
534 log.info('Creating admin key on %s...' % firstmon)
535 ctx.cluster.only(firstmon).run(
536 args=[
537 'sudo',
538 'adjust-ulimits',
539 'ceph-coverage',
540 coverage_dir,
541 'ceph-authtool',
542 '--gen-key',
543 '--name=client.admin',
544 '--set-uid=0',
545 '--cap', 'mon', 'allow *',
546 '--cap', 'osd', 'allow *',
547 '--cap', 'mds', 'allow *',
548 '--cap', 'mgr', 'allow *',
549 keyring_path,
550 ],
551 )
552
553 log.info('Copying monmap to all nodes...')
554 keyring = teuthology.get_file(
555 remote=mon0_remote,
556 path=keyring_path,
557 )
558 monmap = teuthology.get_file(
559 remote=mon0_remote,
560 path=monmap_path,
561 )
562
563 for rem in ctx.cluster.remotes.iterkeys():
564 # copy mon key and initial monmap
565 log.info('Sending monmap to node {remote}'.format(remote=rem))
566 teuthology.sudo_write_file(
567 remote=rem,
568 path=keyring_path,
569 data=keyring,
570 perms='0644'
571 )
572 teuthology.write_file(
573 remote=rem,
574 path=monmap_path,
575 data=monmap,
576 )
577
578 log.info('Setting up mon nodes...')
579 mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))
580 osdmap_path = '{tdir}/{cluster}.osdmap'.format(tdir=testdir,
581 cluster=cluster_name)
582 run.wait(
583 mons.run(
584 args=[
585 'adjust-ulimits',
586 'ceph-coverage',
587 coverage_dir,
588 'osdmaptool',
589 '-c', conf_path,
590 '--clobber',
591 '--createsimple', '{num:d}'.format(
592 num=teuthology.num_instances_of_type(ctx.cluster, 'osd',
593 cluster_name),
594 ),
595 osdmap_path,
596 '--pg_bits', '2',
597 '--pgp_bits', '4',
598 ],
599 wait=False,
600 ),
601 )
602
603 if not config.get('skip_mgr_daemons', False):
604 log.info('Setting up mgr nodes...')
605 mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name))
606 for remote, roles_for_host in mgrs.remotes.iteritems():
607 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr',
608 cluster_name):
609 _, _, id_ = teuthology.split_role(role)
610 mgr_dir = '/var/lib/ceph/mgr/{cluster}-{id}'.format(
611 cluster=cluster_name,
612 id=id_,
613 )
614 remote.run(
615 args=[
616 'sudo',
617 'mkdir',
618 '-p',
619 mgr_dir,
620 run.Raw('&&'),
621 'sudo',
622 'adjust-ulimits',
623 'ceph-coverage',
624 coverage_dir,
625 'ceph-authtool',
626 '--create-keyring',
627 '--gen-key',
628 '--name=mgr.{id}'.format(id=id_),
629 mgr_dir + '/keyring',
630 ],
631 )
632
633 log.info('Setting up mds nodes...')
634 mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
635 for remote, roles_for_host in mdss.remotes.iteritems():
636 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
637 cluster_name):
638 _, _, id_ = teuthology.split_role(role)
639 mds_dir = '/var/lib/ceph/mds/{cluster}-{id}'.format(
640 cluster=cluster_name,
641 id=id_,
642 )
643 remote.run(
644 args=[
645 'sudo',
646 'mkdir',
647 '-p',
648 mds_dir,
649 run.Raw('&&'),
650 'sudo',
651 'adjust-ulimits',
652 'ceph-coverage',
653 coverage_dir,
654 'ceph-authtool',
655 '--create-keyring',
656 '--gen-key',
657 '--name=mds.{id}'.format(id=id_),
658 mds_dir + '/keyring',
659 ],
660 )
661
662 cclient.create_keyring(ctx, cluster_name)
663 log.info('Running mkfs on osd nodes...')
664
665 if not hasattr(ctx, 'disk_config'):
666 ctx.disk_config = argparse.Namespace()
667 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
668 ctx.disk_config.remote_to_roles_to_dev = {}
669 if not hasattr(ctx.disk_config, 'remote_to_roles_to_journals'):
670 ctx.disk_config.remote_to_roles_to_journals = {}
671 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
672 ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
673 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
674 ctx.disk_config.remote_to_roles_to_dev_fstype = {}
675
676 teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)
677 teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals, remote_to_roles_to_journals)
678
679 log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
680 for remote, roles_for_host in osds.remotes.iteritems():
681 roles_to_devs = remote_to_roles_to_devs[remote]
682 roles_to_journals = remote_to_roles_to_journals[remote]
683
684 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
685 _, _, id_ = teuthology.split_role(role)
686 mnt_point = '/var/lib/ceph/osd/{cluster}-{id}'.format(cluster=cluster_name, id=id_)
687 remote.run(
688 args=[
689 'sudo',
690 'mkdir',
691 '-p',
692 mnt_point,
693 ])
694 log.info(str(roles_to_journals))
695 log.info(role)
696 if roles_to_devs.get(role):
697 dev = roles_to_devs[role]
698 fs = config.get('fs')
699 package = None
700 mkfs_options = config.get('mkfs_options')
701 mount_options = config.get('mount_options')
702 if fs == 'btrfs':
703 # package = 'btrfs-tools'
704 if mount_options is None:
705 mount_options = ['noatime', 'user_subvol_rm_allowed']
706 if mkfs_options is None:
707 mkfs_options = ['-m', 'single',
708 '-l', '32768',
709 '-n', '32768']
710 if fs == 'xfs':
711 # package = 'xfsprogs'
712 if mount_options is None:
713 mount_options = ['noatime']
714 if mkfs_options is None:
715 mkfs_options = ['-f', '-i', 'size=2048']
716 if fs == 'ext4' or fs == 'ext3':
717 if mount_options is None:
718 mount_options = ['noatime', 'user_xattr']
719
720 if mount_options is None:
721 mount_options = []
722 if mkfs_options is None:
723 mkfs_options = []
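# For example (illustrative), with the default fs 'xfs' and no overrides the
# commands assembled below end up as roughly:
#   yes | sudo mkfs.xfs -f -i size=2048 /dev/<scratch dev>
#   sudo mount -t xfs -o noatime /dev/<scratch dev> /var/lib/ceph/osd/<cluster>-<id>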
724 mkfs = ['mkfs.%s' % fs] + mkfs_options
725 log.info('%s on %s on %s' % (mkfs, dev, remote))
726 if package is not None:
727 remote.run(
728 args=[
729 'sudo',
730 'apt-get', 'install', '-y', package
731 ],
732 stdout=StringIO(),
733 )
734
735 try:
736 remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
737 except run.CommandFailedError:
738 # Newer btrfs-tools doesn't prompt for overwrite, use -f
739 if '-f' not in mkfs_options:
740 mkfs_options.append('-f')
741 mkfs = ['mkfs.%s' % fs] + mkfs_options
742 log.info('%s on %s on %s' % (mkfs, dev, remote))
743 remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
744
745 log.info('mount %s on %s -o %s' % (dev, remote,
746 ','.join(mount_options)))
747 remote.run(
748 args=[
749 'sudo',
750 'mount',
751 '-t', fs,
752 '-o', ','.join(mount_options),
753 dev,
754 mnt_point,
755 ]
756 )
757 remote.run(
758 args=[
759 'sudo', '/sbin/restorecon', mnt_point,
760 ],
761 check_status=False,
762 )
763 if remote not in ctx.disk_config.remote_to_roles_to_dev_mount_options:
764 ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
765 ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
766 if remote not in ctx.disk_config.remote_to_roles_to_dev_fstype:
767 ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
768 ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
769 devs_to_clean[remote].append(mnt_point)
770
771 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
772 _, _, id_ = teuthology.split_role(role)
773 remote.run(
774 args=[
775 'sudo',
776 'MALLOC_CHECK_=3',
777 'adjust-ulimits',
778 'ceph-coverage',
779 coverage_dir,
780 'ceph-osd',
781 '--cluster',
782 cluster_name,
783 '--mkfs',
784 '--mkkey',
785 '-i', id_,
786 '--monmap', monmap_path,
787 ],
788 )
789
790 log.info('Reading keys from all nodes...')
791 keys_fp = StringIO()
792 keys = []
793 for remote, roles_for_host in ctx.cluster.remotes.iteritems():
794 for type_ in ['mgr', 'mds', 'osd']:
795 if type_ == 'mgr' and config.get('skip_mgr_daemons', False):
796 continue
797 for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
798 _, _, id_ = teuthology.split_role(role)
799 data = teuthology.get_file(
800 remote=remote,
801 path='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
802 type=type_,
803 id=id_,
804 cluster=cluster_name,
805 ),
806 sudo=True,
807 )
808 keys.append((type_, id_, data))
809 keys_fp.write(data)
810 for remote, roles_for_host in ctx.cluster.remotes.iteritems():
811 for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', cluster_name):
812 _, _, id_ = teuthology.split_role(role)
813 data = teuthology.get_file(
814 remote=remote,
815 path='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_, cluster=cluster_name)
816 )
817 keys.append(('client', id_, data))
818 keys_fp.write(data)
819
820 log.info('Adding keys to all mons...')
821 writes = mons.run(
822 args=[
823 'sudo', 'tee', '-a',
824 keyring_path,
825 ],
826 stdin=run.PIPE,
827 wait=False,
828 stdout=StringIO(),
829 )
830 keys_fp.seek(0)
831 teuthology.feed_many_stdins_and_close(keys_fp, writes)
832 run.wait(writes)
833 for type_, id_, data in keys:
834 run.wait(
835 mons.run(
836 args=[
837 'sudo',
838 'adjust-ulimits',
839 'ceph-coverage',
840 coverage_dir,
841 'ceph-authtool',
842 keyring_path,
843 '--name={type}.{id}'.format(
844 type=type_,
845 id=id_,
846 ),
847 ] + list(generate_caps(type_)),
848 wait=False,
849 ),
850 )
851
852 log.info('Running mkfs on mon nodes...')
853 for remote, roles_for_host in mons.remotes.iteritems():
854 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon', cluster_name):
855 _, _, id_ = teuthology.split_role(role)
856 remote.run(
857 args=[
858 'sudo',
859 'mkdir',
860 '-p',
861 '/var/lib/ceph/mon/{cluster}-{id}'.format(id=id_, cluster=cluster_name),
862 ],
863 )
864 remote.run(
865 args=[
866 'sudo',
867 'adjust-ulimits',
868 'ceph-coverage',
869 coverage_dir,
870 'ceph-mon',
871 '--cluster', cluster_name,
872 '--mkfs',
873 '-i', id_,
874 '--monmap', monmap_path,
875 '--osdmap', osdmap_path,
876 '--keyring', keyring_path,
877 ],
878 )
879
880 run.wait(
881 mons.run(
882 args=[
883 'rm',
884 '--',
885 monmap_path,
886 osdmap_path,
887 ],
888 wait=False,
889 ),
890 )
891
892 try:
893 yield
894 except Exception:
895 # we need to know this below
896 ctx.summary['success'] = False
897 raise
898 finally:
899 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
900
901 log.info('Checking cluster log for badness...')
902
903 def first_in_ceph_log(pattern, excludes):
904 """
905 Find the first occurrence of the pattern specified in the Ceph log.
906 Returns None if none found.
907
908 :param pattern: Pattern scanned for.
909 :param excludes: Patterns to ignore.
910 :return: First line of text (or None if not found)
911 """
912 args = [
913 'sudo',
914 'egrep', pattern,
915 '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
916 ]
917 for exclude in excludes:
918 args.extend([run.Raw('|'), 'egrep', '-v', exclude])
919 args.extend([
920 run.Raw('|'), 'head', '-n', '1',
921 ])
922 r = mon0_remote.run(
923 stdout=StringIO(),
924 args=args,
925 )
926 stdout = r.stdout.getvalue()
927 if stdout != '':
928 return stdout
929 return None
930
931 if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
932 config['log_whitelist']) is not None:
933 log.warning('Found errors (ERR|WRN|SEC) in cluster log')
934 ctx.summary['success'] = False
935 # use the most severe problem as the failure reason
936 if 'failure_reason' not in ctx.summary:
937 for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
938 match = first_in_ceph_log(pattern, config['log_whitelist'])
939 if match is not None:
940 ctx.summary['failure_reason'] = \
941 '"{match}" in cluster log'.format(
942 match=match.rstrip('\n'),
943 )
944 break
945
946 for remote, dirs in devs_to_clean.iteritems():
947 for dir_ in dirs:
948 log.info('Unmounting %s on %s' % (dir_, remote))
949 try:
950 remote.run(
951 args=[
952 'sync',
953 run.Raw('&&'),
954 'sudo',
955 'umount',
956 '-f',
957 dir_
958 ]
959 )
960 except Exception as e:
961 remote.run(args=[
962 'sudo',
963 run.Raw('PATH=/usr/sbin:$PATH'),
964 'lsof',
965 run.Raw(';'),
966 'ps', 'auxf',
967 ])
968 raise e
969
970 if config.get('tmpfs_journal'):
971 log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
972 for remote, roles_for_host in osds.remotes.iteritems():
973 remote.run(
974 args=['sudo', 'umount', '-f', '/mnt'],
975 check_status=False,
976 )
977
978 if ctx.archive is not None and \
979 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
980
981 # archive mon data, too
982 log.info('Archiving mon data...')
983 path = os.path.join(ctx.archive, 'data')
984 try:
985 os.makedirs(path)
986 except OSError as e:
987 if e.errno == errno.EEXIST:
988 pass
989 else:
990 raise
991 for remote, roles in mons.remotes.iteritems():
992 for role in roles:
993 is_mon = teuthology.is_type('mon', cluster_name)
994 if is_mon(role):
995 _, _, id_ = teuthology.split_role(role)
996 mon_dir = '/var/lib/ceph/mon/' + \
997 '{0}-{1}'.format(cluster_name, id_)
998 teuthology.pull_directory_tarball(
999 remote,
1000 mon_dir,
1001 path + '/' + role + '.tgz')
1002
1003 log.info('Cleaning ceph cluster...')
1004 run.wait(
1005 ctx.cluster.run(
1006 args=[
1007 'sudo',
1008 'rm',
1009 '-rf',
1010 '--',
1011 conf_path,
1012 keyring_path,
1013 data_dir,
1014 monmap_path,
1015 osdmap_path,
1016 run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
1017 ],
1018 wait=False,
1019 ),
1020 )
1021
1022
1023 def osd_scrub_pgs(ctx, config):
1024 """
1025 Scrub pgs when we exit.
1026
1027 First make sure all pgs are active and clean.
1028 Next scrub all osds.
1029 Then periodically check until all pgs have scrub time stamps that
1030 indicate the last scrub completed. Time out if no progress is made
1031 here after two minutes.
1032 """
1033 retries = 20
1034 delays = 10
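# Descriptive note (not part of the original source): with retries=20 and
# delays=10 the active+clean wait below allows up to roughly 200 seconds
# (20 retries x 10 s sleep), and the scrub-stamp loop gives up after
# 'retries' consecutive checks with no newly scrubbed PGs.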
1035 cluster_name = config['cluster']
1036 manager = ctx.managers[cluster_name]
1037 all_clean = False
1038 for _ in range(0, retries):
1039 stats = manager.get_pg_stats()
1040 bad = [stat['pgid'] for stat in stats if 'active+clean' not in stat['state']]
1041 if not bad:
1042 all_clean = True
1043 break
1044 log.info(
1045 "Waiting for all osds to be active and clean, waiting on %s" % bad)
1046 time.sleep(delays)
1047 if not all_clean:
1048 raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
1049 check_time_now = time.localtime()
1050 time.sleep(1)
1051 all_roles = teuthology.all_roles(ctx.cluster)
1052 for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name):
1053 log.info("Scrubbing {osd}".format(osd=role))
1054 _, _, id_ = teuthology.split_role(role)
1055 # allow this to fail; in certain cases the OSD might not be up
1056 # at this point. we will catch all pgs below.
1057 try:
1058 manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
1059 except run.CommandFailedError:
1060 pass
1061 prev_good = 0
1062 gap_cnt = 0
1063 loop = True
1064 while loop:
1065 stats = manager.get_pg_stats()
1066 timez = [(stat['pgid'],stat['last_scrub_stamp']) for stat in stats]
1067 loop = False
1068 thiscnt = 0
1069 for (pgid, tmval) in timez:
1070 pgtm = time.strptime(tmval[0:tmval.find('.')], '%Y-%m-%d %H:%M:%S')
1071 if pgtm > check_time_now:
1072 thiscnt += 1
1073 else:
1074 log.info('pgid %s last_scrub_stamp %s %s <= %s', pgid, tmval, pgtm, check_time_now)
1075 loop = True
1076 if thiscnt > prev_good:
1077 prev_good = thiscnt
1078 gap_cnt = 0
1079 else:
1080 gap_cnt += 1
1081 if gap_cnt % 6 == 0:
1082 for (pgid, tmval) in timez:
1083 # re-request scrub every so often in case the earlier
1084 # request was missed. do not do it every time because
1085 # the scrub may be in progress or not reported yet and
1086 # we will starve progress.
1087 manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
1088 if gap_cnt > retries:
1089 raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
1090 if loop:
1091 log.info('Still waiting for all pgs to be scrubbed.')
1092 time.sleep(delays)
1093
1094
1095 @contextlib.contextmanager
1096 def run_daemon(ctx, config, type_):
1097 """
1098 Run daemons for a role type. Handle the startup and termination of a daemon.
1099 On startup -- set coverages, cpu_profile, valgrind values for all remotes,
1100 and a max_mds value for one mds.
1101 On cleanup -- Stop all existing daemons of this type.
1102
1103 :param ctx: Context
1104 :param config: Configuration
1105 :param type_: Role type
1106 """
1107 cluster_name = config['cluster']
1108 log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
1109 testdir = teuthology.get_testdir(ctx)
1110 daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name))
1111
1112 # check whether any daemons of this type are configured
1113 if daemons is None:
1114 return
1115 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
1116
1117 daemon_signal = 'kill'
1118 if config.get('coverage') or config.get('valgrind') is not None:
1119 daemon_signal = 'term'
1120
1121 for remote, roles_for_host in daemons.remotes.iteritems():
1122 is_type_ = teuthology.is_type(type_, cluster_name)
1123 for role in roles_for_host:
1124 if not is_type_(role):
1125 continue
1126 _, _, id_ = teuthology.split_role(role)
1127
1128 run_cmd = [
1129 'sudo',
1130 'adjust-ulimits',
1131 'ceph-coverage',
1132 coverage_dir,
1133 'daemon-helper',
1134 daemon_signal,
1135 ]
1136 run_cmd_tail = [
1137 'ceph-%s' % (type_),
1138 '-f',
1139 '--cluster', cluster_name,
1140 '-i', id_]
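# Putting run_cmd and run_cmd_tail together, an OSD daemon typically ends up
# being launched with something like (illustrative; the cpu_profile and
# valgrind handling below may extend this further):
#   sudo adjust-ulimits ceph-coverage <coverage_dir> daemon-helper kill \
#     ceph-osd -f --cluster ceph -i 0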
1141
1142 if type_ in config.get('cpu_profile', []):
1143 profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role)
1144 run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])
1145
1146 if config.get('valgrind') is not None:
1147 valgrind_args = None
1148 if type_ in config['valgrind']:
1149 valgrind_args = config['valgrind'][type_]
1150 if role in config['valgrind']:
1151 valgrind_args = config['valgrind'][role]
1152 run_cmd = teuthology.get_valgrind_args(testdir, role,
1153 run_cmd,
1154 valgrind_args)
1155
1156 run_cmd.extend(run_cmd_tail)
1157
1158 # always register mgr; don't necessarily start
1159 ctx.daemons.register_daemon(
1160 remote, type_, id_,
1161 cluster=cluster_name,
1162 args=run_cmd,
1163 logger=log.getChild(role),
1164 stdin=run.PIPE,
1165 wait=False
1166 )
1167 if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
1168 role = cluster_name + '.' + type_
1169 ctx.daemons.get_daemon(type_, id_, cluster_name).restart()
1170
1171 try:
1172 yield
1173 finally:
1174 teuthology.stop_daemons_of_type(ctx, type_, cluster_name)
1175
1176
1177 def healthy(ctx, config):
1178 """
1179 Wait for all osds to be up, and for ceph health to report HEALTH_OK.
1180
1181 :param ctx: Context
1182 :param config: Configuration
1183 """
1184 config = config if isinstance(config, dict) else dict()
1185 cluster_name = config.get('cluster', 'ceph')
1186 log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
1187 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1188 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1189 teuthology.wait_until_osds_up(
1190 ctx,
1191 cluster=ctx.cluster,
1192 remote=mon0_remote,
1193 ceph_cluster=cluster_name,
1194 )
1195 teuthology.wait_until_healthy(
1196 ctx,
1197 remote=mon0_remote,
1198 ceph_cluster=cluster_name,
1199 )
1200
1201 if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
1202 # Some MDSs exist, wait for them to be healthy
1203 ceph_fs = Filesystem(ctx) # TODO: make Filesystem cluster-aware
1204 ceph_fs.wait_for_daemons(timeout=300)
1205
1206
1207 def wait_for_osds_up(ctx, config):
1208 """
1209 Wait for all osds to come up.
1210
1211 :param ctx: Context
1212 :param config: Configuration
1213 """
1214 log.info('Waiting until ceph osds are all up...')
1215 cluster_name = config.get('cluster', 'ceph')
1216 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1217 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1218 teuthology.wait_until_osds_up(
1219 ctx,
1220 cluster=ctx.cluster,
1221 remote=mon0_remote
1222 )
1223
1224
1225 def wait_for_mon_quorum(ctx, config):
1226 """
1227 Check remote ceph status until the specified monitors are in quorum.
1228
1229 :param ctx: Context
1230 :param config: Configuration
1231 """
1232 if isinstance(config, dict):
1233 mons = config['daemons']
1234 cluster_name = config.get('cluster', 'ceph')
1235 else:
1236 assert isinstance(config, list)
1237 mons = config
1238 cluster_name = 'ceph'
1239 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1240 (remote,) = ctx.cluster.only(firstmon).remotes.keys()
1241 with contextutil.safe_while(sleep=10, tries=60,
1242 action='wait for monitor quorum') as proceed:
1243 while proceed():
1244 r = remote.run(
1245 args=[
1246 'sudo',
1247 'ceph',
1248 'quorum_status',
1249 ],
1250 stdout=StringIO(),
1251 logger=log.getChild('quorum_status'),
1252 )
1253 j = json.loads(r.stdout.getvalue())
1254 q = j.get('quorum_names', [])
1255 log.debug('Quorum: %s', q)
1256 if sorted(q) == sorted(mons):
1257 break
1258
1259
1260 def created_pool(ctx, config):
1261 """
1262 Add new pools to the dictionary of pools that the ceph-manager
1263 knows about.
1264 """
1265 for new_pool in config:
1266 if new_pool not in ctx.managers['ceph'].pools:
1267 ctx.managers['ceph'].pools[new_pool] = ctx.managers['ceph'].get_pool_property(
1268 new_pool, 'pg_num')
1269
1270
1271 @contextlib.contextmanager
1272 def restart(ctx, config):
1273 """
1274 restart ceph daemons
1275
1276 For example::
1277 tasks:
1278 - ceph.restart: [all]
1279
1280 For example::
1281 tasks:
1282 - ceph.restart: [osd.0, mon.1, mds.*]
1283
1284 or::
1285
1286 tasks:
1287 - ceph.restart:
1288 daemons: [osd.0, mon.1]
1289 wait-for-healthy: false
1290 wait-for-osds-up: true
1291
1292 :param ctx: Context
1293 :param config: Configuration
1294 """
1295 if config is None:
1296 config = {}
1297 elif isinstance(config, list):
1298 config = {'daemons': config}
1299
1300 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1301 clusters = set()
1302 for role in daemons:
1303 cluster, type_, id_ = teuthology.split_role(role)
1304 ctx.daemons.get_daemon(type_, id_, cluster).restart()
1305 clusters.add(cluster)
1306
1307 manager = ctx.managers['ceph']
1308 for dmon in daemons:
1309 if '.' in dmon:
1310 dm_parts = dmon.split('.')
1311 if dm_parts[1].isdigit():
1312 if dm_parts[0] == 'osd':
1313 manager.mark_down_osd(int(dm_parts[1]))
1314
1315 if config.get('wait-for-healthy', True):
1316 for cluster in clusters:
1317 healthy(ctx=ctx, config=dict(cluster=cluster))
1318 if config.get('wait-for-osds-up', False):
1319 for cluster in clusters:
1320 wait_for_osds_up(ctx=ctx, config=dict(cluster=cluster))
1321 yield
1322
1323
1324 @contextlib.contextmanager
1325 def stop(ctx, config):
1326 """
1327 Stop ceph daemons
1328
1329 For example::
1330 tasks:
1331 - ceph.stop: [mds.*]
1332
1333 tasks:
1334 - ceph.stop: [osd.0, osd.2]
1335
1336 tasks:
1337 - ceph.stop:
1338 daemons: [osd.0, osd.2]
1339
1340 """
1341 if config is None:
1342 config = {}
1343 elif isinstance(config, list):
1344 config = {'daemons': config}
1345
1346 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1347 for role in daemons:
1348 cluster, type_, id_ = teuthology.split_role(role)
1349 ctx.daemons.get_daemon(type_, id_, cluster).stop()
1350
1351 yield
1352
1353
1354 @contextlib.contextmanager
1355 def wait_for_failure(ctx, config):
1356 """
1357 Wait for a failure of a ceph daemon
1358
1359 For example::
1360 tasks:
1361 - ceph.wait_for_failure: [mds.*]
1362
1363 tasks:
1364 - ceph.wait_for_failure: [osd.0, osd.2]
1365
1366 tasks:
1367 - ceph.wait_for_failure:
1368 daemons: [osd.0, osd.2]
1369
1370 """
1371 if config is None:
1372 config = {}
1373 elif isinstance(config, list):
1374 config = {'daemons': config}
1375
1376 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1377 for role in daemons:
1378 cluster, type_, id_ = teuthology.split_role(role)
1379 try:
1380 ctx.daemons.get_daemon(type_, id_, cluster).wait()
1381 except:
1382 log.info('Saw expected daemon failure. Continuing.')
1383 pass
1384 else:
1385 raise RuntimeError('daemon %s did not fail' % role)
1386
1387 yield
1388
1389
1390 def validate_config(ctx, config):
1391 """
1392 Perform some simple validation on task configuration.
1393 Raises exceptions.ConfigError if an error is found.
1394 """
1395 # check for osds from multiple clusters on the same host
1396 for remote, roles_for_host in ctx.cluster.remotes.items():
1397 last_cluster = None
1398 last_role = None
1399 for role in roles_for_host:
1400 role_cluster, role_type, _ = teuthology.split_role(role)
1401 if role_type != 'osd':
1402 continue
1403 if last_cluster and last_cluster != role_cluster:
1404 msg = "Host should not have osds (%s and %s) from multiple clusters" % (
1405 last_role, role)
1406 raise exceptions.ConfigError(msg)
1407 last_cluster = role_cluster
1408 last_role = role
1409
1410
1411 @contextlib.contextmanager
1412 def task(ctx, config):
1413 """
1414 Set up and tear down a Ceph cluster.
1415
1416 For example::
1417
1418 tasks:
1419 - ceph:
1420 - interactive:
1421
1422 You can also specify what branch to run::
1423
1424 tasks:
1425 - ceph:
1426 branch: foo
1427
1428 Or a tag::
1429
1430 tasks:
1431 - ceph:
1432 tag: v0.42.13
1433
1434 Or a sha1::
1435
1436 tasks:
1437 - ceph:
1438 sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed
1439
1440 Or a local source dir::
1441
1442 tasks:
1443 - ceph:
1444 path: /home/sage/ceph
1445
1446 To capture code coverage data, use::
1447
1448 tasks:
1449 - ceph:
1450 coverage: true
1451
1452 To use btrfs, ext4, or xfs on the target's scratch disks, use::
1453
1454 tasks:
1455 - ceph:
1456 fs: xfs
1457 mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
1458 mount_options: [nobarrier, inode64]
1459
1460 Note that this will cause the task to check the /scratch_devs file on each node
1461 for available devices. If no such file is found, /dev/sdb will be used.
1462
1463 To run some daemons under valgrind, include their names
1464 and the tool/args to use in a valgrind section::
1465
1466 tasks:
1467 - ceph:
1468 valgrind:
1469 mds.1: --tool=memcheck
1470 osd.1: [--tool=memcheck, --leak-check=no]
1471
1472 Those nodes which are using memcheck or valgrind will get
1473 checked for bad results.
1474
1475 To adjust or modify config options, use::
1476
1477 tasks:
1478 - ceph:
1479 conf:
1480 section:
1481 key: value
1482
1483 For example::
1484
1485 tasks:
1486 - ceph:
1487 conf:
1488 mds.0:
1489 some option: value
1490 other key: other value
1491 client.0:
1492 debug client: 10
1493 debug ms: 1
1494
1495 By default, the cluster log is checked for errors and warnings,
1496 and the run marked failed if any appear. You can ignore log
1497 entries by giving a list of egrep-compatible regexes, e.g.:
1498
1499 tasks:
1500 - ceph:
1501 log-whitelist: ['foo.*bar', 'bad message']
1502
1503 To run multiple ceph clusters, use multiple ceph tasks, and roles
1504 with a cluster name prefix, e.g. cluster1.client.0. Roles with no
1505 cluster use the default cluster name, 'ceph'. OSDs from separate
1506 clusters must be on separate hosts. Clients and non-osd daemons
1507 from multiple clusters may be colocated. For each cluster, add an
1508 instance of the ceph task with the cluster name specified, e.g.::
1509
1510 roles:
1511 - [mon.a, osd.0, osd.1]
1512 - [backup.mon.a, backup.osd.0, backup.osd.1]
1513 - [client.0, backup.client.0]
1514 tasks:
1515 - ceph:
1516 cluster: ceph
1517 - ceph:
1518 cluster: backup
1519
1520 :param ctx: Context
1521 :param config: Configuration
1522
1523 """
1524 if config is None:
1525 config = {}
1526 assert isinstance(config, dict), \
1527 "task ceph only supports a dictionary for configuration"
1528
1529 overrides = ctx.config.get('overrides', {})
1530 teuthology.deep_merge(config, overrides.get('ceph', {}))
1531
1532 first_ceph_cluster = False
1533 if not hasattr(ctx, 'daemons'):
1534 first_ceph_cluster = True
1535 ctx.daemons = DaemonGroup()
1536
1537 testdir = teuthology.get_testdir(ctx)
1538 if config.get('coverage'):
1539 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
1540 log.info('Creating coverage directory...')
1541 run.wait(
1542 ctx.cluster.run(
1543 args=[
1544 'install', '-d', '-m0755', '--',
1545 coverage_dir,
1546 ],
1547 wait=False,
1548 )
1549 )
1550
1551 if 'cluster' not in config:
1552 config['cluster'] = 'ceph'
1553
1554 validate_config(ctx, config)
1555
1556 subtasks = []
1557 if first_ceph_cluster:
1558 # these tasks handle general log setup and parsing on all hosts,
1559 # so they should only be run once
1560 subtasks = [
1561 lambda: ceph_log(ctx=ctx, config=None),
1562 lambda: valgrind_post(ctx=ctx, config=config),
1563 ]
1564
1565 subtasks += [
1566 lambda: cluster(ctx=ctx, config=dict(
1567 conf=config.get('conf', {}),
1568 fs=config.get('fs', 'xfs'),
1569 mkfs_options=config.get('mkfs_options', None),
1570 mount_options=config.get('mount_options', None),
1571 block_journal=config.get('block_journal', None),
1572 tmpfs_journal=config.get('tmpfs_journal', None),
1573 skip_mgr_daemons=config.get('skip_mgr_daemons', False),
1574 log_whitelist=config.get('log-whitelist', []),
1575 cpu_profile=set(config.get('cpu_profile', []),),
1576 cluster=config['cluster'],
1577 )),
1578 lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
1579 lambda: run_daemon(ctx=ctx, config=config, type_='mgr'),
1580 lambda: crush_setup(ctx=ctx, config=config),
1581 lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
1582 lambda: cephfs_setup(ctx=ctx, config=config),
1583 lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
1584 ]
1585
1586 with contextutil.nested(*subtasks):
1587 first_mon = teuthology.get_first_mon(ctx, config, config['cluster'])
1588 (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
1589 if not hasattr(ctx, 'managers'):
1590 ctx.managers = {}
1591 ctx.managers[config['cluster']] = CephManager(
1592 mon,
1593 ctx=ctx,
1594 logger=log.getChild('ceph_manager.' + config['cluster']),
1595 cluster=config['cluster'],
1596 )
1597
1598 try:
1599 if config.get('wait-for-healthy', True):
1600 healthy(ctx=ctx, config=dict(cluster=config['cluster']))
1601
1602 yield
1603 finally:
1604 if config.get('wait-for-scrub', True):
1605 osd_scrub_pgs(ctx, config)