1 """
2 Ceph cluster task.
3
4 Handle the setup, starting, and clean-up of a Ceph cluster.
5 """
6 from cStringIO import StringIO
7
8 import argparse
9 import contextlib
10 import errno
11 import logging
12 import os
13 import json
14 import time
15 import gevent
16 import socket
17
18 from paramiko import SSHException
19 from ceph_manager import CephManager, write_conf
20 from tasks.cephfs.filesystem import Filesystem
21 from teuthology import misc as teuthology
22 from teuthology import contextutil
23 from teuthology import exceptions
24 from teuthology.orchestra import run
25 import ceph_client as cclient
26 from teuthology.orchestra.daemon import DaemonGroup
27
28 CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']
29
30 log = logging.getLogger(__name__)
31
32
33 def generate_caps(type_):
34 """
35 Each call yields the '--cap' arguments for each subsystem capability of the
36 given system type (essentially a subset of possible role values). Valid
37 types are osd, mgr, mds and client.
38 """
39 defaults = dict(
40 osd=dict(
41 mon='allow *',
42 mgr='allow *',
43 osd='allow *',
44 ),
45 mgr=dict(
46 mon='allow *',
47 ),
48 mds=dict(
49 mon='allow *',
50 mgr='allow *',
51 osd='allow *',
52 mds='allow',
53 ),
54 client=dict(
55 mon='allow rw',
56 mgr='allow r',
57 osd='allow rwx',
58 mds='allow',
59 ),
60 )
61 for subsystem, capability in defaults[type_].items():
62 yield '--cap'
63 yield subsystem
64 yield capability
65
66
67 @contextlib.contextmanager
68 def ceph_log(ctx, config):
69 """
70 Make the /var/log/ceph log directory writeable by everyone.
71 Add valgrind and profiling-logger directories.
72
73 :param ctx: Context
74 :param config: Configuration
75 """
76 log.info('Making ceph log dir writeable by non-root...')
77 run.wait(
78 ctx.cluster.run(
79 args=[
80 'sudo',
81 'chmod',
82 '777',
83 '/var/log/ceph',
84 ],
85 wait=False,
86 )
87 )
88 log.info('Disabling ceph logrotate...')
89 run.wait(
90 ctx.cluster.run(
91 args=[
92 'sudo',
93 'rm', '-f', '--',
94 '/etc/logrotate.d/ceph',
95 ],
96 wait=False,
97 )
98 )
99 log.info('Creating extra log directories...')
100 run.wait(
101 ctx.cluster.run(
102 args=[
103 'sudo',
104 'install', '-d', '-m0777', '--',
105 '/var/log/ceph/valgrind',
106 '/var/log/ceph/profiling-logger',
107 ],
108 wait=False,
109 )
110 )
111
112 class Rotater(object):
113 stop_event = gevent.event.Event()
114
115 def invoke_logrotate(self):
116 # 1) install ceph-test.conf in /etc/logrotate.d
117 # 2) continuously loop over logrotate invocation with ceph-test.conf
118 while not self.stop_event.is_set():
119 self.stop_event.wait(timeout=30)
120 try:
121 run.wait(
122 ctx.cluster.run(
123 args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'
124 ],
125 wait=False,
126 )
127 )
128 except exceptions.ConnectionLostError as e:
129 # Some tests may power off nodes during test, in which
130 # case we will see connection errors that we should ignore.
131 log.debug("Missed logrotate, node '{0}' is offline".format(
132 e.node))
133 except EOFError as e:
134 # Paramiko sometimes raises this when it fails to
135 # connect to a node during open_session. As with
136 # ConnectionLostError, we ignore this because nodes
137 # are allowed to get power cycled during tests.
138 log.debug("Missed logrotate, EOFError")
139 except SSHException as e:
140 log.debug("Missed logrotate, SSHException")
141 except socket.error as e:
142 if e.errno == errno.EHOSTUNREACH:
143 log.debug("Missed logrotate, host unreachable")
144 else:
145 raise
146
147 def begin(self):
148 self.thread = gevent.spawn(self.invoke_logrotate)
149
150 def end(self):
151 self.stop_event.set()
152 self.thread.get()
153
154 def write_rotate_conf(ctx, daemons):
155 testdir = teuthology.get_testdir(ctx)
156 rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf')
157 with file(rotate_conf_path, 'rb') as f:
158 conf = ""
159 for daemon, size in daemons.iteritems():
160 log.info('writing logrotate stanza for {daemon}'.format(daemon=daemon))
161 conf += f.read().format(daemon_type=daemon, max_size=size)
162 f.seek(0, 0)
163
164 for remote in ctx.cluster.remotes.iterkeys():
165 teuthology.write_file(remote=remote,
166 path='{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
167 data=StringIO(conf)
168 )
169 remote.run(
170 args=[
171 'sudo',
172 'mv',
173 '{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
174 '/etc/logrotate.d/ceph-test.conf',
175 run.Raw('&&'),
176 'sudo',
177 'chmod',
178 '0644',
179 '/etc/logrotate.d/ceph-test.conf',
180 run.Raw('&&'),
181 'sudo',
182 'chown',
183 'root.root',
184 '/etc/logrotate.d/ceph-test.conf'
185 ]
186 )
187 remote.chcon('/etc/logrotate.d/ceph-test.conf',
188 'system_u:object_r:etc_t:s0')
189
190 if ctx.config.get('log-rotate'):
191 daemons = ctx.config.get('log-rotate')
192 log.info('Setting up log rotation with ' + str(daemons))
193 write_rotate_conf(ctx, daemons)
194 logrotater = Rotater()
195 logrotater.begin()
196 try:
197 yield
198
199 finally:
200 if ctx.config.get('log-rotate'):
201 log.info('Shutting down logrotate')
202 logrotater.end()
203 ctx.cluster.run(
204 args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'
205 ]
206 )
207 if ctx.archive is not None and \
208 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
209 # and logs
210 log.info('Compressing logs...')
211 run.wait(
212 ctx.cluster.run(
213 args=[
214 'sudo',
215 'find',
216 '/var/log/ceph',
217 '-name',
218 '*.log',
219 '-print0',
220 run.Raw('|'),
221 'sudo',
222 'xargs',
223 '-0',
224 '--no-run-if-empty',
225 '--',
226 'gzip',
227 '--',
228 ],
229 wait=False,
230 ),
231 )
232
233 log.info('Archiving logs...')
234 path = os.path.join(ctx.archive, 'remote')
235 os.makedirs(path)
236 for remote in ctx.cluster.remotes.iterkeys():
237 sub = os.path.join(path, remote.shortname)
238 os.makedirs(sub)
239 teuthology.pull_directory(remote, '/var/log/ceph',
240 os.path.join(sub, 'log'))
241
242
243 def assign_devs(roles, devs):
244 """
245 Create a dictionary of devs indexed by roles
246
247 :param roles: List of roles
248 :param devs: Corresponding list of devices.
249 :returns: Dictionary of devs indexed by roles.
250 """
251 return dict(zip(roles, devs))
252
253
254 @contextlib.contextmanager
255 def valgrind_post(ctx, config):
256 """
257 After the tests run, look through all the valgrind logs. Exceptions are raised
258 if textual errors occurred in the logs, or if valgrind exceptions were detected in
259 the logs.
260
261 :param ctx: Context
262 :param config: Configuration
263 """
264 try:
265 yield
266 finally:
267 lookup_procs = list()
268 log.info('Checking for errors in any valgrind logs...')
269 for remote in ctx.cluster.remotes.iterkeys():
270 # look at valgrind logs for each node
271 proc = remote.run(
272 args=[
273 'sudo',
274 'zgrep',
275 '<kind>',
276 run.Raw('/var/log/ceph/valgrind/*'),
277 '/dev/null', # include a second file so that we always get a filename prefix on the output
278 run.Raw('|'),
279 'sort',
280 run.Raw('|'),
281 'uniq',
282 ],
283 wait=False,
284 check_status=False,
285 stdout=StringIO(),
286 )
287 lookup_procs.append((proc, remote))
288
289 valgrind_exception = None
290 for (proc, remote) in lookup_procs:
291 proc.wait()
292 out = proc.stdout.getvalue()
293 for line in out.split('\n'):
294 if line == '':
295 continue
296 try:
297 (file, kind) = line.split(':')
298 except Exception:
299 log.error('failed to split line %s', line)
300 raise
301 log.debug('file %s kind %s', file, kind)
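# skip 'Lost' (leak) reports coming from MDS daemons; any other valgrind finding is recorded and raised below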
302 if (file.find('mds') >= 0) and kind.find('Lost') > 0:
303 continue
304 log.error('saw valgrind issue %s in %s', kind, file)
305 valgrind_exception = Exception('saw valgrind issues')
306
307 if config.get('expect_valgrind_errors'):
308 if not valgrind_exception:
309 raise Exception('expected valgrind issues and found none')
310 else:
311 if valgrind_exception:
312 raise valgrind_exception
313
314
315 @contextlib.contextmanager
316 def crush_setup(ctx, config):
317 cluster_name = config['cluster']
318 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
319 (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
320
321 profile = config.get('crush_tunables', 'default')
322 log.info('Setting crush tunables to %s', profile)
323 mon_remote.run(
324 args=['sudo', 'ceph', '--cluster', cluster_name,
325 'osd', 'crush', 'tunables', profile])
326 yield
327
328
329 @contextlib.contextmanager
330 def create_rbd_pool(ctx, config):
331 cluster_name = config['cluster']
332 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
333 (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
334 log.info('Waiting for OSDs to come up')
335 teuthology.wait_until_osds_up(
336 ctx,
337 cluster=ctx.cluster,
338 remote=mon_remote,
339 ceph_cluster=cluster_name,
340 )
341 log.info('Creating RBD pool')
342 mon_remote.run(
343 args=['sudo', 'ceph', '--cluster', cluster_name,
344 'osd', 'pool', 'create', 'rbd', '8'])
345 yield
346
347 @contextlib.contextmanager
348 def cephfs_setup(ctx, config):
349 cluster_name = config['cluster']
350 testdir = teuthology.get_testdir(ctx)
351 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
352
353 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
354 (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
355 mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
356 # If there are any MDSs, then create a filesystem for them to use
357 # Do this last because requires mon cluster to be up and running
358 if mdss.remotes:
359 log.info('Setting up CephFS filesystem...')
360
361 fs = Filesystem(ctx, create='cephfs')
362
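# roles with an '-s' suffix (or '-s-' infix) are standby MDS daemons; only active MDSs count toward max_mds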
363 is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
364 all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
365 num_active = len([r for r in all_roles if is_active_mds(r)])
366
367 fs.set_max_mds(num_active)
368 fs.set_allow_dirfrags(True)
369
370 yield
371
372
373 @contextlib.contextmanager
374 def cluster(ctx, config):
375 """
376 Handle the creation and removal of a ceph cluster.
377
378 On startup:
379 Create directories needed for the cluster.
380 Create remote journals for all osds.
381 Create and set keyring.
382 Copy the monmap to the test systems.
383 Setup mon nodes.
384 Setup mds nodes.
385 Mkfs osd nodes.
386 Add keyring information to monmaps
387 Mkfs mon nodes.
388
389 On exit:
390 If errors occurred, extract a failure message and store in ctx.summary.
391 Unmount all test files and temporary journaling files.
392 Save the monitor information and archive all ceph logs.
393 Cleanup the keyring setup, and remove all monitor map and data files left over.
394
395 :param ctx: Context
396 :param config: Configuration
397 """
398 if ctx.config.get('use_existing_cluster', False) is True:
399 log.info("'use_existing_cluster' is true; skipping cluster creation")
400 yield
401
402 testdir = teuthology.get_testdir(ctx)
403 cluster_name = config['cluster']
404 data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
405 log.info('Creating ceph cluster %s...', cluster_name)
406 run.wait(
407 ctx.cluster.run(
408 args=[
409 'install', '-d', '-m0755', '--',
410 data_dir,
411 ],
412 wait=False,
413 )
414 )
415
416 run.wait(
417 ctx.cluster.run(
418 args=[
419 'sudo',
420 'install', '-d', '-m0777', '--', '/var/run/ceph',
421 ],
422 wait=False,
423 )
424 )
425
426 devs_to_clean = {}
427 remote_to_roles_to_devs = {}
428 remote_to_roles_to_journals = {}
429 osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
430 for remote, roles_for_host in osds.remotes.iteritems():
431 devs = teuthology.get_scratch_devices(remote)
432 roles_to_devs = {}
433 roles_to_journals = {}
434 if config.get('fs'):
435 log.info('fs option selected, checking for scratch devs')
436 log.info('found devs: %s' % (str(devs),))
437 devs_id_map = teuthology.get_wwn_id_map(remote, devs)
438 iddevs = devs_id_map.values()
439 roles_to_devs = assign_devs(
440 teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
441 )
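# scratch devices not consumed by OSD data remain in iddevs and may be reused as block journals below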
442 if len(roles_to_devs) < len(iddevs):
443 iddevs = iddevs[len(roles_to_devs):]
444 devs_to_clean[remote] = []
445
446 if config.get('block_journal'):
447 log.info('block journal enabled')
448 roles_to_journals = assign_devs(
449 teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
450 )
451 log.info('journal map: %s', roles_to_journals)
452
453 if config.get('tmpfs_journal'):
454 log.info('tmpfs journal enabled')
455 roles_to_journals = {}
456 remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
457 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
458 tmpfs = '/mnt/' + role
459 roles_to_journals[role] = tmpfs
460 remote.run(args=['truncate', '-s', '1500M', tmpfs])
461 log.info('journal map: %s', roles_to_journals)
462
463 log.info('dev map: %s' % (str(roles_to_devs),))
464 remote_to_roles_to_devs[remote] = roles_to_devs
465 remote_to_roles_to_journals[remote] = roles_to_journals
466
467 log.info('Generating config...')
468 remotes_and_roles = ctx.cluster.remotes.items()
469 roles = [role_list for (remote, role_list) in remotes_and_roles]
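# each remote's IP is taken from its SSH connection and passed to skeleton_config for the generated ceph.conf (monitor addresses)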
470 ips = [host for (host, port) in
471 (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
472 conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips, cluster=cluster_name)
473 for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
474 for role, journal in roles_to_journals.iteritems():
475 name = teuthology.ceph_role(role)
476 if name not in conf:
477 conf[name] = {}
478 conf[name]['osd journal'] = journal
479 for section, keys in config['conf'].iteritems():
480 for key, value in keys.iteritems():
481 log.info("[%s] %s = %s" % (section, key, value))
482 if section not in conf:
483 conf[section] = {}
484 conf[section][key] = value
485
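# tmpfs does not support O_DIRECT, so journal dio must be disabled for tmpfs-backed journals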
486 if config.get('tmpfs_journal'):
487 conf['journal dio'] = False
488
489 if not hasattr(ctx, 'ceph'):
490 ctx.ceph = {}
491 ctx.ceph[cluster_name] = argparse.Namespace()
492 ctx.ceph[cluster_name].conf = conf
493
494 default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
495 keyring_path = config.get('keyring_path', default_keyring)
496
497 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
498
499 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
500
501 log.info('Setting up %s...' % firstmon)
502 ctx.cluster.only(firstmon).run(
503 args=[
504 'sudo',
505 'adjust-ulimits',
506 'ceph-coverage',
507 coverage_dir,
508 'ceph-authtool',
509 '--create-keyring',
510 keyring_path,
511 ],
512 )
513 ctx.cluster.only(firstmon).run(
514 args=[
515 'sudo',
516 'adjust-ulimits',
517 'ceph-coverage',
518 coverage_dir,
519 'ceph-authtool',
520 '--gen-key',
521 '--name=mon.',
522 keyring_path,
523 ],
524 )
525 ctx.cluster.only(firstmon).run(
526 args=[
527 'sudo',
528 'chmod',
529 '0644',
530 keyring_path,
531 ],
532 )
533 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
534 monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
535 cluster=cluster_name)
536 fsid = teuthology.create_simple_monmap(
537 ctx,
538 remote=mon0_remote,
539 conf=conf,
540 path=monmap_path,
541 )
542 if 'global' not in conf:
543 conf['global'] = {}
544 conf['global']['fsid'] = fsid
545
546 default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
547 conf_path = config.get('conf_path', default_conf_path)
548 log.info('Writing %s for FSID %s...' % (conf_path, fsid))
549 write_conf(ctx, conf_path, cluster_name)
550
551 log.info('Creating admin key on %s...' % firstmon)
552 ctx.cluster.only(firstmon).run(
553 args=[
554 'sudo',
555 'adjust-ulimits',
556 'ceph-coverage',
557 coverage_dir,
558 'ceph-authtool',
559 '--gen-key',
560 '--name=client.admin',
561 '--set-uid=0',
562 '--cap', 'mon', 'allow *',
563 '--cap', 'osd', 'allow *',
564 '--cap', 'mds', 'allow *',
565 '--cap', 'mgr', 'allow *',
566 keyring_path,
567 ],
568 )
569
570 log.info('Copying monmap to all nodes...')
571 keyring = teuthology.get_file(
572 remote=mon0_remote,
573 path=keyring_path,
574 )
575 monmap = teuthology.get_file(
576 remote=mon0_remote,
577 path=monmap_path,
578 )
579
580 for rem in ctx.cluster.remotes.iterkeys():
581 # copy mon key and initial monmap
582 log.info('Sending monmap to node {remote}'.format(remote=rem))
583 teuthology.sudo_write_file(
584 remote=rem,
585 path=keyring_path,
586 data=keyring,
587 perms='0644'
588 )
589 teuthology.write_file(
590 remote=rem,
591 path=monmap_path,
592 data=monmap,
593 )
594
595 log.info('Setting up mon nodes...')
596 mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))
597
598 if not config.get('skip_mgr_daemons', False):
599 log.info('Setting up mgr nodes...')
600 mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name))
601 for remote, roles_for_host in mgrs.remotes.iteritems():
602 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr',
603 cluster_name):
604 _, _, id_ = teuthology.split_role(role)
605 mgr_dir = '/var/lib/ceph/mgr/{cluster}-{id}'.format(
606 cluster=cluster_name,
607 id=id_,
608 )
609 remote.run(
610 args=[
611 'sudo',
612 'mkdir',
613 '-p',
614 mgr_dir,
615 run.Raw('&&'),
616 'sudo',
617 'adjust-ulimits',
618 'ceph-coverage',
619 coverage_dir,
620 'ceph-authtool',
621 '--create-keyring',
622 '--gen-key',
623 '--name=mgr.{id}'.format(id=id_),
624 mgr_dir + '/keyring',
625 ],
626 )
627
628 log.info('Setting up mds nodes...')
629 mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
630 for remote, roles_for_host in mdss.remotes.iteritems():
631 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
632 cluster_name):
633 _, _, id_ = teuthology.split_role(role)
634 mds_dir = '/var/lib/ceph/mds/{cluster}-{id}'.format(
635 cluster=cluster_name,
636 id=id_,
637 )
638 remote.run(
639 args=[
640 'sudo',
641 'mkdir',
642 '-p',
643 mds_dir,
644 run.Raw('&&'),
645 'sudo',
646 'adjust-ulimits',
647 'ceph-coverage',
648 coverage_dir,
649 'ceph-authtool',
650 '--create-keyring',
651 '--gen-key',
652 '--name=mds.{id}'.format(id=id_),
653 mds_dir + '/keyring',
654 ],
655 )
656
657 cclient.create_keyring(ctx, cluster_name)
658 log.info('Running mkfs on osd nodes...')
659
660 if not hasattr(ctx, 'disk_config'):
661 ctx.disk_config = argparse.Namespace()
662 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
663 ctx.disk_config.remote_to_roles_to_dev = {}
664 if not hasattr(ctx.disk_config, 'remote_to_roles_to_journals'):
665 ctx.disk_config.remote_to_roles_to_journals = {}
666 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
667 ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
668 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
669 ctx.disk_config.remote_to_roles_to_dev_fstype = {}
670
671 teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)
672 teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals, remote_to_roles_to_journals)
673
674 log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
675 for remote, roles_for_host in osds.remotes.iteritems():
676 roles_to_devs = remote_to_roles_to_devs[remote]
677 roles_to_journals = remote_to_roles_to_journals[remote]
678
679 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
680 _, _, id_ = teuthology.split_role(role)
681 mnt_point = '/var/lib/ceph/osd/{cluster}-{id}'.format(cluster=cluster_name, id=id_)
682 remote.run(
683 args=[
684 'sudo',
685 'mkdir',
686 '-p',
687 mnt_point,
688 ])
689 log.info(str(roles_to_journals))
690 log.info(role)
691 if roles_to_devs.get(role):
692 dev = roles_to_devs[role]
693 fs = config.get('fs')
694 package = None
695 mkfs_options = config.get('mkfs_options')
696 mount_options = config.get('mount_options')
697 if fs == 'btrfs':
698 # package = 'btrfs-tools'
699 if mount_options is None:
700 mount_options = ['noatime', 'user_subvol_rm_allowed']
701 if mkfs_options is None:
702 mkfs_options = ['-m', 'single',
703 '-l', '32768',
704 '-n', '32768']
705 if fs == 'xfs':
706 # package = 'xfsprogs'
707 if mount_options is None:
708 mount_options = ['noatime']
709 if mkfs_options is None:
710 mkfs_options = ['-f', '-i', 'size=2048']
711 if fs == 'ext4' or fs == 'ext3':
712 if mount_options is None:
713 mount_options = ['noatime', 'user_xattr']
714
715 if mount_options is None:
716 mount_options = []
717 if mkfs_options is None:
718 mkfs_options = []
719 mkfs = ['mkfs.%s' % fs] + mkfs_options
720 log.info('%s on %s on %s' % (mkfs, dev, remote))
721 if package is not None:
722 remote.run(
723 args=[
724 'sudo',
725 'apt-get', 'install', '-y', package
726 ],
727 stdout=StringIO(),
728 )
729
730 try:
731 remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
732 except run.CommandFailedError:
733 # Newer btrfs-tools doesn't prompt for overwrite, use -f
734 if '-f' not in mkfs_options:
735 mkfs_options.append('-f')
736 mkfs = ['mkfs.%s' % fs] + mkfs_options
737 log.info('%s on %s on %s' % (mkfs, dev, remote))
738 remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
739
740 log.info('mount %s on %s -o %s' % (dev, remote,
741 ','.join(mount_options)))
742 remote.run(
743 args=[
744 'sudo',
745 'mount',
746 '-t', fs,
747 '-o', ','.join(mount_options),
748 dev,
749 mnt_point,
750 ]
751 )
752 remote.run(
753 args=[
754 'sudo', '/sbin/restorecon', mnt_point,
755 ],
756 check_status=False,
757 )
758 if remote not in ctx.disk_config.remote_to_roles_to_dev_mount_options:
759 ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
760 ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
761 if remote not in ctx.disk_config.remote_to_roles_to_dev_fstype:
762 ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
763 ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
764 devs_to_clean[remote].append(mnt_point)
765
766 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
767 _, _, id_ = teuthology.split_role(role)
768 remote.run(
769 args=[
770 'sudo',
771 'MALLOC_CHECK_=3',
772 'adjust-ulimits',
773 'ceph-coverage',
774 coverage_dir,
775 'ceph-osd',
776 '--cluster',
777 cluster_name,
778 '--mkfs',
779 '--mkkey',
780 '-i', id_,
781 '--monmap', monmap_path,
782 ],
783 )
784
785 log.info('Reading keys from all nodes...')
786 keys_fp = StringIO()
787 keys = []
788 for remote, roles_for_host in ctx.cluster.remotes.iteritems():
789 for type_ in ['mgr', 'mds', 'osd']:
790 if type_ == 'mgr' and config.get('skip_mgr_daemons', False):
791 continue
792 for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
793 _, _, id_ = teuthology.split_role(role)
794 data = teuthology.get_file(
795 remote=remote,
796 path='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
797 type=type_,
798 id=id_,
799 cluster=cluster_name,
800 ),
801 sudo=True,
802 )
803 keys.append((type_, id_, data))
804 keys_fp.write(data)
805 for remote, roles_for_host in ctx.cluster.remotes.iteritems():
806 for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', cluster_name):
807 _, _, id_ = teuthology.split_role(role)
808 data = teuthology.get_file(
809 remote=remote,
810 path='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_, cluster=cluster_name)
811 )
812 keys.append(('client', id_, data))
813 keys_fp.write(data)
814
815 log.info('Adding keys to all mons...')
816 writes = mons.run(
817 args=[
818 'sudo', 'tee', '-a',
819 keyring_path,
820 ],
821 stdin=run.PIPE,
822 wait=False,
823 stdout=StringIO(),
824 )
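# feed the concatenated keyring data into each mon's 'tee -a' stdin so every monitor's keyring ends up with all daemon and client keys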
825 keys_fp.seek(0)
826 teuthology.feed_many_stdins_and_close(keys_fp, writes)
827 run.wait(writes)
828 for type_, id_, data in keys:
829 run.wait(
830 mons.run(
831 args=[
832 'sudo',
833 'adjust-ulimits',
834 'ceph-coverage',
835 coverage_dir,
836 'ceph-authtool',
837 keyring_path,
838 '--name={type}.{id}'.format(
839 type=type_,
840 id=id_,
841 ),
842 ] + list(generate_caps(type_)),
843 wait=False,
844 ),
845 )
846
847 log.info('Running mkfs on mon nodes...')
848 for remote, roles_for_host in mons.remotes.iteritems():
849 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon', cluster_name):
850 _, _, id_ = teuthology.split_role(role)
851 remote.run(
852 args=[
853 'sudo',
854 'mkdir',
855 '-p',
856 '/var/lib/ceph/mon/{cluster}-{id}'.format(id=id_, cluster=cluster_name),
857 ],
858 )
859 remote.run(
860 args=[
861 'sudo',
862 'adjust-ulimits',
863 'ceph-coverage',
864 coverage_dir,
865 'ceph-mon',
866 '--cluster', cluster_name,
867 '--mkfs',
868 '-i', id_,
869 '--monmap', monmap_path,
870 '--keyring', keyring_path,
871 ],
872 )
873
874 run.wait(
875 mons.run(
876 args=[
877 'rm',
878 '--',
879 monmap_path,
880 ],
881 wait=False,
882 ),
883 )
884
885 try:
886 yield
887 except Exception:
888 # we need to know this below
889 ctx.summary['success'] = False
890 raise
891 finally:
892 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
893
894 log.info('Checking cluster log for badness...')
895
896 def first_in_ceph_log(pattern, excludes):
897 """
898 Find the first occurence of the pattern specified in the Ceph log,
899 Returns None if none found.
900
901 :param pattern: Pattern scanned for.
902 :param excludes: Patterns to ignore.
903 :return: First line of text (or None if not found)
904 """
905 args = [
906 'sudo',
907 'egrep', pattern,
908 '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
909 ]
910 for exclude in excludes:
911 args.extend([run.Raw('|'), 'egrep', '-v', exclude])
912 args.extend([
913 run.Raw('|'), 'head', '-n', '1',
914 ])
915 r = mon0_remote.run(
916 stdout=StringIO(),
917 args=args,
918 )
919 stdout = r.stdout.getvalue()
920 if stdout != '':
921 return stdout
922 return None
923
924 if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
925 config['log_whitelist']) is not None:
926 log.warning('Found errors (ERR|WRN|SEC) in cluster log')
927 ctx.summary['success'] = False
928 # use the most severe problem as the failure reason
929 if 'failure_reason' not in ctx.summary:
930 for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
931 match = first_in_ceph_log(pattern, config['log_whitelist'])
932 if match is not None:
933 ctx.summary['failure_reason'] = \
934 '"{match}" in cluster log'.format(
935 match=match.rstrip('\n'),
936 )
937 break
938
939 for remote, dirs in devs_to_clean.iteritems():
940 for dir_ in dirs:
941 log.info('Unmounting %s on %s' % (dir_, remote))
942 try:
943 remote.run(
944 args=[
945 'sync',
946 run.Raw('&&'),
947 'sudo',
948 'umount',
949 '-f',
950 dir_
951 ]
952 )
953 except Exception as e:
954 remote.run(args=[
955 'sudo',
956 run.Raw('PATH=/usr/sbin:$PATH'),
957 'lsof',
958 run.Raw(';'),
959 'ps', 'auxf',
960 ])
961 raise e
962
963 if config.get('tmpfs_journal'):
964 log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
965 for remote, roles_for_host in osds.remotes.iteritems():
966 remote.run(
967 args=['sudo', 'umount', '-f', '/mnt'],
968 check_status=False,
969 )
970
971 if ctx.archive is not None and \
972 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
973
974 # archive mon data, too
975 log.info('Archiving mon data...')
976 path = os.path.join(ctx.archive, 'data')
977 try:
978 os.makedirs(path)
979 except OSError as e:
980 if e.errno == errno.EEXIST:
981 pass
982 else:
983 raise
984 for remote, roles in mons.remotes.iteritems():
985 for role in roles:
986 is_mon = teuthology.is_type('mon', cluster_name)
987 if is_mon(role):
988 _, _, id_ = teuthology.split_role(role)
989 mon_dir = '/var/lib/ceph/mon/' + \
990 '{0}-{1}'.format(cluster_name, id_)
991 teuthology.pull_directory_tarball(
992 remote,
993 mon_dir,
994 path + '/' + role + '.tgz')
995
996 log.info('Cleaning ceph cluster...')
997 run.wait(
998 ctx.cluster.run(
999 args=[
1000 'sudo',
1001 'rm',
1002 '-rf',
1003 '--',
1004 conf_path,
1005 keyring_path,
1006 data_dir,
1007 monmap_path,
1008 run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
1009 ],
1010 wait=False,
1011 ),
1012 )
1013
1014
1015 def osd_scrub_pgs(ctx, config):
1016 """
1017 Scrub pgs when we exit.
1018
1019 First make sure all pgs are active and clean.
1020 Next scrub all osds.
1021 Then periodically check until all pgs have scrub time stamps that
1022 indicate the last scrub completed. Time out if no progress is made
1023 here after two minutes.
1024 """
1025 retries = 20
1026 delays = 10
1027 cluster_name = config['cluster']
1028 manager = ctx.managers[cluster_name]
1029 all_clean = False
1030 for _ in range(0, retries):
1031 stats = manager.get_pg_stats()
1032 bad = [stat['pgid'] for stat in stats if 'active+clean' not in stat['state']]
1033 if not bad:
1034 all_clean = True
1035 break
1036 log.info(
1037 "Waiting for all PGs to be active and clean, waiting on %s" % bad)
1038 time.sleep(delays)
1039 if not all_clean:
1040 raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
1041 check_time_now = time.localtime()
1042 time.sleep(1)
1043 all_roles = teuthology.all_roles(ctx.cluster)
1044 for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name):
1045 log.info("Scrubbing {osd}".format(osd=role))
1046 _, _, id_ = teuthology.split_role(role)
1047 # allow this to fail; in certain cases the OSD might not be up
1048 # at this point. we will catch all pgs below.
1049 try:
1050 manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
1051 except run.CommandFailedError:
1052 pass
1053 prev_good = 0
1054 gap_cnt = 0
1055 loop = True
1056 while loop:
1057 stats = manager.get_pg_stats()
1058 timez = [(stat['pgid'],stat['last_scrub_stamp']) for stat in stats]
1059 loop = False
1060 thiscnt = 0
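# a PG counts as scrubbed once its last_scrub_stamp is newer than check_time_now (recorded before the scrubs were requested)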
1061 for (pgid, tmval) in timez:
1062 pgtm = time.strptime(tmval[0:tmval.find('.')], '%Y-%m-%d %H:%M:%S')
1063 if pgtm > check_time_now:
1064 thiscnt += 1
1065 else:
1066 log.info('pgid %s last_scrub_stamp %s %s <= %s', pgid, tmval, pgtm, check_time_now)
1067 loop = True
1068 if thiscnt > prev_good:
1069 prev_good = thiscnt
1070 gap_cnt = 0
1071 else:
1072 gap_cnt += 1
1073 if gap_cnt % 6 == 0:
1074 for (pgid, tmval) in timez:
1075 # re-request scrub every so often in case the earlier
1076 # request was missed. Do not do it every time because
1077 # the scrub may be in progress or not reported yet and
1078 # we will starve progress.
1079 manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
1080 if gap_cnt > retries:
1081 raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
1082 if loop:
1083 log.info('Still waiting for all pgs to be scrubbed.')
1084 time.sleep(delays)
1085
1086
1087 @contextlib.contextmanager
1088 def run_daemon(ctx, config, type_):
1089 """
1090 Run daemons for a role type. Handle the startup and termination of a daemon.
1091 On startup -- set coverages, cpu_profile, valgrind values for all remotes,
1092 and a max_mds value for one mds.
1093 On cleanup -- Stop all existing daemons of this type.
1094
1095 :param ctx: Context
1096 :param config: Configuration
1097 :param type_: Role type
1098 """
1099 cluster_name = config['cluster']
1100 log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
1101 testdir = teuthology.get_testdir(ctx)
1102 daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name))
1103
1104 # check whether any daemons of this type are configured
1105 if daemons is None:
1106 return
1107 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
1108
1109 daemon_signal = 'kill'
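# when coverage or valgrind is in use, stop daemons with SIGTERM so they can exit cleanly and flush their output; otherwise SIGKILL is used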
1110 if config.get('coverage') or config.get('valgrind') is not None:
1111 daemon_signal = 'term'
1112
1113 # create osds in order. (this only matters for pre-luminous, which might
1114 # be hammer, which doesn't take an id_ argument to legacy 'osd create').
1115 osd_uuids = {}
1116 for remote, roles_for_host in daemons.remotes.iteritems():
1117 is_type_ = teuthology.is_type(type_, cluster_name)
1118 for role in roles_for_host:
1119 if not is_type_(role):
1120 continue
1121 _, _, id_ = teuthology.split_role(role)
1122
1123
1124 if type_ == 'osd':
1125 datadir='/var/lib/ceph/osd/{cluster}-{id}'.format(
1126 cluster=cluster_name, id=id_)
1127 osd_uuid = teuthology.get_file(
1128 remote=remote,
1129 path=datadir + '/fsid',
1130 sudo=True,
1131 ).strip()
1132 osd_uuids[id_] = osd_uuid
1133 for osd_id in range(len(osd_uuids)):
1134 id_ = str(osd_id)
1135 osd_uuid = osd_uuids.get(id_)
1136 try:
1137 remote.run(
1138 args=[
1139 'sudo', 'ceph', '--cluster', cluster_name,
1140 'osd', 'new', osd_uuid, id_,
1141 ]
1142 )
1143 except:
1144 # fallback to pre-luminous (hammer or jewel)
1145 remote.run(
1146 args=[
1147 'sudo', 'ceph', '--cluster', cluster_name,
1148 'osd', 'create', osd_uuid,
1149 ]
1150 )
1151 if config.get('add_osds_to_crush'):
1152 remote.run(
1153 args=[
1154 'sudo', 'ceph', '--cluster', cluster_name,
1155 'osd', 'crush', 'create-or-move', 'osd.' + id_,
1156 '1.0', 'host=localhost', 'root=default',
1157 ]
1158 )
1159
1160 for remote, roles_for_host in daemons.remotes.iteritems():
1161 is_type_ = teuthology.is_type(type_, cluster_name)
1162 for role in roles_for_host:
1163 if not is_type_(role):
1164 continue
1165 _, _, id_ = teuthology.split_role(role)
1166
1167 run_cmd = [
1168 'sudo',
1169 'adjust-ulimits',
1170 'ceph-coverage',
1171 coverage_dir,
1172 'daemon-helper',
1173 daemon_signal,
1174 ]
1175 run_cmd_tail = [
1176 'ceph-%s' % (type_),
1177 '-f',
1178 '--cluster', cluster_name,
1179 '-i', id_]
1180
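# CPUPROFILE is the gperftools CPU profiler's output path; setting it enables profiling for this daemon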
1181 if type_ in config.get('cpu_profile', []):
1182 profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role)
1183 run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])
1184
1185 if config.get('valgrind') is not None:
1186 valgrind_args = None
1187 if type_ in config['valgrind']:
1188 valgrind_args = config['valgrind'][type_]
1189 if role in config['valgrind']:
1190 valgrind_args = config['valgrind'][role]
1191 run_cmd = teuthology.get_valgrind_args(testdir, role,
1192 run_cmd,
1193 valgrind_args)
1194
1195 run_cmd.extend(run_cmd_tail)
1196
1197 # always register mgr; don't necessarily start
1198 ctx.daemons.register_daemon(
1199 remote, type_, id_,
1200 cluster=cluster_name,
1201 args=run_cmd,
1202 logger=log.getChild(role),
1203 stdin=run.PIPE,
1204 wait=False
1205 )
1206 if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
1207 role = cluster_name + '.' + type_
1208 ctx.daemons.get_daemon(type_, id_, cluster_name).restart()
1209
1210 try:
1211 yield
1212 finally:
1213 teuthology.stop_daemons_of_type(ctx, type_, cluster_name)
1214
1215
1216 def healthy(ctx, config):
1217 """
1218 Wait for all OSDs to be up, and for the ceph health monitor to return HEALTH_OK.
1219
1220 :param ctx: Context
1221 :param config: Configuration
1222 """
1223 config = config if isinstance(config, dict) else dict()
1224 cluster_name = config.get('cluster', 'ceph')
1225 log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
1226 manager = ctx.managers[cluster_name]
1227 try:
1228 manager.wait_for_mgr_available()
1229 except run.CommandFailedError:
1230 log.info('ignoring mgr wait error, probably testing upgrade')
1231
1232 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1233 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1234 teuthology.wait_until_osds_up(
1235 ctx,
1236 cluster=ctx.cluster,
1237 remote=mon0_remote,
1238 ceph_cluster=cluster_name,
1239 )
1240
1241 try:
1242 manager.flush_all_pg_stats()
1243 except run.CommandFailedError:
1244 log.info('ignoring flush pg stats error, probably testing upgrade')
1245 manager.wait_for_clean()
1246
1247 log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
1248 teuthology.wait_until_healthy(
1249 ctx,
1250 remote=mon0_remote,
1251 ceph_cluster=cluster_name,
1252 )
1253
1254 if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
1255 # Some MDSs exist, wait for them to be healthy
1256 ceph_fs = Filesystem(ctx) # TODO: make Filesystem cluster-aware
1257 ceph_fs.wait_for_daemons(timeout=300)
1258
1259
1260 def wait_for_osds_up(ctx, config):
1261 """
1262 Wait for all OSDs to come up.
1263
1264 :param ctx: Context
1265 :param config: Configuration
1266 """
1267 log.info('Waiting until ceph osds are all up...')
1268 cluster_name = config.get('cluster', 'ceph')
1269 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1270 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1271 teuthology.wait_until_osds_up(
1272 ctx,
1273 cluster=ctx.cluster,
1274 remote=mon0_remote
1275 )
1276
1277
1278 def wait_for_mon_quorum(ctx, config):
1279 """
1280 Check remote ceph status until all monitors are up.
1281
1282 :param ctx: Context
1283 :param config: Configuration
1284 """
1285 if isinstance(config, dict):
1286 mons = config['daemons']
1287 cluster_name = config.get('cluster', 'ceph')
1288 else:
1289 assert isinstance(config, list)
1290 mons = config
1291 cluster_name = 'ceph'
1292 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1293 (remote,) = ctx.cluster.only(firstmon).remotes.keys()
1294 with contextutil.safe_while(sleep=10, tries=60,
1295 action='wait for monitor quorum') as proceed:
1296 while proceed():
1297 r = remote.run(
1298 args=[
1299 'sudo',
1300 'ceph',
1301 'quorum_status',
1302 ],
1303 stdout=StringIO(),
1304 logger=log.getChild('quorum_status'),
1305 )
1306 j = json.loads(r.stdout.getvalue())
1307 q = j.get('quorum_names', [])
1308 log.debug('Quorum: %s', q)
1309 if sorted(q) == sorted(mons):
1310 break
1311
1312
1313 def created_pool(ctx, config):
1314 """
1315 Add new pools to the dictionary of pools that the ceph-manager
1316 knows about.
1317 """
1318 for new_pool in config:
1319 if new_pool not in ctx.managers['ceph'].pools:
1320 ctx.managers['ceph'].pools[new_pool] = ctx.managers['ceph'].get_pool_property(
1321 new_pool, 'pg_num')
1322
1323
1324 @contextlib.contextmanager
1325 def restart(ctx, config):
1326 """
1327 restart ceph daemons
1328
1329 For example::
1330 tasks:
1331 - ceph.restart: [all]
1332
1333 For example::
1334 tasks:
1335 - ceph.restart: [osd.0, mon.1, mds.*]
1336
1337 or::
1338
1339 tasks:
1340 - ceph.restart:
1341 daemons: [osd.0, mon.1]
1342 wait-for-healthy: false
1343 wait-for-osds-up: true
1344
1345 :param ctx: Context
1346 :param config: Configuration
1347 """
1348 if config is None:
1349 config = {}
1350 elif isinstance(config, list):
1351 config = {'daemons': config}
1352
1353 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1354 clusters = set()
1355 for role in daemons:
1356 cluster, type_, id_ = teuthology.split_role(role)
1357 ctx.daemons.get_daemon(type_, id_, cluster).restart()
1358 clusters.add(cluster)
1359
1360 manager = ctx.managers['ceph']
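# explicitly mark restarted OSDs down rather than waiting for the monitors to notice they went away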
1361 for dmon in daemons:
1362 if '.' in dmon:
1363 dm_parts = dmon.split('.')
1364 if dm_parts[1].isdigit():
1365 if dm_parts[0] == 'osd':
1366 manager.mark_down_osd(int(dm_parts[1]))
1367
1368 if config.get('wait-for-healthy', True):
1369 for cluster in clusters:
1370 healthy(ctx=ctx, config=dict(cluster=cluster))
1371 if config.get('wait-for-osds-up', False):
1372 for cluster in clusters:
1373 wait_for_osds_up(ctx=ctx, config=dict(cluster=cluster))
1374 yield
1375
1376
1377 @contextlib.contextmanager
1378 def stop(ctx, config):
1379 """
1380 Stop ceph daemons
1381
1382 For example::
1383 tasks:
1384 - ceph.stop: [mds.*]
1385
1386 tasks:
1387 - ceph.stop: [osd.0, osd.2]
1388
1389 tasks:
1390 - ceph.stop:
1391 daemons: [osd.0, osd.2]
1392
1393 """
1394 if config is None:
1395 config = {}
1396 elif isinstance(config, list):
1397 config = {'daemons': config}
1398
1399 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1400 for role in daemons:
1401 cluster, type_, id_ = teuthology.split_role(role)
1402 ctx.daemons.get_daemon(type_, id_, cluster).stop()
1403
1404 yield
1405
1406
1407 @contextlib.contextmanager
1408 def wait_for_failure(ctx, config):
1409 """
1410 Wait for a failure of a ceph daemon
1411
1412 For example::
1413 tasks:
1414 - ceph.wait_for_failure: [mds.*]
1415
1416 tasks:
1417 - ceph.wait_for_failure: [osd.0, osd.2]
1418
1419 tasks:
1420 - ceph.wait_for_failure:
1421 daemons: [osd.0, osd.2]
1422
1423 """
1424 if config is None:
1425 config = {}
1426 elif isinstance(config, list):
1427 config = {'daemons': config}
1428
1429 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1430 for role in daemons:
1431 cluster, type_, id_ = teuthology.split_role(role)
1432 try:
1433 ctx.daemons.get_daemon(type_, id_, cluster).wait()
1434 except:
1435 log.info('Saw expected daemon failure. Continuing.')
1436 pass
1437 else:
1438 raise RuntimeError('daemon %s did not fail' % role)
1439
1440 yield
1441
1442
1443 def validate_config(ctx, config):
1444 """
1445 Perform some simple validation on task configuration.
1446 Raises exceptions.ConfigError if an error is found.
1447 """
1448 # check for osds from multiple clusters on the same host
1449 for remote, roles_for_host in ctx.cluster.remotes.items():
1450 last_cluster = None
1451 last_role = None
1452 for role in roles_for_host:
1453 role_cluster, role_type, _ = teuthology.split_role(role)
1454 if role_type != 'osd':
1455 continue
1456 if last_cluster and last_cluster != role_cluster:
1457 msg = "Host should not have osds (%s and %s) from multiple clusters" % (
1458 last_role, role)
1459 raise exceptions.ConfigError(msg)
1460 last_cluster = role_cluster
1461 last_role = role
1462
1463
1464 @contextlib.contextmanager
1465 def task(ctx, config):
1466 """
1467 Set up and tear down a Ceph cluster.
1468
1469 For example::
1470
1471 tasks:
1472 - ceph:
1473 - interactive:
1474
1475 You can also specify what branch to run::
1476
1477 tasks:
1478 - ceph:
1479 branch: foo
1480
1481 Or a tag::
1482
1483 tasks:
1484 - ceph:
1485 tag: v0.42.13
1486
1487 Or a sha1::
1488
1489 tasks:
1490 - ceph:
1491 sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed
1492
1493 Or a local source dir::
1494
1495 tasks:
1496 - ceph:
1497 path: /home/sage/ceph
1498
1499 To capture code coverage data, use::
1500
1501 tasks:
1502 - ceph:
1503 coverage: true
1504
1505 To use btrfs, ext4, or xfs on the target's scratch disks, use::
1506
1507 tasks:
1508 - ceph:
1509 fs: xfs
1510 mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
1511 mount_options: [nobarrier, inode64]
1512
1513 Note, this will cause the task to check the /scratch_devs file on each node
1514 for available devices. If no such file is found, /dev/sdb will be used.
1515
1516 To run some daemons under valgrind, include their names
1517 and the tool/args to use in a valgrind section::
1518
1519 tasks:
1520 - ceph:
1521 valgrind:
1522 mds.1: --tool=memcheck
1523 osd.1: [--tool=memcheck, --leak-check=no]
1524
1525 Those nodes which are running daemons under valgrind will have
1526 their valgrind logs checked for bad results.
1527
1528 To adjust or modify config options, use::
1529
1530 tasks:
1531 - ceph:
1532 conf:
1533 section:
1534 key: value
1535
1536 For example::
1537
1538 tasks:
1539 - ceph:
1540 conf:
1541 mds.0:
1542 some option: value
1543 other key: other value
1544 client.0:
1545 debug client: 10
1546 debug ms: 1
1547
1548 By default, the cluster log is checked for errors and warnings,
1549 and the run marked failed if any appear. You can ignore log
1550 entries by giving a list of egrep-compatible regexes, e.g.:
1551
1552 tasks:
1553 - ceph:
1554 log-whitelist: ['foo.*bar', 'bad message']
1555
1556 To run multiple ceph clusters, use multiple ceph tasks, and roles
1557 with a cluster name prefix, e.g. cluster1.client.0. Roles with no
1558 cluster use the default cluster name, 'ceph'. OSDs from separate
1559 clusters must be on separate hosts. Clients and non-osd daemons
1560 from multiple clusters may be colocated. For each cluster, add an
1561 instance of the ceph task with the cluster name specified, e.g.::
1562
1563 roles:
1564 - [mon.a, osd.0, osd.1]
1565 - [backup.mon.a, backup.osd.0, backup.osd.1]
1566 - [client.0, backup.client.0]
1567 tasks:
1568 - ceph:
1569 cluster: ceph
1570 - ceph:
1571 cluster: backup
1572
1573 :param ctx: Context
1574 :param config: Configuration
1575
1576 """
1577 if config is None:
1578 config = {}
1579 assert isinstance(config, dict), \
1580 "task ceph only supports a dictionary for configuration"
1581
1582 overrides = ctx.config.get('overrides', {})
1583 teuthology.deep_merge(config, overrides.get('ceph', {}))
1584
1585 first_ceph_cluster = False
1586 if not hasattr(ctx, 'daemons'):
1587 first_ceph_cluster = True
1588 ctx.daemons = DaemonGroup()
1589
1590 testdir = teuthology.get_testdir(ctx)
1591 if config.get('coverage'):
1592 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
1593 log.info('Creating coverage directory...')
1594 run.wait(
1595 ctx.cluster.run(
1596 args=[
1597 'install', '-d', '-m0755', '--',
1598 coverage_dir,
1599 ],
1600 wait=False,
1601 )
1602 )
1603
1604 if 'cluster' not in config:
1605 config['cluster'] = 'ceph'
1606
1607 validate_config(ctx, config)
1608
1609 subtasks = []
1610 if first_ceph_cluster:
1611 # these tasks handle general log setup and parsing on all hosts,
1612 # so they should only be run once
1613 subtasks = [
1614 lambda: ceph_log(ctx=ctx, config=None),
1615 lambda: valgrind_post(ctx=ctx, config=config),
1616 ]
1617
1618 subtasks += [
1619 lambda: cluster(ctx=ctx, config=dict(
1620 conf=config.get('conf', {}),
1621 fs=config.get('fs', 'xfs'),
1622 mkfs_options=config.get('mkfs_options', None),
1623 mount_options=config.get('mount_options', None),
1624 block_journal=config.get('block_journal', None),
1625 tmpfs_journal=config.get('tmpfs_journal', None),
1626 skip_mgr_daemons=config.get('skip_mgr_daemons', False),
1627 log_whitelist=config.get('log-whitelist', []),
1628 cpu_profile=set(config.get('cpu_profile', []),),
1629 cluster=config['cluster'],
1630 )),
1631 lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
1632 lambda: run_daemon(ctx=ctx, config=config, type_='mgr'),
1633 lambda: crush_setup(ctx=ctx, config=config),
1634 lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
1635 lambda: create_rbd_pool(ctx=ctx, config=config),
1636 lambda: cephfs_setup(ctx=ctx, config=config),
1637 lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
1638 ]
1639
1640 with contextutil.nested(*subtasks):
1641 first_mon = teuthology.get_first_mon(ctx, config, config['cluster'])
1642 (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
1643 if not hasattr(ctx, 'managers'):
1644 ctx.managers = {}
1645 ctx.managers[config['cluster']] = CephManager(
1646 mon,
1647 ctx=ctx,
1648 logger=log.getChild('ceph_manager.' + config['cluster']),
1649 cluster=config['cluster'],
1650 )
1651
1652 try:
1653 if config.get('wait-for-healthy', True):
1654 healthy(ctx=ctx, config=dict(cluster=config['cluster']))
1655
1656 yield
1657 finally:
1658 if config.get('wait-for-scrub', True):
1659 osd_scrub_pgs(ctx, config)
1660
1661 # stop logging health to clog during shutdown, or else we generate
1662 # a bunch of scary messages unrelated to our actual run.
1663 firstmon = teuthology.get_first_mon(ctx, config, config['cluster'])
1664 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1665 mon0_remote.run(
1666 args=[
1667 'sudo',
1668 'ceph',
1669 '--cluster', config['cluster'],
1670 'tell',
1671 'mon.*',
1672 'injectargs',
1673 '--',
1674 '--no-mon-health-to-clog',
1675 ]
1676 )