1 """
2 Ceph cluster task.
3
4 Handle the setup, starting, and clean-up of a Ceph cluster.
5 """
6 from cStringIO import StringIO
7
8 import argparse
9 import contextlib
10 import errno
11 import logging
12 import os
13 import json
14 import time
15 import gevent
16 import socket
17
18 from paramiko import SSHException
19 from ceph_manager import CephManager, write_conf
20 from tasks.cephfs.filesystem import Filesystem
21 from teuthology import misc as teuthology
22 from teuthology import contextutil
23 from teuthology import exceptions
24 from teuthology.orchestra import run
25 import ceph_client as cclient
26 from teuthology.orchestra.daemon import DaemonGroup
27
28 CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']
29
30 log = logging.getLogger(__name__)
31
32
33 def generate_caps(type_):
34 """
35 Yield the '--cap' arguments (for ceph-authtool) granting the default
36 capabilities for the given system type (essentially a subset of possible
37 role values). Valid types are osd, mgr, mds and client.
38 """
39 defaults = dict(
40 osd=dict(
41 mon='allow *',
42 mgr='allow *',
43 osd='allow *',
44 ),
45 mgr=dict(
46 mon='allow profile mgr',
47 osd='allow *',
48 mds='allow *',
49 ),
50 mds=dict(
51 mon='allow *',
52 mgr='allow *',
53 osd='allow *',
54 mds='allow',
55 ),
56 client=dict(
57 mon='allow rw',
58 mgr='allow r',
59 osd='allow rwx',
60 mds='allow',
61 ),
62 )
63 for subsystem, capability in defaults[type_].items():
64 yield '--cap'
65 yield subsystem
66 yield capability
67
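# Illustrative sketch (not part of the original task): the generator above is
# meant to be spliced into a ceph-authtool command line, e.g.
#
#   list(generate_caps('client'))
#   # -> ['--cap', 'mon', 'allow rw', '--cap', 'mgr', 'allow r',
#   #     '--cap', 'osd', 'allow rwx', '--cap', 'mds', 'allow']
#
# (ordering follows Python 2 dict iteration order, so it may vary)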
68
69 @contextlib.contextmanager
70 def ceph_log(ctx, config):
71 """
72 Make the /var/log/ceph log directory writable by everyone, disable stock
73 ceph logrotate, and add valgrind and profiling-logger directories.
74
75 :param ctx: Context
76 :param config: Configuration
77 """
78 log.info('Making ceph log dir writeable by non-root...')
79 run.wait(
80 ctx.cluster.run(
81 args=[
82 'sudo',
83 'chmod',
84 '777',
85 '/var/log/ceph',
86 ],
87 wait=False,
88 )
89 )
90 log.info('Disabling ceph logrotate...')
91 run.wait(
92 ctx.cluster.run(
93 args=[
94 'sudo',
95 'rm', '-f', '--',
96 '/etc/logrotate.d/ceph',
97 ],
98 wait=False,
99 )
100 )
101 log.info('Creating extra log directories...')
102 run.wait(
103 ctx.cluster.run(
104 args=[
105 'sudo',
106 'install', '-d', '-m0777', '--',
107 '/var/log/ceph/valgrind',
108 '/var/log/ceph/profiling-logger',
109 ],
110 wait=False,
111 )
112 )
113
114 class Rotater(object):
115 stop_event = gevent.event.Event()
116
117 def invoke_logrotate(self):
118 # 1) install ceph-test.conf in /etc/logrotate.d
119 # 2) continuously loop over logrotate invocation with ceph-test.conf
120 while not self.stop_event.is_set():
121 self.stop_event.wait(timeout=30)
122 try:
123 run.wait(
124 ctx.cluster.run(
125 args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'
126 ],
127 wait=False,
128 )
129 )
130 except exceptions.ConnectionLostError as e:
131 # Some tests may power off nodes during test, in which
132 # case we will see connection errors that we should ignore.
133 log.debug("Missed logrotate, node '{0}' is offline".format(
134 e.node))
135 except EOFError as e:
136 # Paramiko sometimes raises this when it fails to
137 # connect to a node during open_session. As with
138 # ConnectionLostError, we ignore this because nodes
139 # are allowed to get power cycled during tests.
140 log.debug("Missed logrotate, EOFError")
141 except SSHException as e:
142 log.debug("Missed logrotate, SSHException")
143 except socket.error as e:
144 if e.errno == errno.EHOSTUNREACH:
145 log.debug("Missed logrotate, host unreachable")
146 else:
147 raise
148
149 def begin(self):
150 self.thread = gevent.spawn(self.invoke_logrotate)
151
152 def end(self):
153 self.stop_event.set()
154 self.thread.get()
155
156 def write_rotate_conf(ctx, daemons):
157 testdir = teuthology.get_testdir(ctx)
158 rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf')
159 with file(rotate_conf_path, 'rb') as f:
160 conf = ""
161 for daemon, size in daemons.iteritems():
162 log.info('writing logrotate stanza for {daemon}'.format(daemon=daemon))
163 conf += f.read().format(daemon_type=daemon, max_size=size)
164 f.seek(0, 0)
165
166 for remote in ctx.cluster.remotes.iterkeys():
167 teuthology.write_file(remote=remote,
168 path='{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
169 data=StringIO(conf)
170 )
171 remote.run(
172 args=[
173 'sudo',
174 'mv',
175 '{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
176 '/etc/logrotate.d/ceph-test.conf',
177 run.Raw('&&'),
178 'sudo',
179 'chmod',
180 '0644',
181 '/etc/logrotate.d/ceph-test.conf',
182 run.Raw('&&'),
183 'sudo',
184 'chown',
185 'root.root',
186 '/etc/logrotate.d/ceph-test.conf'
187 ]
188 )
189 remote.chcon('/etc/logrotate.d/ceph-test.conf',
190 'system_u:object_r:etc_t:s0')
191
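    # A hedged sketch of the teuthology job YAML driving the block below:
    # 'log-rotate' in the top-level job config maps a daemon type to the
    # maximum log size substituted into logrotate.conf above, for example:
    #
    #   log-rotate:
    #     ceph-osd: 10G
    #     ceph-mds: 10G
    #
    # (daemon names and sizes here are illustrative, not taken from this file)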
192 if ctx.config.get('log-rotate'):
193 daemons = ctx.config.get('log-rotate')
194 log.info('Setting up log rotation with ' + str(daemons))
195 write_rotate_conf(ctx, daemons)
196 logrotater = Rotater()
197 logrotater.begin()
198 try:
199 yield
200
201 finally:
202 if ctx.config.get('log-rotate'):
203 log.info('Shutting down logrotate')
204 logrotater.end()
205 ctx.cluster.run(
206 args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'
207 ]
208 )
209 if ctx.archive is not None and \
210 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
211 # and logs
212 log.info('Compressing logs...')
213 run.wait(
214 ctx.cluster.run(
215 args=[
216 'sudo',
217 'find',
218 '/var/log/ceph',
219 '-name',
220 '*.log',
221 '-print0',
222 run.Raw('|'),
223 'sudo',
224 'xargs',
225 '-0',
226 '--no-run-if-empty',
227 '--',
228 'gzip',
229 '--',
230 ],
231 wait=False,
232 ),
233 )
234
235 log.info('Archiving logs...')
236 path = os.path.join(ctx.archive, 'remote')
237 os.makedirs(path)
238 for remote in ctx.cluster.remotes.iterkeys():
239 sub = os.path.join(path, remote.shortname)
240 os.makedirs(sub)
241 teuthology.pull_directory(remote, '/var/log/ceph',
242 os.path.join(sub, 'log'))
243
244
245 def assign_devs(roles, devs):
246 """
247 Create a dictionary of devs indexed by roles
248
249 :param roles: List of roles
250 :param devs: Corresponding list of devices.
251 :returns: Dictionary of devs indexed by roles.
252 """
253 return dict(zip(roles, devs))
254
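# Example sketch (device paths are made up for illustration):
#   assign_devs(['osd.0', 'osd.1'], ['/dev/vdb', '/dev/vdc'])
#   # -> {'osd.0': '/dev/vdb', 'osd.1': '/dev/vdc'}
# Extra devices beyond the number of roles are simply dropped by zip().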
255
256 @contextlib.contextmanager
257 def valgrind_post(ctx, config):
258 """
259 After the tests run, look through all the valgrind logs. Exceptions are raised
260 if textual errors occurred in the logs, or if valgrind exceptions were detected in
261 the logs.
262
263 :param ctx: Context
264 :param config: Configuration
265 """
266 try:
267 yield
268 finally:
269 lookup_procs = list()
270 log.info('Checking for errors in any valgrind logs...')
271 for remote in ctx.cluster.remotes.iterkeys():
272 # look at valgrind logs for each node
273 proc = remote.run(
274 args=[
275 'sudo',
276 'zgrep',
277 '<kind>',
278 run.Raw('/var/log/ceph/valgrind/*'),
279 '/dev/null', # include a second file so that we always get a filename prefix on the output
280 run.Raw('|'),
281 'sort',
282 run.Raw('|'),
283 'uniq',
284 ],
285 wait=False,
286 check_status=False,
287 stdout=StringIO(),
288 )
289 lookup_procs.append((proc, remote))
290
291 valgrind_exception = None
292 for (proc, remote) in lookup_procs:
293 proc.wait()
294 out = proc.stdout.getvalue()
295 for line in out.split('\n'):
296 if line == '':
297 continue
298 try:
299 (file, kind) = line.split(':')
300 except Exception:
301 log.error('failed to split line %s', line)
302 raise
303 log.debug('file %s kind %s', file, kind)
304 if (file.find('mds') >= 0) and kind.find('Lost') > 0:
305 continue
306 log.error('saw valgrind issue %s in %s', kind, file)
307 valgrind_exception = Exception('saw valgrind issues')
308
309 if config.get('expect_valgrind_errors'):
310 if not valgrind_exception:
311 raise Exception('expected valgrind issues and found none')
312 else:
313 if valgrind_exception:
314 raise valgrind_exception
315
316
317 @contextlib.contextmanager
318 def crush_setup(ctx, config):
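    """
    Set the cluster's CRUSH tunables profile via 'ceph osd crush tunables'.

    Example task configuration (a sketch; any profile name accepted by
    'ceph osd crush tunables', e.g. 'optimal', may be used)::

        tasks:
        - ceph:
            crush_tunables: optimal
    """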
319 cluster_name = config['cluster']
320 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
321 (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
322
323 profile = config.get('crush_tunables', 'default')
324 log.info('Setting crush tunables to %s', profile)
325 mon_remote.run(
326 args=['sudo', 'ceph', '--cluster', cluster_name,
327 'osd', 'crush', 'tunables', profile])
328 yield
329
330
331 @contextlib.contextmanager
332 def create_rbd_pool(ctx, config):
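    """
    Wait for OSDs to come up, then create the 'rbd' pool (and enable the rbd
    application on it) unless disabled.

    Example (a sketch) of skipping the pool creation::

        tasks:
        - ceph:
            create_rbd_pool: false
    """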
333 cluster_name = config['cluster']
334 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
335 (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
336 log.info('Waiting for OSDs to come up')
337 teuthology.wait_until_osds_up(
338 ctx,
339 cluster=ctx.cluster,
340 remote=mon_remote,
341 ceph_cluster=cluster_name,
342 )
343 if config.get('create_rbd_pool', True):
344 log.info('Creating RBD pool')
345 mon_remote.run(
346 args=['sudo', 'ceph', '--cluster', cluster_name,
347 'osd', 'pool', 'create', 'rbd', '8'])
348 mon_remote.run(
349 args=[
350 'sudo', 'ceph', '--cluster', cluster_name,
351 'osd', 'pool', 'application', 'enable',
352 'rbd', 'rbd', '--yes-i-really-mean-it'
353 ],
354 check_status=False)
355 yield
356
357 @contextlib.contextmanager
358 def cephfs_setup(ctx, config):
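    """
    If any MDS roles are present, create a 'cephfs' filesystem and set
    max_mds to the number of active (non-standby) MDS roles.

    Example (a hedged sketch; 'cephfs_ec_profile' is optional and, when set,
    is passed through to the Filesystem helper as erasure-code profile
    options)::

        tasks:
        - ceph:
            cephfs_ec_profile:
              - m=2
              - k=2
              - crush-failure-domain=osd
    """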
359 cluster_name = config['cluster']
360 testdir = teuthology.get_testdir(ctx)
361 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
362
363 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
364 (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
365 mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
366 # If there are any MDSs, then create a filesystem for them to use
367 # Do this last because requires mon cluster to be up and running
368 if mdss.remotes:
369 log.info('Setting up CephFS filesystem...')
370
371 fs = Filesystem(ctx, name='cephfs', create=True,
372 ec_profile=config.get('cephfs_ec_profile', None))
373
374 is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
375 all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
376 num_active = len([r for r in all_roles if is_active_mds(r)])
377
378 fs.set_max_mds(num_active)
379 fs.set_allow_dirfrags(True)
380
381 yield
382
383
384 @contextlib.contextmanager
385 def cluster(ctx, config):
386 """
387 Handle the creation and removal of a ceph cluster.
388
389 On startup:
390 Create directories needed for the cluster.
391 Create remote journals for all osds.
392 Create and set keyring.
393 Copy the monmap to the test systems.
394 Setup mon nodes.
395 Setup mds nodes.
396 Mkfs osd nodes.
397 Add keyring information to monmaps
398 Mkfs mon nodes.
399
400 On exit:
401 If errors occurred, extract a failure message and store it in ctx.summary.
402 Unmount all test files and temporary journaling files.
403 Save the monitor information and archive all ceph logs.
404 Cleanup the keyring setup, and remove all monitor map and data files left over.
405
406 :param ctx: Context
407 :param config: Configuration
408 """
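    # The config dict for this subtask is assembled by task() below; a
    # representative sketch of the keys consumed here:
    #
    #   {'cluster': 'ceph', 'conf': {...}, 'fs': 'xfs',
    #    'mkfs_options': None, 'mount_options': None,
    #    'block_journal': None, 'tmpfs_journal': None,
    #    'skip_mgr_daemons': False, 'log_whitelist': [],
    #    'cpu_profile': set()}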
409 if ctx.config.get('use_existing_cluster', False) is True:
410 log.info("'use_existing_cluster' is true; skipping cluster creation")
411 yield
412
413 testdir = teuthology.get_testdir(ctx)
414 cluster_name = config['cluster']
415 data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
416 log.info('Creating ceph cluster %s...', cluster_name)
417 run.wait(
418 ctx.cluster.run(
419 args=[
420 'install', '-d', '-m0755', '--',
421 data_dir,
422 ],
423 wait=False,
424 )
425 )
426
427 run.wait(
428 ctx.cluster.run(
429 args=[
430 'sudo',
431 'install', '-d', '-m0777', '--', '/var/run/ceph',
432 ],
433 wait=False,
434 )
435 )
436
437 devs_to_clean = {}
438 remote_to_roles_to_devs = {}
439 remote_to_roles_to_journals = {}
440 osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
441 for remote, roles_for_host in osds.remotes.iteritems():
442 devs = teuthology.get_scratch_devices(remote)
443 roles_to_devs = {}
444 roles_to_journals = {}
445 if config.get('fs'):
446 log.info('fs option selected, checking for scratch devs')
447 log.info('found devs: %s' % (str(devs),))
448 devs_id_map = teuthology.get_wwn_id_map(remote, devs)
449 iddevs = devs_id_map.values()
450 roles_to_devs = assign_devs(
451 teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
452 )
453 if len(roles_to_devs) < len(iddevs):
454 iddevs = iddevs[len(roles_to_devs):]
455 devs_to_clean[remote] = []
456
457 if config.get('block_journal'):
458 log.info('block journal enabled')
459 roles_to_journals = assign_devs(
460 teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
461 )
462 log.info('journal map: %s', roles_to_journals)
463
464 if config.get('tmpfs_journal'):
465 log.info('tmpfs journal enabled')
466 roles_to_journals = {}
467 remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
468 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
469 tmpfs = '/mnt/' + role
470 roles_to_journals[role] = tmpfs
471 remote.run(args=['truncate', '-s', '1500M', tmpfs])
472 log.info('journal map: %s', roles_to_journals)
473
474 log.info('dev map: %s' % (str(roles_to_devs),))
475 remote_to_roles_to_devs[remote] = roles_to_devs
476 remote_to_roles_to_journals[remote] = roles_to_journals
477
478 log.info('Generating config...')
479 remotes_and_roles = ctx.cluster.remotes.items()
480 roles = [role_list for (remote, role_list) in remotes_and_roles]
481 ips = [host for (host, port) in
482 (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
483 conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips, cluster=cluster_name)
484 for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
485 for role, journal in roles_to_journals.iteritems():
486 name = teuthology.ceph_role(role)
487 if name not in conf:
488 conf[name] = {}
489 conf[name]['osd journal'] = journal
490 for section, keys in config['conf'].iteritems():
491 for key, value in keys.iteritems():
492 log.info("[%s] %s = %s" % (section, key, value))
493 if section not in conf:
494 conf[section] = {}
495 conf[section][key] = value
496
497 if config.get('tmpfs_journal'):
498 conf['journal dio'] = False
499
500 if not hasattr(ctx, 'ceph'):
501 ctx.ceph = {}
502 ctx.ceph[cluster_name] = argparse.Namespace()
503 ctx.ceph[cluster_name].conf = conf
504
505 default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
506 keyring_path = config.get('keyring_path', default_keyring)
507
508 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
509
510 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
511
512 log.info('Setting up %s...' % firstmon)
513 ctx.cluster.only(firstmon).run(
514 args=[
515 'sudo',
516 'adjust-ulimits',
517 'ceph-coverage',
518 coverage_dir,
519 'ceph-authtool',
520 '--create-keyring',
521 keyring_path,
522 ],
523 )
524 ctx.cluster.only(firstmon).run(
525 args=[
526 'sudo',
527 'adjust-ulimits',
528 'ceph-coverage',
529 coverage_dir,
530 'ceph-authtool',
531 '--gen-key',
532 '--name=mon.',
533 keyring_path,
534 ],
535 )
536 ctx.cluster.only(firstmon).run(
537 args=[
538 'sudo',
539 'chmod',
540 '0644',
541 keyring_path,
542 ],
543 )
544 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
545 monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
546 cluster=cluster_name)
547 fsid = teuthology.create_simple_monmap(
548 ctx,
549 remote=mon0_remote,
550 conf=conf,
551 path=monmap_path,
552 )
553 if not 'global' in conf:
554 conf['global'] = {}
555 conf['global']['fsid'] = fsid
556
557 default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
558 conf_path = config.get('conf_path', default_conf_path)
559 log.info('Writing %s for FSID %s...' % (conf_path, fsid))
560 write_conf(ctx, conf_path, cluster_name)
561
562 log.info('Creating admin key on %s...' % firstmon)
563 ctx.cluster.only(firstmon).run(
564 args=[
565 'sudo',
566 'adjust-ulimits',
567 'ceph-coverage',
568 coverage_dir,
569 'ceph-authtool',
570 '--gen-key',
571 '--name=client.admin',
572 '--set-uid=0',
573 '--cap', 'mon', 'allow *',
574 '--cap', 'osd', 'allow *',
575 '--cap', 'mds', 'allow *',
576 '--cap', 'mgr', 'allow *',
577 keyring_path,
578 ],
579 )
580
581 log.info('Copying monmap to all nodes...')
582 keyring = teuthology.get_file(
583 remote=mon0_remote,
584 path=keyring_path,
585 )
586 monmap = teuthology.get_file(
587 remote=mon0_remote,
588 path=monmap_path,
589 )
590
591 for rem in ctx.cluster.remotes.iterkeys():
592 # copy mon key and initial monmap
593 log.info('Sending monmap to node {remote}'.format(remote=rem))
594 teuthology.sudo_write_file(
595 remote=rem,
596 path=keyring_path,
597 data=keyring,
598 perms='0644'
599 )
600 teuthology.write_file(
601 remote=rem,
602 path=monmap_path,
603 data=monmap,
604 )
605
606 log.info('Setting up mon nodes...')
607 mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))
608
609 if not config.get('skip_mgr_daemons', False):
610 log.info('Setting up mgr nodes...')
611 mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name))
612 for remote, roles_for_host in mgrs.remotes.iteritems():
613 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr',
614 cluster_name):
615 _, _, id_ = teuthology.split_role(role)
616 mgr_dir = '/var/lib/ceph/mgr/{cluster}-{id}'.format(
617 cluster=cluster_name,
618 id=id_,
619 )
620 remote.run(
621 args=[
622 'sudo',
623 'mkdir',
624 '-p',
625 mgr_dir,
626 run.Raw('&&'),
627 'sudo',
628 'adjust-ulimits',
629 'ceph-coverage',
630 coverage_dir,
631 'ceph-authtool',
632 '--create-keyring',
633 '--gen-key',
634 '--name=mgr.{id}'.format(id=id_),
635 mgr_dir + '/keyring',
636 ],
637 )
638
639 log.info('Setting up mds nodes...')
640 mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
641 for remote, roles_for_host in mdss.remotes.iteritems():
642 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
643 cluster_name):
644 _, _, id_ = teuthology.split_role(role)
645 mds_dir = '/var/lib/ceph/mds/{cluster}-{id}'.format(
646 cluster=cluster_name,
647 id=id_,
648 )
649 remote.run(
650 args=[
651 'sudo',
652 'mkdir',
653 '-p',
654 mds_dir,
655 run.Raw('&&'),
656 'sudo',
657 'adjust-ulimits',
658 'ceph-coverage',
659 coverage_dir,
660 'ceph-authtool',
661 '--create-keyring',
662 '--gen-key',
663 '--name=mds.{id}'.format(id=id_),
664 mds_dir + '/keyring',
665 ],
666 )
667
668 cclient.create_keyring(ctx, cluster_name)
669 log.info('Running mkfs on osd nodes...')
670
671 if not hasattr(ctx, 'disk_config'):
672 ctx.disk_config = argparse.Namespace()
673 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
674 ctx.disk_config.remote_to_roles_to_dev = {}
675 if not hasattr(ctx.disk_config, 'remote_to_roles_to_journals'):
676 ctx.disk_config.remote_to_roles_to_journals = {}
677 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
678 ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
679 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
680 ctx.disk_config.remote_to_roles_to_dev_fstype = {}
681
682 teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)
683 teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals, remote_to_roles_to_journals)
684
685 log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
686 for remote, roles_for_host in osds.remotes.iteritems():
687 roles_to_devs = remote_to_roles_to_devs[remote]
688 roles_to_journals = remote_to_roles_to_journals[remote]
689
690 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
691 _, _, id_ = teuthology.split_role(role)
692 mnt_point = '/var/lib/ceph/osd/{cluster}-{id}'.format(cluster=cluster_name, id=id_)
693 remote.run(
694 args=[
695 'sudo',
696 'mkdir',
697 '-p',
698 mnt_point,
699 ])
700 log.info(str(roles_to_devs))
701 log.info(str(roles_to_journals))
702 log.info(role)
703 if roles_to_devs.get(role):
704 dev = roles_to_devs[role]
705 fs = config.get('fs')
706 package = None
707 mkfs_options = config.get('mkfs_options')
708 mount_options = config.get('mount_options')
709 if fs == 'btrfs':
710 # package = 'btrfs-tools'
711 if mount_options is None:
712 mount_options = ['noatime', 'user_subvol_rm_allowed']
713 if mkfs_options is None:
714 mkfs_options = ['-m', 'single',
715 '-l', '32768',
716 '-n', '32768']
717 if fs == 'xfs':
718 # package = 'xfsprogs'
719 if mount_options is None:
720 mount_options = ['noatime']
721 if mkfs_options is None:
722 mkfs_options = ['-f', '-i', 'size=2048']
723 if fs == 'ext4' or fs == 'ext3':
724 if mount_options is None:
725 mount_options = ['noatime', 'user_xattr']
726
727 if mount_options is None:
728 mount_options = []
729 if mkfs_options is None:
730 mkfs_options = []
731 mkfs = ['mkfs.%s' % fs] + mkfs_options
732 log.info('%s on %s on %s' % (mkfs, dev, remote))
733 if package is not None:
734 remote.run(
735 args=[
736 'sudo',
737 'apt-get', 'install', '-y', package
738 ],
739 stdout=StringIO(),
740 )
741
742 try:
743 remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
744 except run.CommandFailedError:
745 # Newer btrfs-tools doesn't prompt for overwrite; retry with -f
746 if '-f' not in mkfs_options:
747 mkfs_options.append('-f')
748 mkfs = ['mkfs.%s' % fs] + mkfs_options
749 log.info('%s on %s on %s' % (mkfs, dev, remote))
750 remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
751
752 log.info('mount %s on %s -o %s' % (dev, remote,
753 ','.join(mount_options)))
754 remote.run(
755 args=[
756 'sudo',
757 'mount',
758 '-t', fs,
759 '-o', ','.join(mount_options),
760 dev,
761 mnt_point,
762 ]
763 )
764 remote.run(
765 args=[
766 'sudo', '/sbin/restorecon', mnt_point,
767 ],
768 check_status=False,
769 )
770 if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
771 ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
772 ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
773 if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
774 ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
775 ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
776 devs_to_clean[remote].append(mnt_point)
777
778 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
779 _, _, id_ = teuthology.split_role(role)
780 remote.run(
781 args=[
782 'sudo',
783 'MALLOC_CHECK_=3',
784 'adjust-ulimits',
785 'ceph-coverage',
786 coverage_dir,
787 'ceph-osd',
788 '--cluster',
789 cluster_name,
790 '--mkfs',
791 '--mkkey',
792 '-i', id_,
793 '--monmap', monmap_path,
794 ],
795 )
796
797 log.info('Reading keys from all nodes...')
798 keys_fp = StringIO()
799 keys = []
800 for remote, roles_for_host in ctx.cluster.remotes.iteritems():
801 for type_ in ['mgr', 'mds', 'osd']:
802 if type_ == 'mgr' and config.get('skip_mgr_daemons', False):
803 continue
804 for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
805 _, _, id_ = teuthology.split_role(role)
806 data = teuthology.get_file(
807 remote=remote,
808 path='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
809 type=type_,
810 id=id_,
811 cluster=cluster_name,
812 ),
813 sudo=True,
814 )
815 keys.append((type_, id_, data))
816 keys_fp.write(data)
817 for remote, roles_for_host in ctx.cluster.remotes.iteritems():
818 for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', cluster_name):
819 _, _, id_ = teuthology.split_role(role)
820 data = teuthology.get_file(
821 remote=remote,
822 path='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_, cluster=cluster_name)
823 )
824 keys.append(('client', id_, data))
825 keys_fp.write(data)
826
827 log.info('Adding keys to all mons...')
828 writes = mons.run(
829 args=[
830 'sudo', 'tee', '-a',
831 keyring_path,
832 ],
833 stdin=run.PIPE,
834 wait=False,
835 stdout=StringIO(),
836 )
837 keys_fp.seek(0)
838 teuthology.feed_many_stdins_and_close(keys_fp, writes)
839 run.wait(writes)
840 for type_, id_, data in keys:
841 run.wait(
842 mons.run(
843 args=[
844 'sudo',
845 'adjust-ulimits',
846 'ceph-coverage',
847 coverage_dir,
848 'ceph-authtool',
849 keyring_path,
850 '--name={type}.{id}'.format(
851 type=type_,
852 id=id_,
853 ),
854 ] + list(generate_caps(type_)),
855 wait=False,
856 ),
857 )
858
859 log.info('Running mkfs on mon nodes...')
860 for remote, roles_for_host in mons.remotes.iteritems():
861 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon', cluster_name):
862 _, _, id_ = teuthology.split_role(role)
863 remote.run(
864 args=[
865 'sudo',
866 'mkdir',
867 '-p',
868 '/var/lib/ceph/mon/{cluster}-{id}'.format(id=id_, cluster=cluster_name),
869 ],
870 )
871 remote.run(
872 args=[
873 'sudo',
874 'adjust-ulimits',
875 'ceph-coverage',
876 coverage_dir,
877 'ceph-mon',
878 '--cluster', cluster_name,
879 '--mkfs',
880 '-i', id_,
881 '--monmap', monmap_path,
882 '--keyring', keyring_path,
883 ],
884 )
885
886 run.wait(
887 mons.run(
888 args=[
889 'rm',
890 '--',
891 monmap_path,
892 ],
893 wait=False,
894 ),
895 )
896
897 try:
898 yield
899 except Exception:
900 # we need to know this below
901 ctx.summary['success'] = False
902 raise
903 finally:
904 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
905
906 log.info('Checking cluster log for badness...')
907
908 def first_in_ceph_log(pattern, excludes):
909 """
910 Find the first occurrence of the pattern specified in the Ceph log.
911 Returns None if none found.
912
913 :param pattern: Pattern scanned for.
914 :param excludes: Patterns to ignore.
915 :return: First line of text (or None if not found)
916 """
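            # Assembled shell pipeline (a sketch, assuming the default
            # 'ceph' cluster name and a single exclude):
            #   sudo egrep '<pattern>' /var/log/ceph/ceph.log \
            #       | egrep -v '<exclude>' | head -n 1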
917 args = [
918 'sudo',
919 'egrep', pattern,
920 '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
921 ]
922 for exclude in excludes:
923 args.extend([run.Raw('|'), 'egrep', '-v', exclude])
924 args.extend([
925 run.Raw('|'), 'head', '-n', '1',
926 ])
927 r = mon0_remote.run(
928 stdout=StringIO(),
929 args=args,
930 )
931 stdout = r.stdout.getvalue()
932 if stdout != '':
933 return stdout
934 return None
935
936 if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
937 config['log_whitelist']) is not None:
938 log.warning('Found errors (ERR|WRN|SEC) in cluster log')
939 ctx.summary['success'] = False
940 # use the most severe problem as the failure reason
941 if 'failure_reason' not in ctx.summary:
942 for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
943 match = first_in_ceph_log(pattern, config['log_whitelist'])
944 if match is not None:
945 ctx.summary['failure_reason'] = \
946 '"{match}" in cluster log'.format(
947 match=match.rstrip('\n'),
948 )
949 break
950
951 for remote, dirs in devs_to_clean.iteritems():
952 for dir_ in dirs:
953 log.info('Unmounting %s on %s' % (dir_, remote))
954 try:
955 remote.run(
956 args=[
957 'sync',
958 run.Raw('&&'),
959 'sudo',
960 'umount',
961 '-f',
962 dir_
963 ]
964 )
965 except Exception as e:
966 remote.run(args=[
967 'sudo',
968 run.Raw('PATH=/usr/sbin:$PATH'),
969 'lsof',
970 run.Raw(';'),
971 'ps', 'auxf',
972 ])
973 raise e
974
975 if config.get('tmpfs_journal'):
976 log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
977 for remote, roles_for_host in osds.remotes.iteritems():
978 remote.run(
979 args=['sudo', 'umount', '-f', '/mnt'],
980 check_status=False,
981 )
982
983 if ctx.archive is not None and \
984 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
985
986 # archive mon data, too
987 log.info('Archiving mon data...')
988 path = os.path.join(ctx.archive, 'data')
989 try:
990 os.makedirs(path)
991 except OSError as e:
992 if e.errno == errno.EEXIST:
993 pass
994 else:
995 raise
996 for remote, roles in mons.remotes.iteritems():
997 for role in roles:
998 is_mon = teuthology.is_type('mon', cluster_name)
999 if is_mon(role):
1000 _, _, id_ = teuthology.split_role(role)
1001 mon_dir = '/var/lib/ceph/mon/' + \
1002 '{0}-{1}'.format(cluster_name, id_)
1003 teuthology.pull_directory_tarball(
1004 remote,
1005 mon_dir,
1006 path + '/' + role + '.tgz')
1007
1008 log.info('Cleaning ceph cluster...')
1009 run.wait(
1010 ctx.cluster.run(
1011 args=[
1012 'sudo',
1013 'rm',
1014 '-rf',
1015 '--',
1016 conf_path,
1017 keyring_path,
1018 data_dir,
1019 monmap_path,
1020 run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
1021 ],
1022 wait=False,
1023 ),
1024 )
1025
1026
1027 def osd_scrub_pgs(ctx, config):
1028 """
1029 Scrub pgs when we exit.
1030
1031 First make sure all pgs are active and clean.
1032 Next scrub all osds.
1033 Then periodically check until all pgs have scrub time stamps that
1034 indicate the last scrub completed. Time out if no progress is made
1035 here after too many polling attempts (see retries/delays below).
1036 """
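    # Progress-detection sketch: a pg counts as freshly scrubbed once its
    # last_scrub_stamp (e.g. '2018-01-01 12:00:00.000000'; the fractional
    # part is stripped before time.strptime with '%Y-%m-%d %H:%M:%S')
    # parses to a time later than check_time_now captured below.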
1037 retries = 40
1038 delays = 20
1039 cluster_name = config['cluster']
1040 manager = ctx.managers[cluster_name]
1041 all_clean = False
1042 for _ in range(0, retries):
1043 stats = manager.get_pg_stats()
1044 bad = [stat['pgid'] for stat in stats if 'active+clean' not in stat['state']]
1045 if not bad:
1046 all_clean = True
1047 break
1048 log.info(
1049 "Waiting for all PGs to be active and clean, waiting on %s" % bad)
1050 time.sleep(delays)
1051 if not all_clean:
1052 raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
1053 check_time_now = time.localtime()
1054 time.sleep(1)
1055 all_roles = teuthology.all_roles(ctx.cluster)
1056 for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name):
1057 log.info("Scrubbing {osd}".format(osd=role))
1058 _, _, id_ = teuthology.split_role(role)
1059 # allow this to fail; in certain cases the OSD might not be up
1060 # at this point. we will catch all pgs below.
1061 try:
1062 manager.raw_cluster_cmd('tell', 'osd.' + id_, 'config', 'set',
1063 'osd_debug_deep_scrub_sleep', '0');
1064 manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
1065 except run.CommandFailedError:
1066 pass
1067 prev_good = 0
1068 gap_cnt = 0
1069 loop = True
1070 while loop:
1071 stats = manager.get_pg_stats()
1072 timez = [(stat['pgid'],stat['last_scrub_stamp']) for stat in stats]
1073 loop = False
1074 thiscnt = 0
1075 for (pgid, tmval) in timez:
1076 pgtm = time.strptime(tmval[0:tmval.find('.')], '%Y-%m-%d %H:%M:%S')
1077 if pgtm > check_time_now:
1078 thiscnt += 1
1079 else:
1080 log.info('pgid %s last_scrub_stamp %s %s <= %s', pgid, tmval, pgtm, check_time_now)
1081 loop = True
1082 if thiscnt > prev_good:
1083 prev_good = thiscnt
1084 gap_cnt = 0
1085 else:
1086 gap_cnt += 1
1087 if gap_cnt % 6 == 0:
1088 for (pgid, tmval) in timez:
1089 # re-request scrub every so often in case the earlier
1090 # request was missed. do not do it every time because
1091 # the scrub may be in progress or not reported yet and
1092 # we will starve progress.
1093 manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
1094 if gap_cnt > retries:
1095 raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
1096 if loop:
1097 log.info('Still waiting for all pgs to be scrubbed.')
1098 time.sleep(delays)
1099
1100
1101 @contextlib.contextmanager
1102 def run_daemon(ctx, config, type_):
1103 """
1104 Run daemons for a role type. Handle the startup and termination of a daemon.
1105 On startup -- set coverage, cpu_profile, and valgrind values for all
1106 remotes running daemons of this type.
1107 On cleanup -- stop all existing daemons of this type.
1108
1109 :param ctx: Context
1110 :param config: Configuration
1111 :param type_: Role type
1112 """
1113 cluster_name = config['cluster']
1114 log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
1115 testdir = teuthology.get_testdir(ctx)
1116 daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name))
1117
1118 # check whether any daemons of this type are configured
1119 if daemons is None:
1120 return
1121 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
1122
1123 daemon_signal = 'kill'
1124 if config.get('coverage') or config.get('valgrind') is not None:
1125 daemon_signal = 'term'
1126
1127 # create osds in order. (this only matters for pre-luminous, which might
1128 # be hammer, which doesn't take an id_ argument to legacy 'osd create').
1129 osd_uuids = {}
1130 for remote, roles_for_host in daemons.remotes.iteritems():
1131 is_type_ = teuthology.is_type(type_, cluster_name)
1132 for role in roles_for_host:
1133 if not is_type_(role):
1134 continue
1135 _, _, id_ = teuthology.split_role(role)
1136
1137
1138 if type_ == 'osd':
1139 datadir='/var/lib/ceph/osd/{cluster}-{id}'.format(
1140 cluster=cluster_name, id=id_)
1141 osd_uuid = teuthology.get_file(
1142 remote=remote,
1143 path=datadir + '/fsid',
1144 sudo=True,
1145 ).strip()
1146 osd_uuids[id_] = osd_uuid
1147 for osd_id in range(len(osd_uuids)):
1148 id_ = str(osd_id)
1149 osd_uuid = osd_uuids.get(id_)
1150 try:
1151 remote.run(
1152 args=[
1153 'sudo', 'ceph', '--cluster', cluster_name,
1154 'osd', 'new', osd_uuid, id_,
1155 ]
1156 )
1157 except:
1158 # fallback to pre-luminous (hammer or jewel)
1159 remote.run(
1160 args=[
1161 'sudo', 'ceph', '--cluster', cluster_name,
1162 'osd', 'create', osd_uuid,
1163 ]
1164 )
1165 if config.get('add_osds_to_crush'):
1166 remote.run(
1167 args=[
1168 'sudo', 'ceph', '--cluster', cluster_name,
1169 'osd', 'crush', 'create-or-move', 'osd.' + id_,
1170 '1.0', 'host=localhost', 'root=default',
1171 ]
1172 )
1173
1174 for remote, roles_for_host in daemons.remotes.iteritems():
1175 is_type_ = teuthology.is_type(type_, cluster_name)
1176 for role in roles_for_host:
1177 if not is_type_(role):
1178 continue
1179 _, _, id_ = teuthology.split_role(role)
1180
1181 run_cmd = [
1182 'sudo',
1183 'adjust-ulimits',
1184 'ceph-coverage',
1185 coverage_dir,
1186 'daemon-helper',
1187 daemon_signal,
1188 ]
1189 run_cmd_tail = [
1190 'ceph-%s' % (type_),
1191 '-f',
1192 '--cluster', cluster_name,
1193 '-i', id_]
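            # Representative final command (a sketch, before any cpu_profile
            # or valgrind wrapping is applied below):
            #   sudo adjust-ulimits ceph-coverage <coverage_dir> \
            #       daemon-helper kill ceph-osd -f --cluster ceph -i 0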
1194
1195 if type_ in config.get('cpu_profile', []):
1196 profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role)
1197 run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])
1198
1199 if config.get('valgrind') is not None:
1200 valgrind_args = None
1201 if type_ in config['valgrind']:
1202 valgrind_args = config['valgrind'][type_]
1203 if role in config['valgrind']:
1204 valgrind_args = config['valgrind'][role]
1205 run_cmd = teuthology.get_valgrind_args(testdir, role,
1206 run_cmd,
1207 valgrind_args)
1208
1209 run_cmd.extend(run_cmd_tail)
1210
1211 # always register mgr; don't necessarily start
1212 ctx.daemons.register_daemon(
1213 remote, type_, id_,
1214 cluster=cluster_name,
1215 args=run_cmd,
1216 logger=log.getChild(role),
1217 stdin=run.PIPE,
1218 wait=False
1219 )
1220 if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
1221 role = cluster_name + '.' + type_
1222 ctx.daemons.get_daemon(type_, id_, cluster_name).restart()
1223
1224 try:
1225 yield
1226 finally:
1227 teuthology.stop_daemons_of_type(ctx, type_, cluster_name)
1228
1229
1230 def healthy(ctx, config):
1231 """
1232 Wait for all OSDs to be up, and for 'ceph health' to report HEALTH_OK.
1233
1234 :param ctx: Context
1235 :param config: Configuration
1236 """
1237 config = config if isinstance(config, dict) else dict()
1238 cluster_name = config.get('cluster', 'ceph')
1239 log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
1240 manager = ctx.managers[cluster_name]
1241 try:
1242 manager.wait_for_mgr_available(timeout=30)
1243 except (run.CommandFailedError, AssertionError) as e:
1244 log.info('ignoring mgr wait error, probably testing upgrade: %s', e)
1245
1246 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1247 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1248 teuthology.wait_until_osds_up(
1249 ctx,
1250 cluster=ctx.cluster,
1251 remote=mon0_remote,
1252 ceph_cluster=cluster_name,
1253 )
1254
1255 try:
1256 manager.flush_all_pg_stats()
1257 except (run.CommandFailedError, Exception) as e:
1258 log.info('ignoring flush pg stats error, probably testing upgrade: %s', e)
1259 manager.wait_for_clean()
1260
1261 if config.get('wait-for-healthy', True):
1262 log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
1263 teuthology.wait_until_healthy(
1264 ctx,
1265 remote=mon0_remote,
1266 ceph_cluster=cluster_name,
1267 )
1268
1269 if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
1270 # Some MDSs exist, wait for them to be healthy
1271 ceph_fs = Filesystem(ctx) # TODO: make Filesystem cluster-aware
1272 ceph_fs.wait_for_daemons(timeout=300)
1273
1274
1275 def wait_for_osds_up(ctx, config):
1276 """
1277 Wait for all OSDs to come up.
1278
1279 :param ctx: Context
1280 :param config: Configuration
1281 """
1282 log.info('Waiting until ceph osds are all up...')
1283 cluster_name = config.get('cluster', 'ceph')
1284 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1285 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1286 teuthology.wait_until_osds_up(
1287 ctx,
1288 cluster=ctx.cluster,
1289 remote=mon0_remote
1290 )
1291
1292
1293 def wait_for_mon_quorum(ctx, config):
1294 """
1295 Check remote ceph status until all of the expected monitors are in quorum.
1296
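    For example (a sketch; monitors may be given as a bare list of mon ids,
    or under 'daemons' together with an optional 'cluster' name)::

        tasks:
        - ceph.wait_for_mon_quorum: [a, b, c]

        tasks:
        - ceph.wait_for_mon_quorum:
            daemons: [a, b, c]
            cluster: backup
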
1297 :param ctx: Context
1298 :param config: Configuration
1299 """
1300 if isinstance(config, dict):
1301 mons = config['daemons']
1302 cluster_name = config.get('cluster', 'ceph')
1303 else:
1304 assert isinstance(config, list)
1305 mons = config
1306 cluster_name = 'ceph'
1307 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1308 (remote,) = ctx.cluster.only(firstmon).remotes.keys()
1309 with contextutil.safe_while(sleep=10, tries=60,
1310 action='wait for monitor quorum') as proceed:
1311 while proceed():
1312 r = remote.run(
1313 args=[
1314 'sudo',
1315 'ceph',
1316 'quorum_status',
1317 ],
1318 stdout=StringIO(),
1319 logger=log.getChild('quorum_status'),
1320 )
1321 j = json.loads(r.stdout.getvalue())
1322 q = j.get('quorum_names', [])
1323 log.debug('Quorum: %s', q)
1324 if sorted(q) == sorted(mons):
1325 break
1326
1327
1328 def created_pool(ctx, config):
1329 """
1330 Add new pools to the dictionary of pools that the ceph-manager
1331 knows about.
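
    For example (a hedged sketch; the pool name is illustrative and the pool
    must already exist, e.g. created by an earlier task or workunit)::

        tasks:
        - ceph.created_pool: [mypool]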
1332 """
1333 for new_pool in config:
1334 if new_pool not in ctx.managers['ceph'].pools:
1335 ctx.managers['ceph'].pools[new_pool] = ctx.managers['ceph'].get_pool_property(
1336 new_pool, 'pg_num')
1337
1338
1339 @contextlib.contextmanager
1340 def restart(ctx, config):
1341 """
1342 restart ceph daemons
1343
1344 For example::
1345 tasks:
1346 - ceph.restart: [all]
1347
1348 For example::
1349 tasks:
1350 - ceph.restart: [osd.0, mon.1, mds.*]
1351
1352 or::
1353
1354 tasks:
1355 - ceph.restart:
1356 daemons: [osd.0, mon.1]
1357 wait-for-healthy: false
1358 wait-for-osds-up: true
1359
1360 :param ctx: Context
1361 :param config: Configuration
1362 """
1363 if config is None:
1364 config = {}
1365 elif isinstance(config, list):
1366 config = {'daemons': config}
1367
1368 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1369 clusters = set()
1370 for role in daemons:
1371 cluster, type_, id_ = teuthology.split_role(role)
1372 ctx.daemons.get_daemon(type_, id_, cluster).restart()
1373 clusters.add(cluster)
1374
1375 manager = ctx.managers['ceph']
1376 for dmon in daemons:
1377 if '.' in dmon:
1378 dm_parts = dmon.split('.')
1379 if dm_parts[1].isdigit():
1380 if dm_parts[0] == 'osd':
1381 manager.mark_down_osd(int(dm_parts[1]))
1382
1383 if config.get('wait-for-healthy', True):
1384 for cluster in clusters:
1385 healthy(ctx=ctx, config=dict(cluster=cluster))
1386 if config.get('wait-for-osds-up', False):
1387 for cluster in clusters:
1388 wait_for_osds_up(ctx=ctx, config=dict(cluster=cluster))
1389 yield
1390
1391
1392 @contextlib.contextmanager
1393 def stop(ctx, config):
1394 """
1395 Stop ceph daemons
1396
1397 For example::
1398 tasks:
1399 - ceph.stop: [mds.*]
1400
1401 tasks:
1402 - ceph.stop: [osd.0, osd.2]
1403
1404 tasks:
1405 - ceph.stop:
1406 daemons: [osd.0, osd.2]
1407
1408 """
1409 if config is None:
1410 config = {}
1411 elif isinstance(config, list):
1412 config = {'daemons': config}
1413
1414 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1415 for role in daemons:
1416 cluster, type_, id_ = teuthology.split_role(role)
1417 ctx.daemons.get_daemon(type_, id_, cluster).stop()
1418
1419 yield
1420
1421
1422 @contextlib.contextmanager
1423 def wait_for_failure(ctx, config):
1424 """
1425 Wait for a failure of a ceph daemon
1426
1427 For example::
1428 tasks:
1429 - ceph.wait_for_failure: [mds.*]
1430
1431 tasks:
1432 - ceph.wait_for_failure: [osd.0, osd.2]
1433
1434 tasks:
1435 - ceph.wait_for_failure:
1436 daemons: [osd.0, osd.2]
1437
1438 """
1439 if config is None:
1440 config = {}
1441 elif isinstance(config, list):
1442 config = {'daemons': config}
1443
1444 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1445 for role in daemons:
1446 cluster, type_, id_ = teuthology.split_role(role)
1447 try:
1448 ctx.daemons.get_daemon(type_, id_, cluster).wait()
1449 except:
1450 log.info('Saw expected daemon failure. Continuing.')
1451 pass
1452 else:
1453 raise RuntimeError('daemon %s did not fail' % role)
1454
1455 yield
1456
1457
1458 def validate_config(ctx, config):
1459 """
1460 Perform some simple validation on task configuration.
1461 Raises exceptions.ConfigError if an error is found.
1462 """
1463 # check for osds from multiple clusters on the same host
1464 for remote, roles_for_host in ctx.cluster.remotes.items():
1465 last_cluster = None
1466 last_role = None
1467 for role in roles_for_host:
1468 role_cluster, role_type, _ = teuthology.split_role(role)
1469 if role_type != 'osd':
1470 continue
1471 if last_cluster and last_cluster != role_cluster:
1472 msg = "Host should not have osds (%s and %s) from multiple clusters" % (
1473 last_role, role)
1474 raise exceptions.ConfigError(msg)
1475 last_cluster = role_cluster
1476 last_role = role
1477
1478
1479 @contextlib.contextmanager
1480 def task(ctx, config):
1481 """
1482 Set up and tear down a Ceph cluster.
1483
1484 For example::
1485
1486 tasks:
1487 - ceph:
1488 - interactive:
1489
1490 You can also specify what branch to run::
1491
1492 tasks:
1493 - ceph:
1494 branch: foo
1495
1496 Or a tag::
1497
1498 tasks:
1499 - ceph:
1500 tag: v0.42.13
1501
1502 Or a sha1::
1503
1504 tasks:
1505 - ceph:
1506 sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed
1507
1508 Or a local source dir::
1509
1510 tasks:
1511 - ceph:
1512 path: /home/sage/ceph
1513
1514 To capture code coverage data, use::
1515
1516 tasks:
1517 - ceph:
1518 coverage: true
1519
1520 To use btrfs, ext4, or xfs on the target's scratch disks, use::
1521
1522 tasks:
1523 - ceph:
1524 fs: xfs
1525 mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
1526 mount_options: [nobarrier, inode64]
1527
1528 Note, this will cause the task to check the /scratch_devs file on each node
1529 for available devices. If no such file is found, /dev/sdb will be used.
1530
1531 To run some daemons under valgrind, include their names
1532 and the tool/args to use in a valgrind section::
1533
1534 tasks:
1535 - ceph:
1536 valgrind:
1537 mds.1: --tool=memcheck
1538 osd.1: [--tool=memcheck, --leak-check=no]
1539
1540 Those nodes which run daemons under valgrind (e.g. memcheck) will have
1541 their valgrind logs checked for bad results.
1542
1543 To adjust or modify config options, use::
1544
1545 tasks:
1546 - ceph:
1547 conf:
1548 section:
1549 key: value
1550
1551 For example::
1552
1553 tasks:
1554 - ceph:
1555 conf:
1556 mds.0:
1557 some option: value
1558 other key: other value
1559 client.0:
1560 debug client: 10
1561 debug ms: 1
1562
1563 By default, the cluster log is checked for errors and warnings,
1564 and the run marked failed if any appear. You can ignore log
1565 entries by giving a list of egrep-compatible regexes, e.g.::
1566
1567 tasks:
1568 - ceph:
1569 log-whitelist: ['foo.*bar', 'bad message']
1570
1571 To run multiple ceph clusters, use multiple ceph tasks, and roles
1572 with a cluster name prefix, e.g. cluster1.client.0. Roles with no
1573 cluster use the default cluster name, 'ceph'. OSDs from separate
1574 clusters must be on separate hosts. Clients and non-osd daemons
1575 from multiple clusters may be colocated. For each cluster, add an
1576 instance of the ceph task with the cluster name specified, e.g.::
1577
1578 roles:
1579 - [mon.a, osd.0, osd.1]
1580 - [backup.mon.a, backup.osd.0, backup.osd.1]
1581 - [client.0, backup.client.0]
1582 tasks:
1583 - ceph:
1584 cluster: ceph
1585 - ceph:
1586 cluster: backup
1587
1588 :param ctx: Context
1589 :param config: Configuration
1590
1591 """
1592 if config is None:
1593 config = {}
1594 assert isinstance(config, dict), \
1595 "task ceph only supports a dictionary for configuration"
1596
1597 overrides = ctx.config.get('overrides', {})
1598 teuthology.deep_merge(config, overrides.get('ceph', {}))
1599
1600 first_ceph_cluster = False
1601 if not hasattr(ctx, 'daemons'):
1602 first_ceph_cluster = True
1603 ctx.daemons = DaemonGroup()
1604
1605 testdir = teuthology.get_testdir(ctx)
1606 if config.get('coverage'):
1607 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
1608 log.info('Creating coverage directory...')
1609 run.wait(
1610 ctx.cluster.run(
1611 args=[
1612 'install', '-d', '-m0755', '--',
1613 coverage_dir,
1614 ],
1615 wait=False,
1616 )
1617 )
1618
1619 if 'cluster' not in config:
1620 config['cluster'] = 'ceph'
1621
1622 validate_config(ctx, config)
1623
1624 subtasks = []
1625 if first_ceph_cluster:
1626 # these tasks handle general log setup and parsing on all hosts,
1627 # so they should only be run once
1628 subtasks = [
1629 lambda: ceph_log(ctx=ctx, config=None),
1630 lambda: valgrind_post(ctx=ctx, config=config),
1631 ]
1632
1633 subtasks += [
1634 lambda: cluster(ctx=ctx, config=dict(
1635 conf=config.get('conf', {}),
1636 fs=config.get('fs', 'xfs'),
1637 mkfs_options=config.get('mkfs_options', None),
1638 mount_options=config.get('mount_options', None),
1639 block_journal=config.get('block_journal', None),
1640 tmpfs_journal=config.get('tmpfs_journal', None),
1641 skip_mgr_daemons=config.get('skip_mgr_daemons', False),
1642 log_whitelist=config.get('log-whitelist', []),
1643 cpu_profile=set(config.get('cpu_profile', []),),
1644 cluster=config['cluster'],
1645 )),
1646 lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
1647 lambda: run_daemon(ctx=ctx, config=config, type_='mgr'),
1648 lambda: crush_setup(ctx=ctx, config=config),
1649 lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
1650 lambda: create_rbd_pool(ctx=ctx, config=config),
1651 lambda: cephfs_setup(ctx=ctx, config=config),
1652 lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
1653 ]
1654
1655 with contextutil.nested(*subtasks):
1656 first_mon = teuthology.get_first_mon(ctx, config, config['cluster'])
1657 (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
1658 if not hasattr(ctx, 'managers'):
1659 ctx.managers = {}
1660 ctx.managers[config['cluster']] = CephManager(
1661 mon,
1662 ctx=ctx,
1663 logger=log.getChild('ceph_manager.' + config['cluster']),
1664 cluster=config['cluster'],
1665 )
1666
1667 try:
1668 if config.get('wait-for-healthy', True):
1669 healthy(ctx=ctx, config=dict(cluster=config['cluster']))
1670
1671 yield
1672 finally:
1673 if config.get('wait-for-scrub', True):
1674 osd_scrub_pgs(ctx, config)
1675
1676 # stop logging health to clog during shutdown, or else we generate
1677 # a bunch of scary messages unrelated to our actual run.
1678 firstmon = teuthology.get_first_mon(ctx, config, config['cluster'])
1679 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1680 mon0_remote.run(
1681 args=[
1682 'sudo',
1683 'ceph',
1684 '--cluster', config['cluster'],
1685 'tell',
1686 'mon.*',
1687 'injectargs',
1688 '--',
1689 '--no-mon-health-to-clog',
1690 ]
1691 )