ceph/qa/tasks/ceph.py
1"""
2Ceph cluster task.
3
4Handle the setup, starting, and clean-up of a Ceph cluster.
5"""
6from cStringIO import StringIO
7
8import argparse
9import contextlib
10import errno
11import logging
12import os
13import json
14import time
15import gevent
16import socket
17
18from paramiko import SSHException
19from ceph_manager import CephManager, write_conf
20from tasks.cephfs.filesystem import Filesystem
21from teuthology import misc as teuthology
22from teuthology import contextutil
23from teuthology import exceptions
24from teuthology.orchestra import run
25import ceph_client as cclient
26from teuthology.orchestra.daemon import DaemonGroup
27
28CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']
29
30log = logging.getLogger(__name__)
31
32
33def generate_caps(type_):
34 """
35 Yield the capability arguments ('--cap', subsystem, capability) for the
36 given system type (essentially a subset of possible role values). Valid
37 types are osd, mgr, mds and client.
38 """
39 defaults = dict(
40 osd=dict(
41 mon='allow *',
42 mgr='allow *',
43 osd='allow *',
44 ),
45 mgr=dict(
46 mon='allow *',
47 ),
48 mds=dict(
49 mon='allow *',
50 mgr='allow *',
51 osd='allow *',
52 mds='allow',
53 ),
54 client=dict(
55 mon='allow rw',
56 mgr='allow r',
57 osd='allow rwx',
58 mds='allow',
59 ),
60 )
61 for subsystem, capability in defaults[type_].items():
62 yield '--cap'
63 yield subsystem
64 yield capability
65
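# A minimal sketch of how the generator above is consumed: the caps are spliced
# into the ceph-authtool command line built in cluster() below. Illustrative
# only; 'mgr' is used here because it has a single capability entry:
#
#   list(generate_caps('mgr'))
#   # -> ['--cap', 'mon', 'allow *']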
66
67@contextlib.contextmanager
68def ceph_log(ctx, config):
69 """
70 Make the /var/log/ceph log directory writeable by everyone, add the valgrind
71 and profiling-logger directories, and set up (and later tear down) log rotation.
72
73 :param ctx: Context
74 :param config: Configuration
75 """
76 log.info('Making ceph log dir writeable by non-root...')
77 run.wait(
78 ctx.cluster.run(
79 args=[
80 'sudo',
81 'chmod',
82 '777',
83 '/var/log/ceph',
84 ],
85 wait=False,
86 )
87 )
88 log.info('Disabling ceph logrotate...')
89 run.wait(
90 ctx.cluster.run(
91 args=[
92 'sudo',
93 'rm', '-f', '--',
94 '/etc/logrotate.d/ceph',
95 ],
96 wait=False,
97 )
98 )
99 log.info('Creating extra log directories...')
100 run.wait(
101 ctx.cluster.run(
102 args=[
103 'sudo',
104 'install', '-d', '-m0777', '--',
105 '/var/log/ceph/valgrind',
106 '/var/log/ceph/profiling-logger',
107 ],
108 wait=False,
109 )
110 )
111
112 class Rotater(object):
113 stop_event = gevent.event.Event()
114
115 def invoke_logrotate(self):
116 # 1) install ceph-test.conf in /etc/logrotate.d
117 # 2) continuously loop over logrotate invocation with ceph-test.conf
118 while not self.stop_event.is_set():
119 self.stop_event.wait(timeout=30)
120 try:
121 run.wait(
122 ctx.cluster.run(
123 args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'
124 ],
125 wait=False,
126 )
127 )
128 except exceptions.ConnectionLostError as e:
129 # Some tests may power off nodes during test, in which
130 # case we will see connection errors that we should ignore.
131 log.debug("Missed logrotate, node '{0}' is offline".format(
132 e.node))
133 except EOFError as e:
134 # Paramiko sometimes raises this when it fails to
135 # connect to a node during open_session. As with
136 # ConnectionLostError, we ignore this because nodes
137 # are allowed to get power cycled during tests.
138 log.debug("Missed logrotate, EOFError")
139 except SSHException as e:
140 log.debug("Missed logrotate, SSHException")
141 except socket.error as e:
142 if e.errno == errno.EHOSTUNREACH:
143 log.debug("Missed logrotate, host unreachable")
144 else:
145 raise
146
147 def begin(self):
148 self.thread = gevent.spawn(self.invoke_logrotate)
149
150 def end(self):
151 self.stop_event.set()
152 self.thread.get()
153
154 def write_rotate_conf(ctx, daemons):
155 testdir = teuthology.get_testdir(ctx)
156 rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf')
157 with file(rotate_conf_path, 'rb') as f:
158 conf = ""
159 for daemon, size in daemons.iteritems():
160 log.info('writing logrotate stanza for {daemon}'.format(daemon=daemon))
161 conf += f.read().format(daemon_type=daemon, max_size=size)
162 f.seek(0, 0)
163
164 for remote in ctx.cluster.remotes.iterkeys():
165 teuthology.write_file(remote=remote,
166 path='{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
167 data=StringIO(conf)
168 )
169 remote.run(
170 args=[
171 'sudo',
172 'mv',
173 '{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
174 '/etc/logrotate.d/ceph-test.conf',
175 run.Raw('&&'),
176 'sudo',
177 'chmod',
178 '0644',
179 '/etc/logrotate.d/ceph-test.conf',
180 run.Raw('&&'),
181 'sudo',
182 'chown',
183 'root.root',
184 '/etc/logrotate.d/ceph-test.conf'
185 ]
186 )
187 remote.chcon('/etc/logrotate.d/ceph-test.conf',
188 'system_u:object_r:etc_t:s0')
189
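# The 'log-rotate' entry of the job configuration is expected to map a daemon
# type to a maximum log size; each pair is substituted into logrotate.conf as
# {daemon_type}/{max_size}. A hypothetical fragment (values are illustrative):
#
#   log-rotate:
#     ceph-osd: 10G
#     ceph-mds: 10G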
190 if ctx.config.get('log-rotate'):
191 daemons = ctx.config.get('log-rotate')
192 log.info('Setting up log rotation with ' + str(daemons))
193 write_rotate_conf(ctx, daemons)
194 logrotater = Rotater()
195 logrotater.begin()
196 try:
197 yield
198
199 finally:
200 if ctx.config.get('log-rotate'):
201 log.info('Shutting down logrotate')
202 logrotater.end()
203 ctx.cluster.run(
204 args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'
205 ]
206 )
207 if ctx.archive is not None and \
208 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
209 # compress and archive the ceph logs
210 log.info('Compressing logs...')
211 run.wait(
212 ctx.cluster.run(
213 args=[
214 'sudo',
215 'find',
216 '/var/log/ceph',
217 '-name',
218 '*.log',
219 '-print0',
220 run.Raw('|'),
221 'sudo',
222 'xargs',
223 '-0',
224 '--no-run-if-empty',
225 '--',
226 'gzip',
227 '--',
228 ],
229 wait=False,
230 ),
231 )
232
233 log.info('Archiving logs...')
234 path = os.path.join(ctx.archive, 'remote')
235 os.makedirs(path)
236 for remote in ctx.cluster.remotes.iterkeys():
237 sub = os.path.join(path, remote.shortname)
238 os.makedirs(sub)
239 teuthology.pull_directory(remote, '/var/log/ceph',
240 os.path.join(sub, 'log'))
241
242
243def assign_devs(roles, devs):
244 """
245 Create a dictionary of devs indexed by roles
246
247 :param roles: List of roles
248 :param devs: Corresponding list of devices.
249 :returns: Dictionary of devs indexed by roles.
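
Example (purely illustrative)::

assign_devs(['osd.0', 'osd.1'], ['/dev/sdb', '/dev/sdc'])
# -> {'osd.0': '/dev/sdb', 'osd.1': '/dev/sdc'}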
250 """
251 return dict(zip(roles, devs))
252
253
254@contextlib.contextmanager
255def valgrind_post(ctx, config):
256 """
257 After the tests run, look through all the valgrind logs. An exception is raised
258 if textual errors occurred in the logs, or if valgrind issues were detected in
259 the logs.
260
261 :param ctx: Context
262 :param config: Configuration
263 """
264 try:
265 yield
266 finally:
267 lookup_procs = list()
268 log.info('Checking for errors in any valgrind logs...')
269 for remote in ctx.cluster.remotes.iterkeys():
270 # look at valgrind logs for each node
271 proc = remote.run(
272 args=[
273 'sudo',
274 'zgrep',
275 '<kind>',
276 run.Raw('/var/log/ceph/valgrind/*'),
277 '/dev/null', # include a second file so that we always get a filename prefix on the output
278 run.Raw('|'),
279 'sort',
280 run.Raw('|'),
281 'uniq',
282 ],
283 wait=False,
284 check_status=False,
285 stdout=StringIO(),
286 )
287 lookup_procs.append((proc, remote))
288
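# Each line produced by the zgrep pipeline above is expected to look like
# '<file>:<kind>', e.g. (illustrative):
#
#   /var/log/ceph/valgrind/osd.1.log.gz: <kind>Leak_DefinitelyLost</kind>
#
# A job that deliberately provokes such issues can set
# 'expect_valgrind_errors: true' in the task config, as checked at the end below.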
289 valgrind_exception = None
290 for (proc, remote) in lookup_procs:
291 proc.wait()
292 out = proc.stdout.getvalue()
293 for line in out.split('\n'):
294 if line == '':
295 continue
296 try:
297 (file, kind) = line.split(':')
298 except Exception:
299 log.error('failed to split line %s', line)
300 raise
301 log.debug('file %s kind %s', file, kind)
302 if (file.find('mds') >= 0) and kind.find('Lost') > 0:
303 continue
304 log.error('saw valgrind issue %s in %s', kind, file)
305 valgrind_exception = Exception('saw valgrind issues')
306
307 if config.get('expect_valgrind_errors'):
308 if not valgrind_exception:
309 raise Exception('expected valgrind issues and found none')
310 else:
311 if valgrind_exception:
312 raise valgrind_exception
313
314
315@contextlib.contextmanager
316def crush_setup(ctx, config):
317 cluster_name = config['cluster']
318 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
319 (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
320
321 profile = config.get('crush_tunables', 'default')
322 log.info('Setting crush tunables to %s', profile)
323 mon_remote.run(
324 args=['sudo', 'ceph', '--cluster', cluster_name,
325 'osd', 'crush', 'tunables', profile])
326 yield
327
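# The tunables profile applied above comes straight from the ceph task
# configuration; a hypothetical fragment (the profile name is illustrative):
#
#   tasks:
#   - ceph:
#       crush_tunables: jewel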
328
329@contextlib.contextmanager
330def create_rbd_pool(ctx, config):
331 cluster_name = config['cluster']
332 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
333 (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
334 log.info('Waiting for OSDs to come up')
335 teuthology.wait_until_osds_up(
336 ctx,
337 cluster=ctx.cluster,
338 remote=mon_remote,
339 ceph_cluster=cluster_name,
340 )
341 log.info('Creating RBD pool')
342 mon_remote.run(
343 args=['sudo', 'ceph', '--cluster', cluster_name,
344 'osd', 'pool', 'create', 'rbd', '8'])
345 yield
346
347@contextlib.contextmanager
348def cephfs_setup(ctx, config):
349 cluster_name = config['cluster']
350 testdir = teuthology.get_testdir(ctx)
351 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
352
353 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
354 (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
355 mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
356 # If there are any MDSs, then create a filesystem for them to use
357 # Do this last because requires mon cluster to be up and running
358 if mdss.remotes:
359 log.info('Setting up CephFS filesystem...')
360
361 fs = Filesystem(ctx, create='cephfs')
362
363 is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
364 all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
365 num_active = len([r for r in all_roles if is_active_mds(r)])
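# Roles whose name ends in '-s' or contains '-s-' (e.g. a hypothetical
# 'mds.a-s-a' standby role) are excluded from the active count above.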
366
367 fs.set_max_mds(num_active)
368 fs.set_allow_dirfrags(True)
369
370 yield
371
372
373@contextlib.contextmanager
374def cluster(ctx, config):
375 """
376 Handle the creation and removal of a ceph cluster.
377
378 On startup:
379 Create directories needed for the cluster.
380 Create remote journals for all osds.
381 Create and set keyring.
382 Copy the monmap to the test systems.
383 Setup mon nodes.
384 Setup mds nodes.
385 Mkfs osd nodes.
386 Add keyring information to monmaps
387 Mkfs mon nodes.
388
389 On exit:
390 If errors occurred, extract a failure message and store it in ctx.summary.
391 Unmount all test files and temporary journaling files.
392 Save the monitor information and archive all ceph logs.
393 Clean up the keyring, and remove all leftover monitor map and data files.
394
395 :param ctx: Context
396 :param config: Configuration
397 """
398 if ctx.config.get('use_existing_cluster', False) is True:
399 log.info("'use_existing_cluster' is true; skipping cluster creation")
400 yield
# nothing to set up or tear down for an existing cluster
return
401
402 testdir = teuthology.get_testdir(ctx)
403 cluster_name = config['cluster']
404 data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
405 log.info('Creating ceph cluster %s...', cluster_name)
406 run.wait(
407 ctx.cluster.run(
408 args=[
409 'install', '-d', '-m0755', '--',
410 data_dir,
411 ],
412 wait=False,
413 )
414 )
415
416 run.wait(
417 ctx.cluster.run(
418 args=[
419 'sudo',
420 'install', '-d', '-m0777', '--', '/var/run/ceph',
421 ],
422 wait=False,
423 )
424 )
425
426 devs_to_clean = {}
427 remote_to_roles_to_devs = {}
428 remote_to_roles_to_journals = {}
429 osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
430 for remote, roles_for_host in osds.remotes.iteritems():
431 devs = teuthology.get_scratch_devices(remote)
432 roles_to_devs = {}
433 roles_to_journals = {}
434 if config.get('fs'):
435 log.info('fs option selected, checking for scratch devs')
436 log.info('found devs: %s' % (str(devs),))
437 devs_id_map = teuthology.get_wwn_id_map(remote, devs)
438 iddevs = devs_id_map.values()
439 roles_to_devs = assign_devs(
440 teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
441 )
442 if len(roles_to_devs) < len(iddevs):
443 iddevs = iddevs[len(roles_to_devs):]
444 devs_to_clean[remote] = []
445
446 if config.get('block_journal'):
447 log.info('block journal enabled')
448 roles_to_journals = assign_devs(
449 teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
450 )
451 log.info('journal map: %s', roles_to_journals)
452
453 if config.get('tmpfs_journal'):
454 log.info('tmpfs journal enabled')
455 roles_to_journals = {}
456 remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
457 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
458 tmpfs = '/mnt/' + role
459 roles_to_journals[role] = tmpfs
460 remote.run(args=['truncate', '-s', '1500M', tmpfs])
461 log.info('journal map: %s', roles_to_journals)
462
463 log.info('dev map: %s' % (str(roles_to_devs),))
464 remote_to_roles_to_devs[remote] = roles_to_devs
465 remote_to_roles_to_journals[remote] = roles_to_journals
466
467 log.info('Generating config...')
468 remotes_and_roles = ctx.cluster.remotes.items()
469 roles = [role_list for (remote, role_list) in remotes_and_roles]
470 ips = [host for (host, port) in
471 (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
472 conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips, cluster=cluster_name)
473 for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
474 for role, journal in roles_to_journals.iteritems():
475 name = teuthology.ceph_role(role)
476 if name not in conf:
477 conf[name] = {}
478 conf[name]['osd journal'] = journal
479 for section, keys in config['conf'].iteritems():
480 for key, value in keys.iteritems():
481 log.info("[%s] %s = %s" % (section, key, value))
482 if section not in conf:
483 conf[section] = {}
484 conf[section][key] = value
485
486 if config.get('tmpfs_journal'):
487 conf['journal dio'] = False
488
489 if not hasattr(ctx, 'ceph'):
490 ctx.ceph = {}
491 ctx.ceph[cluster_name] = argparse.Namespace()
492 ctx.ceph[cluster_name].conf = conf
493
494 default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
495 keyring_path = config.get('keyring_path', default_keyring)
496
497 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
498
499 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
500
501 log.info('Setting up %s...' % firstmon)
502 ctx.cluster.only(firstmon).run(
503 args=[
504 'sudo',
505 'adjust-ulimits',
506 'ceph-coverage',
507 coverage_dir,
508 'ceph-authtool',
509 '--create-keyring',
510 keyring_path,
511 ],
512 )
513 ctx.cluster.only(firstmon).run(
514 args=[
515 'sudo',
516 'adjust-ulimits',
517 'ceph-coverage',
518 coverage_dir,
519 'ceph-authtool',
520 '--gen-key',
521 '--name=mon.',
522 keyring_path,
523 ],
524 )
525 ctx.cluster.only(firstmon).run(
526 args=[
527 'sudo',
528 'chmod',
529 '0644',
530 keyring_path,
531 ],
532 )
533 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
534 monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
535 cluster=cluster_name)
536 fsid = teuthology.create_simple_monmap(
537 ctx,
538 remote=mon0_remote,
539 conf=conf,
540 path=monmap_path,
541 )
542 if 'global' not in conf:
543 conf['global'] = {}
544 conf['global']['fsid'] = fsid
545
546 default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
547 conf_path = config.get('conf_path', default_conf_path)
548 log.info('Writing %s for FSID %s...' % (conf_path, fsid))
549 write_conf(ctx, conf_path, cluster_name)
550
551 log.info('Creating admin key on %s...' % firstmon)
552 ctx.cluster.only(firstmon).run(
553 args=[
554 'sudo',
555 'adjust-ulimits',
556 'ceph-coverage',
557 coverage_dir,
558 'ceph-authtool',
559 '--gen-key',
560 '--name=client.admin',
561 '--set-uid=0',
562 '--cap', 'mon', 'allow *',
563 '--cap', 'osd', 'allow *',
564 '--cap', 'mds', 'allow *',
565 '--cap', 'mgr', 'allow *',
566 keyring_path,
567 ],
568 )
569
570 log.info('Copying monmap to all nodes...')
571 keyring = teuthology.get_file(
572 remote=mon0_remote,
573 path=keyring_path,
574 )
575 monmap = teuthology.get_file(
576 remote=mon0_remote,
577 path=monmap_path,
578 )
579
580 for rem in ctx.cluster.remotes.iterkeys():
581 # copy mon key and initial monmap
582 log.info('Sending monmap to node {remote}'.format(remote=rem))
583 teuthology.sudo_write_file(
584 remote=rem,
585 path=keyring_path,
586 data=keyring,
587 perms='0644'
588 )
589 teuthology.write_file(
590 remote=rem,
591 path=monmap_path,
592 data=monmap,
593 )
594
595 log.info('Setting up mon nodes...')
596 mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))
597
598 if not config.get('skip_mgr_daemons', False):
599 log.info('Setting up mgr nodes...')
600 mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name))
601 for remote, roles_for_host in mgrs.remotes.iteritems():
602 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr',
603 cluster_name):
604 _, _, id_ = teuthology.split_role(role)
605 mgr_dir = '/var/lib/ceph/mgr/{cluster}-{id}'.format(
606 cluster=cluster_name,
607 id=id_,
608 )
609 remote.run(
610 args=[
611 'sudo',
612 'mkdir',
613 '-p',
614 mgr_dir,
615 run.Raw('&&'),
616 'sudo',
617 'adjust-ulimits',
618 'ceph-coverage',
619 coverage_dir,
620 'ceph-authtool',
621 '--create-keyring',
622 '--gen-key',
623 '--name=mgr.{id}'.format(id=id_),
624 mgr_dir + '/keyring',
625 ],
626 )
627
628 log.info('Setting up mds nodes...')
629 mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
630 for remote, roles_for_host in mdss.remotes.iteritems():
631 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
632 cluster_name):
633 _, _, id_ = teuthology.split_role(role)
634 mds_dir = '/var/lib/ceph/mds/{cluster}-{id}'.format(
635 cluster=cluster_name,
636 id=id_,
637 )
638 remote.run(
639 args=[
640 'sudo',
641 'mkdir',
642 '-p',
643 mds_dir,
644 run.Raw('&&'),
645 'sudo',
646 'adjust-ulimits',
647 'ceph-coverage',
648 coverage_dir,
649 'ceph-authtool',
650 '--create-keyring',
651 '--gen-key',
652 '--name=mds.{id}'.format(id=id_),
653 mds_dir + '/keyring',
654 ],
655 )
656
657 cclient.create_keyring(ctx, cluster_name)
658 log.info('Running mkfs on osd nodes...')
659
660 if not hasattr(ctx, 'disk_config'):
661 ctx.disk_config = argparse.Namespace()
662 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
663 ctx.disk_config.remote_to_roles_to_dev = {}
664 if not hasattr(ctx.disk_config, 'remote_to_roles_to_journals'):
665 ctx.disk_config.remote_to_roles_to_journals = {}
666 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
667 ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
668 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
669 ctx.disk_config.remote_to_roles_to_dev_fstype = {}
670
671 teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)
672 teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals, remote_to_roles_to_journals)
673
674 log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
675 for remote, roles_for_host in osds.remotes.iteritems():
676 roles_to_devs = remote_to_roles_to_devs[remote]
677 roles_to_journals = remote_to_roles_to_journals[remote]
678
679 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
680 _, _, id_ = teuthology.split_role(role)
681 mnt_point = '/var/lib/ceph/osd/{cluster}-{id}'.format(cluster=cluster_name, id=id_)
682 remote.run(
683 args=[
684 'sudo',
685 'mkdir',
686 '-p',
687 mnt_point,
688 ])
689 log.info(str(roles_to_journals))
690 log.info(role)
691 if roles_to_devs.get(role):
692 dev = roles_to_devs[role]
693 fs = config.get('fs')
694 package = None
695 mkfs_options = config.get('mkfs_options')
696 mount_options = config.get('mount_options')
697 if fs == 'btrfs':
698 # package = 'btrfs-tools'
699 if mount_options is None:
700 mount_options = ['noatime', 'user_subvol_rm_allowed']
701 if mkfs_options is None:
702 mkfs_options = ['-m', 'single',
703 '-l', '32768',
704 '-n', '32768']
705 if fs == 'xfs':
706 # package = 'xfsprogs'
707 if mount_options is None:
708 mount_options = ['noatime']
709 if mkfs_options is None:
710 mkfs_options = ['-f', '-i', 'size=2048']
711 if fs == 'ext4' or fs == 'ext3':
712 if mount_options is None:
713 mount_options = ['noatime', 'user_xattr']
714
715 if mount_options is None:
716 mount_options = []
717 if mkfs_options is None:
718 mkfs_options = []
719 mkfs = ['mkfs.%s' % fs] + mkfs_options
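# With the xfs defaults above and a hypothetical scratch device, the command
# run below expands to roughly:
#
#   yes | sudo mkfs.xfs -f -i size=2048 /dev/sdb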
720 log.info('%s on %s on %s' % (mkfs, dev, remote))
721 if package is not None:
722 remote.run(
723 args=[
724 'sudo',
725 'apt-get', 'install', '-y', package
726 ],
727 stdout=StringIO(),
728 )
729
730 try:
731 remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
732 except run.CommandFailedError:
733 # Newer btrfs-tools doesn't prompt for overwrite, use -f
734 if '-f' not in mkfs_options:
735 mkfs_options.append('-f')
736 mkfs = ['mkfs.%s' % fs] + mkfs_options
737 log.info('%s on %s on %s' % (mkfs, dev, remote))
738 remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
739
740 log.info('mount %s on %s -o %s' % (dev, remote,
741 ','.join(mount_options)))
742 remote.run(
743 args=[
744 'sudo',
745 'mount',
746 '-t', fs,
747 '-o', ','.join(mount_options),
748 dev,
749 mnt_point,
750 ]
751 )
752 remote.run(
753 args=[
754 'sudo', '/sbin/restorecon', mnt_point,
755 ],
756 check_status=False,
757 )
758 if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
759 ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
760 ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
761 if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
762 ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
763 ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
764 devs_to_clean[remote].append(mnt_point)
765
766 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
767 _, _, id_ = teuthology.split_role(role)
768 remote.run(
769 args=[
770 'sudo',
771 'MALLOC_CHECK_=3',
772 'adjust-ulimits',
773 'ceph-coverage',
774 coverage_dir,
775 'ceph-osd',
776 '--cluster',
777 cluster_name,
778 '--mkfs',
779 '--mkkey',
780 '-i', id_,
781 '--monmap', monmap_path,
782 ],
783 )
784
785 log.info('Reading keys from all nodes...')
786 keys_fp = StringIO()
787 keys = []
788 for remote, roles_for_host in ctx.cluster.remotes.iteritems():
789 for type_ in ['mgr', 'mds', 'osd']:
790 if type_ == 'mgr' and config.get('skip_mgr_daemons', False):
791 continue
792 for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
793 _, _, id_ = teuthology.split_role(role)
794 data = teuthology.get_file(
795 remote=remote,
796 path='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
797 type=type_,
798 id=id_,
799 cluster=cluster_name,
800 ),
801 sudo=True,
802 )
803 keys.append((type_, id_, data))
804 keys_fp.write(data)
805 for remote, roles_for_host in ctx.cluster.remotes.iteritems():
806 for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', cluster_name):
807 _, _, id_ = teuthology.split_role(role)
808 data = teuthology.get_file(
809 remote=remote,
810 path='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_, cluster=cluster_name)
811 )
812 keys.append(('client', id_, data))
813 keys_fp.write(data)
814
815 log.info('Adding keys to all mons...')
816 writes = mons.run(
817 args=[
818 'sudo', 'tee', '-a',
819 keyring_path,
820 ],
821 stdin=run.PIPE,
822 wait=False,
823 stdout=StringIO(),
824 )
825 keys_fp.seek(0)
826 teuthology.feed_many_stdins_and_close(keys_fp, writes)
827 run.wait(writes)
828 for type_, id_, data in keys:
829 run.wait(
830 mons.run(
831 args=[
832 'sudo',
833 'adjust-ulimits',
834 'ceph-coverage',
835 coverage_dir,
836 'ceph-authtool',
837 keyring_path,
838 '--name={type}.{id}'.format(
839 type=type_,
840 id=id_,
841 ),
842 ] + list(generate_caps(type_)),
843 wait=False,
844 ),
845 )
846
847 log.info('Running mkfs on mon nodes...')
848 for remote, roles_for_host in mons.remotes.iteritems():
849 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon', cluster_name):
850 _, _, id_ = teuthology.split_role(role)
851 remote.run(
852 args=[
853 'sudo',
854 'mkdir',
855 '-p',
856 '/var/lib/ceph/mon/{cluster}-{id}'.format(id=id_, cluster=cluster_name),
857 ],
858 )
859 remote.run(
860 args=[
861 'sudo',
862 'adjust-ulimits',
863 'ceph-coverage',
864 coverage_dir,
865 'ceph-mon',
866 '--cluster', cluster_name,
867 '--mkfs',
868 '-i', id_,
869 '--monmap', monmap_path,
870 '--keyring', keyring_path,
871 ],
872 )
873
874 run.wait(
875 mons.run(
876 args=[
877 'rm',
878 '--',
879 monmap_path,
880 ],
881 wait=False,
882 ),
883 )
884
885 try:
886 yield
887 except Exception:
888 # we need to know this below
889 ctx.summary['success'] = False
890 raise
891 finally:
892 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
893
894 log.info('Checking cluster log for badness...')
895
896 def first_in_ceph_log(pattern, excludes):
897 """
898 Find the first occurrence of the pattern specified in the Ceph log.
899 Returns None if none found.
900
901 :param pattern: Pattern scanned for.
902 :param excludes: Patterns to ignore.
903 :return: First line of text (or None if not found)
904 """
905 args = [
906 'sudo',
907 'egrep', pattern,
908 '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
909 ]
910 for exclude in excludes:
911 args.extend([run.Raw('|'), 'egrep', '-v', exclude])
912 args.extend([
913 run.Raw('|'), 'head', '-n', '1',
914 ])
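# The assembled pipeline looks roughly like this (cluster name and patterns
# are illustrative):
#
#   sudo egrep '\[ERR\]|\[WRN\]|\[SEC\]' /var/log/ceph/ceph.log \
#       | egrep -v '<whitelisted pattern>' | head -n 1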
915 r = mon0_remote.run(
916 stdout=StringIO(),
917 args=args,
918 )
919 stdout = r.stdout.getvalue()
920 if stdout != '':
921 return stdout
922 return None
923
924 if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
925 config['log_whitelist']) is not None:
926 log.warning('Found errors (ERR|WRN|SEC) in cluster log')
927 ctx.summary['success'] = False
928 # use the most severe problem as the failure reason
929 if 'failure_reason' not in ctx.summary:
930 for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
931 match = first_in_ceph_log(pattern, config['log_whitelist'])
932 if match is not None:
933 ctx.summary['failure_reason'] = \
934 '"{match}" in cluster log'.format(
935 match=match.rstrip('\n'),
936 )
937 break
938
939 for remote, dirs in devs_to_clean.iteritems():
940 for dir_ in dirs:
941 log.info('Unmounting %s on %s' % (dir_, remote))
942 try:
943 remote.run(
944 args=[
945 'sync',
946 run.Raw('&&'),
947 'sudo',
948 'umount',
949 '-f',
950 dir_
951 ]
952 )
953 except Exception as e:
954 remote.run(args=[
955 'sudo',
956 run.Raw('PATH=/usr/sbin:$PATH'),
957 'lsof',
958 run.Raw(';'),
959 'ps', 'auxf',
960 ])
961 raise e
962
963 if config.get('tmpfs_journal'):
964 log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
965 for remote, roles_for_host in osds.remotes.iteritems():
966 remote.run(
967 args=['sudo', 'umount', '-f', '/mnt'],
968 check_status=False,
969 )
970
971 if ctx.archive is not None and \
972 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
973
974 # archive mon data, too
975 log.info('Archiving mon data...')
976 path = os.path.join(ctx.archive, 'data')
977 try:
978 os.makedirs(path)
979 except OSError as e:
980 if e.errno == errno.EEXIST:
981 pass
982 else:
983 raise
984 for remote, roles in mons.remotes.iteritems():
985 for role in roles:
986 is_mon = teuthology.is_type('mon', cluster_name)
987 if is_mon(role):
988 _, _, id_ = teuthology.split_role(role)
989 mon_dir = '/var/lib/ceph/mon/' + \
990 '{0}-{1}'.format(cluster_name, id_)
991 teuthology.pull_directory_tarball(
992 remote,
993 mon_dir,
994 path + '/' + role + '.tgz')
995
996 log.info('Cleaning ceph cluster...')
997 run.wait(
998 ctx.cluster.run(
999 args=[
1000 'sudo',
1001 'rm',
1002 '-rf',
1003 '--',
1004 conf_path,
1005 keyring_path,
1006 data_dir,
1007 monmap_path,
1008 run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
1009 ],
1010 wait=False,
1011 ),
1012 )
1013
1014
1015def osd_scrub_pgs(ctx, config):
1016 """
1017 Scrub pgs when we exit.
1018
1019 First make sure all pgs are active and clean.
1020 Next scrub all osds.
1021 Then periodically check until all pgs have scrub time stamps that
1022 indicate the last scrub completed. Time out if no progress is made
1023 here after roughly (retries * delays) seconds without improvement.
1024 """
1025 retries = 20
1026 delays = 10
1027 cluster_name = config['cluster']
1028 manager = ctx.managers[cluster_name]
1029 all_clean = False
1030 for _ in range(0, retries):
1031 stats = manager.get_pg_stats()
1032 bad = [stat['pgid'] for stat in stats if 'active+clean' not in stat['state']]
1033 if not bad:
1034 all_clean = True
1035 break
1036 log.info(
1037 "Waiting for all PGs to be active and clean, waiting on %s" % bad)
1038 time.sleep(delays)
1039 if not all_clean:
1040 raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
1041 check_time_now = time.localtime()
1042 time.sleep(1)
1043 all_roles = teuthology.all_roles(ctx.cluster)
1044 for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name):
1045 log.info("Scrubbing {osd}".format(osd=role))
1046 _, _, id_ = teuthology.split_role(role)
1047 # allow this to fail; in certain cases the OSD might not be up
1048 # at this point. we will catch all pgs below.
1049 try:
1050 manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
1051 except run.CommandFailedError:
1052 pass
1053 prev_good = 0
1054 gap_cnt = 0
1055 loop = True
1056 while loop:
1057 stats = manager.get_pg_stats()
1058 timez = [(stat['pgid'],stat['last_scrub_stamp']) for stat in stats]
1059 loop = False
1060 thiscnt = 0
1061 for (pgid, tmval) in timez:
1062 pgtm = time.strptime(tmval[0:tmval.find('.')], '%Y-%m-%d %H:%M:%S')
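# (time.strptime and time.localtime both return struct_time values, which
# compare field by field, so the check below is a plain "scrubbed after we
# started checking" test.)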
1063 if pgtm > check_time_now:
1064 thiscnt += 1
1065 else:
1066 log.info('pgid %s last_scrub_stamp %s %s <= %s', pgid, tmval, pgtm, check_time_now)
1067 loop = True
1068 if thiscnt > prev_good:
1069 prev_good = thiscnt
1070 gap_cnt = 0
1071 else:
1072 gap_cnt += 1
1073 if gap_cnt % 6 == 0:
1074 for (pgid, tmval) in timez:
1075 # re-request scrub every so often in case the earlier
1076 # request was missed. do not do it every time because
1077 # the scrub may be in progress or not reported yet and
1078 # we will starve progress.
1079 manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
1080 if gap_cnt > retries:
1081 raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
1082 if loop:
1083 log.info('Still waiting for all pgs to be scrubbed.')
1084 time.sleep(delays)
1085
1086
1087@contextlib.contextmanager
1088def run_daemon(ctx, config, type_):
1089 """
1090 Run daemons for a role type. Handle the startup and termination of a daemon.
1091 On startup -- set coverage, cpu_profile, and valgrind values for all remotes,
1092 register the daemons with the daemon group, and start them.
1093 On cleanup -- Stop all existing daemons of this type.
1094
1095 :param ctx: Context
1096 :param config: Configuration
1097 :param type_: Role type
1098 """
1099 cluster_name = config['cluster']
1100 log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
1101 testdir = teuthology.get_testdir(ctx)
1102 daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name))
1103
1104 # check whether any daemons of this type are configured
1105 if daemons is None:
1106 return
1107 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
1108
1109 daemon_signal = 'kill'
1110 if config.get('coverage') or config.get('valgrind') is not None:
1111 daemon_signal = 'term'
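# ('term' lets daemons running under valgrind or coverage shut down cleanly so
# their reports get flushed; ordinary runs are simply killed.)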
1112
1113 for remote, roles_for_host in daemons.remotes.iteritems():
1114 is_type_ = teuthology.is_type(type_, cluster_name)
1115 for role in roles_for_host:
1116 if not is_type_(role):
1117 continue
1118 _, _, id_ = teuthology.split_role(role)
1119
1120 if type_ == 'osd':
1121 datadir='/var/lib/ceph/osd/{cluster}-{id}'.format(
1122 cluster=cluster_name, id=id_)
1123 osd_uuid = teuthology.get_file(
1124 remote=remote,
1125 path=datadir + '/fsid',
1126 sudo=True,
1127 ).strip()
1128 try:
1129 remote.run(
1130 args=[
1131 'sudo', 'ceph', '--cluster', cluster_name,
1132 'osd', 'new', osd_uuid, id_,
1133 ]
1134 )
1135 except:
1136 # fallback to pre-luminous (hammer or jewel)
1137 remote.run(
1138 args=[
1139 'sudo', 'ceph', '--cluster', cluster_name,
1140 'osd', 'create', osd_uuid,
1141 ]
1142 )
1143 if config.get('add_osds_to_crush'):
1144 remote.run(
1145 args=[
1146 'sudo', 'ceph', '--cluster', cluster_name,
1147 'osd', 'crush', 'create-or-move', 'osd.' + id_,
1148 '1.0', 'host=localhost', 'root=default',
1149 ]
1150 )
1151
1152 run_cmd = [
1153 'sudo',
1154 'adjust-ulimits',
1155 'ceph-coverage',
1156 coverage_dir,
1157 'daemon-helper',
1158 daemon_signal,
1159 ]
1160 run_cmd_tail = [
1161 'ceph-%s' % (type_),
1162 '-f',
1163 '--cluster', cluster_name,
1164 '-i', id_]
1165
1166 if type_ in config.get('cpu_profile', []):
1167 profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role)
1168 run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])
1169
1170 if config.get('valgrind') is not None:
1171 valgrind_args = None
1172 if type_ in config['valgrind']:
1173 valgrind_args = config['valgrind'][type_]
1174 if role in config['valgrind']:
1175 valgrind_args = config['valgrind'][role]
1176 run_cmd = teuthology.get_valgrind_args(testdir, role,
1177 run_cmd,
1178 valgrind_args)
1179
1180 run_cmd.extend(run_cmd_tail)
1181
1182 # always register mgr; don't necessarily start
1183 ctx.daemons.register_daemon(
1184 remote, type_, id_,
1185 cluster=cluster_name,
1186 args=run_cmd,
1187 logger=log.getChild(role),
1188 stdin=run.PIPE,
1189 wait=False
1190 )
1191 if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
1192 role = cluster_name + '.' + type_
1193 ctx.daemons.get_daemon(type_, id_, cluster_name).restart()
1194
1195 try:
1196 yield
1197 finally:
1198 teuthology.stop_daemons_of_type(ctx, type_, cluster_name)
1199
1200
1201def healthy(ctx, config):
1202 """
1203 Wait for all OSDs to be up, and for ceph health to report HEALTH_OK.
1204
1205 :param ctx: Context
1206 :param config: Configuration
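
This can also be invoked directly as a subtask; a hypothetical job fragment::

tasks:
- ceph.healthy: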
1207 """
1208 config = config if isinstance(config, dict) else dict()
1209 cluster_name = config.get('cluster', 'ceph')
1210 log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
1211 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1212 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1213 teuthology.wait_until_osds_up(
1214 ctx,
1215 cluster=ctx.cluster,
1216 remote=mon0_remote,
1217 ceph_cluster=cluster_name,
1218 )
1219 teuthology.wait_until_healthy(
1220 ctx,
1221 remote=mon0_remote,
1222 ceph_cluster=cluster_name,
1223 )
1224
1225 if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
1226 # Some MDSs exist, wait for them to be healthy
1227 ceph_fs = Filesystem(ctx) # TODO: make Filesystem cluster-aware
1228 ceph_fs.wait_for_daemons(timeout=300)
1229
1230
1231def wait_for_osds_up(ctx, config):
1232 """
1233 Wait for all OSDs to come up.
1234
1235 :param ctx: Context
1236 :param config: Configuration
1237 """
1238 log.info('Waiting until ceph osds are all up...')
1239 cluster_name = config.get('cluster', 'ceph')
1240 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1241 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1242 teuthology.wait_until_osds_up(
1243 ctx,
1244 cluster=ctx.cluster,
1245 remote=mon0_remote
1246 )
1247
1248
1249def wait_for_mon_quorum(ctx, config):
1250 """
1251 Poll the remote ceph quorum status until all of the specified monitors are in quorum.
1252
1253 :param ctx: Context
1254 :param config: Configuration
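
For example (monitor ids are illustrative)::

tasks:
- ceph.wait_for_mon_quorum: [a, b, c]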
1255 """
1256 if isinstance(config, dict):
1257 mons = config['daemons']
1258 cluster_name = config.get('cluster', 'ceph')
1259 else:
1260 assert isinstance(config, list)
1261 mons = config
1262 cluster_name = 'ceph'
1263 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1264 (remote,) = ctx.cluster.only(firstmon).remotes.keys()
1265 with contextutil.safe_while(sleep=10, tries=60,
1266 action='wait for monitor quorum') as proceed:
1267 while proceed():
1268 r = remote.run(
1269 args=[
1270 'sudo',
1271 'ceph',
1272 'quorum_status',
1273 ],
1274 stdout=StringIO(),
1275 logger=log.getChild('quorum_status'),
1276 )
1277 j = json.loads(r.stdout.getvalue())
1278 q = j.get('quorum_names', [])
1279 log.debug('Quorum: %s', q)
1280 if sorted(q) == sorted(mons):
1281 break
1282
1283
1284def created_pool(ctx, config):
1285 """
1286 Add new pools to the dictionary of pools that the ceph-manager
1287 knows about.
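
A hypothetical fragment, for a pool created outside of ceph-manager
(the pool name is illustrative)::

tasks:
- ceph.created_pool: [mypool]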
1288 """
1289 for new_pool in config:
1290 if new_pool not in ctx.managers['ceph'].pools:
1291 ctx.managers['ceph'].pools[new_pool] = ctx.managers['ceph'].get_pool_property(
1292 new_pool, 'pg_num')
1293
1294
1295@contextlib.contextmanager
1296def restart(ctx, config):
1297 """
1298 restart ceph daemons
1299
1300 For example::
1301 tasks:
1302 - ceph.restart: [all]
1303
1304 For example::
1305 tasks:
1306 - ceph.restart: [osd.0, mon.1, mds.*]
1307
1308 or::
1309
1310 tasks:
1311 - ceph.restart:
1312 daemons: [osd.0, mon.1]
1313 wait-for-healthy: false
1314 wait-for-osds-up: true
1315
1316 :param ctx: Context
1317 :param config: Configuration
1318 """
1319 if config is None:
1320 config = {}
1321 elif isinstance(config, list):
1322 config = {'daemons': config}
1323
1324 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1325 clusters = set()
1326 for role in daemons:
1327 cluster, type_, id_ = teuthology.split_role(role)
1328 ctx.daemons.get_daemon(type_, id_, cluster).restart()
1329 clusters.add(cluster)
1330
1331 manager = ctx.managers['ceph']
1332 for dmon in daemons:
1333 if '.' in dmon:
1334 dm_parts = dmon.split('.')
1335 if dm_parts[1].isdigit():
1336 if dm_parts[0] == 'osd':
1337 manager.mark_down_osd(int(dm_parts[1]))
1338
1339 if config.get('wait-for-healthy', True):
1340 for cluster in clusters:
1341 healthy(ctx=ctx, config=dict(cluster=cluster))
1342 if config.get('wait-for-osds-up', False):
1343 for cluster in clusters:
1344 wait_for_osds_up(ctx=ctx, config=dict(cluster=cluster))
1345 yield
1346
1347
1348@contextlib.contextmanager
1349def stop(ctx, config):
1350 """
1351 Stop ceph daemons
1352
1353 For example::
1354 tasks:
1355 - ceph.stop: [mds.*]
1356
1357 tasks:
1358 - ceph.stop: [osd.0, osd.2]
1359
1360 tasks:
1361 - ceph.stop:
1362 daemons: [osd.0, osd.2]
1363
1364 """
1365 if config is None:
1366 config = {}
1367 elif isinstance(config, list):
1368 config = {'daemons': config}
1369
1370 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1371 for role in daemons:
1372 cluster, type_, id_ = teuthology.split_role(role)
1373 ctx.daemons.get_daemon(type_, id_, cluster).stop()
1374
1375 yield
1376
1377
1378@contextlib.contextmanager
1379def wait_for_failure(ctx, config):
1380 """
1381 Wait for a failure of a ceph daemon
1382
1383 For example::
1384 tasks:
1385 - ceph.wait_for_failure: [mds.*]
1386
1387 tasks:
1388 - ceph.wait_for_failure: [osd.0, osd.2]
1389
1390 tasks:
1391 - ceph.wait_for_failure:
1392 daemons: [osd.0, osd.2]
1393
1394 """
1395 if config is None:
1396 config = {}
1397 elif isinstance(config, list):
1398 config = {'daemons': config}
1399
1400 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1401 for role in daemons:
1402 cluster, type_, id_ = teuthology.split_role(role)
1403 try:
1404 ctx.daemons.get_daemon(type_, id_, cluster).wait()
1405 except:
1406 log.info('Saw expected daemon failure. Continuing.')
1407 pass
1408 else:
1409 raise RuntimeError('daemon %s did not fail' % role)
1410
1411 yield
1412
1413
1414def validate_config(ctx, config):
1415 """
1416 Perform some simple validation on task configuration.
1417 Raises exceptions.ConfigError if an error is found.
1418 """
1419 # check for osds from multiple clusters on the same host
1420 for remote, roles_for_host in ctx.cluster.remotes.items():
1421 last_cluster = None
1422 last_role = None
1423 for role in roles_for_host:
1424 role_cluster, role_type, _ = teuthology.split_role(role)
1425 if role_type != 'osd':
1426 continue
1427 if last_cluster and last_cluster != role_cluster:
1428 msg = "Host should not have osds (%s and %s) from multiple clusters" % (
1429 last_role, role)
1430 raise exceptions.ConfigError(msg)
1431 last_cluster = role_cluster
1432 last_role = role
1433
1434
1435@contextlib.contextmanager
1436def task(ctx, config):
1437 """
1438 Set up and tear down a Ceph cluster.
1439
1440 For example::
1441
1442 tasks:
1443 - ceph:
1444 - interactive:
1445
1446 You can also specify what branch to run::
1447
1448 tasks:
1449 - ceph:
1450 branch: foo
1451
1452 Or a tag::
1453
1454 tasks:
1455 - ceph:
1456 tag: v0.42.13
1457
1458 Or a sha1::
1459
1460 tasks:
1461 - ceph:
1462 sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed
1463
1464 Or a local source dir::
1465
1466 tasks:
1467 - ceph:
1468 path: /home/sage/ceph
1469
1470 To capture code coverage data, use::
1471
1472 tasks:
1473 - ceph:
1474 coverage: true
1475
1476 To use btrfs, ext4, or xfs on the target's scratch disks, use::
1477
1478 tasks:
1479 - ceph:
1480 fs: xfs
1481 mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
1482 mount_options: [nobarrier, inode64]
1483
1484 Note, this will cause the task to check the /scratch_devs file on each node
1485 for available devices. If no such file is found, /dev/sdb will be used.
1486
1487 To run some daemons under valgrind, include their names
1488 and the tool/args to use in a valgrind section::
1489
1490 tasks:
1491 - ceph:
1492 valgrind:
1493 mds.1: --tool=memcheck
1494 osd.1: [--tool=memcheck, --leak-check=no]
1495
1496 Those nodes which are using memcheck or another valgrind tool will get
1497 checked for bad results.
1498
1499 To adjust or modify config options, use::
1500
1501 tasks:
1502 - ceph:
1503 conf:
1504 section:
1505 key: value
1506
1507 For example::
1508
1509 tasks:
1510 - ceph:
1511 conf:
1512 mds.0:
1513 some option: value
1514 other key: other value
1515 client.0:
1516 debug client: 10
1517 debug ms: 1
1518
1519 By default, the cluster log is checked for errors and warnings,
1520 and the run marked failed if any appear. You can ignore log
1521 entries by giving a list of egrep-compatible regexes, e.g.:
1522
1523 tasks:
1524 - ceph:
1525 log-whitelist: ['foo.*bar', 'bad message']
1526
1527 To run multiple ceph clusters, use multiple ceph tasks, and roles
1528 with a cluster name prefix, e.g. cluster1.client.0. Roles with no
1529 cluster use the default cluster name, 'ceph'. OSDs from separate
1530 clusters must be on separate hosts. Clients and non-osd daemons
1531 from multiple clusters may be colocated. For each cluster, add an
1532 instance of the ceph task with the cluster name specified, e.g.::
1533
1534 roles:
1535 - [mon.a, osd.0, osd.1]
1536 - [backup.mon.a, backup.osd.0, backup.osd.1]
1537 - [client.0, backup.client.0]
1538 tasks:
1539 - ceph:
1540 cluster: ceph
1541 - ceph:
1542 cluster: backup
1543
1544 :param ctx: Context
1545 :param config: Configuration
1546
1547 """
1548 if config is None:
1549 config = {}
1550 assert isinstance(config, dict), \
1551 "task ceph only supports a dictionary for configuration"
1552
1553 overrides = ctx.config.get('overrides', {})
1554 teuthology.deep_merge(config, overrides.get('ceph', {}))
1555
1556 first_ceph_cluster = False
1557 if not hasattr(ctx, 'daemons'):
1558 first_ceph_cluster = True
1559 ctx.daemons = DaemonGroup()
1560
1561 testdir = teuthology.get_testdir(ctx)
1562 if config.get('coverage'):
1563 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
1564 log.info('Creating coverage directory...')
1565 run.wait(
1566 ctx.cluster.run(
1567 args=[
1568 'install', '-d', '-m0755', '--',
1569 coverage_dir,
1570 ],
1571 wait=False,
1572 )
1573 )
1574
1575 if 'cluster' not in config:
1576 config['cluster'] = 'ceph'
1577
1578 validate_config(ctx, config)
1579
1580 subtasks = []
1581 if first_ceph_cluster:
1582 # these tasks handle general log setup and parsing on all hosts,
1583 # so they should only be run once
1584 subtasks = [
1585 lambda: ceph_log(ctx=ctx, config=None),
1586 lambda: valgrind_post(ctx=ctx, config=config),
1587 ]
1588
1589 subtasks += [
1590 lambda: cluster(ctx=ctx, config=dict(
1591 conf=config.get('conf', {}),
1592 fs=config.get('fs', 'xfs'),
1593 mkfs_options=config.get('mkfs_options', None),
1594 mount_options=config.get('mount_options', None),
1595 block_journal=config.get('block_journal', None),
1596 tmpfs_journal=config.get('tmpfs_journal', None),
1597 skip_mgr_daemons=config.get('skip_mgr_daemons', False),
1598 log_whitelist=config.get('log-whitelist', []),
1599 cpu_profile=set(config.get('cpu_profile', []),),
1600 cluster=config['cluster'],
1601 )),
1602 lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
1603 lambda: run_daemon(ctx=ctx, config=config, type_='mgr'),
1604 lambda: crush_setup(ctx=ctx, config=config),
1605 lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
1606 lambda: create_rbd_pool(ctx=ctx, config=config),
1607 lambda: cephfs_setup(ctx=ctx, config=config),
1608 lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
1609 ]
1610
1611 with contextutil.nested(*subtasks):
1612 first_mon = teuthology.get_first_mon(ctx, config, config['cluster'])
1613 (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
1614 if not hasattr(ctx, 'managers'):
1615 ctx.managers = {}
1616 ctx.managers[config['cluster']] = CephManager(
1617 mon,
1618 ctx=ctx,
1619 logger=log.getChild('ceph_manager.' + config['cluster']),
1620 cluster=config['cluster'],
1621 )
1622
1623 try:
1624 if config.get('wait-for-healthy', True):
1625 healthy(ctx=ctx, config=dict(cluster=config['cluster']))
1626
1627 yield
1628 finally:
1629 if config.get('wait-for-scrub', True):
1630 osd_scrub_pgs(ctx, config)
1631
1632 # stop logging health to clog during shutdown, or else we generate
1633 # a bunch of scary messages unrelated to our actual run.
1634 firstmon = teuthology.get_first_mon(ctx, config, config['cluster'])
1635 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1636 mon0_remote.run(
1637 args=[
1638 'sudo',
1639 'ceph',
1640 '--cluster', config['cluster'],
1641 'tell',
1642 'mon.*',
1643 'injectargs',
1644 '--',
1645 '--no-mon-health-to-clog',
1646 ]
1647 )