4 Handle the setup, starting, and clean-up of a Ceph cluster.
6 from cStringIO
import StringIO
18 from paramiko
import SSHException
19 from ceph_manager
import CephManager
, write_conf
20 from tasks
.cephfs
.filesystem
import Filesystem
21 from teuthology
import misc
as teuthology
22 from teuthology
import contextutil
23 from teuthology
import exceptions
24 from teuthology
.orchestra
import run
25 import ceph_client
as cclient
26 from teuthology
.orchestra
.daemon
import DaemonGroup
28 CEPH_ROLE_TYPES
= ['mon', 'mgr', 'osd', 'mds', 'rgw']
30 log
= logging
.getLogger(__name__
)
33 def generate_caps(type_
):
35 Each call will return the next capability for each system type
36 (essentially a subset of possible role values). Valid types are osd,
61 for subsystem
, capability
in defaults
[type_
].items():
67 @contextlib.contextmanager
68 def ceph_log(ctx
, config
):
70 Create /var/log/ceph log directory that is open to everyone.
71 Add valgrind and profiling-logger directories.
74 :param config: Configuration
76 log
.info('Making ceph log dir writeable by non-root...')
88 log
.info('Disabling ceph logrotate...')
94 '/etc/logrotate.d/ceph',
99 log
.info('Creating extra log directories...')
104 'install', '-d', '-m0777', '--',
105 '/var/log/ceph/valgrind',
106 '/var/log/ceph/profiling-logger',
112 class Rotater(object):
113 stop_event
= gevent
.event
.Event()
115 def invoke_logrotate(self
):
116 # 1) install ceph-test.conf in /etc/logrotate.d
117 # 2) continuously loop over logrotate invocation with ceph-test.conf
118 while not self
.stop_event
.is_set():
119 self
.stop_event
.wait(timeout
=30)
123 args
=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'
128 except exceptions
.ConnectionLostError
as e
:
129 # Some tests may power off nodes during test, in which
130 # case we will see connection errors that we should ignore.
131 log
.debug("Missed logrotate, node '{0}' is offline".format(
133 except EOFError as e
:
134 # Paramiko sometimes raises this when it fails to
135 # connect to a node during open_session. As with
136 # ConnectionLostError, we ignore this because nodes
137 # are allowed to get power cycled during tests.
138 log
.debug("Missed logrotate, EOFError")
139 except SSHException
as e
:
140 log
.debug("Missed logrotate, SSHException")
141 except socket
.error
as e
:
142 if e
.errno
== errno
.EHOSTUNREACH
:
143 log
.debug("Missed logrotate, host unreachable")
148 self
.thread
= gevent
.spawn(self
.invoke_logrotate
)
151 self
.stop_event
.set()
154 def write_rotate_conf(ctx
, daemons
):
155 testdir
= teuthology
.get_testdir(ctx
)
156 rotate_conf_path
= os
.path
.join(os
.path
.dirname(__file__
), 'logrotate.conf')
157 with
file(rotate_conf_path
, 'rb') as f
:
159 for daemon
, size
in daemons
.iteritems():
160 log
.info('writing logrotate stanza for {daemon}'.format(daemon
=daemon
))
161 conf
+= f
.read().format(daemon_type
=daemon
, max_size
=size
)
164 for remote
in ctx
.cluster
.remotes
.iterkeys():
165 teuthology
.write_file(remote
=remote
,
166 path
='{tdir}/logrotate.ceph-test.conf'.format(tdir
=testdir
),
173 '{tdir}/logrotate.ceph-test.conf'.format(tdir
=testdir
),
174 '/etc/logrotate.d/ceph-test.conf',
179 '/etc/logrotate.d/ceph-test.conf',
184 '/etc/logrotate.d/ceph-test.conf'
187 remote
.chcon('/etc/logrotate.d/ceph-test.conf',
188 'system_u:object_r:etc_t:s0')
190 if ctx
.config
.get('log-rotate'):
191 daemons
= ctx
.config
.get('log-rotate')
192 log
.info('Setting up log rotation with ' + str(daemons
))
193 write_rotate_conf(ctx
, daemons
)
194 logrotater
= Rotater()
200 if ctx
.config
.get('log-rotate'):
201 log
.info('Shutting down logrotate')
204 args
=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'
207 if ctx
.archive
is not None and \
208 not (ctx
.config
.get('archive-on-error') and ctx
.summary
['success']):
210 log
.info('Compressing logs...')
233 log
.info('Archiving logs...')
234 path
= os
.path
.join(ctx
.archive
, 'remote')
236 for remote
in ctx
.cluster
.remotes
.iterkeys():
237 sub
= os
.path
.join(path
, remote
.shortname
)
239 teuthology
.pull_directory(remote
, '/var/log/ceph',
240 os
.path
.join(sub
, 'log'))
def assign_devs(roles, devs):
    """
    Create a dictionary of devs indexed by roles

    :param roles: List of roles
    :param devs: Corresponding list of devices.
    :returns: Dictionary of devs indexed by roles.
    """
    # zip truncates to the shorter list, so extra devices are simply unused
    return dict(zip(roles, devs))
254 @contextlib.contextmanager
255 def valgrind_post(ctx
, config
):
257 After the tests run, look through all the valgrind logs. Exceptions are raised
258 if textual errors occurred in the logs, or if valgrind exceptions were detected in
262 :param config: Configuration
267 lookup_procs
= list()
268 log
.info('Checking for errors in any valgrind logs...')
269 for remote
in ctx
.cluster
.remotes
.iterkeys():
270 # look at valgrind logs for each node
276 run
.Raw('/var/log/ceph/valgrind/*'),
277 '/dev/null', # include a second file so that we always get a filename prefix on the output
287 lookup_procs
.append((proc
, remote
))
289 valgrind_exception
= None
290 for (proc
, remote
) in lookup_procs
:
292 out
= proc
.stdout
.getvalue()
293 for line
in out
.split('\n'):
297 (file, kind
) = line
.split(':')
299 log
.error('failed to split line %s', line
)
301 log
.debug('file %s kind %s', file, kind
)
302 if (file.find('mds') >= 0) and kind
.find('Lost') > 0:
304 log
.error('saw valgrind issue %s in %s', kind
, file)
305 valgrind_exception
= Exception('saw valgrind issues')
307 if config
.get('expect_valgrind_errors'):
308 if not valgrind_exception
:
309 raise Exception('expected valgrind issues and found none')
311 if valgrind_exception
:
312 raise valgrind_exception
@contextlib.contextmanager
def crush_setup(ctx, config):
    """
    Apply the configured crush tunables profile via the first monitor.

    :param ctx: Context
    :param config: Configuration; reads 'cluster' and the optional
                   'crush_tunables' profile name (defaults to 'default').
    """
    cluster_name = config['cluster']
    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    profile = config.get('crush_tunables', 'default')
    log.info('Setting crush tunables to %s', profile)
    mon_remote.run(
        args=['sudo', 'ceph', '--cluster', cluster_name,
              'osd', 'crush', 'tunables', profile])
    yield
@contextlib.contextmanager
def create_rbd_pool(ctx, config):
    """
    Once all OSDs are up, create the 'rbd' pool on the first monitor and
    enable the rbd application on it.

    :param ctx: Context
    :param config: Configuration; reads 'cluster'.
    """
    cluster_name = config['cluster']
    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    log.info('Waiting for OSDs to come up')
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon_remote,
        ceph_cluster=cluster_name,
    )
    log.info('Creating RBD pool')
    mon_remote.run(
        args=['sudo', 'ceph', '--cluster', cluster_name,
              'osd', 'pool', 'create', 'rbd', '8'])
    # NOTE(review): 'osd pool application enable' does not exist on older
    # releases, so this is best-effort (check_status=False) — confirm against
    # upstream before relying on it.
    mon_remote.run(
        args=[
            'sudo', 'ceph', '--cluster', cluster_name,
            'osd', 'pool', 'application', 'enable',
            'rbd', 'rbd', '--yes-i-really-mean-it'
        ],
        check_status=False)
    yield
354 @contextlib.contextmanager
355 def cephfs_setup(ctx
, config
):
356 cluster_name
= config
['cluster']
357 testdir
= teuthology
.get_testdir(ctx
)
358 coverage_dir
= '{tdir}/archive/coverage'.format(tdir
=testdir
)
360 first_mon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
361 (mon_remote
,) = ctx
.cluster
.only(first_mon
).remotes
.iterkeys()
362 mdss
= ctx
.cluster
.only(teuthology
.is_type('mds', cluster_name
))
363 # If there are any MDSs, then create a filesystem for them to use
364 # Do this last because requires mon cluster to be up and running
366 log
.info('Setting up CephFS filesystem...')
368 fs
= Filesystem(ctx
, name
='cephfs', create
=True)
370 is_active_mds
= lambda role
: 'mds.' in role
and not role
.endswith('-s') and '-s-' not in role
371 all_roles
= [item
for remote_roles
in mdss
.remotes
.values() for item
in remote_roles
]
372 num_active
= len([r
for r
in all_roles
if is_active_mds(r
)])
374 fs
.set_max_mds(num_active
)
375 fs
.set_allow_dirfrags(True)
380 @contextlib.contextmanager
381 def cluster(ctx
, config
):
383 Handle the creation and removal of a ceph cluster.
386 Create directories needed for the cluster.
387 Create remote journals for all osds.
388 Create and set keyring.
389 Copy the monmap to the test systems.
393 Add keyring information to monmaps
397 If errors occured, extract a failure message and store in ctx.summary.
398 Unmount all test files and temporary journaling files.
399 Save the monitor information and archive all ceph logs.
400 Cleanup the keyring setup, and remove all monitor map and data files left over.
403 :param config: Configuration
405 if ctx
.config
.get('use_existing_cluster', False) is True:
406 log
.info("'use_existing_cluster' is true; skipping cluster creation")
409 testdir
= teuthology
.get_testdir(ctx
)
410 cluster_name
= config
['cluster']
411 data_dir
= '{tdir}/{cluster}.data'.format(tdir
=testdir
, cluster
=cluster_name
)
412 log
.info('Creating ceph cluster %s...', cluster_name
)
416 'install', '-d', '-m0755', '--',
427 'install', '-d', '-m0777', '--', '/var/run/ceph',
434 remote_to_roles_to_devs
= {}
435 remote_to_roles_to_journals
= {}
436 osds
= ctx
.cluster
.only(teuthology
.is_type('osd', cluster_name
))
437 for remote
, roles_for_host
in osds
.remotes
.iteritems():
438 devs
= teuthology
.get_scratch_devices(remote
)
440 roles_to_journals
= {}
442 log
.info('fs option selected, checking for scratch devs')
443 log
.info('found devs: %s' % (str(devs
),))
444 devs_id_map
= teuthology
.get_wwn_id_map(remote
, devs
)
445 iddevs
= devs_id_map
.values()
446 roles_to_devs
= assign_devs(
447 teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
), iddevs
449 if len(roles_to_devs
) < len(iddevs
):
450 iddevs
= iddevs
[len(roles_to_devs
):]
451 devs_to_clean
[remote
] = []
453 if config
.get('block_journal'):
454 log
.info('block journal enabled')
455 roles_to_journals
= assign_devs(
456 teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
), iddevs
458 log
.info('journal map: %s', roles_to_journals
)
460 if config
.get('tmpfs_journal'):
461 log
.info('tmpfs journal enabled')
462 roles_to_journals
= {}
463 remote
.run(args
=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
464 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
):
465 tmpfs
= '/mnt/' + role
466 roles_to_journals
[role
] = tmpfs
467 remote
.run(args
=['truncate', '-s', '1500M', tmpfs
])
468 log
.info('journal map: %s', roles_to_journals
)
470 log
.info('dev map: %s' % (str(roles_to_devs
),))
471 remote_to_roles_to_devs
[remote
] = roles_to_devs
472 remote_to_roles_to_journals
[remote
] = roles_to_journals
474 log
.info('Generating config...')
475 remotes_and_roles
= ctx
.cluster
.remotes
.items()
476 roles
= [role_list
for (remote
, role_list
) in remotes_and_roles
]
477 ips
= [host
for (host
, port
) in
478 (remote
.ssh
.get_transport().getpeername() for (remote
, role_list
) in remotes_and_roles
)]
479 conf
= teuthology
.skeleton_config(ctx
, roles
=roles
, ips
=ips
, cluster
=cluster_name
)
480 for remote
, roles_to_journals
in remote_to_roles_to_journals
.iteritems():
481 for role
, journal
in roles_to_journals
.iteritems():
482 name
= teuthology
.ceph_role(role
)
485 conf
[name
]['osd journal'] = journal
486 for section
, keys
in config
['conf'].iteritems():
487 for key
, value
in keys
.iteritems():
488 log
.info("[%s] %s = %s" % (section
, key
, value
))
489 if section
not in conf
:
491 conf
[section
][key
] = value
493 if config
.get('tmpfs_journal'):
494 conf
['journal dio'] = False
496 if not hasattr(ctx
, 'ceph'):
498 ctx
.ceph
[cluster_name
] = argparse
.Namespace()
499 ctx
.ceph
[cluster_name
].conf
= conf
501 default_keyring
= '/etc/ceph/{cluster}.keyring'.format(cluster
=cluster_name
)
502 keyring_path
= config
.get('keyring_path', default_keyring
)
504 coverage_dir
= '{tdir}/archive/coverage'.format(tdir
=testdir
)
506 firstmon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
508 log
.info('Setting up %s...' % firstmon
)
509 ctx
.cluster
.only(firstmon
).run(
520 ctx
.cluster
.only(firstmon
).run(
532 ctx
.cluster
.only(firstmon
).run(
540 (mon0_remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
541 monmap_path
= '{tdir}/{cluster}.monmap'.format(tdir
=testdir
,
542 cluster
=cluster_name
)
543 fsid
= teuthology
.create_simple_monmap(
549 if not 'global' in conf
:
551 conf
['global']['fsid'] = fsid
553 default_conf_path
= '/etc/ceph/{cluster}.conf'.format(cluster
=cluster_name
)
554 conf_path
= config
.get('conf_path', default_conf_path
)
555 log
.info('Writing %s for FSID %s...' % (conf_path
, fsid
))
556 write_conf(ctx
, conf_path
, cluster_name
)
558 log
.info('Creating admin key on %s...' % firstmon
)
559 ctx
.cluster
.only(firstmon
).run(
567 '--name=client.admin',
569 '--cap', 'mon', 'allow *',
570 '--cap', 'osd', 'allow *',
571 '--cap', 'mds', 'allow *',
572 '--cap', 'mgr', 'allow *',
577 log
.info('Copying monmap to all nodes...')
578 keyring
= teuthology
.get_file(
582 monmap
= teuthology
.get_file(
587 for rem
in ctx
.cluster
.remotes
.iterkeys():
588 # copy mon key and initial monmap
589 log
.info('Sending monmap to node {remote}'.format(remote
=rem
))
590 teuthology
.sudo_write_file(
596 teuthology
.write_file(
602 log
.info('Setting up mon nodes...')
603 mons
= ctx
.cluster
.only(teuthology
.is_type('mon', cluster_name
))
605 if not config
.get('skip_mgr_daemons', False):
606 log
.info('Setting up mgr nodes...')
607 mgrs
= ctx
.cluster
.only(teuthology
.is_type('mgr', cluster_name
))
608 for remote
, roles_for_host
in mgrs
.remotes
.iteritems():
609 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'mgr',
611 _
, _
, id_
= teuthology
.split_role(role
)
612 mgr_dir
= '/var/lib/ceph/mgr/{cluster}-{id}'.format(
613 cluster
=cluster_name
,
630 '--name=mgr.{id}'.format(id=id_
),
631 mgr_dir
+ '/keyring',
635 log
.info('Setting up mds nodes...')
636 mdss
= ctx
.cluster
.only(teuthology
.is_type('mds', cluster_name
))
637 for remote
, roles_for_host
in mdss
.remotes
.iteritems():
638 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'mds',
640 _
, _
, id_
= teuthology
.split_role(role
)
641 mds_dir
= '/var/lib/ceph/mds/{cluster}-{id}'.format(
642 cluster
=cluster_name
,
659 '--name=mds.{id}'.format(id=id_
),
660 mds_dir
+ '/keyring',
664 cclient
.create_keyring(ctx
, cluster_name
)
665 log
.info('Running mkfs on osd nodes...')
667 if not hasattr(ctx
, 'disk_config'):
668 ctx
.disk_config
= argparse
.Namespace()
669 if not hasattr(ctx
.disk_config
, 'remote_to_roles_to_dev'):
670 ctx
.disk_config
.remote_to_roles_to_dev
= {}
671 if not hasattr(ctx
.disk_config
, 'remote_to_roles_to_journals'):
672 ctx
.disk_config
.remote_to_roles_to_journals
= {}
673 if not hasattr(ctx
.disk_config
, 'remote_to_roles_to_dev_mount_options'):
674 ctx
.disk_config
.remote_to_roles_to_dev_mount_options
= {}
675 if not hasattr(ctx
.disk_config
, 'remote_to_roles_to_dev_fstype'):
676 ctx
.disk_config
.remote_to_roles_to_dev_fstype
= {}
678 teuthology
.deep_merge(ctx
.disk_config
.remote_to_roles_to_dev
, remote_to_roles_to_devs
)
679 teuthology
.deep_merge(ctx
.disk_config
.remote_to_roles_to_journals
, remote_to_roles_to_journals
)
681 log
.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r
=str(ctx
.disk_config
.remote_to_roles_to_dev
)))
682 for remote
, roles_for_host
in osds
.remotes
.iteritems():
683 roles_to_devs
= remote_to_roles_to_devs
[remote
]
684 roles_to_journals
= remote_to_roles_to_journals
[remote
]
686 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
):
687 _
, _
, id_
= teuthology
.split_role(role
)
688 mnt_point
= '/var/lib/ceph/osd/{cluster}-{id}'.format(cluster
=cluster_name
, id=id_
)
696 log
.info(str(roles_to_devs
))
697 log
.info(str(roles_to_journals
))
699 if roles_to_devs
.get(role
):
700 dev
= roles_to_devs
[role
]
701 fs
= config
.get('fs')
703 mkfs_options
= config
.get('mkfs_options')
704 mount_options
= config
.get('mount_options')
706 # package = 'btrfs-tools'
707 if mount_options
is None:
708 mount_options
= ['noatime', 'user_subvol_rm_allowed']
709 if mkfs_options
is None:
710 mkfs_options
= ['-m', 'single',
714 # package = 'xfsprogs'
715 if mount_options
is None:
716 mount_options
= ['noatime']
717 if mkfs_options
is None:
718 mkfs_options
= ['-f', '-i', 'size=2048']
719 if fs
== 'ext4' or fs
== 'ext3':
720 if mount_options
is None:
721 mount_options
= ['noatime', 'user_xattr']
723 if mount_options
is None:
725 if mkfs_options
is None:
727 mkfs
= ['mkfs.%s' % fs
] + mkfs_options
728 log
.info('%s on %s on %s' % (mkfs
, dev
, remote
))
729 if package
is not None:
733 'apt-get', 'install', '-y', package
739 remote
.run(args
=['yes', run
.Raw('|')] + ['sudo'] + mkfs
+ [dev
])
740 except run
.CommandFailedError
:
741 # Newer btfs-tools doesn't prompt for overwrite, use -f
742 if '-f' not in mount_options
:
743 mkfs_options
.append('-f')
744 mkfs
= ['mkfs.%s' % fs
] + mkfs_options
745 log
.info('%s on %s on %s' % (mkfs
, dev
, remote
))
746 remote
.run(args
=['yes', run
.Raw('|')] + ['sudo'] + mkfs
+ [dev
])
748 log
.info('mount %s on %s -o %s' % (dev
, remote
,
749 ','.join(mount_options
)))
755 '-o', ','.join(mount_options
),
762 'sudo', '/sbin/restorecon', mnt_point
,
766 if not remote
in ctx
.disk_config
.remote_to_roles_to_dev_mount_options
:
767 ctx
.disk_config
.remote_to_roles_to_dev_mount_options
[remote
] = {}
768 ctx
.disk_config
.remote_to_roles_to_dev_mount_options
[remote
][role
] = mount_options
769 if not remote
in ctx
.disk_config
.remote_to_roles_to_dev_fstype
:
770 ctx
.disk_config
.remote_to_roles_to_dev_fstype
[remote
] = {}
771 ctx
.disk_config
.remote_to_roles_to_dev_fstype
[remote
][role
] = fs
772 devs_to_clean
[remote
].append(mnt_point
)
774 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
):
775 _
, _
, id_
= teuthology
.split_role(role
)
789 '--monmap', monmap_path
,
793 log
.info('Reading keys from all nodes...')
796 for remote
, roles_for_host
in ctx
.cluster
.remotes
.iteritems():
797 for type_
in ['mgr', 'mds', 'osd']:
798 if type_
== 'mgr' and config
.get('skip_mgr_daemons', False):
800 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, type_
, cluster_name
):
801 _
, _
, id_
= teuthology
.split_role(role
)
802 data
= teuthology
.get_file(
804 path
='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
807 cluster
=cluster_name
,
811 keys
.append((type_
, id_
, data
))
813 for remote
, roles_for_host
in ctx
.cluster
.remotes
.iteritems():
814 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'client', cluster_name
):
815 _
, _
, id_
= teuthology
.split_role(role
)
816 data
= teuthology
.get_file(
818 path
='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_
, cluster
=cluster_name
)
820 keys
.append(('client', id_
, data
))
823 log
.info('Adding keys to all mons...')
834 teuthology
.feed_many_stdins_and_close(keys_fp
, writes
)
836 for type_
, id_
, data
in keys
:
846 '--name={type}.{id}'.format(
850 ] + list(generate_caps(type_
)),
855 log
.info('Running mkfs on mon nodes...')
856 for remote
, roles_for_host
in mons
.remotes
.iteritems():
857 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'mon', cluster_name
):
858 _
, _
, id_
= teuthology
.split_role(role
)
864 '/var/lib/ceph/mon/{cluster}-{id}'.format(id=id_
, cluster
=cluster_name
),
874 '--cluster', cluster_name
,
877 '--monmap', monmap_path
,
878 '--keyring', keyring_path
,
896 # we need to know this below
897 ctx
.summary
['success'] = False
900 (mon0_remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
902 log
.info('Checking cluster log for badness...')
904 def first_in_ceph_log(pattern
, excludes
):
906 Find the first occurrence of the pattern specified in the Ceph log,
907 Returns None if none found.
909 :param pattern: Pattern scanned for.
910 :param excludes: Patterns to ignore.
911 :return: First line of text (or None if not found)
916 '/var/log/ceph/{cluster}.log'.format(cluster
=cluster_name
),
918 for exclude
in excludes
:
919 args
.extend([run
.Raw('|'), 'egrep', '-v', exclude
])
921 run
.Raw('|'), 'head', '-n', '1',
927 stdout
= r
.stdout
.getvalue()
932 if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
933 config
['log_whitelist']) is not None:
934 log
.warning('Found errors (ERR|WRN|SEC) in cluster log')
935 ctx
.summary
['success'] = False
936 # use the most severe problem as the failure reason
937 if 'failure_reason' not in ctx
.summary
:
938 for pattern
in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
939 match
= first_in_ceph_log(pattern
, config
['log_whitelist'])
940 if match
is not None:
941 ctx
.summary
['failure_reason'] = \
942 '"{match}" in cluster log'.format(
943 match
=match
.rstrip('\n'),
947 for remote
, dirs
in devs_to_clean
.iteritems():
949 log
.info('Unmounting %s on %s' % (dir_
, remote
))
961 except Exception as e
:
964 run
.Raw('PATH=/usr/sbin:$PATH'),
971 if config
.get('tmpfs_journal'):
972 log
.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
973 for remote
, roles_for_host
in osds
.remotes
.iteritems():
975 args
=['sudo', 'umount', '-f', '/mnt'],
979 if ctx
.archive
is not None and \
980 not (ctx
.config
.get('archive-on-error') and ctx
.summary
['success']):
982 # archive mon data, too
983 log
.info('Archiving mon data...')
984 path
= os
.path
.join(ctx
.archive
, 'data')
988 if e
.errno
== errno
.EEXIST
:
992 for remote
, roles
in mons
.remotes
.iteritems():
994 is_mon
= teuthology
.is_type('mon', cluster_name
)
996 _
, _
, id_
= teuthology
.split_role(role
)
997 mon_dir
= '/var/lib/ceph/mon/' + \
998 '{0}-{1}'.format(cluster_name
, id_
)
999 teuthology
.pull_directory_tarball(
1002 path
+ '/' + role
+ '.tgz')
1004 log
.info('Cleaning ceph cluster...')
1016 run
.Raw('{tdir}/../*.pid'.format(tdir
=testdir
)),
1023 def osd_scrub_pgs(ctx
, config
):
1025 Scrub pgs when we exit.
1027 First make sure all pgs are active and clean.
1028 Next scrub all osds.
1029 Then periodically check until all pgs have scrub time stamps that
1030 indicate the last scrub completed. Time out if no progress is made
1031 here after two minutes.
1035 cluster_name
= config
['cluster']
1036 manager
= ctx
.managers
[cluster_name
]
1038 for _
in range(0, retries
):
1039 stats
= manager
.get_pg_stats()
1040 bad
= [stat
['pgid'] for stat
in stats
if 'active+clean' not in stat
['state']]
1045 "Waiting for all PGs to be active and clean, waiting on %s" % bad
)
1048 raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
1049 check_time_now
= time
.localtime()
1051 all_roles
= teuthology
.all_roles(ctx
.cluster
)
1052 for role
in teuthology
.cluster_roles_of_type(all_roles
, 'osd', cluster_name
):
1053 log
.info("Scrubbing {osd}".format(osd
=role
))
1054 _
, _
, id_
= teuthology
.split_role(role
)
1055 # allow this to fail; in certain cases the OSD might not be up
1056 # at this point. we will catch all pgs below.
1058 manager
.raw_cluster_cmd('osd', 'deep-scrub', id_
)
1059 except run
.CommandFailedError
:
1065 stats
= manager
.get_pg_stats()
1066 timez
= [(stat
['pgid'],stat
['last_scrub_stamp']) for stat
in stats
]
1069 for (pgid
, tmval
) in timez
:
1070 pgtm
= time
.strptime(tmval
[0:tmval
.find('.')], '%Y-%m-%d %H:%M:%S')
1071 if pgtm
> check_time_now
:
1074 log
.info('pgid %s last_scrub_stamp %s %s <= %s', pgid
, tmval
, pgtm
, check_time_now
)
1076 if thiscnt
> prev_good
:
1081 if gap_cnt
% 6 == 0:
1082 for (pgid
, tmval
) in timez
:
1083 # re-request scrub every so often in case the earlier
1084 # request was missed. do not do it everytime because
1085 # the scrub may be in progress or not reported yet and
1086 # we will starve progress.
1087 manager
.raw_cluster_cmd('pg', 'deep-scrub', pgid
)
1088 if gap_cnt
> retries
:
1089 raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
1091 log
.info('Still waiting for all pgs to be scrubbed.')
1095 @contextlib.contextmanager
1096 def run_daemon(ctx
, config
, type_
):
1098 Run daemons for a role type. Handle the startup and termination of a daemon.
1099 On startup -- set coverages, cpu_profile, valgrind values for all remotes,
1100 and a max_mds value for one mds.
1101 On cleanup -- Stop all existing daemons of this type.
1104 :param config: Configuration
1105 :param type_: Role type
1107 cluster_name
= config
['cluster']
1108 log
.info('Starting %s daemons in cluster %s...', type_
, cluster_name
)
1109 testdir
= teuthology
.get_testdir(ctx
)
1110 daemons
= ctx
.cluster
.only(teuthology
.is_type(type_
, cluster_name
))
1112 # check whether any daemons if this type are configured
1115 coverage_dir
= '{tdir}/archive/coverage'.format(tdir
=testdir
)
1117 daemon_signal
= 'kill'
1118 if config
.get('coverage') or config
.get('valgrind') is not None:
1119 daemon_signal
= 'term'
1121 # create osds in order. (this only matters for pre-luminous, which might
1122 # be hammer, which doesn't take an id_ argument to legacy 'osd create').
1124 for remote
, roles_for_host
in daemons
.remotes
.iteritems():
1125 is_type_
= teuthology
.is_type(type_
, cluster_name
)
1126 for role
in roles_for_host
:
1127 if not is_type_(role
):
1129 _
, _
, id_
= teuthology
.split_role(role
)
1133 datadir
='/var/lib/ceph/osd/{cluster}-{id}'.format(
1134 cluster
=cluster_name
, id=id_
)
1135 osd_uuid
= teuthology
.get_file(
1137 path
=datadir
+ '/fsid',
1140 osd_uuids
[id_
] = osd_uuid
1141 for osd_id
in range(len(osd_uuids
)):
1143 osd_uuid
= osd_uuids
.get(id_
)
1147 'sudo', 'ceph', '--cluster', cluster_name
,
1148 'osd', 'new', osd_uuid
, id_
,
1152 # fallback to pre-luminous (hammer or jewel)
1155 'sudo', 'ceph', '--cluster', cluster_name
,
1156 'osd', 'create', osd_uuid
,
1159 if config
.get('add_osds_to_crush'):
1162 'sudo', 'ceph', '--cluster', cluster_name
,
1163 'osd', 'crush', 'create-or-move', 'osd.' + id_
,
1164 '1.0', 'host=localhost', 'root=default',
1168 for remote
, roles_for_host
in daemons
.remotes
.iteritems():
1169 is_type_
= teuthology
.is_type(type_
, cluster_name
)
1170 for role
in roles_for_host
:
1171 if not is_type_(role
):
1173 _
, _
, id_
= teuthology
.split_role(role
)
1184 'ceph-%s' % (type_
),
1186 '--cluster', cluster_name
,
1189 if type_
in config
.get('cpu_profile', []):
1190 profile_path
= '/var/log/ceph/profiling-logger/%s.prof' % (role
)
1191 run_cmd
.extend(['env', 'CPUPROFILE=%s' % profile_path
])
1193 if config
.get('valgrind') is not None:
1194 valgrind_args
= None
1195 if type_
in config
['valgrind']:
1196 valgrind_args
= config
['valgrind'][type_
]
1197 if role
in config
['valgrind']:
1198 valgrind_args
= config
['valgrind'][role
]
1199 run_cmd
= teuthology
.get_valgrind_args(testdir
, role
,
1203 run_cmd
.extend(run_cmd_tail
)
1205 # always register mgr; don't necessarily start
1206 ctx
.daemons
.register_daemon(
1208 cluster
=cluster_name
,
1210 logger
=log
.getChild(role
),
1214 if type_
!= 'mgr' or not config
.get('skip_mgr_daemons', False):
1215 role
= cluster_name
+ '.' + type_
1216 ctx
.daemons
.get_daemon(type_
, id_
, cluster_name
).restart()
1221 teuthology
.stop_daemons_of_type(ctx
, type_
, cluster_name
)
def healthy(ctx, config):
    """
    Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK.

    :param ctx: Context
    :param config: Configuration
    """
    config = config if isinstance(config, dict) else dict()
    cluster_name = config.get('cluster', 'ceph')
    log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
    manager = ctx.managers[cluster_name]
    try:
        manager.wait_for_mgr_available(timeout=30)
    except (run.CommandFailedError, AssertionError) as e:
        # a mixed-version cluster may predate ceph-mgr; best-effort only
        log.info('ignoring mgr wait error, probably testing upgrade: %s', e)

    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )

    try:
        manager.flush_all_pg_stats()
    except (run.CommandFailedError, Exception) as e:
        # older daemons may not support this command; best-effort only
        log.info('ignoring flush pg stats error, probably testing upgrade: %s', e)
    manager.wait_for_clean()

    log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
    teuthology.wait_until_healthy(
        ctx,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )

    if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
        # Some MDSs exist, wait for them to be healthy
        ceph_fs = Filesystem(ctx)  # TODO: make Filesystem cluster-aware
        ceph_fs.wait_for_daemons(timeout=300)
def wait_for_osds_up(ctx, config):
    """
    Wait for all osd's to come up.

    :param ctx: Context
    :param config: Configuration; reads the optional 'cluster' name
                   (defaults to 'ceph').
    """
    log.info('Waiting until ceph osds are all up...')
    cluster_name = config.get('cluster', 'ceph')
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )
1286 def wait_for_mon_quorum(ctx
, config
):
1288 Check remote ceph status until all monitors are up.
1291 :param config: Configuration
1293 if isinstance(config
, dict):
1294 mons
= config
['daemons']
1295 cluster_name
= config
.get('cluster', 'ceph')
1297 assert isinstance(config
, list)
1299 cluster_name
= 'ceph'
1300 firstmon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
1301 (remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
1302 with contextutil
.safe_while(sleep
=10, tries
=60,
1303 action
='wait for monitor quorum') as proceed
:
1312 logger
=log
.getChild('quorum_status'),
1314 j
= json
.loads(r
.stdout
.getvalue())
1315 q
= j
.get('quorum_names', [])
1316 log
.debug('Quorum: %s', q
)
1317 if sorted(q
) == sorted(mons
):
def created_pool(ctx, config):
    """
    Add new pools to the dictionary of pools that the ceph-manager
    knows about.

    :param ctx: Context
    :param config: Iterable of pool names to register.
    """
    for new_pool in config:
        if new_pool not in ctx.managers['ceph'].pools:
            # record the pg count so ceph_manager can reason about this pool
            ctx.managers['ceph'].pools[new_pool] = ctx.managers['ceph'].get_pool_property(
                new_pool, 'pg_num')
1332 @contextlib.contextmanager
1333 def restart(ctx
, config
):
1335 restart ceph daemons
1339 - ceph.restart: [all]
1343 - ceph.restart: [osd.0, mon.1, mds.*]
1349 daemons: [osd.0, mon.1]
1350 wait-for-healthy: false
1351 wait-for-osds-up: true
1354 :param config: Configuration
1358 elif isinstance(config
, list):
1359 config
= {'daemons': config
}
1361 daemons
= ctx
.daemons
.resolve_role_list(config
.get('daemons', None), CEPH_ROLE_TYPES
, True)
1363 for role
in daemons
:
1364 cluster
, type_
, id_
= teuthology
.split_role(role
)
1365 ctx
.daemons
.get_daemon(type_
, id_
, cluster
).restart()
1366 clusters
.add(cluster
)
1368 manager
= ctx
.managers
['ceph']
1369 for dmon
in daemons
:
1371 dm_parts
= dmon
.split('.')
1372 if dm_parts
[1].isdigit():
1373 if dm_parts
[0] == 'osd':
1374 manager
.mark_down_osd(int(dm_parts
[1]))
1376 if config
.get('wait-for-healthy', True):
1377 for cluster
in clusters
:
1378 healthy(ctx
=ctx
, config
=dict(cluster
=cluster
))
1379 if config
.get('wait-for-osds-up', False):
1380 for cluster
in clusters
:
1381 wait_for_osds_up(ctx
=ctx
, config
=dict(cluster
=cluster
))
1385 @contextlib.contextmanager
1386 def stop(ctx
, config
):
1392 - ceph.stop: [mds.*]
1395 - ceph.stop: [osd.0, osd.2]
1399 daemons: [osd.0, osd.2]
1404 elif isinstance(config
, list):
1405 config
= {'daemons': config
}
1407 daemons
= ctx
.daemons
.resolve_role_list(config
.get('daemons', None), CEPH_ROLE_TYPES
, True)
1408 for role
in daemons
:
1409 cluster
, type_
, id_
= teuthology
.split_role(role
)
1410 ctx
.daemons
.get_daemon(type_
, id_
, cluster
).stop()
@contextlib.contextmanager
def wait_for_failure(ctx, config):
    """
    Wait for a failure of a ceph daemon

    For example::
      tasks:
      - ceph.wait_for_failure: [mds.*]

      tasks:
      - ceph.wait_for_failure: [osd.0, osd.2]

      tasks:
      - ceph.wait_for_failure:
          daemons: [osd.0, osd.2]

    :param ctx: Context
    :param config: Configuration
    :raises RuntimeError: if a listed daemon exits cleanly instead of failing
    """
    # Normalize the config: a bare list is shorthand for {'daemons': [...]}.
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(
        config.get('daemons', None), CEPH_ROLE_TYPES, True)
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        try:
            # wait() raises when the daemon exits with a failure, which is
            # exactly what this task expects to observe.
            ctx.daemons.get_daemon(type_, id_, cluster).wait()
        except Exception:
            # Narrowed from a bare 'except:' so that KeyboardInterrupt /
            # SystemExit are not mistaken for an expected daemon failure.
            log.info('Saw expected daemon failure. Continuing.')
        else:
            # wait() returned normally: the daemon exited cleanly, so the
            # expected failure never happened.
            raise RuntimeError('daemon %s did not fail' % role)

    yield
def validate_config(ctx, config):
    """
    Perform some simple validation on task configuration.
    Raises exceptions.ConfigError if an error is found.
    """
    # check for osds from multiple clusters on the same host
    for remote, roles_for_host in ctx.cluster.remotes.items():
        prev_cluster = None
        prev_role = None
        for role in roles_for_host:
            role_cluster, role_type, _ = teuthology.split_role(role)
            if role_type != 'osd':
                # Only osds are constrained to a single cluster per host.
                continue
            if prev_cluster and prev_cluster != role_cluster:
                msg = "Host should not have osds (%s and %s) from multiple clusters" % (
                    prev_role, role)
                raise exceptions.ConfigError(msg)
            prev_cluster = role_cluster
            prev_role = role
1472 @contextlib.contextmanager
1473 def task(ctx
, config
):
1475 Set up and tear down a Ceph cluster.
1483 You can also specify what branch to run::
1499 sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed
1501 Or a local source dir::
1505 path: /home/sage/ceph
1507 To capture code coverage data, use::
1513 To use btrfs, ext4, or xfs on the target's scratch disks, use::
1518 mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
1519 mount_options: [nobarrier, inode64]
1521 Note, this will cause the task to check the /scratch_devs file on each node
1522 for available devices. If no such file is found, /dev/sdb will be used.
1524 To run some daemons under valgrind, include their names
1525 and the tool/args to use in a valgrind section::
1530 mds.1: --tool=memcheck
1531 osd.1: [--tool=memcheck, --leak-check=no]
1533 Those nodes which are using memcheck or valgrind will get
1534 checked for bad results.
1536 To adjust or modify config options, use::
1551 other key: other value
1556 By default, the cluster log is checked for errors and warnings,
1557 and the run marked failed if any appear. You can ignore log
1558 entries by giving a list of egrep compatible regexes, i.e.:
1562 log-whitelist: ['foo.*bar', 'bad message']
1564 To run multiple ceph clusters, use multiple ceph tasks, and roles
1565 with a cluster name prefix, e.g. cluster1.client.0. Roles with no
1566 cluster use the default cluster name, 'ceph'. OSDs from separate
1567 clusters must be on separate hosts. Clients and non-osd daemons
1568 from multiple clusters may be colocated. For each cluster, add an
1569 instance of the ceph task with the cluster name specified, e.g.::
1572 - [mon.a, osd.0, osd.1]
1573 - [backup.mon.a, backup.osd.0, backup.osd.1]
1574 - [client.0, backup.client.0]
1582 :param config: Configuration
1587 assert isinstance(config
, dict), \
1588 "task ceph only supports a dictionary for configuration"
1590 overrides
= ctx
.config
.get('overrides', {})
1591 teuthology
.deep_merge(config
, overrides
.get('ceph', {}))
1593 first_ceph_cluster
= False
1594 if not hasattr(ctx
, 'daemons'):
1595 first_ceph_cluster
= True
1596 ctx
.daemons
= DaemonGroup()
1598 testdir
= teuthology
.get_testdir(ctx
)
1599 if config
.get('coverage'):
1600 coverage_dir
= '{tdir}/archive/coverage'.format(tdir
=testdir
)
1601 log
.info('Creating coverage directory...')
1605 'install', '-d', '-m0755', '--',
1612 if 'cluster' not in config
:
1613 config
['cluster'] = 'ceph'
1615 validate_config(ctx
, config
)
1618 if first_ceph_cluster
:
1619 # these tasks handle general log setup and parsing on all hosts,
1620 # so they should only be run once
1622 lambda: ceph_log(ctx
=ctx
, config
=None),
1623 lambda: valgrind_post(ctx
=ctx
, config
=config
),
1627 lambda: cluster(ctx
=ctx
, config
=dict(
1628 conf
=config
.get('conf', {}),
1629 fs
=config
.get('fs', 'xfs'),
1630 mkfs_options
=config
.get('mkfs_options', None),
1631 mount_options
=config
.get('mount_options', None),
1632 block_journal
=config
.get('block_journal', None),
1633 tmpfs_journal
=config
.get('tmpfs_journal', None),
1634 skip_mgr_daemons
=config
.get('skip_mgr_daemons', False),
1635 log_whitelist
=config
.get('log-whitelist', []),
1636 cpu_profile
=set(config
.get('cpu_profile', []),),
1637 cluster
=config
['cluster'],
1639 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='mon'),
1640 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='mgr'),
1641 lambda: crush_setup(ctx
=ctx
, config
=config
),
1642 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='osd'),
1643 lambda: create_rbd_pool(ctx
=ctx
, config
=config
),
1644 lambda: cephfs_setup(ctx
=ctx
, config
=config
),
1645 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='mds'),
1648 with contextutil
.nested(*subtasks
):
1649 first_mon
= teuthology
.get_first_mon(ctx
, config
, config
['cluster'])
1650 (mon
,) = ctx
.cluster
.only(first_mon
).remotes
.iterkeys()
1651 if not hasattr(ctx
, 'managers'):
1653 ctx
.managers
[config
['cluster']] = CephManager(
1656 logger
=log
.getChild('ceph_manager.' + config
['cluster']),
1657 cluster
=config
['cluster'],
1661 if config
.get('wait-for-healthy', True):
1662 healthy(ctx
=ctx
, config
=dict(cluster
=config
['cluster']))
1666 if config
.get('wait-for-scrub', True):
1667 osd_scrub_pgs(ctx
, config
)
1669 # stop logging health to clog during shutdown, or else we generate
1670 # a bunch of scary messages unrelated to our actual run.
1671 firstmon
= teuthology
.get_first_mon(ctx
, config
, config
['cluster'])
1672 (mon0_remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
1677 '--cluster', config
['cluster'],
1682 '--no-mon-health-to-clog',