4 Handle the setup, starting, and clean-up of a Ceph cluster.
6 from cStringIO
import StringIO
18 from paramiko
import SSHException
19 from ceph_manager
import CephManager
, write_conf
20 from tasks
.cephfs
.filesystem
import Filesystem
21 from teuthology
import misc
as teuthology
22 from teuthology
import contextutil
23 from teuthology
import exceptions
24 from teuthology
.orchestra
import run
25 import ceph_client
as cclient
26 from teuthology
.orchestra
.daemon
import DaemonGroup
28 CEPH_ROLE_TYPES
= ['mon', 'mgr', 'osd', 'mds', 'rgw']
30 log
= logging
.getLogger(__name__
)
33 def generate_caps(type_
):
35 Each call will return the next capability for each system type
36 (essentially a subset of possible role values). Valid types are osd,
46 mon
='allow profile mgr',
63 for subsystem
, capability
in defaults
[type_
].items():
69 @contextlib.contextmanager
70 def ceph_log(ctx
, config
):
72 Create /var/log/ceph log directory that is open to everyone.
73 Add valgrind and profiling-logger directories.
76 :param config: Configuration
78 log
.info('Making ceph log dir writeable by non-root...')
90 log
.info('Disabling ceph logrotate...')
96 '/etc/logrotate.d/ceph',
101 log
.info('Creating extra log directories...')
106 'install', '-d', '-m0777', '--',
107 '/var/log/ceph/valgrind',
108 '/var/log/ceph/profiling-logger',
114 class Rotater(object):
115 stop_event
= gevent
.event
.Event()
117 def invoke_logrotate(self
):
118 # 1) install ceph-test.conf in /etc/logrotate.d
119 # 2) continuously loop over logrotate invocation with ceph-test.conf
120 while not self
.stop_event
.is_set():
121 self
.stop_event
.wait(timeout
=30)
125 args
=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'
130 except exceptions
.ConnectionLostError
as e
:
131 # Some tests may power off nodes during test, in which
132 # case we will see connection errors that we should ignore.
133 log
.debug("Missed logrotate, node '{0}' is offline".format(
135 except EOFError as e
:
136 # Paramiko sometimes raises this when it fails to
137 # connect to a node during open_session. As with
138 # ConnectionLostError, we ignore this because nodes
139 # are allowed to get power cycled during tests.
140 log
.debug("Missed logrotate, EOFError")
141 except SSHException
as e
:
142 log
.debug("Missed logrotate, SSHException")
143 except socket
.error
as e
:
144 if e
.errno
== errno
.EHOSTUNREACH
:
145 log
.debug("Missed logrotate, host unreachable")
150 self
.thread
= gevent
.spawn(self
.invoke_logrotate
)
153 self
.stop_event
.set()
156 def write_rotate_conf(ctx
, daemons
):
157 testdir
= teuthology
.get_testdir(ctx
)
158 rotate_conf_path
= os
.path
.join(os
.path
.dirname(__file__
), 'logrotate.conf')
159 with
file(rotate_conf_path
, 'rb') as f
:
161 for daemon
, size
in daemons
.iteritems():
162 log
.info('writing logrotate stanza for {daemon}'.format(daemon
=daemon
))
163 conf
+= f
.read().format(daemon_type
=daemon
, max_size
=size
)
166 for remote
in ctx
.cluster
.remotes
.iterkeys():
167 teuthology
.write_file(remote
=remote
,
168 path
='{tdir}/logrotate.ceph-test.conf'.format(tdir
=testdir
),
175 '{tdir}/logrotate.ceph-test.conf'.format(tdir
=testdir
),
176 '/etc/logrotate.d/ceph-test.conf',
181 '/etc/logrotate.d/ceph-test.conf',
186 '/etc/logrotate.d/ceph-test.conf'
189 remote
.chcon('/etc/logrotate.d/ceph-test.conf',
190 'system_u:object_r:etc_t:s0')
192 if ctx
.config
.get('log-rotate'):
193 daemons
= ctx
.config
.get('log-rotate')
194 log
.info('Setting up log rotation with ' + str(daemons
))
195 write_rotate_conf(ctx
, daemons
)
196 logrotater
= Rotater()
202 if ctx
.config
.get('log-rotate'):
203 log
.info('Shutting down logrotate')
206 args
=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'
209 if ctx
.archive
is not None and \
210 not (ctx
.config
.get('archive-on-error') and ctx
.summary
['success']):
212 log
.info('Compressing logs...')
235 log
.info('Archiving logs...')
236 path
= os
.path
.join(ctx
.archive
, 'remote')
238 for remote
in ctx
.cluster
.remotes
.iterkeys():
239 sub
= os
.path
.join(path
, remote
.shortname
)
241 teuthology
.pull_directory(remote
, '/var/log/ceph',
242 os
.path
.join(sub
, 'log'))
def assign_devs(roles, devs):
    """
    Pair each role with a device, one-to-one, in list order.

    Extra entries in the longer of the two lists are silently dropped
    (plain zip semantics); callers handle any leftover devices themselves.

    :param roles: List of roles
    :param devs: Corresponding list of devices.
    :returns: Dictionary of devs indexed by roles.
    """
    return {role: dev for role, dev in zip(roles, devs)}
256 @contextlib.contextmanager
257 def valgrind_post(ctx
, config
):
259 After the tests run, look through all the valgrind logs. Exceptions are raised
260 if textual errors occurred in the logs, or if valgrind exceptions were detected in
264 :param config: Configuration
269 lookup_procs
= list()
270 log
.info('Checking for errors in any valgrind logs...')
271 for remote
in ctx
.cluster
.remotes
.iterkeys():
272 # look at valgrind logs for each node
278 run
.Raw('/var/log/ceph/valgrind/*'),
279 '/dev/null', # include a second file so that we always get a filename prefix on the output
289 lookup_procs
.append((proc
, remote
))
291 valgrind_exception
= None
292 for (proc
, remote
) in lookup_procs
:
294 out
= proc
.stdout
.getvalue()
295 for line
in out
.split('\n'):
299 (file, kind
) = line
.split(':')
301 log
.error('failed to split line %s', line
)
303 log
.debug('file %s kind %s', file, kind
)
304 if (file.find('mds') >= 0) and kind
.find('Lost') > 0:
306 log
.error('saw valgrind issue %s in %s', kind
, file)
307 valgrind_exception
= Exception('saw valgrind issues')
309 if config
.get('expect_valgrind_errors'):
310 if not valgrind_exception
:
311 raise Exception('expected valgrind issues and found none')
313 if valgrind_exception
:
314 raise valgrind_exception
317 @contextlib.contextmanager
318 def crush_setup(ctx
, config
):
319 cluster_name
= config
['cluster']
320 first_mon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
321 (mon_remote
,) = ctx
.cluster
.only(first_mon
).remotes
.iterkeys()
323 profile
= config
.get('crush_tunables', 'default')
324 log
.info('Setting crush tunables to %s', profile
)
326 args
=['sudo', 'ceph', '--cluster', cluster_name
,
327 'osd', 'crush', 'tunables', profile
])
331 @contextlib.contextmanager
332 def create_rbd_pool(ctx
, config
):
333 cluster_name
= config
['cluster']
334 first_mon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
335 (mon_remote
,) = ctx
.cluster
.only(first_mon
).remotes
.iterkeys()
336 log
.info('Waiting for OSDs to come up')
337 teuthology
.wait_until_osds_up(
341 ceph_cluster
=cluster_name
,
343 if config
.get('create_rbd_pool', True):
344 log
.info('Creating RBD pool')
346 args
=['sudo', 'ceph', '--cluster', cluster_name
,
347 'osd', 'pool', 'create', 'rbd', '8'])
350 'sudo', 'ceph', '--cluster', cluster_name
,
351 'osd', 'pool', 'application', 'enable',
352 'rbd', 'rbd', '--yes-i-really-mean-it'
357 @contextlib.contextmanager
358 def cephfs_setup(ctx
, config
):
359 cluster_name
= config
['cluster']
360 testdir
= teuthology
.get_testdir(ctx
)
361 coverage_dir
= '{tdir}/archive/coverage'.format(tdir
=testdir
)
363 first_mon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
364 (mon_remote
,) = ctx
.cluster
.only(first_mon
).remotes
.iterkeys()
365 mdss
= ctx
.cluster
.only(teuthology
.is_type('mds', cluster_name
))
366 # If there are any MDSs, then create a filesystem for them to use
367 # Do this last because requires mon cluster to be up and running
369 log
.info('Setting up CephFS filesystem...')
371 fs
= Filesystem(ctx
, name
='cephfs', create
=True,
372 ec_profile
=config
.get('cephfs_ec_profile', None))
374 is_active_mds
= lambda role
: 'mds.' in role
and not role
.endswith('-s') and '-s-' not in role
375 all_roles
= [item
for remote_roles
in mdss
.remotes
.values() for item
in remote_roles
]
376 num_active
= len([r
for r
in all_roles
if is_active_mds(r
)])
378 fs
.set_max_mds(num_active
)
379 fs
.set_allow_dirfrags(True)
384 @contextlib.contextmanager
385 def cluster(ctx
, config
):
387 Handle the creation and removal of a ceph cluster.
390 Create directories needed for the cluster.
391 Create remote journals for all osds.
392 Create and set keyring.
393 Copy the monmap to the test systems.
397 Add keyring information to monmaps
401 If errors occurred, extract a failure message and store in ctx.summary.
402 Unmount all test files and temporary journaling files.
403 Save the monitor information and archive all ceph logs.
404 Cleanup the keyring setup, and remove all monitor map and data files left over.
407 :param config: Configuration
409 if ctx
.config
.get('use_existing_cluster', False) is True:
410 log
.info("'use_existing_cluster' is true; skipping cluster creation")
413 testdir
= teuthology
.get_testdir(ctx
)
414 cluster_name
= config
['cluster']
415 data_dir
= '{tdir}/{cluster}.data'.format(tdir
=testdir
, cluster
=cluster_name
)
416 log
.info('Creating ceph cluster %s...', cluster_name
)
420 'install', '-d', '-m0755', '--',
431 'install', '-d', '-m0777', '--', '/var/run/ceph',
438 remote_to_roles_to_devs
= {}
439 remote_to_roles_to_journals
= {}
440 osds
= ctx
.cluster
.only(teuthology
.is_type('osd', cluster_name
))
441 for remote
, roles_for_host
in osds
.remotes
.iteritems():
442 devs
= teuthology
.get_scratch_devices(remote
)
444 roles_to_journals
= {}
446 log
.info('fs option selected, checking for scratch devs')
447 log
.info('found devs: %s' % (str(devs
),))
448 devs_id_map
= teuthology
.get_wwn_id_map(remote
, devs
)
449 iddevs
= devs_id_map
.values()
450 roles_to_devs
= assign_devs(
451 teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
), iddevs
453 if len(roles_to_devs
) < len(iddevs
):
454 iddevs
= iddevs
[len(roles_to_devs
):]
455 devs_to_clean
[remote
] = []
457 if config
.get('block_journal'):
458 log
.info('block journal enabled')
459 roles_to_journals
= assign_devs(
460 teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
), iddevs
462 log
.info('journal map: %s', roles_to_journals
)
464 if config
.get('tmpfs_journal'):
465 log
.info('tmpfs journal enabled')
466 roles_to_journals
= {}
467 remote
.run(args
=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
468 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
):
469 tmpfs
= '/mnt/' + role
470 roles_to_journals
[role
] = tmpfs
471 remote
.run(args
=['truncate', '-s', '1500M', tmpfs
])
472 log
.info('journal map: %s', roles_to_journals
)
474 log
.info('dev map: %s' % (str(roles_to_devs
),))
475 remote_to_roles_to_devs
[remote
] = roles_to_devs
476 remote_to_roles_to_journals
[remote
] = roles_to_journals
478 log
.info('Generating config...')
479 remotes_and_roles
= ctx
.cluster
.remotes
.items()
480 roles
= [role_list
for (remote
, role_list
) in remotes_and_roles
]
481 ips
= [host
for (host
, port
) in
482 (remote
.ssh
.get_transport().getpeername() for (remote
, role_list
) in remotes_and_roles
)]
483 conf
= teuthology
.skeleton_config(ctx
, roles
=roles
, ips
=ips
, cluster
=cluster_name
)
484 for remote
, roles_to_journals
in remote_to_roles_to_journals
.iteritems():
485 for role
, journal
in roles_to_journals
.iteritems():
486 name
= teuthology
.ceph_role(role
)
489 conf
[name
]['osd journal'] = journal
490 for section
, keys
in config
['conf'].iteritems():
491 for key
, value
in keys
.iteritems():
492 log
.info("[%s] %s = %s" % (section
, key
, value
))
493 if section
not in conf
:
495 conf
[section
][key
] = value
497 if config
.get('tmpfs_journal'):
498 conf
['journal dio'] = False
500 if not hasattr(ctx
, 'ceph'):
502 ctx
.ceph
[cluster_name
] = argparse
.Namespace()
503 ctx
.ceph
[cluster_name
].conf
= conf
505 default_keyring
= '/etc/ceph/{cluster}.keyring'.format(cluster
=cluster_name
)
506 keyring_path
= config
.get('keyring_path', default_keyring
)
508 coverage_dir
= '{tdir}/archive/coverage'.format(tdir
=testdir
)
510 firstmon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
512 log
.info('Setting up %s...' % firstmon
)
513 ctx
.cluster
.only(firstmon
).run(
524 ctx
.cluster
.only(firstmon
).run(
536 ctx
.cluster
.only(firstmon
).run(
544 (mon0_remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
545 monmap_path
= '{tdir}/{cluster}.monmap'.format(tdir
=testdir
,
546 cluster
=cluster_name
)
547 fsid
= teuthology
.create_simple_monmap(
553 if not 'global' in conf
:
555 conf
['global']['fsid'] = fsid
557 default_conf_path
= '/etc/ceph/{cluster}.conf'.format(cluster
=cluster_name
)
558 conf_path
= config
.get('conf_path', default_conf_path
)
559 log
.info('Writing %s for FSID %s...' % (conf_path
, fsid
))
560 write_conf(ctx
, conf_path
, cluster_name
)
562 log
.info('Creating admin key on %s...' % firstmon
)
563 ctx
.cluster
.only(firstmon
).run(
571 '--name=client.admin',
573 '--cap', 'mon', 'allow *',
574 '--cap', 'osd', 'allow *',
575 '--cap', 'mds', 'allow *',
576 '--cap', 'mgr', 'allow *',
581 log
.info('Copying monmap to all nodes...')
582 keyring
= teuthology
.get_file(
586 monmap
= teuthology
.get_file(
591 for rem
in ctx
.cluster
.remotes
.iterkeys():
592 # copy mon key and initial monmap
593 log
.info('Sending monmap to node {remote}'.format(remote
=rem
))
594 teuthology
.sudo_write_file(
600 teuthology
.write_file(
606 log
.info('Setting up mon nodes...')
607 mons
= ctx
.cluster
.only(teuthology
.is_type('mon', cluster_name
))
609 if not config
.get('skip_mgr_daemons', False):
610 log
.info('Setting up mgr nodes...')
611 mgrs
= ctx
.cluster
.only(teuthology
.is_type('mgr', cluster_name
))
612 for remote
, roles_for_host
in mgrs
.remotes
.iteritems():
613 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'mgr',
615 _
, _
, id_
= teuthology
.split_role(role
)
616 mgr_dir
= '/var/lib/ceph/mgr/{cluster}-{id}'.format(
617 cluster
=cluster_name
,
634 '--name=mgr.{id}'.format(id=id_
),
635 mgr_dir
+ '/keyring',
639 log
.info('Setting up mds nodes...')
640 mdss
= ctx
.cluster
.only(teuthology
.is_type('mds', cluster_name
))
641 for remote
, roles_for_host
in mdss
.remotes
.iteritems():
642 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'mds',
644 _
, _
, id_
= teuthology
.split_role(role
)
645 mds_dir
= '/var/lib/ceph/mds/{cluster}-{id}'.format(
646 cluster
=cluster_name
,
663 '--name=mds.{id}'.format(id=id_
),
664 mds_dir
+ '/keyring',
668 cclient
.create_keyring(ctx
, cluster_name
)
669 log
.info('Running mkfs on osd nodes...')
671 if not hasattr(ctx
, 'disk_config'):
672 ctx
.disk_config
= argparse
.Namespace()
673 if not hasattr(ctx
.disk_config
, 'remote_to_roles_to_dev'):
674 ctx
.disk_config
.remote_to_roles_to_dev
= {}
675 if not hasattr(ctx
.disk_config
, 'remote_to_roles_to_journals'):
676 ctx
.disk_config
.remote_to_roles_to_journals
= {}
677 if not hasattr(ctx
.disk_config
, 'remote_to_roles_to_dev_mount_options'):
678 ctx
.disk_config
.remote_to_roles_to_dev_mount_options
= {}
679 if not hasattr(ctx
.disk_config
, 'remote_to_roles_to_dev_fstype'):
680 ctx
.disk_config
.remote_to_roles_to_dev_fstype
= {}
682 teuthology
.deep_merge(ctx
.disk_config
.remote_to_roles_to_dev
, remote_to_roles_to_devs
)
683 teuthology
.deep_merge(ctx
.disk_config
.remote_to_roles_to_journals
, remote_to_roles_to_journals
)
685 log
.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r
=str(ctx
.disk_config
.remote_to_roles_to_dev
)))
686 for remote
, roles_for_host
in osds
.remotes
.iteritems():
687 roles_to_devs
= remote_to_roles_to_devs
[remote
]
688 roles_to_journals
= remote_to_roles_to_journals
[remote
]
690 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
):
691 _
, _
, id_
= teuthology
.split_role(role
)
692 mnt_point
= '/var/lib/ceph/osd/{cluster}-{id}'.format(cluster
=cluster_name
, id=id_
)
700 log
.info(str(roles_to_devs
))
701 log
.info(str(roles_to_journals
))
703 if roles_to_devs
.get(role
):
704 dev
= roles_to_devs
[role
]
705 fs
= config
.get('fs')
707 mkfs_options
= config
.get('mkfs_options')
708 mount_options
= config
.get('mount_options')
710 # package = 'btrfs-tools'
711 if mount_options
is None:
712 mount_options
= ['noatime', 'user_subvol_rm_allowed']
713 if mkfs_options
is None:
714 mkfs_options
= ['-m', 'single',
718 # package = 'xfsprogs'
719 if mount_options
is None:
720 mount_options
= ['noatime']
721 if mkfs_options
is None:
722 mkfs_options
= ['-f', '-i', 'size=2048']
723 if fs
== 'ext4' or fs
== 'ext3':
724 if mount_options
is None:
725 mount_options
= ['noatime', 'user_xattr']
727 if mount_options
is None:
729 if mkfs_options
is None:
731 mkfs
= ['mkfs.%s' % fs
] + mkfs_options
732 log
.info('%s on %s on %s' % (mkfs
, dev
, remote
))
733 if package
is not None:
737 'apt-get', 'install', '-y', package
743 remote
.run(args
=['yes', run
.Raw('|')] + ['sudo'] + mkfs
+ [dev
])
744 except run
.CommandFailedError
:
745 # Newer btfs-tools doesn't prompt for overwrite, use -f
746 if '-f' not in mount_options
:
747 mkfs_options
.append('-f')
748 mkfs
= ['mkfs.%s' % fs
] + mkfs_options
749 log
.info('%s on %s on %s' % (mkfs
, dev
, remote
))
750 remote
.run(args
=['yes', run
.Raw('|')] + ['sudo'] + mkfs
+ [dev
])
752 log
.info('mount %s on %s -o %s' % (dev
, remote
,
753 ','.join(mount_options
)))
759 '-o', ','.join(mount_options
),
766 'sudo', '/sbin/restorecon', mnt_point
,
770 if not remote
in ctx
.disk_config
.remote_to_roles_to_dev_mount_options
:
771 ctx
.disk_config
.remote_to_roles_to_dev_mount_options
[remote
] = {}
772 ctx
.disk_config
.remote_to_roles_to_dev_mount_options
[remote
][role
] = mount_options
773 if not remote
in ctx
.disk_config
.remote_to_roles_to_dev_fstype
:
774 ctx
.disk_config
.remote_to_roles_to_dev_fstype
[remote
] = {}
775 ctx
.disk_config
.remote_to_roles_to_dev_fstype
[remote
][role
] = fs
776 devs_to_clean
[remote
].append(mnt_point
)
778 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
):
779 _
, _
, id_
= teuthology
.split_role(role
)
793 '--monmap', monmap_path
,
797 log
.info('Reading keys from all nodes...')
800 for remote
, roles_for_host
in ctx
.cluster
.remotes
.iteritems():
801 for type_
in ['mgr', 'mds', 'osd']:
802 if type_
== 'mgr' and config
.get('skip_mgr_daemons', False):
804 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, type_
, cluster_name
):
805 _
, _
, id_
= teuthology
.split_role(role
)
806 data
= teuthology
.get_file(
808 path
='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
811 cluster
=cluster_name
,
815 keys
.append((type_
, id_
, data
))
817 for remote
, roles_for_host
in ctx
.cluster
.remotes
.iteritems():
818 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'client', cluster_name
):
819 _
, _
, id_
= teuthology
.split_role(role
)
820 data
= teuthology
.get_file(
822 path
='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_
, cluster
=cluster_name
)
824 keys
.append(('client', id_
, data
))
827 log
.info('Adding keys to all mons...')
838 teuthology
.feed_many_stdins_and_close(keys_fp
, writes
)
840 for type_
, id_
, data
in keys
:
850 '--name={type}.{id}'.format(
854 ] + list(generate_caps(type_
)),
859 log
.info('Running mkfs on mon nodes...')
860 for remote
, roles_for_host
in mons
.remotes
.iteritems():
861 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'mon', cluster_name
):
862 _
, _
, id_
= teuthology
.split_role(role
)
868 '/var/lib/ceph/mon/{cluster}-{id}'.format(id=id_
, cluster
=cluster_name
),
878 '--cluster', cluster_name
,
881 '--monmap', monmap_path
,
882 '--keyring', keyring_path
,
900 # we need to know this below
901 ctx
.summary
['success'] = False
904 (mon0_remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
906 log
.info('Checking cluster log for badness...')
908 def first_in_ceph_log(pattern
, excludes
):
910 Find the first occurrence of the pattern specified in the Ceph log,
911 Returns None if none found.
913 :param pattern: Pattern scanned for.
914 :param excludes: Patterns to ignore.
915 :return: First line of text (or None if not found)
920 '/var/log/ceph/{cluster}.log'.format(cluster
=cluster_name
),
922 for exclude
in excludes
:
923 args
.extend([run
.Raw('|'), 'egrep', '-v', exclude
])
925 run
.Raw('|'), 'head', '-n', '1',
931 stdout
= r
.stdout
.getvalue()
936 if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
937 config
['log_whitelist']) is not None:
938 log
.warning('Found errors (ERR|WRN|SEC) in cluster log')
939 ctx
.summary
['success'] = False
940 # use the most severe problem as the failure reason
941 if 'failure_reason' not in ctx
.summary
:
942 for pattern
in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
943 match
= first_in_ceph_log(pattern
, config
['log_whitelist'])
944 if match
is not None:
945 ctx
.summary
['failure_reason'] = \
946 '"{match}" in cluster log'.format(
947 match
=match
.rstrip('\n'),
951 for remote
, dirs
in devs_to_clean
.iteritems():
953 log
.info('Unmounting %s on %s' % (dir_
, remote
))
965 except Exception as e
:
968 run
.Raw('PATH=/usr/sbin:$PATH'),
975 if config
.get('tmpfs_journal'):
976 log
.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
977 for remote
, roles_for_host
in osds
.remotes
.iteritems():
979 args
=['sudo', 'umount', '-f', '/mnt'],
983 if ctx
.archive
is not None and \
984 not (ctx
.config
.get('archive-on-error') and ctx
.summary
['success']):
986 # archive mon data, too
987 log
.info('Archiving mon data...')
988 path
= os
.path
.join(ctx
.archive
, 'data')
992 if e
.errno
== errno
.EEXIST
:
996 for remote
, roles
in mons
.remotes
.iteritems():
998 is_mon
= teuthology
.is_type('mon', cluster_name
)
1000 _
, _
, id_
= teuthology
.split_role(role
)
1001 mon_dir
= '/var/lib/ceph/mon/' + \
1002 '{0}-{1}'.format(cluster_name
, id_
)
1003 teuthology
.pull_directory_tarball(
1006 path
+ '/' + role
+ '.tgz')
1008 log
.info('Cleaning ceph cluster...')
1020 run
.Raw('{tdir}/../*.pid'.format(tdir
=testdir
)),
1027 def osd_scrub_pgs(ctx
, config
):
1029 Scrub pgs when we exit.
1031 First make sure all pgs are active and clean.
1032 Next scrub all osds.
1033 Then periodically check until all pgs have scrub time stamps that
1034 indicate the last scrub completed. Time out if no progress is made
1035 here after two minutes.
1039 cluster_name
= config
['cluster']
1040 manager
= ctx
.managers
[cluster_name
]
1042 for _
in range(0, retries
):
1043 stats
= manager
.get_pg_stats()
1044 bad
= [stat
['pgid'] for stat
in stats
if 'active+clean' not in stat
['state']]
1049 "Waiting for all PGs to be active and clean, waiting on %s" % bad
)
1052 raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
1053 check_time_now
= time
.localtime()
1055 all_roles
= teuthology
.all_roles(ctx
.cluster
)
1056 for role
in teuthology
.cluster_roles_of_type(all_roles
, 'osd', cluster_name
):
1057 log
.info("Scrubbing {osd}".format(osd
=role
))
1058 _
, _
, id_
= teuthology
.split_role(role
)
1059 # allow this to fail; in certain cases the OSD might not be up
1060 # at this point. we will catch all pgs below.
1062 manager
.raw_cluster_cmd('tell', 'osd.' + id_
, 'config', 'set',
1063 'osd_debug_deep_scrub_sleep', '0');
1064 manager
.raw_cluster_cmd('osd', 'deep-scrub', id_
)
1065 except run
.CommandFailedError
:
1071 stats
= manager
.get_pg_stats()
1072 timez
= [(stat
['pgid'],stat
['last_scrub_stamp']) for stat
in stats
]
1075 for (pgid
, tmval
) in timez
:
1076 pgtm
= time
.strptime(tmval
[0:tmval
.find('.')], '%Y-%m-%d %H:%M:%S')
1077 if pgtm
> check_time_now
:
1080 log
.info('pgid %s last_scrub_stamp %s %s <= %s', pgid
, tmval
, pgtm
, check_time_now
)
1082 if thiscnt
> prev_good
:
1087 if gap_cnt
% 6 == 0:
1088 for (pgid
, tmval
) in timez
:
1089 # re-request scrub every so often in case the earlier
1090 # request was missed. do not do it everytime because
1091 # the scrub may be in progress or not reported yet and
1092 # we will starve progress.
1093 manager
.raw_cluster_cmd('pg', 'deep-scrub', pgid
)
1094 if gap_cnt
> retries
:
1095 raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
1097 log
.info('Still waiting for all pgs to be scrubbed.')
1101 @contextlib.contextmanager
1102 def run_daemon(ctx
, config
, type_
):
1104 Run daemons for a role type. Handle the startup and termination of a daemon.
1105 On startup -- set coverages, cpu_profile, valgrind values for all remotes,
1106 and a max_mds value for one mds.
1107 On cleanup -- Stop all existing daemons of this type.
1110 :param config: Configuration
1111 :param type_: Role type
1113 cluster_name
= config
['cluster']
1114 log
.info('Starting %s daemons in cluster %s...', type_
, cluster_name
)
1115 testdir
= teuthology
.get_testdir(ctx
)
1116 daemons
= ctx
.cluster
.only(teuthology
.is_type(type_
, cluster_name
))
1118 # check whether any daemons if this type are configured
1121 coverage_dir
= '{tdir}/archive/coverage'.format(tdir
=testdir
)
1123 daemon_signal
= 'kill'
1124 if config
.get('coverage') or config
.get('valgrind') is not None:
1125 daemon_signal
= 'term'
1127 # create osds in order. (this only matters for pre-luminous, which might
1128 # be hammer, which doesn't take an id_ argument to legacy 'osd create').
1130 for remote
, roles_for_host
in daemons
.remotes
.iteritems():
1131 is_type_
= teuthology
.is_type(type_
, cluster_name
)
1132 for role
in roles_for_host
:
1133 if not is_type_(role
):
1135 _
, _
, id_
= teuthology
.split_role(role
)
1139 datadir
='/var/lib/ceph/osd/{cluster}-{id}'.format(
1140 cluster
=cluster_name
, id=id_
)
1141 osd_uuid
= teuthology
.get_file(
1143 path
=datadir
+ '/fsid',
1146 osd_uuids
[id_
] = osd_uuid
1147 for osd_id
in range(len(osd_uuids
)):
1149 osd_uuid
= osd_uuids
.get(id_
)
1153 'sudo', 'ceph', '--cluster', cluster_name
,
1154 'osd', 'new', osd_uuid
, id_
,
1158 # fallback to pre-luminous (hammer or jewel)
1161 'sudo', 'ceph', '--cluster', cluster_name
,
1162 'osd', 'create', osd_uuid
,
1165 if config
.get('add_osds_to_crush'):
1168 'sudo', 'ceph', '--cluster', cluster_name
,
1169 'osd', 'crush', 'create-or-move', 'osd.' + id_
,
1170 '1.0', 'host=localhost', 'root=default',
1174 for remote
, roles_for_host
in daemons
.remotes
.iteritems():
1175 is_type_
= teuthology
.is_type(type_
, cluster_name
)
1176 for role
in roles_for_host
:
1177 if not is_type_(role
):
1179 _
, _
, id_
= teuthology
.split_role(role
)
1190 'ceph-%s' % (type_
),
1192 '--cluster', cluster_name
,
1195 if type_
in config
.get('cpu_profile', []):
1196 profile_path
= '/var/log/ceph/profiling-logger/%s.prof' % (role
)
1197 run_cmd
.extend(['env', 'CPUPROFILE=%s' % profile_path
])
1199 if config
.get('valgrind') is not None:
1200 valgrind_args
= None
1201 if type_
in config
['valgrind']:
1202 valgrind_args
= config
['valgrind'][type_
]
1203 if role
in config
['valgrind']:
1204 valgrind_args
= config
['valgrind'][role
]
1205 run_cmd
= teuthology
.get_valgrind_args(testdir
, role
,
1209 run_cmd
.extend(run_cmd_tail
)
1211 # always register mgr; don't necessarily start
1212 ctx
.daemons
.register_daemon(
1214 cluster
=cluster_name
,
1216 logger
=log
.getChild(role
),
1220 if type_
!= 'mgr' or not config
.get('skip_mgr_daemons', False):
1221 role
= cluster_name
+ '.' + type_
1222 ctx
.daemons
.get_daemon(type_
, id_
, cluster_name
).restart()
1227 teuthology
.stop_daemons_of_type(ctx
, type_
, cluster_name
)
1230 def healthy(ctx
, config
):
1232 Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK.
1235 :param config: Configuration
1237 config
= config
if isinstance(config
, dict) else dict()
1238 cluster_name
= config
.get('cluster', 'ceph')
1239 log
.info('Waiting until %s daemons up and pgs clean...', cluster_name
)
1240 manager
= ctx
.managers
[cluster_name
]
1242 manager
.wait_for_mgr_available(timeout
=30)
1243 except (run
.CommandFailedError
, AssertionError) as e
:
1244 log
.info('ignoring mgr wait error, probably testing upgrade: %s', e
)
1246 firstmon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
1247 (mon0_remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
1248 teuthology
.wait_until_osds_up(
1250 cluster
=ctx
.cluster
,
1252 ceph_cluster
=cluster_name
,
1256 manager
.flush_all_pg_stats()
1257 except (run
.CommandFailedError
, Exception) as e
:
1258 log
.info('ignoring flush pg stats error, probably testing upgrade: %s', e
)
1259 manager
.wait_for_clean()
1261 if config
.get('wait-for-healthy', True):
1262 log
.info('Waiting until ceph cluster %s is healthy...', cluster_name
)
1263 teuthology
.wait_until_healthy(
1266 ceph_cluster
=cluster_name
,
1269 if ctx
.cluster
.only(teuthology
.is_type('mds', cluster_name
)).remotes
:
1270 # Some MDSs exist, wait for them to be healthy
1271 ceph_fs
= Filesystem(ctx
) # TODO: make Filesystem cluster-aware
1272 ceph_fs
.wait_for_daemons(timeout
=300)
1275 def wait_for_osds_up(ctx
, config
):
1277 Wait for all osd's to come up.
1280 :param config: Configuration
1282 log
.info('Waiting until ceph osds are all up...')
1283 cluster_name
= config
.get('cluster', 'ceph')
1284 firstmon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
1285 (mon0_remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
1286 teuthology
.wait_until_osds_up(
1288 cluster
=ctx
.cluster
,
1293 def wait_for_mon_quorum(ctx
, config
):
1295 Check remote ceph status until all monitors are up.
1298 :param config: Configuration
1300 if isinstance(config
, dict):
1301 mons
= config
['daemons']
1302 cluster_name
= config
.get('cluster', 'ceph')
1304 assert isinstance(config
, list)
1306 cluster_name
= 'ceph'
1307 firstmon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
1308 (remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
1309 with contextutil
.safe_while(sleep
=10, tries
=60,
1310 action
='wait for monitor quorum') as proceed
:
1319 logger
=log
.getChild('quorum_status'),
1321 j
= json
.loads(r
.stdout
.getvalue())
1322 q
= j
.get('quorum_names', [])
1323 log
.debug('Quorum: %s', q
)
1324 if sorted(q
) == sorted(mons
):
1328 def created_pool(ctx
, config
):
1330 Add new pools to the dictionary of pools that the ceph-manager
1333 for new_pool
in config
:
1334 if new_pool
not in ctx
.managers
['ceph'].pools
:
1335 ctx
.managers
['ceph'].pools
[new_pool
] = ctx
.managers
['ceph'].get_pool_property(
1339 @contextlib.contextmanager
1340 def restart(ctx
, config
):
1342 restart ceph daemons
1346 - ceph.restart: [all]
1350 - ceph.restart: [osd.0, mon.1, mds.*]
1356 daemons: [osd.0, mon.1]
1357 wait-for-healthy: false
1358 wait-for-osds-up: true
1361 :param config: Configuration
1365 elif isinstance(config
, list):
1366 config
= {'daemons': config
}
1368 daemons
= ctx
.daemons
.resolve_role_list(config
.get('daemons', None), CEPH_ROLE_TYPES
, True)
1370 for role
in daemons
:
1371 cluster
, type_
, id_
= teuthology
.split_role(role
)
1372 ctx
.daemons
.get_daemon(type_
, id_
, cluster
).restart()
1373 clusters
.add(cluster
)
1375 manager
= ctx
.managers
['ceph']
1376 for dmon
in daemons
:
1378 dm_parts
= dmon
.split('.')
1379 if dm_parts
[1].isdigit():
1380 if dm_parts
[0] == 'osd':
1381 manager
.mark_down_osd(int(dm_parts
[1]))
1383 if config
.get('wait-for-healthy', True):
1384 for cluster
in clusters
:
1385 healthy(ctx
=ctx
, config
=dict(cluster
=cluster
))
1386 if config
.get('wait-for-osds-up', False):
1387 for cluster
in clusters
:
1388 wait_for_osds_up(ctx
=ctx
, config
=dict(cluster
=cluster
))
1392 @contextlib.contextmanager
1393 def stop(ctx
, config
):
1399 - ceph.stop: [mds.*]
1402 - ceph.stop: [osd.0, osd.2]
1406 daemons: [osd.0, osd.2]
1411 elif isinstance(config
, list):
1412 config
= {'daemons': config
}
1414 daemons
= ctx
.daemons
.resolve_role_list(config
.get('daemons', None), CEPH_ROLE_TYPES
, True)
1415 for role
in daemons
:
1416 cluster
, type_
, id_
= teuthology
.split_role(role
)
1417 ctx
.daemons
.get_daemon(type_
, id_
, cluster
).stop()
@contextlib.contextmanager
def wait_for_failure(ctx, config):
    """
    Wait for a failure of a ceph daemon

    For example::
      tasks:
      - ceph.wait_for_failure: [mds.*]

      tasks:
      - ceph.wait_for_failure: [osd.0, osd.2]

      tasks:
      - ceph.wait_for_failure:
          daemons: [osd.0, osd.2]

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(
        config.get('daemons', None), CEPH_ROLE_TYPES, True)
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        try:
            ctx.daemons.get_daemon(type_, id_, cluster).wait()
        except Exception:
            # A failing wait() is the expected outcome: the daemon died.
            # Catch Exception rather than a bare except so that
            # KeyboardInterrupt / SystemExit still propagate and can
            # abort the run.
            log.info('Saw expected daemon failure. Continuing.')
        else:
            raise RuntimeError('daemon %s did not fail' % role)

    yield
def validate_config(ctx, config):
    """
    Perform some simple validation on task configuration.
    Raises exceptions.ConfigError if an error is found.
    """
    # A single host must not carry osds belonging to two different clusters.
    for remote, host_roles in ctx.cluster.remotes.items():
        prev_cluster = None
        prev_role = None
        for role in host_roles:
            role_cluster, role_type, _ = teuthology.split_role(role)
            if role_type != 'osd':
                continue
            if prev_cluster and prev_cluster != role_cluster:
                msg = "Host should not have osds (%s and %s) from multiple clusters" % (
                    prev_role, role)
                raise exceptions.ConfigError(msg)
            prev_cluster = role_cluster
            prev_role = role
1479 @contextlib.contextmanager
1480 def task(ctx
, config
):
1482 Set up and tear down a Ceph cluster.
1490 You can also specify what branch to run::
1506 sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed
1508 Or a local source dir::
1512 path: /home/sage/ceph
1514 To capture code coverage data, use::
1520 To use btrfs, ext4, or xfs on the target's scratch disks, use::
1525 mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
1526 mount_options: [nobarrier, inode64]
1528 Note, this will cause the task to check the /scratch_devs file on each node
1529 for available devices. If no such file is found, /dev/sdb will be used.
1531 To run some daemons under valgrind, include their names
1532 and the tool/args to use in a valgrind section::
1537 mds.1: --tool=memcheck
1538 osd.1: [--tool=memcheck, --leak-check=no]
1540 Those nodes which are using memcheck or valgrind will get
1541 checked for bad results.
1543 To adjust or modify config options, use::
1558 other key: other value
1563 By default, the cluster log is checked for errors and warnings,
1564 and the run marked failed if any appear. You can ignore log
1565 entries by giving a list of egrep compatible regexes, i.e.:
1569 log-whitelist: ['foo.*bar', 'bad message']
1571 To run multiple ceph clusters, use multiple ceph tasks, and roles
1572 with a cluster name prefix, e.g. cluster1.client.0. Roles with no
1573 cluster use the default cluster name, 'ceph'. OSDs from separate
1574 clusters must be on separate hosts. Clients and non-osd daemons
1575 from multiple clusters may be colocated. For each cluster, add an
1576 instance of the ceph task with the cluster name specified, e.g.::
1579 - [mon.a, osd.0, osd.1]
1580 - [backup.mon.a, backup.osd.0, backup.osd.1]
1581 - [client.0, backup.client.0]
1589 :param config: Configuration
1594 assert isinstance(config
, dict), \
1595 "task ceph only supports a dictionary for configuration"
1597 overrides
= ctx
.config
.get('overrides', {})
1598 teuthology
.deep_merge(config
, overrides
.get('ceph', {}))
1600 first_ceph_cluster
= False
1601 if not hasattr(ctx
, 'daemons'):
1602 first_ceph_cluster
= True
1603 ctx
.daemons
= DaemonGroup()
1605 testdir
= teuthology
.get_testdir(ctx
)
1606 if config
.get('coverage'):
1607 coverage_dir
= '{tdir}/archive/coverage'.format(tdir
=testdir
)
1608 log
.info('Creating coverage directory...')
1612 'install', '-d', '-m0755', '--',
1619 if 'cluster' not in config
:
1620 config
['cluster'] = 'ceph'
1622 validate_config(ctx
, config
)
1625 if first_ceph_cluster
:
1626 # these tasks handle general log setup and parsing on all hosts,
1627 # so they should only be run once
1629 lambda: ceph_log(ctx
=ctx
, config
=None),
1630 lambda: valgrind_post(ctx
=ctx
, config
=config
),
1634 lambda: cluster(ctx
=ctx
, config
=dict(
1635 conf
=config
.get('conf', {}),
1636 fs
=config
.get('fs', 'xfs'),
1637 mkfs_options
=config
.get('mkfs_options', None),
1638 mount_options
=config
.get('mount_options', None),
1639 block_journal
=config
.get('block_journal', None),
1640 tmpfs_journal
=config
.get('tmpfs_journal', None),
1641 skip_mgr_daemons
=config
.get('skip_mgr_daemons', False),
1642 log_whitelist
=config
.get('log-whitelist', []),
1643 cpu_profile
=set(config
.get('cpu_profile', []),),
1644 cluster
=config
['cluster'],
1646 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='mon'),
1647 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='mgr'),
1648 lambda: crush_setup(ctx
=ctx
, config
=config
),
1649 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='osd'),
1650 lambda: create_rbd_pool(ctx
=ctx
, config
=config
),
1651 lambda: cephfs_setup(ctx
=ctx
, config
=config
),
1652 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='mds'),
1655 with contextutil
.nested(*subtasks
):
1656 first_mon
= teuthology
.get_first_mon(ctx
, config
, config
['cluster'])
1657 (mon
,) = ctx
.cluster
.only(first_mon
).remotes
.iterkeys()
1658 if not hasattr(ctx
, 'managers'):
1660 ctx
.managers
[config
['cluster']] = CephManager(
1663 logger
=log
.getChild('ceph_manager.' + config
['cluster']),
1664 cluster
=config
['cluster'],
1668 if config
.get('wait-for-healthy', True):
1669 healthy(ctx
=ctx
, config
=dict(cluster
=config
['cluster']))
1673 if config
.get('wait-for-scrub', True):
1674 osd_scrub_pgs(ctx
, config
)
1676 # stop logging health to clog during shutdown, or else we generate
1677 # a bunch of scary messages unrelated to our actual run.
1678 firstmon
= teuthology
.get_first_mon(ctx
, config
, config
['cluster'])
1679 (mon0_remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
1684 '--cluster', config
['cluster'],
1689 '--no-mon-health-to-clog',