"""
Handle the setup, starting, and clean-up of a Ceph cluster.
"""
from cStringIO import StringIO

import argparse
import contextlib
import errno
import json
import logging
import os
import socket
import time

import gevent

from paramiko import SSHException
from ceph_manager import CephManager, write_conf
from tasks.cephfs.filesystem import Filesystem
from teuthology import misc as teuthology
from teuthology import contextutil
from teuthology import exceptions
from teuthology.orchestra import run
import ceph_client as cclient
from teuthology.orchestra.daemon import DaemonGroup

CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']

log = logging.getLogger(__name__)


def generate_caps(type_):
    """
    Each call will return the next capability for each system type
    (essentially a subset of possible role values). Valid types are osd,
    mgr, mds, and client.
    """
    defaults = dict(
        # per-type capability tables ({subsystem: capability}) go here
    )
    for subsystem, capability in defaults[type_].items():
        yield '--cap'
        yield subsystem
        yield capability
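
# A minimal usage sketch (hypothetical caps shown): generate_caps() yields
# ceph-authtool-style argument triples, so a caller can splice them straight
# into a command line:
#
#   list(generate_caps('client'))
#   # -> ['--cap', 'mon', 'allow rw', '--cap', 'osd', 'allow rwx', ...]
#
# This is how the key-registration loop in cluster() below consumes it.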


@contextlib.contextmanager
def ceph_log(ctx, config):
    """
    Create /var/log/ceph log directory that is open to everyone.
    Add valgrind and profiling-logger directories.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Making ceph log dir writeable by non-root...')
    run.wait(
        ctx.cluster.run(
            args=['sudo', 'chmod', '777', '/var/log/ceph'],
            wait=False,
        )
    )
    log.info('Disabling ceph logrotate...')
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo', 'rm', '-f', '--',
                '/etc/logrotate.d/ceph',
            ],
            wait=False,
        )
    )
    log.info('Creating extra log directories...')
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'install', '-d', '-m0777', '--',
                '/var/log/ceph/valgrind',
                '/var/log/ceph/profiling-logger',
            ],
            wait=False,
        )
    )

    class Rotater(object):
        stop_event = gevent.event.Event()

        def invoke_logrotate(self):
            # 1) install ceph-test.conf in /etc/logrotate.d
            # 2) continuously loop over logrotate invocation with ceph-test.conf
            while not self.stop_event.is_set():
                self.stop_event.wait(timeout=30)
                try:
                    run.wait(
                        ctx.cluster.run(
                            args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'],
                            wait=False,
                        )
                    )
                except exceptions.ConnectionLostError as e:
                    # Some tests may power off nodes during test, in which
                    # case we will see connection errors that we should ignore.
                    log.debug("Missed logrotate, node '{0}' is offline".format(
                        e.node))
                except EOFError:
                    # Paramiko sometimes raises this when it fails to
                    # connect to a node during open_session. As with
                    # ConnectionLostError, we ignore this because nodes
                    # are allowed to get power cycled during tests.
                    log.debug("Missed logrotate, EOFError")
                except SSHException:
                    log.debug("Missed logrotate, SSHException")
                except socket.error as e:
                    if e.errno == errno.EHOSTUNREACH:
                        log.debug("Missed logrotate, host unreachable")
                    else:
                        raise

        def begin(self):
            self.thread = gevent.spawn(self.invoke_logrotate)

        def end(self):
            self.stop_event.set()
            self.thread.get()
    def write_rotate_conf(ctx, daemons):
        testdir = teuthology.get_testdir(ctx)
        rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf')
        with file(rotate_conf_path, 'rb') as f:
            conf = ""
            for daemon, size in daemons.iteritems():
                log.info('writing logrotate stanza for {daemon}'.format(daemon=daemon))
                conf += f.read().format(daemon_type=daemon, max_size=size)
                f.seek(0, 0)

        for remote in ctx.cluster.remotes.iterkeys():
            teuthology.write_file(remote=remote,
                                  path='{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
                                  data=StringIO(conf),
                                  )
            remote.run(
                args=[
                    'sudo', 'mv',
                    '{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
                    '/etc/logrotate.d/ceph-test.conf',
                    run.Raw('&&'),
                    'sudo', 'chmod', '0644',
                    '/etc/logrotate.d/ceph-test.conf',
                    run.Raw('&&'),
                    'sudo', 'chown', 'root.root',
                    '/etc/logrotate.d/ceph-test.conf'
                ],
            )
            remote.chcon('/etc/logrotate.d/ceph-test.conf',
                         'system_u:object_r:etc_t:s0')
    if ctx.config.get('log-rotate'):
        daemons = ctx.config.get('log-rotate')
        log.info('Setting up log rotation with ' + str(daemons))
        write_rotate_conf(ctx, daemons)
        logrotater = Rotater()
        logrotater.begin()
    try:
        yield
    finally:
        if ctx.config.get('log-rotate'):
            log.info('Shutting down logrotate')
            logrotater.end()
            ctx.cluster.run(
                args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'],
            )
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo', 'find', '/var/log/ceph', '-name', '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo', 'xargs', '-0', '--no-run-if-empty', '--',
                        'gzip', '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            os.makedirs(path)
            for remote in ctx.cluster.remotes.iterkeys():
                sub = os.path.join(path, remote.shortname)
                os.makedirs(sub)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))
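
# A hedged configuration sketch: log rotation is driven by a 'log-rotate'
# mapping of daemon type to maximum size in the job YAML, e.g.
# (hypothetical values):
#
#   log-rotate:
#     ceph-osd: 10G
#     ceph-mon: 10G
#
# write_rotate_conf() turns each entry into a stanza of
# /etc/logrotate.d/ceph-test.conf, and Rotater() then invokes logrotate
# against that file roughly every 30 seconds until teardown.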


def assign_devs(roles, devs):
    """
    Create a dictionary of devs indexed by roles

    :param roles: List of roles
    :param devs: Corresponding list of devices.
    :returns: Dictionary of devs indexed by roles.
    """
    return dict(zip(roles, devs))
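
# For example:
#
#   assign_devs(['osd.0', 'osd.1'], ['/dev/sdb', '/dev/sdc'])
#   # -> {'osd.0': '/dev/sdb', 'osd.1': '/dev/sdc'}
#
# zip() stops at the shorter list, so surplus devices are simply unused.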


@contextlib.contextmanager
def valgrind_post(ctx, config):
    """
    After the tests run, look through all the valgrind logs. Exceptions are raised
    if textual errors occurred in the logs, or if valgrind exceptions were detected in
    the logs.

    :param ctx: Context
    :param config: Configuration
    """
    try:
        yield
    finally:
        lookup_procs = list()
        log.info('Checking for errors in any valgrind logs...')
        for remote in ctx.cluster.remotes.iterkeys():
            # look at valgrind logs for each node
            proc = remote.run(
                args=[
                    'sudo',
                    'zgrep',
                    '<kind>',
                    run.Raw('/var/log/ceph/valgrind/*'),
                    '/dev/null',  # include a second file so that we always get a filename prefix on the output
                    run.Raw('|'),
                    'sort',
                    run.Raw('|'),
                    'uniq',
                ],
                wait=False,
                check_status=False,
                stdout=StringIO(),
            )
            lookup_procs.append((proc, remote))

        valgrind_exception = None
        for (proc, remote) in lookup_procs:
            proc.wait()
            out = proc.stdout.getvalue()
            for line in out.split('\n'):
                if line == '':
                    continue
                try:
                    (file, kind) = line.split(':')
                except ValueError:
                    log.error('failed to split line %s', line)
                    raise
                log.debug('file %s kind %s', file, kind)
                if (file.find('mds') >= 0) and kind.find('Lost') > 0:
                    continue
                log.error('saw valgrind issue %s in %s', kind, file)
                valgrind_exception = Exception('saw valgrind issues')

        if config.get('expect_valgrind_errors'):
            if not valgrind_exception:
                raise Exception('expected valgrind issues and found none')
        else:
            if valgrind_exception:
                raise valgrind_exception
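
# The parsing loop above assumes zgrep emits '<filename>:<match>' pairs,
# one per line, e.g. (hypothetical):
#
#   /var/log/ceph/valgrind/osd.0.log.gz:<kind>Leak_DefinitelyLost</kind>
#
# The '/dev/null' second argument guarantees the filename prefix even when
# only a single log file matches.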


@contextlib.contextmanager
def crush_setup(ctx, config):
    cluster_name = config['cluster']
    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    profile = config.get('crush_tunables', 'default')
    log.info('Setting crush tunables to %s', profile)
    mon_remote.run(
        args=['sudo', 'ceph', '--cluster', cluster_name,
              'osd', 'crush', 'tunables', profile])
    yield
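
# A hedged usage sketch: the profile comes straight from the task YAML,
# e.g.
#
#   tasks:
#   - ceph:
#       crush_tunables: optimal
#
# Any profile name that 'ceph osd crush tunables' accepts will work here;
# 'default' is used when the key is absent.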


@contextlib.contextmanager
def cephfs_setup(ctx, config):
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because it requires the mon cluster to be up and running
    if mdss.remotes:
        log.info('Setting up CephFS filesystem...')

        fs = Filesystem(ctx, create='cephfs')

        is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
        all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
        num_active = len([r for r in all_roles if is_active_mds(r)])

        fs.set_allow_multimds(True)
        fs.set_max_mds(num_active)
        fs.set_allow_dirfrags(True)

    yield
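
# Note: is_active_mds() relies on the role-naming convention used by these
# suites, where standby MDS roles carry an '-s' suffix (e.g. 'mds.a-s' as
# the standby for 'mds.a'); only names without the suffix count toward
# max_mds.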


@contextlib.contextmanager
def cluster(ctx, config):
    """
    Handle the creation and removal of a ceph cluster.

    On startup:
        Create directories needed for the cluster.
        Create remote journals for all osds.
        Create and set keyring.
        Copy the monmap to the test systems.
        Setup mon nodes.
        Setup mds nodes.
        Mkfs osd nodes.
        Add keyring information to monmaps
        Mkfs mon nodes.

    On exit:
        If errors occurred, extract a failure message and store in ctx.summary.
        Unmount all test files and temporary journaling files.
        Save the monitor information and archive all ceph logs.
        Cleanup the keyring setup, and remove all monitor map and data files left over.

    :param ctx: Context
    :param config: Configuration
    """
    if ctx.config.get('use_existing_cluster', False) is True:
        log.info("'use_existing_cluster' is true; skipping cluster creation")
        yield

    testdir = teuthology.get_testdir(ctx)
    cluster_name = config['cluster']
    data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
    log.info('Creating ceph cluster %s...', cluster_name)
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                data_dir,
            ],
            wait=False,
        )
    )

    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'install', '-d', '-m0777', '--', '/var/run/ceph',
            ],
            wait=False,
        )
    )
    devs_to_clean = {}
    remote_to_roles_to_devs = {}
    remote_to_roles_to_journals = {}
    osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
    for remote, roles_for_host in osds.remotes.iteritems():
        devs = teuthology.get_scratch_devices(remote)
        roles_to_devs = {}
        roles_to_journals = {}
        if config.get('fs'):
            log.info('fs option selected, checking for scratch devs')
            log.info('found devs: %s' % (str(devs),))
            devs_id_map = teuthology.get_wwn_id_map(remote, devs)
            iddevs = devs_id_map.values()
            roles_to_devs = assign_devs(
                teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
            )
            if len(roles_to_devs) < len(iddevs):
                iddevs = iddevs[len(roles_to_devs):]
            devs_to_clean[remote] = []

        if config.get('block_journal'):
            log.info('block journal enabled')
            roles_to_journals = assign_devs(
                teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
            )
            log.info('journal map: %s', roles_to_journals)

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled')
            roles_to_journals = {}
            remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
            for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
                tmpfs = '/mnt/' + role
                roles_to_journals[role] = tmpfs
                remote.run(args=['truncate', '-s', '1500M', tmpfs])
            log.info('journal map: %s', roles_to_journals)

        log.info('dev map: %s' % (str(roles_to_devs),))
        remote_to_roles_to_devs[remote] = roles_to_devs
        remote_to_roles_to_journals[remote] = roles_to_journals
    log.info('Generating config...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [role_list for (remote, role_list) in remotes_and_roles]
    ips = [host for (host, port) in
           (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
    conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips, cluster=cluster_name)
    for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
        for role, journal in roles_to_journals.iteritems():
            name = teuthology.ceph_role(role)
            if name not in conf:
                conf[name] = {}
            conf[name]['osd journal'] = journal
    for section, keys in config['conf'].iteritems():
        for key, value in keys.iteritems():
            log.info("[%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    if config.get('tmpfs_journal'):
        conf['journal dio'] = False
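
    # At this point conf is a dict-of-dicts keyed by ini section; e.g. a
    # journal assignment above ends up as (hypothetical)
    # conf['osd.0']['osd journal'] = '/mnt/osd.0', with any per-section
    # overrides from the task's 'conf' stanza layered on top before
    # write_conf() pushes the file out below.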
    if not hasattr(ctx, 'ceph'):
        ctx.ceph = {}
    ctx.ceph[cluster_name] = argparse.Namespace()
    ctx.ceph[cluster_name].conf = conf

    default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
    keyring_path = config.get('keyring_path', default_keyring)

    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)

    log.info('Setting up %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--create-keyring',
            keyring_path,
        ],
    )
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--gen-key',
            '--name=mon.',
            keyring_path,
        ],
    )
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'chmod',
            '0644',
            keyring_path,
        ],
    )
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
                                                   cluster=cluster_name)
    fsid = teuthology.create_simple_monmap(
        ctx,
        remote=mon0_remote,
        conf=conf,
        path=monmap_path,
    )
    if not 'global' in conf:
        conf['global'] = {}
    conf['global']['fsid'] = fsid

    default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
    conf_path = config.get('conf_path', default_conf_path)
    log.info('Writing %s for FSID %s...' % (conf_path, fsid))
    write_conf(ctx, conf_path, cluster_name)

    log.info('Creating admin key on %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--gen-key',
            '--name=client.admin',
            '--cap', 'mon', 'allow *',
            '--cap', 'osd', 'allow *',
            '--cap', 'mds', 'allow *',
            '--cap', 'mgr', 'allow *',
            keyring_path,
        ],
    )
    log.info('Copying monmap to all nodes...')
    keyring = teuthology.get_file(
        remote=mon0_remote,
        path=keyring_path,
    )
    monmap = teuthology.get_file(
        remote=mon0_remote,
        path=monmap_path,
    )

    for rem in ctx.cluster.remotes.iterkeys():
        # copy mon key and initial monmap
        log.info('Sending monmap to node {remote}'.format(remote=rem))
        teuthology.sudo_write_file(
            remote=rem,
            path=keyring_path,
            data=keyring,
            perms='0644',
        )
        teuthology.write_file(
            remote=rem,
            path=monmap_path,
            data=monmap,
        )
    log.info('Setting up mon nodes...')
    mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))
    osdmap_path = '{tdir}/{cluster}.osdmap'.format(tdir=testdir,
                                                   cluster=cluster_name)
    run.wait(
        mons.run(
            args=[
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'osdmaptool',
                '-c', conf_path,
                '--clobber',
                '--createsimple', '{num:d}'.format(
                    num=teuthology.num_instances_of_type(ctx.cluster, 'osd',
                                                         cluster_name),
                ),
                osdmap_path,
            ],
            wait=False,
        ),
    )
    if not config.get('skip_mgr_daemons', False):
        log.info('Setting up mgr nodes...')
        mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name))
        for remote, roles_for_host in mgrs.remotes.iteritems():
            for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr',
                                                         cluster_name):
                _, _, id_ = teuthology.split_role(role)
                mgr_dir = '/var/lib/ceph/mgr/{cluster}-{id}'.format(
                    cluster=cluster_name,
                    id=id_,
                )
                remote.run(
                    args=[
                        'sudo',
                        'mkdir', '-p', mgr_dir,
                        run.Raw('&&'),
                        'sudo',
                        'adjust-ulimits',
                        'ceph-coverage',
                        coverage_dir,
                        'ceph-authtool',
                        '--create-keyring',
                        '--gen-key',
                        '--name=mgr.{id}'.format(id=id_),
                        mgr_dir + '/keyring',
                    ],
                )
    log.info('Setting up mds nodes...')
    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
    for remote, roles_for_host in mdss.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mds_dir = '/var/lib/ceph/mds/{cluster}-{id}'.format(
                cluster=cluster_name,
                id=id_,
            )
            remote.run(
                args=[
                    'sudo',
                    'mkdir', '-p', mds_dir,
                    run.Raw('&&'),
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    '--name=mds.{id}'.format(id=id_),
                    mds_dir + '/keyring',
                ],
            )

    cclient.create_keyring(ctx, cluster_name)
    log.info('Running mkfs on osd nodes...')
    if not hasattr(ctx, 'disk_config'):
        ctx.disk_config = argparse.Namespace()
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
        ctx.disk_config.remote_to_roles_to_dev = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_journals'):
        ctx.disk_config.remote_to_roles_to_journals = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
        ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
        ctx.disk_config.remote_to_roles_to_dev_fstype = {}

    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)
    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals, remote_to_roles_to_journals)

    log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
    for remote, roles_for_host in osds.remotes.iteritems():
        roles_to_devs = remote_to_roles_to_devs[remote]
        roles_to_journals = remote_to_roles_to_journals[remote]

        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mnt_point = '/var/lib/ceph/osd/{cluster}-{id}'.format(cluster=cluster_name, id=id_)
            remote.run(
                args=[
                    'sudo',
                    'mkdir', '-p', mnt_point,
                ],
            )
            log.info(str(roles_to_journals))
            if roles_to_devs.get(role):
                dev = roles_to_devs[role]
                fs = config.get('fs')
                package = None
                mkfs_options = config.get('mkfs_options')
                mount_options = config.get('mount_options')
                if fs == 'btrfs':
                    # package = 'btrfs-tools'
                    if mount_options is None:
                        mount_options = ['noatime', 'user_subvol_rm_allowed']
                    if mkfs_options is None:
                        mkfs_options = ['-m', 'single']
                if fs == 'xfs':
                    # package = 'xfsprogs'
                    if mount_options is None:
                        mount_options = ['noatime']
                    if mkfs_options is None:
                        mkfs_options = ['-f', '-i', 'size=2048']
                if fs == 'ext4' or fs == 'ext3':
                    if mount_options is None:
                        mount_options = ['noatime', 'user_xattr']

                if mount_options is None:
                    mount_options = []
                if mkfs_options is None:
                    mkfs_options = []
                mkfs = ['mkfs.%s' % fs] + mkfs_options
                log.info('%s on %s on %s' % (mkfs, dev, remote))
                if package is not None:
                    remote.run(
                        args=[
                            'sudo',
                            'apt-get', 'install', '-y', package,
                        ],
                        stdout=StringIO(),
                    )

                try:
                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
                except run.CommandFailedError:
                    # Newer btrfs-tools doesn't prompt for overwrite, use -f
                    if '-f' not in mkfs_options:
                        mkfs_options.append('-f')
                        mkfs = ['mkfs.%s' % fs] + mkfs_options
                        log.info('%s on %s on %s' % (mkfs, dev, remote))
                        remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])

                log.info('mount %s on %s -o %s' % (dev, remote,
                                                   ','.join(mount_options)))
                remote.run(
                    args=[
                        'sudo',
                        'mount',
                        '-t', fs,
                        '-o', ','.join(mount_options),
                        dev,
                        mnt_point,
                    ],
                )
                remote.run(
                    args=[
                        'sudo', '/sbin/restorecon', mnt_point,
                    ],
                    check_status=False,
                )
                if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
                    ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
                if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
                    ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
                devs_to_clean[remote].append(mnt_point)

        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
            _, _, id_ = teuthology.split_role(role)
            remote.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-osd',
                    '--cluster', cluster_name,
                    '--mkfs',
                    '--mkkey',
                    '-i', id_,
                    '--monmap', monmap_path,
                ],
            )
    log.info('Reading keys from all nodes...')
    keys_fp = StringIO()
    keys = []
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['mgr', 'mds', 'osd']:
            if type_ == 'mgr' and config.get('skip_mgr_daemons', False):
                continue
            for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
                _, _, id_ = teuthology.split_role(role)
                data = teuthology.get_file(
                    remote=remote,
                    path='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
                        type=type_,
                        id=id_,
                        cluster=cluster_name,
                    ),
                    sudo=True,
                )
                keys.append((type_, id_, data))
                keys_fp.write(data)
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', cluster_name):
            _, _, id_ = teuthology.split_role(role)
            data = teuthology.get_file(
                remote=remote,
                path='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_, cluster=cluster_name),
            )
            keys.append(('client', id_, data))
            keys_fp.write(data)
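
    # 'keys' now holds (type, id, keyring-text) tuples for every mgr, mds,
    # osd, and client role; the next block replays them into the mon
    # keyring, first in bulk via tee and then entity-by-entity so each one
    # gets the caps produced by generate_caps().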
    log.info('Adding keys to all mons...')
    writes = mons.run(
        args=[
            'sudo', 'tee', '-a',
            keyring_path,
        ],
        stdin=run.PIPE,
        wait=False,
        stdout=StringIO(),
    )
    keys_fp.seek(0)
    teuthology.feed_many_stdins_and_close(keys_fp, writes)
    run.wait(writes)
    for type_, id_, data in keys:
        run.wait(
            mons.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    keyring_path,
                    '--name={type}.{id}'.format(
                        type=type_,
                        id=id_,
                    ),
                ] + list(generate_caps(type_)),
                wait=False,
            ),
        )

    log.info('Running mkfs on mon nodes...')
    for remote, roles_for_host in mons.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon', cluster_name):
            _, _, id_ = teuthology.split_role(role)
            remote.run(
                args=[
                    'sudo',
                    'mkdir', '-p',
                    '/var/lib/ceph/mon/{cluster}-{id}'.format(id=id_, cluster=cluster_name),
                ],
            )
            remote.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-mon',
                    '--cluster', cluster_name,
                    '--mkfs',
                    '-i', id_,
                    '--monmap', monmap_path,
                    '--osdmap', osdmap_path,
                    '--keyring', keyring_path,
                ],
            )
    run.wait(
        mons.run(
            args=['rm', '--', monmap_path, osdmap_path],
            wait=False,
        ),
    )

    try:
        yield
    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise
    finally:
        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()

        log.info('Checking cluster log for badness...')

        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurrence of the pattern specified in the Ceph log,
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = [
                'sudo',
                'egrep', pattern,
                '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
            ]
            for exclude in excludes:
                args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                run.Raw('|'), 'head', '-n', '1',
            ])
            r = mon0_remote.run(
                stdout=StringIO(),
                args=args,
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
                             config['log_whitelist']) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log_whitelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
                            )
                        break

        for remote, dirs in devs_to_clean.iteritems():
            for dir_ in dirs:
                log.info('Unmounting %s on %s' % (dir_, remote))
                try:
                    remote.run(
                        args=[
                            'sync',
                            run.Raw('&&'),
                            'sudo',
                            'umount',
                            '-f',
                            dir_,
                        ],
                    )
                except Exception as e:
                    remote.run(args=[
                        'sudo',
                        run.Raw('PATH=/usr/sbin:$PATH'),
                        'lsof',
                        run.Raw(';'),
                        'ps', 'auxf',
                    ])
                    raise e

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
            for remote, roles_for_host in osds.remotes.iteritems():
                remote.run(
                    args=['sudo', 'umount', '-f', '/mnt'],
                    check_status=False,
                )

        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            try:
                os.makedirs(path)
            except OSError as e:
                if e.errno == errno.EEXIST:
                    pass
                else:
                    raise
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    is_mon = teuthology.is_type('mon', cluster_name)
                    if is_mon(role):
                        _, _, id_ = teuthology.split_role(role)
                        mon_dir = '/var/lib/ceph/mon/' + \
                                  '{0}-{1}'.format(cluster_name, id_)
                        teuthology.pull_directory_tarball(
                            remote,
                            mon_dir,
                            path + '/' + role + '.tgz')

        log.info('Cleaning ceph cluster...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'rm', '-rf', '--',
                    conf_path,
                    keyring_path,
                    data_dir,
                    monmap_path,
                    osdmap_path,
                    run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
                ],
                wait=False,
            ),
        )
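
# first_in_ceph_log() above builds a shell pipeline roughly equivalent to
# (hypothetical whitelist shown):
#
#   sudo egrep '\[ERR\]|\[WRN\]|\[SEC\]' /var/log/ceph/ceph.log \
#       | egrep -v 'wrongly marked me down' | head -n 1
#
# An empty result means no match; anything else is the first offending line
# and becomes the failure_reason.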


def osd_scrub_pgs(ctx, config):
    """
    Scrub pgs when we exit.

    First make sure all pgs are active and clean.
    Next scrub all osds.
    Then periodically check until all pgs have scrub time stamps that
    indicate the last scrub completed.  Time out if no progress is made
    here after two minutes.
    """
    retries = 12
    delays = 10
    cluster_name = config['cluster']
    manager = ctx.managers[cluster_name]
    all_clean = False
    for _ in range(0, retries):
        stats = manager.get_pg_stats()
        bad = [stat['pgid'] for stat in stats if 'active+clean' not in stat['state']]
        if not bad:
            all_clean = True
            break
        log.info(
            "Waiting for all osds to be active and clean, waiting on %s" % bad)
        time.sleep(delays)
    if not all_clean:
        raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
    check_time_now = time.localtime()
    time.sleep(1)
    all_roles = teuthology.all_roles(ctx.cluster)
    for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name):
        log.info("Scrubbing {osd}".format(osd=role))
        _, _, id_ = teuthology.split_role(role)
        # allow this to fail; in certain cases the OSD might not be up
        # at this point.  we will catch all pgs below.
        try:
            manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
        except run.CommandFailedError:
            pass
    prev_good = 0
    gap_cnt = 0
    loop = True
    while loop:
        stats = manager.get_pg_stats()
        timez = [(stat['pgid'], stat['last_scrub_stamp']) for stat in stats]
        loop = False
        thiscnt = 0
        for (pgid, tmval) in timez:
            pgtm = time.strptime(tmval[0:tmval.find('.')], '%Y-%m-%d %H:%M:%S')
            if pgtm > check_time_now:
                thiscnt += 1
            else:
                log.info('pgid %s last_scrub_stamp %s %s <= %s', pgid, tmval, pgtm, check_time_now)
                loop = True
        if thiscnt > prev_good:
            prev_good = thiscnt
            gap_cnt = 0
        else:
            gap_cnt += 1
            if gap_cnt % 6 == 0:
                for (pgid, tmval) in timez:
                    # re-request scrub every so often in case the earlier
                    # request was missed.  do not do it every time because
                    # the scrub may be in progress or not reported yet and
                    # we will starve progress.
                    manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
            if gap_cnt > retries:
                raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
        if loop:
            log.info('Still waiting for all pgs to be scrubbed.')
            time.sleep(delays)
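
# last_scrub_stamp values look like '2017-01-31 14:20:03.123456'
# (a hypothetical example); the fractional seconds are sliced off before
# time.strptime() so each stamp compares cleanly against check_time_now.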


@contextlib.contextmanager
def run_daemon(ctx, config, type_):
    """
    Run daemons for a role type.  Handle the startup and termination of a daemon.
    On startup -- set coverages, cpu_profile, valgrind values for all remotes,
    and a max_mds value for one mds.
    On cleanup -- Stop all existing daemons of this type.

    :param ctx: Context
    :param config: Configuration
    :param type_: Role type
    """
    cluster_name = config['cluster']
    log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
    testdir = teuthology.get_testdir(ctx)
    daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name))

    # check whether any daemons of this type are configured
    if daemons is None:
        return
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    daemon_signal = 'kill'
    if config.get('coverage') or config.get('valgrind') is not None:
        daemon_signal = 'term'

    for remote, roles_for_host in daemons.remotes.iteritems():
        is_type_ = teuthology.is_type(type_, cluster_name)
        for role in roles_for_host:
            if not is_type_(role):
                continue
            _, _, id_ = teuthology.split_role(role)

            run_cmd = [
                'sudo',
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'daemon-helper',
                daemon_signal,
            ]
            run_cmd_tail = [
                'ceph-%s' % (type_),
                '-f',
                '--cluster', cluster_name,
                '-i', id_,
            ]

            if type_ in config.get('cpu_profile', []):
                profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role)
                run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])

            if config.get('valgrind') is not None:
                valgrind_args = None
                if type_ in config['valgrind']:
                    valgrind_args = config['valgrind'][type_]
                if role in config['valgrind']:
                    valgrind_args = config['valgrind'][role]
                run_cmd = teuthology.get_valgrind_args(testdir, role,
                                                       run_cmd,
                                                       valgrind_args)

            run_cmd.extend(run_cmd_tail)

            # always register mgr; don't necessarily start
            ctx.daemons.register_daemon(
                remote, type_, id_,
                cluster=cluster_name,
                args=run_cmd,
                logger=log.getChild(role),
                stdin=run.PIPE,
                wait=False,
            )
            if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
                role = cluster_name + '.' + type_
                ctx.daemons.get_daemon(type_, id_, cluster_name).restart()

    try:
        yield
    finally:
        teuthology.stop_daemons_of_type(ctx, type_, cluster_name)
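
# For reference, the assembled command for an osd typically looks like
# (hypothetical paths):
#
#   sudo adjust-ulimits ceph-coverage <coverage_dir> daemon-helper kill \
#       ceph-osd -f --cluster ceph -i 0
#
# with the CPUPROFILE env wrapper and valgrind arguments spliced in ahead
# of run_cmd_tail when those options are configured.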


def healthy(ctx, config):
    """
    Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK.

    :param ctx: Context
    :param config: Configuration
    """
    config = config if isinstance(config, dict) else dict()
    cluster_name = config.get('cluster', 'ceph')
    log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )
    teuthology.wait_until_healthy(
        ctx,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )

    if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
        # Some MDSs exist, wait for them to be healthy
        ceph_fs = Filesystem(ctx)  # TODO: make Filesystem cluster-aware
        ceph_fs.wait_for_daemons(timeout=300)


def wait_for_osds_up(ctx, config):
    """
    Wait for all osd's to come up.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Waiting until ceph osds are all up...')
    cluster_name = config.get('cluster', 'ceph')
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon0_remote,
    )


def wait_for_mon_quorum(ctx, config):
    """
    Check remote ceph status until all monitors are up.

    :param ctx: Context
    :param config: Configuration
    """
    if isinstance(config, dict):
        mons = config['daemons']
        cluster_name = config.get('cluster', 'ceph')
    else:
        assert isinstance(config, list)
        mons = config
        cluster_name = 'ceph'
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (remote,) = ctx.cluster.only(firstmon).remotes.keys()
    with contextutil.safe_while(sleep=10, tries=60,
                                action='wait for monitor quorum') as proceed:
        while proceed():
            r = remote.run(
                args=[
                    'sudo',
                    'ceph',
                    'quorum_status',
                ],
                stdout=StringIO(),
                logger=log.getChild('quorum_status'),
            )
            j = json.loads(r.stdout.getvalue())
            q = j.get('quorum_names', [])
            log.debug('Quorum: %s', q)
            if sorted(q) == sorted(mons):
                break
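
# A hedged usage sketch, assuming the usual task syntax:
#
#   tasks:
#   - ceph.wait_for_mon_quorum: [a, b, c]
#
# or, as a dictionary with an explicit cluster:
#
#   tasks:
#   - ceph.wait_for_mon_quorum:
#       daemons: [a, b, c]
#       cluster: backup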


def created_pool(ctx, config):
    """
    Add new pools to the dictionary of pools that the ceph-manager
    knows about.
    """
    for new_pool in config:
        if new_pool not in ctx.managers['ceph'].pools:
            ctx.managers['ceph'].pools[new_pool] = ctx.managers['ceph'].get_pool_property(
                new_pool, 'pg_num')
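
# A hedged usage sketch: a suite that creates pools outside ceph_manager
# can register them afterwards so pool bookkeeping stays accurate, e.g.
#
#   tasks:
#   - ceph.created_pool: [mypool]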


@contextlib.contextmanager
def restart(ctx, config):
    """
    restart ceph daemons

    For example::

      tasks:
      - ceph.restart: [all]

    For example::

      tasks:
      - ceph.restart: [osd.0, mon.1, mds.*]

    or::

      tasks:
      - ceph.restart:
          daemons: [osd.0, mon.1]
          wait-for-healthy: false
          wait-for-osds-up: true

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    clusters = set()
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).restart()
        clusters.add(cluster)

    manager = ctx.managers['ceph']
    for dmon in daemons:
        if '.' in dmon:
            dm_parts = dmon.split('.')
            if dm_parts[1].isdigit():
                if dm_parts[0] == 'osd':
                    manager.mark_down_osd(int(dm_parts[1]))

    if config.get('wait-for-healthy', True):
        for cluster in clusters:
            healthy(ctx=ctx, config=dict(cluster=cluster))
    if config.get('wait-for-osds-up', False):
        for cluster in clusters:
            wait_for_osds_up(ctx=ctx, config=dict(cluster=cluster))
    yield


@contextlib.contextmanager
def stop(ctx, config):
    """
    Stop ceph daemons

    For example::

      tasks:
      - ceph.stop: [mds.*]

      tasks:
      - ceph.stop: [osd.0, osd.2]

      tasks:
      - ceph.stop:
          daemons: [osd.0, osd.2]

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).stop()

    yield


@contextlib.contextmanager
def wait_for_failure(ctx, config):
    """
    Wait for a failure of a ceph daemon

    For example::

      tasks:
      - ceph.wait_for_failure: [mds.*]

      tasks:
      - ceph.wait_for_failure: [osd.0, osd.2]

      tasks:
      - ceph.wait_for_failure:
          daemons: [osd.0, osd.2]

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        try:
            ctx.daemons.get_daemon(type_, id_, cluster).wait()
        except Exception:
            log.info('Saw expected daemon failure.  Continuing.')
            ctx.daemons.get_daemon(type_, id_, cluster).stop()
        else:
            raise RuntimeError('daemon %s did not fail' % role)

    yield


def validate_config(ctx, config):
    """
    Perform some simple validation on task configuration.
    Raises exceptions.ConfigError if an error is found.
    """
    # check for osds from multiple clusters on the same host
    for remote, roles_for_host in ctx.cluster.remotes.items():
        last_cluster = None
        last_role = None
        for role in roles_for_host:
            role_cluster, role_type, _ = teuthology.split_role(role)
            if role_type != 'osd':
                continue
            if last_cluster and last_cluster != role_cluster:
                msg = "Host should not have osds (%s and %s) from multiple clusters" % (
                    last_role, role)
                raise exceptions.ConfigError(msg)
            last_cluster = role_cluster
            last_role = role
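
# For example, a hypothetical layout like
#
#   roles:
#   - [mon.a, osd.0, backup.osd.1]
#
# fails validation, because one host would carry osds from both the
# default cluster and the 'backup' cluster.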


@contextlib.contextmanager
def task(ctx, config):
    """
    Set up and tear down a Ceph cluster.

    For example::

        tasks:
        - ceph:

    You can also specify what branch to run::

        tasks:
        - ceph:
            branch: foo

    Or a sha1::

        tasks:
        - ceph:
            sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed

    Or a local source dir::

        tasks:
        - ceph:
            path: /home/sage/ceph

    To capture code coverage data, use::

        tasks:
        - ceph:
            coverage: true

    To use btrfs, ext4, or xfs on the target's scratch disks, use::

        tasks:
        - ceph:
            fs: xfs
            mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
            mount_options: [nobarrier, inode64]

    Note, this will cause the task to check the /scratch_devs file on each node
    for available devices.  If no such file is found, /dev/sdb will be used.

    To run some daemons under valgrind, include their names
    and the tool/args to use in a valgrind section::

        tasks:
        - ceph:
            valgrind:
              mds.1: --tool=memcheck
              osd.1: [--tool=memcheck, --leak-check=no]

    Those nodes which are using memcheck or valgrind will get
    checked for bad results.

    To adjust or modify config options, use::

        tasks:
        - ceph:
            conf:
              section:
                key: value

    For example::

        tasks:
        - ceph:
            conf:
              mds.0:
                some option: value
                other key: other value

    By default, the cluster log is checked for errors and warnings,
    and the run marked failed if any appear. You can ignore log
    entries by giving a list of egrep compatible regexes, i.e.:

        tasks:
        - ceph:
            log-whitelist: ['foo.*bar', 'bad message']

    To run multiple ceph clusters, use multiple ceph tasks, and roles
    with a cluster name prefix, e.g. cluster1.client.0. Roles with no
    cluster use the default cluster name, 'ceph'. OSDs from separate
    clusters must be on separate hosts. Clients and non-osd daemons
    from multiple clusters may be colocated. For each cluster, add an
    instance of the ceph task with the cluster name specified, e.g.::

        roles:
        - [mon.a, osd.0, osd.1]
        - [backup.mon.a, backup.osd.0, backup.osd.1]
        - [client.0, backup.client.0]
        tasks:
        - ceph:
            cluster: ceph
        - ceph:
            cluster: backup

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        "task ceph only supports a dictionary for configuration"

    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph', {}))

    first_ceph_cluster = False
    if not hasattr(ctx, 'daemons'):
        first_ceph_cluster = True
        ctx.daemons = DaemonGroup()

    testdir = teuthology.get_testdir(ctx)
    if config.get('coverage'):
        coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
        log.info('Creating coverage directory...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'install', '-d', '-m0755', '--',
                    coverage_dir,
                ],
                wait=False,
            )
        )

    if 'cluster' not in config:
        config['cluster'] = 'ceph'

    validate_config(ctx, config)

    subtasks = []
    if first_ceph_cluster:
        # these tasks handle general log setup and parsing on all hosts,
        # so they should only be run once
        subtasks = [
            lambda: ceph_log(ctx=ctx, config=None),
            lambda: valgrind_post(ctx=ctx, config=config),
        ]

    subtasks += [
        lambda: cluster(ctx=ctx, config=dict(
            conf=config.get('conf', {}),
            fs=config.get('fs', 'xfs'),
            mkfs_options=config.get('mkfs_options', None),
            mount_options=config.get('mount_options', None),
            block_journal=config.get('block_journal', None),
            tmpfs_journal=config.get('tmpfs_journal', None),
            skip_mgr_daemons=config.get('skip_mgr_daemons', False),
            log_whitelist=config.get('log-whitelist', []),
            cpu_profile=set(config.get('cpu_profile', []),),
            cluster=config['cluster'],
        )),
        lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
        lambda: run_daemon(ctx=ctx, config=config, type_='mgr'),
        lambda: crush_setup(ctx=ctx, config=config),
        lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
        lambda: cephfs_setup(ctx=ctx, config=config),
        lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
    ]

    with contextutil.nested(*subtasks):
        first_mon = teuthology.get_first_mon(ctx, config, config['cluster'])
        (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
        if not hasattr(ctx, 'managers'):
            ctx.managers = {}
        ctx.managers[config['cluster']] = CephManager(
            mon,
            ctx=ctx,
            logger=log.getChild('ceph_manager.' + config['cluster']),
            cluster=config['cluster'],
        )

        try:
            if config.get('wait-for-healthy', True):
                healthy(ctx=ctx, config=dict(cluster=config['cluster']))

            yield
        finally:
            if config.get('wait-for-scrub', True):
                osd_scrub_pgs(ctx, config)