4 Handle the setup, starting, and clean-up of a Ceph cluster.
6 from cStringIO
import StringIO
18 from paramiko
import SSHException
19 from ceph_manager
import CephManager
, write_conf
20 from tasks
.cephfs
.filesystem
import Filesystem
21 from teuthology
import misc
as teuthology
22 from teuthology
import contextutil
23 from teuthology
import exceptions
24 from teuthology
.orchestra
import run
25 import ceph_client
as cclient
26 from teuthology
.orchestra
.daemon
import DaemonGroup
28 CEPH_ROLE_TYPES
= ['mon', 'mgr', 'osd', 'mds', 'rgw']
30 log
= logging
.getLogger(__name__
)
def generate_caps(type_):
    """
    Each call will return the next capability for each system type
    (essentially a subset of possible role values). Valid types are osd,
    mds and client.

    :param type_: Daemon type ('osd', 'mgr' or 'mds').
    :yields: Alternating '--cap', subsystem, capability strings suitable
             for passing to ceph-authtool.
    """
    # NOTE(review): the defaults table was lost in extraction; reconstructed
    # from upstream ceph-qa-suite -- verify against the original file.
    defaults = dict(
        osd=dict(
            mon='allow *',
            mgr='allow *',
            osd='allow *',
        ),
        mgr=dict(
            mon='allow profile mgr',
            osd='allow *',
            mds='allow *',
        ),
        mds=dict(
            mon='allow *',
            mgr='allow *',
            osd='allow *',
            mds='allow',
        ),
    )
    for subsystem, capability in defaults[type_].items():
        yield '--cap'
        yield subsystem
        yield capability
67 @contextlib.contextmanager
68 def ceph_log(ctx
, config
):
70 Create /var/log/ceph log directory that is open to everyone.
71 Add valgrind and profiling-logger directories.
74 :param config: Configuration
76 log
.info('Making ceph log dir writeable by non-root...')
88 log
.info('Disabling ceph logrotate...')
94 '/etc/logrotate.d/ceph',
99 log
.info('Creating extra log directories...')
104 'install', '-d', '-m0777', '--',
105 '/var/log/ceph/valgrind',
106 '/var/log/ceph/profiling-logger',
112 class Rotater(object):
113 stop_event
= gevent
.event
.Event()
115 def invoke_logrotate(self
):
116 # 1) install ceph-test.conf in /etc/logrotate.d
117 # 2) continuously loop over logrotate invocation with ceph-test.conf
118 while not self
.stop_event
.is_set():
119 self
.stop_event
.wait(timeout
=30)
123 args
=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'
128 except exceptions
.ConnectionLostError
as e
:
129 # Some tests may power off nodes during test, in which
130 # case we will see connection errors that we should ignore.
131 log
.debug("Missed logrotate, node '{0}' is offline".format(
133 except EOFError as e
:
134 # Paramiko sometimes raises this when it fails to
135 # connect to a node during open_session. As with
136 # ConnectionLostError, we ignore this because nodes
137 # are allowed to get power cycled during tests.
138 log
.debug("Missed logrotate, EOFError")
139 except SSHException
as e
:
140 log
.debug("Missed logrotate, SSHException")
141 except socket
.error
as e
:
142 if e
.errno
== errno
.EHOSTUNREACH
:
143 log
.debug("Missed logrotate, host unreachable")
148 self
.thread
= gevent
.spawn(self
.invoke_logrotate
)
151 self
.stop_event
.set()
154 def write_rotate_conf(ctx
, daemons
):
155 testdir
= teuthology
.get_testdir(ctx
)
156 rotate_conf_path
= os
.path
.join(os
.path
.dirname(__file__
), 'logrotate.conf')
157 with
file(rotate_conf_path
, 'rb') as f
:
159 for daemon
, size
in daemons
.iteritems():
160 log
.info('writing logrotate stanza for {daemon}'.format(daemon
=daemon
))
161 conf
+= f
.read().format(daemon_type
=daemon
, max_size
=size
)
164 for remote
in ctx
.cluster
.remotes
.iterkeys():
165 teuthology
.write_file(remote
=remote
,
166 path
='{tdir}/logrotate.ceph-test.conf'.format(tdir
=testdir
),
173 '{tdir}/logrotate.ceph-test.conf'.format(tdir
=testdir
),
174 '/etc/logrotate.d/ceph-test.conf',
179 '/etc/logrotate.d/ceph-test.conf',
184 '/etc/logrotate.d/ceph-test.conf'
187 remote
.chcon('/etc/logrotate.d/ceph-test.conf',
188 'system_u:object_r:etc_t:s0')
190 if ctx
.config
.get('log-rotate'):
191 daemons
= ctx
.config
.get('log-rotate')
192 log
.info('Setting up log rotation with ' + str(daemons
))
193 write_rotate_conf(ctx
, daemons
)
194 logrotater
= Rotater()
200 if ctx
.config
.get('log-rotate'):
201 log
.info('Shutting down logrotate')
204 args
=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'
207 if ctx
.archive
is not None and \
208 not (ctx
.config
.get('archive-on-error') and ctx
.summary
['success']):
210 log
.info('Compressing logs...')
233 log
.info('Archiving logs...')
234 path
= os
.path
.join(ctx
.archive
, 'remote')
236 for remote
in ctx
.cluster
.remotes
.iterkeys():
237 sub
= os
.path
.join(path
, remote
.shortname
)
239 teuthology
.pull_directory(remote
, '/var/log/ceph',
240 os
.path
.join(sub
, 'log'))
def assign_devs(roles, devs):
    """
    Create a dictionary of devs indexed by roles

    :param roles: List of roles
    :param devs: Corresponding list of devices.
    :returns: Dictionary of devs indexed by roles.
    """
    # zip truncates to the shorter sequence, so extra devices are ignored.
    return dict(zip(roles, devs))
@contextlib.contextmanager
def valgrind_post(ctx, config):
    """
    After the tests run, look throught all the valgrind logs. Exceptions are raised
    if textual errors occured in the logs, or if valgrind exceptions were detected in
    the logs.

    :param ctx: Context
    :param config: Configuration
    """
    try:
        yield
    finally:
        lookup_procs = list()
        log.info('Checking for errors in any valgrind logs...')
        for remote in ctx.cluster.remotes.iterkeys():
            # look at valgrind logs for each node
            # NOTE(review): this grep pipeline was lost in extraction;
            # reconstructed from upstream ceph-qa-suite -- verify.
            proc = remote.run(
                args=[
                    'sudo',
                    'zgrep',
                    '<kind>',
                    run.Raw('/var/log/ceph/valgrind/*'),
                    '/dev/null',  # include a second file so that we always get a filename prefix on the output
                    run.Raw('|'),
                    'sort',
                    run.Raw('|'),
                    'uniq',
                ],
                wait=False,
                check_status=False,
                stdout=StringIO(),
            )
            lookup_procs.append((proc, remote))

        valgrind_exception = None
        for (proc, remote) in lookup_procs:
            proc.wait()
            out = proc.stdout.getvalue()
            for line in out.split('\n'):
                if line == '':
                    continue
                try:
                    (file, kind) = line.split(':')
                except ValueError:
                    log.error('failed to split line %s', line)
                    raise
                log.debug('file %s kind %s', file, kind)
                # MDS "Lost" reports are tolerated; anything else is an issue.
                if (file.find('mds') >= 0) and kind.find('Lost') > 0:
                    continue
                log.error('saw valgrind issue %s in %s', kind, file)
                valgrind_exception = Exception('saw valgrind issues')

        if config.get('expect_valgrind_errors'):
            if not valgrind_exception:
                raise Exception('expected valgrind issues and found none')
        else:
            if valgrind_exception:
                raise valgrind_exception
@contextlib.contextmanager
def crush_setup(ctx, config):
    """
    Apply the configured crush tunables profile on the first monitor.

    :param ctx: Context
    :param config: Configuration
    """
    cluster_name = config['cluster']
    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    # 'default' leaves the tunables alone unless overridden by the task.
    profile = config.get('crush_tunables', 'default')
    log.info('Setting crush tunables to %s', profile)
    mon_remote.run(
        args=['sudo', 'ceph', '--cluster', cluster_name,
              'osd', 'crush', 'tunables', profile])
    yield
@contextlib.contextmanager
def create_rbd_pool(ctx, config):
    """
    Wait for OSDs to come up and then create the default 'rbd' pool.

    :param ctx: Context
    :param config: Configuration
    """
    cluster_name = config['cluster']
    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    log.info('Waiting for OSDs to come up')
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon_remote,
        ceph_cluster=cluster_name,
    )
    log.info('Creating RBD pool')
    mon_remote.run(
        args=['sudo', 'ceph', '--cluster', cluster_name,
              'osd', 'pool', 'create', 'rbd', '8'])
    yield
@contextlib.contextmanager
def cephfs_setup(ctx, config):
    """
    Create a CephFS filesystem if any MDS roles are present in the cluster.

    :param ctx: Context
    :param config: Configuration
    """
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because requires mon cluster to be up and running
    if mdss.remotes:
        log.info('Setting up CephFS filesystem...')

        fs = Filesystem(ctx, create='cephfs')

        # Standby MDS roles end in '-s' (or contain '-s-'); only the
        # remaining roles count toward max_mds.
        is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
        all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
        num_active = len([r for r in all_roles if is_active_mds(r)])

        fs.set_max_mds(num_active)
        fs.set_allow_dirfrags(True)

    yield
373 @contextlib.contextmanager
374 def cluster(ctx
, config
):
376 Handle the creation and removal of a ceph cluster.
379 Create directories needed for the cluster.
380 Create remote journals for all osds.
381 Create and set keyring.
382 Copy the monmap to tht test systems.
386 Add keyring information to monmaps
390 If errors occured, extract a failure message and store in ctx.summary.
391 Unmount all test files and temporary journaling files.
392 Save the monitor information and archive all ceph logs.
393 Cleanup the keyring setup, and remove all monitor map and data files left over.
396 :param config: Configuration
398 if ctx
.config
.get('use_existing_cluster', False) is True:
399 log
.info("'use_existing_cluster' is true; skipping cluster creation")
402 testdir
= teuthology
.get_testdir(ctx
)
403 cluster_name
= config
['cluster']
404 data_dir
= '{tdir}/{cluster}.data'.format(tdir
=testdir
, cluster
=cluster_name
)
405 log
.info('Creating ceph cluster %s...', cluster_name
)
409 'install', '-d', '-m0755', '--',
420 'install', '-d', '-m0777', '--', '/var/run/ceph',
427 remote_to_roles_to_devs
= {}
428 remote_to_roles_to_journals
= {}
429 osds
= ctx
.cluster
.only(teuthology
.is_type('osd', cluster_name
))
430 for remote
, roles_for_host
in osds
.remotes
.iteritems():
431 devs
= teuthology
.get_scratch_devices(remote
)
433 roles_to_journals
= {}
435 log
.info('fs option selected, checking for scratch devs')
436 log
.info('found devs: %s' % (str(devs
),))
437 devs_id_map
= teuthology
.get_wwn_id_map(remote
, devs
)
438 iddevs
= devs_id_map
.values()
439 roles_to_devs
= assign_devs(
440 teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
), iddevs
442 if len(roles_to_devs
) < len(iddevs
):
443 iddevs
= iddevs
[len(roles_to_devs
):]
444 devs_to_clean
[remote
] = []
446 if config
.get('block_journal'):
447 log
.info('block journal enabled')
448 roles_to_journals
= assign_devs(
449 teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
), iddevs
451 log
.info('journal map: %s', roles_to_journals
)
453 if config
.get('tmpfs_journal'):
454 log
.info('tmpfs journal enabled')
455 roles_to_journals
= {}
456 remote
.run(args
=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
457 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
):
458 tmpfs
= '/mnt/' + role
459 roles_to_journals
[role
] = tmpfs
460 remote
.run(args
=['truncate', '-s', '1500M', tmpfs
])
461 log
.info('journal map: %s', roles_to_journals
)
463 log
.info('dev map: %s' % (str(roles_to_devs
),))
464 remote_to_roles_to_devs
[remote
] = roles_to_devs
465 remote_to_roles_to_journals
[remote
] = roles_to_journals
467 log
.info('Generating config...')
468 remotes_and_roles
= ctx
.cluster
.remotes
.items()
469 roles
= [role_list
for (remote
, role_list
) in remotes_and_roles
]
470 ips
= [host
for (host
, port
) in
471 (remote
.ssh
.get_transport().getpeername() for (remote
, role_list
) in remotes_and_roles
)]
472 conf
= teuthology
.skeleton_config(ctx
, roles
=roles
, ips
=ips
, cluster
=cluster_name
)
473 for remote
, roles_to_journals
in remote_to_roles_to_journals
.iteritems():
474 for role
, journal
in roles_to_journals
.iteritems():
475 name
= teuthology
.ceph_role(role
)
478 conf
[name
]['osd journal'] = journal
479 for section
, keys
in config
['conf'].iteritems():
480 for key
, value
in keys
.iteritems():
481 log
.info("[%s] %s = %s" % (section
, key
, value
))
482 if section
not in conf
:
484 conf
[section
][key
] = value
486 if config
.get('tmpfs_journal'):
487 conf
['journal dio'] = False
489 if not hasattr(ctx
, 'ceph'):
491 ctx
.ceph
[cluster_name
] = argparse
.Namespace()
492 ctx
.ceph
[cluster_name
].conf
= conf
494 default_keyring
= '/etc/ceph/{cluster}.keyring'.format(cluster
=cluster_name
)
495 keyring_path
= config
.get('keyring_path', default_keyring
)
497 coverage_dir
= '{tdir}/archive/coverage'.format(tdir
=testdir
)
499 firstmon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
501 log
.info('Setting up %s...' % firstmon
)
502 ctx
.cluster
.only(firstmon
).run(
513 ctx
.cluster
.only(firstmon
).run(
525 ctx
.cluster
.only(firstmon
).run(
533 (mon0_remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
534 monmap_path
= '{tdir}/{cluster}.monmap'.format(tdir
=testdir
,
535 cluster
=cluster_name
)
536 fsid
= teuthology
.create_simple_monmap(
542 if not 'global' in conf
:
544 conf
['global']['fsid'] = fsid
546 default_conf_path
= '/etc/ceph/{cluster}.conf'.format(cluster
=cluster_name
)
547 conf_path
= config
.get('conf_path', default_conf_path
)
548 log
.info('Writing %s for FSID %s...' % (conf_path
, fsid
))
549 write_conf(ctx
, conf_path
, cluster_name
)
551 log
.info('Creating admin key on %s...' % firstmon
)
552 ctx
.cluster
.only(firstmon
).run(
560 '--name=client.admin',
562 '--cap', 'mon', 'allow *',
563 '--cap', 'osd', 'allow *',
564 '--cap', 'mds', 'allow *',
565 '--cap', 'mgr', 'allow *',
570 log
.info('Copying monmap to all nodes...')
571 keyring
= teuthology
.get_file(
575 monmap
= teuthology
.get_file(
580 for rem
in ctx
.cluster
.remotes
.iterkeys():
581 # copy mon key and initial monmap
582 log
.info('Sending monmap to node {remote}'.format(remote
=rem
))
583 teuthology
.sudo_write_file(
589 teuthology
.write_file(
595 log
.info('Setting up mon nodes...')
596 mons
= ctx
.cluster
.only(teuthology
.is_type('mon', cluster_name
))
598 if not config
.get('skip_mgr_daemons', False):
599 log
.info('Setting up mgr nodes...')
600 mgrs
= ctx
.cluster
.only(teuthology
.is_type('mgr', cluster_name
))
601 for remote
, roles_for_host
in mgrs
.remotes
.iteritems():
602 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'mgr',
604 _
, _
, id_
= teuthology
.split_role(role
)
605 mgr_dir
= '/var/lib/ceph/mgr/{cluster}-{id}'.format(
606 cluster
=cluster_name
,
623 '--name=mgr.{id}'.format(id=id_
),
624 mgr_dir
+ '/keyring',
628 log
.info('Setting up mds nodes...')
629 mdss
= ctx
.cluster
.only(teuthology
.is_type('mds', cluster_name
))
630 for remote
, roles_for_host
in mdss
.remotes
.iteritems():
631 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'mds',
633 _
, _
, id_
= teuthology
.split_role(role
)
634 mds_dir
= '/var/lib/ceph/mds/{cluster}-{id}'.format(
635 cluster
=cluster_name
,
652 '--name=mds.{id}'.format(id=id_
),
653 mds_dir
+ '/keyring',
657 cclient
.create_keyring(ctx
, cluster_name
)
658 log
.info('Running mkfs on osd nodes...')
660 if not hasattr(ctx
, 'disk_config'):
661 ctx
.disk_config
= argparse
.Namespace()
662 if not hasattr(ctx
.disk_config
, 'remote_to_roles_to_dev'):
663 ctx
.disk_config
.remote_to_roles_to_dev
= {}
664 if not hasattr(ctx
.disk_config
, 'remote_to_roles_to_journals'):
665 ctx
.disk_config
.remote_to_roles_to_journals
= {}
666 if not hasattr(ctx
.disk_config
, 'remote_to_roles_to_dev_mount_options'):
667 ctx
.disk_config
.remote_to_roles_to_dev_mount_options
= {}
668 if not hasattr(ctx
.disk_config
, 'remote_to_roles_to_dev_fstype'):
669 ctx
.disk_config
.remote_to_roles_to_dev_fstype
= {}
671 teuthology
.deep_merge(ctx
.disk_config
.remote_to_roles_to_dev
, remote_to_roles_to_devs
)
672 teuthology
.deep_merge(ctx
.disk_config
.remote_to_roles_to_journals
, remote_to_roles_to_journals
)
674 log
.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r
=str(ctx
.disk_config
.remote_to_roles_to_dev
)))
675 for remote
, roles_for_host
in osds
.remotes
.iteritems():
676 roles_to_devs
= remote_to_roles_to_devs
[remote
]
677 roles_to_journals
= remote_to_roles_to_journals
[remote
]
679 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
):
680 _
, _
, id_
= teuthology
.split_role(role
)
681 mnt_point
= '/var/lib/ceph/osd/{cluster}-{id}'.format(cluster
=cluster_name
, id=id_
)
689 log
.info(str(roles_to_journals
))
691 if roles_to_devs
.get(role
):
692 dev
= roles_to_devs
[role
]
693 fs
= config
.get('fs')
695 mkfs_options
= config
.get('mkfs_options')
696 mount_options
= config
.get('mount_options')
698 # package = 'btrfs-tools'
699 if mount_options
is None:
700 mount_options
= ['noatime', 'user_subvol_rm_allowed']
701 if mkfs_options
is None:
702 mkfs_options
= ['-m', 'single',
706 # package = 'xfsprogs'
707 if mount_options
is None:
708 mount_options
= ['noatime']
709 if mkfs_options
is None:
710 mkfs_options
= ['-f', '-i', 'size=2048']
711 if fs
== 'ext4' or fs
== 'ext3':
712 if mount_options
is None:
713 mount_options
= ['noatime', 'user_xattr']
715 if mount_options
is None:
717 if mkfs_options
is None:
719 mkfs
= ['mkfs.%s' % fs
] + mkfs_options
720 log
.info('%s on %s on %s' % (mkfs
, dev
, remote
))
721 if package
is not None:
725 'apt-get', 'install', '-y', package
731 remote
.run(args
=['yes', run
.Raw('|')] + ['sudo'] + mkfs
+ [dev
])
732 except run
.CommandFailedError
:
733 # Newer btfs-tools doesn't prompt for overwrite, use -f
734 if '-f' not in mount_options
:
735 mkfs_options
.append('-f')
736 mkfs
= ['mkfs.%s' % fs
] + mkfs_options
737 log
.info('%s on %s on %s' % (mkfs
, dev
, remote
))
738 remote
.run(args
=['yes', run
.Raw('|')] + ['sudo'] + mkfs
+ [dev
])
740 log
.info('mount %s on %s -o %s' % (dev
, remote
,
741 ','.join(mount_options
)))
747 '-o', ','.join(mount_options
),
754 'sudo', '/sbin/restorecon', mnt_point
,
758 if not remote
in ctx
.disk_config
.remote_to_roles_to_dev_mount_options
:
759 ctx
.disk_config
.remote_to_roles_to_dev_mount_options
[remote
] = {}
760 ctx
.disk_config
.remote_to_roles_to_dev_mount_options
[remote
][role
] = mount_options
761 if not remote
in ctx
.disk_config
.remote_to_roles_to_dev_fstype
:
762 ctx
.disk_config
.remote_to_roles_to_dev_fstype
[remote
] = {}
763 ctx
.disk_config
.remote_to_roles_to_dev_fstype
[remote
][role
] = fs
764 devs_to_clean
[remote
].append(mnt_point
)
766 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
):
767 _
, _
, id_
= teuthology
.split_role(role
)
781 '--monmap', monmap_path
,
785 log
.info('Reading keys from all nodes...')
788 for remote
, roles_for_host
in ctx
.cluster
.remotes
.iteritems():
789 for type_
in ['mgr', 'mds', 'osd']:
790 if type_
== 'mgr' and config
.get('skip_mgr_daemons', False):
792 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, type_
, cluster_name
):
793 _
, _
, id_
= teuthology
.split_role(role
)
794 data
= teuthology
.get_file(
796 path
='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
799 cluster
=cluster_name
,
803 keys
.append((type_
, id_
, data
))
805 for remote
, roles_for_host
in ctx
.cluster
.remotes
.iteritems():
806 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'client', cluster_name
):
807 _
, _
, id_
= teuthology
.split_role(role
)
808 data
= teuthology
.get_file(
810 path
='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_
, cluster
=cluster_name
)
812 keys
.append(('client', id_
, data
))
815 log
.info('Adding keys to all mons...')
826 teuthology
.feed_many_stdins_and_close(keys_fp
, writes
)
828 for type_
, id_
, data
in keys
:
838 '--name={type}.{id}'.format(
842 ] + list(generate_caps(type_
)),
847 log
.info('Running mkfs on mon nodes...')
848 for remote
, roles_for_host
in mons
.remotes
.iteritems():
849 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'mon', cluster_name
):
850 _
, _
, id_
= teuthology
.split_role(role
)
856 '/var/lib/ceph/mon/{cluster}-{id}'.format(id=id_
, cluster
=cluster_name
),
866 '--cluster', cluster_name
,
869 '--monmap', monmap_path
,
870 '--keyring', keyring_path
,
888 # we need to know this below
889 ctx
.summary
['success'] = False
892 (mon0_remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
894 log
.info('Checking cluster log for badness...')
896 def first_in_ceph_log(pattern
, excludes
):
898 Find the first occurence of the pattern specified in the Ceph log,
899 Returns None if none found.
901 :param pattern: Pattern scanned for.
902 :param excludes: Patterns to ignore.
903 :return: First line of text (or None if not found)
908 '/var/log/ceph/{cluster}.log'.format(cluster
=cluster_name
),
910 for exclude
in excludes
:
911 args
.extend([run
.Raw('|'), 'egrep', '-v', exclude
])
913 run
.Raw('|'), 'head', '-n', '1',
919 stdout
= r
.stdout
.getvalue()
924 if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
925 config
['log_whitelist']) is not None:
926 log
.warning('Found errors (ERR|WRN|SEC) in cluster log')
927 ctx
.summary
['success'] = False
928 # use the most severe problem as the failure reason
929 if 'failure_reason' not in ctx
.summary
:
930 for pattern
in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
931 match
= first_in_ceph_log(pattern
, config
['log_whitelist'])
932 if match
is not None:
933 ctx
.summary
['failure_reason'] = \
934 '"{match}" in cluster log'.format(
935 match
=match
.rstrip('\n'),
939 for remote
, dirs
in devs_to_clean
.iteritems():
941 log
.info('Unmounting %s on %s' % (dir_
, remote
))
953 except Exception as e
:
956 run
.Raw('PATH=/usr/sbin:$PATH'),
963 if config
.get('tmpfs_journal'):
964 log
.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
965 for remote
, roles_for_host
in osds
.remotes
.iteritems():
967 args
=['sudo', 'umount', '-f', '/mnt'],
971 if ctx
.archive
is not None and \
972 not (ctx
.config
.get('archive-on-error') and ctx
.summary
['success']):
974 # archive mon data, too
975 log
.info('Archiving mon data...')
976 path
= os
.path
.join(ctx
.archive
, 'data')
980 if e
.errno
== errno
.EEXIST
:
984 for remote
, roles
in mons
.remotes
.iteritems():
986 is_mon
= teuthology
.is_type('mon', cluster_name
)
988 _
, _
, id_
= teuthology
.split_role(role
)
989 mon_dir
= '/var/lib/ceph/mon/' + \
990 '{0}-{1}'.format(cluster_name
, id_
)
991 teuthology
.pull_directory_tarball(
994 path
+ '/' + role
+ '.tgz')
996 log
.info('Cleaning ceph cluster...')
1008 run
.Raw('{tdir}/../*.pid'.format(tdir
=testdir
)),
1015 def osd_scrub_pgs(ctx
, config
):
1017 Scrub pgs when we exit.
1019 First make sure all pgs are active and clean.
1020 Next scrub all osds.
1021 Then periodically check until all pgs have scrub time stamps that
1022 indicate the last scrub completed. Time out if no progess is made
1023 here after two minutes.
1027 cluster_name
= config
['cluster']
1028 manager
= ctx
.managers
[cluster_name
]
1030 for _
in range(0, retries
):
1031 stats
= manager
.get_pg_stats()
1032 bad
= [stat
['pgid'] for stat
in stats
if 'active+clean' not in stat
['state']]
1037 "Waiting for all PGs to be active and clean, waiting on %s" % bad
)
1040 raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
1041 check_time_now
= time
.localtime()
1043 all_roles
= teuthology
.all_roles(ctx
.cluster
)
1044 for role
in teuthology
.cluster_roles_of_type(all_roles
, 'osd', cluster_name
):
1045 log
.info("Scrubbing {osd}".format(osd
=role
))
1046 _
, _
, id_
= teuthology
.split_role(role
)
1047 # allow this to fail; in certain cases the OSD might not be up
1048 # at this point. we will catch all pgs below.
1050 manager
.raw_cluster_cmd('osd', 'deep-scrub', id_
)
1051 except run
.CommandFailedError
:
1057 stats
= manager
.get_pg_stats()
1058 timez
= [(stat
['pgid'],stat
['last_scrub_stamp']) for stat
in stats
]
1061 for (pgid
, tmval
) in timez
:
1062 pgtm
= time
.strptime(tmval
[0:tmval
.find('.')], '%Y-%m-%d %H:%M:%S')
1063 if pgtm
> check_time_now
:
1066 log
.info('pgid %s last_scrub_stamp %s %s <= %s', pgid
, tmval
, pgtm
, check_time_now
)
1068 if thiscnt
> prev_good
:
1073 if gap_cnt
% 6 == 0:
1074 for (pgid
, tmval
) in timez
:
1075 # re-request scrub every so often in case the earlier
1076 # request was missed. do not do it everytime because
1077 # the scrub may be in progress or not reported yet and
1078 # we will starve progress.
1079 manager
.raw_cluster_cmd('pg', 'deep-scrub', pgid
)
1080 if gap_cnt
> retries
:
1081 raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
1083 log
.info('Still waiting for all pgs to be scrubbed.')
1087 @contextlib.contextmanager
1088 def run_daemon(ctx
, config
, type_
):
1090 Run daemons for a role type. Handle the startup and termination of a a daemon.
1091 On startup -- set coverages, cpu_profile, valgrind values for all remotes,
1092 and a max_mds value for one mds.
1093 On cleanup -- Stop all existing daemons of this type.
1096 :param config: Configuration
1097 :paran type_: Role type
1099 cluster_name
= config
['cluster']
1100 log
.info('Starting %s daemons in cluster %s...', type_
, cluster_name
)
1101 testdir
= teuthology
.get_testdir(ctx
)
1102 daemons
= ctx
.cluster
.only(teuthology
.is_type(type_
, cluster_name
))
1104 # check whether any daemons if this type are configured
1107 coverage_dir
= '{tdir}/archive/coverage'.format(tdir
=testdir
)
1109 daemon_signal
= 'kill'
1110 if config
.get('coverage') or config
.get('valgrind') is not None:
1111 daemon_signal
= 'term'
1113 for remote
, roles_for_host
in daemons
.remotes
.iteritems():
1114 is_type_
= teuthology
.is_type(type_
, cluster_name
)
1115 for role
in roles_for_host
:
1116 if not is_type_(role
):
1118 _
, _
, id_
= teuthology
.split_role(role
)
1121 datadir
='/var/lib/ceph/osd/{cluster}-{id}'.format(
1122 cluster
=cluster_name
, id=id_
)
1123 osd_uuid
= teuthology
.get_file(
1125 path
=datadir
+ '/fsid',
1131 'sudo', 'ceph', '--cluster', cluster_name
,
1132 'osd', 'new', osd_uuid
, id_
,
1136 # fallback to pre-luminous (hammer or jewel)
1139 'sudo', 'ceph', '--cluster', cluster_name
,
1140 'osd', 'create', osd_uuid
,
1143 if config
.get('add_osds_to_crush'):
1146 'sudo', 'ceph', '--cluster', cluster_name
,
1147 'osd', 'crush', 'create-or-move', 'osd.' + id_
,
1148 '1.0', 'host=localhost', 'root=default',
1161 'ceph-%s' % (type_
),
1163 '--cluster', cluster_name
,
1166 if type_
in config
.get('cpu_profile', []):
1167 profile_path
= '/var/log/ceph/profiling-logger/%s.prof' % (role
)
1168 run_cmd
.extend(['env', 'CPUPROFILE=%s' % profile_path
])
1170 if config
.get('valgrind') is not None:
1171 valgrind_args
= None
1172 if type_
in config
['valgrind']:
1173 valgrind_args
= config
['valgrind'][type_
]
1174 if role
in config
['valgrind']:
1175 valgrind_args
= config
['valgrind'][role
]
1176 run_cmd
= teuthology
.get_valgrind_args(testdir
, role
,
1180 run_cmd
.extend(run_cmd_tail
)
1182 # always register mgr; don't necessarily start
1183 ctx
.daemons
.register_daemon(
1185 cluster
=cluster_name
,
1187 logger
=log
.getChild(role
),
1191 if type_
!= 'mgr' or not config
.get('skip_mgr_daemons', False):
1192 role
= cluster_name
+ '.' + type_
1193 ctx
.daemons
.get_daemon(type_
, id_
, cluster_name
).restart()
1198 teuthology
.stop_daemons_of_type(ctx
, type_
, cluster_name
)
def healthy(ctx, config):
    """
    Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK.

    :param ctx: Context
    :param config: Configuration
    """
    config = config if isinstance(config, dict) else dict()
    cluster_name = config.get('cluster', 'ceph')
    log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )
    teuthology.wait_until_healthy(
        ctx,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )

    if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
        # Some MDSs exist, wait for them to be healthy
        ceph_fs = Filesystem(ctx) # TODO: make Filesystem cluster-aware
        ceph_fs.wait_for_daemons(timeout=300)
def wait_for_osds_up(ctx, config):
    """
    Wait for all osd's to come up.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Waiting until ceph osds are all up...')
    cluster_name = config.get('cluster', 'ceph')
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon0_remote,
    )
def wait_for_mon_quorum(ctx, config):
    """
    Check renote ceph status until all monitors are up.

    :param ctx: Context
    :param config: Configuration; either a list of monitor names or a dict
                   with a 'daemons' list and optional 'cluster' name.
    """
    if isinstance(config, dict):
        mons = config['daemons']
        cluster_name = config.get('cluster', 'ceph')
    else:
        assert isinstance(config, list)
        mons = config
        cluster_name = 'ceph'
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (remote,) = ctx.cluster.only(firstmon).remotes.keys()
    with contextutil.safe_while(sleep=10, tries=60,
                                action='wait for monitor quorum') as proceed:
        while proceed():
            # NOTE(review): the quorum_status invocation was lost in
            # extraction; reconstructed from upstream -- verify.
            r = remote.run(
                args=[
                    'sudo',
                    'ceph',
                    'quorum_status',
                ],
                stdout=StringIO(),
                logger=log.getChild('quorum_status'),
            )
            j = json.loads(r.stdout.getvalue())
            q = j.get('quorum_names', [])
            log.debug('Quorum: %s', q)
            if sorted(q) == sorted(mons):
                break
def created_pool(ctx, config):
    """
    Add new pools to the dictionary of pools that the ceph-manager
    knows about.

    :param ctx: Context
    :param config: Iterable of pool names to register.
    """
    for new_pool in config:
        if new_pool not in ctx.managers['ceph'].pools:
            ctx.managers['ceph'].pools[new_pool] = ctx.managers['ceph'].get_pool_property(
                new_pool, 'pg_num')
@contextlib.contextmanager
def restart(ctx, config):
    """
    restart ceph daemons

    For example::
       tasks:
       - ceph.restart: [all]

    For example::
       tasks:
       - ceph.restart: [osd.0, mon.1, mds.*]

    or::

       tasks:
       - ceph.restart:
           daemons: [osd.0, mon.1]
           wait-for-healthy: false
           wait-for-osds-up: true

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    clusters = set()
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).restart()
        clusters.add(cluster)

    manager = ctx.managers['ceph']
    for dmon in daemons:
        if '.' in dmon:
            dm_parts = dmon.split('.')
            if dm_parts[1].isdigit():
                if dm_parts[0] == 'osd':
                    # tell the cluster the osd is down so recovery starts
                    # promptly instead of waiting for the failure timeout
                    manager.mark_down_osd(int(dm_parts[1]))

    if config.get('wait-for-healthy', True):
        for cluster in clusters:
            healthy(ctx=ctx, config=dict(cluster=cluster))
    if config.get('wait-for-osds-up', False):
        for cluster in clusters:
            wait_for_osds_up(ctx=ctx, config=dict(cluster=cluster))
    yield
@contextlib.contextmanager
def stop(ctx, config):
    """
    Stop ceph daemons

    For example::
       tasks:
       - ceph.stop: [mds.*]

       tasks:
       - ceph.stop: [osd.0, osd.2]

       tasks:
       - ceph.stop:
           daemons: [osd.0, osd.2]

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).stop()

    yield
@contextlib.contextmanager
def wait_for_failure(ctx, config):
    """
    Wait for a failure of a ceph daemon

    For example::
       tasks:
       - ceph.wait_for_failure: [mds.*]

       tasks:
       - ceph.wait_for_failure: [osd.0, osd.2]

       tasks:
       - ceph.wait_for_failure:
           daemons: [osd.0, osd.2]

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        try:
            # wait() raises when the daemon exits with failure, which is
            # exactly what this task expects to observe
            ctx.daemons.get_daemon(type_, id_, cluster).wait()
        except:
            log.info('Saw expected daemon failure. Continuing.')
            pass
        else:
            raise RuntimeError('daemon %s did not fail' % role)

    yield
def validate_config(ctx, config):
    """
    Perform some simple validation on task configuration.
    Raises exceptions.ConfigError if an error is found.
    """
    # check for osds from multiple clusters on the same host
    for remote, roles_for_host in ctx.cluster.remotes.items():
        last_cluster = None
        last_role = None
        for role in roles_for_host:
            role_cluster, role_type, _ = teuthology.split_role(role)
            if role_type != 'osd':
                continue
            if last_cluster and last_cluster != role_cluster:
                msg = "Host should not have osds (%s and %s) from multiple clusters" % (
                    last_role, role)
                raise exceptions.ConfigError(msg)
            last_cluster = role_cluster
            last_role = role
1435 @contextlib.contextmanager
1436 def task(ctx
, config
):
1438 Set up and tear down a Ceph cluster.
1446 You can also specify what branch to run::
1462 sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed
1464 Or a local source dir::
1468 path: /home/sage/ceph
1470 To capture code coverage data, use::
1476 To use btrfs, ext4, or xfs on the target's scratch disks, use::
1481 mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
1482 mount_options: [nobarrier, inode64]
1484 Note, this will cause the task to check the /scratch_devs file on each node
1485 for available devices. If no such file is found, /dev/sdb will be used.
1487 To run some daemons under valgrind, include their names
1488 and the tool/args to use in a valgrind section::
1493 mds.1: --tool=memcheck
1494 osd.1: [--tool=memcheck, --leak-check=no]
1496 Those nodes which are using memcheck or valgrind will get
1497 checked for bad results.
1499 To adjust or modify config options, use::
1514 other key: other value
1519 By default, the cluster log is checked for errors and warnings,
1520 and the run marked failed if any appear. You can ignore log
1521 entries by giving a list of egrep compatible regexes, i.e.:
1525 log-whitelist: ['foo.*bar', 'bad message']
1527 To run multiple ceph clusters, use multiple ceph tasks, and roles
1528 with a cluster name prefix, e.g. cluster1.client.0. Roles with no
1529 cluster use the default cluster name, 'ceph'. OSDs from separate
1530 clusters must be on separate hosts. Clients and non-osd daemons
1531 from multiple clusters may be colocated. For each cluster, add an
1532 instance of the ceph task with the cluster name specified, e.g.::
1535 - [mon.a, osd.0, osd.1]
1536 - [backup.mon.a, backup.osd.0, backup.osd.1]
1537 - [client.0, backup.client.0]
1545 :param config: Configuration
1550 assert isinstance(config
, dict), \
1551 "task ceph only supports a dictionary for configuration"
1553 overrides
= ctx
.config
.get('overrides', {})
1554 teuthology
.deep_merge(config
, overrides
.get('ceph', {}))
1556 first_ceph_cluster
= False
1557 if not hasattr(ctx
, 'daemons'):
1558 first_ceph_cluster
= True
1559 ctx
.daemons
= DaemonGroup()
1561 testdir
= teuthology
.get_testdir(ctx
)
1562 if config
.get('coverage'):
1563 coverage_dir
= '{tdir}/archive/coverage'.format(tdir
=testdir
)
1564 log
.info('Creating coverage directory...')
1568 'install', '-d', '-m0755', '--',
1575 if 'cluster' not in config
:
1576 config
['cluster'] = 'ceph'
1578 validate_config(ctx
, config
)
1581 if first_ceph_cluster
:
1582 # these tasks handle general log setup and parsing on all hosts,
1583 # so they should only be run once
1585 lambda: ceph_log(ctx
=ctx
, config
=None),
1586 lambda: valgrind_post(ctx
=ctx
, config
=config
),
1590 lambda: cluster(ctx
=ctx
, config
=dict(
1591 conf
=config
.get('conf', {}),
1592 fs
=config
.get('fs', 'xfs'),
1593 mkfs_options
=config
.get('mkfs_options', None),
1594 mount_options
=config
.get('mount_options', None),
1595 block_journal
=config
.get('block_journal', None),
1596 tmpfs_journal
=config
.get('tmpfs_journal', None),
1597 skip_mgr_daemons
=config
.get('skip_mgr_daemons', False),
1598 log_whitelist
=config
.get('log-whitelist', []),
1599 cpu_profile
=set(config
.get('cpu_profile', []),),
1600 cluster
=config
['cluster'],
1602 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='mon'),
1603 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='mgr'),
1604 lambda: crush_setup(ctx
=ctx
, config
=config
),
1605 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='osd'),
1606 lambda: create_rbd_pool(ctx
=ctx
, config
=config
),
1607 lambda: cephfs_setup(ctx
=ctx
, config
=config
),
1608 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='mds'),
1611 with contextutil
.nested(*subtasks
):
1612 first_mon
= teuthology
.get_first_mon(ctx
, config
, config
['cluster'])
1613 (mon
,) = ctx
.cluster
.only(first_mon
).remotes
.iterkeys()
1614 if not hasattr(ctx
, 'managers'):
1616 ctx
.managers
[config
['cluster']] = CephManager(
1619 logger
=log
.getChild('ceph_manager.' + config
['cluster']),
1620 cluster
=config
['cluster'],
1624 if config
.get('wait-for-healthy', True):
1625 healthy(ctx
=ctx
, config
=dict(cluster
=config
['cluster']))
1629 if config
.get('wait-for-scrub', True):
1630 osd_scrub_pgs(ctx
, config
)
1632 # stop logging health to clog during shutdown, or else we generate
1633 # a bunch of scary messages unrelated to our actual run.
1634 firstmon
= teuthology
.get_first_mon(ctx
, config
, config
['cluster'])
1635 (mon0_remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
1640 '--cluster', config
['cluster'],
1645 '--no-mon-health-to-clog',