Handle the setup, starting, and clean-up of a Ceph cluster.

from cStringIO import StringIO

from paramiko import SSHException
from ceph_manager import CephManager, write_conf
from tasks.cephfs.filesystem import Filesystem
from teuthology import misc as teuthology
from teuthology import contextutil
from teuthology import exceptions
from teuthology.orchestra import run
import ceph_client as cclient
from teuthology.orchestra.daemon import DaemonGroup

CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']

log = logging.getLogger(__name__)
def generate_caps(type_):
    Each call will return the next capability for each system type
    (essentially a subset of possible role values). Valid types are osd,

    for subsystem, capability in defaults[type_].items():
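        # Assumed behavior (the defaults table and yield statements are elided
        # above): each (subsystem, capability) pair appears to be emitted as a
        # '--cap', subsystem, capability argument triple, matching how
        # list(generate_caps(type_)) is spliced into the key-creation command below.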
@contextlib.contextmanager
def ceph_log(ctx, config):
    Create /var/log/ceph log directory that is open to everyone.
    Add valgrind and profiling-logger directories.

    :param config: Configuration

    log.info('Making ceph log dir writeable by non-root...')

    log.info('Disabling ceph logrotate...')
            '/etc/logrotate.d/ceph',

    log.info('Creating extra log directories...')
            'install', '-d', '-m0777', '--',
            '/var/log/ceph/valgrind',
            '/var/log/ceph/profiling-logger',
    class Rotater(object):
        stop_event = gevent.event.Event()

        def invoke_logrotate(self):
            # 1) install ceph-test.conf in /etc/logrotate.d
            # 2) continuously loop over logrotate invocation with ceph-test.conf
            while not self.stop_event.is_set():
                self.stop_event.wait(timeout=30)
                        args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'
                except exceptions.ConnectionLostError as e:
                    # Some tests may power off nodes during test, in which
                    # case we will see connection errors that we should ignore.
                    log.debug("Missed logrotate, node '{0}' is offline".format(
                except EOFError as e:
                    # Paramiko sometimes raises this when it fails to
                    # connect to a node during open_session. As with
                    # ConnectionLostError, we ignore this because nodes
                    # are allowed to get power cycled during tests.
                    log.debug("Missed logrotate, EOFError")
                except SSHException as e:
                    log.debug("Missed logrotate, SSHException")
                except socket.error as e:
                    if e.errno == errno.EHOSTUNREACH:
                        log.debug("Missed logrotate, host unreachable")

            self.thread = gevent.spawn(self.invoke_logrotate)

            self.stop_event.set()
    def write_rotate_conf(ctx, daemons):
        testdir = teuthology.get_testdir(ctx)
        rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf')
        with file(rotate_conf_path, 'rb') as f:
            for daemon, size in daemons.iteritems():
                log.info('writing logrotate stanza for {daemon}'.format(daemon=daemon))
                conf += f.read().format(daemon_type=daemon, max_size=size)

        for remote in ctx.cluster.remotes.iterkeys():
            teuthology.write_file(remote=remote,
                                  path='{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
                    '{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
                    '/etc/logrotate.d/ceph-test.conf',
                    '/etc/logrotate.d/ceph-test.conf',
                    '/etc/logrotate.d/ceph-test.conf'
            remote.chcon('/etc/logrotate.d/ceph-test.conf',
                         'system_u:object_r:etc_t:s0')
    if ctx.config.get('log-rotate'):
        daemons = ctx.config.get('log-rotate')
        log.info('Setting up log rotation with ' + str(daemons))
        write_rotate_conf(ctx, daemons)
        logrotater = Rotater()

        if ctx.config.get('log-rotate'):
            log.info('Shutting down logrotate')
                args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            log.info('Compressing logs...')

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            for remote in ctx.cluster.remotes.iterkeys():
                sub = os.path.join(path, remote.shortname)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))
def assign_devs(roles, devs):
    Create a dictionary of devs indexed by roles

    :param roles: List of roles
    :param devs: Corresponding list of devices.
    :returns: Dictionary of devs indexed by roles.
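    For example (illustrative values only)::

        assign_devs(['osd.0', 'osd.1'], ['/dev/sdb', '/dev/sdc'])
        # -> {'osd.0': '/dev/sdb', 'osd.1': '/dev/sdc'}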
    return dict(zip(roles, devs))
@contextlib.contextmanager
def valgrind_post(ctx, config):
    After the tests run, look through all the valgrind logs. Exceptions are raised
    if textual errors occurred in the logs, or if valgrind exceptions were detected in

    :param config: Configuration

        lookup_procs = list()
        log.info('Checking for errors in any valgrind logs...')
        for remote in ctx.cluster.remotes.iterkeys():
            # look at valgrind logs for each node
                    run.Raw('/var/log/ceph/valgrind/*'),
                    '/dev/null',  # include a second file so that we always get a filename prefix on the output
            lookup_procs.append((proc, remote))

        valgrind_exception = None
        for (proc, remote) in lookup_procs:
            out = proc.stdout.getvalue()
            for line in out.split('\n'):
                    (file, kind) = line.split(':')
                    log.error('failed to split line %s', line)
                log.debug('file %s kind %s', file, kind)
                if (file.find('mds') >= 0) and kind.find('Lost') > 0:
                    log.error('saw valgrind issue %s in %s', kind, file)
                    valgrind_exception = Exception('saw valgrind issues')

        if config.get('expect_valgrind_errors'):
            if not valgrind_exception:
                raise Exception('expected valgrind issues and found none')
        if valgrind_exception:
            raise valgrind_exception
@contextlib.contextmanager
def crush_setup(ctx, config):
    cluster_name = config['cluster']
    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    profile = config.get('crush_tunables', 'default')
    log.info('Setting crush tunables to %s', profile)
        args=['sudo', 'ceph', '--cluster', cluster_name,
              'osd', 'crush', 'tunables', profile])
@contextlib.contextmanager
def create_rbd_pool(ctx, config):
    cluster_name = config['cluster']
    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    log.info('Waiting for OSDs to come up')
    teuthology.wait_until_osds_up(
        ceph_cluster=cluster_name,
    log.info('Creating RBD pool')
        args=['sudo', 'ceph', '--cluster', cluster_name,
              'osd', 'pool', 'create', 'rbd', '8'])
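    # 'rbd' is the pool name and '8' the pg_num; the small fixed pg count is
    # assumed to be intentional for test-sized clusters.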
@contextlib.contextmanager
def cephfs_setup(ctx, config):
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because it requires the mon cluster to be up and running
        log.info('Setting up CephFS filesystem...')

        fs = Filesystem(ctx, create='cephfs')

        is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
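        # Illustrative reading of the check above: a plain 'mds.a' role counts as
        # active, while names ending in '-s' or containing '-s-' (assumed standby
        # naming) are excluded.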
        all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
        num_active = len([r for r in all_roles if is_active_mds(r)])

        fs.set_max_mds(num_active)
        fs.set_allow_dirfrags(True)
@contextlib.contextmanager
def cluster(ctx, config):
    Handle the creation and removal of a ceph cluster.

    Create directories needed for the cluster.
    Create remote journals for all osds.
    Create and set keyring.
    Copy the monmap to the test systems.
    Add keyring information to monmaps

    If errors occurred, extract a failure message and store in ctx.summary.
    Unmount all test files and temporary journaling files.
    Save the monitor information and archive all ceph logs.
    Clean up the keyring setup, and remove all monitor map and data files left over.

    :param config: Configuration
    if ctx.config.get('use_existing_cluster', False) is True:
        log.info("'use_existing_cluster' is true; skipping cluster creation")

    testdir = teuthology.get_testdir(ctx)
    cluster_name = config['cluster']
    data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
    log.info('Creating ceph cluster %s...', cluster_name)
            'install', '-d', '-m0755', '--',
            'install', '-d', '-m0777', '--', '/var/run/ceph',
    remote_to_roles_to_devs = {}
    remote_to_roles_to_journals = {}
    osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
    for remote, roles_for_host in osds.remotes.iteritems():
        devs = teuthology.get_scratch_devices(remote)
        roles_to_journals = {}
            log.info('fs option selected, checking for scratch devs')
            log.info('found devs: %s' % (str(devs),))
            devs_id_map = teuthology.get_wwn_id_map(remote, devs)
            iddevs = devs_id_map.values()
            roles_to_devs = assign_devs(
                teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
            if len(roles_to_devs) < len(iddevs):
                iddevs = iddevs[len(roles_to_devs):]
            devs_to_clean[remote] = []

        if config.get('block_journal'):
            log.info('block journal enabled')
            roles_to_journals = assign_devs(
                teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
            log.info('journal map: %s', roles_to_journals)

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled')
            roles_to_journals = {}
            remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
            for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
                tmpfs = '/mnt/' + role
                roles_to_journals[role] = tmpfs
                remote.run(args=['truncate', '-s', '1500M', tmpfs])
            log.info('journal map: %s', roles_to_journals)

        log.info('dev map: %s' % (str(roles_to_devs),))
        remote_to_roles_to_devs[remote] = roles_to_devs
        remote_to_roles_to_journals[remote] = roles_to_journals
    log.info('Generating config...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [role_list for (remote, role_list) in remotes_and_roles]
    ips = [host for (host, port) in
           (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
    conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips, cluster=cluster_name)
    for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
        for role, journal in roles_to_journals.iteritems():
            name = teuthology.ceph_role(role)
            conf[name]['osd journal'] = journal
    for section, keys in config['conf'].iteritems():
        for key, value in keys.iteritems():
            log.info("[%s] %s = %s" % (section, key, value))
            if section not in conf:
            conf[section][key] = value

    if config.get('tmpfs_journal'):
        conf['journal dio'] = False

    if not hasattr(ctx, 'ceph'):
    ctx.ceph[cluster_name] = argparse.Namespace()
    ctx.ceph[cluster_name].conf = conf
    default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
    keyring_path = config.get('keyring_path', default_keyring)

    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)

    log.info('Setting up %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
    ctx.cluster.only(firstmon).run(
    ctx.cluster.only(firstmon).run(
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
                                                   cluster=cluster_name)
    fsid = teuthology.create_simple_monmap(
    if not 'global' in conf:
    conf['global']['fsid'] = fsid

    default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
    conf_path = config.get('conf_path', default_conf_path)
    log.info('Writing %s for FSID %s...' % (conf_path, fsid))
    write_conf(ctx, conf_path, cluster_name)

    log.info('Creating admin key on %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
            '--name=client.admin',
            '--cap', 'mon', 'allow *',
            '--cap', 'osd', 'allow *',
            '--cap', 'mds', 'allow *',
            '--cap', 'mgr', 'allow *',
    log.info('Copying monmap to all nodes...')
    keyring = teuthology.get_file(
    monmap = teuthology.get_file(

    for rem in ctx.cluster.remotes.iterkeys():
        # copy mon key and initial monmap
        log.info('Sending monmap to node {remote}'.format(remote=rem))
        teuthology.sudo_write_file(
        teuthology.write_file(

    log.info('Setting up mon nodes...')
    mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))

    if not config.get('skip_mgr_daemons', False):
        log.info('Setting up mgr nodes...')
        mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name))
        for remote, roles_for_host in mgrs.remotes.iteritems():
            for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr',
                _, _, id_ = teuthology.split_role(role)
                mgr_dir = '/var/lib/ceph/mgr/{cluster}-{id}'.format(
                    cluster=cluster_name,
                        '--name=mgr.{id}'.format(id=id_),
                        mgr_dir + '/keyring',

    log.info('Setting up mds nodes...')
    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
    for remote, roles_for_host in mdss.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
            _, _, id_ = teuthology.split_role(role)
            mds_dir = '/var/lib/ceph/mds/{cluster}-{id}'.format(
                cluster=cluster_name,
                    '--name=mds.{id}'.format(id=id_),
                    mds_dir + '/keyring',

    cclient.create_keyring(ctx, cluster_name)
    log.info('Running mkfs on osd nodes...')

    if not hasattr(ctx, 'disk_config'):
        ctx.disk_config = argparse.Namespace()
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
        ctx.disk_config.remote_to_roles_to_dev = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_journals'):
        ctx.disk_config.remote_to_roles_to_journals = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
        ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
        ctx.disk_config.remote_to_roles_to_dev_fstype = {}

    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)
    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals, remote_to_roles_to_journals)

    log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
    for remote, roles_for_host in osds.remotes.iteritems():
        roles_to_devs = remote_to_roles_to_devs[remote]
        roles_to_journals = remote_to_roles_to_journals[remote]

        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mnt_point = '/var/lib/ceph/osd/{cluster}-{id}'.format(cluster=cluster_name, id=id_)
            log.info(str(roles_to_journals))

            if roles_to_devs.get(role):
                dev = roles_to_devs[role]
                fs = config.get('fs')

                mkfs_options = config.get('mkfs_options')
                mount_options = config.get('mount_options')

                    # package = 'btrfs-tools'
                    if mount_options is None:
                        mount_options = ['noatime', 'user_subvol_rm_allowed']
                    if mkfs_options is None:
                        mkfs_options = ['-m', 'single',

                    # package = 'xfsprogs'
                    if mount_options is None:
                        mount_options = ['noatime']
                    if mkfs_options is None:
                        mkfs_options = ['-f', '-i', 'size=2048']
                if fs == 'ext4' or fs == 'ext3':
                    if mount_options is None:
                        mount_options = ['noatime', 'user_xattr']

                if mount_options is None:
                if mkfs_options is None:
                mkfs = ['mkfs.%s' % fs] + mkfs_options
                log.info('%s on %s on %s' % (mkfs, dev, remote))
                if package is not None:
                            'apt-get', 'install', '-y', package

                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
                except run.CommandFailedError:
                    # Newer btrfs-tools doesn't prompt for overwrite, use -f
                    if '-f' not in mount_options:
                        mkfs_options.append('-f')
                        mkfs = ['mkfs.%s' % fs] + mkfs_options
                        log.info('%s on %s on %s' % (mkfs, dev, remote))
                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])

                log.info('mount %s on %s -o %s' % (dev, remote,
                                                   ','.join(mount_options)))
                        '-o', ','.join(mount_options),
                        'sudo', '/sbin/restorecon', mnt_point,
                if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
                    ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
                if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
                    ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
                devs_to_clean[remote].append(mnt_point)

        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
            _, _, id_ = teuthology.split_role(role)
                        '--monmap', monmap_path,
    log.info('Reading keys from all nodes...')

    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['mgr', 'mds', 'osd']:
            if type_ == 'mgr' and config.get('skip_mgr_daemons', False):
            for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
                _, _, id_ = teuthology.split_role(role)
                data = teuthology.get_file(
                    path='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
                        cluster=cluster_name,
                keys.append((type_, id_, data))

    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', cluster_name):
            _, _, id_ = teuthology.split_role(role)
            data = teuthology.get_file(
                path='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_, cluster=cluster_name)
            keys.append(('client', id_, data))

    log.info('Adding keys to all mons...')
    teuthology.feed_many_stdins_and_close(keys_fp, writes)

    for type_, id_, data in keys:
                '--name={type}.{id}'.format(
            ] + list(generate_caps(type_)),
    log.info('Running mkfs on mon nodes...')
    for remote, roles_for_host in mons.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon', cluster_name):
            _, _, id_ = teuthology.split_role(role)
                    '/var/lib/ceph/mon/{cluster}-{id}'.format(id=id_, cluster=cluster_name),
                    '--cluster', cluster_name,
                    '--monmap', monmap_path,
                    '--keyring', keyring_path,
        # we need to know this below
        ctx.summary['success'] = False

        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()

        log.info('Checking cluster log for badness...')

        def first_in_ceph_log(pattern, excludes):
            Find the first occurrence of the pattern specified in the Ceph log,
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
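            # The lookup below is assumed to boil down to a remote pipeline of
            # roughly this shape (illustrative, surrounding pieces elided):
            #   sudo egrep '<pattern>' /var/log/ceph/<cluster>.log \
            #       | egrep -v '<exclude>' ... | head -n 1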
                '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
            for exclude in excludes:
                args.extend([run.Raw('|'), 'egrep', '-v', exclude])
                run.Raw('|'), 'head', '-n', '1',
            stdout = r.stdout.getvalue()

        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
                             config['log_whitelist']) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log_whitelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
        for remote, dirs in devs_to_clean.iteritems():
                log.info('Unmounting %s on %s' % (dir_, remote))
                except Exception as e:
                        run.Raw('PATH=/usr/sbin:$PATH'),

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
            for remote, roles_for_host in osds.remotes.iteritems():
                    args=['sudo', 'umount', '-f', '/mnt'],

        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
                if e.errno == errno.EEXIST:
            for remote, roles in mons.remotes.iteritems():
                is_mon = teuthology.is_type('mon', cluster_name)
                        _, _, id_ = teuthology.split_role(role)
                        mon_dir = '/var/lib/ceph/mon/' + \
                            '{0}-{1}'.format(cluster_name, id_)
                        teuthology.pull_directory_tarball(
                            path + '/' + role + '.tgz')

        log.info('Cleaning ceph cluster...')
                    run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
def osd_scrub_pgs(ctx, config):
    Scrub pgs when we exit.

    First make sure all pgs are active and clean.
    Next scrub all osds.
    Then periodically check until all pgs have scrub time stamps that
    indicate the last scrub completed. Time out if no progress is made
    here after two minutes.

    cluster_name = config['cluster']
    manager = ctx.managers[cluster_name]
    for _ in range(0, retries):
        stats = manager.get_pg_stats()
        bad = [stat['pgid'] for stat in stats if 'active+clean' not in stat['state']]
            "Waiting for all PGs to be active and clean, waiting on %s" % bad)
        raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
    check_time_now = time.localtime()

    all_roles = teuthology.all_roles(ctx.cluster)
    for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name):
        log.info("Scrubbing {osd}".format(osd=role))
        _, _, id_ = teuthology.split_role(role)
        # allow this to fail; in certain cases the OSD might not be up
        # at this point. we will catch all pgs below.
            manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
        except run.CommandFailedError:
        stats = manager.get_pg_stats()
        timez = [(stat['pgid'], stat['last_scrub_stamp']) for stat in stats]
        for (pgid, tmval) in timez:
            pgtm = time.strptime(tmval[0:tmval.find('.')], '%Y-%m-%d %H:%M:%S')
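            # last_scrub_stamp is assumed to look like '2017-05-01 12:34:56.789012'
            # (illustrative value); the fractional seconds are stripped before parsing.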
            if pgtm > check_time_now:
                log.info('pgid %s last_scrub_stamp %s %s <= %s', pgid, tmval, pgtm, check_time_now)
        if thiscnt > prev_good:
            if gap_cnt % 6 == 0:
                for (pgid, tmval) in timez:
                    # re-request scrub every so often in case the earlier
                    # request was missed. do not do it every time because
                    # the scrub may be in progress or not reported yet and
                    # we will starve progress.
                    manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
            if gap_cnt > retries:
                raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
            log.info('Still waiting for all pgs to be scrubbed.')
@contextlib.contextmanager
def run_daemon(ctx, config, type_):
    Run daemons for a role type. Handle the startup and termination of a daemon.
    On startup -- set coverages, cpu_profile, valgrind values for all remotes,
    and a max_mds value for one mds.
    On cleanup -- Stop all existing daemons of this type.

    :param config: Configuration
    :param type_: Role type
    cluster_name = config['cluster']
    log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
    testdir = teuthology.get_testdir(ctx)
    daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name))

    # check whether any daemons of this type are configured

    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    daemon_signal = 'kill'
    if config.get('coverage') or config.get('valgrind') is not None:
        daemon_signal = 'term'

    # create osds in order. (this only matters for pre-luminous, which might
    # be hammer, which doesn't take an id_ argument to legacy 'osd create').
    for remote, roles_for_host in daemons.remotes.iteritems():
        is_type_ = teuthology.is_type(type_, cluster_name)
        for role in roles_for_host:
            if not is_type_(role):
            _, _, id_ = teuthology.split_role(role)
                datadir='/var/lib/ceph/osd/{cluster}-{id}'.format(
                    cluster=cluster_name, id=id_)
                osd_uuid = teuthology.get_file(
                    path=datadir + '/fsid',
                osd_uuids[id_] = osd_uuid
    for osd_id in range(len(osd_uuids)):
        osd_uuid = osd_uuids.get(id_)
                    'sudo', 'ceph', '--cluster', cluster_name,
                    'osd', 'new', osd_uuid, id_,
            # fallback to pre-luminous (hammer or jewel)
                    'sudo', 'ceph', '--cluster', cluster_name,
                    'osd', 'create', osd_uuid,
        if config.get('add_osds_to_crush'):
                    'sudo', 'ceph', '--cluster', cluster_name,
                    'osd', 'crush', 'create-or-move', 'osd.' + id_,
                    '1.0', 'host=localhost', 'root=default',
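            # i.e. with 'add_osds_to_crush' set, each osd is assumed to be placed in
            # the CRUSH map with weight 1.0 under host=localhost / root=default.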
    for remote, roles_for_host in daemons.remotes.iteritems():
        is_type_ = teuthology.is_type(type_, cluster_name)
        for role in roles_for_host:
            if not is_type_(role):
            _, _, id_ = teuthology.split_role(role)
                'ceph-%s' % (type_),
                '--cluster', cluster_name,

            if type_ in config.get('cpu_profile', []):
                profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role)
                run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])

            if config.get('valgrind') is not None:
                valgrind_args = None
                if type_ in config['valgrind']:
                    valgrind_args = config['valgrind'][type_]
                if role in config['valgrind']:
                    valgrind_args = config['valgrind'][role]
                run_cmd = teuthology.get_valgrind_args(testdir, role,

            run_cmd.extend(run_cmd_tail)

            # always register mgr; don't necessarily start
            ctx.daemons.register_daemon(
                cluster=cluster_name,
                logger=log.getChild(role),

            if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
                role = cluster_name + '.' + type_
                ctx.daemons.get_daemon(type_, id_, cluster_name).restart()

        teuthology.stop_daemons_of_type(ctx, type_, cluster_name)
def healthy(ctx, config):
    Wait for all osds to be up, and for the ceph health monitor to return HEALTH_OK.

    :param config: Configuration

    config = config if isinstance(config, dict) else dict()
    cluster_name = config.get('cluster', 'ceph')
    log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
    manager = ctx.managers[cluster_name]
        manager.wait_for_mgr_available()
    except run.CommandFailedError:
        log.info('ignoring mgr wait error, probably testing upgrade')

    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        cluster=ctx.cluster,
        ceph_cluster=cluster_name,

        manager.flush_all_pg_stats()
    except run.CommandFailedError:
        log.info('ignoring flush pg stats error, probably testing upgrade')
    manager.wait_for_clean()

    log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
    teuthology.wait_until_healthy(
        ceph_cluster=cluster_name,

    if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
        # Some MDSs exist, wait for them to be healthy
        ceph_fs = Filesystem(ctx)  # TODO: make Filesystem cluster-aware
        ceph_fs.wait_for_daemons(timeout=300)
def wait_for_osds_up(ctx, config):
    Wait for all osds to come up.

    :param config: Configuration

    log.info('Waiting until ceph osds are all up...')
    cluster_name = config.get('cluster', 'ceph')
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        cluster=ctx.cluster,
def wait_for_mon_quorum(ctx, config):
    Check remote ceph status until all monitors are up.

    :param config: Configuration
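    An illustrative invocation (assumed form; the config may also be given as a
    plain list of mon names, as handled below)::

        tasks:
        - ceph.wait_for_mon_quorum:
            daemons: [a, b, c]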
    if isinstance(config, dict):
        mons = config['daemons']
        cluster_name = config.get('cluster', 'ceph')

        assert isinstance(config, list)
        cluster_name = 'ceph'
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (remote,) = ctx.cluster.only(firstmon).remotes.keys()
    with contextutil.safe_while(sleep=10, tries=60,
                                action='wait for monitor quorum') as proceed:
                logger=log.getChild('quorum_status'),
            j = json.loads(r.stdout.getvalue())
            q = j.get('quorum_names', [])
            log.debug('Quorum: %s', q)
            if sorted(q) == sorted(mons):
def created_pool(ctx, config):
    Add new pools to the dictionary of pools that the ceph-manager

    for new_pool in config:
        if new_pool not in ctx.managers['ceph'].pools:
            ctx.managers['ceph'].pools[new_pool] = ctx.managers['ceph'].get_pool_property(
@contextlib.contextmanager
def restart(ctx, config):
    restart ceph daemons

        - ceph.restart: [all]

        - ceph.restart: [osd.0, mon.1, mds.*]

            daemons: [osd.0, mon.1]
            wait-for-healthy: false
            wait-for-osds-up: true

    :param config: Configuration

    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)

    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).restart()
        clusters.add(cluster)

    manager = ctx.managers['ceph']
    for dmon in daemons:
        dm_parts = dmon.split('.')
        if dm_parts[1].isdigit():
            if dm_parts[0] == 'osd':
                manager.mark_down_osd(int(dm_parts[1]))

    if config.get('wait-for-healthy', True):
        for cluster in clusters:
            healthy(ctx=ctx, config=dict(cluster=cluster))
    if config.get('wait-for-osds-up', False):
        for cluster in clusters:
            wait_for_osds_up(ctx=ctx, config=dict(cluster=cluster))
@contextlib.contextmanager
def stop(ctx, config):
        - ceph.stop: [mds.*]

        - ceph.stop: [osd.0, osd.2]

            daemons: [osd.0, osd.2]

    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).stop()
@contextlib.contextmanager
def wait_for_failure(ctx, config):
    Wait for a failure of a ceph daemon

        - ceph.wait_for_failure: [mds.*]

        - ceph.wait_for_failure: [osd.0, osd.2]

        - ceph.wait_for_failure:
            daemons: [osd.0, osd.2]

    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
            ctx.daemons.get_daemon(type_, id_, cluster).wait()
            log.info('Saw expected daemon failure. Continuing.')
            raise RuntimeError('daemon %s did not fail' % role)
def validate_config(ctx, config):
    Perform some simple validation on task configuration.
    Raises exceptions.ConfigError if an error is found.

    # check for osds from multiple clusters on the same host
    for remote, roles_for_host in ctx.cluster.remotes.items():
        for role in roles_for_host:
            role_cluster, role_type, _ = teuthology.split_role(role)
            if role_type != 'osd':
            if last_cluster and last_cluster != role_cluster:
                msg = "Host should not have osds (%s and %s) from multiple clusters" % (
                raise exceptions.ConfigError(msg)
            last_cluster = role_cluster
@contextlib.contextmanager
def task(ctx, config):
    Set up and tear down a Ceph cluster.

    You can also specify what branch to run::

            sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed

    Or a local source dir::

            path: /home/sage/ceph

    To capture code coverage data, use::

    To use btrfs, ext4, or xfs on the target's scratch disks, use::

            mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
            mount_options: [nobarrier, inode64]

    Note, this will cause the task to check the /scratch_devs file on each node
    for available devices. If no such file is found, /dev/sdb will be used.

    To run some daemons under valgrind, include their names
    and the tool/args to use in a valgrind section::

            mds.1: --tool=memcheck
            osd.1: [--tool=memcheck, --leak-check=no]

    Those nodes which are using memcheck or valgrind will get
    checked for bad results.

    To adjust or modify config options, use::

            other key: other value

    By default, the cluster log is checked for errors and warnings,
    and the run marked failed if any appear. You can ignore log
    entries by giving a list of egrep compatible regexes, i.e.:

        log-whitelist: ['foo.*bar', 'bad message']

    To run multiple ceph clusters, use multiple ceph tasks, and roles
    with a cluster name prefix, e.g. cluster1.client.0. Roles with no
    cluster use the default cluster name, 'ceph'. OSDs from separate
    clusters must be on separate hosts. Clients and non-osd daemons
    from multiple clusters may be colocated. For each cluster, add an
    instance of the ceph task with the cluster name specified, e.g.::

        - [mon.a, osd.0, osd.1]
        - [backup.mon.a, backup.osd.0, backup.osd.1]
        - [client.0, backup.client.0]
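    The matching tasks section would then add one ceph task per cluster,
    along these lines (illustrative)::

        tasks:
        - ceph:
            cluster: ceph
        - ceph:
            cluster: backup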
    :param config: Configuration
    assert isinstance(config, dict), \
        "task ceph only supports a dictionary for configuration"

    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph', {}))

    first_ceph_cluster = False
    if not hasattr(ctx, 'daemons'):
        first_ceph_cluster = True
        ctx.daemons = DaemonGroup()

    testdir = teuthology.get_testdir(ctx)
    if config.get('coverage'):
        coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
        log.info('Creating coverage directory...')
                'install', '-d', '-m0755', '--',

    if 'cluster' not in config:
        config['cluster'] = 'ceph'

    validate_config(ctx, config)
    if first_ceph_cluster:
        # these tasks handle general log setup and parsing on all hosts,
        # so they should only be run once
            lambda: ceph_log(ctx=ctx, config=None),
            lambda: valgrind_post(ctx=ctx, config=config),

        lambda: cluster(ctx=ctx, config=dict(
            conf=config.get('conf', {}),
            fs=config.get('fs', 'xfs'),
            mkfs_options=config.get('mkfs_options', None),
            mount_options=config.get('mount_options', None),
            block_journal=config.get('block_journal', None),
            tmpfs_journal=config.get('tmpfs_journal', None),
            skip_mgr_daemons=config.get('skip_mgr_daemons', False),
            log_whitelist=config.get('log-whitelist', []),
            cpu_profile=set(config.get('cpu_profile', []),),
            cluster=config['cluster'],
        lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
        lambda: run_daemon(ctx=ctx, config=config, type_='mgr'),
        lambda: crush_setup(ctx=ctx, config=config),
        lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
        lambda: create_rbd_pool(ctx=ctx, config=config),
        lambda: cephfs_setup(ctx=ctx, config=config),
        lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
    with contextutil.nested(*subtasks):
        first_mon = teuthology.get_first_mon(ctx, config, config['cluster'])
        (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
        if not hasattr(ctx, 'managers'):
        ctx.managers[config['cluster']] = CephManager(
            logger=log.getChild('ceph_manager.' + config['cluster']),
            cluster=config['cluster'],

        if config.get('wait-for-healthy', True):
            healthy(ctx=ctx, config=dict(cluster=config['cluster']))

        if config.get('wait-for-scrub', True):
            osd_scrub_pgs(ctx, config)

        # stop logging health to clog during shutdown, or else we generate
        # a bunch of scary messages unrelated to our actual run.
        firstmon = teuthology.get_first_mon(ctx, config, config['cluster'])
        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
                '--cluster', config['cluster'],
                '--no-mon-health-to-clog',