4 Handle the setup, starting, and clean-up of a Ceph cluster.
7 from io
import StringIO
21 from paramiko
import SSHException
22 from tasks
.ceph_manager
import CephManager
, write_conf
23 from tarfile
import ReadError
24 from tasks
.cephfs
.filesystem
import Filesystem
25 from teuthology
import misc
as teuthology
26 from teuthology
import contextutil
27 from teuthology
import exceptions
28 from teuthology
.orchestra
import run
29 import tasks
.ceph_client
as cclient
30 from teuthology
.orchestra
.daemon
import DaemonGroup
31 from tasks
.daemonwatchdog
import DaemonWatchdog
# Daemon role types this task knows how to deploy and manage.
CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']
# Template for a daemon's data directory,
# e.g. type_='osd', cluster='ceph', id_='0' -> /var/lib/ceph/osd/ceph-0
DATA_PATH = '/var/lib/ceph/{type_}/{cluster}-{id_}'

# Module-level logger, named after this module.
log = logging.getLogger(__name__)
39 def generate_caps(type_
):
41 Each call will return the next capability for each system type
42 (essentially a subset of possible role values). Valid types are osd,
52 mon
='allow profile mgr',
69 for subsystem
, capability
in defaults
[type_
].items():
75 @contextlib.contextmanager
76 def ceph_crash(ctx
, config
):
78 Gather crash dumps from /var/lib/crash
84 if ctx
.archive
is not None:
85 log
.info('Archiving crash dumps...')
86 path
= os
.path
.join(ctx
.archive
, 'remote')
91 for remote
in ctx
.cluster
.remotes
.keys():
92 sub
= os
.path
.join(path
, remote
.shortname
)
98 teuthology
.pull_directory(remote
, '/var/lib/ceph/crash',
99 os
.path
.join(sub
, 'crash'))
104 @contextlib.contextmanager
105 def ceph_log(ctx
, config
):
107 Create /var/log/ceph log directory that is open to everyone.
108 Add valgrind and profiling-logger directories.
111 :param config: Configuration
113 log
.info('Making ceph log dir writeable by non-root...')
125 log
.info('Disabling ceph logrotate...')
131 '/etc/logrotate.d/ceph',
136 log
.info('Creating extra log directories...')
141 'install', '-d', '-m0777', '--',
142 '/var/log/ceph/valgrind',
143 '/var/log/ceph/profiling-logger',
149 class Rotater(object):
150 stop_event
= gevent
.event
.Event()
152 def invoke_logrotate(self
):
153 # 1) install ceph-test.conf in /etc/logrotate.d
154 # 2) continuously loop over logrotate invocation with ceph-test.conf
155 while not self
.stop_event
.is_set():
156 self
.stop_event
.wait(timeout
=30)
158 procs
= ctx
.cluster
.run(
159 args
=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'],
164 except exceptions
.ConnectionLostError
as e
:
165 # Some tests may power off nodes during test, in which
166 # case we will see connection errors that we should ignore.
167 log
.debug("Missed logrotate, node '{0}' is offline".format(
170 # Paramiko sometimes raises this when it fails to
171 # connect to a node during open_session. As with
172 # ConnectionLostError, we ignore this because nodes
173 # are allowed to get power cycled during tests.
174 log
.debug("Missed logrotate, EOFError")
176 log
.debug("Missed logrotate, SSHException")
177 except run
.CommandFailedError
as e
:
179 if p
.finished
and p
.exitstatus
!= 0:
180 err
= p
.stderr
.getvalue()
181 if 'error: error renaming temp state file' in err
:
182 log
.info('ignoring transient state error: %s', e
)
185 except socket
.error
as e
:
186 if e
.errno
in (errno
.EHOSTUNREACH
, errno
.ECONNRESET
):
187 log
.debug("Missed logrotate, host unreachable")
192 self
.thread
= gevent
.spawn(self
.invoke_logrotate
)
195 self
.stop_event
.set()
198 def write_rotate_conf(ctx
, daemons
):
199 testdir
= teuthology
.get_testdir(ctx
)
200 remote_logrotate_conf
= '%s/logrotate.ceph-test.conf' % testdir
201 rotate_conf_path
= os
.path
.join(os
.path
.dirname(__file__
), 'logrotate.conf')
202 with
open(rotate_conf_path
) as f
:
204 for daemon
, size
in daemons
.items():
205 log
.info('writing logrotate stanza for {}'.format(daemon
))
206 conf
+= f
.read().format(daemon_type
=daemon
,
210 for remote
in ctx
.cluster
.remotes
.keys():
211 teuthology
.write_file(remote
=remote
,
212 path
=remote_logrotate_conf
,
213 data
=BytesIO(conf
.encode())
219 remote_logrotate_conf
,
220 '/etc/logrotate.d/ceph-test.conf',
225 '/etc/logrotate.d/ceph-test.conf',
230 '/etc/logrotate.d/ceph-test.conf'
233 remote
.chcon('/etc/logrotate.d/ceph-test.conf',
234 'system_u:object_r:etc_t:s0')
236 if ctx
.config
.get('log-rotate'):
237 daemons
= ctx
.config
.get('log-rotate')
238 log
.info('Setting up log rotation with ' + str(daemons
))
239 write_rotate_conf(ctx
, daemons
)
240 logrotater
= Rotater()
246 if ctx
.config
.get('log-rotate'):
247 log
.info('Shutting down logrotate')
250 args
=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'
253 if ctx
.archive
is not None and \
254 not (ctx
.config
.get('archive-on-error') and ctx
.summary
['success']):
256 log
.info('Compressing logs...')
279 log
.info('Archiving logs...')
280 path
= os
.path
.join(ctx
.archive
, 'remote')
285 for remote
in ctx
.cluster
.remotes
.keys():
286 sub
= os
.path
.join(path
, remote
.shortname
)
291 teuthology
.pull_directory(remote
, '/var/log/ceph',
292 os
.path
.join(sub
, 'log'))
def assign_devs(roles, devs):
    """
    Pair each role with its corresponding device.

    :param roles: List of roles
    :param devs: Corresponding list of devices.
    :returns: Dictionary mapping each role to its device; if the lists
        differ in length, the extra tail entries are dropped (zip
        semantics).
    """
    return {role: dev for role, dev in zip(roles, devs)}
306 @contextlib.contextmanager
307 def valgrind_post(ctx
, config
):
309 After the tests run, look through all the valgrind logs. Exceptions are raised
310 if textual errors occurred in the logs, or if valgrind exceptions were detected in
314 :param config: Configuration
319 lookup_procs
= list()
320 log
.info('Checking for errors in any valgrind logs...')
321 for remote
in ctx
.cluster
.remotes
.keys():
322 # look at valgrind logs for each node
324 args
="sudo zgrep '<kind>' /var/log/ceph/valgrind/* "
325 # include a second file so that we always get
326 # a filename prefix on the output
327 "/dev/null | sort | uniq",
332 lookup_procs
.append((proc
, remote
))
334 valgrind_exception
= None
335 for (proc
, remote
) in lookup_procs
:
337 out
= proc
.stdout
.getvalue()
338 for line
in out
.split('\n'):
342 (file, kind
) = line
.split(':')
344 log
.error('failed to split line %s', line
)
346 log
.debug('file %s kind %s', file, kind
)
347 if (file.find('mds') >= 0) and kind
.find('Lost') > 0:
349 log
.error('saw valgrind issue %s in %s', kind
, file)
350 valgrind_exception
= Exception('saw valgrind issues')
352 if config
.get('expect_valgrind_errors'):
353 if not valgrind_exception
:
354 raise Exception('expected valgrind issues and found none')
356 if valgrind_exception
:
357 raise valgrind_exception
360 @contextlib.contextmanager
361 def crush_setup(ctx
, config
):
362 cluster_name
= config
['cluster']
363 first_mon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
364 (mon_remote
,) = ctx
.cluster
.only(first_mon
).remotes
.keys()
366 profile
= config
.get('crush_tunables', 'default')
367 log
.info('Setting crush tunables to %s', profile
)
369 args
=['sudo', 'ceph', '--cluster', cluster_name
,
370 'osd', 'crush', 'tunables', profile
])
374 @contextlib.contextmanager
375 def create_rbd_pool(ctx
, config
):
376 cluster_name
= config
['cluster']
377 first_mon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
378 (mon_remote
,) = ctx
.cluster
.only(first_mon
).remotes
.keys()
379 log
.info('Waiting for OSDs to come up')
380 teuthology
.wait_until_osds_up(
384 ceph_cluster
=cluster_name
,
386 if config
.get('create_rbd_pool', True):
387 log
.info('Creating RBD pool')
389 args
=['sudo', 'ceph', '--cluster', cluster_name
,
390 'osd', 'pool', 'create', 'rbd', '8'])
393 'sudo', 'ceph', '--cluster', cluster_name
,
394 'osd', 'pool', 'application', 'enable',
395 'rbd', 'rbd', '--yes-i-really-mean-it'
400 @contextlib.contextmanager
401 def cephfs_setup(ctx
, config
):
402 cluster_name
= config
['cluster']
404 first_mon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
405 (mon_remote
,) = ctx
.cluster
.only(first_mon
).remotes
.keys()
406 mdss
= ctx
.cluster
.only(teuthology
.is_type('mds', cluster_name
))
407 # If there are any MDSs, then create a filesystem for them to use
408 # Do this last because requires mon cluster to be up and running
410 log
.info('Setting up CephFS filesystem...')
412 Filesystem(ctx
, fs_config
=config
.get('cephfs', None), name
='cephfs',
413 create
=True, ec_profile
=config
.get('cephfs_ec_profile', None))
@contextlib.contextmanager
def watchdog_setup(ctx, config):
    """
    Create and start a DaemonWatchdog for the cluster, and register an
    (initially empty) thrashers list on the cluster namespace for it to
    observe.

    :param ctx: Context
    :param config: Configuration; must contain a ``cluster`` key naming
        the cluster whose daemons should be watched.
    """
    ctx.ceph[config['cluster']].thrashers = []
    ctx.ceph[config['cluster']].watchdog = DaemonWatchdog(ctx, config, ctx.ceph[config['cluster']].thrashers)
    ctx.ceph[config['cluster']].watchdog.start()
    # A @contextlib.contextmanager generator must yield exactly once;
    # without this the context manager raises RuntimeError when entered.
    yield
424 def get_mons(roles
, ips
, cluster_name
,
425 mon_bind_msgr2
=False,
426 mon_bind_addrvec
=False):
428 Get monitors and their associated addresses
433 is_mon
= teuthology
.is_type('mon', cluster_name
)
434 for idx
, roles
in enumerate(roles
):
438 if ips
[idx
] not in v1_ports
:
439 v1_ports
[ips
[idx
]] = 6789
441 v1_ports
[ips
[idx
]] += 1
443 if ips
[idx
] not in v2_ports
:
444 v2_ports
[ips
[idx
]] = 3300
445 addr
= '{ip}'.format(ip
=ips
[idx
])
447 assert mon_bind_addrvec
448 v2_ports
[ips
[idx
]] += 1
449 addr
= '[v2:{ip}:{port2},v1:{ip}:{port1}]'.format(
451 port2
=v2_ports
[ips
[idx
]],
452 port1
=v1_ports
[ips
[idx
]],
454 elif mon_bind_addrvec
:
455 addr
= '[v1:{ip}:{port}]'.format(
457 port
=v1_ports
[ips
[idx
]],
460 addr
= '{ip}:{port}'.format(
462 port
=v1_ports
[ips
[idx
]],
468 def skeleton_config(ctx
, roles
, ips
, mons
, cluster
='ceph'):
470 Returns a ConfigObj that is prefilled with a skeleton config.
472 Use conf[section][key]=value or conf.merge to change it.
474 Use conf.write to write it out, override .filename first if you want.
476 path
= os
.path
.join(os
.path
.dirname(__file__
), 'ceph.conf.template')
477 conf
= configobj
.ConfigObj(path
, file_error
=True)
479 for role
, addr
in mons
.items():
480 mon_cluster
, _
, _
= teuthology
.split_role(role
)
481 if mon_cluster
!= cluster
:
483 name
= teuthology
.ceph_role(role
)
484 conf
.setdefault(name
, {})
485 mon_hosts
.append(addr
)
486 conf
.setdefault('global', {})
487 conf
['global']['mon host'] = ','.join(mon_hosts
)
488 # set up standby mds's
489 is_mds
= teuthology
.is_type('mds', cluster
)
490 for roles_subset
in roles
:
491 for role
in roles_subset
:
493 name
= teuthology
.ceph_role(role
)
494 conf
.setdefault(name
, {})
497 def create_simple_monmap(ctx
, remote
, conf
, mons
,
499 mon_bind_addrvec
=False):
501 Writes a simple monmap based on current ceph.conf into path, or
502 <testdir>/monmap by default.
504 Assumes ceph_conf is up to date.
506 Assumes mon sections are named "mon.*", with the dot.
508 :return the FSID (as a string) of the newly created monmap
511 addresses
= list(mons
.items())
512 assert addresses
, "There are no monitors in config!"
513 log
.debug('Ceph mon addresses: %s', addresses
)
515 testdir
= teuthology
.get_testdir(ctx
)
519 '{tdir}/archive/coverage'.format(tdir
=testdir
),
525 args
.extend(['--enable-all-features'])
526 for (role
, addr
) in addresses
:
527 _
, _
, n
= teuthology
.split_role(role
)
528 if mon_bind_addrvec
and (',' in addr
or 'v' in addr
or ':' in addr
):
529 args
.extend(('--addv', n
, addr
))
531 args
.extend(('--add', n
, addr
))
533 path
= '{tdir}/monmap'.format(tdir
=testdir
)
539 monmap_output
= remote
.sh(args
)
540 fsid
= re
.search("generated fsid (.+)$",
541 monmap_output
, re
.MULTILINE
).group(1)
544 @contextlib.contextmanager
545 def cluster(ctx
, config
):
547 Handle the creation and removal of a ceph cluster.
550 Create directories needed for the cluster.
551 Create remote journals for all osds.
552 Create and set keyring.
553 Copy the monmap to the test systems.
557 Add keyring information to monmaps
561 If errors occurred, extract a failure message and store in ctx.summary.
562 Unmount all test files and temporary journaling files.
563 Save the monitor information and archive all ceph logs.
564 Cleanup the keyring setup, and remove all monitor map and data files left over.
567 :param config: Configuration
569 if ctx
.config
.get('use_existing_cluster', False) is True:
570 log
.info("'use_existing_cluster' is true; skipping cluster creation")
573 testdir
= teuthology
.get_testdir(ctx
)
574 cluster_name
= config
['cluster']
575 data_dir
= '{tdir}/{cluster}.data'.format(tdir
=testdir
, cluster
=cluster_name
)
576 log
.info('Creating ceph cluster %s...', cluster_name
)
577 log
.info('config %s', config
)
578 log
.info('ctx.config %s', ctx
.config
)
582 'install', '-d', '-m0755', '--',
593 'install', '-d', '-m0777', '--', '/var/run/ceph',
600 remote_to_roles_to_devs
= {}
601 osds
= ctx
.cluster
.only(teuthology
.is_type('osd', cluster_name
))
602 for remote
, roles_for_host
in osds
.remotes
.items():
603 devs
= teuthology
.get_scratch_devices(remote
)
604 roles_to_devs
= assign_devs(
605 teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
), devs
607 devs_to_clean
[remote
] = []
608 log
.info('osd dev map: {}'.format(roles_to_devs
))
609 assert roles_to_devs
, \
610 "remote {} has osd roles, but no osd devices were specified!".format(remote
.hostname
)
611 remote_to_roles_to_devs
[remote
] = roles_to_devs
612 log
.info("remote_to_roles_to_devs: {}".format(remote_to_roles_to_devs
))
613 for osd_role
, dev_name
in remote_to_roles_to_devs
.items():
614 assert dev_name
, "{} has no associated device!".format(osd_role
)
616 log
.info('Generating config...')
617 remotes_and_roles
= ctx
.cluster
.remotes
.items()
618 roles
= [role_list
for (remote
, role_list
) in remotes_and_roles
]
619 ips
= [host
for (host
, port
) in
620 (remote
.ssh
.get_transport().getpeername() for (remote
, role_list
) in remotes_and_roles
)]
622 roles
, ips
, cluster_name
,
623 mon_bind_msgr2
=config
.get('mon_bind_msgr2'),
624 mon_bind_addrvec
=config
.get('mon_bind_addrvec'),
626 conf
= skeleton_config(
627 ctx
, roles
=roles
, ips
=ips
, mons
=mons
, cluster
=cluster_name
,
629 for section
, keys
in config
['conf'].items():
630 for key
, value
in keys
.items():
631 log
.info("[%s] %s = %s" % (section
, key
, value
))
632 if section
not in conf
:
634 conf
[section
][key
] = value
636 if not hasattr(ctx
, 'ceph'):
638 ctx
.ceph
[cluster_name
] = argparse
.Namespace()
639 ctx
.ceph
[cluster_name
].conf
= conf
640 ctx
.ceph
[cluster_name
].mons
= mons
642 default_keyring
= '/etc/ceph/{cluster}.keyring'.format(cluster
=cluster_name
)
643 keyring_path
= config
.get('keyring_path', default_keyring
)
645 coverage_dir
= '{tdir}/archive/coverage'.format(tdir
=testdir
)
647 firstmon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
649 log
.info('Setting up %s...' % firstmon
)
650 ctx
.cluster
.only(firstmon
).run(
661 ctx
.cluster
.only(firstmon
).run(
673 ctx
.cluster
.only(firstmon
).run(
681 (mon0_remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
682 monmap_path
= '{tdir}/{cluster}.monmap'.format(tdir
=testdir
,
683 cluster
=cluster_name
)
684 fsid
= create_simple_monmap(
690 mon_bind_addrvec
=config
.get('mon_bind_addrvec'),
692 if not 'global' in conf
:
694 conf
['global']['fsid'] = fsid
696 default_conf_path
= '/etc/ceph/{cluster}.conf'.format(cluster
=cluster_name
)
697 conf_path
= config
.get('conf_path', default_conf_path
)
698 log
.info('Writing %s for FSID %s...' % (conf_path
, fsid
))
699 write_conf(ctx
, conf_path
, cluster_name
)
701 log
.info('Creating admin key on %s...' % firstmon
)
702 ctx
.cluster
.only(firstmon
).run(
710 '--name=client.admin',
711 '--cap', 'mon', 'allow *',
712 '--cap', 'osd', 'allow *',
713 '--cap', 'mds', 'allow *',
714 '--cap', 'mgr', 'allow *',
719 log
.info('Copying monmap to all nodes...')
720 keyring
= teuthology
.get_file(
724 monmap
= teuthology
.get_file(
729 for rem
in ctx
.cluster
.remotes
.keys():
730 # copy mon key and initial monmap
731 log
.info('Sending monmap to node {remote}'.format(remote
=rem
))
732 teuthology
.sudo_write_file(
738 teuthology
.write_file(
744 log
.info('Setting up mon nodes...')
745 mons
= ctx
.cluster
.only(teuthology
.is_type('mon', cluster_name
))
747 if not config
.get('skip_mgr_daemons', False):
748 log
.info('Setting up mgr nodes...')
749 mgrs
= ctx
.cluster
.only(teuthology
.is_type('mgr', cluster_name
))
750 for remote
, roles_for_host
in mgrs
.remotes
.items():
751 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'mgr',
753 _
, _
, id_
= teuthology
.split_role(role
)
754 mgr_dir
= DATA_PATH
.format(
755 type_
='mgr', cluster
=cluster_name
, id_
=id_
)
770 '--name=mgr.{id}'.format(id=id_
),
771 mgr_dir
+ '/keyring',
775 log
.info('Setting up mds nodes...')
776 mdss
= ctx
.cluster
.only(teuthology
.is_type('mds', cluster_name
))
777 for remote
, roles_for_host
in mdss
.remotes
.items():
778 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'mds',
780 _
, _
, id_
= teuthology
.split_role(role
)
781 mds_dir
= DATA_PATH
.format(
782 type_
='mds', cluster
=cluster_name
, id_
=id_
)
797 '--name=mds.{id}'.format(id=id_
),
798 mds_dir
+ '/keyring',
802 'sudo', 'chown', '-R', 'ceph:ceph', mds_dir
805 cclient
.create_keyring(ctx
, cluster_name
)
806 log
.info('Running mkfs on osd nodes...')
808 if not hasattr(ctx
, 'disk_config'):
809 ctx
.disk_config
= argparse
.Namespace()
810 if not hasattr(ctx
.disk_config
, 'remote_to_roles_to_dev'):
811 ctx
.disk_config
.remote_to_roles_to_dev
= {}
812 if not hasattr(ctx
.disk_config
, 'remote_to_roles_to_dev_mount_options'):
813 ctx
.disk_config
.remote_to_roles_to_dev_mount_options
= {}
814 if not hasattr(ctx
.disk_config
, 'remote_to_roles_to_dev_fstype'):
815 ctx
.disk_config
.remote_to_roles_to_dev_fstype
= {}
817 teuthology
.deep_merge(ctx
.disk_config
.remote_to_roles_to_dev
, remote_to_roles_to_devs
)
819 log
.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r
=str(ctx
.disk_config
.remote_to_roles_to_dev
)))
820 for remote
, roles_for_host
in osds
.remotes
.items():
821 roles_to_devs
= remote_to_roles_to_devs
[remote
]
823 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
):
824 _
, _
, id_
= teuthology
.split_role(role
)
825 mnt_point
= DATA_PATH
.format(
826 type_
='osd', cluster
=cluster_name
, id_
=id_
)
834 log
.info('roles_to_devs: {}'.format(roles_to_devs
))
835 log
.info('role: {}'.format(role
))
836 if roles_to_devs
.get(role
):
837 dev
= roles_to_devs
[role
]
838 fs
= config
.get('fs')
840 mkfs_options
= config
.get('mkfs_options')
841 mount_options
= config
.get('mount_options')
843 # package = 'btrfs-tools'
844 if mount_options
is None:
845 mount_options
= ['noatime', 'user_subvol_rm_allowed']
846 if mkfs_options
is None:
847 mkfs_options
= ['-m', 'single',
851 # package = 'xfsprogs'
852 if mount_options
is None:
853 mount_options
= ['noatime']
854 if mkfs_options
is None:
855 mkfs_options
= ['-f', '-i', 'size=2048']
856 if fs
== 'ext4' or fs
== 'ext3':
857 if mount_options
is None:
858 mount_options
= ['noatime', 'user_xattr']
860 if mount_options
is None:
862 if mkfs_options
is None:
864 mkfs
= ['mkfs.%s' % fs
] + mkfs_options
865 log
.info('%s on %s on %s' % (mkfs
, dev
, remote
))
866 if package
is not None:
867 remote
.sh('sudo apt-get install -y %s' % package
)
870 remote
.run(args
=['yes', run
.Raw('|')] + ['sudo'] + mkfs
+ [dev
])
871 except run
.CommandFailedError
:
872 # Newer btfs-tools doesn't prompt for overwrite, use -f
873 if '-f' not in mount_options
:
874 mkfs_options
.append('-f')
875 mkfs
= ['mkfs.%s' % fs
] + mkfs_options
876 log
.info('%s on %s on %s' % (mkfs
, dev
, remote
))
877 remote
.run(args
=['yes', run
.Raw('|')] + ['sudo'] + mkfs
+ [dev
])
879 log
.info('mount %s on %s -o %s' % (dev
, remote
,
880 ','.join(mount_options
)))
886 '-o', ','.join(mount_options
),
893 'sudo', '/sbin/restorecon', mnt_point
,
897 if not remote
in ctx
.disk_config
.remote_to_roles_to_dev_mount_options
:
898 ctx
.disk_config
.remote_to_roles_to_dev_mount_options
[remote
] = {}
899 ctx
.disk_config
.remote_to_roles_to_dev_mount_options
[remote
][role
] = mount_options
900 if not remote
in ctx
.disk_config
.remote_to_roles_to_dev_fstype
:
901 ctx
.disk_config
.remote_to_roles_to_dev_fstype
[remote
] = {}
902 ctx
.disk_config
.remote_to_roles_to_dev_fstype
[remote
][role
] = fs
903 devs_to_clean
[remote
].append(mnt_point
)
905 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'osd', cluster_name
):
906 _
, _
, id_
= teuthology
.split_role(role
)
922 '--monmap', monmap_path
,
925 except run
.CommandFailedError
:
926 # try without --no-mon-config.. this may be an upgrade test
940 '--monmap', monmap_path
,
943 mnt_point
= DATA_PATH
.format(
944 type_
='osd', cluster
=cluster_name
, id_
=id_
)
946 'sudo', 'chown', '-R', 'ceph:ceph', mnt_point
949 log
.info('Reading keys from all nodes...')
952 for remote
, roles_for_host
in ctx
.cluster
.remotes
.items():
953 for type_
in ['mgr', 'mds', 'osd']:
954 if type_
== 'mgr' and config
.get('skip_mgr_daemons', False):
956 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, type_
, cluster_name
):
957 _
, _
, id_
= teuthology
.split_role(role
)
958 data
= teuthology
.get_file(
962 type_
=type_
, id_
=id_
, cluster
=cluster_name
),
967 keys
.append((type_
, id_
, data
))
969 for remote
, roles_for_host
in ctx
.cluster
.remotes
.items():
970 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'client', cluster_name
):
971 _
, _
, id_
= teuthology
.split_role(role
)
972 data
= teuthology
.get_file(
974 path
='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_
, cluster
=cluster_name
)
976 keys
.append(('client', id_
, data
))
979 log
.info('Adding keys to all mons...')
990 teuthology
.feed_many_stdins_and_close(keys_fp
, writes
)
992 for type_
, id_
, data
in keys
:
1002 '--name={type}.{id}'.format(
1006 ] + list(generate_caps(type_
)),
1011 log
.info('Running mkfs on mon nodes...')
1012 for remote
, roles_for_host
in mons
.remotes
.items():
1013 for role
in teuthology
.cluster_roles_of_type(roles_for_host
, 'mon', cluster_name
):
1014 _
, _
, id_
= teuthology
.split_role(role
)
1015 mnt_point
= DATA_PATH
.format(
1016 type_
='mon', id_
=id_
, cluster
=cluster_name
)
1032 '--cluster', cluster_name
,
1035 '--monmap', monmap_path
,
1036 '--keyring', keyring_path
,
1040 'sudo', 'chown', '-R', 'ceph:ceph', mnt_point
1057 # we need to know this below
1058 ctx
.summary
['success'] = False
1061 (mon0_remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
1063 log
.info('Checking cluster log for badness...')
1065 def first_in_ceph_log(pattern
, excludes
):
1067 Find the first occurrence of the pattern specified in the Ceph log,
1068 Returns None if none found.
1070 :param pattern: Pattern scanned for.
1071 :param excludes: Patterns to ignore.
1072 :return: First line of text (or None if not found)
1077 '/var/log/ceph/{cluster}.log'.format(cluster
=cluster_name
),
1079 for exclude
in excludes
:
1080 args
.extend([run
.Raw('|'), 'egrep', '-v', exclude
])
1082 run
.Raw('|'), 'head', '-n', '1',
1084 stdout
= mon0_remote
.sh(args
)
1085 return stdout
or None
1087 if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
1088 config
['log_whitelist']) is not None:
1089 log
.warning('Found errors (ERR|WRN|SEC) in cluster log')
1090 ctx
.summary
['success'] = False
1091 # use the most severe problem as the failure reason
1092 if 'failure_reason' not in ctx
.summary
:
1093 for pattern
in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
1094 match
= first_in_ceph_log(pattern
, config
['log_whitelist'])
1095 if match
is not None:
1096 ctx
.summary
['failure_reason'] = \
1097 '"{match}" in cluster log'.format(
1098 match
=match
.rstrip('\n'),
1102 for remote
, dirs
in devs_to_clean
.items():
1104 log
.info('Unmounting %s on %s' % (dir_
, remote
))
1116 except Exception as e
:
1119 run
.Raw('PATH=/usr/sbin:$PATH'),
1126 if ctx
.archive
is not None and \
1127 not (ctx
.config
.get('archive-on-error') and ctx
.summary
['success']):
1129 # archive mon data, too
1130 log
.info('Archiving mon data...')
1131 path
= os
.path
.join(ctx
.archive
, 'data')
1134 except OSError as e
:
1135 if e
.errno
== errno
.EEXIST
:
1139 for remote
, roles
in mons
.remotes
.items():
1141 is_mon
= teuthology
.is_type('mon', cluster_name
)
1143 _
, _
, id_
= teuthology
.split_role(role
)
1144 mon_dir
= DATA_PATH
.format(
1145 type_
='mon', id_
=id_
, cluster
=cluster_name
)
1146 teuthology
.pull_directory_tarball(
1149 path
+ '/' + role
+ '.tgz')
1151 log
.info('Cleaning ceph cluster...')
1163 run
.Raw('{tdir}/../*.pid'.format(tdir
=testdir
)),
1170 def osd_scrub_pgs(ctx
, config
):
1172 Scrub pgs when we exit.
1174 First make sure all pgs are active and clean.
1175 Next scrub all osds.
1176 Then periodically check until all pgs have scrub time stamps that
1177 indicate the last scrub completed. Time out if no progress is made
1178 here after two minutes.
1182 cluster_name
= config
['cluster']
1183 manager
= ctx
.managers
[cluster_name
]
1185 for _
in range(0, retries
):
1186 stats
= manager
.get_pg_stats()
1187 unclean
= [stat
['pgid'] for stat
in stats
if 'active+clean' not in stat
['state']]
1189 osd_dump
= manager
.get_osd_dump_json()
1191 split_merge
= [i
['pool_name'] for i
in osd_dump
['pools'] if i
['pg_num'] != i
['pg_num_target']]
1193 # we don't support pg_num_target before nautilus
1195 if not unclean
and not split_merge
:
1199 "Waiting for all PGs to be active+clean and split+merged, waiting on %s to go clean and/or %s to split/merge" % (unclean
, split_merge
))
1202 raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
1203 check_time_now
= time
.localtime()
1205 all_roles
= teuthology
.all_roles(ctx
.cluster
)
1206 for role
in teuthology
.cluster_roles_of_type(all_roles
, 'osd', cluster_name
):
1207 log
.info("Scrubbing {osd}".format(osd
=role
))
1208 _
, _
, id_
= teuthology
.split_role(role
)
1209 # allow this to fail; in certain cases the OSD might not be up
1210 # at this point. we will catch all pgs below.
1212 manager
.raw_cluster_cmd('tell', 'osd.' + id_
, 'config', 'set',
1213 'osd_debug_deep_scrub_sleep', '0');
1214 manager
.raw_cluster_cmd('osd', 'deep-scrub', id_
)
1215 except run
.CommandFailedError
:
1221 stats
= manager
.get_pg_stats()
1222 timez
= [(stat
['pgid'],stat
['last_scrub_stamp']) for stat
in stats
]
1226 for (pgid
, tmval
) in timez
:
1227 t
= tmval
[0:tmval
.find('.')].replace(' ', 'T')
1228 pgtm
= time
.strptime(t
, '%Y-%m-%dT%H:%M:%S')
1229 if pgtm
> check_time_now
:
1232 log
.info('pgid %s last_scrub_stamp %s %s <= %s', pgid
, tmval
, pgtm
, check_time_now
)
1234 re_scrub
.append(pgid
)
1235 if thiscnt
> prev_good
:
1240 if gap_cnt
% 6 == 0:
1241 for pgid
in re_scrub
:
1242 # re-request scrub every so often in case the earlier
1243 # request was missed. do not do it every time because
1244 # the scrub may be in progress or not reported yet and
1245 # we will starve progress.
1246 manager
.raw_cluster_cmd('pg', 'deep-scrub', pgid
)
1247 if gap_cnt
> retries
:
1248 raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
1250 log
.info('Still waiting for all pgs to be scrubbed.')
1254 @contextlib.contextmanager
1255 def run_daemon(ctx
, config
, type_
):
1257 Run daemons for a role type. Handle the startup and termination of a a daemon.
1258 On startup -- set coverages, cpu_profile, valgrind values for all remotes,
1259 and a max_mds value for one mds.
1260 On cleanup -- Stop all existing daemons of this type.
1263 :param config: Configuration
1264 :param type_: Role type
1266 cluster_name
= config
['cluster']
1267 log
.info('Starting %s daemons in cluster %s...', type_
, cluster_name
)
1268 testdir
= teuthology
.get_testdir(ctx
)
1269 daemons
= ctx
.cluster
.only(teuthology
.is_type(type_
, cluster_name
))
1271 # check whether any daemons if this type are configured
1274 coverage_dir
= '{tdir}/archive/coverage'.format(tdir
=testdir
)
1276 daemon_signal
= 'kill'
1277 if config
.get('coverage') or config
.get('valgrind') is not None:
1278 daemon_signal
= 'term'
1280 # create osds in order. (this only matters for pre-luminous, which might
1281 # be jewel/hammer, which doesn't take an id_ argument to legacy 'osd create').
1283 for remote
, roles_for_host
in daemons
.remotes
.items():
1284 is_type_
= teuthology
.is_type(type_
, cluster_name
)
1285 for role
in roles_for_host
:
1286 if not is_type_(role
):
1288 _
, _
, id_
= teuthology
.split_role(role
)
1292 datadir
='/var/lib/ceph/osd/{cluster}-{id}'.format(
1293 cluster
=cluster_name
, id=id_
)
1294 osd_uuid
= teuthology
.get_file(
1296 path
=datadir
+ '/fsid',
1299 osd_uuids
[id_
] = osd_uuid
1300 for osd_id
in range(len(osd_uuids
)):
1302 osd_uuid
= osd_uuids
.get(id_
)
1306 'sudo', 'ceph', '--cluster', cluster_name
,
1307 'osd', 'new', osd_uuid
, id_
,
1311 # fallback to pre-luminous (jewel)
1314 'sudo', 'ceph', '--cluster', cluster_name
,
1315 'osd', 'create', osd_uuid
,
1318 if config
.get('add_osds_to_crush'):
1321 'sudo', 'ceph', '--cluster', cluster_name
,
1322 'osd', 'crush', 'create-or-move', 'osd.' + id_
,
1323 '1.0', 'host=localhost', 'root=default',
1327 for remote
, roles_for_host
in daemons
.remotes
.items():
1328 is_type_
= teuthology
.is_type(type_
, cluster_name
)
1329 for role
in roles_for_host
:
1330 if not is_type_(role
):
1332 _
, _
, id_
= teuthology
.split_role(role
)
1343 'ceph-%s' % (type_
),
1345 '--cluster', cluster_name
,
1348 if type_
in config
.get('cpu_profile', []):
1349 profile_path
= '/var/log/ceph/profiling-logger/%s.prof' % (role
)
1350 run_cmd
.extend(['env', 'CPUPROFILE=%s' % profile_path
])
1352 if config
.get('valgrind') is not None:
1353 valgrind_args
= None
1354 if type_
in config
['valgrind']:
1355 valgrind_args
= config
['valgrind'][type_
]
1356 if role
in config
['valgrind']:
1357 valgrind_args
= config
['valgrind'][role
]
1358 run_cmd
= teuthology
.get_valgrind_args(testdir
, role
,
1362 run_cmd
.extend(run_cmd_tail
)
1364 # always register mgr; don't necessarily start
1365 ctx
.daemons
.register_daemon(
1367 cluster
=cluster_name
,
1369 logger
=log
.getChild(role
),
1373 if type_
!= 'mgr' or not config
.get('skip_mgr_daemons', False):
1374 role
= cluster_name
+ '.' + type_
1375 ctx
.daemons
.get_daemon(type_
, id_
, cluster_name
).restart()
1377 # kludge: run any pre-manager commands
1379 for cmd
in config
.get('pre-mgr-commands', []):
1380 firstmon
= teuthology
.get_first_mon(ctx
, config
, cluster_name
)
1381 (remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
1382 remote
.run(args
=cmd
.split(' '))
1387 teuthology
.stop_daemons_of_type(ctx
, type_
, cluster_name
)
def healthy(ctx, config):
    """
    Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK.

    :param ctx: Context
    :param config: Configuration (dict; ``cluster`` selects the cluster,
        default ``ceph``; set ``wait-for-healthy`` to false to skip the
        final health wait)
    """
    config = config if isinstance(config, dict) else dict()
    cluster_name = config.get('cluster', 'ceph')
    log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
    manager = ctx.managers[cluster_name]
    try:
        manager.wait_for_mgr_available(timeout=30)
    except (run.CommandFailedError, AssertionError) as e:
        # Best-effort: a mgr may legitimately be absent mid-upgrade.
        log.info('ignoring mgr wait error, probably testing upgrade: %s', e)

    manager.wait_for_all_osds_up(timeout=300)

    try:
        manager.flush_all_pg_stats()
    except Exception as e:
        # Was ``except (run.CommandFailedError, Exception)``; Exception
        # already subsumes CommandFailedError, so the tuple was redundant.
        # Flushing stats is best-effort (may fail mid-upgrade).
        log.info('ignoring flush pg stats error, probably testing upgrade: %s', e)
    manager.wait_for_clean()

    if config.get('wait-for-healthy', True):
        log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
        manager.wait_until_healthy(timeout=300)

    if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
        # Some MDSs exist, wait for them to be healthy
        ceph_fs = Filesystem(ctx)  # TODO: make Filesystem cluster-aware
        ceph_fs.wait_for_daemons(timeout=300)
def wait_for_mon_quorum(ctx, config):
    """
    Check remote ceph status until all monitors are up.

    :param ctx: Context
    :param config: Configuration — either a dict with a 'daemons' list (and
                   optional 'cluster'), or a bare list of monitor names.
    """
    if isinstance(config, dict):
        mons = config['daemons']
        cluster_name = config.get('cluster', 'ceph')
    else:
        assert isinstance(config, list)
        mons = config
        cluster_name = 'ceph'
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (remote,) = ctx.cluster.only(firstmon).remotes.keys()
    # Poll quorum status until every requested mon is in quorum; safe_while
    # raises if we exhaust the retries (60 tries, 10s apart).
    with contextutil.safe_while(sleep=10, tries=60,
                                action='wait for monitor quorum') as proceed:
        while proceed():
            quorum_status = remote.sh('sudo ceph quorum_status',
                                      logger=log.getChild('quorum_status'))
            j = json.loads(quorum_status)
            q = j.get('quorum_names', [])
            log.debug('Quorum: %s', q)
            if sorted(q) == sorted(mons):
                break
def created_pool(ctx, config):
    """
    Add new pools to the dictionary of pools that the ceph-manager
    knows about.

    :param ctx: Context
    :param config: iterable of pool names to register
    """
    for new_pool in config:
        if new_pool not in ctx.managers['ceph'].pools:
            # Cache the pool's pg_num so later checks don't re-query the cluster.
            # NOTE(review): the call arguments were truncated in this view —
            # presumably (new_pool, 'pg_num'); confirm against upstream.
            ctx.managers['ceph'].pools[new_pool] = ctx.managers['ceph'].get_pool_int_property(
                new_pool, 'pg_num')
@contextlib.contextmanager
def suppress_mon_health_to_clog(ctx, config):
    """
    set the option, and then restore it with its original value

    Note, due to the way how tasks are executed/nested, it's not suggested to
    use this method as a standalone task. otherwise, it's likely that it will
    restore the tweaked option at the /end/ of 'tasks' block.

    :param ctx: Context
    :param config: Configuration (honors 'mon-health-to-clog' and 'cluster')
    """
    if config.get('mon-health-to-clog', 'true') == 'false':
        cluster = config.get('cluster', 'ceph')
        manager = ctx.managers[cluster]
        # Silence mon health reporting to the cluster log for the duration
        # of the managed block, then drop the override afterwards.
        manager.raw_cluster_command(
            'config', 'set', 'mon', 'mon_health_to_clog', 'false'
        )
        # NOTE(review): the yield and the else branch were not visible in
        # this view; reconstructed as the standard set/yield/restore shape.
        yield
        manager.raw_cluster_command(
            'config', 'rm', 'mon', 'mon_health_to_clog'
        )
    else:
        yield
@contextlib.contextmanager
def restart(ctx, config):
    """
    restart ceph daemons

    For example::
       tasks:
       - ceph.restart: [all]

    For example::
       tasks:
       - ceph.restart: [osd.0, mon.1, mds.*]

    or::

       tasks:
       - ceph.restart:
           daemons: [osd.0, mon.1]
           wait-for-healthy: false
           wait-for-osds-up: true

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    # Expand wildcards/'all' into concrete role names for known ceph types.
    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    clusters = set()

    # Suppress mon health-to-clog chatter while daemons bounce, if requested.
    with suppress_mon_health_to_clog(ctx, config):
        for role in daemons:
            cluster, type_, id_ = teuthology.split_role(role)
            ctx.daemons.get_daemon(type_, id_, cluster).stop()
            # OSDs are marked down explicitly so the cluster reacts promptly
            # instead of waiting for the failure-detection timeout.
            if type_ == 'osd':
                ctx.managers[cluster].mark_down_osd(id_)
            ctx.daemons.get_daemon(type_, id_, cluster).restart()
            clusters.add(cluster)

    if config.get('wait-for-healthy', True):
        for cluster in clusters:
            healthy(ctx=ctx, config=dict(cluster=cluster))
    if config.get('wait-for-osds-up', False):
        for cluster in clusters:
            ctx.managers[cluster].wait_for_all_osds_up()
    yield
@contextlib.contextmanager
def stop(ctx, config):
    """
    Stop ceph daemons

    For example::
      tasks:
      - ceph.stop: [mds.*]

      tasks:
      - ceph.stop: [osd.0, osd.2]

      tasks:
      - ceph.stop:
          daemons: [osd.0, osd.2]

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    clusters = set()

    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).stop()
        clusters.add(cluster)

    # Shut down each affected cluster's daemon watchdog so it does not
    # report the deliberately-stopped daemons as failures.
    # NOTE(review): the condition guarding this loop was not visible in
    # this view — confirm whether it is gated (e.g. on a config flag).
    for cluster in clusters:
        ctx.ceph[cluster].watchdog.stop()
        ctx.ceph[cluster].watchdog.join()

    yield
@contextlib.contextmanager
def wait_for_failure(ctx, config):
    """
    Wait for a failure of a ceph daemon

    For example::
      tasks:
      - ceph.wait_for_failure: [mds.*]

      tasks:
      - ceph.wait_for_failure: [osd.0, osd.2]

      tasks:
      - ceph.wait_for_failure:
          daemons: [osd.0, osd.2]

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        try:
            # wait() raises when the daemon exits non-zero — which is
            # exactly the outcome this task expects.
            ctx.daemons.get_daemon(type_, id_, cluster).wait()
        except Exception:
            log.info('Saw expected daemon failure. Continuing.')
            pass
        else:
            # The daemon exited cleanly (or kept running) — that's the error.
            raise RuntimeError('daemon %s did not fail' % role)

    yield
def validate_config(ctx, config):
    """
    Perform some simple validation on task configuration.
    Raises exceptions.ConfigError if an error is found.

    :param ctx: Context
    :param config: Configuration
    """
    # check for osds from multiple clusters on the same host
    for remote, roles_for_host in ctx.cluster.remotes.items():
        last_cluster = None
        last_role = None
        for role in roles_for_host:
            role_cluster, role_type, _ = teuthology.split_role(role)
            # Only OSD colocation across clusters is forbidden; other
            # daemon types may share a host freely.
            if role_type != 'osd':
                continue
            if last_cluster and last_cluster != role_cluster:
                msg = "Host should not have osds (%s and %s) from multiple clusters" % (
                    last_role, role)
                raise exceptions.ConfigError(msg)
            last_cluster = role_cluster
            last_role = role
1631 @contextlib.contextmanager
1632 def task(ctx
, config
):
1634 Set up and tear down a Ceph cluster.
1642 You can also specify what branch to run::
1658 sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed
1660 Or a local source dir::
1664 path: /home/sage/ceph
1666 To capture code coverage data, use::
1672 To use btrfs, ext4, or xfs on the target's scratch disks, use::
1677 mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
1678 mount_options: [nobarrier, inode64]
1680 To change the cephfs's default max_mds (1), use::
1687 To change the mdsmap's default session_timeout (60 seconds), use::
1692 session_timeout: 300
1694 Note, this will cause the task to check the /scratch_devs file on each node
1695 for available devices. If no such file is found, /dev/sdb will be used.
1697 To run some daemons under valgrind, include their names
1698 and the tool/args to use in a valgrind section::
1703 mds.1: --tool=memcheck
1704 osd.1: [--tool=memcheck, --leak-check=no]
1706 Those nodes which are using memcheck or valgrind will get
1707 checked for bad results.
1709 To adjust or modify config options, use::
1724 other key: other value
1729 By default, the cluster log is checked for errors and warnings,
1730 and the run marked failed if any appear. You can ignore log
1731 entries by giving a list of egrep compatible regexes, i.e.:
1735 log-whitelist: ['foo.*bar', 'bad message']
1737 To run multiple ceph clusters, use multiple ceph tasks, and roles
1738 with a cluster name prefix, e.g. cluster1.client.0. Roles with no
1739 cluster use the default cluster name, 'ceph'. OSDs from separate
1740 clusters must be on separate hosts. Clients and non-osd daemons
1741 from multiple clusters may be colocated. For each cluster, add an
1742 instance of the ceph task with the cluster name specified, e.g.::
1745 - [mon.a, osd.0, osd.1]
1746 - [backup.mon.a, backup.osd.0, backup.osd.1]
1747 - [client.0, backup.client.0]
1755 :param config: Configuration
1760 assert isinstance(config
, dict), \
1761 "task ceph only supports a dictionary for configuration"
1763 overrides
= ctx
.config
.get('overrides', {})
1764 teuthology
.deep_merge(config
, overrides
.get('ceph', {}))
1766 first_ceph_cluster
= False
1767 if not hasattr(ctx
, 'daemons'):
1768 first_ceph_cluster
= True
1769 ctx
.daemons
= DaemonGroup()
1771 testdir
= teuthology
.get_testdir(ctx
)
1772 if config
.get('coverage'):
1773 coverage_dir
= '{tdir}/archive/coverage'.format(tdir
=testdir
)
1774 log
.info('Creating coverage directory...')
1778 'install', '-d', '-m0755', '--',
1785 if 'cluster' not in config
:
1786 config
['cluster'] = 'ceph'
1788 validate_config(ctx
, config
)
1791 if first_ceph_cluster
:
1792 # these tasks handle general log setup and parsing on all hosts,
1793 # so they should only be run once
1795 lambda: ceph_log(ctx
=ctx
, config
=None),
1796 lambda: ceph_crash(ctx
=ctx
, config
=None),
1797 lambda: valgrind_post(ctx
=ctx
, config
=config
),
1801 lambda: cluster(ctx
=ctx
, config
=dict(
1802 conf
=config
.get('conf', {}),
1803 fs
=config
.get('fs', 'xfs'),
1804 mkfs_options
=config
.get('mkfs_options', None),
1805 mount_options
=config
.get('mount_options', None),
1806 skip_mgr_daemons
=config
.get('skip_mgr_daemons', False),
1807 log_whitelist
=config
.get('log-whitelist', []),
1808 cpu_profile
=set(config
.get('cpu_profile', []),),
1809 cluster
=config
['cluster'],
1810 mon_bind_msgr2
=config
.get('mon_bind_msgr2', True),
1811 mon_bind_addrvec
=config
.get('mon_bind_addrvec', True),
1813 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='mon'),
1814 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='mgr'),
1815 lambda: crush_setup(ctx
=ctx
, config
=config
),
1816 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='osd'),
1817 lambda: create_rbd_pool(ctx
=ctx
, config
=config
),
1818 lambda: run_daemon(ctx
=ctx
, config
=config
, type_
='mds'),
1819 lambda: cephfs_setup(ctx
=ctx
, config
=config
),
1820 lambda: watchdog_setup(ctx
=ctx
, config
=config
),
1823 with contextutil
.nested(*subtasks
):
1824 first_mon
= teuthology
.get_first_mon(ctx
, config
, config
['cluster'])
1825 (mon
,) = ctx
.cluster
.only(first_mon
).remotes
.keys()
1826 if not hasattr(ctx
, 'managers'):
1828 ctx
.managers
[config
['cluster']] = CephManager(
1831 logger
=log
.getChild('ceph_manager.' + config
['cluster']),
1832 cluster
=config
['cluster'],
1836 if config
.get('wait-for-healthy', True):
1837 healthy(ctx
=ctx
, config
=dict(cluster
=config
['cluster']))
1841 # set pg_num_targets back to actual pg_num, so we don't have to
1842 # wait for pending merges (which can take a while!)
1843 ctx
.managers
[config
['cluster']].stop_pg_num_changes()
1845 if config
.get('wait-for-scrub', True):
1846 osd_scrub_pgs(ctx
, config
)
1848 # stop logging health to clog during shutdown, or else we generate
1849 # a bunch of scary messages unrelated to our actual run.
1850 firstmon
= teuthology
.get_first_mon(ctx
, config
, config
['cluster'])
1851 (mon0_remote
,) = ctx
.cluster
.only(firstmon
).remotes
.keys()
1856 '--cluster', config
['cluster'],
1857 'config', 'set', 'global',
1858 'mon_health_to_clog', 'false',