1"""
2Ceph cluster task.
3
4Handle the setup, starting, and clean-up of a Ceph cluster.
5"""
f67539c2 6from copy import deepcopy
9f95a23c 7from io import BytesIO
f91f0fd5 8from io import StringIO
7c673cae
FG
9
10import argparse
11fdf7f2 11import configobj
7c673cae
FG
12import contextlib
13import errno
14import logging
15import os
16import json
17import time
18import gevent
11fdf7f2 19import re
7c673cae 20import socket
f67539c2 21import yaml
7c673cae
FG
22
23from paramiko import SSHException
f67539c2 24from tasks.ceph_manager import CephManager, write_conf, get_valgrind_args
11fdf7f2 25from tarfile import ReadError
f67539c2 26from tasks.cephfs.filesystem import MDSCluster, Filesystem
7c673cae
FG
27from teuthology import misc as teuthology
28from teuthology import contextutil
29from teuthology import exceptions
30from teuthology.orchestra import run
f67539c2 31from tasks import ceph_client as cclient
7c673cae 32from teuthology.orchestra.daemon import DaemonGroup
9f95a23c 33from tasks.daemonwatchdog import DaemonWatchdog
7c673cae
FG
34
35CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']
11fdf7f2 36DATA_PATH = '/var/lib/ceph/{type_}/{cluster}-{id_}'
7c673cae
FG
37
38log = logging.getLogger(__name__)
39
40
def generate_caps(type_):
    """
    Each call will return the next capability for each system type
    (essentially a subset of possible role values). Valid types are osd,
    mgr, mds and client.
    """
    defaults = dict(
        osd=dict(
            mon='allow profile osd',
            mgr='allow profile osd',
            osd='allow *',
        ),
        mgr=dict(
            mon='allow profile mgr',
            osd='allow *',
            mds='allow *',
        ),
        mds=dict(
            mon='allow *',
            mgr='allow *',
            osd='allow *',
            mds='allow',
        ),
        client=dict(
            mon='allow rw',
            mgr='allow r',
            osd='allow rwx',
            mds='allow',
        ),
    )
    for subsystem, capability in defaults[type_].items():
        yield '--cap'
        yield subsystem
        yield capability
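# Illustrative sketch (not part of the task flow): the flat triplets yielded by
# generate_caps() are meant to be appended to a ceph-authtool command line,
# e.g. list(generate_caps('mgr')) produces
#   ['--cap', 'mon', 'allow profile mgr',
#    '--cap', 'osd', 'allow *',
#    '--cap', 'mds', 'allow *']
# assuming the insertion order of the defaults dict above.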
75
76
def update_archive_setting(ctx, key, value):
    """
    Record an archive directory (key/value pair) in the job's info.yaml file.
    """
    if ctx.archive is None:
        return
    with open(os.path.join(ctx.archive, 'info.yaml'), 'r+') as info_file:
        info_yaml = yaml.safe_load(info_file)
        info_file.seek(0)
        if 'archive' in info_yaml:
            info_yaml['archive'][key] = value
        else:
            info_yaml['archive'] = {key: value}
        yaml.safe_dump(info_yaml, info_file, default_flow_style=False)
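# Illustrative sketch (assumed info.yaml layout, not captured from a real job):
# after update_archive_setting(ctx, 'crash', '/var/lib/ceph/crash') an info.yaml
# that already recorded a log directory would contain something like
#   archive:
#     log: /var/log/ceph
#     crash: /var/lib/ceph/crash
# so archive tooling can locate every directory pulled from the remotes.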
91
92
@contextlib.contextmanager
def ceph_crash(ctx, config):
    """
    Gather crash dumps from /var/lib/ceph/crash
    """

    # Add crash directory to job's archive
    update_archive_setting(ctx, 'crash', '/var/lib/ceph/crash')

    try:
        yield

    finally:
        if ctx.archive is not None:
            log.info('Archiving crash dumps...')
            path = os.path.join(ctx.archive, 'remote')
            try:
                os.makedirs(path)
            except OSError:
                pass
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.shortname)
                try:
                    os.makedirs(sub)
                except OSError:
                    pass
                try:
                    teuthology.pull_directory(remote, '/var/lib/ceph/crash',
                                              os.path.join(sub, 'crash'))
                except ReadError:
                    pass
124
125
126@contextlib.contextmanager
127def ceph_log(ctx, config):
128 """
129 Create /var/log/ceph log directory that is open to everyone.
130 Add valgrind and profiling-logger directories.
131
132 :param ctx: Context
133 :param config: Configuration
134 """
135 log.info('Making ceph log dir writeable by non-root...')
136 run.wait(
137 ctx.cluster.run(
138 args=[
139 'sudo',
140 'chmod',
141 '777',
142 '/var/log/ceph',
143 ],
144 wait=False,
145 )
146 )
147 log.info('Disabling ceph logrotate...')
148 run.wait(
149 ctx.cluster.run(
150 args=[
151 'sudo',
152 'rm', '-f', '--',
153 '/etc/logrotate.d/ceph',
154 ],
155 wait=False,
156 )
157 )
158 log.info('Creating extra log directories...')
159 run.wait(
160 ctx.cluster.run(
161 args=[
162 'sudo',
163 'install', '-d', '-m0777', '--',
164 '/var/log/ceph/valgrind',
165 '/var/log/ceph/profiling-logger',
166 ],
167 wait=False,
168 )
169 )
170
f67539c2
TL
171 # Add logs directory to job's info log file
172 update_archive_setting(ctx, 'log', '/var/log/ceph')
173
7c673cae
FG
174 class Rotater(object):
175 stop_event = gevent.event.Event()
176
177 def invoke_logrotate(self):
178 # 1) install ceph-test.conf in /etc/logrotate.d
179 # 2) continuously loop over logrotate invocation with ceph-test.conf
180 while not self.stop_event.is_set():
181 self.stop_event.wait(timeout=30)
182 try:
adb31ebb
TL
183 procs = ctx.cluster.run(
184 args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'],
185 wait=False,
186 stderr=StringIO()
7c673cae 187 )
adb31ebb 188 run.wait(procs)
7c673cae
FG
189 except exceptions.ConnectionLostError as e:
190 # Some tests may power off nodes during test, in which
191 # case we will see connection errors that we should ignore.
192 log.debug("Missed logrotate, node '{0}' is offline".format(
193 e.node))
9f95a23c 194 except EOFError:
7c673cae
FG
195 # Paramiko sometimes raises this when it fails to
196 # connect to a node during open_session. As with
197 # ConnectionLostError, we ignore this because nodes
198 # are allowed to get power cycled during tests.
199 log.debug("Missed logrotate, EOFError")
9f95a23c 200 except SSHException:
7c673cae 201 log.debug("Missed logrotate, SSHException")
adb31ebb
TL
202 except run.CommandFailedError as e:
203 for p in procs:
204 if p.finished and p.exitstatus != 0:
205 err = p.stderr.getvalue()
206 if 'error: error renaming temp state file' in err:
207 log.info('ignoring transient state error: %s', e)
208 else:
209 raise
7c673cae 210 except socket.error as e:
92f5a8d4 211 if e.errno in (errno.EHOSTUNREACH, errno.ECONNRESET):
7c673cae
FG
212 log.debug("Missed logrotate, host unreachable")
213 else:
214 raise
215
216 def begin(self):
217 self.thread = gevent.spawn(self.invoke_logrotate)
218
219 def end(self):
220 self.stop_event.set()
221 self.thread.get()
222
223 def write_rotate_conf(ctx, daemons):
224 testdir = teuthology.get_testdir(ctx)
9f95a23c 225 remote_logrotate_conf = '%s/logrotate.ceph-test.conf' % testdir
7c673cae 226 rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf')
f91f0fd5 227 with open(rotate_conf_path) as f:
7c673cae 228 conf = ""
9f95a23c
TL
229 for daemon, size in daemons.items():
230 log.info('writing logrotate stanza for {}'.format(daemon))
f91f0fd5
TL
231 conf += f.read().format(daemon_type=daemon,
232 max_size=size)
7c673cae
FG
233 f.seek(0, 0)
234
9f95a23c 235 for remote in ctx.cluster.remotes.keys():
f67539c2
TL
236 remote.write_file(remote_logrotate_conf, BytesIO(conf.encode()))
237 remote.sh(
238 f'sudo mv {remote_logrotate_conf} /etc/logrotate.d/ceph-test.conf && '
239 'sudo chmod 0644 /etc/logrotate.d/ceph-test.conf && '
240 'sudo chown root.root /etc/logrotate.d/ceph-test.conf')
7c673cae
FG
241 remote.chcon('/etc/logrotate.d/ceph-test.conf',
242 'system_u:object_r:etc_t:s0')
243
244 if ctx.config.get('log-rotate'):
245 daemons = ctx.config.get('log-rotate')
246 log.info('Setting up log rotation with ' + str(daemons))
247 write_rotate_conf(ctx, daemons)
248 logrotater = Rotater()
249 logrotater.begin()
250 try:
251 yield
252
253 finally:
254 if ctx.config.get('log-rotate'):
255 log.info('Shutting down logrotate')
256 logrotater.end()
f67539c2 257 ctx.cluster.sh('sudo rm /etc/logrotate.d/ceph-test.conf')
7c673cae
FG
258 if ctx.archive is not None and \
259 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
260 # and logs
261 log.info('Compressing logs...')
262 run.wait(
263 ctx.cluster.run(
264 args=[
265 'sudo',
266 'find',
267 '/var/log/ceph',
268 '-name',
269 '*.log',
270 '-print0',
271 run.Raw('|'),
272 'sudo',
273 'xargs',
274 '-0',
275 '--no-run-if-empty',
276 '--',
277 'gzip',
278 '--',
279 ],
280 wait=False,
281 ),
282 )
283
284 log.info('Archiving logs...')
285 path = os.path.join(ctx.archive, 'remote')
11fdf7f2
TL
286 try:
287 os.makedirs(path)
9f95a23c 288 except OSError:
11fdf7f2 289 pass
9f95a23c 290 for remote in ctx.cluster.remotes.keys():
7c673cae 291 sub = os.path.join(path, remote.shortname)
11fdf7f2
TL
292 try:
293 os.makedirs(sub)
9f95a23c 294 except OSError:
11fdf7f2 295 pass
7c673cae
FG
296 teuthology.pull_directory(remote, '/var/log/ceph',
297 os.path.join(sub, 'log'))
298
299
def assign_devs(roles, devs):
    """
    Create a dictionary of devs indexed by roles

    :param roles: List of roles
    :param devs: Corresponding list of devices.
    :returns: Dictionary of devs indexed by roles.
    """
    return dict(zip(roles, devs))
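# Illustrative sketch: roles and scratch devices are zipped positionally, e.g.
#   assign_devs(['osd.0', 'osd.1'], ['/dev/vdb', '/dev/vdc'])
#   -> {'osd.0': '/dev/vdb', 'osd.1': '/dev/vdc'}
# Any extra devices beyond the number of roles are silently dropped by zip().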
309
310
@contextlib.contextmanager
def valgrind_post(ctx, config):
    """
    After the tests run, look through all the valgrind logs. Exceptions are
    raised if textual errors are found in the logs, or if valgrind issues were
    detected in the logs.

    :param ctx: Context
    :param config: Configuration
    """
321 try:
322 yield
323 finally:
324 lookup_procs = list()
325 log.info('Checking for errors in any valgrind logs...')
9f95a23c 326 for remote in ctx.cluster.remotes.keys():
7c673cae
FG
327 # look at valgrind logs for each node
328 proc = remote.run(
9f95a23c
TL
329 args="sudo zgrep '<kind>' /var/log/ceph/valgrind/* "
330 # include a second file so that we always get
331 # a filename prefix on the output
332 "/dev/null | sort | uniq",
7c673cae
FG
333 wait=False,
334 check_status=False,
f91f0fd5 335 stdout=StringIO(),
7c673cae
FG
336 )
337 lookup_procs.append((proc, remote))
338
339 valgrind_exception = None
340 for (proc, remote) in lookup_procs:
341 proc.wait()
f91f0fd5 342 out = proc.stdout.getvalue()
7c673cae
FG
343 for line in out.split('\n'):
344 if line == '':
345 continue
346 try:
347 (file, kind) = line.split(':')
348 except Exception:
349 log.error('failed to split line %s', line)
350 raise
351 log.debug('file %s kind %s', file, kind)
352 if (file.find('mds') >= 0) and kind.find('Lost') > 0:
353 continue
354 log.error('saw valgrind issue %s in %s', kind, file)
355 valgrind_exception = Exception('saw valgrind issues')
356
357 if config.get('expect_valgrind_errors'):
358 if not valgrind_exception:
359 raise Exception('expected valgrind issues and found none')
360 else:
361 if valgrind_exception:
362 raise valgrind_exception
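# Illustrative sketch (hypothetical file name): each line produced by the zgrep
# pipeline in valgrind_post() looks roughly like
#   /var/log/ceph/valgrind/osd.0.log.gz:  <kind>Leak_DefinitelyLost</kind>
# and the line.split(':') above therefore assumes exactly one ':' per line,
# i.e. that the log path itself contains no colon.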
363
364
365@contextlib.contextmanager
366def crush_setup(ctx, config):
367 cluster_name = config['cluster']
368 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
9f95a23c 369 (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
7c673cae
FG
370
371 profile = config.get('crush_tunables', 'default')
372 log.info('Setting crush tunables to %s', profile)
373 mon_remote.run(
374 args=['sudo', 'ceph', '--cluster', cluster_name,
375 'osd', 'crush', 'tunables', profile])
376 yield
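# Illustrative sketch: with the default profile, crush_setup() boils down to
# running
#   sudo ceph --cluster ceph osd crush tunables default
# on the first monitor's host; setting 'crush_tunables' in the task config
# (e.g. to 'hammer' or 'jewel') selects a different profile.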
377
378
379@contextlib.contextmanager
380def setup_manager(ctx, config):
381 first_mon = teuthology.get_first_mon(ctx, config, config['cluster'])
382 (mon,) = ctx.cluster.only(first_mon).remotes.keys()
383 if not hasattr(ctx, 'managers'):
384 ctx.managers = {}
385 ctx.managers[config['cluster']] = CephManager(
386 mon,
387 ctx=ctx,
388 logger=log.getChild('ceph_manager.' + config['cluster']),
389 cluster=config['cluster'],
390 )
391 yield
392
224ce89b
WB
393@contextlib.contextmanager
394def create_rbd_pool(ctx, config):
395 cluster_name = config['cluster']
396 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
9f95a23c 397 (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
224ce89b
WB
398 log.info('Waiting for OSDs to come up')
399 teuthology.wait_until_osds_up(
400 ctx,
401 cluster=ctx.cluster,
402 remote=mon_remote,
403 ceph_cluster=cluster_name,
404 )
3efd9988
FG
405 if config.get('create_rbd_pool', True):
406 log.info('Creating RBD pool')
407 mon_remote.run(
408 args=['sudo', 'ceph', '--cluster', cluster_name,
409 'osd', 'pool', 'create', 'rbd', '8'])
410 mon_remote.run(
411 args=[
412 'sudo', 'ceph', '--cluster', cluster_name,
413 'osd', 'pool', 'application', 'enable',
414 'rbd', 'rbd', '--yes-i-really-mean-it'
415 ],
416 check_status=False)
224ce89b
WB
417 yield
418
7c673cae
FG
419@contextlib.contextmanager
420def cephfs_setup(ctx, config):
421 cluster_name = config['cluster']
7c673cae
FG
422
423 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
9f95a23c 424 (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
7c673cae
FG
425 mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
426 # If there are any MDSs, then create a filesystem for them to use
427 # Do this last because requires mon cluster to be up and running
428 if mdss.remotes:
f67539c2
TL
429 log.info('Setting up CephFS filesystem(s)...')
430 cephfs_config = config.get('cephfs', {})
431 fs_configs = cephfs_config.pop('fs', [{'name': 'cephfs'}])
432 set_allow_multifs = len(fs_configs) > 1
433
434 # wait for standbys to become available (slow due to valgrind, perhaps)
435 mdsc = MDSCluster(ctx)
436 mds_count = len(list(teuthology.all_roles_of_type(ctx.cluster, 'mds')))
437 with contextutil.safe_while(sleep=2,tries=150) as proceed:
438 while proceed():
439 if len(mdsc.get_standby_daemons()) >= mds_count:
440 break
441
442 fss = []
443 for fs_config in fs_configs:
444 assert isinstance(fs_config, dict)
445 name = fs_config.pop('name')
446 temp = deepcopy(cephfs_config)
447 teuthology.deep_merge(temp, fs_config)
448 fs = Filesystem(ctx, fs_config=temp, name=name, create=True)
449 if set_allow_multifs:
450 fs.set_allow_multifs()
451 set_allow_multifs = False
452 fss.append(fs)
7c673cae 453
f67539c2 454 yield
7c673cae 455
f67539c2
TL
456 for fs in fss:
457 fs.destroy()
458 else:
459 yield
7c673cae 460
9f95a23c
TL
461@contextlib.contextmanager
462def watchdog_setup(ctx, config):
463 ctx.ceph[config['cluster']].thrashers = []
464 ctx.ceph[config['cluster']].watchdog = DaemonWatchdog(ctx, config, ctx.ceph[config['cluster']].thrashers)
465 ctx.ceph[config['cluster']].watchdog.start()
466 yield
7c673cae 467
11fdf7f2
TL
468def get_mons(roles, ips, cluster_name,
469 mon_bind_msgr2=False,
470 mon_bind_addrvec=False):
471 """
472 Get monitors and their associated addresses
473 """
474 mons = {}
475 v1_ports = {}
476 v2_ports = {}
11fdf7f2
TL
477 is_mon = teuthology.is_type('mon', cluster_name)
478 for idx, roles in enumerate(roles):
479 for role in roles:
480 if not is_mon(role):
481 continue
482 if ips[idx] not in v1_ports:
483 v1_ports[ips[idx]] = 6789
484 else:
485 v1_ports[ips[idx]] += 1
486 if mon_bind_msgr2:
487 if ips[idx] not in v2_ports:
488 v2_ports[ips[idx]] = 3300
489 addr = '{ip}'.format(ip=ips[idx])
490 else:
491 assert mon_bind_addrvec
492 v2_ports[ips[idx]] += 1
493 addr = '[v2:{ip}:{port2},v1:{ip}:{port1}]'.format(
494 ip=ips[idx],
495 port2=v2_ports[ips[idx]],
496 port1=v1_ports[ips[idx]],
497 )
498 elif mon_bind_addrvec:
499 addr = '[v1:{ip}:{port}]'.format(
500 ip=ips[idx],
501 port=v1_ports[ips[idx]],
502 )
503 else:
504 addr = '{ip}:{port}'.format(
505 ip=ips[idx],
506 port=v1_ports[ips[idx]],
507 )
508 mons[role] = addr
509 assert mons
510 return mons
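# Illustrative sketch: v1 ports are handed out from 6789 per IP (and v2 ports
# from 3300 when msgr2 is enabled), so with legacy addressing
#   get_mons([['mon.a', 'mon.b']], ['10.0.0.1'], 'ceph')
#   -> {'mon.a': '10.0.0.1:6789', 'mon.b': '10.0.0.1:6790'}
# (hypothetical IP; roles without a cluster prefix belong to cluster 'ceph')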
511
512def skeleton_config(ctx, roles, ips, mons, cluster='ceph'):
513 """
514 Returns a ConfigObj that is prefilled with a skeleton config.
515
516 Use conf[section][key]=value or conf.merge to change it.
517
518 Use conf.write to write it out, override .filename first if you want.
519 """
520 path = os.path.join(os.path.dirname(__file__), 'ceph.conf.template')
9f95a23c 521 conf = configobj.ConfigObj(path, file_error=True)
11fdf7f2 522 mon_hosts = []
9f95a23c 523 for role, addr in mons.items():
11fdf7f2
TL
524 mon_cluster, _, _ = teuthology.split_role(role)
525 if mon_cluster != cluster:
526 continue
527 name = teuthology.ceph_role(role)
528 conf.setdefault(name, {})
529 mon_hosts.append(addr)
530 conf.setdefault('global', {})
531 conf['global']['mon host'] = ','.join(mon_hosts)
532 # set up standby mds's
533 is_mds = teuthology.is_type('mds', cluster)
534 for roles_subset in roles:
535 for role in roles_subset:
536 if is_mds(role):
537 name = teuthology.ceph_role(role)
538 conf.setdefault(name, {})
539 return conf
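# Illustrative sketch: the returned ConfigObj behaves like a nested dict, so
# callers can layer test-specific settings on top of the template before it is
# written out, e.g.
#   conf = skeleton_config(ctx, roles=roles, ips=ips, mons=mons)
#   conf.setdefault('osd', {})
#   conf['osd']['osd objectstore'] = 'bluestore'   # hypothetical override
# before the cluster task eventually writes it out.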
540
541def create_simple_monmap(ctx, remote, conf, mons,
542 path=None,
543 mon_bind_addrvec=False):
544 """
545 Writes a simple monmap based on current ceph.conf into path, or
546 <testdir>/monmap by default.
547
548 Assumes ceph_conf is up to date.
549
550 Assumes mon sections are named "mon.*", with the dot.
551
552 :return the FSID (as a string) of the newly created monmap
553 """
554
9f95a23c 555 addresses = list(mons.items())
11fdf7f2
TL
556 assert addresses, "There are no monitors in config!"
557 log.debug('Ceph mon addresses: %s', addresses)
558
f67539c2
TL
559 try:
560 log.debug('writing out conf {c}'.format(c=conf))
561 except:
562 log.debug('my conf logging attempt failed')
11fdf7f2 563 testdir = teuthology.get_testdir(ctx)
f67539c2
TL
564 tmp_conf_path = '{tdir}/ceph.tmp.conf'.format(tdir=testdir)
565 conf_fp = BytesIO()
566 conf.write(conf_fp)
567 conf_fp.seek(0)
568 teuthology.write_file(remote, tmp_conf_path, conf_fp)
11fdf7f2
TL
569 args = [
570 'adjust-ulimits',
571 'ceph-coverage',
572 '{tdir}/archive/coverage'.format(tdir=testdir),
573 'monmaptool',
f67539c2
TL
574 '-c',
575 '{conf}'.format(conf=tmp_conf_path),
11fdf7f2
TL
576 '--create',
577 '--clobber',
578 ]
579 if mon_bind_addrvec:
580 args.extend(['--enable-all-features'])
581 for (role, addr) in addresses:
582 _, _, n = teuthology.split_role(role)
583 if mon_bind_addrvec and (',' in addr or 'v' in addr or ':' in addr):
584 args.extend(('--addv', n, addr))
585 else:
586 args.extend(('--add', n, addr))
587 if not path:
588 path = '{tdir}/monmap'.format(tdir=testdir)
589 args.extend([
590 '--print',
591 path
592 ])
593
9f95a23c 594 monmap_output = remote.sh(args)
11fdf7f2
TL
595 fsid = re.search("generated fsid (.+)$",
596 monmap_output, re.MULTILINE).group(1)
f67539c2 597 teuthology.delete_file(remote, tmp_conf_path)
11fdf7f2
TL
598 return fsid
599
f67539c2
TL
600
601def maybe_redirect_stderr(config, type_, args, log_path):
602 if type_ == 'osd' and \
603 config.get('flavor', 'default') == 'crimson':
604 # teuthworker uses ubuntu:ubuntu to access the test nodes
605 create_log_cmd = \
606 f'sudo install -b -o ubuntu -g ubuntu /dev/null {log_path}'
607 return create_log_cmd, args + [run.Raw('2>>'), log_path]
608 else:
609 return None, args
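# Illustrative sketch (hypothetical argument values): for a crimson-flavored osd
#   maybe_redirect_stderr({'flavor': 'crimson'}, 'osd',
#                         ['ceph-osd', '-i', '0'], '/var/log/ceph/ceph-osd.0.log')
# returns a 'sudo install -b -o ubuntu -g ubuntu /dev/null <log_path>' command
# plus the original args with run.Raw('2>>') and the log path appended; for any
# other flavor or daemon type the args come back unchanged with no command.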
610
611
612@contextlib.contextmanager
613def cluster(ctx, config):
614 """
615 Handle the creation and removal of a ceph cluster.
616
617 On startup:
618 Create directories needed for the cluster.
619 Create remote journals for all osds.
620 Create and set keyring.
11fdf7f2 621 Copy the monmap to the test systems.
7c673cae
FG
622 Setup mon nodes.
623 Setup mds nodes.
624 Mkfs osd nodes.
625 Add keyring information to monmaps
626 Mkfs mon nodes.
627
628 On exit:
11fdf7f2 629 If errors occurred, extract a failure message and store in ctx.summary.
7c673cae
FG
630 Unmount all test files and temporary journaling files.
631 Save the monitor information and archive all ceph logs.
632 Cleanup the keyring setup, and remove all monitor map and data files left over.
633
634 :param ctx: Context
635 :param config: Configuration
636 """
637 if ctx.config.get('use_existing_cluster', False) is True:
638 log.info("'use_existing_cluster' is true; skipping cluster creation")
639 yield
640
641 testdir = teuthology.get_testdir(ctx)
642 cluster_name = config['cluster']
643 data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
644 log.info('Creating ceph cluster %s...', cluster_name)
11fdf7f2
TL
645 log.info('config %s', config)
646 log.info('ctx.config %s', ctx.config)
7c673cae
FG
647 run.wait(
648 ctx.cluster.run(
649 args=[
650 'install', '-d', '-m0755', '--',
651 data_dir,
652 ],
653 wait=False,
654 )
655 )
656
657 run.wait(
658 ctx.cluster.run(
659 args=[
660 'sudo',
661 'install', '-d', '-m0777', '--', '/var/run/ceph',
662 ],
663 wait=False,
664 )
665 )
666
667 devs_to_clean = {}
668 remote_to_roles_to_devs = {}
7c673cae 669 osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
9f95a23c 670 for remote, roles_for_host in osds.remotes.items():
7c673cae 671 devs = teuthology.get_scratch_devices(remote)
801d1391
TL
672 roles_to_devs = assign_devs(
673 teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), devs
674 )
675 devs_to_clean[remote] = []
676 log.info('osd dev map: {}'.format(roles_to_devs))
677 assert roles_to_devs, \
678 "remote {} has osd roles, but no osd devices were specified!".format(remote.hostname)
7c673cae 679 remote_to_roles_to_devs[remote] = roles_to_devs
801d1391
TL
680 log.info("remote_to_roles_to_devs: {}".format(remote_to_roles_to_devs))
681 for osd_role, dev_name in remote_to_roles_to_devs.items():
682 assert dev_name, "{} has no associated device!".format(osd_role)
7c673cae
FG
683
684 log.info('Generating config...')
685 remotes_and_roles = ctx.cluster.remotes.items()
686 roles = [role_list for (remote, role_list) in remotes_and_roles]
687 ips = [host for (host, port) in
688 (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
11fdf7f2
TL
689 mons = get_mons(
690 roles, ips, cluster_name,
691 mon_bind_msgr2=config.get('mon_bind_msgr2'),
692 mon_bind_addrvec=config.get('mon_bind_addrvec'),
693 )
694 conf = skeleton_config(
695 ctx, roles=roles, ips=ips, mons=mons, cluster=cluster_name,
696 )
9f95a23c
TL
697 for section, keys in config['conf'].items():
698 for key, value in keys.items():
7c673cae
FG
699 log.info("[%s] %s = %s" % (section, key, value))
700 if section not in conf:
701 conf[section] = {}
702 conf[section][key] = value
703
7c673cae
FG
704 if not hasattr(ctx, 'ceph'):
705 ctx.ceph = {}
706 ctx.ceph[cluster_name] = argparse.Namespace()
707 ctx.ceph[cluster_name].conf = conf
11fdf7f2 708 ctx.ceph[cluster_name].mons = mons
7c673cae
FG
709
710 default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
711 keyring_path = config.get('keyring_path', default_keyring)
712
713 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
714
715 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
716
717 log.info('Setting up %s...' % firstmon)
718 ctx.cluster.only(firstmon).run(
719 args=[
720 'sudo',
721 'adjust-ulimits',
722 'ceph-coverage',
723 coverage_dir,
724 'ceph-authtool',
725 '--create-keyring',
726 keyring_path,
727 ],
728 )
729 ctx.cluster.only(firstmon).run(
730 args=[
731 'sudo',
732 'adjust-ulimits',
733 'ceph-coverage',
734 coverage_dir,
735 'ceph-authtool',
736 '--gen-key',
737 '--name=mon.',
738 keyring_path,
739 ],
740 )
741 ctx.cluster.only(firstmon).run(
742 args=[
743 'sudo',
744 'chmod',
745 '0644',
746 keyring_path,
747 ],
748 )
749 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
750 monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
751 cluster=cluster_name)
11fdf7f2 752 fsid = create_simple_monmap(
7c673cae
FG
753 ctx,
754 remote=mon0_remote,
755 conf=conf,
11fdf7f2 756 mons=mons,
7c673cae 757 path=monmap_path,
11fdf7f2 758 mon_bind_addrvec=config.get('mon_bind_addrvec'),
7c673cae 759 )
f67539c2 760 ctx.ceph[cluster_name].fsid = fsid
7c673cae
FG
761 if not 'global' in conf:
762 conf['global'] = {}
763 conf['global']['fsid'] = fsid
764
765 default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
766 conf_path = config.get('conf_path', default_conf_path)
767 log.info('Writing %s for FSID %s...' % (conf_path, fsid))
768 write_conf(ctx, conf_path, cluster_name)
769
770 log.info('Creating admin key on %s...' % firstmon)
771 ctx.cluster.only(firstmon).run(
772 args=[
773 'sudo',
774 'adjust-ulimits',
775 'ceph-coverage',
776 coverage_dir,
777 'ceph-authtool',
778 '--gen-key',
779 '--name=client.admin',
7c673cae
FG
780 '--cap', 'mon', 'allow *',
781 '--cap', 'osd', 'allow *',
782 '--cap', 'mds', 'allow *',
783 '--cap', 'mgr', 'allow *',
784 keyring_path,
785 ],
786 )
787
788 log.info('Copying monmap to all nodes...')
f67539c2
TL
789 keyring = mon0_remote.read_file(keyring_path)
790 monmap = mon0_remote.read_file(monmap_path)
7c673cae 791
9f95a23c 792 for rem in ctx.cluster.remotes.keys():
7c673cae
FG
793 # copy mon key and initial monmap
794 log.info('Sending monmap to node {remote}'.format(remote=rem))
f67539c2
TL
795 rem.write_file(keyring_path, keyring, mode='0644', sudo=True)
796 rem.write_file(monmap_path, monmap)
7c673cae
FG
797
798 log.info('Setting up mon nodes...')
799 mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))
7c673cae
FG
800
801 if not config.get('skip_mgr_daemons', False):
802 log.info('Setting up mgr nodes...')
803 mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name))
9f95a23c 804 for remote, roles_for_host in mgrs.remotes.items():
7c673cae
FG
805 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr',
806 cluster_name):
807 _, _, id_ = teuthology.split_role(role)
11fdf7f2
TL
808 mgr_dir = DATA_PATH.format(
809 type_='mgr', cluster=cluster_name, id_=id_)
7c673cae
FG
810 remote.run(
811 args=[
812 'sudo',
813 'mkdir',
814 '-p',
815 mgr_dir,
816 run.Raw('&&'),
817 'sudo',
818 'adjust-ulimits',
819 'ceph-coverage',
820 coverage_dir,
821 'ceph-authtool',
822 '--create-keyring',
823 '--gen-key',
824 '--name=mgr.{id}'.format(id=id_),
825 mgr_dir + '/keyring',
826 ],
827 )
828
829 log.info('Setting up mds nodes...')
830 mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
9f95a23c 831 for remote, roles_for_host in mdss.remotes.items():
7c673cae
FG
832 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
833 cluster_name):
834 _, _, id_ = teuthology.split_role(role)
11fdf7f2
TL
835 mds_dir = DATA_PATH.format(
836 type_='mds', cluster=cluster_name, id_=id_)
7c673cae
FG
837 remote.run(
838 args=[
839 'sudo',
840 'mkdir',
841 '-p',
842 mds_dir,
843 run.Raw('&&'),
844 'sudo',
845 'adjust-ulimits',
846 'ceph-coverage',
847 coverage_dir,
848 'ceph-authtool',
849 '--create-keyring',
850 '--gen-key',
851 '--name=mds.{id}'.format(id=id_),
852 mds_dir + '/keyring',
853 ],
854 )
11fdf7f2
TL
855 remote.run(args=[
856 'sudo', 'chown', '-R', 'ceph:ceph', mds_dir
857 ])
7c673cae
FG
858
859 cclient.create_keyring(ctx, cluster_name)
860 log.info('Running mkfs on osd nodes...')
861
862 if not hasattr(ctx, 'disk_config'):
863 ctx.disk_config = argparse.Namespace()
864 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
865 ctx.disk_config.remote_to_roles_to_dev = {}
7c673cae
FG
866 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
867 ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
868 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
869 ctx.disk_config.remote_to_roles_to_dev_fstype = {}
870
871 teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)
7c673cae
FG
872
873 log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
9f95a23c 874 for remote, roles_for_host in osds.remotes.items():
7c673cae 875 roles_to_devs = remote_to_roles_to_devs[remote]
7c673cae
FG
876
877 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
878 _, _, id_ = teuthology.split_role(role)
11fdf7f2
TL
879 mnt_point = DATA_PATH.format(
880 type_='osd', cluster=cluster_name, id_=id_)
7c673cae
FG
881 remote.run(
882 args=[
883 'sudo',
884 'mkdir',
885 '-p',
886 mnt_point,
887 ])
801d1391
TL
888 log.info('roles_to_devs: {}'.format(roles_to_devs))
889 log.info('role: {}'.format(role))
7c673cae
FG
890 if roles_to_devs.get(role):
891 dev = roles_to_devs[role]
892 fs = config.get('fs')
893 package = None
894 mkfs_options = config.get('mkfs_options')
895 mount_options = config.get('mount_options')
896 if fs == 'btrfs':
897 # package = 'btrfs-tools'
898 if mount_options is None:
899 mount_options = ['noatime', 'user_subvol_rm_allowed']
900 if mkfs_options is None:
901 mkfs_options = ['-m', 'single',
902 '-l', '32768',
903 '-n', '32768']
904 if fs == 'xfs':
905 # package = 'xfsprogs'
906 if mount_options is None:
907 mount_options = ['noatime']
908 if mkfs_options is None:
909 mkfs_options = ['-f', '-i', 'size=2048']
910 if fs == 'ext4' or fs == 'ext3':
911 if mount_options is None:
912 mount_options = ['noatime', 'user_xattr']
913
914 if mount_options is None:
915 mount_options = []
916 if mkfs_options is None:
917 mkfs_options = []
918 mkfs = ['mkfs.%s' % fs] + mkfs_options
919 log.info('%s on %s on %s' % (mkfs, dev, remote))
920 if package is not None:
9f95a23c 921 remote.sh('sudo apt-get install -y %s' % package)
7c673cae
FG
922
923 try:
924 remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
925 except run.CommandFailedError:
                    # Newer btrfs-tools don't prompt for overwrite; use -f
                    if '-f' not in mkfs_options:
                        mkfs_options.append('-f')
929 mkfs = ['mkfs.%s' % fs] + mkfs_options
930 log.info('%s on %s on %s' % (mkfs, dev, remote))
931 remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
932
933 log.info('mount %s on %s -o %s' % (dev, remote,
934 ','.join(mount_options)))
935 remote.run(
936 args=[
937 'sudo',
938 'mount',
939 '-t', fs,
940 '-o', ','.join(mount_options),
941 dev,
942 mnt_point,
943 ]
944 )
945 remote.run(
946 args=[
947 'sudo', '/sbin/restorecon', mnt_point,
948 ],
949 check_status=False,
950 )
951 if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
952 ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
953 ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
954 if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
955 ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
956 ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
957 devs_to_clean[remote].append(mnt_point)
958
959 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
960 _, _, id_ = teuthology.split_role(role)
11fdf7f2 961 try:
f67539c2 962 args = ['sudo',
11fdf7f2
TL
963 'MALLOC_CHECK_=3',
964 'adjust-ulimits',
f67539c2 965 'ceph-coverage', coverage_dir,
11fdf7f2
TL
966 'ceph-osd',
967 '--no-mon-config',
f67539c2 968 '--cluster', cluster_name,
11fdf7f2
TL
969 '--mkfs',
970 '--mkkey',
971 '-i', id_,
f67539c2
TL
972 '--monmap', monmap_path]
973 log_path = f'/var/log/ceph/{cluster_name}-osd.{id_}.log'
974 create_log_cmd, args = \
975 maybe_redirect_stderr(config, 'osd', args, log_path)
976 if create_log_cmd:
977 remote.sh(create_log_cmd)
978 remote.run(args=args)
11fdf7f2
TL
979 except run.CommandFailedError:
980 # try without --no-mon-config.. this may be an upgrade test
981 remote.run(
982 args=[
983 'sudo',
984 'MALLOC_CHECK_=3',
985 'adjust-ulimits',
986 'ceph-coverage',
987 coverage_dir,
988 'ceph-osd',
989 '--cluster',
990 cluster_name,
991 '--mkfs',
992 '--mkkey',
993 '-i', id_,
994 '--monmap', monmap_path,
995 ],
996 )
997 mnt_point = DATA_PATH.format(
998 type_='osd', cluster=cluster_name, id_=id_)
f91f0fd5
TL
999 remote.run(args=[
1000 'sudo', 'chown', '-R', 'ceph:ceph', mnt_point
1001 ])
7c673cae
FG
1002
1003 log.info('Reading keys from all nodes...')
9f95a23c 1004 keys_fp = BytesIO()
7c673cae 1005 keys = []
9f95a23c 1006 for remote, roles_for_host in ctx.cluster.remotes.items():
7c673cae
FG
1007 for type_ in ['mgr', 'mds', 'osd']:
1008 if type_ == 'mgr' and config.get('skip_mgr_daemons', False):
1009 continue
1010 for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
1011 _, _, id_ = teuthology.split_role(role)
f67539c2
TL
1012 data = remote.read_file(
1013 os.path.join(
11fdf7f2
TL
1014 DATA_PATH.format(
1015 type_=type_, id_=id_, cluster=cluster_name),
1016 'keyring',
7c673cae
FG
1017 ),
1018 sudo=True,
1019 )
1020 keys.append((type_, id_, data))
1021 keys_fp.write(data)
9f95a23c 1022 for remote, roles_for_host in ctx.cluster.remotes.items():
7c673cae
FG
1023 for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', cluster_name):
1024 _, _, id_ = teuthology.split_role(role)
f67539c2
TL
1025 data = remote.read_file(
1026 '/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_, cluster=cluster_name)
7c673cae
FG
1027 )
1028 keys.append(('client', id_, data))
1029 keys_fp.write(data)
1030
1031 log.info('Adding keys to all mons...')
1032 writes = mons.run(
1033 args=[
1034 'sudo', 'tee', '-a',
1035 keyring_path,
1036 ],
1037 stdin=run.PIPE,
1038 wait=False,
9f95a23c 1039 stdout=BytesIO(),
7c673cae
FG
1040 )
1041 keys_fp.seek(0)
1042 teuthology.feed_many_stdins_and_close(keys_fp, writes)
1043 run.wait(writes)
1044 for type_, id_, data in keys:
1045 run.wait(
1046 mons.run(
1047 args=[
1048 'sudo',
1049 'adjust-ulimits',
1050 'ceph-coverage',
1051 coverage_dir,
1052 'ceph-authtool',
1053 keyring_path,
1054 '--name={type}.{id}'.format(
1055 type=type_,
1056 id=id_,
1057 ),
1058 ] + list(generate_caps(type_)),
1059 wait=False,
1060 ),
1061 )
1062
1063 log.info('Running mkfs on mon nodes...')
9f95a23c 1064 for remote, roles_for_host in mons.remotes.items():
7c673cae
FG
1065 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon', cluster_name):
1066 _, _, id_ = teuthology.split_role(role)
11fdf7f2
TL
1067 mnt_point = DATA_PATH.format(
1068 type_='mon', id_=id_, cluster=cluster_name)
7c673cae
FG
1069 remote.run(
1070 args=[
1071 'sudo',
1072 'mkdir',
1073 '-p',
11fdf7f2 1074 mnt_point,
7c673cae
FG
1075 ],
1076 )
1077 remote.run(
1078 args=[
1079 'sudo',
1080 'adjust-ulimits',
1081 'ceph-coverage',
1082 coverage_dir,
1083 'ceph-mon',
1084 '--cluster', cluster_name,
1085 '--mkfs',
1086 '-i', id_,
1087 '--monmap', monmap_path,
7c673cae
FG
1088 '--keyring', keyring_path,
1089 ],
1090 )
f91f0fd5
TL
1091 remote.run(args=[
1092 'sudo', 'chown', '-R', 'ceph:ceph', mnt_point
1093 ])
7c673cae
FG
1094
1095 run.wait(
1096 mons.run(
1097 args=[
1098 'rm',
1099 '--',
1100 monmap_path,
7c673cae
FG
1101 ],
1102 wait=False,
1103 ),
1104 )
1105
1106 try:
1107 yield
1108 except Exception:
1109 # we need to know this below
1110 ctx.summary['success'] = False
1111 raise
1112 finally:
1113 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1114
1115 log.info('Checking cluster log for badness...')
1116
1117 def first_in_ceph_log(pattern, excludes):
1118 """
11fdf7f2 1119 Find the first occurrence of the pattern specified in the Ceph log,
7c673cae
FG
1120 Returns None if none found.
1121
1122 :param pattern: Pattern scanned for.
1123 :param excludes: Patterns to ignore.
1124 :return: First line of text (or None if not found)
1125 """
1126 args = [
1127 'sudo',
1128 'egrep', pattern,
1129 '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
1130 ]
1131 for exclude in excludes:
1132 args.extend([run.Raw('|'), 'egrep', '-v', exclude])
1133 args.extend([
1134 run.Raw('|'), 'head', '-n', '1',
1135 ])
9f95a23c
TL
1136 stdout = mon0_remote.sh(args)
1137 return stdout or None
7c673cae
FG
1138
1139 if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
cd265ab1 1140 config['log_ignorelist']) is not None:
7c673cae
FG
1141 log.warning('Found errors (ERR|WRN|SEC) in cluster log')
1142 ctx.summary['success'] = False
1143 # use the most severe problem as the failure reason
1144 if 'failure_reason' not in ctx.summary:
1145 for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
cd265ab1 1146 match = first_in_ceph_log(pattern, config['log_ignorelist'])
7c673cae
FG
1147 if match is not None:
1148 ctx.summary['failure_reason'] = \
1149 '"{match}" in cluster log'.format(
1150 match=match.rstrip('\n'),
1151 )
1152 break
1153
9f95a23c 1154 for remote, dirs in devs_to_clean.items():
7c673cae
FG
1155 for dir_ in dirs:
1156 log.info('Unmounting %s on %s' % (dir_, remote))
1157 try:
1158 remote.run(
1159 args=[
1160 'sync',
1161 run.Raw('&&'),
1162 'sudo',
1163 'umount',
1164 '-f',
1165 dir_
1166 ]
1167 )
1168 except Exception as e:
1169 remote.run(args=[
1170 'sudo',
1171 run.Raw('PATH=/usr/sbin:$PATH'),
1172 'lsof',
1173 run.Raw(';'),
1174 'ps', 'auxf',
1175 ])
1176 raise e
1177
7c673cae
FG
1178 if ctx.archive is not None and \
1179 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
1180
1181 # archive mon data, too
1182 log.info('Archiving mon data...')
1183 path = os.path.join(ctx.archive, 'data')
1184 try:
1185 os.makedirs(path)
1186 except OSError as e:
1187 if e.errno == errno.EEXIST:
1188 pass
1189 else:
1190 raise
9f95a23c 1191 for remote, roles in mons.remotes.items():
7c673cae
FG
1192 for role in roles:
1193 is_mon = teuthology.is_type('mon', cluster_name)
1194 if is_mon(role):
1195 _, _, id_ = teuthology.split_role(role)
11fdf7f2
TL
1196 mon_dir = DATA_PATH.format(
1197 type_='mon', id_=id_, cluster=cluster_name)
7c673cae
FG
1198 teuthology.pull_directory_tarball(
1199 remote,
1200 mon_dir,
1201 path + '/' + role + '.tgz')
1202
1203 log.info('Cleaning ceph cluster...')
1204 run.wait(
1205 ctx.cluster.run(
1206 args=[
1207 'sudo',
1208 'rm',
1209 '-rf',
1210 '--',
1211 conf_path,
1212 keyring_path,
1213 data_dir,
1214 monmap_path,
7c673cae
FG
1215 run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
1216 ],
1217 wait=False,
1218 ),
1219 )
1220
1221
1222def osd_scrub_pgs(ctx, config):
1223 """
1224 Scrub pgs when we exit.
1225
1226 First make sure all pgs are active and clean.
1227 Next scrub all osds.
    Then periodically check until all pgs have scrub time stamps that
    indicate the last scrub completed. Time out if no progress is made
    after roughly (retries * delays) seconds.
1231 """
d2e6a577
FG
1232 retries = 40
1233 delays = 20
7c673cae
FG
1234 cluster_name = config['cluster']
1235 manager = ctx.managers[cluster_name]
f67539c2 1236 for _ in range(retries):
7c673cae 1237 stats = manager.get_pg_stats()
11fdf7f2
TL
1238 unclean = [stat['pgid'] for stat in stats if 'active+clean' not in stat['state']]
1239 split_merge = []
1240 osd_dump = manager.get_osd_dump_json()
9f95a23c
TL
1241 try:
1242 split_merge = [i['pool_name'] for i in osd_dump['pools'] if i['pg_num'] != i['pg_num_target']]
1243 except KeyError:
1244 # we don't support pg_num_target before nautilus
1245 pass
11fdf7f2 1246 if not unclean and not split_merge:
7c673cae 1247 break
f67539c2
TL
1248 waiting_on = []
1249 if unclean:
1250 waiting_on.append(f'{unclean} to go clean')
1251 if split_merge:
1252 waiting_on.append(f'{split_merge} to split/merge')
1253 waiting_on = ' and '.join(waiting_on)
1254 log.info('Waiting for all PGs to be active+clean and split+merged, waiting on %s', waiting_on)
7c673cae 1255 time.sleep(delays)
f67539c2 1256 else:
31f18b77 1257 raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
7c673cae
FG
1258 check_time_now = time.localtime()
1259 time.sleep(1)
1260 all_roles = teuthology.all_roles(ctx.cluster)
1261 for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name):
1262 log.info("Scrubbing {osd}".format(osd=role))
1263 _, _, id_ = teuthology.split_role(role)
31f18b77
FG
1264 # allow this to fail; in certain cases the OSD might not be up
1265 # at this point. we will catch all pgs below.
1266 try:
28e407b8
AA
1267 manager.raw_cluster_cmd('tell', 'osd.' + id_, 'config', 'set',
1268 'osd_debug_deep_scrub_sleep', '0');
31f18b77
FG
1269 manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
1270 except run.CommandFailedError:
1271 pass
7c673cae
FG
1272 prev_good = 0
1273 gap_cnt = 0
1274 loop = True
1275 while loop:
1276 stats = manager.get_pg_stats()
1277 timez = [(stat['pgid'],stat['last_scrub_stamp']) for stat in stats]
1278 loop = False
1279 thiscnt = 0
9f95a23c 1280 re_scrub = []
7c673cae 1281 for (pgid, tmval) in timez:
9f95a23c
TL
1282 t = tmval[0:tmval.find('.')].replace(' ', 'T')
1283 pgtm = time.strptime(t, '%Y-%m-%dT%H:%M:%S')
7c673cae
FG
1284 if pgtm > check_time_now:
1285 thiscnt += 1
1286 else:
1287 log.info('pgid %s last_scrub_stamp %s %s <= %s', pgid, tmval, pgtm, check_time_now)
1288 loop = True
9f95a23c 1289 re_scrub.append(pgid)
7c673cae
FG
1290 if thiscnt > prev_good:
1291 prev_good = thiscnt
1292 gap_cnt = 0
1293 else:
1294 gap_cnt += 1
31f18b77 1295 if gap_cnt % 6 == 0:
9f95a23c 1296 for pgid in re_scrub:
31f18b77 1297 # re-request scrub every so often in case the earlier
11fdf7f2 1298 # request was missed. do not do it every time because
31f18b77
FG
1299 # the scrub may be in progress or not reported yet and
1300 # we will starve progress.
1301 manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
7c673cae 1302 if gap_cnt > retries:
31f18b77 1303 raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
7c673cae
FG
1304 if loop:
1305 log.info('Still waiting for all pgs to be scrubbed.')
1306 time.sleep(delays)
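# Illustrative sketch (hypothetical stamp): last_scrub_stamp values look like
# '2021-03-04 12:34:56.789012+0000' (exact format varies by release), and the
# comparison above reduces them to whole seconds first:
#   stamp = '2021-03-04 12:34:56.789012+0000'
#   t = stamp[0:stamp.find('.')].replace(' ', 'T')    # '2021-03-04T12:34:56'
#   pgtm = time.strptime(t, '%Y-%m-%dT%H:%M:%S')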
1307
1308
1309@contextlib.contextmanager
1310def run_daemon(ctx, config, type_):
1311 """
    Run daemons for a role type. Handle the startup and termination of a daemon.
1313 On startup -- set coverages, cpu_profile, valgrind values for all remotes,
1314 and a max_mds value for one mds.
1315 On cleanup -- Stop all existing daemons of this type.
1316
1317 :param ctx: Context
1318 :param config: Configuration
9f95a23c 1319 :param type_: Role type
7c673cae
FG
1320 """
1321 cluster_name = config['cluster']
1322 log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
1323 testdir = teuthology.get_testdir(ctx)
1324 daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name))
1325
    # check whether any daemons of this type are configured
1327 if daemons is None:
1328 return
1329 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
1330
1331 daemon_signal = 'kill'
1332 if config.get('coverage') or config.get('valgrind') is not None:
1333 daemon_signal = 'term'
1334
c07f9fc5 1335 # create osds in order. (this only matters for pre-luminous, which might
f91f0fd5 1336 # be jewel/hammer, which doesn't take an id_ argument to legacy 'osd create').
c07f9fc5 1337 osd_uuids = {}
9f95a23c 1338 for remote, roles_for_host in daemons.remotes.items():
7c673cae
FG
1339 is_type_ = teuthology.is_type(type_, cluster_name)
1340 for role in roles_for_host:
1341 if not is_type_(role):
1342 continue
1343 _, _, id_ = teuthology.split_role(role)
1344
c07f9fc5 1345
224ce89b
WB
1346 if type_ == 'osd':
1347 datadir='/var/lib/ceph/osd/{cluster}-{id}'.format(
1348 cluster=cluster_name, id=id_)
f67539c2
TL
1349 osd_uuid = remote.read_file(
1350 datadir + '/fsid', sudo=True).decode().strip()
c07f9fc5
FG
1351 osd_uuids[id_] = osd_uuid
1352 for osd_id in range(len(osd_uuids)):
1353 id_ = str(osd_id)
1354 osd_uuid = osd_uuids.get(id_)
1355 try:
1356 remote.run(
1357 args=[
1358 'sudo', 'ceph', '--cluster', cluster_name,
1359 'osd', 'new', osd_uuid, id_,
1360 ]
1361 )
1362 except:
f91f0fd5 1363 # fallback to pre-luminous (jewel)
c07f9fc5
FG
1364 remote.run(
1365 args=[
1366 'sudo', 'ceph', '--cluster', cluster_name,
1367 'osd', 'create', osd_uuid,
1368 ]
1369 )
1370 if config.get('add_osds_to_crush'):
1371 remote.run(
1372 args=[
1373 'sudo', 'ceph', '--cluster', cluster_name,
1374 'osd', 'crush', 'create-or-move', 'osd.' + id_,
1375 '1.0', 'host=localhost', 'root=default',
1376 ]
1377 )
1378
9f95a23c 1379 for remote, roles_for_host in daemons.remotes.items():
c07f9fc5
FG
1380 is_type_ = teuthology.is_type(type_, cluster_name)
1381 for role in roles_for_host:
1382 if not is_type_(role):
1383 continue
1384 _, _, id_ = teuthology.split_role(role)
224ce89b 1385
7c673cae
FG
1386 run_cmd = [
1387 'sudo',
1388 'adjust-ulimits',
1389 'ceph-coverage',
1390 coverage_dir,
1391 'daemon-helper',
1392 daemon_signal,
1393 ]
1394 run_cmd_tail = [
1395 'ceph-%s' % (type_),
1396 '-f',
1397 '--cluster', cluster_name,
1398 '-i', id_]
1399
1400 if type_ in config.get('cpu_profile', []):
1401 profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role)
1402 run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])
1403
f67539c2
TL
1404 vc = config.get('valgrind')
1405 if vc is not None:
7c673cae 1406 valgrind_args = None
f67539c2
TL
1407 if type_ in vc:
1408 valgrind_args = vc[type_]
1409 if role in vc:
1410 valgrind_args = vc[role]
1411 exit_on_first_error = vc.get('exit_on_first_error', True)
1412 run_cmd = get_valgrind_args(testdir, role, run_cmd, valgrind_args,
1413 exit_on_first_error=exit_on_first_error)
7c673cae
FG
1414
1415 run_cmd.extend(run_cmd_tail)
f67539c2
TL
1416 log_path = f'/var/log/ceph/{cluster_name}-{type_}.{id_}.log'
1417 create_log_cmd, run_cmd = \
1418 maybe_redirect_stderr(config, type_, run_cmd, log_path)
1419 if create_log_cmd:
1420 remote.sh(create_log_cmd)
7c673cae
FG
1421 # always register mgr; don't necessarily start
1422 ctx.daemons.register_daemon(
1423 remote, type_, id_,
1424 cluster=cluster_name,
1425 args=run_cmd,
1426 logger=log.getChild(role),
1427 stdin=run.PIPE,
1428 wait=False
1429 )
1430 if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
1431 role = cluster_name + '.' + type_
1432 ctx.daemons.get_daemon(type_, id_, cluster_name).restart()
1433
9f95a23c
TL
1434 # kludge: run any pre-manager commands
1435 if type_ == 'mon':
1436 for cmd in config.get('pre-mgr-commands', []):
1437 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1438 (remote,) = ctx.cluster.only(firstmon).remotes.keys()
1439 remote.run(args=cmd.split(' '))
1440
7c673cae
FG
1441 try:
1442 yield
1443 finally:
1444 teuthology.stop_daemons_of_type(ctx, type_, cluster_name)
1445
1446
1447def healthy(ctx, config):
1448 """
    Wait for all OSDs to be up, and for the ceph health status to be HEALTH_OK.
1450
1451 :param ctx: Context
1452 :param config: Configuration
1453 """
1454 config = config if isinstance(config, dict) else dict()
1455 cluster_name = config.get('cluster', 'ceph')
c07f9fc5
FG
1456 log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
1457 manager = ctx.managers[cluster_name]
1458 try:
d2e6a577
FG
1459 manager.wait_for_mgr_available(timeout=30)
1460 except (run.CommandFailedError, AssertionError) as e:
1461 log.info('ignoring mgr wait error, probably testing upgrade: %s', e)
c07f9fc5 1462
9f95a23c 1463 manager.wait_for_all_osds_up(timeout=300)
c07f9fc5
FG
1464
1465 try:
1466 manager.flush_all_pg_stats()
d2e6a577
FG
1467 except (run.CommandFailedError, Exception) as e:
1468 log.info('ignoring flush pg stats error, probably testing upgrade: %s', e)
c07f9fc5
FG
1469 manager.wait_for_clean()
1470
b32b8144
FG
1471 if config.get('wait-for-healthy', True):
1472 log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
9f95a23c 1473 manager.wait_until_healthy(timeout=300)
7c673cae
FG
1474
1475 if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
1476 # Some MDSs exist, wait for them to be healthy
1477 ceph_fs = Filesystem(ctx) # TODO: make Filesystem cluster-aware
1478 ceph_fs.wait_for_daemons(timeout=300)
1479
1480
7c673cae
FG
1481def wait_for_mon_quorum(ctx, config):
1482 """
    Check remote ceph status until all monitors are up.
1484
1485 :param ctx: Context
1486 :param config: Configuration
1487 """
1488 if isinstance(config, dict):
1489 mons = config['daemons']
1490 cluster_name = config.get('cluster', 'ceph')
1491 else:
1492 assert isinstance(config, list)
1493 mons = config
1494 cluster_name = 'ceph'
1495 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1496 (remote,) = ctx.cluster.only(firstmon).remotes.keys()
1497 with contextutil.safe_while(sleep=10, tries=60,
1498 action='wait for monitor quorum') as proceed:
1499 while proceed():
1500 quorum_status = remote.sh('sudo ceph quorum_status',
1501 logger=log.getChild('quorum_status'))
1502 j = json.loads(quorum_status)
7c673cae
FG
1503 q = j.get('quorum_names', [])
1504 log.debug('Quorum: %s', q)
1505 if sorted(q) == sorted(mons):
1506 break
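# Illustrative sketch (abridged output): 'ceph quorum_status' returns JSON along
# the lines of
#   {"election_epoch": 10, "quorum": [0, 1, 2], "quorum_names": ["a", "b", "c"]}
# and wait_for_mon_quorum() only compares sorted(quorum_names) against the
# sorted list of requested mon daemons.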
1507
1508
1509def created_pool(ctx, config):
1510 """
1511 Add new pools to the dictionary of pools that the ceph-manager
1512 knows about.
1513 """
1514 for new_pool in config:
1515 if new_pool not in ctx.managers['ceph'].pools:
9f95a23c 1516 ctx.managers['ceph'].pools[new_pool] = ctx.managers['ceph'].get_pool_int_property(
1517 new_pool, 'pg_num')
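# Illustrative sketch (hypothetical pool name): a task that creates a pool
# out-of-band can register it afterwards with
#   created_pool(ctx, ['mypool'])
# which records the pool's current pg_num in ctx.managers['ceph'].pools.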
1518
1519
11fdf7f2 1520@contextlib.contextmanager
9f95a23c 1521def suppress_mon_health_to_clog(ctx, config):
    """
    Set the option, and then restore it to its original value.

    Note: due to the way tasks are executed/nested, it is not suggested to
    use this method as a standalone task; otherwise, it is likely that it will
    restore the tweaked option at the /end/ of the 'tasks' block.
    """
9f95a23c 1529 if config.get('mon-health-to-clog', 'true') == 'false':
9f95a23c
TL
1530 cluster = config.get('cluster', 'ceph')
1531 manager = ctx.managers[cluster]
1532 manager.raw_cluster_command(
1533 'config', 'set', 'mon', 'mon_health_to_clog', 'false'
1534 )
1535 yield
1536 manager.raw_cluster_command(
1537 'config', 'rm', 'mon', 'mon_health_to_clog'
1538 )
11fdf7f2 1539 else:
9f95a23c 1540 yield
11fdf7f2 1541
7c673cae
FG
1542@contextlib.contextmanager
1543def restart(ctx, config):
1544 """
1545 restart ceph daemons
1546
1547 For example::
1548 tasks:
1549 - ceph.restart: [all]
1550
1551 For example::
1552 tasks:
1553 - ceph.restart: [osd.0, mon.1, mds.*]
1554
1555 or::
1556
1557 tasks:
1558 - ceph.restart:
1559 daemons: [osd.0, mon.1]
1560 wait-for-healthy: false
1561 wait-for-osds-up: true
1562
1563 :param ctx: Context
1564 :param config: Configuration
1565 """
1566 if config is None:
1567 config = {}
1568 elif isinstance(config, list):
1569 config = {'daemons': config}
1570
1571 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1572 clusters = set()
7c673cae 1573
9f95a23c 1574 with suppress_mon_health_to_clog(ctx, config):
11fdf7f2
TL
1575 for role in daemons:
1576 cluster, type_, id_ = teuthology.split_role(role)
9f95a23c
TL
1577 ctx.daemons.get_daemon(type_, id_, cluster).stop()
1578 if type_ == 'osd':
1579 ctx.managers[cluster].mark_down_osd(id_)
11fdf7f2
TL
1580 ctx.daemons.get_daemon(type_, id_, cluster).restart()
1581 clusters.add(cluster)
f67539c2 1582
7c673cae
FG
1583 if config.get('wait-for-healthy', True):
1584 for cluster in clusters:
1585 healthy(ctx=ctx, config=dict(cluster=cluster))
1586 if config.get('wait-for-osds-up', False):
1587 for cluster in clusters:
9f95a23c 1588 ctx.managers[cluster].wait_for_all_osds_up()
7c673cae
FG
1589 yield
1590
1591
1592@contextlib.contextmanager
1593def stop(ctx, config):
1594 """
1595 Stop ceph daemons
1596
1597 For example::
1598 tasks:
1599 - ceph.stop: [mds.*]
1600
1601 tasks:
1602 - ceph.stop: [osd.0, osd.2]
1603
1604 tasks:
1605 - ceph.stop:
1606 daemons: [osd.0, osd.2]
1607
1608 """
1609 if config is None:
1610 config = {}
1611 elif isinstance(config, list):
1612 config = {'daemons': config}
1613
1614 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
9f95a23c
TL
1615 clusters = set()
1616
7c673cae
FG
1617 for role in daemons:
1618 cluster, type_, id_ = teuthology.split_role(role)
1619 ctx.daemons.get_daemon(type_, id_, cluster).stop()
9f95a23c
TL
1620 clusters.add(cluster)
1621
1622
1623 for cluster in clusters:
1624 ctx.ceph[cluster].watchdog.stop()
1625 ctx.ceph[cluster].watchdog.join()
7c673cae
FG
1626
1627 yield
1628
1629
1630@contextlib.contextmanager
1631def wait_for_failure(ctx, config):
1632 """
1633 Wait for a failure of a ceph daemon
1634
1635 For example::
1636 tasks:
1637 - ceph.wait_for_failure: [mds.*]
1638
1639 tasks:
1640 - ceph.wait_for_failure: [osd.0, osd.2]
1641
1642 tasks:
1643 - ceph.wait_for_failure:
1644 daemons: [osd.0, osd.2]
1645
1646 """
1647 if config is None:
1648 config = {}
1649 elif isinstance(config, list):
1650 config = {'daemons': config}
1651
1652 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1653 for role in daemons:
1654 cluster, type_, id_ = teuthology.split_role(role)
1655 try:
1656 ctx.daemons.get_daemon(type_, id_, cluster).wait()
1657 except:
1658 log.info('Saw expected daemon failure. Continuing.')
1659 pass
1660 else:
1661 raise RuntimeError('daemon %s did not fail' % role)
1662
1663 yield
1664
1665
1666def validate_config(ctx, config):
1667 """
1668 Perform some simple validation on task configuration.
1669 Raises exceptions.ConfigError if an error is found.
1670 """
1671 # check for osds from multiple clusters on the same host
1672 for remote, roles_for_host in ctx.cluster.remotes.items():
1673 last_cluster = None
1674 last_role = None
1675 for role in roles_for_host:
1676 role_cluster, role_type, _ = teuthology.split_role(role)
1677 if role_type != 'osd':
1678 continue
1679 if last_cluster and last_cluster != role_cluster:
1680 msg = "Host should not have osds (%s and %s) from multiple clusters" % (
1681 last_role, role)
1682 raise exceptions.ConfigError(msg)
1683 last_cluster = role_cluster
1684 last_role = role
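# Illustrative sketch (hypothetical roles): a layout such as
#   roles:
#   - [ceph.osd.0, backup.osd.1, ceph.mon.a]
# raises exceptions.ConfigError, because one host would carry osds from two
# different clusters; colocating clients or mons from several clusters is fine.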
1685
1686
1687@contextlib.contextmanager
1688def task(ctx, config):
1689 """
1690 Set up and tear down a Ceph cluster.
1691
1692 For example::
1693
1694 tasks:
1695 - ceph:
1696 - interactive:
1697
1698 You can also specify what branch to run::
1699
1700 tasks:
1701 - ceph:
1702 branch: foo
1703
1704 Or a tag::
1705
1706 tasks:
1707 - ceph:
1708 tag: v0.42.13
1709
1710 Or a sha1::
1711
1712 tasks:
1713 - ceph:
1714 sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed
1715
1716 Or a local source dir::
1717
1718 tasks:
1719 - ceph:
1720 path: /home/sage/ceph
1721
1722 To capture code coverage data, use::
1723
1724 tasks:
1725 - ceph:
1726 coverage: true
1727
1728 To use btrfs, ext4, or xfs on the target's scratch disks, use::
1729
1730 tasks:
1731 - ceph:
1732 fs: xfs
1733 mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
1734 mount_options: [nobarrier, inode64]
1735
f91f0fd5
TL
1736 To change the cephfs's default max_mds (1), use::
1737
1738 tasks:
1739 - ceph:
1740 cephfs:
1741 max_mds: 2
1742
f67539c2
TL
1743 To change the max_mds of a specific filesystem, use::
1744
1745 tasks:
1746 - ceph:
1747 cephfs:
1748 max_mds: 2
1749 fs:
1750 - name: a
1751 max_mds: 3
1752 - name: b
1753
1754 In the above example, filesystem 'a' will have 'max_mds' 3,
    and filesystem 'b' will have 'max_mds' 2.
1756
f91f0fd5
TL
1757 To change the mdsmap's default session_timeout (60 seconds), use::
1758
1759 tasks:
1760 - ceph:
1761 cephfs:
1762 session_timeout: 300
1763
7c673cae
FG
1764 Note, this will cause the task to check the /scratch_devs file on each node
1765 for available devices. If no such file is found, /dev/sdb will be used.
1766
1767 To run some daemons under valgrind, include their names
1768 and the tool/args to use in a valgrind section::
1769
1770 tasks:
1771 - ceph:
1772 valgrind:
1773 mds.1: --tool=memcheck
1774 osd.1: [--tool=memcheck, --leak-check=no]
1775
1776 Those nodes which are using memcheck or valgrind will get
1777 checked for bad results.
1778
1779 To adjust or modify config options, use::
1780
1781 tasks:
1782 - ceph:
1783 conf:
1784 section:
1785 key: value
1786
1787 For example::
1788
1789 tasks:
1790 - ceph:
1791 conf:
1792 mds.0:
1793 some option: value
1794 other key: other value
1795 client.0:
1796 debug client: 10
1797 debug ms: 1
1798
1799 By default, the cluster log is checked for errors and warnings,
1800 and the run marked failed if any appear. You can ignore log
1801 entries by giving a list of egrep compatible regexes, i.e.:
1802
1803 tasks:
1804 - ceph:
cd265ab1 1805 log-ignorelist: ['foo.*bar', 'bad message']
7c673cae
FG
1806
1807 To run multiple ceph clusters, use multiple ceph tasks, and roles
1808 with a cluster name prefix, e.g. cluster1.client.0. Roles with no
1809 cluster use the default cluster name, 'ceph'. OSDs from separate
1810 clusters must be on separate hosts. Clients and non-osd daemons
1811 from multiple clusters may be colocated. For each cluster, add an
1812 instance of the ceph task with the cluster name specified, e.g.::
1813
1814 roles:
1815 - [mon.a, osd.0, osd.1]
1816 - [backup.mon.a, backup.osd.0, backup.osd.1]
1817 - [client.0, backup.client.0]
1818 tasks:
1819 - ceph:
1820 cluster: ceph
1821 - ceph:
1822 cluster: backup
1823
1824 :param ctx: Context
1825 :param config: Configuration
1826
1827 """
1828 if config is None:
1829 config = {}
1830 assert isinstance(config, dict), \
1831 "task ceph only supports a dictionary for configuration"
1832
1833 overrides = ctx.config.get('overrides', {})
1834 teuthology.deep_merge(config, overrides.get('ceph', {}))
1835
1836 first_ceph_cluster = False
1837 if not hasattr(ctx, 'daemons'):
1838 first_ceph_cluster = True
1839 ctx.daemons = DaemonGroup()
1840
1841 testdir = teuthology.get_testdir(ctx)
1842 if config.get('coverage'):
1843 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
1844 log.info('Creating coverage directory...')
1845 run.wait(
1846 ctx.cluster.run(
1847 args=[
1848 'install', '-d', '-m0755', '--',
1849 coverage_dir,
1850 ],
1851 wait=False,
1852 )
1853 )
1854
1855 if 'cluster' not in config:
1856 config['cluster'] = 'ceph'
1857
1858 validate_config(ctx, config)
1859
1860 subtasks = []
1861 if first_ceph_cluster:
1862 # these tasks handle general log setup and parsing on all hosts,
1863 # so they should only be run once
1864 subtasks = [
1865 lambda: ceph_log(ctx=ctx, config=None),
11fdf7f2 1866 lambda: ceph_crash(ctx=ctx, config=None),
7c673cae
FG
1867 lambda: valgrind_post(ctx=ctx, config=config),
1868 ]
1869
1870 subtasks += [
1871 lambda: cluster(ctx=ctx, config=dict(
1872 conf=config.get('conf', {}),
1873 fs=config.get('fs', 'xfs'),
1874 mkfs_options=config.get('mkfs_options', None),
1875 mount_options=config.get('mount_options', None),
7c673cae 1876 skip_mgr_daemons=config.get('skip_mgr_daemons', False),
cd265ab1 1877 log_ignorelist=config.get('log-ignorelist', []),
7c673cae
FG
1878 cpu_profile=set(config.get('cpu_profile', []),),
1879 cluster=config['cluster'],
11fdf7f2
TL
1880 mon_bind_msgr2=config.get('mon_bind_msgr2', True),
1881 mon_bind_addrvec=config.get('mon_bind_addrvec', True),
7c673cae
FG
1882 )),
1883 lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
1884 lambda: run_daemon(ctx=ctx, config=config, type_='mgr'),
1885 lambda: crush_setup(ctx=ctx, config=config),
1886 lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
a4b75251 1887 lambda: setup_manager(ctx=ctx, config=config),
224ce89b 1888 lambda: create_rbd_pool(ctx=ctx, config=config),
7c673cae 1889 lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
f91f0fd5 1890 lambda: cephfs_setup(ctx=ctx, config=config),
9f95a23c 1891 lambda: watchdog_setup(ctx=ctx, config=config),
7c673cae
FG
1892 ]
1893
1894 with contextutil.nested(*subtasks):
7c673cae
FG
1895 try:
1896 if config.get('wait-for-healthy', True):
1897 healthy(ctx=ctx, config=dict(cluster=config['cluster']))
1898
1899 yield
1900 finally:
11fdf7f2
TL
1901 # set pg_num_targets back to actual pg_num, so we don't have to
1902 # wait for pending merges (which can take a while!)
1903 ctx.managers[config['cluster']].stop_pg_num_changes()
1904
7c673cae 1905 if config.get('wait-for-scrub', True):
f67539c2
TL
1906 # wait for pgs to become active+clean in case any
1907 # recoveries were triggered since the last health check
1908 ctx.managers[config['cluster']].wait_for_clean()
7c673cae 1909 osd_scrub_pgs(ctx, config)
224ce89b
WB
1910
1911 # stop logging health to clog during shutdown, or else we generate
1912 # a bunch of scary messages unrelated to our actual run.
1913 firstmon = teuthology.get_first_mon(ctx, config, config['cluster'])
1914 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1915 mon0_remote.run(
1916 args=[
1917 'sudo',
1918 'ceph',
1919 '--cluster', config['cluster'],
9f95a23c
TL
1920 'config', 'set', 'global',
1921 'mon_health_to_clog', 'false',
1922 ],
1923 check_status=False,
224ce89b 1924 )