# Source: ceph.git / ceph/qa/tasks/ceph.py
# (mirrored via git.proxmox.com, commit 3654ffa271ee41bcc9ec8d60c13b423b8c17190b)
1 """
2 Ceph cluster task.
3
4 Handle the setup, starting, and clean-up of a Ceph cluster.
5 """
6 from io import BytesIO
7 from io import StringIO
8
9 import argparse
10 import configobj
11 import contextlib
12 import errno
13 import logging
14 import os
15 import json
16 import time
17 import gevent
18 import re
19 import socket
20
21 from paramiko import SSHException
22 from tasks.ceph_manager import CephManager, write_conf
23 from tarfile import ReadError
24 from tasks.cephfs.filesystem import Filesystem
25 from teuthology import misc as teuthology
26 from teuthology import contextutil
27 from teuthology import exceptions
28 from teuthology.orchestra import run
29 import tasks.ceph_client as cclient
30 from teuthology.orchestra.daemon import DaemonGroup
31 from tasks.daemonwatchdog import DaemonWatchdog
32
33 CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']
34 DATA_PATH = '/var/lib/ceph/{type_}/{cluster}-{id_}'
35
36 log = logging.getLogger(__name__)
37
38
def generate_caps(type_):
    """
    Yield ``ceph-authtool`` command-line arguments granting the default
    capabilities for a daemon of the given type.

    The output is a flat sequence of ``--cap <subsystem> <capability>``
    triples, suitable for appending to an argument list (see the keyring
    setup in :func:`cluster`).

    :param type_: daemon type; one of 'osd', 'mgr', 'mds' or 'client'
    :raises KeyError: if ``type_`` has no default capability set
    """
    defaults = dict(
        osd=dict(
            mon='allow *',
            mgr='allow *',
            osd='allow *',
        ),
        mgr=dict(
            mon='allow profile mgr',
            osd='allow *',
            mds='allow *',
        ),
        mds=dict(
            mon='allow *',
            mgr='allow *',
            osd='allow *',
            mds='allow',
        ),
        client=dict(
            mon='allow rw',
            mgr='allow r',
            osd='allow rwx',
            mds='allow',
        ),
    )
    for subsystem, capability in defaults[type_].items():
        yield '--cap'
        yield subsystem
        yield capability
73
74
@contextlib.contextmanager
def ceph_crash(ctx, config):
    """
    On teardown, pull crash dumps from /var/lib/ceph/crash on every
    remote into <archive>/remote/<host>/crash (when archiving is enabled).
    """
    try:
        yield
    finally:
        if ctx.archive is not None:
            log.info('Archiving crash dumps...')
            remote_dir = os.path.join(ctx.archive, 'remote')
            # Directories may already exist from an earlier archiving step.
            with contextlib.suppress(OSError):
                os.makedirs(remote_dir)
            for remote in ctx.cluster.remotes.keys():
                host_dir = os.path.join(remote_dir, remote.shortname)
                with contextlib.suppress(OSError):
                    os.makedirs(host_dir)
                # A node that produced no crash dir yields a tar ReadError;
                # that is fine, just skip it.
                with contextlib.suppress(ReadError):
                    teuthology.pull_directory(remote, '/var/lib/ceph/crash',
                                              os.path.join(host_dir, 'crash'))
102
103
@contextlib.contextmanager
def ceph_log(ctx, config):
    """
    Create /var/log/ceph log directory that is open to everyone.
    Add valgrind and profiling-logger directories.

    Optionally (ctx.config['log-rotate']) installs a logrotate config and
    runs logrotate periodically in a background greenlet for the duration
    of the run; on teardown, compresses and archives /var/log/ceph.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Making ceph log dir writeable by non-root...')
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'chmod',
                '777',
                '/var/log/ceph',
            ],
            wait=False,
        )
    )
    log.info('Disabling ceph logrotate...')
    # Remove the distro's logrotate stanza so it cannot race with ours.
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'rm', '-f', '--',
                '/etc/logrotate.d/ceph',
            ],
            wait=False,
        )
    )
    log.info('Creating extra log directories...')
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'install', '-d', '-m0777', '--',
                '/var/log/ceph/valgrind',
                '/var/log/ceph/profiling-logger',
            ],
            wait=False,
        )
    )

    class Rotater(object):
        # Background greenlet that periodically invokes logrotate on every
        # node.  NOTE(review): stop_event is a class attribute, so it is
        # shared by all Rotater instances; only one instance is created
        # per ceph_log invocation, so this works in practice.
        stop_event = gevent.event.Event()

        def invoke_logrotate(self):
            # 1) install ceph-test.conf in /etc/logrotate.d
            # 2) continuously loop over logrotate invocation with ceph-test.conf
            while not self.stop_event.is_set():
                self.stop_event.wait(timeout=30)
                try:
                    procs = ctx.cluster.run(
                        args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'],
                        wait=False,
                        stderr=StringIO()
                    )
                    run.wait(procs)
                except exceptions.ConnectionLostError as e:
                    # Some tests may power off nodes during test, in which
                    # case we will see connection errors that we should ignore.
                    log.debug("Missed logrotate, node '{0}' is offline".format(
                        e.node))
                except EOFError:
                    # Paramiko sometimes raises this when it fails to
                    # connect to a node during open_session. As with
                    # ConnectionLostError, we ignore this because nodes
                    # are allowed to get power cycled during tests.
                    log.debug("Missed logrotate, EOFError")
                except SSHException:
                    log.debug("Missed logrotate, SSHException")
                except run.CommandFailedError as e:
                    # logrotate occasionally fails while renaming its temp
                    # state file; treat that one failure mode as transient
                    # and re-raise anything else.
                    for p in procs:
                        if p.finished and p.exitstatus != 0:
                            err = p.stderr.getvalue()
                            if 'error: error renaming temp state file' in err:
                                log.info('ignoring transient state error: %s', e)
                            else:
                                raise
                except socket.error as e:
                    if e.errno in (errno.EHOSTUNREACH, errno.ECONNRESET):
                        log.debug("Missed logrotate, host unreachable")
                    else:
                        raise

        def begin(self):
            # Spawn the rotation loop; keep the greenlet so end() can join it.
            self.thread = gevent.spawn(self.invoke_logrotate)

        def end(self):
            # Signal the loop to stop and propagate any exception it raised.
            self.stop_event.set()
            self.thread.get()

    def write_rotate_conf(ctx, daemons):
        # Render the local logrotate.conf template once per daemon type
        # (rewinding the template between reads) and install the result
        # as /etc/logrotate.d/ceph-test.conf on every remote.
        testdir = teuthology.get_testdir(ctx)
        remote_logrotate_conf = '%s/logrotate.ceph-test.conf' % testdir
        rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf')
        with open(rotate_conf_path) as f:
            conf = ""
            for daemon, size in daemons.items():
                log.info('writing logrotate stanza for {}'.format(daemon))
                conf += f.read().format(daemon_type=daemon,
                                        max_size=size)
                f.seek(0, 0)

        for remote in ctx.cluster.remotes.keys():
            teuthology.write_file(remote=remote,
                                  path=remote_logrotate_conf,
                                  data=BytesIO(conf.encode())
                                  )
            # Move into place with root ownership and world-readable perms.
            remote.run(
                args=[
                    'sudo',
                    'mv',
                    remote_logrotate_conf,
                    '/etc/logrotate.d/ceph-test.conf',
                    run.Raw('&&'),
                    'sudo',
                    'chmod',
                    '0644',
                    '/etc/logrotate.d/ceph-test.conf',
                    run.Raw('&&'),
                    'sudo',
                    'chown',
                    'root.root',
                    '/etc/logrotate.d/ceph-test.conf'
                ]
            )
            # SELinux label expected for files under /etc.
            remote.chcon('/etc/logrotate.d/ceph-test.conf',
                         'system_u:object_r:etc_t:s0')

    if ctx.config.get('log-rotate'):
        daemons = ctx.config.get('log-rotate')
        log.info('Setting up log rotation with ' + str(daemons))
        write_rotate_conf(ctx, daemons)
        logrotater = Rotater()
        logrotater.begin()
    try:
        yield

    finally:
        if ctx.config.get('log-rotate'):
            log.info('Shutting down logrotate')
            logrotater.end()
            ctx.cluster.run(
                args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'
                      ]
            )
        # Archive logs unless archive-on-error is set and the run succeeded.
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            # and logs
            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph',
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            try:
                os.makedirs(path)
            except OSError:
                pass
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.shortname)
                try:
                    os.makedirs(sub)
                except OSError:
                    pass
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))
294
def assign_devs(roles, devs):
    """
    Pair each role with a device, positionally.

    :param roles: List of roles
    :param devs: Corresponding list of devices.
    :returns: Dictionary of devs indexed by roles.
    """
    return {role: dev for role, dev in zip(roles, devs)}
304
305
@contextlib.contextmanager
def valgrind_post(ctx, config):
    """
    After the tests run, look through all the valgrind logs. Exceptions are raised
    if textual errors occurred in the logs, or if valgrind exceptions were detected in
    the logs.

    :param ctx: Context
    :param config: Configuration
    """
    try:
        yield
    finally:
        lookup_procs = list()
        log.info('Checking for errors in any valgrind logs...')
        for remote in ctx.cluster.remotes.keys():
            # look at valgrind logs for each node
            # zgrep for <kind> tags in the valgrind XML output; run on all
            # nodes in parallel (wait=False) and collect results below.
            proc = remote.run(
                args="sudo zgrep '<kind>' /var/log/ceph/valgrind/* "
                     # include a second file so that we always get
                     # a filename prefix on the output
                     "/dev/null | sort | uniq",
                wait=False,
                check_status=False,
                stdout=StringIO(),
            )
            lookup_procs.append((proc, remote))

        valgrind_exception = None
        for (proc, remote) in lookup_procs:
            proc.wait()
            out = proc.stdout.getvalue()
            # Each line looks like "<path>:<kind tag>"; see the zgrep above.
            for line in out.split('\n'):
                if line == '':
                    continue
                try:
                    (file, kind) = line.split(':')
                except Exception:
                    log.error('failed to split line %s', line)
                    raise
                log.debug('file %s kind %s', file, kind)
                # "Lost" (leak) reports from the mds are tolerated; every
                # other report counts as a valgrind issue.
                if (file.find('mds') >= 0) and kind.find('Lost') > 0:
                    continue
                log.error('saw valgrind issue %s in %s', kind, file)
                valgrind_exception = Exception('saw valgrind issues')

        if config.get('expect_valgrind_errors'):
            # Negative test: the job *wants* valgrind to complain.
            if not valgrind_exception:
                raise Exception('expected valgrind issues and found none')
        else:
            if valgrind_exception:
                raise valgrind_exception
358
359
@contextlib.contextmanager
def crush_setup(ctx, config):
    """
    Apply the CRUSH tunables profile named by config['crush_tunables']
    ('default' if unset) via the first monitor of the cluster.
    """
    cluster = config['cluster']
    mon_role = teuthology.get_first_mon(ctx, config, cluster)
    (remote,) = ctx.cluster.only(mon_role).remotes.keys()
    profile = config.get('crush_tunables', 'default')
    log.info('Setting crush tunables to %s', profile)
    cmd = ['sudo', 'ceph', '--cluster', cluster,
           'osd', 'crush', 'tunables', profile]
    remote.run(args=cmd)
    yield
372
373
@contextlib.contextmanager
def create_rbd_pool(ctx, config):
    """
    Wait for all OSDs to come up, then create the 'rbd' pool and enable
    the rbd application on it (skipped if config['create_rbd_pool'] is
    false).
    """
    cluster = config['cluster']
    mon_role = teuthology.get_first_mon(ctx, config, cluster)
    (remote,) = ctx.cluster.only(mon_role).remotes.keys()
    log.info('Waiting for OSDs to come up')
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=remote,
        ceph_cluster=cluster,
    )
    if config.get('create_rbd_pool', True):
        log.info('Creating RBD pool')
        base = ['sudo', 'ceph', '--cluster', cluster]
        remote.run(args=base + ['osd', 'pool', 'create', 'rbd', '8'])
        # Application enable may fail on very old clusters; tolerate it.
        remote.run(
            args=base + ['osd', 'pool', 'application', 'enable',
                         'rbd', 'rbd', '--yes-i-really-mean-it'],
            check_status=False)
    yield
399
@contextlib.contextmanager
def cephfs_setup(ctx, config):
    """
    If any MDS roles exist in the cluster, create a 'cephfs' filesystem
    for them to use.  Must run after the mon cluster is up and running.

    Fix: the original computed first_mon/mon_remote and never used them;
    those dead locals are removed.
    """
    cluster_name = config['cluster']

    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because requires mon cluster to be up and running
    if mdss.remotes:
        log.info('Setting up CephFS filesystem...')

        Filesystem(ctx, fs_config=config.get('cephfs', None), name='cephfs',
                   create=True, ec_profile=config.get('cephfs_ec_profile', None))

    yield
416
@contextlib.contextmanager
def watchdog_setup(ctx, config):
    """Attach an empty thrasher list and a running DaemonWatchdog to the cluster."""
    cluster = ctx.ceph[config['cluster']]
    cluster.thrashers = []
    cluster.watchdog = DaemonWatchdog(ctx, config, cluster.thrashers)
    cluster.watchdog.start()
    yield
423
def get_mons(roles, ips, cluster_name,
             mon_bind_msgr2=False,
             mon_bind_addrvec=False):
    """
    Map each mon role to the address string it should use.

    Per host, v1 ports start at 6789 and v2 ports at 3300, incrementing
    for each additional mon on the same IP.  The address format depends
    on the messenger flags:

      * mon_bind_msgr2: first mon on a host gets the bare IP (default
        ports implied); subsequent mons get a '[v2:...,v1:...]' addrvec
        (which requires mon_bind_addrvec).
      * mon_bind_addrvec only: '[v1:ip:port]'.
      * neither: plain 'ip:port'.

    Fix: the loop variable previously shadowed the ``roles`` parameter
    (``for idx, roles in enumerate(roles)``); renamed to roles_for_host
    and hoisted the repeated ``ips[idx]`` lookup.

    :param roles: list of role lists, one per remote
    :param ips: list of IP addresses, parallel to ``roles``
    :param cluster_name: only mons of this cluster are considered
    :returns: dict mapping mon role -> address string
    """
    mons = {}
    v1_ports = {}
    v2_ports = {}
    is_mon = teuthology.is_type('mon', cluster_name)
    for idx, roles_for_host in enumerate(roles):
        ip = ips[idx]
        for role in roles_for_host:
            if not is_mon(role):
                continue
            if ip not in v1_ports:
                v1_ports[ip] = 6789
            else:
                v1_ports[ip] += 1
            if mon_bind_msgr2:
                if ip not in v2_ports:
                    # First mon on this host: bare IP, default ports implied.
                    v2_ports[ip] = 3300
                    addr = '{ip}'.format(ip=ip)
                else:
                    # Additional mons on a host need explicit ports, which
                    # is only expressible as an addrvec.
                    assert mon_bind_addrvec
                    v2_ports[ip] += 1
                    addr = '[v2:{ip}:{port2},v1:{ip}:{port1}]'.format(
                        ip=ip,
                        port2=v2_ports[ip],
                        port1=v1_ports[ip],
                    )
            elif mon_bind_addrvec:
                addr = '[v1:{ip}:{port}]'.format(
                    ip=ip,
                    port=v1_ports[ip],
                )
            else:
                addr = '{ip}:{port}'.format(
                    ip=ip,
                    port=v1_ports[ip],
                )
            mons[role] = addr
    assert mons
    return mons
467
def skeleton_config(ctx, roles, ips, mons, cluster='ceph'):
    """
    Returns a ConfigObj that is prefilled with a skeleton config.

    Use conf[section][key]=value or conf.merge to change it.

    Use conf.write to write it out, override .filename first if you want.
    """
    template = os.path.join(os.path.dirname(__file__), 'ceph.conf.template')
    conf = configobj.ConfigObj(template, file_error=True)
    mon_hosts = []
    for role, addr in mons.items():
        # Only mons belonging to this cluster contribute to 'mon host'.
        mon_cluster, _, _ = teuthology.split_role(role)
        if mon_cluster != cluster:
            continue
        conf.setdefault(teuthology.ceph_role(role), {})
        mon_hosts.append(addr)
    conf.setdefault('global', {})
    conf['global']['mon host'] = ','.join(mon_hosts)
    # Give every mds (including standbys) its own config section.
    is_mds = teuthology.is_type('mds', cluster)
    for role_list in roles:
        for role in role_list:
            if is_mds(role):
                conf.setdefault(teuthology.ceph_role(role), {})
    return conf
496
def create_simple_monmap(ctx, remote, conf, mons,
                         path=None,
                         mon_bind_addrvec=False):
    """
    Writes a simple monmap based on current ceph.conf into path, or
    <testdir>/monmap by default.

    Assumes ceph_conf is up to date.

    Assumes mon sections are named "mon.*", with the dot.

    :return the FSID (as a string) of the newly created monmap
    """
    addresses = list(mons.items())
    assert addresses, "There are no monitors in config!"
    log.debug('Ceph mon addresses: %s', addresses)

    testdir = teuthology.get_testdir(ctx)
    args = [
        'adjust-ulimits',
        'ceph-coverage',
        '{tdir}/archive/coverage'.format(tdir=testdir),
        'monmaptool',
        '--create',
        '--clobber',
    ]
    if mon_bind_addrvec:
        args.append('--enable-all-features')
    for role, addr in addresses:
        _, _, mon_id = teuthology.split_role(role)
        # Addrvec-style addresses (containing ',', 'v' or ':') must be
        # added with --addv; plain ip:port entries use --add.
        if mon_bind_addrvec and (',' in addr or 'v' in addr or ':' in addr):
            args.extend(('--addv', mon_id, addr))
        else:
            args.extend(('--add', mon_id, addr))
    if not path:
        path = '{tdir}/monmap'.format(tdir=testdir)
    args.extend(['--print', path])

    monmap_output = remote.sh(args)
    # monmaptool prints "generated fsid <uuid>"; pull the uuid out.
    fsid = re.search("generated fsid (.+)$",
                     monmap_output, re.MULTILINE).group(1)
    return fsid
543
@contextlib.contextmanager
def cluster(ctx, config):
    """
    Handle the creation and removal of a ceph cluster.

    On startup:
        Create directories needed for the cluster.
        Create remote journals for all osds.
        Create and set keyring.
        Copy the monmap to the test systems.
        Setup mon nodes.
        Setup mds nodes.
        Mkfs osd nodes.
        Add keyring information to monmaps
        Mkfs mon nodes.

    On exit:
        If errors occurred, extract a failure message and store in ctx.summary.
        Unmount all test files and temporary journaling files.
        Save the monitor information and archive all ceph logs.
        Cleanup the keyring setup, and remove all monitor map and data files left over.

    :param ctx: Context
    :param config: Configuration
    """
    if ctx.config.get('use_existing_cluster', False) is True:
        log.info("'use_existing_cluster' is true; skipping cluster creation")
        yield
        # Fix: without this return, the generator would resume here at
        # teardown, run the whole cluster-creation body below, and reach
        # the second yield -- which contextlib.contextmanager reports as
        # "generator didn't stop" (RuntimeError).
        return

    testdir = teuthology.get_testdir(ctx)
    cluster_name = config['cluster']
    data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
    log.info('Creating ceph cluster %s...', cluster_name)
    log.info('config %s', config)
    log.info('ctx.config %s', ctx.config)
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                data_dir,
            ],
            wait=False,
        )
    )

    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'install', '-d', '-m0777', '--', '/var/run/ceph',
            ],
            wait=False,
        )
    )

    # Map each osd host to its scratch devices; every osd role must end up
    # with a device or we abort early.
    devs_to_clean = {}
    remote_to_roles_to_devs = {}
    osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
    for remote, roles_for_host in osds.remotes.items():
        devs = teuthology.get_scratch_devices(remote)
        roles_to_devs = assign_devs(
            teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), devs
        )
        devs_to_clean[remote] = []
        log.info('osd dev map: {}'.format(roles_to_devs))
        assert roles_to_devs, \
            "remote {} has osd roles, but no osd devices were specified!".format(remote.hostname)
        remote_to_roles_to_devs[remote] = roles_to_devs
        log.info("remote_to_roles_to_devs: {}".format(remote_to_roles_to_devs))
    for osd_role, dev_name in remote_to_roles_to_devs.items():
        assert dev_name, "{} has no associated device!".format(osd_role)

    log.info('Generating config...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [role_list for (remote, role_list) in remotes_and_roles]
    ips = [host for (host, port) in
           (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
    mons = get_mons(
        roles, ips, cluster_name,
        mon_bind_msgr2=config.get('mon_bind_msgr2'),
        mon_bind_addrvec=config.get('mon_bind_addrvec'),
    )
    conf = skeleton_config(
        ctx, roles=roles, ips=ips, mons=mons, cluster=cluster_name,
    )
    # Overlay any per-section overrides from the task config.
    for section, keys in config['conf'].items():
        for key, value in keys.items():
            log.info("[%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    if not hasattr(ctx, 'ceph'):
        ctx.ceph = {}
    ctx.ceph[cluster_name] = argparse.Namespace()
    ctx.ceph[cluster_name].conf = conf
    ctx.ceph[cluster_name].mons = mons

    default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
    keyring_path = config.get('keyring_path', default_keyring)

    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)

    log.info('Setting up %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--create-keyring',
            keyring_path,
        ],
    )
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--gen-key',
            '--name=mon.',
            keyring_path,
        ],
    )
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'chmod',
            '0644',
            keyring_path,
        ],
    )
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
                                                   cluster=cluster_name)
    fsid = create_simple_monmap(
        ctx,
        remote=mon0_remote,
        conf=conf,
        mons=mons,
        path=monmap_path,
        mon_bind_addrvec=config.get('mon_bind_addrvec'),
    )
    if 'global' not in conf:
        conf['global'] = {}
    conf['global']['fsid'] = fsid

    default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
    conf_path = config.get('conf_path', default_conf_path)
    log.info('Writing %s for FSID %s...' % (conf_path, fsid))
    write_conf(ctx, conf_path, cluster_name)

    log.info('Creating admin key on %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--gen-key',
            '--name=client.admin',
            '--cap', 'mon', 'allow *',
            '--cap', 'osd', 'allow *',
            '--cap', 'mds', 'allow *',
            '--cap', 'mgr', 'allow *',
            keyring_path,
        ],
    )

    log.info('Copying monmap to all nodes...')
    keyring = teuthology.get_file(
        remote=mon0_remote,
        path=keyring_path,
    )
    monmap = teuthology.get_file(
        remote=mon0_remote,
        path=monmap_path,
    )

    for rem in ctx.cluster.remotes.keys():
        # copy mon key and initial monmap
        log.info('Sending monmap to node {remote}'.format(remote=rem))
        teuthology.sudo_write_file(
            remote=rem,
            path=keyring_path,
            data=keyring,
            perms='0644'
        )
        teuthology.write_file(
            remote=rem,
            path=monmap_path,
            data=monmap,
        )

    log.info('Setting up mon nodes...')
    mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))

    if not config.get('skip_mgr_daemons', False):
        log.info('Setting up mgr nodes...')
        mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name))
        for remote, roles_for_host in mgrs.remotes.items():
            for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr',
                                                         cluster_name):
                _, _, id_ = teuthology.split_role(role)
                mgr_dir = DATA_PATH.format(
                    type_='mgr', cluster=cluster_name, id_=id_)
                remote.run(
                    args=[
                        'sudo',
                        'mkdir',
                        '-p',
                        mgr_dir,
                        run.Raw('&&'),
                        'sudo',
                        'adjust-ulimits',
                        'ceph-coverage',
                        coverage_dir,
                        'ceph-authtool',
                        '--create-keyring',
                        '--gen-key',
                        '--name=mgr.{id}'.format(id=id_),
                        mgr_dir + '/keyring',
                    ],
                )

    log.info('Setting up mds nodes...')
    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
    for remote, roles_for_host in mdss.remotes.items():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mds_dir = DATA_PATH.format(
                type_='mds', cluster=cluster_name, id_=id_)
            remote.run(
                args=[
                    'sudo',
                    'mkdir',
                    '-p',
                    mds_dir,
                    run.Raw('&&'),
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    '--name=mds.{id}'.format(id=id_),
                    mds_dir + '/keyring',
                ],
            )
            remote.run(args=[
                'sudo', 'chown', '-R', 'ceph:ceph', mds_dir
            ])

    cclient.create_keyring(ctx, cluster_name)
    log.info('Running mkfs on osd nodes...')

    if not hasattr(ctx, 'disk_config'):
        ctx.disk_config = argparse.Namespace()
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
        ctx.disk_config.remote_to_roles_to_dev = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
        ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
        ctx.disk_config.remote_to_roles_to_dev_fstype = {}

    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)

    log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
    for remote, roles_for_host in osds.remotes.items():
        roles_to_devs = remote_to_roles_to_devs[remote]

        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mnt_point = DATA_PATH.format(
                type_='osd', cluster=cluster_name, id_=id_)
            remote.run(
                args=[
                    'sudo',
                    'mkdir',
                    '-p',
                    mnt_point,
                ])
            log.info('roles_to_devs: {}'.format(roles_to_devs))
            log.info('role: {}'.format(role))
            if roles_to_devs.get(role):
                dev = roles_to_devs[role]
                fs = config.get('fs')
                package = None
                mkfs_options = config.get('mkfs_options')
                mount_options = config.get('mount_options')
                if fs == 'btrfs':
                    # package = 'btrfs-tools'
                    if mount_options is None:
                        mount_options = ['noatime', 'user_subvol_rm_allowed']
                    if mkfs_options is None:
                        mkfs_options = ['-m', 'single',
                                        '-l', '32768',
                                        '-n', '32768']
                if fs == 'xfs':
                    # package = 'xfsprogs'
                    if mount_options is None:
                        mount_options = ['noatime']
                    if mkfs_options is None:
                        mkfs_options = ['-f', '-i', 'size=2048']
                if fs == 'ext4' or fs == 'ext3':
                    if mount_options is None:
                        mount_options = ['noatime', 'user_xattr']

                if mount_options is None:
                    mount_options = []
                if mkfs_options is None:
                    mkfs_options = []
                mkfs = ['mkfs.%s' % fs] + mkfs_options
                log.info('%s on %s on %s' % (mkfs, dev, remote))
                if package is not None:
                    remote.sh('sudo apt-get install -y %s' % package)

                try:
                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
                except run.CommandFailedError:
                    # Newer btfs-tools doesn't prompt for overwrite, use -f
                    # Fix: check mkfs_options (what '-f' is appended to),
                    # not mount_options, before adding the flag.
                    if '-f' not in mkfs_options:
                        mkfs_options.append('-f')
                    mkfs = ['mkfs.%s' % fs] + mkfs_options
                    log.info('%s on %s on %s' % (mkfs, dev, remote))
                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])

                log.info('mount %s on %s -o %s' % (dev, remote,
                                                   ','.join(mount_options)))
                remote.run(
                    args=[
                        'sudo',
                        'mount',
                        '-t', fs,
                        '-o', ','.join(mount_options),
                        dev,
                        mnt_point,
                    ]
                )
                remote.run(
                    args=[
                        'sudo', '/sbin/restorecon', mnt_point,
                    ],
                    check_status=False,
                )
                # Remember what we mounted and how, for teardown and for
                # other tasks that need to remount.
                if remote not in ctx.disk_config.remote_to_roles_to_dev_mount_options:
                    ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
                if remote not in ctx.disk_config.remote_to_roles_to_dev_fstype:
                    ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
                devs_to_clean[remote].append(mnt_point)

        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
            _, _, id_ = teuthology.split_role(role)
            try:
                remote.run(
                    args=[
                        'sudo',
                        'MALLOC_CHECK_=3',
                        'adjust-ulimits',
                        'ceph-coverage',
                        coverage_dir,
                        'ceph-osd',
                        '--no-mon-config',
                        '--cluster',
                        cluster_name,
                        '--mkfs',
                        '--mkkey',
                        '-i', id_,
                        '--monmap', monmap_path,
                    ],
                )
            except run.CommandFailedError:
                # try without --no-mon-config.. this may be an upgrade test
                remote.run(
                    args=[
                        'sudo',
                        'MALLOC_CHECK_=3',
                        'adjust-ulimits',
                        'ceph-coverage',
                        coverage_dir,
                        'ceph-osd',
                        '--cluster',
                        cluster_name,
                        '--mkfs',
                        '--mkkey',
                        '-i', id_,
                        '--monmap', monmap_path,
                    ],
                )
            mnt_point = DATA_PATH.format(
                type_='osd', cluster=cluster_name, id_=id_)
            remote.run(args=[
                'sudo', 'chown', '-R', 'ceph:ceph', mnt_point
            ])

    log.info('Reading keys from all nodes...')
    keys_fp = BytesIO()
    keys = []
    for remote, roles_for_host in ctx.cluster.remotes.items():
        for type_ in ['mgr', 'mds', 'osd']:
            if type_ == 'mgr' and config.get('skip_mgr_daemons', False):
                continue
            for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
                _, _, id_ = teuthology.split_role(role)
                data = teuthology.get_file(
                    remote=remote,
                    path=os.path.join(
                        DATA_PATH.format(
                            type_=type_, id_=id_, cluster=cluster_name),
                        'keyring',
                    ),
                    sudo=True,
                )
                keys.append((type_, id_, data))
                keys_fp.write(data)
    for remote, roles_for_host in ctx.cluster.remotes.items():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', cluster_name):
            _, _, id_ = teuthology.split_role(role)
            data = teuthology.get_file(
                remote=remote,
                path='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_, cluster=cluster_name)
            )
            keys.append(('client', id_, data))
            keys_fp.write(data)

    log.info('Adding keys to all mons...')
    writes = mons.run(
        args=[
            'sudo', 'tee', '-a',
            keyring_path,
        ],
        stdin=run.PIPE,
        wait=False,
        stdout=BytesIO(),
    )
    keys_fp.seek(0)
    teuthology.feed_many_stdins_and_close(keys_fp, writes)
    run.wait(writes)
    # Register each key with its default caps (see generate_caps).
    for type_, id_, data in keys:
        run.wait(
            mons.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    keyring_path,
                    '--name={type}.{id}'.format(
                        type=type_,
                        id=id_,
                    ),
                ] + list(generate_caps(type_)),
                wait=False,
            ),
        )

    log.info('Running mkfs on mon nodes...')
    for remote, roles_for_host in mons.remotes.items():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon', cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mnt_point = DATA_PATH.format(
                type_='mon', id_=id_, cluster=cluster_name)
            remote.run(
                args=[
                    'sudo',
                    'mkdir',
                    '-p',
                    mnt_point,
                ],
            )
            remote.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-mon',
                    '--cluster', cluster_name,
                    '--mkfs',
                    '-i', id_,
                    '--monmap', monmap_path,
                    '--keyring', keyring_path,
                ],
            )
            remote.run(args=[
                'sudo', 'chown', '-R', 'ceph:ceph', mnt_point
            ])

    run.wait(
        mons.run(
            args=[
                'rm',
                '--',
                monmap_path,
            ],
            wait=False,
        ),
    )

    try:
        yield
    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise
    finally:
        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()

        log.info('Checking cluster log for badness...')

        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurrence of the pattern specified in the Ceph log,
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = [
                'sudo',
                'egrep', pattern,
                '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
            ]
            for exclude in excludes:
                args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                run.Raw('|'), 'head', '-n', '1',
            ])
            stdout = mon0_remote.sh(args)
            return stdout or None

        if first_in_ceph_log(r'\[ERR\]|\[WRN\]|\[SEC\]',
                             config['log_whitelist']) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in [r'\[SEC\]', r'\[ERR\]', r'\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log_whitelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
                            )
                        break

        for remote, dirs in devs_to_clean.items():
            for dir_ in dirs:
                log.info('Unmounting %s on %s' % (dir_, remote))
                try:
                    remote.run(
                        args=[
                            'sync',
                            run.Raw('&&'),
                            'sudo',
                            'umount',
                            '-f',
                            dir_
                        ]
                    )
                except Exception as e:
                    # Dump who is holding the mount before failing.
                    remote.run(args=[
                        'sudo',
                        run.Raw('PATH=/usr/sbin:$PATH'),
                        'lsof',
                        run.Raw(';'),
                        'ps', 'auxf',
                    ])
                    raise e

        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):

            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            try:
                os.makedirs(path)
            except OSError as e:
                if e.errno == errno.EEXIST:
                    pass
                else:
                    raise
            for remote, roles in mons.remotes.items():
                for role in roles:
                    is_mon = teuthology.is_type('mon', cluster_name)
                    if is_mon(role):
                        _, _, id_ = teuthology.split_role(role)
                        mon_dir = DATA_PATH.format(
                            type_='mon', id_=id_, cluster=cluster_name)
                        teuthology.pull_directory_tarball(
                            remote,
                            mon_dir,
                            path + '/' + role + '.tgz')

        log.info('Cleaning ceph cluster...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'rm',
                    '-rf',
                    '--',
                    conf_path,
                    keyring_path,
                    data_dir,
                    monmap_path,
                    run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
                ],
                wait=False,
            ),
        )
1168
1169
def osd_scrub_pgs(ctx, config):
    """
    Scrub pgs when we exit.

    First make sure all pgs are active and clean.
    Next scrub all osds.
    Then periodically check until all pgs have scrub time stamps that
    indicate the last scrub completed.  Time out if no progress is made
    here after two minutes.

    :param ctx: Context
    :param config: Configuration; must contain a 'cluster' key
    :raises RuntimeError: if the pgs never go active+clean, or if
                          scrubbing stops making progress
    """
    retries = 40
    delays = 20
    cluster_name = config['cluster']
    manager = ctx.managers[cluster_name]
    # Phase 1: wait until every pg is active+clean and no pool has a
    # pending pg split/merge.
    for _ in range(retries):
        stats = manager.get_pg_stats()
        unclean = [stat['pgid'] for stat in stats
                   if 'active+clean' not in stat['state']]
        split_merge = []
        osd_dump = manager.get_osd_dump_json()
        try:
            split_merge = [i['pool_name'] for i in osd_dump['pools']
                           if i['pg_num'] != i['pg_num_target']]
        except KeyError:
            # we don't support pg_num_target before nautilus
            pass
        if not unclean and not split_merge:
            break
        log.info(
            "Waiting for all PGs to be active+clean and split+merged, waiting on %s to go clean and/or %s to split/merge" % (unclean, split_merge))
        time.sleep(delays)
    else:
        # retries exhausted without breaking out of the loop
        raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
    check_time_now = time.localtime()
    time.sleep(1)
    # Phase 2: request a deep scrub from every osd in this cluster.
    all_roles = teuthology.all_roles(ctx.cluster)
    for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name):
        log.info("Scrubbing {osd}".format(osd=role))
        _, _, id_ = teuthology.split_role(role)
        # allow this to fail; in certain cases the OSD might not be up
        # at this point. we will catch all pgs below.
        try:
            manager.raw_cluster_cmd('tell', 'osd.' + id_, 'config', 'set',
                                    'osd_debug_deep_scrub_sleep', '0')
            manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
        except run.CommandFailedError:
            pass
    # Phase 3: poll until every pg reports a last_scrub_stamp newer than
    # check_time_now; give up if the count of freshly-scrubbed pgs stops
    # growing for too many rounds.
    prev_good = 0
    gap_cnt = 0
    loop = True
    while loop:
        stats = manager.get_pg_stats()
        timez = [(stat['pgid'], stat['last_scrub_stamp']) for stat in stats]
        loop = False
        thiscnt = 0
        re_scrub = []
        for (pgid, tmval) in timez:
            # drop the fractional seconds from the stamp and parse the rest
            t = tmval[0:tmval.find('.')].replace(' ', 'T')
            pgtm = time.strptime(t, '%Y-%m-%dT%H:%M:%S')
            if pgtm > check_time_now:
                thiscnt += 1
            else:
                log.info('pgid %s last_scrub_stamp %s %s <= %s', pgid, tmval, pgtm, check_time_now)
                loop = True
                re_scrub.append(pgid)
        if thiscnt > prev_good:
            prev_good = thiscnt
            gap_cnt = 0
        else:
            gap_cnt += 1
            if gap_cnt % 6 == 0:
                for pgid in re_scrub:
                    # re-request scrub every so often in case the earlier
                    # request was missed.  do not do it every time because
                    # the scrub may be in progress or not reported yet and
                    # we will starve progress.
                    manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
            if gap_cnt > retries:
                raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
        if loop:
            log.info('Still waiting for all pgs to be scrubbed.')
            time.sleep(delays)
1252
1253
@contextlib.contextmanager
def run_daemon(ctx, config, type_):
    """
    Run daemons for a role type.  Handle the startup and termination of a daemon.
    On startup -- set coverages, cpu_profile, valgrind values for all remotes,
    and a max_mds value for one mds.
    On cleanup -- Stop all existing daemons of this type.

    :param ctx: Context
    :param config: Configuration
    :param type_: Role type ('mon', 'mgr', 'osd', 'mds', ...)
    """
    cluster_name = config['cluster']
    log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
    testdir = teuthology.get_testdir(ctx)
    daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name))

    # check whether any daemons of this type are configured
    if daemons is None:
        # nothing to do -- but a @contextmanager must still yield exactly
        # once; returning before the yield would raise RuntimeError on entry
        yield
        return
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    # daemon-helper sends this signal on teardown; valgrind/coverage need a
    # clean 'term' so they can flush their results
    daemon_signal = 'kill'
    if config.get('coverage') or config.get('valgrind') is not None:
        daemon_signal = 'term'

    # create osds in order. (this only matters for pre-luminous, which might
    # be jewel/hammer, which doesn't take an id_ argument to legacy 'osd create').
    osd_uuids = {}
    for remote, roles_for_host in daemons.remotes.items():
        is_type_ = teuthology.is_type(type_, cluster_name)
        for role in roles_for_host:
            if not is_type_(role):
                continue
            _, _, id_ = teuthology.split_role(role)

            if type_ == 'osd':
                datadir = '/var/lib/ceph/osd/{cluster}-{id}'.format(
                    cluster=cluster_name, id=id_)
                # fsid file was written when the osd data dir was prepared
                osd_uuid = teuthology.get_file(
                    remote=remote,
                    path=datadir + '/fsid',
                    sudo=True,
                ).decode().strip()
                osd_uuids[id_] = osd_uuid
    # register the osds with the cluster in ascending id order
    # (note: uses the last `remote` from the loop above)
    for osd_id in range(len(osd_uuids)):
        id_ = str(osd_id)
        osd_uuid = osd_uuids.get(id_)
        try:
            remote.run(
                args=[
                    'sudo', 'ceph', '--cluster', cluster_name,
                    'osd', 'new', osd_uuid, id_,
                ]
            )
        except Exception:
            # narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
            # propagate; on failure fall back to pre-luminous (jewel)
            remote.run(
                args=[
                    'sudo', 'ceph', '--cluster', cluster_name,
                    'osd', 'create', osd_uuid,
                ]
            )
        if config.get('add_osds_to_crush'):
            remote.run(
                args=[
                    'sudo', 'ceph', '--cluster', cluster_name,
                    'osd', 'crush', 'create-or-move', 'osd.' + id_,
                    '1.0', 'host=localhost', 'root=default',
                ]
            )

    # build and register the daemon command line for every matching role
    for remote, roles_for_host in daemons.remotes.items():
        is_type_ = teuthology.is_type(type_, cluster_name)
        for role in roles_for_host:
            if not is_type_(role):
                continue
            _, _, id_ = teuthology.split_role(role)

            run_cmd = [
                'sudo',
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'daemon-helper',
                daemon_signal,
            ]
            run_cmd_tail = [
                'ceph-%s' % (type_),
                '-f',
                '--cluster', cluster_name,
                '-i', id_]

            if type_ in config.get('cpu_profile', []):
                profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role)
                run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])

            if config.get('valgrind') is not None:
                valgrind_args = None
                # a role-specific entry overrides the type-wide entry
                if type_ in config['valgrind']:
                    valgrind_args = config['valgrind'][type_]
                if role in config['valgrind']:
                    valgrind_args = config['valgrind'][role]
                run_cmd = teuthology.get_valgrind_args(testdir, role,
                                                      run_cmd,
                                                      valgrind_args)

            run_cmd.extend(run_cmd_tail)

            # always register mgr; don't necessarily start
            ctx.daemons.register_daemon(
                remote, type_, id_,
                cluster=cluster_name,
                args=run_cmd,
                logger=log.getChild(role),
                stdin=run.PIPE,
                wait=False
            )
            if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
                # (removed a dead 'role = cluster_name + "." + type_'
                # assignment here -- the value was never used)
                ctx.daemons.get_daemon(type_, id_, cluster_name).restart()

    # kludge: run any pre-manager commands
    if type_ == 'mon':
        for cmd in config.get('pre-mgr-commands', []):
            firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
            (remote,) = ctx.cluster.only(firstmon).remotes.keys()
            remote.run(args=cmd.split(' '))

    try:
        yield
    finally:
        teuthology.stop_daemons_of_type(ctx, type_, cluster_name)
1388
1389
def healthy(ctx, config):
    """
    Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK.

    :param ctx: Context
    :param config: Configuration (dict; anything else is treated as empty)
    """
    config = config if isinstance(config, dict) else dict()
    cluster_name = config.get('cluster', 'ceph')
    log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
    manager = ctx.managers[cluster_name]
    try:
        manager.wait_for_mgr_available(timeout=30)
    except (run.CommandFailedError, AssertionError) as e:
        log.info('ignoring mgr wait error, probably testing upgrade: %s', e)

    manager.wait_for_all_osds_up(timeout=300)

    try:
        manager.flush_all_pg_stats()
    except Exception as e:
        # was 'except (run.CommandFailedError, Exception)' -- redundant,
        # since Exception already covers CommandFailedError; best effort
        log.info('ignoring flush pg stats error, probably testing upgrade: %s', e)
    manager.wait_for_clean()

    if config.get('wait-for-healthy', True):
        log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
        manager.wait_until_healthy(timeout=300)

    if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
        # Some MDSs exist, wait for them to be healthy
        ceph_fs = Filesystem(ctx)  # TODO: make Filesystem cluster-aware
        ceph_fs.wait_for_daemons(timeout=300)
1422
1423
def wait_for_mon_quorum(ctx, config):
    """
    Check remote ceph status until all monitors are up.

    :param ctx: Context
    :param config: Configuration -- either a list of mon names, or a dict
                   with a 'daemons' list and an optional 'cluster' name
    """
    if isinstance(config, dict):
        mons = config['daemons']
        cluster_name = config.get('cluster', 'ceph')
    else:
        assert isinstance(config, list)
        mons = config
        cluster_name = 'ceph'
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (remote,) = ctx.cluster.only(firstmon).remotes.keys()
    expected = sorted(mons)
    with contextutil.safe_while(sleep=10, tries=60,
                                action='wait for monitor quorum') as proceed:
        while proceed():
            raw_status = remote.sh('sudo ceph quorum_status',
                                   logger=log.getChild('quorum_status'))
            quorum = json.loads(raw_status).get('quorum_names', [])
            log.debug('Quorum: %s', quorum)
            if sorted(quorum) == expected:
                break
1450
1451
def created_pool(ctx, config):
    """
    Add new pools to the dictionary of pools that the ceph-manager
    knows about.
    """
    manager = ctx.managers['ceph']
    for new_pool in config:
        # pools already known to the manager are left untouched
        if new_pool in manager.pools:
            continue
        manager.pools[new_pool] = manager.get_pool_int_property(
            new_pool, 'pg_num')
1461
1462
@contextlib.contextmanager
def suppress_mon_health_to_clog(ctx, config):
    """
    set the option, and then restore it with its original value

    Note, due to the way how tasks are executed/nested, it's not suggested to
    use this method as a standalone task. otherwise, it's likely that it will
    restore the tweaked option at the /end/ of 'tasks' block.

    :param ctx: Context
    :param config: Configuration; only acts when 'mon-health-to-clog'
                   is the string 'false'
    """
    if config.get('mon-health-to-clog', 'true') == 'false':
        cluster = config.get('cluster', 'ceph')
        manager = ctx.managers[cluster]
        # use raw_cluster_cmd, the CephManager method called elsewhere in
        # this file (the previous 'raw_cluster_command' does not match it)
        manager.raw_cluster_cmd(
            'config', 'set', 'mon', 'mon_health_to_clog', 'false'
        )
        try:
            yield
        finally:
            # restore the option even if the body raised
            manager.raw_cluster_cmd(
                'config', 'rm', 'mon', 'mon_health_to_clog'
            )
    else:
        yield
1485
@contextlib.contextmanager
def restart(ctx, config):
    """
    restart ceph daemons

    For example::
       tasks:
       - ceph.restart: [all]

    For example::
       tasks:
       - ceph.restart: [osd.0, mon.1, mds.*]

    or::

       tasks:
       - ceph.restart:
           daemons: [osd.0, mon.1]
           wait-for-healthy: false
           wait-for-osds-up: true

    :param ctx: Context
    :param config: Configuration
    """
    # normalize: a bare list is shorthand for {'daemons': [...]}
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    role_list = ctx.daemons.resolve_role_list(
        config.get('daemons', None), CEPH_ROLE_TYPES, True)
    restarted_clusters = set()

    with suppress_mon_health_to_clog(ctx, config):
        for role in role_list:
            cluster, type_, id_ = teuthology.split_role(role)
            ctx.daemons.get_daemon(type_, id_, cluster).stop()
            if type_ == 'osd':
                # mark the stopped osd down before restarting it
                ctx.managers[cluster].mark_down_osd(id_)
            ctx.daemons.get_daemon(type_, id_, cluster).restart()
            restarted_clusters.add(cluster)

    if config.get('wait-for-healthy', True):
        for cluster in restarted_clusters:
            healthy(ctx=ctx, config=dict(cluster=cluster))
    if config.get('wait-for-osds-up', False):
        for cluster in restarted_clusters:
            ctx.managers[cluster].wait_for_all_osds_up()
    yield
1534
1535
@contextlib.contextmanager
def stop(ctx, config):
    """
    Stop ceph daemons

    For example::
      tasks:
      - ceph.stop: [mds.*]

      tasks:
      - ceph.stop: [osd.0, osd.2]

      tasks:
      - ceph.stop:
          daemons: [osd.0, osd.2]

    """
    # normalize: a bare list is shorthand for {'daemons': [...]}
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    to_stop = ctx.daemons.resolve_role_list(
        config.get('daemons', None), CEPH_ROLE_TYPES, True)
    affected_clusters = set()

    for role in to_stop:
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).stop()
        affected_clusters.add(cluster)

    # shut down each affected cluster's watchdog as well
    for cluster in affected_clusters:
        watchdog = ctx.ceph[cluster].watchdog
        watchdog.stop()
        watchdog.join()

    yield
1572
1573
@contextlib.contextmanager
def wait_for_failure(ctx, config):
    """
    Wait for a failure of a ceph daemon

    For example::
      tasks:
      - ceph.wait_for_failure: [mds.*]

      tasks:
      - ceph.wait_for_failure: [osd.0, osd.2]

      tasks:
      - ceph.wait_for_failure:
          daemons: [osd.0, osd.2]

    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        try:
            # wait() presumably raises when the daemon exits uncleanly --
            # which is exactly the failure we are waiting for here
            ctx.daemons.get_daemon(type_, id_, cluster).wait()
        except Exception:
            # narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
            # still abort the run; the redundant 'pass' was dropped
            log.info('Saw expected daemon failure. Continuing.')
        else:
            raise RuntimeError('daemon %s did not fail' % role)

    yield
1608
1609
def validate_config(ctx, config):
    """
    Perform some simple validation on task configuration.
    Raises exceptions.ConfigError if an error is found.
    """
    # check for osds from multiple clusters on the same host
    for remote, roles_for_host in ctx.cluster.remotes.items():
        prev_cluster = None
        prev_role = None
        for role in roles_for_host:
            role_cluster, role_type, _ = teuthology.split_role(role)
            if role_type != 'osd':
                continue
            if prev_cluster and prev_cluster != role_cluster:
                raise exceptions.ConfigError(
                    "Host should not have osds (%s and %s) from multiple clusters"
                    % (prev_role, role))
            prev_cluster = role_cluster
            prev_role = role
1629
1630
@contextlib.contextmanager
def task(ctx, config):
    """
    Set up and tear down a Ceph cluster.

    For example::

        tasks:
        - ceph:
        - interactive:

    You can also specify what branch to run::

        tasks:
        - ceph:
            branch: foo

    Or a tag::

        tasks:
        - ceph:
            tag: v0.42.13

    Or a sha1::

        tasks:
        - ceph:
            sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed

    Or a local source dir::

        tasks:
        - ceph:
            path: /home/sage/ceph

    To capture code coverage data, use::

        tasks:
        - ceph:
            coverage: true

    To use btrfs, ext4, or xfs on the target's scratch disks, use::

        tasks:
        - ceph:
            fs: xfs
            mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
            mount_options: [nobarrier, inode64]

    To change the cephfs's default max_mds (1), use::

        tasks:
        - ceph:
            cephfs:
              max_mds: 2

    To change the mdsmap's default session_timeout (60 seconds), use::

        tasks:
        - ceph:
            cephfs:
              session_timeout: 300

    Note, this will cause the task to check the /scratch_devs file on each node
    for available devices.  If no such file is found, /dev/sdb will be used.

    To run some daemons under valgrind, include their names
    and the tool/args to use in a valgrind section::

        tasks:
        - ceph:
          valgrind:
            mds.1: --tool=memcheck
            osd.1: [--tool=memcheck, --leak-check=no]

    Those nodes which are using memcheck or valgrind will get
    checked for bad results.

    To adjust or modify config options, use::

        tasks:
        - ceph:
            conf:
              section:
                key: value

    For example::

        tasks:
        - ceph:
            conf:
              mds.0:
                some option: value
                other key: other value
              client.0:
                debug client: 10
                debug ms: 1

    By default, the cluster log is checked for errors and warnings,
    and the run marked failed if any appear. You can ignore log
    entries by giving a list of egrep compatible regexes, i.e.:

        tasks:
        - ceph:
            log-whitelist: ['foo.*bar', 'bad message']

    To run multiple ceph clusters, use multiple ceph tasks, and roles
    with a cluster name prefix, e.g. cluster1.client.0. Roles with no
    cluster use the default cluster name, 'ceph'. OSDs from separate
    clusters must be on separate hosts. Clients and non-osd daemons
    from multiple clusters may be colocated. For each cluster, add an
    instance of the ceph task with the cluster name specified, e.g.::

        roles:
        - [mon.a, osd.0, osd.1]
        - [backup.mon.a, backup.osd.0, backup.osd.1]
        - [client.0, backup.client.0]
        tasks:
        - ceph:
            cluster: ceph
        - ceph:
            cluster: backup

    :param ctx: Context
    :param config: Configuration

    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        "task ceph only supports a dictionary for configuration"

    # merge any 'overrides: ceph:' settings from the job config into ours
    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph', {}))

    # only the first ceph task in a job creates the shared DaemonGroup
    # (and, below, the once-per-job log/crash/valgrind subtasks)
    first_ceph_cluster = False
    if not hasattr(ctx, 'daemons'):
        first_ceph_cluster = True
        ctx.daemons = DaemonGroup()

    testdir = teuthology.get_testdir(ctx)
    if config.get('coverage'):
        coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
        log.info('Creating coverage directory...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'install', '-d', '-m0755', '--',
                    coverage_dir,
                ],
                wait=False,
            )
        )

    # default cluster name when not running multiple clusters
    if 'cluster' not in config:
        config['cluster'] = 'ceph'

    validate_config(ctx, config)

    subtasks = []
    if first_ceph_cluster:
        # these tasks handle general log setup and parsing on all hosts,
        # so they should only be run once
        subtasks = [
            lambda: ceph_log(ctx=ctx, config=None),
            lambda: ceph_crash(ctx=ctx, config=None),
            lambda: valgrind_post(ctx=ctx, config=config),
        ]

    # bring the cluster up in stages: cluster layout, mons, mgrs, crush
    # map, osds, rbd pool, mds, cephfs, and finally the daemon watchdog
    subtasks += [
        lambda: cluster(ctx=ctx, config=dict(
            conf=config.get('conf', {}),
            fs=config.get('fs', 'xfs'),
            mkfs_options=config.get('mkfs_options', None),
            mount_options=config.get('mount_options', None),
            skip_mgr_daemons=config.get('skip_mgr_daemons', False),
            log_whitelist=config.get('log-whitelist', []),
            cpu_profile=set(config.get('cpu_profile', []),),
            cluster=config['cluster'],
            mon_bind_msgr2=config.get('mon_bind_msgr2', True),
            mon_bind_addrvec=config.get('mon_bind_addrvec', True),
        )),
        lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
        lambda: run_daemon(ctx=ctx, config=config, type_='mgr'),
        lambda: crush_setup(ctx=ctx, config=config),
        lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
        lambda: create_rbd_pool(ctx=ctx, config=config),
        lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
        lambda: cephfs_setup(ctx=ctx, config=config),
        lambda: watchdog_setup(ctx=ctx, config=config),
    ]

    with contextutil.nested(*subtasks):
        # expose a CephManager for this cluster to subsequent tasks
        first_mon = teuthology.get_first_mon(ctx, config, config['cluster'])
        (mon,) = ctx.cluster.only(first_mon).remotes.keys()
        if not hasattr(ctx, 'managers'):
            ctx.managers = {}
        ctx.managers[config['cluster']] = CephManager(
            mon,
            ctx=ctx,
            logger=log.getChild('ceph_manager.' + config['cluster']),
            cluster=config['cluster'],
        )

        try:
            if config.get('wait-for-healthy', True):
                healthy(ctx=ctx, config=dict(cluster=config['cluster']))

            yield
        finally:
            # set pg_num_targets back to actual pg_num, so we don't have to
            # wait for pending merges (which can take a while!)
            ctx.managers[config['cluster']].stop_pg_num_changes()

            if config.get('wait-for-scrub', True):
                osd_scrub_pgs(ctx, config)

            # stop logging health to clog during shutdown, or else we generate
            # a bunch of scary messages unrelated to our actual run.
            firstmon = teuthology.get_first_mon(ctx, config, config['cluster'])
            (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
            mon0_remote.run(
                args=[
                    'sudo',
                    'ceph',
                    '--cluster', config['cluster'],
                    'config', 'set', 'global',
                    'mon_health_to_clog', 'false',
                ],
                check_status=False,
            )