]> git.proxmox.com Git - ceph.git/blame - ceph/qa/tasks/ceph.py
update sources to v12.2.3
[ceph.git] / ceph / qa / tasks / ceph.py
CommitLineData
7c673cae
FG
1"""
2Ceph cluster task.
3
4Handle the setup, starting, and clean-up of a Ceph cluster.
5"""
6from cStringIO import StringIO
7
8import argparse
9import contextlib
10import errno
11import logging
12import os
13import json
14import time
15import gevent
16import socket
17
18from paramiko import SSHException
19from ceph_manager import CephManager, write_conf
20from tasks.cephfs.filesystem import Filesystem
21from teuthology import misc as teuthology
22from teuthology import contextutil
23from teuthology import exceptions
24from teuthology.orchestra import run
25import ceph_client as cclient
26from teuthology.orchestra.daemon import DaemonGroup
27
28CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']
29
30log = logging.getLogger(__name__)
31
32
def generate_caps(type_):
    """
    Yield ceph-authtool capability arguments for one daemon type.

    Produces a flat sequence of ('--cap', subsystem, capability)
    triples suitable for appending to a ceph-authtool command line.
    Valid types are osd, mgr, mds and client.
    """
    defaults = dict(
        osd=dict(
            mon='allow *',
            mgr='allow *',
            osd='allow *',
        ),
        mgr=dict(
            mon='allow profile mgr',
            osd='allow *',
            mds='allow *',
        ),
        mds=dict(
            mon='allow *',
            mgr='allow *',
            osd='allow *',
            mds='allow',
        ),
        client=dict(
            mon='allow rw',
            mgr='allow r',
            osd='allow rwx',
            mds='allow',
        ),
    )
    for subsystem, capability in defaults[type_].items():
        for token in ('--cap', subsystem, capability):
            yield token
67
68
@contextlib.contextmanager
def ceph_log(ctx, config):
    """
    Create /var/log/ceph log directory that is open to everyone.
    Add valgrind and profiling-logger directories.

    While the context is live, optionally run a background logrotate
    loop (enabled when ctx.config contains a 'log-rotate' mapping of
    daemon type -> max log size).  On teardown the rotate loop is
    stopped, and unless archiving is suppressed for successful runs,
    all /var/log/ceph logs are gzipped and pulled into ctx.archive.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Making ceph log dir writeable by non-root...')
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'chmod',
                '777',
                '/var/log/ceph',
            ],
            wait=False,
        )
    )
    # Disable the distro's own logrotate config so it cannot race with
    # the test-specific one installed below.
    log.info('Disabling ceph logrotate...')
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'rm', '-f', '--',
                '/etc/logrotate.d/ceph',
            ],
            wait=False,
        )
    )
    log.info('Creating extra log directories...')
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'install', '-d', '-m0777', '--',
                '/var/log/ceph/valgrind',
                '/var/log/ceph/profiling-logger',
            ],
            wait=False,
        )
    )

    class Rotater(object):
        # Class-level Event: shared by all instances (only one is ever
        # created below), used to signal the greenlet to stop.
        stop_event = gevent.event.Event()

        def invoke_logrotate(self):
            # 1) install ceph-test.conf in /etc/logrotate.d
            # 2) continuously loop over logrotate invocation with ceph-test.conf
            while not self.stop_event.is_set():
                # Wake every 30s (or immediately on stop) and rotate.
                self.stop_event.wait(timeout=30)
                try:
                    run.wait(
                        ctx.cluster.run(
                            args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'
                                  ],
                            wait=False,
                        )
                    )
                except exceptions.ConnectionLostError as e:
                    # Some tests may power off nodes during test, in which
                    # case we will see connection errors that we should ignore.
                    log.debug("Missed logrotate, node '{0}' is offline".format(
                        e.node))
                except EOFError as e:
                    # Paramiko sometimes raises this when it fails to
                    # connect to a node during open_session.  As with
                    # ConnectionLostError, we ignore this because nodes
                    # are allowed to get power cycled during tests.
                    log.debug("Missed logrotate, EOFError")
                except SSHException as e:
                    log.debug("Missed logrotate, SSHException")
                except socket.error as e:
                    if e.errno == errno.EHOSTUNREACH:
                        log.debug("Missed logrotate, host unreachable")
                    else:
                        raise

        def begin(self):
            # Run the rotate loop as a gevent greenlet.
            self.thread = gevent.spawn(self.invoke_logrotate)

        def end(self):
            self.stop_event.set()
            # .get() re-raises any exception the greenlet died with.
            self.thread.get()

    def write_rotate_conf(ctx, daemons):
        # Render one logrotate stanza per daemon type from the template
        # shipped next to this module, then push the result to every node.
        testdir = teuthology.get_testdir(ctx)
        rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf')
        with file(rotate_conf_path, 'rb') as f:  # Python 2 builtin open
            conf = ""
            for daemon, size in daemons.iteritems():
                log.info('writing logrotate stanza for {daemon}'.format(daemon=daemon))
                conf += f.read().format(daemon_type=daemon, max_size=size)
                # Rewind so the template is re-read for the next daemon.
                f.seek(0, 0)

        for remote in ctx.cluster.remotes.iterkeys():
            # Write to the testdir first, then sudo-move into place with
            # root ownership and proper SELinux context.
            teuthology.write_file(remote=remote,
                                  path='{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
                                  data=StringIO(conf)
                                  )
            remote.run(
                args=[
                    'sudo',
                    'mv',
                    '{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
                    '/etc/logrotate.d/ceph-test.conf',
                    run.Raw('&&'),
                    'sudo',
                    'chmod',
                    '0644',
                    '/etc/logrotate.d/ceph-test.conf',
                    run.Raw('&&'),
                    'sudo',
                    'chown',
                    'root.root',
                    '/etc/logrotate.d/ceph-test.conf'
                ]
            )
            remote.chcon('/etc/logrotate.d/ceph-test.conf',
                         'system_u:object_r:etc_t:s0')

    if ctx.config.get('log-rotate'):
        daemons = ctx.config.get('log-rotate')
        log.info('Setting up log rotation with ' + str(daemons))
        write_rotate_conf(ctx, daemons)
        logrotater = Rotater()
        logrotater.begin()
    try:
        yield

    finally:
        if ctx.config.get('log-rotate'):
            log.info('Shutting down logrotate')
            logrotater.end()
            ctx.cluster.run(
                args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'
                      ]
            )
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            # and logs
            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph',
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            os.makedirs(path)
            for remote in ctx.cluster.remotes.iterkeys():
                sub = os.path.join(path, remote.shortname)
                os.makedirs(sub)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))
243
244
def assign_devs(roles, devs):
    """
    Pair roles with devices, one-to-one, in order.

    Extra devices (or extra roles) beyond the shorter list are dropped,
    matching zip semantics.

    :param roles: List of roles
    :param devs: Corresponding list of devices.
    :returns: Dictionary of devs indexed by roles.
    """
    return {role: dev for role, dev in zip(roles, devs)}
254
255
@contextlib.contextmanager
def valgrind_post(ctx, config):
    """
    After the tests run, look through all the valgrind logs.  Exceptions are
    raised if textual errors occurred in the logs, or if valgrind exceptions
    were detected in the logs.

    If config['expect_valgrind_errors'] is set, the check is inverted:
    an exception is raised when NO valgrind issue was found.

    :param ctx: Context
    :param config: Configuration
    """
    try:
        yield
    finally:
        lookup_procs = list()
        log.info('Checking for errors in any valgrind logs...')
        for remote in ctx.cluster.remotes.iterkeys():
            # look at valgrind logs for each node
            proc = remote.run(
                args=[
                    'sudo',
                    'zgrep',
                    '<kind>',
                    run.Raw('/var/log/ceph/valgrind/*'),
                    '/dev/null',  # include a second file so that we always get a filename prefix on the output
                    run.Raw('|'),
                    'sort',
                    run.Raw('|'),
                    'uniq',
                ],
                wait=False,
                check_status=False,
                stdout=StringIO(),
            )
            lookup_procs.append((proc, remote))

        valgrind_exception = None
        for (proc, remote) in lookup_procs:
            proc.wait()
            out = proc.stdout.getvalue()
            for line in out.split('\n'):
                if line == '':
                    continue
                try:
                    # NOTE(review): a zgrep line containing more than one ':'
                    # makes this raise ValueError and abort -- presumably the
                    # log format guarantees a single colon; confirm.
                    (file, kind) = line.split(':')
                except Exception:
                    log.error('failed to split line %s', line)
                    raise
                log.debug('file %s kind %s', file, kind)
                # Lost-memory reports from the MDS are tolerated.
                # NOTE(review): 'kind.find('Lost') > 0' would miss 'Lost' at
                # index 0 -- looks intentional only if kind never starts with
                # it; verify.
                if (file.find('mds') >= 0) and kind.find('Lost') > 0:
                    continue
                log.error('saw valgrind issue %s in %s', kind, file)
                valgrind_exception = Exception('saw valgrind issues')

        if config.get('expect_valgrind_errors'):
            if not valgrind_exception:
                raise Exception('expected valgrind issues and found none')
        else:
            if valgrind_exception:
                raise valgrind_exception
315
316
@contextlib.contextmanager
def crush_setup(ctx, config):
    """
    Apply the configured crush tunables profile.

    Reads 'crush_tunables' from the task config (default: 'default')
    and runs 'ceph osd crush tunables <profile>' on the first monitor.
    """
    cluster_name = config['cluster']
    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    tunables = config.get('crush_tunables', 'default')
    log.info('Setting crush tunables to %s', tunables)
    cmd = ['sudo', 'ceph', '--cluster', cluster_name,
           'osd', 'crush', 'tunables', tunables]
    mon_remote.run(args=cmd)
    yield
329
330
224ce89b
WB
@contextlib.contextmanager
def create_rbd_pool(ctx, config):
    """
    Wait for all OSDs to come up, then optionally create the 'rbd' pool.

    Pool creation happens unless the task config sets 'create_rbd_pool'
    to a false value (default: True).  The pool is tagged with the 'rbd'
    application; the tagging command's exit status is deliberately
    ignored (check_status=False).
    """
    cluster_name = config['cluster']
    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    log.info('Waiting for OSDs to come up')
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon_remote,
        ceph_cluster=cluster_name,
    )
    if config.get('create_rbd_pool', True):
        log.info('Creating RBD pool')
        pool_cmd = ['sudo', 'ceph', '--cluster', cluster_name, 'osd', 'pool']
        mon_remote.run(args=pool_cmd + ['create', 'rbd', '8'])
        mon_remote.run(
            args=pool_cmd + [
                'application', 'enable',
                'rbd', 'rbd', '--yes-i-really-mean-it'
            ],
            check_status=False)
    yield
356
7c673cae
FG
@contextlib.contextmanager
def cephfs_setup(ctx, config):
    """
    Create a CephFS filesystem when any MDS roles are configured.

    Counts the non-standby MDS roles across all MDS hosts, sets the
    filesystem's max_mds to that count, and enables directory
    fragmentation.  Does nothing when no MDS roles exist.
    """
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because requires mon cluster to be up and running
    if mdss.remotes:
        log.info('Setting up CephFS filesystem...')

        fs = Filesystem(ctx, name='cephfs', create=True,
                        ec_profile=config.get('cephfs_ec_profile', None))

        def is_active_mds(role):
            # Standby roles carry an '-s' suffix or an embedded '-s-'.
            return ('mds.' in role
                    and not role.endswith('-s')
                    and '-s-' not in role)

        num_active = 0
        for remote_roles in mdss.remotes.values():
            for role in remote_roles:
                if is_active_mds(role):
                    num_active += 1

        fs.set_max_mds(num_active)
        fs.set_allow_dirfrags(True)

    yield
382
383
@contextlib.contextmanager
def cluster(ctx, config):
    """
    Handle the creation and removal of a ceph cluster.

    On startup:
        Create directories needed for the cluster.
        Create remote journals for all osds.
        Create and set keyring.
        Copy the monmap to the test systems.
        Setup mon nodes.
        Setup mds nodes.
        Mkfs osd nodes.
        Add keyring information to monmaps
        Mkfs mon nodes.

    On exit:
        If errors occurred, extract a failure message and store in ctx.summary.
        Unmount all test files and temporary journaling files.
        Save the monitor information and archive all ceph logs.
        Cleanup the keyring setup, and remove all monitor map and data files left over.

    :param ctx: Context
    :param config: Configuration
    """
    if ctx.config.get('use_existing_cluster', False) is True:
        log.info("'use_existing_cluster' is true; skipping cluster creation")
        yield
        # NOTE(review): there is no 'return' after this yield, so once the
        # caller's with-block ends, control falls through and the cluster
        # setup/teardown below still runs -- looks like a missing 'return';
        # confirm intended behavior.

    testdir = teuthology.get_testdir(ctx)
    cluster_name = config['cluster']
    data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
    log.info('Creating ceph cluster %s...', cluster_name)
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                data_dir,
            ],
            wait=False,
        )
    )

    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'install', '-d', '-m0777', '--', '/var/run/ceph',
            ],
            wait=False,
        )
    )

    devs_to_clean = {}
    remote_to_roles_to_devs = {}
    remote_to_roles_to_journals = {}
    osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
    for remote, roles_for_host in osds.remotes.iteritems():
        devs = teuthology.get_scratch_devices(remote)
        roles_to_devs = {}
        roles_to_journals = {}
        if config.get('fs'):
            # A filesystem was requested: map OSD roles onto scratch
            # devices (identified by stable wwn ids).
            log.info('fs option selected, checking for scratch devs')
            log.info('found devs: %s' % (str(devs),))
            devs_id_map = teuthology.get_wwn_id_map(remote, devs)
            iddevs = devs_id_map.values()
            roles_to_devs = assign_devs(
                teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
            )
            if len(roles_to_devs) < len(iddevs):
                # Leftover devices can serve as block journals below.
                iddevs = iddevs[len(roles_to_devs):]
            devs_to_clean[remote] = []

        if config.get('block_journal'):
            log.info('block journal enabled')
            roles_to_journals = assign_devs(
                teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
            )
            log.info('journal map: %s', roles_to_journals)

        if config.get('tmpfs_journal'):
            # Journals live on a tmpfs mounted at /mnt, one 1500M file
            # per OSD role.
            log.info('tmpfs journal enabled')
            roles_to_journals = {}
            remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
            for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
                tmpfs = '/mnt/' + role
                roles_to_journals[role] = tmpfs
                remote.run(args=['truncate', '-s', '1500M', tmpfs])
            log.info('journal map: %s', roles_to_journals)

        log.info('dev map: %s' % (str(roles_to_devs),))
        remote_to_roles_to_devs[remote] = roles_to_devs
        remote_to_roles_to_journals[remote] = roles_to_journals

    log.info('Generating config...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [role_list for (remote, role_list) in remotes_and_roles]
    ips = [host for (host, port) in
           (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
    conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips, cluster=cluster_name)
    # Record each journal device/file in the per-daemon conf section.
    for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
        for role, journal in roles_to_journals.iteritems():
            name = teuthology.ceph_role(role)
            if name not in conf:
                conf[name] = {}
            conf[name]['osd journal'] = journal
    # Overlay task-supplied conf values on top of the skeleton config.
    for section, keys in config['conf'].iteritems():
        for key, value in keys.iteritems():
            log.info("[%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    if config.get('tmpfs_journal'):
        # Direct I/O is not supported on tmpfs.
        conf['journal dio'] = False

    if not hasattr(ctx, 'ceph'):
        ctx.ceph = {}
    ctx.ceph[cluster_name] = argparse.Namespace()
    ctx.ceph[cluster_name].conf = conf

    default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
    keyring_path = config.get('keyring_path', default_keyring)

    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)

    # Create the cluster keyring with a mon. key on the first monitor.
    log.info('Setting up %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--create-keyring',
            keyring_path,
        ],
    )
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--gen-key',
            '--name=mon.',
            keyring_path,
        ],
    )
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'chmod',
            '0644',
            keyring_path,
        ],
    )
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
                                                   cluster=cluster_name)
    fsid = teuthology.create_simple_monmap(
        ctx,
        remote=mon0_remote,
        conf=conf,
        path=monmap_path,
    )
    if not 'global' in conf:
        conf['global'] = {}
    conf['global']['fsid'] = fsid

    default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
    conf_path = config.get('conf_path', default_conf_path)
    log.info('Writing %s for FSID %s...' % (conf_path, fsid))
    write_conf(ctx, conf_path, cluster_name)

    log.info('Creating admin key on %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--gen-key',
            '--name=client.admin',
            '--set-uid=0',
            '--cap', 'mon', 'allow *',
            '--cap', 'osd', 'allow *',
            '--cap', 'mds', 'allow *',
            '--cap', 'mgr', 'allow *',
            keyring_path,
        ],
    )

    log.info('Copying monmap to all nodes...')
    keyring = teuthology.get_file(
        remote=mon0_remote,
        path=keyring_path,
    )
    monmap = teuthology.get_file(
        remote=mon0_remote,
        path=monmap_path,
    )

    for rem in ctx.cluster.remotes.iterkeys():
        # copy mon key and initial monmap
        log.info('Sending monmap to node {remote}'.format(remote=rem))
        teuthology.sudo_write_file(
            remote=rem,
            path=keyring_path,
            data=keyring,
            perms='0644'
        )
        teuthology.write_file(
            remote=rem,
            path=monmap_path,
            data=monmap,
        )

    log.info('Setting up mon nodes...')
    mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))

    if not config.get('skip_mgr_daemons', False):
        # Create a data dir and keyring for every mgr role.
        log.info('Setting up mgr nodes...')
        mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name))
        for remote, roles_for_host in mgrs.remotes.iteritems():
            for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr',
                                                         cluster_name):
                _, _, id_ = teuthology.split_role(role)
                mgr_dir = '/var/lib/ceph/mgr/{cluster}-{id}'.format(
                    cluster=cluster_name,
                    id=id_,
                )
                remote.run(
                    args=[
                        'sudo',
                        'mkdir',
                        '-p',
                        mgr_dir,
                        run.Raw('&&'),
                        'sudo',
                        'adjust-ulimits',
                        'ceph-coverage',
                        coverage_dir,
                        'ceph-authtool',
                        '--create-keyring',
                        '--gen-key',
                        '--name=mgr.{id}'.format(id=id_),
                        mgr_dir + '/keyring',
                    ],
                )

    # Create a data dir and keyring for every mds role.
    log.info('Setting up mds nodes...')
    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
    for remote, roles_for_host in mdss.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mds_dir = '/var/lib/ceph/mds/{cluster}-{id}'.format(
                cluster=cluster_name,
                id=id_,
            )
            remote.run(
                args=[
                    'sudo',
                    'mkdir',
                    '-p',
                    mds_dir,
                    run.Raw('&&'),
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    '--name=mds.{id}'.format(id=id_),
                    mds_dir + '/keyring',
                ],
            )

    cclient.create_keyring(ctx, cluster_name)
    log.info('Running mkfs on osd nodes...')

    if not hasattr(ctx, 'disk_config'):
        ctx.disk_config = argparse.Namespace()
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
        ctx.disk_config.remote_to_roles_to_dev = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_journals'):
        ctx.disk_config.remote_to_roles_to_journals = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
        ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
        ctx.disk_config.remote_to_roles_to_dev_fstype = {}

    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)
    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals, remote_to_roles_to_journals)

    log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
    for remote, roles_for_host in osds.remotes.iteritems():
        roles_to_devs = remote_to_roles_to_devs[remote]
        roles_to_journals = remote_to_roles_to_journals[remote]

        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mnt_point = '/var/lib/ceph/osd/{cluster}-{id}'.format(cluster=cluster_name, id=id_)
            remote.run(
                args=[
                    'sudo',
                    'mkdir',
                    '-p',
                    mnt_point,
                ])
            log.info(str(roles_to_devs))
            log.info(str(roles_to_journals))
            log.info(role)
            if roles_to_devs.get(role):
                # This role has a scratch device: mkfs it with fs-specific
                # options and mount it at the OSD data dir.
                dev = roles_to_devs[role]
                fs = config.get('fs')
                package = None
                mkfs_options = config.get('mkfs_options')
                mount_options = config.get('mount_options')
                if fs == 'btrfs':
                    # package = 'btrfs-tools'
                    if mount_options is None:
                        mount_options = ['noatime', 'user_subvol_rm_allowed']
                    if mkfs_options is None:
                        mkfs_options = ['-m', 'single',
                                        '-l', '32768',
                                        '-n', '32768']
                if fs == 'xfs':
                    # package = 'xfsprogs'
                    if mount_options is None:
                        mount_options = ['noatime']
                    if mkfs_options is None:
                        mkfs_options = ['-f', '-i', 'size=2048']
                if fs == 'ext4' or fs == 'ext3':
                    if mount_options is None:
                        mount_options = ['noatime', 'user_xattr']

                if mount_options is None:
                    mount_options = []
                if mkfs_options is None:
                    mkfs_options = []
                mkfs = ['mkfs.%s' % fs] + mkfs_options
                log.info('%s on %s on %s' % (mkfs, dev, remote))
                if package is not None:
                    remote.run(
                        args=[
                            'sudo',
                            'apt-get', 'install', '-y', package
                        ],
                        stdout=StringIO(),
                    )

                try:
                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
                except run.CommandFailedError:
                    # Newer btfs-tools doesn't prompt for overwrite, use -f
                    # NOTE(review): the membership test checks mount_options
                    # but appends to mkfs_options -- presumably should test
                    # mkfs_options; confirm before changing.
                    if '-f' not in mount_options:
                        mkfs_options.append('-f')
                        mkfs = ['mkfs.%s' % fs] + mkfs_options
                        log.info('%s on %s on %s' % (mkfs, dev, remote))
                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])

                log.info('mount %s on %s -o %s' % (dev, remote,
                                                   ','.join(mount_options)))
                remote.run(
                    args=[
                        'sudo',
                        'mount',
                        '-t', fs,
                        '-o', ','.join(mount_options),
                        dev,
                        mnt_point,
                    ]
                )
                # Restore SELinux context; failure here is non-fatal.
                remote.run(
                    args=[
                        'sudo', '/sbin/restorecon', mnt_point,
                    ],
                    check_status=False,
                )
                if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
                    ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
                if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
                    ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
                devs_to_clean[remote].append(mnt_point)

        # Initialize every OSD's data dir and generate its key.
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
            _, _, id_ = teuthology.split_role(role)
            remote.run(
                args=[
                    'sudo',
                    'MALLOC_CHECK_=3',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-osd',
                    '--cluster',
                    cluster_name,
                    '--mkfs',
                    '--mkkey',
                    '-i', id_,
                    '--monmap', monmap_path,
                ],
            )

    log.info('Reading keys from all nodes...')
    keys_fp = StringIO()
    keys = []
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['mgr', 'mds', 'osd']:
            if type_ == 'mgr' and config.get('skip_mgr_daemons', False):
                continue
            for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
                _, _, id_ = teuthology.split_role(role)
                data = teuthology.get_file(
                    remote=remote,
                    path='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
                        type=type_,
                        id=id_,
                        cluster=cluster_name,
                    ),
                    sudo=True,
                )
                keys.append((type_, id_, data))
                keys_fp.write(data)
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', cluster_name):
            _, _, id_ = teuthology.split_role(role)
            data = teuthology.get_file(
                remote=remote,
                path='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_, cluster=cluster_name)
            )
            keys.append(('client', id_, data))
            keys_fp.write(data)

    log.info('Adding keys to all mons...')
    # Append every collected keyring to each mon's keyring file, then
    # register the caps for each entity with ceph-authtool.
    writes = mons.run(
        args=[
            'sudo', 'tee', '-a',
            keyring_path,
        ],
        stdin=run.PIPE,
        wait=False,
        stdout=StringIO(),
    )
    keys_fp.seek(0)
    teuthology.feed_many_stdins_and_close(keys_fp, writes)
    run.wait(writes)
    for type_, id_, data in keys:
        run.wait(
            mons.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    keyring_path,
                    '--name={type}.{id}'.format(
                        type=type_,
                        id=id_,
                    ),
                ] + list(generate_caps(type_)),
                wait=False,
            ),
        )

    log.info('Running mkfs on mon nodes...')
    for remote, roles_for_host in mons.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon', cluster_name):
            _, _, id_ = teuthology.split_role(role)
            remote.run(
                args=[
                    'sudo',
                    'mkdir',
                    '-p',
                    '/var/lib/ceph/mon/{cluster}-{id}'.format(id=id_, cluster=cluster_name),
                ],
            )
            remote.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-mon',
                    '--cluster', cluster_name,
                    '--mkfs',
                    '-i', id_,
                    '--monmap', monmap_path,
                    '--keyring', keyring_path,
                ],
            )

    # The monmap staging file is no longer needed once mons are mkfs'd.
    run.wait(
        mons.run(
            args=[
                'rm',
                '--',
                monmap_path,
            ],
            wait=False,
        ),
    )

    try:
        yield
    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise
    finally:
        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()

        log.info('Checking cluster log for badness...')

        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurrence of the pattern specified in the Ceph log,
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = [
                'sudo',
                'egrep', pattern,
                '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
            ]
            for exclude in excludes:
                args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                run.Raw('|'), 'head', '-n', '1',
            ])
            r = mon0_remote.run(
                stdout=StringIO(),
                args=args,
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
                             config['log_whitelist']) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log_whitelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
                            )
                        break

        # Unmount the scratch devices mounted during setup; on failure,
        # dump lsof/ps output to help debug what held the mount.
        for remote, dirs in devs_to_clean.iteritems():
            for dir_ in dirs:
                log.info('Unmounting %s on %s' % (dir_, remote))
                try:
                    remote.run(
                        args=[
                            'sync',
                            run.Raw('&&'),
                            'sudo',
                            'umount',
                            '-f',
                            dir_
                        ]
                    )
                except Exception as e:
                    remote.run(args=[
                        'sudo',
                        run.Raw('PATH=/usr/sbin:$PATH'),
                        'lsof',
                        run.Raw(';'),
                        'ps', 'auxf',
                    ])
                    raise e

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
            for remote, roles_for_host in osds.remotes.iteritems():
                remote.run(
                    args=['sudo', 'umount', '-f', '/mnt'],
                    check_status=False,
                )

        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):

            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            try:
                os.makedirs(path)
            except OSError as e:
                if e.errno == errno.EEXIST:
                    pass
                else:
                    raise
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    is_mon = teuthology.is_type('mon', cluster_name)
                    if is_mon(role):
                        _, _, id_ = teuthology.split_role(role)
                        mon_dir = '/var/lib/ceph/mon/' + \
                                  '{0}-{1}'.format(cluster_name, id_)
                        teuthology.pull_directory_tarball(
                            remote,
                            mon_dir,
                            path + '/' + role + '.tgz')

        log.info('Cleaning ceph cluster...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'rm',
                    '-rf',
                    '--',
                    conf_path,
                    keyring_path,
                    data_dir,
                    monmap_path,
                    run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
                ],
                wait=False,
            ),
        )
1025
1026
def osd_scrub_pgs(ctx, config):
    """
    Scrub pgs when we exit.

    First make sure all pgs are active and clean.
    Next scrub all osds.
    Then periodically check until all pgs have scrub time stamps that
    indicate the last scrub completed.  Time out if no progress is made
    here after two minutes.
    """
    retries = 40
    delays = 20
    cluster_name = config['cluster']
    manager = ctx.managers[cluster_name]
    all_clean = False
    # Phase 1: poll until every PG reports active+clean.
    for _ in range(0, retries):
        stats = manager.get_pg_stats()
        bad = [stat['pgid'] for stat in stats if 'active+clean' not in stat['state']]
        if not bad:
            all_clean = True
            break
        log.info(
            "Waiting for all PGs to be active and clean, waiting on %s" % bad)
        time.sleep(delays)
    if not all_clean:
        raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
    # Phase 2: kick off a deep-scrub on every OSD.  Timestamps newer than
    # check_time_now indicate a scrub completed after this point.
    check_time_now = time.localtime()
    time.sleep(1)
    all_roles = teuthology.all_roles(ctx.cluster)
    for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name):
        log.info("Scrubbing {osd}".format(osd=role))
        _, _, id_ = teuthology.split_role(role)
        # allow this to fail; in certain cases the OSD might not be up
        # at this point.  we will catch all pgs below.
        try:
            manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
        except run.CommandFailedError:
            pass
    # Phase 3: poll each PG's last_scrub_stamp until all are newer than
    # check_time_now; bail out after `retries` iterations without progress.
    prev_good = 0
    gap_cnt = 0
    loop = True
    while loop:
        stats = manager.get_pg_stats()
        timez = [(stat['pgid'],stat['last_scrub_stamp']) for stat in stats]
        loop = False
        thiscnt = 0
        for (pgid, tmval) in timez:
            # Strip the fractional seconds before parsing; struct_time
            # tuples compare chronologically.
            pgtm = time.strptime(tmval[0:tmval.find('.')], '%Y-%m-%d %H:%M:%S')
            if pgtm > check_time_now:
                thiscnt += 1
            else:
                log.info('pgid %s last_scrub_stamp %s %s <= %s', pgid, tmval, pgtm, check_time_now)
                loop = True
        if thiscnt > prev_good:
            # Progress was made this round; reset the stall counter.
            prev_good = thiscnt
            gap_cnt = 0
        else:
            gap_cnt += 1
            if gap_cnt % 6 == 0:
                for (pgid, tmval) in timez:
                    # re-request scrub every so often in case the earlier
                    # request was missed.  do not do it everytime because
                    # the scrub may be in progress or not reported yet and
                    # we will starve progress.
                    manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
            if gap_cnt > retries:
                raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
        if loop:
            log.info('Still waiting for all pgs to be scrubbed.')
            time.sleep(delays)
1097
1098
@contextlib.contextmanager
def run_daemon(ctx, config, type_):
    """
    Run daemons for a role type. Handle the startup and termination of a daemon.
    On startup -- set coverages, cpu_profile, valgrind values for all remotes,
    and a max_mds value for one mds.
    On cleanup -- Stop all existing daemons of this type.

    :param ctx: Context
    :param config: Configuration
    :param type_: Role type
    """
    cluster_name = config['cluster']
    log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
    testdir = teuthology.get_testdir(ctx)
    daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name))

    # check whether any daemons of this type are configured
    if daemons is None:
        return
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    # valgrind/coverage-wrapped daemons need SIGTERM so they can flush output
    daemon_signal = 'kill'
    if config.get('coverage') or config.get('valgrind') is not None:
        daemon_signal = 'term'

    # create osds in order. (this only matters for pre-luminous, which might
    # be hammer, which doesn't take an id_ argument to legacy 'osd create').
    osd_uuids = {}
    for remote, roles_for_host in daemons.remotes.iteritems():
        is_type_ = teuthology.is_type(type_, cluster_name)
        for role in roles_for_host:
            if not is_type_(role):
                continue
            _, _, id_ = teuthology.split_role(role)

            if type_ == 'osd':
                datadir = '/var/lib/ceph/osd/{cluster}-{id}'.format(
                    cluster=cluster_name, id=id_)
                osd_uuid = teuthology.get_file(
                    remote=remote,
                    path=datadir + '/fsid',
                    sudo=True,
                ).strip()
                osd_uuids[id_] = osd_uuid
    # register osds with the mons in ascending id order; 'remote' is still
    # bound to the last host iterated above, which is acceptable because
    # these are cluster-wide mon commands.
    for osd_id in range(len(osd_uuids)):
        id_ = str(osd_id)
        osd_uuid = osd_uuids.get(id_)
        try:
            remote.run(
                args=[
                    'sudo', 'ceph', '--cluster', cluster_name,
                    'osd', 'new', osd_uuid, id_,
                ]
            )
        except Exception:
            # was a bare 'except:', which also swallowed SystemExit and
            # KeyboardInterrupt; fallback to pre-luminous (hammer or jewel)
            remote.run(
                args=[
                    'sudo', 'ceph', '--cluster', cluster_name,
                    'osd', 'create', osd_uuid,
                ]
            )
        if config.get('add_osds_to_crush'):
            remote.run(
                args=[
                    'sudo', 'ceph', '--cluster', cluster_name,
                    'osd', 'crush', 'create-or-move', 'osd.' + id_,
                    '1.0', 'host=localhost', 'root=default',
                ]
            )

    for remote, roles_for_host in daemons.remotes.iteritems():
        is_type_ = teuthology.is_type(type_, cluster_name)
        for role in roles_for_host:
            if not is_type_(role):
                continue
            _, _, id_ = teuthology.split_role(role)

            run_cmd = [
                'sudo',
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'daemon-helper',
                daemon_signal,
            ]
            run_cmd_tail = [
                'ceph-%s' % (type_),
                '-f',
                '--cluster', cluster_name,
                '-i', id_]

            if type_ in config.get('cpu_profile', []):
                profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role)
                run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])

            if config.get('valgrind') is not None:
                valgrind_args = None
                if type_ in config['valgrind']:
                    valgrind_args = config['valgrind'][type_]
                if role in config['valgrind']:
                    # per-role settings override per-type ones
                    valgrind_args = config['valgrind'][role]
                run_cmd = teuthology.get_valgrind_args(testdir, role,
                                                       run_cmd,
                                                       valgrind_args)

            run_cmd.extend(run_cmd_tail)

            # always register mgr; don't necessarily start
            ctx.daemons.register_daemon(
                remote, type_, id_,
                cluster=cluster_name,
                args=run_cmd,
                logger=log.getChild(role),
                stdin=run.PIPE,
                wait=False
            )
            if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
                # (removed a dead reassignment of 'role' here; it was never read)
                ctx.daemons.get_daemon(type_, id_, cluster_name).restart()

    try:
        yield
    finally:
        teuthology.stop_daemons_of_type(ctx, type_, cluster_name)
1226
1227
def healthy(ctx, config):
    """
    Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK.

    :param ctx: Context
    :param config: Configuration
    """
    config = config if isinstance(config, dict) else dict()
    cluster_name = config.get('cluster', 'ceph')
    log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
    manager = ctx.managers[cluster_name]
    try:
        # best-effort: older mons (upgrade tests) may not have a mgr at all
        manager.wait_for_mgr_available(timeout=30)
    except (run.CommandFailedError, AssertionError) as e:
        log.info('ignoring mgr wait error, probably testing upgrade: %s', e)

    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )

    try:
        # best-effort: the command may not exist on older releases
        manager.flush_all_pg_stats()
    except Exception as e:
        # was 'except (run.CommandFailedError, Exception)': Exception already
        # subsumes CommandFailedError, so the tuple was redundant
        log.info('ignoring flush pg stats error, probably testing upgrade: %s', e)
    manager.wait_for_clean()

    if config.get('wait-for-healthy', True):
        log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
        teuthology.wait_until_healthy(
            ctx,
            remote=mon0_remote,
            ceph_cluster=cluster_name,
        )

    if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
        # Some MDSs exist, wait for them to be healthy
        ceph_fs = Filesystem(ctx)  # TODO: make Filesystem cluster-aware
        ceph_fs.wait_for_daemons(timeout=300)
1271
1272
def wait_for_osds_up(ctx, config):
    """
    Block until every osd in the cluster reports up.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Waiting until ceph osds are all up...')
    cluster_name = config.get('cluster', 'ceph')
    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
    teuthology.wait_until_osds_up(ctx, cluster=ctx.cluster, remote=mon_remote)
1289
1290
def wait_for_mon_quorum(ctx, config):
    """
    Poll remote ceph quorum status until all requested monitors have
    joined the quorum.

    :param ctx: Context
    :param config: Configuration (a list of mon names, or a dict with
                   'daemons' and an optional 'cluster' key)
    """
    if isinstance(config, dict):
        mons = config['daemons']
        cluster_name = config.get('cluster', 'ceph')
    else:
        assert isinstance(config, list)
        mons = config
        cluster_name = 'ceph'
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (remote,) = ctx.cluster.only(firstmon).remotes.keys()
    with contextutil.safe_while(sleep=10, tries=60,
                                action='wait for monitor quorum') as proceed:
        while proceed():
            result = remote.run(
                args=['sudo', 'ceph', 'quorum_status'],
                stdout=StringIO(),
                logger=log.getChild('quorum_status'),
            )
            status = json.loads(result.stdout.getvalue())
            quorum = status.get('quorum_names', [])
            log.debug('Quorum: %s', quorum)
            # order-insensitive comparison: quorum membership is a set
            if sorted(quorum) == sorted(mons):
                break
1324
1325
def created_pool(ctx, config):
    """
    Add new pools to the dictionary of pools that the ceph-manager
    knows about.
    """
    manager = ctx.managers['ceph']
    for pool in config:
        if pool not in manager.pools:
            manager.pools[pool] = manager.get_pool_property(pool, 'pg_num')
1335
1336
@contextlib.contextmanager
def restart(ctx, config):
    """
    restart ceph daemons

    For example::
       tasks:
       - ceph.restart: [all]

    For example::
       tasks:
       - ceph.restart: [osd.0, mon.1, mds.*]

    or::

       tasks:
       - ceph.restart:
           daemons: [osd.0, mon.1]
           wait-for-healthy: false
           wait-for-osds-up: true

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(
        config.get('daemons', None), CEPH_ROLE_TYPES, True)
    clusters = set()
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).restart()
        clusters.add(cluster)

    # flag restarted osds as down so wait_for_clean notices the bounce
    manager = ctx.managers['ceph']
    for dmon in daemons:
        parts = dmon.split('.')
        if len(parts) >= 2 and parts[1].isdigit() and parts[0] == 'osd':
            manager.mark_down_osd(int(parts[1]))

    if config.get('wait-for-healthy', True):
        for cluster in clusters:
            healthy(ctx=ctx, config={'cluster': cluster})
    if config.get('wait-for-osds-up', False):
        for cluster in clusters:
            wait_for_osds_up(ctx=ctx, config={'cluster': cluster})
    yield
1388
1389
@contextlib.contextmanager
def stop(ctx, config):
    """
    Stop ceph daemons

    For example::
      tasks:
      - ceph.stop: [mds.*]

      tasks:
      - ceph.stop: [osd.0, osd.2]

      tasks:
      - ceph.stop:
          daemons: [osd.0, osd.2]

    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    for role in ctx.daemons.resolve_role_list(
            config.get('daemons', None), CEPH_ROLE_TYPES, True):
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).stop()

    yield
1418
1419
@contextlib.contextmanager
def wait_for_failure(ctx, config):
    """
    Wait for a failure of a ceph daemon

    For example::
      tasks:
      - ceph.wait_for_failure: [mds.*]

      tasks:
      - ceph.wait_for_failure: [osd.0, osd.2]

      tasks:
      - ceph.wait_for_failure:
          daemons: [osd.0, osd.2]

    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        try:
            # wait() raises when the daemon exits with an error, which is
            # the outcome this task expects
            ctx.daemons.get_daemon(type_, id_, cluster).wait()
        except Exception:
            # was a bare 'except:', which also swallowed SystemExit and
            # KeyboardInterrupt; the redundant trailing 'pass' is gone too
            log.info('Saw expected daemon failure. Continuing.')
        else:
            raise RuntimeError('daemon %s did not fail' % role)

    yield
1454
1455
def validate_config(ctx, config):
    """
    Perform some simple validation on task configuration.
    Raises exceptions.ConfigError if an error is found.
    """
    # osds from different clusters may not share a host
    for remote, roles_for_host in ctx.cluster.remotes.items():
        prev_cluster = None
        prev_role = None
        for role in roles_for_host:
            role_cluster, role_type, _ = teuthology.split_role(role)
            if role_type != 'osd':
                continue
            if prev_cluster and prev_cluster != role_cluster:
                raise exceptions.ConfigError(
                    "Host should not have osds (%s and %s) from multiple clusters"
                    % (prev_role, role))
            prev_cluster = role_cluster
            prev_role = role
1475
1476
@contextlib.contextmanager
def task(ctx, config):
    """
    Set up and tear down a Ceph cluster.

    For example::

        tasks:
        - ceph:
        - interactive:

    You can also specify what branch to run::

        tasks:
        - ceph:
            branch: foo

    Or a tag::

        tasks:
        - ceph:
            tag: v0.42.13

    Or a sha1::

        tasks:
        - ceph:
            sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed

    Or a local source dir::

        tasks:
        - ceph:
            path: /home/sage/ceph

    To capture code coverage data, use::

        tasks:
        - ceph:
            coverage: true

    To use btrfs, ext4, or xfs on the target's scratch disks, use::

        tasks:
        - ceph:
            fs: xfs
            mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
            mount_options: [nobarrier, inode64]

    Note, this will cause the task to check the /scratch_devs file on each node
    for available devices.  If no such file is found, /dev/sdb will be used.

    To run some daemons under valgrind, include their names
    and the tool/args to use in a valgrind section::

        tasks:
        - ceph:
          valgrind:
            mds.1: --tool=memcheck
            osd.1: [--tool=memcheck, --leak-check=no]

    Those nodes which are using memcheck or valgrind will get
    checked for bad results.

    To adjust or modify config options, use::

        tasks:
        - ceph:
            conf:
              section:
                key: value

    For example::

        tasks:
        - ceph:
            conf:
              mds.0:
                some option: value
                other key: other value
              client.0:
                debug client: 10
                debug ms: 1

    By default, the cluster log is checked for errors and warnings,
    and the run marked failed if any appear. You can ignore log
    entries by giving a list of egrep compatible regexes, i.e.:

        tasks:
        - ceph:
            log-whitelist: ['foo.*bar', 'bad message']

    To run multiple ceph clusters, use multiple ceph tasks, and roles
    with a cluster name prefix, e.g. cluster1.client.0. Roles with no
    cluster use the default cluster name, 'ceph'. OSDs from separate
    clusters must be on separate hosts. Clients and non-osd daemons
    from multiple clusters may be colocated. For each cluster, add an
    instance of the ceph task with the cluster name specified, e.g.::

        roles:
        - [mon.a, osd.0, osd.1]
        - [backup.mon.a, backup.osd.0, backup.osd.1]
        - [client.0, backup.client.0]
        tasks:
        - ceph:
            cluster: ceph
        - ceph:
            cluster: backup

    :param ctx: Context
    :param config: Configuration

    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        "task ceph only supports a dictionary for configuration"

    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph', {}))

    # the first ceph task in a multi-cluster run owns the shared DaemonGroup
    # and the once-per-run log/valgrind subtasks below
    first_ceph_cluster = False
    if not hasattr(ctx, 'daemons'):
        first_ceph_cluster = True
        ctx.daemons = DaemonGroup()

    testdir = teuthology.get_testdir(ctx)
    if config.get('coverage'):
        coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
        log.info('Creating coverage directory...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'install', '-d', '-m0755', '--',
                    coverage_dir,
                ],
                wait=False,
            )
        )

    if 'cluster' not in config:
        config['cluster'] = 'ceph'

    validate_config(ctx, config)

    subtasks = []
    if first_ceph_cluster:
        # these tasks handle general log setup and parsing on all hosts,
        # so they should only be run once
        subtasks = [
            lambda: ceph_log(ctx=ctx, config=None),
            lambda: valgrind_post(ctx=ctx, config=config),
        ]

    # startup order matters: cluster files/conf first, then mons so the
    # cluster can form quorum, mgr/crush/osds, and mds last (cephfs_setup
    # needs mons+osds up to create pools)
    subtasks += [
        lambda: cluster(ctx=ctx, config=dict(
            conf=config.get('conf', {}),
            fs=config.get('fs', 'xfs'),
            mkfs_options=config.get('mkfs_options', None),
            mount_options=config.get('mount_options', None),
            block_journal=config.get('block_journal', None),
            tmpfs_journal=config.get('tmpfs_journal', None),
            skip_mgr_daemons=config.get('skip_mgr_daemons', False),
            log_whitelist=config.get('log-whitelist', []),
            cpu_profile=set(config.get('cpu_profile', []),),
            cluster=config['cluster'],
        )),
        lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
        lambda: run_daemon(ctx=ctx, config=config, type_='mgr'),
        lambda: crush_setup(ctx=ctx, config=config),
        lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
        lambda: create_rbd_pool(ctx=ctx, config=config),
        lambda: cephfs_setup(ctx=ctx, config=config),
        lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
    ]

    with contextutil.nested(*subtasks):
        first_mon = teuthology.get_first_mon(ctx, config, config['cluster'])
        (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
        # one CephManager per cluster, keyed by cluster name, shared with
        # later tasks via ctx
        if not hasattr(ctx, 'managers'):
            ctx.managers = {}
        ctx.managers[config['cluster']] = CephManager(
            mon,
            ctx=ctx,
            logger=log.getChild('ceph_manager.' + config['cluster']),
            cluster=config['cluster'],
        )

        try:
            if config.get('wait-for-healthy', True):
                healthy(ctx=ctx, config=dict(cluster=config['cluster']))

            yield
        finally:
            # runs even if the nested tasks raised, before daemons are torn
            # down by the contexts above
            if config.get('wait-for-scrub', True):
                osd_scrub_pgs(ctx, config)

            # stop logging health to clog during shutdown, or else we generate
            # a bunch of scary messages unrelated to our actual run.
            firstmon = teuthology.get_first_mon(ctx, config, config['cluster'])
            (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
            mon0_remote.run(
                args=[
                    'sudo',
                    'ceph',
                    '--cluster', config['cluster'],
                    'tell',
                    'mon.*',
                    'injectargs',
                    '--',
                    '--no-mon-health-to-clog',
                ]
            )