]> git.proxmox.com Git - ceph.git/blame - ceph/qa/tasks/ceph.py
update sources to 12.2.2
[ceph.git] / ceph / qa / tasks / ceph.py
CommitLineData
7c673cae
FG
1"""
2Ceph cluster task.
3
4Handle the setup, starting, and clean-up of a Ceph cluster.
5"""
6from cStringIO import StringIO
7
8import argparse
9import contextlib
10import errno
11import logging
12import os
13import json
14import time
15import gevent
16import socket
17
18from paramiko import SSHException
19from ceph_manager import CephManager, write_conf
20from tasks.cephfs.filesystem import Filesystem
21from teuthology import misc as teuthology
22from teuthology import contextutil
23from teuthology import exceptions
24from teuthology.orchestra import run
25import ceph_client as cclient
26from teuthology.orchestra.daemon import DaemonGroup
27
28CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']
29
30log = logging.getLogger(__name__)
31
32
def generate_caps(type_):
    """
    Yield the command-line fragments ('--cap', subsystem, capability)
    granting the default capabilities for a daemon of the given type.

    :param type_: daemon type; one of 'osd', 'mgr', 'mds' or 'client'
    """
    all_caps = {
        'osd': {
            'mon': 'allow *',
            'mgr': 'allow *',
            'osd': 'allow *',
        },
        'mgr': {
            'mon': 'allow profile mgr',
            'osd': 'allow *',
            'mds': 'allow *',
        },
        'mds': {
            'mon': 'allow *',
            'mgr': 'allow *',
            'osd': 'allow *',
            'mds': 'allow',
        },
        'client': {
            'mon': 'allow rw',
            'mgr': 'allow r',
            'osd': 'allow rwx',
            'mds': 'allow',
        },
    }
    for subsystem, capability in all_caps[type_].items():
        yield '--cap'
        yield subsystem
        yield capability
67
68
@contextlib.contextmanager
def ceph_log(ctx, config):
    """
    Make /var/log/ceph writeable by everyone, disable the distro's
    ceph logrotate config, and create the valgrind/profiling-logger
    directories.  If 'log-rotate' is configured, run a background
    logrotate loop for the duration of the task.  On teardown,
    compress and (when archiving is enabled) pull back all logs.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Making ceph log dir writeable by non-root...')
    run.wait(
        ctx.cluster.run(
            args=['sudo', 'chmod', '777', '/var/log/ceph'],
            wait=False,
        )
    )
    log.info('Disabling ceph logrotate...')
    run.wait(
        ctx.cluster.run(
            args=['sudo', 'rm', '-f', '--', '/etc/logrotate.d/ceph'],
            wait=False,
        )
    )
    log.info('Creating extra log directories...')
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo', 'install', '-d', '-m0777', '--',
                '/var/log/ceph/valgrind',
                '/var/log/ceph/profiling-logger',
            ],
            wait=False,
        )
    )

    class Rotater(object):
        # single shared event used to stop the background greenlet
        stop_event = gevent.event.Event()

        def invoke_logrotate(self):
            # Loop every ~30s invoking logrotate with the test config
            # installed by write_rotate_conf().  Connection-type errors
            # are expected (tests may power-cycle nodes) and ignored.
            while not self.stop_event.is_set():
                self.stop_event.wait(timeout=30)
                try:
                    run.wait(
                        ctx.cluster.run(
                            args=['sudo', 'logrotate',
                                  '/etc/logrotate.d/ceph-test.conf'],
                            wait=False,
                        )
                    )
                except exceptions.ConnectionLostError as e:
                    # Some tests may power off nodes during test, in which
                    # case we will see connection errors that we should ignore.
                    log.debug("Missed logrotate, node '{0}' is offline".format(
                        e.node))
                except EOFError:
                    # Paramiko sometimes raises this when it fails to
                    # connect during open_session; same power-cycle
                    # rationale as ConnectionLostError above.
                    log.debug("Missed logrotate, EOFError")
                except SSHException:
                    log.debug("Missed logrotate, SSHException")
                except socket.error as e:
                    if e.errno == errno.EHOSTUNREACH:
                        log.debug("Missed logrotate, host unreachable")
                    else:
                        raise

        def begin(self):
            self.thread = gevent.spawn(self.invoke_logrotate)

        def end(self):
            self.stop_event.set()
            self.thread.get()

    def write_rotate_conf(ctx, daemons):
        # Render one stanza per daemon type from the logrotate.conf
        # template next to this module and install it on every remote.
        testdir = teuthology.get_testdir(ctx)
        rotate_conf_path = os.path.join(os.path.dirname(__file__),
                                        'logrotate.conf')
        with open(rotate_conf_path, 'rb') as f:
            conf = ""
            for daemon, size in daemons.items():
                log.info('writing logrotate stanza for {daemon}'.format(
                    daemon=daemon))
                conf += f.read().format(daemon_type=daemon, max_size=size)
                f.seek(0, 0)

            for remote in ctx.cluster.remotes.keys():
                teuthology.write_file(
                    remote=remote,
                    path='{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
                    data=StringIO(conf),
                )
                remote.run(
                    args=[
                        'sudo', 'mv',
                        '{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
                        '/etc/logrotate.d/ceph-test.conf',
                        run.Raw('&&'),
                        'sudo', 'chmod', '0644',
                        '/etc/logrotate.d/ceph-test.conf',
                        run.Raw('&&'),
                        'sudo', 'chown', 'root.root',
                        '/etc/logrotate.d/ceph-test.conf',
                    ]
                )
                remote.chcon('/etc/logrotate.d/ceph-test.conf',
                             'system_u:object_r:etc_t:s0')

    if ctx.config.get('log-rotate'):
        daemons = ctx.config.get('log-rotate')
        log.info('Setting up log rotation with ' + str(daemons))
        write_rotate_conf(ctx, daemons)
        logrotater = Rotater()
        logrotater.begin()
    try:
        yield

    finally:
        if ctx.config.get('log-rotate'):
            log.info('Shutting down logrotate')
            logrotater.end()
            ctx.cluster.run(
                args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf']
            )
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            # Compress in place, then pull /var/log/ceph from every node.
            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find', '/var/log/ceph', '-name', '*.log', '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs', '-0', '--no-run-if-empty', '--', 'gzip', '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            os.makedirs(path)
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.shortname)
                os.makedirs(sub)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))
243
244
def assign_devs(roles, devs):
    """
    Pair roles with devices, one device per role, in order.
    Extra devices (or extra roles) are simply left unpaired.

    :param roles: List of roles
    :param devs: Corresponding list of devices.
    :returns: Dictionary mapping each role to its device.
    """
    return {role: dev for role, dev in zip(roles, devs)}
254
255
@contextlib.contextmanager
def valgrind_post(ctx, config):
    """
    After the tests run, look through all the valgrind logs.  Exceptions
    are raised if textual errors occurred in the logs, or if valgrind
    exceptions were detected in the logs.

    :param ctx: Context
    :param config: Configuration
    """
    try:
        yield
    finally:
        lookup_procs = list()
        log.info('Checking for errors in any valgrind logs...')
        for remote in ctx.cluster.remotes.iterkeys():
            # look at valgrind logs for each node
            proc = remote.run(
                args=[
                    'sudo',
                    'zgrep',
                    '<kind>',
                    run.Raw('/var/log/ceph/valgrind/*'),
                    '/dev/null',  # include a second file so that we always get a filename prefix on the output
                    run.Raw('|'),
                    'sort',
                    run.Raw('|'),
                    'uniq',
                ],
                wait=False,
                check_status=False,
                stdout=StringIO(),
            )
            lookup_procs.append((proc, remote))

        valgrind_exception = None
        for (proc, remote) in lookup_procs:
            proc.wait()
            out = proc.stdout.getvalue()
            for line in out.split('\n'):
                if line == '':
                    continue
                try:
                    # Split on the first ':' only -- the matched log text
                    # after the filename may itself contain colons.
                    # (Also avoids shadowing the 'file' builtin.)
                    (log_file, kind) = line.split(':', 1)
                except Exception:
                    log.error('failed to split line %s', line)
                    raise
                log.debug('file %s kind %s', log_file, kind)
                # Leak reports from the MDS are tolerated.
                if (log_file.find('mds') >= 0) and kind.find('Lost') > 0:
                    continue
                log.error('saw valgrind issue %s in %s', kind, log_file)
                valgrind_exception = Exception('saw valgrind issues')

        if config.get('expect_valgrind_errors'):
            if not valgrind_exception:
                raise Exception('expected valgrind issues and found none')
        else:
            if valgrind_exception:
                raise valgrind_exception
315
316
@contextlib.contextmanager
def crush_setup(ctx, config):
    """
    Apply the configured crush tunables profile (config key
    'crush_tunables', default 'default') via the first monitor.
    """
    cluster_name = config['cluster']
    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    profile = config.get('crush_tunables', 'default')
    log.info('Setting crush tunables to %s', profile)
    remote.run(args=[
        'sudo', 'ceph', '--cluster', cluster_name,
        'osd', 'crush', 'tunables', profile,
    ])
    yield
329
330
224ce89b
WB
@contextlib.contextmanager
def create_rbd_pool(ctx, config):
    """
    Wait for the OSDs to come up, then -- unless disabled via the
    'create_rbd_pool' config key (default True) -- create the 'rbd'
    pool and enable the rbd application on it.
    """
    cluster_name = config['cluster']
    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    log.info('Waiting for OSDs to come up')
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon_remote,
        ceph_cluster=cluster_name,
    )
    if config.get('create_rbd_pool', True):
        log.info('Creating RBD pool')
        mon_remote.run(args=[
            'sudo', 'ceph', '--cluster', cluster_name,
            'osd', 'pool', 'create', 'rbd', '8',
        ])
        # Best-effort: exit status deliberately ignored (the
        # 'application enable' command may not exist on all versions).
        mon_remote.run(
            args=[
                'sudo', 'ceph', '--cluster', cluster_name,
                'osd', 'pool', 'application', 'enable',
                'rbd', 'rbd', '--yes-i-really-mean-it',
            ],
            check_status=False)
    yield
356
7c673cae
FG
@contextlib.contextmanager
def cephfs_setup(ctx, config):
    """
    If any MDS roles are present, create a 'cephfs' filesystem for them
    and set max_mds to the number of non-standby MDS roles.
    """
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because requires mon cluster to be up and running
    if mdss.remotes:
        log.info('Setting up CephFS filesystem...')

        fs = Filesystem(ctx, name='cephfs', create=True,
                        ec_profile=config.get('cephfs_ec_profile', None))

        def is_active_mds(role):
            # standby roles end with '-s' or contain '-s-'; skip them
            return ('mds.' in role
                    and not role.endswith('-s')
                    and '-s-' not in role)

        all_roles = [item for remote_roles in mdss.remotes.values()
                     for item in remote_roles]
        num_active = len([r for r in all_roles if is_active_mds(r)])

        fs.set_max_mds(num_active)
        fs.set_allow_dirfrags(True)

    yield
382
383
@contextlib.contextmanager
def cluster(ctx, config):
    """
    Handle the creation and removal of a ceph cluster.

    On startup:
        Create directories needed for the cluster.
        Create remote journals for all osds.
        Create and set keyring.
        Copy the monmap to the test systems.
        Setup mon nodes.
        Setup mds nodes.
        Mkfs osd nodes.
        Add keyring information to monmaps
        Mkfs mon nodes.

    On exit:
        If errors occurred, extract a failure message and store in ctx.summary.
        Unmount all test files and temporary journaling files.
        Save the monitor information and archive all ceph logs.
        Cleanup the keyring setup, and remove all monitor map and data files
        left over.

    :param ctx: Context
    :param config: Configuration
    """
    if ctx.config.get('use_existing_cluster', False) is True:
        log.info("'use_existing_cluster' is true; skipping cluster creation")
        yield
        # BUGFIX: without this return, execution would resume here on
        # context exit, re-run the entire cluster creation during
        # teardown, and then hit a second yield (which @contextmanager
        # rejects with "generator didn't stop").
        return

    testdir = teuthology.get_testdir(ctx)
    cluster_name = config['cluster']
    data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
    log.info('Creating ceph cluster %s...', cluster_name)
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                data_dir,
            ],
            wait=False,
        )
    )

    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'install', '-d', '-m0777', '--', '/var/run/ceph',
            ],
            wait=False,
        )
    )

    # Map scratch devices / journals to osd roles, per remote.
    devs_to_clean = {}
    remote_to_roles_to_devs = {}
    remote_to_roles_to_journals = {}
    osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
    for remote, roles_for_host in osds.remotes.iteritems():
        devs = teuthology.get_scratch_devices(remote)
        roles_to_devs = {}
        roles_to_journals = {}
        if config.get('fs'):
            log.info('fs option selected, checking for scratch devs')
            log.info('found devs: %s' % (str(devs),))
            devs_id_map = teuthology.get_wwn_id_map(remote, devs)
            iddevs = devs_id_map.values()
            roles_to_devs = assign_devs(
                teuthology.cluster_roles_of_type(roles_for_host, 'osd',
                                                 cluster_name), iddevs
            )
            if len(roles_to_devs) < len(iddevs):
                # leftover devices may be used as block journals below
                iddevs = iddevs[len(roles_to_devs):]
            devs_to_clean[remote] = []

        if config.get('block_journal'):
            log.info('block journal enabled')
            roles_to_journals = assign_devs(
                teuthology.cluster_roles_of_type(roles_for_host, 'osd',
                                                 cluster_name), iddevs
            )
            log.info('journal map: %s', roles_to_journals)

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled')
            roles_to_journals = {}
            remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
            for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd',
                                                         cluster_name):
                tmpfs = '/mnt/' + role
                roles_to_journals[role] = tmpfs
                remote.run(args=['truncate', '-s', '1500M', tmpfs])
            log.info('journal map: %s', roles_to_journals)

        log.info('dev map: %s' % (str(roles_to_devs),))
        remote_to_roles_to_devs[remote] = roles_to_devs
        remote_to_roles_to_journals[remote] = roles_to_journals

    log.info('Generating config...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [role_list for (remote, role_list) in remotes_and_roles]
    ips = [host for (host, port) in
           (remote.ssh.get_transport().getpeername()
            for (remote, role_list) in remotes_and_roles)]
    conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips,
                                      cluster=cluster_name)
    # Wire the per-role journal paths into the config...
    for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
        for role, journal in roles_to_journals.iteritems():
            name = teuthology.ceph_role(role)
            if name not in conf:
                conf[name] = {}
            conf[name]['osd journal'] = journal
    # ...then overlay any explicit conf overrides from the task config.
    for section, keys in config['conf'].iteritems():
        for key, value in keys.iteritems():
            log.info("[%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    if config.get('tmpfs_journal'):
        conf['journal dio'] = False

    if not hasattr(ctx, 'ceph'):
        ctx.ceph = {}
    ctx.ceph[cluster_name] = argparse.Namespace()
    ctx.ceph[cluster_name].conf = conf

    default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
    keyring_path = config.get('keyring_path', default_keyring)

    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)

    log.info('Setting up %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--create-keyring',
            keyring_path,
        ],
    )
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--gen-key',
            '--name=mon.',
            keyring_path,
        ],
    )
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'chmod',
            '0644',
            keyring_path,
        ],
    )
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
                                                   cluster=cluster_name)
    fsid = teuthology.create_simple_monmap(
        ctx,
        remote=mon0_remote,
        conf=conf,
        path=monmap_path,
    )
    if 'global' not in conf:
        conf['global'] = {}
    conf['global']['fsid'] = fsid

    default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
    conf_path = config.get('conf_path', default_conf_path)
    log.info('Writing %s for FSID %s...' % (conf_path, fsid))
    write_conf(ctx, conf_path, cluster_name)

    log.info('Creating admin key on %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--gen-key',
            '--name=client.admin',
            '--set-uid=0',
            '--cap', 'mon', 'allow *',
            '--cap', 'osd', 'allow *',
            '--cap', 'mds', 'allow *',
            '--cap', 'mgr', 'allow *',
            keyring_path,
        ],
    )

    log.info('Copying monmap to all nodes...')
    keyring = teuthology.get_file(
        remote=mon0_remote,
        path=keyring_path,
    )
    monmap = teuthology.get_file(
        remote=mon0_remote,
        path=monmap_path,
    )

    for rem in ctx.cluster.remotes.iterkeys():
        # copy mon key and initial monmap
        log.info('Sending monmap to node {remote}'.format(remote=rem))
        teuthology.sudo_write_file(
            remote=rem,
            path=keyring_path,
            data=keyring,
            perms='0644'
        )
        teuthology.write_file(
            remote=rem,
            path=monmap_path,
            data=monmap,
        )

    log.info('Setting up mon nodes...')
    mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))

    if not config.get('skip_mgr_daemons', False):
        log.info('Setting up mgr nodes...')
        mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name))
        for remote, roles_for_host in mgrs.remotes.iteritems():
            for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr',
                                                         cluster_name):
                _, _, id_ = teuthology.split_role(role)
                mgr_dir = '/var/lib/ceph/mgr/{cluster}-{id}'.format(
                    cluster=cluster_name,
                    id=id_,
                )
                remote.run(
                    args=[
                        'sudo',
                        'mkdir',
                        '-p',
                        mgr_dir,
                        run.Raw('&&'),
                        'sudo',
                        'adjust-ulimits',
                        'ceph-coverage',
                        coverage_dir,
                        'ceph-authtool',
                        '--create-keyring',
                        '--gen-key',
                        '--name=mgr.{id}'.format(id=id_),
                        mgr_dir + '/keyring',
                    ],
                )

    log.info('Setting up mds nodes...')
    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
    for remote, roles_for_host in mdss.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mds_dir = '/var/lib/ceph/mds/{cluster}-{id}'.format(
                cluster=cluster_name,
                id=id_,
            )
            remote.run(
                args=[
                    'sudo',
                    'mkdir',
                    '-p',
                    mds_dir,
                    run.Raw('&&'),
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    '--name=mds.{id}'.format(id=id_),
                    mds_dir + '/keyring',
                ],
            )

    cclient.create_keyring(ctx, cluster_name)
    log.info('Running mkfs on osd nodes...')

    if not hasattr(ctx, 'disk_config'):
        ctx.disk_config = argparse.Namespace()
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
        ctx.disk_config.remote_to_roles_to_dev = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_journals'):
        ctx.disk_config.remote_to_roles_to_journals = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
        ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
        ctx.disk_config.remote_to_roles_to_dev_fstype = {}

    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev,
                          remote_to_roles_to_devs)
    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals,
                          remote_to_roles_to_journals)

    log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(
        r=str(ctx.disk_config.remote_to_roles_to_dev)))
    for remote, roles_for_host in osds.remotes.iteritems():
        roles_to_devs = remote_to_roles_to_devs[remote]
        roles_to_journals = remote_to_roles_to_journals[remote]

        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mnt_point = '/var/lib/ceph/osd/{cluster}-{id}'.format(
                cluster=cluster_name, id=id_)
            remote.run(
                args=[
                    'sudo',
                    'mkdir',
                    '-p',
                    mnt_point,
                ])
            log.info(str(roles_to_devs))
            log.info(str(roles_to_journals))
            log.info(role)
            if roles_to_devs.get(role):
                dev = roles_to_devs[role]
                fs = config.get('fs')
                package = None
                mkfs_options = config.get('mkfs_options')
                mount_options = config.get('mount_options')
                if fs == 'btrfs':
                    # package = 'btrfs-tools'
                    if mount_options is None:
                        mount_options = ['noatime', 'user_subvol_rm_allowed']
                    if mkfs_options is None:
                        mkfs_options = ['-m', 'single',
                                        '-l', '32768',
                                        '-n', '32768']
                if fs == 'xfs':
                    # package = 'xfsprogs'
                    if mount_options is None:
                        mount_options = ['noatime']
                    if mkfs_options is None:
                        mkfs_options = ['-f', '-i', 'size=2048']
                if fs == 'ext4' or fs == 'ext3':
                    if mount_options is None:
                        mount_options = ['noatime', 'user_xattr']

                if mount_options is None:
                    mount_options = []
                if mkfs_options is None:
                    mkfs_options = []
                mkfs = ['mkfs.%s' % fs] + mkfs_options
                log.info('%s on %s on %s' % (mkfs, dev, remote))
                if package is not None:
                    remote.run(
                        args=[
                            'sudo',
                            'apt-get', 'install', '-y', package
                        ],
                        stdout=StringIO(),
                    )

                try:
                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
                except run.CommandFailedError:
                    # Newer btrfs-tools doesn't prompt for overwrite, use -f
                    # BUGFIX: guard used to (incorrectly) test mount_options,
                    # which could append a duplicate '-f' to mkfs_options.
                    if '-f' not in mkfs_options:
                        mkfs_options.append('-f')
                        mkfs = ['mkfs.%s' % fs] + mkfs_options
                        log.info('%s on %s on %s' % (mkfs, dev, remote))
                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])

                log.info('mount %s on %s -o %s' % (dev, remote,
                                                   ','.join(mount_options)))
                remote.run(
                    args=[
                        'sudo',
                        'mount',
                        '-t', fs,
                        '-o', ','.join(mount_options),
                        dev,
                        mnt_point,
                    ]
                )
                remote.run(
                    args=[
                        'sudo', '/sbin/restorecon', mnt_point,
                    ],
                    check_status=False,
                )
                if remote not in ctx.disk_config.remote_to_roles_to_dev_mount_options:
                    ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
                if remote not in ctx.disk_config.remote_to_roles_to_dev_fstype:
                    ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
                devs_to_clean[remote].append(mnt_point)

        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            remote.run(
                args=[
                    'sudo',
                    'MALLOC_CHECK_=3',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-osd',
                    '--cluster',
                    cluster_name,
                    '--mkfs',
                    '--mkkey',
                    '-i', id_,
                    '--monmap', monmap_path,
                ],
            )

    log.info('Reading keys from all nodes...')
    keys_fp = StringIO()
    keys = []
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['mgr', 'mds', 'osd']:
            if type_ == 'mgr' and config.get('skip_mgr_daemons', False):
                continue
            for role in teuthology.cluster_roles_of_type(roles_for_host, type_,
                                                         cluster_name):
                _, _, id_ = teuthology.split_role(role)
                data = teuthology.get_file(
                    remote=remote,
                    path='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
                        type=type_,
                        id=id_,
                        cluster=cluster_name,
                    ),
                    sudo=True,
                )
                keys.append((type_, id_, data))
                keys_fp.write(data)
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            data = teuthology.get_file(
                remote=remote,
                path='/etc/ceph/{cluster}.client.{id}.keyring'.format(
                    id=id_, cluster=cluster_name)
            )
            keys.append(('client', id_, data))
            keys_fp.write(data)

    log.info('Adding keys to all mons...')
    writes = mons.run(
        args=[
            'sudo', 'tee', '-a',
            keyring_path,
        ],
        stdin=run.PIPE,
        wait=False,
        stdout=StringIO(),
    )
    keys_fp.seek(0)
    teuthology.feed_many_stdins_and_close(keys_fp, writes)
    run.wait(writes)
    for type_, id_, data in keys:
        run.wait(
            mons.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    keyring_path,
                    '--name={type}.{id}'.format(
                        type=type_,
                        id=id_,
                    ),
                ] + list(generate_caps(type_)),
                wait=False,
            ),
        )

    log.info('Running mkfs on mon nodes...')
    for remote, roles_for_host in mons.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            remote.run(
                args=[
                    'sudo',
                    'mkdir',
                    '-p',
                    '/var/lib/ceph/mon/{cluster}-{id}'.format(
                        id=id_, cluster=cluster_name),
                ],
            )
            remote.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-mon',
                    '--cluster', cluster_name,
                    '--mkfs',
                    '-i', id_,
                    '--monmap', monmap_path,
                    '--keyring', keyring_path,
                ],
            )

    run.wait(
        mons.run(
            args=[
                'rm',
                '--',
                monmap_path,
            ],
            wait=False,
        ),
    )

    try:
        yield
    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise
    finally:
        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()

        log.info('Checking cluster log for badness...')

        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurrence of the pattern specified in the Ceph log,
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = [
                'sudo',
                'egrep', pattern,
                '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
            ]
            for exclude in excludes:
                args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                run.Raw('|'), 'head', '-n', '1',
            ])
            r = mon0_remote.run(
                stdout=StringIO(),
                args=args,
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
                             config['log_whitelist']) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log_whitelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
                            )
                        break

        for remote, dirs in devs_to_clean.iteritems():
            for dir_ in dirs:
                log.info('Unmounting %s on %s' % (dir_, remote))
                try:
                    remote.run(
                        args=[
                            'sync',
                            run.Raw('&&'),
                            'sudo',
                            'umount',
                            '-f',
                            dir_
                        ]
                    )
                except Exception as e:
                    # dump open files / process tree before failing,
                    # to aid debugging of the stuck mount
                    remote.run(args=[
                        'sudo',
                        run.Raw('PATH=/usr/sbin:$PATH'),
                        'lsof',
                        run.Raw(';'),
                        'ps', 'auxf',
                    ])
                    raise e

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
            for remote, roles_for_host in osds.remotes.iteritems():
                remote.run(
                    args=['sudo', 'umount', '-f', '/mnt'],
                    check_status=False,
                )

        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):

            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            try:
                os.makedirs(path)
            except OSError as e:
                if e.errno == errno.EEXIST:
                    pass
                else:
                    raise
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    is_mon = teuthology.is_type('mon', cluster_name)
                    if is_mon(role):
                        _, _, id_ = teuthology.split_role(role)
                        mon_dir = '/var/lib/ceph/mon/' + \
                                  '{0}-{1}'.format(cluster_name, id_)
                        teuthology.pull_directory_tarball(
                            remote,
                            mon_dir,
                            path + '/' + role + '.tgz')

        log.info('Cleaning ceph cluster...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'rm',
                    '-rf',
                    '--',
                    conf_path,
                    keyring_path,
                    data_dir,
                    monmap_path,
                    run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
                ],
                wait=False,
            ),
        )
1025
1026
def osd_scrub_pgs(ctx, config):
    """
    Scrub pgs when we exit.

    First make sure all pgs are active and clean.
    Next scrub all osds.
    Then periodically check until all pgs have scrub time stamps that
    indicate the last scrub completed.  Time out if no progress is made
    here after two minutes.
    """
    retries = 40
    delays = 20
    cluster_name = config['cluster']
    manager = ctx.managers[cluster_name]

    # Phase 1: wait until every PG is active+clean.
    all_clean = False
    for _ in range(retries):
        stats = manager.get_pg_stats()
        bad = [s['pgid'] for s in stats if 'active+clean' not in s['state']]
        if not bad:
            all_clean = True
            break
        log.info(
            "Waiting for all PGs to be active and clean, waiting on %s" % bad)
        time.sleep(delays)
    if not all_clean:
        raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")

    # Phase 2: kick off a deep scrub on every OSD.
    check_time_now = time.localtime()
    time.sleep(1)
    all_roles = teuthology.all_roles(ctx.cluster)
    for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name):
        log.info("Scrubbing {osd}".format(osd=role))
        _, _, id_ = teuthology.split_role(role)
        # allow this to fail; in certain cases the OSD might not be up
        # at this point. we will catch all pgs below.
        try:
            manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
        except run.CommandFailedError:
            pass

    # Phase 3: poll until every PG's last_scrub_stamp is newer than
    # check_time_now, bailing out if no progress is made for too long.
    prev_good = 0
    gap_cnt = 0
    pending = True
    while pending:
        stats = manager.get_pg_stats()
        stamps = [(s['pgid'], s['last_scrub_stamp']) for s in stats]
        pending = False
        done_count = 0
        for (pgid, stamp) in stamps:
            scrub_time = time.strptime(stamp[0:stamp.find('.')],
                                       '%Y-%m-%d %H:%M:%S')
            if scrub_time > check_time_now:
                done_count += 1
            else:
                log.info('pgid %s last_scrub_stamp %s %s <= %s',
                         pgid, stamp, scrub_time, check_time_now)
                pending = True
        if done_count > prev_good:
            prev_good = done_count
            gap_cnt = 0
        else:
            gap_cnt += 1
            if gap_cnt % 6 == 0:
                for (pgid, stamp) in stamps:
                    # re-request scrub every so often in case the earlier
                    # request was missed.  do not do it everytime because
                    # the scrub may be in progress or not reported yet and
                    # we will starve progress.
                    manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
            if gap_cnt > retries:
                raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
        if pending:
            log.info('Still waiting for all pgs to be scrubbed.')
            time.sleep(delays)
1097
1098
@contextlib.contextmanager
def run_daemon(ctx, config, type_):
    """
    Run daemons for a role type.  Handle the startup and termination of a
    daemon.
    On startup -- set coverages, cpu_profile, valgrind values for all remotes,
    and a max_mds value for one mds.
    On cleanup -- Stop all existing daemons of this type.

    :param ctx: Context
    :param config: Configuration
    :param type_: Role type (e.g. 'mon', 'mgr', 'osd', 'mds')
    """
    cluster_name = config['cluster']
    log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
    testdir = teuthology.get_testdir(ctx)
    # restrict to the remotes that actually host roles of this type/cluster
    daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name))

    # check whether any daemons if this type are configured
    if daemons is None:
        return
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    # valgrind/coverage runs need SIGTERM so the tool can flush its output;
    # otherwise a plain SIGKILL is fine.
    daemon_signal = 'kill'
    if config.get('coverage') or config.get('valgrind') is not None:
        daemon_signal = 'term'

    # create osds in order. (this only matters for pre-luminous, which might
    # be hammer, which doesn't take an id_ argument to legacy 'osd create').
    osd_uuids = {}
    # first pass: collect each osd's fsid from its data directory
    for remote, roles_for_host in daemons.remotes.iteritems():
        is_type_ = teuthology.is_type(type_, cluster_name)
        for role in roles_for_host:
            if not is_type_(role):
                continue
            _, _, id_ = teuthology.split_role(role)

            if type_ == 'osd':
                datadir='/var/lib/ceph/osd/{cluster}-{id}'.format(
                    cluster=cluster_name, id=id_)
                osd_uuid = teuthology.get_file(
                    remote=remote,
                    path=datadir + '/fsid',
                    sudo=True,
                ).strip()
                osd_uuids[id_] = osd_uuid
    # register osd ids 0..N-1 in ascending numeric order so the legacy
    # 'osd create' fallback (which hands out the next free id) lines up
    # with the intended ids.
    # NOTE(review): this loop reuses `remote` left over from the loop
    # above (the last remote iterated) -- confirm that running the ceph
    # CLI from any one host is intended here.
    for osd_id in range(len(osd_uuids)):
        id_ = str(osd_id)
        osd_uuid = osd_uuids.get(id_)
        try:
            remote.run(
                args=[
                    'sudo', 'ceph', '--cluster', cluster_name,
                    'osd', 'new', osd_uuid, id_,
                ]
            )
        # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit
        except:
            # fallback to pre-luminous (hammer or jewel)
            remote.run(
                args=[
                    'sudo', 'ceph', '--cluster', cluster_name,
                    'osd', 'create', osd_uuid,
                ]
            )
        if config.get('add_osds_to_crush'):
            remote.run(
                args=[
                    'sudo', 'ceph', '--cluster', cluster_name,
                    'osd', 'crush', 'create-or-move', 'osd.' + id_,
                    '1.0', 'host=localhost', 'root=default',
                ]
            )

    # second pass: build the command line for each daemon and start it
    for remote, roles_for_host in daemons.remotes.iteritems():
        is_type_ = teuthology.is_type(type_, cluster_name)
        for role in roles_for_host:
            if not is_type_(role):
                continue
            _, _, id_ = teuthology.split_role(role)

            run_cmd = [
                'sudo',
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'daemon-helper',
                daemon_signal,
            ]
            run_cmd_tail = [
                'ceph-%s' % (type_),
                '-f',
                '--cluster', cluster_name,
                '-i', id_]

            if type_ in config.get('cpu_profile', []):
                profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role)
                run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])

            if config.get('valgrind') is not None:
                valgrind_args = None
                # a role-specific valgrind entry overrides the type-wide one
                if type_ in config['valgrind']:
                    valgrind_args = config['valgrind'][type_]
                if role in config['valgrind']:
                    valgrind_args = config['valgrind'][role]
                run_cmd = teuthology.get_valgrind_args(testdir, role,
                                                       run_cmd,
                                                       valgrind_args)

            run_cmd.extend(run_cmd_tail)

            # always register mgr; don't necessarily start
            ctx.daemons.register_daemon(
                remote, type_, id_,
                cluster=cluster_name,
                args=run_cmd,
                logger=log.getChild(role),
                stdin=run.PIPE,
                wait=False
            )
            if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
                # NOTE(review): `role` is rebuilt as 'cluster.type' (no id)
                # here; it is only used after this point if the loop body
                # changes -- confirm intended.
                role = cluster_name + '.' + type_
                ctx.daemons.get_daemon(type_, id_, cluster_name).restart()

    try:
        yield
    finally:
        teuthology.stop_daemons_of_type(ctx, type_, cluster_name)
1226
1227
def healthy(ctx, config):
    """
    Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK.

    Steps: wait for a mgr (best effort), wait for all osds up, flush pg
    stats (best effort), wait for clean pgs, wait for HEALTH_OK, and
    finally wait for MDS daemons if any are configured.

    :param ctx: Context
    :param config: Configuration (dict; 'cluster' selects the cluster name)
    """
    config = config if isinstance(config, dict) else dict()
    cluster_name = config.get('cluster', 'ceph')
    log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
    manager = ctx.managers[cluster_name]
    try:
        manager.wait_for_mgr_available(timeout=30)
    except (run.CommandFailedError, AssertionError) as e:
        # best effort: older clusters under upgrade test may have no mgr
        log.info('ignoring mgr wait error, probably testing upgrade: %s', e)

    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )

    try:
        manager.flush_all_pg_stats()
    except Exception as e:
        # fix: was `except (run.CommandFailedError, Exception)` -- the tuple
        # was redundant because Exception already covers CommandFailedError.
        log.info('ignoring flush pg stats error, probably testing upgrade: %s', e)
    manager.wait_for_clean()

    log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
    teuthology.wait_until_healthy(
        ctx,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )

    if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
        # Some MDSs exist, wait for them to be healthy
        ceph_fs = Filesystem(ctx)  # TODO: make Filesystem cluster-aware
        ceph_fs.wait_for_daemons(timeout=300)
1270
1271
def wait_for_osds_up(ctx, config):
    """
    Block until every osd in the cluster reports up.

    :param ctx: Context
    :param config: Configuration (dict; 'cluster' selects the cluster name)
    """
    log.info('Waiting until ceph osds are all up...')
    cluster = config.get('cluster', 'ceph')
    first_mon_role = teuthology.get_first_mon(ctx, config, cluster)
    (mon_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
    teuthology.wait_until_osds_up(ctx, cluster=ctx.cluster, remote=mon_remote)
1288
1289
def wait_for_mon_quorum(ctx, config):
    """
    Poll remote ceph status until all requested monitors are in quorum.

    :param ctx: Context
    :param config: either a list of mon names, or a dict with a 'daemons'
                   list and an optional 'cluster' name
    """
    if isinstance(config, dict):
        mons = config['daemons']
        cluster_name = config.get('cluster', 'ceph')
    else:
        assert isinstance(config, list)
        mons = config
        cluster_name = 'ceph'
    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (remote,) = ctx.cluster.only(first_mon).remotes.keys()
    with contextutil.safe_while(sleep=10, tries=60,
                                action='wait for monitor quorum') as proceed:
        while proceed():
            result = remote.run(
                args=['sudo', 'ceph', 'quorum_status'],
                stdout=StringIO(),
                logger=log.getChild('quorum_status'),
            )
            status = json.loads(result.stdout.getvalue())
            quorum = status.get('quorum_names', [])
            log.debug('Quorum: %s', quorum)
            # done once the quorum membership matches the requested mons
            if sorted(quorum) == sorted(mons):
                break
1323
1324
def created_pool(ctx, config):
    """
    Add new pools to the dictionary of pools that the ceph-manager
    knows about.
    """
    manager = ctx.managers['ceph']
    for pool in config:
        if pool not in manager.pools:
            manager.pools[pool] = manager.get_pool_property(pool, 'pg_num')
1334
1335
@contextlib.contextmanager
def restart(ctx, config):
    """
    restart ceph daemons

    For example::
       tasks:
       - ceph.restart: [all]

    For example::
       tasks:
       - ceph.restart: [osd.0, mon.1, mds.*]

    or::

       tasks:
       - ceph.restart:
           daemons: [osd.0, mon.1]
           wait-for-healthy: false
           wait-for-osds-up: true

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        # bare list shorthand: treat it as the daemons list
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    clusters = set()
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).restart()
        clusters.add(cluster)

    # mark restarted osds down right away instead of waiting for the
    # cluster's failure-detection timeout.
    # NOTE(review): this always uses the 'ceph' manager even though the
    # restarted daemons may belong to other clusters -- confirm intended.
    manager = ctx.managers['ceph']
    for dmon in daemons:
        if '.' in dmon:
            dm_parts = dmon.split('.')
            # only plain 'osd.<numeric-id>' roles are marked down
            if dm_parts[1].isdigit():
                if dm_parts[0] == 'osd':
                    manager.mark_down_osd(int(dm_parts[1]))

    if config.get('wait-for-healthy', True):
        for cluster in clusters:
            healthy(ctx=ctx, config=dict(cluster=cluster))
    if config.get('wait-for-osds-up', False):
        for cluster in clusters:
            wait_for_osds_up(ctx=ctx, config=dict(cluster=cluster))
    yield
1387
1388
@contextlib.contextmanager
def stop(ctx, config):
    """
    Stop ceph daemons

    For example::
      tasks:
      - ceph.stop: [mds.*]

      tasks:
      - ceph.stop: [osd.0, osd.2]

      tasks:
      - ceph.stop:
          daemons: [osd.0, osd.2]

    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        # bare list shorthand: treat it as the daemons list
        config = {'daemons': config}

    roles = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    for role in roles:
        cluster, daemon_type, daemon_id = teuthology.split_role(role)
        ctx.daemons.get_daemon(daemon_type, daemon_id, cluster).stop()

    yield
1417
1418
@contextlib.contextmanager
def wait_for_failure(ctx, config):
    """
    Wait for a failure of a ceph daemon

    For example::
      tasks:
      - ceph.wait_for_failure: [mds.*]

      tasks:
      - ceph.wait_for_failure: [osd.0, osd.2]

      tasks:
      - ceph.wait_for_failure:
          daemons: [osd.0, osd.2]

    :raises RuntimeError: if any listed daemon exits cleanly instead of
                          failing.
    """
    if config is None:
        config = {}
    elif isinstance(config, list):
        # bare list shorthand: treat it as the daemons list
        config = {'daemons': config}

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        try:
            # wait() returns only on a clean exit; a failed daemon raises
            ctx.daemons.get_daemon(type_, id_, cluster).wait()
        except Exception:
            # fix: was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt; those should abort the run instead.
            log.info('Saw expected daemon failure. Continuing.')
        else:
            raise RuntimeError('daemon %s did not fail' % role)

    yield
1453
1454
def validate_config(ctx, config):
    """
    Perform some simple validation on task configuration.
    Raises exceptions.ConfigError if an error is found.
    """
    # check for osds from multiple clusters on the same host
    for remote, roles_for_host in ctx.cluster.remotes.items():
        prev_cluster = None
        prev_role = None
        for role in roles_for_host:
            role_cluster, role_type, _ = teuthology.split_role(role)
            if role_type != 'osd':
                continue
            if prev_cluster and prev_cluster != role_cluster:
                raise exceptions.ConfigError(
                    "Host should not have osds (%s and %s) from multiple clusters" % (
                        prev_role, role))
            prev_cluster = role_cluster
            prev_role = role
1474
1475
@contextlib.contextmanager
def task(ctx, config):
    """
    Set up and tear down a Ceph cluster.

    For example::

        tasks:
        - ceph:
        - interactive:

    You can also specify what branch to run::

        tasks:
        - ceph:
            branch: foo

    Or a tag::

        tasks:
        - ceph:
            tag: v0.42.13

    Or a sha1::

        tasks:
        - ceph:
            sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed

    Or a local source dir::

        tasks:
        - ceph:
            path: /home/sage/ceph

    To capture code coverage data, use::

        tasks:
        - ceph:
            coverage: true

    To use btrfs, ext4, or xfs on the target's scratch disks, use::

        tasks:
        - ceph:
            fs: xfs
            mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
            mount_options: [nobarrier, inode64]

    Note, this will cause the task to check the /scratch_devs file on each node
    for available devices.  If no such file is found, /dev/sdb will be used.

    To run some daemons under valgrind, include their names
    and the tool/args to use in a valgrind section::

        tasks:
        - ceph:
          valgrind:
            mds.1: --tool=memcheck
            osd.1: [--tool=memcheck, --leak-check=no]

    Those nodes which are using memcheck or valgrind will get
    checked for bad results.

    To adjust or modify config options, use::

        tasks:
        - ceph:
            conf:
              section:
                key: value

    For example::

        tasks:
        - ceph:
            conf:
              mds.0:
                some option: value
                other key: other value
              client.0:
                debug client: 10
                debug ms: 1

    By default, the cluster log is checked for errors and warnings,
    and the run marked failed if any appear. You can ignore log
    entries by giving a list of egrep compatible regexes, i.e.:

        tasks:
        - ceph:
            log-whitelist: ['foo.*bar', 'bad message']

    To run multiple ceph clusters, use multiple ceph tasks, and roles
    with a cluster name prefix, e.g. cluster1.client.0. Roles with no
    cluster use the default cluster name, 'ceph'. OSDs from separate
    clusters must be on separate hosts. Clients and non-osd daemons
    from multiple clusters may be colocated. For each cluster, add an
    instance of the ceph task with the cluster name specified, e.g.::

        roles:
        - [mon.a, osd.0, osd.1]
        - [backup.mon.a, backup.osd.0, backup.osd.1]
        - [client.0, backup.client.0]
        tasks:
        - ceph:
            cluster: ceph
        - ceph:
            cluster: backup

    :param ctx: Context
    :param config: Configuration

    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        "task ceph only supports a dictionary for configuration"

    # merge per-run overrides into the task config
    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph', {}))

    # the first ceph task in a run owns one-time setup (daemon registry,
    # log setup/parsing)
    first_ceph_cluster = False
    if not hasattr(ctx, 'daemons'):
        first_ceph_cluster = True
        ctx.daemons = DaemonGroup()

    testdir = teuthology.get_testdir(ctx)
    if config.get('coverage'):
        coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
        log.info('Creating coverage directory...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'install', '-d', '-m0755', '--',
                    coverage_dir,
                    ],
                wait=False,
                )
            )

    if 'cluster' not in config:
        config['cluster'] = 'ceph'

    validate_config(ctx, config)

    subtasks = []
    if first_ceph_cluster:
        # these tasks handle general log setup and parsing on all hosts,
        # so they should only be run once
        subtasks = [
            lambda: ceph_log(ctx=ctx, config=None),
            lambda: valgrind_post(ctx=ctx, config=config),
        ]

    # nested context managers: cluster creation, then daemons in
    # dependency order (mon, mgr, crush, osd, rbd pool, cephfs, mds)
    subtasks += [
        lambda: cluster(ctx=ctx, config=dict(
            conf=config.get('conf', {}),
            fs=config.get('fs', 'xfs'),
            mkfs_options=config.get('mkfs_options', None),
            mount_options=config.get('mount_options', None),
            block_journal=config.get('block_journal', None),
            tmpfs_journal=config.get('tmpfs_journal', None),
            skip_mgr_daemons=config.get('skip_mgr_daemons', False),
            log_whitelist=config.get('log-whitelist', []),
            cpu_profile=set(config.get('cpu_profile', []),),
            cluster=config['cluster'],
        )),
        lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
        lambda: run_daemon(ctx=ctx, config=config, type_='mgr'),
        lambda: crush_setup(ctx=ctx, config=config),
        lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
        lambda: create_rbd_pool(ctx=ctx, config=config),
        lambda: cephfs_setup(ctx=ctx, config=config),
        lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
    ]

    with contextutil.nested(*subtasks):
        first_mon = teuthology.get_first_mon(ctx, config, config['cluster'])
        (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
        if not hasattr(ctx, 'managers'):
            ctx.managers = {}
        # one CephManager per cluster, keyed by cluster name
        ctx.managers[config['cluster']] = CephManager(
            mon,
            ctx=ctx,
            logger=log.getChild('ceph_manager.' + config['cluster']),
            cluster=config['cluster'],
        )

        try:
            if config.get('wait-for-healthy', True):
                healthy(ctx=ctx, config=dict(cluster=config['cluster']))

            yield
        finally:
            if config.get('wait-for-scrub', True):
                osd_scrub_pgs(ctx, config)

            # stop logging health to clog during shutdown, or else we generate
            # a bunch of scary messages unrelated to our actual run.
            firstmon = teuthology.get_first_mon(ctx, config, config['cluster'])
            (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
            mon0_remote.run(
                args=[
                    'sudo',
                    'ceph',
                    '--cluster', config['cluster'],
                    'tell',
                    'mon.*',
                    'injectargs',
                    '--',
                    '--no-mon-health-to-clog',
                ]
            )